about | summary | refs | log | tree | commit | diff | stats
path: root/libavcodec
diff options
context:
space:
mode:
author: Michael Niedermayer <michaelni@gmx.at> 2012-03-05 00:02:58 +0100
committer: Michael Niedermayer <michaelni@gmx.at> 2012-03-05 00:15:55 +0100
commit: 2af8f2cea6c94eba3a15820194cb7374b366976a (patch)
tree: 634d34b8adf1c35cc1bb7c3eb1f2b49775ffbb56 /libavcodec
parent: 33a183df46355e4b281517e14c9b3c7e2b558dcf (diff)
parent: 3faa141d15bf9945fa54331e51b3f10b9970d5d2 (diff)
download: ffmpeg-2af8f2cea6c94eba3a15820194cb7374b366976a.tar.gz
Merge remote-tracking branch 'qatar/master'
* qatar/master: (27 commits)
  cmdutils: use new avcodec_is_decoder/encoder() functions.
  lavc: make codec_is_decoder/encoder() public.
  lavc: deprecate AVCodecContext.sub_id.
  libcdio: add a forgotten AVClass to the private context.
  swscale: remove "cpu flags" from -sws_flags description.
  proresenc: give user a possibility to alter some encoding parameters
  vorbisenc: add output buffer overwrite protection
  libopencore-amrnbenc: fix end-of-stream handling
  ra144enc: fix end-of-stream handling
  nellymoserenc: zero any leftover packet bytes
  nellymoserenc: use proper MDCT overlap delay
  qpeg: Use bytestream2 functions to prevent buffer overreads.
  swscale: make %rep unconditional.
  vp8: convert simple loopfilter x86 assembly to use named arguments.
  vp8: convert idct x86 assembly to use named arguments.
  vp8: convert mc x86 assembly to use named arguments.
  vp8: convert loopfilter x86 assembly to use cpuflags().
  vp8: convert idct/mc x86 assembly to use cpuflags().
  swscale: remove now unnecessary hack.
  x86inc: don't "bake" stack_offset in named arguments.
  ...

Conflicts:
  cmdutils.c
  doc/APIchanges
  libavcodec/mpeg12.c
  libavcodec/options.c
  libavcodec/qpeg.c
  libavcodec/utils.c
  libavcodec/version.h
  libavdevice/libcdio.c
  tests/lavf-regression.sh

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- libavcodec/avcodec.h 21
-rw-r--r-- libavcodec/libopencore-amr.c 28
-rw-r--r-- libavcodec/mpeg12.c 5
-rw-r--r-- libavcodec/mpegaudiodec.c 2
-rw-r--r-- libavcodec/mpegaudiodecheader.c 1
-rw-r--r-- libavcodec/mpegvideo_parser.c 2
-rw-r--r-- libavcodec/nellymoserenc.c 49
-rw-r--r-- libavcodec/options.c 2
-rw-r--r-- libavcodec/proresenc_kostya.c 178
-rw-r--r-- libavcodec/pthread.c 1
-rw-r--r-- libavcodec/qpeg.c 104
-rw-r--r-- libavcodec/ra144.h 1
-rw-r--r-- libavcodec/ra144enc.c 33
-rw-r--r-- libavcodec/rv10.c 38
-rw-r--r-- libavcodec/utils.c 20
-rw-r--r-- libavcodec/version.h 5
-rw-r--r-- libavcodec/vorbisenc.c 59
-rw-r--r-- libavcodec/wmaenc.c 35
-rw-r--r-- libavcodec/x86/vp8dsp-init.c 112
-rw-r--r-- libavcodec/x86/vp8dsp.asm 1246
20 files changed, 1037 insertions, 905 deletions
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 592c5341ea..397fcfb7b5 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1280,15 +1280,12 @@ typedef struct AVCodecContext {
*/
unsigned int stream_codec_tag;
+#if FF_API_SUB_ID
/**
- * Some codecs need additional format info. It is stored here.
- * If any muxer uses this then ALL demuxers/parsers AND encoders for the
- * specific codec MUST set it correctly otherwise stream copy breaks.
- * In general use of this field by muxers is not recommended.
- * - encoding: Set by libavcodec.
- * - decoding: Set by libavcodec. (FIXME: Is this OK?)
+ * @deprecated this field is unused
*/
- int sub_id;
+ attribute_deprecated int sub_id;
+#endif
void *priv_data;
@@ -4504,4 +4501,14 @@ const AVClass *avcodec_get_frame_class(void);
*/
int avcodec_is_open(AVCodecContext *s);
+/**
+ * @return a non-zero number if codec is an encoder, zero otherwise
+ */
+int av_codec_is_encoder(AVCodec *codec);
+
+/**
+ * @return a non-zero number if codec is a decoder, zero otherwise
+ */
+int av_codec_is_decoder(AVCodec *codec);
+
#endif /* AVCODEC_AVCODEC_H */
diff --git a/libavcodec/libopencore-amr.c b/libavcodec/libopencore-amr.c
index 7a0555e6c5..90a8c651e2 100644
--- a/libavcodec/libopencore-amr.c
+++ b/libavcodec/libopencore-amr.c
@@ -85,6 +85,7 @@ typedef struct AMRContext {
int enc_bitrate;
int enc_mode;
int enc_dtx;
+ int enc_last_frame;
} AMRContext;
static const AVOption options[] = {
@@ -195,6 +196,7 @@ static av_cold int amr_nb_encode_init(AVCodecContext *avctx)
}
avctx->frame_size = 160;
+ avctx->delay = 50;
avctx->coded_frame = avcodec_alloc_frame();
if (!avctx->coded_frame)
return AVERROR(ENOMEM);
@@ -227,17 +229,40 @@ static int amr_nb_encode_frame(AVCodecContext *avctx,
{
AMRContext *s = avctx->priv_data;
int written;
+ int16_t *flush_buf = NULL;
+ const int16_t *samples = data;
if (s->enc_bitrate != avctx->bit_rate) {
s->enc_mode = get_bitrate_mode(avctx->bit_rate, avctx);
s->enc_bitrate = avctx->bit_rate;
}
- written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, data,
+ if (data) {
+ if (avctx->frame_size < 160) {
+ flush_buf = av_mallocz(160 * sizeof(*flush_buf));
+ if (!flush_buf)
+ return AVERROR(ENOMEM);
+ memcpy(flush_buf, samples, avctx->frame_size * sizeof(*flush_buf));
+ samples = flush_buf;
+ if (avctx->frame_size < 110)
+ s->enc_last_frame = -1;
+ }
+ } else {
+ if (s->enc_last_frame < 0)
+ return 0;
+ flush_buf = av_mallocz(160 * sizeof(*flush_buf));
+ if (!flush_buf)
+ return AVERROR(ENOMEM);
+ samples = flush_buf;
+ s->enc_last_frame = -1;
+ }
+
+ written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, samples,
frame, 0);
av_dlog(avctx, "amr_nb_encode_frame encoded %u bytes, bitrate %u, first byte was %#02x\n",
written, s->enc_mode, frame[0]);
+ av_freep(&flush_buf);
return written;
}
@@ -249,6 +274,7 @@ AVCodec ff_libopencore_amrnb_encoder = {
.init = amr_nb_encode_init,
.encode = amr_nb_encode_frame,
.close = amr_nb_encode_close,
+ .capabilities = CODEC_CAP_DELAY | CODEC_CAP_SMALL_LAST_FRAME,
.sample_fmts = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_S16,AV_SAMPLE_FMT_NONE},
.long_name = NULL_IF_CONFIG_SMALL("OpenCORE Adaptive Multi-Rate (AMR) Narrow-Band"),
.priv_class = &class,
diff --git a/libavcodec/mpeg12.c b/libavcodec/mpeg12.c
index 5c5e09ec2d..548e26d9bc 100644
--- a/libavcodec/mpeg12.c
+++ b/libavcodec/mpeg12.c
@@ -1276,7 +1276,6 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
* that behave like P-frames. */
avctx->has_b_frames = !s->low_delay;
- assert((avctx->sub_id == 1) == (avctx->codec_id == CODEC_ID_MPEG1VIDEO));
if (avctx->codec_id == CODEC_ID_MPEG1VIDEO) {
//MPEG-1 fps
avctx->time_base.den = avpriv_frame_rate_tab[s->frame_rate_index].num;
@@ -1420,7 +1419,6 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
av_dlog(s->avctx, "sequence extension\n");
s->codec_id = s->avctx->codec_id = CODEC_ID_MPEG2VIDEO;
- s->avctx->sub_id = 2; /* indicates MPEG-2 found */
if (s->avctx->debug & FF_DEBUG_PICT_INFO)
av_log(s->avctx, AV_LOG_DEBUG, "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n",
@@ -2038,7 +2036,6 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
s->frame_pred_frame_dct = 1;
s->chroma_format = 1;
s->codec_id = s->avctx->codec_id = CODEC_ID_MPEG1VIDEO;
- avctx->sub_id = 1; /* indicates MPEG-1 */
s->out_format = FMT_MPEG1;
s->swap_uv = 0; // AFAIK VCR2 does not have SEQ_HEADER
if (s->flags & CODEC_FLAG_LOW_DELAY)
@@ -2097,12 +2094,10 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
s->chroma_format = 1;
if (s->codec_tag == AV_RL32("BW10")) {
s->codec_id = s->avctx->codec_id = CODEC_ID_MPEG1VIDEO;
- avctx->sub_id = 1; /* indicates MPEG-1 */
} else {
exchange_uv(s); // common init reset pblocks, so we swap them here
s->swap_uv = 1; // in case of xvmc we need to swap uv for each MB
s->codec_id = s->avctx->codec_id = CODEC_ID_MPEG2VIDEO;
- avctx->sub_id = 2; /* indicates MPEG-2 */
}
s1->save_width = s->width;
s1->save_height = s->height;
diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index b80d7c771a..51db72a177 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1659,7 +1659,6 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
avctx->channel_layout = s->nb_channels == 1 ? AV_CH_LAYOUT_MONO : AV_CH_LAYOUT_STEREO;
if (!avctx->bit_rate)
avctx->bit_rate = s->bit_rate;
- avctx->sub_id = s->layer;
if (s->frame_size <= 0 || s->frame_size > buf_size) {
av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
@@ -1732,7 +1731,6 @@ static int decode_frame_adu(AVCodecContext *avctx, void *data,
avctx->channels = s->nb_channels;
if (!avctx->bit_rate)
avctx->bit_rate = s->bit_rate;
- avctx->sub_id = s->layer;
s->frame_size = len;
diff --git a/libavcodec/mpegaudiodecheader.c b/libavcodec/mpegaudiodecheader.c
index 24919ab544..b00d804d0e 100644
--- a/libavcodec/mpegaudiodecheader.c
+++ b/libavcodec/mpegaudiodecheader.c
@@ -142,6 +142,5 @@ int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_r
*sample_rate = s->sample_rate;
*channels = s->nb_channels;
*bit_rate = s->bit_rate;
- avctx->sub_id = s->layer;
return s->frame_size;
}
diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c
index 7a905a9e3f..776052d252 100644
--- a/libavcodec/mpegvideo_parser.c
+++ b/libavcodec/mpegvideo_parser.c
@@ -69,7 +69,6 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
pc->frame_rate.num = avctx->time_base.num = avpriv_frame_rate_tab[frame_rate_index].den;
avctx->bit_rate = ((buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6))*400;
avctx->codec_id = CODEC_ID_MPEG1VIDEO;
- avctx->sub_id = 1;
}
break;
case EXT_START_CODE:
@@ -94,7 +93,6 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
avctx->time_base.den = pc->frame_rate.den * (frame_rate_ext_n + 1) * 2;
avctx->time_base.num = pc->frame_rate.num * (frame_rate_ext_d + 1);
avctx->codec_id = CODEC_ID_MPEG2VIDEO;
- avctx->sub_id = 2; /* forces MPEG2 */
}
break;
case 0x8: /* picture coding extension */
diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index 8e018c1b7f..29ad7a2e26 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -52,13 +52,11 @@
typedef struct NellyMoserEncodeContext {
AVCodecContext *avctx;
int last_frame;
- int bufsel;
- int have_saved;
DSPContext dsp;
FFTContext mdct_ctx;
DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES];
- DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN]; ///< sample buffer
+ DECLARE_ALIGNED(32, float, buf)[3 * NELLY_BUF_LEN]; ///< sample buffer
float (*opt )[NELLY_BANDS];
uint8_t (*path)[NELLY_BANDS];
} NellyMoserEncodeContext;
@@ -115,16 +113,17 @@ static const uint8_t quant_lut_offset[8] = { 0, 0, 1, 4, 11, 32, 81, 230 };
static void apply_mdct(NellyMoserEncodeContext *s)
{
- s->dsp.vector_fmul(s->in_buff, s->buf[s->bufsel], ff_sine_128, NELLY_BUF_LEN);
- s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128,
- NELLY_BUF_LEN);
+ float *in0 = s->buf;
+ float *in1 = s->buf + NELLY_BUF_LEN;
+ float *in2 = s->buf + 2 * NELLY_BUF_LEN;
+
+ s->dsp.vector_fmul (s->in_buff, in0, ff_sine_128, NELLY_BUF_LEN);
+ s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN);
s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff);
- s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN,
- ff_sine_128, NELLY_BUF_LEN);
- s->dsp.vector_fmul_reverse(s->buf[s->bufsel] + 2 * NELLY_BUF_LEN, s->buf[1 - s->bufsel], ff_sine_128,
- NELLY_BUF_LEN);
- s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN);
+ s->dsp.vector_fmul (s->in_buff, in1, ff_sine_128, NELLY_BUF_LEN);
+ s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN);
+ s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->in_buff);
}
static av_cold int encode_end(AVCodecContext *avctx)
@@ -161,6 +160,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
}
avctx->frame_size = NELLY_SAMPLES;
+ avctx->delay = NELLY_BUF_LEN;
s->avctx = avctx;
if ((ret = ff_mdct_init(&s->mdct_ctx, 8, 0, 32768.0)) < 0)
goto error;
@@ -363,38 +363,33 @@ static void encode_block(NellyMoserEncodeContext *s, unsigned char *output, int
}
flush_put_bits(&pb);
+ memset(put_bits_ptr(&pb), 0, output + output_size - put_bits_ptr(&pb));
}
static int encode_frame(AVCodecContext *avctx, uint8_t *frame, int buf_size, void *data)
{
NellyMoserEncodeContext *s = avctx->priv_data;
const float *samples = data;
- int i;
if (s->last_frame)
return 0;
+ memcpy(s->buf, s->buf + NELLY_SAMPLES, NELLY_BUF_LEN * sizeof(*s->buf));
if (data) {
- memcpy(s->buf[s->bufsel], samples, avctx->frame_size * sizeof(*samples));
- for (i = avctx->frame_size; i < NELLY_SAMPLES; i++) {
- s->buf[s->bufsel][i] = 0;
- }
- s->bufsel = 1 - s->bufsel;
- if (!s->have_saved) {
- s->have_saved = 1;
- return 0;
+ memcpy(s->buf + NELLY_BUF_LEN, samples, avctx->frame_size * sizeof(*s->buf));
+ if (avctx->frame_size < NELLY_SAMPLES) {
+ memset(s->buf + NELLY_BUF_LEN + avctx->frame_size, 0,
+ (NELLY_SAMPLES - avctx->frame_size) * sizeof(*s->buf));
+ if (avctx->frame_size >= NELLY_BUF_LEN)
+ s->last_frame = 1;
}
} else {
- memset(s->buf[s->bufsel], 0, sizeof(s->buf[0][0]) * NELLY_BUF_LEN);
- s->bufsel = 1 - s->bufsel;
+ memset(s->buf + NELLY_BUF_LEN, 0, NELLY_SAMPLES * sizeof(*s->buf));
s->last_frame = 1;
}
- if (s->have_saved) {
- encode_block(s, frame, buf_size);
- return NELLY_BLOCK_LEN;
- }
- return 0;
+ encode_block(s, frame, buf_size);
+ return NELLY_BLOCK_LEN;
}
AVCodec ff_nellymoser_encoder = {
diff --git a/libavcodec/options.c b/libavcodec/options.c
index 04277dba4d..9aae191150 100644
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@@ -111,7 +111,9 @@ static const AVOption options[]={
{"noout", "skip bitstream encoding", 0, AV_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_NO_OUTPUT }, INT_MIN, INT_MAX, V|E, "flags2"},
{"local_header", "place global headers at every keyframe instead of in extradata", 0, AV_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_LOCAL_HEADER }, INT_MIN, INT_MAX, V|E, "flags2"},
{"showall", "Show all frames before the first keyframe", 0, AV_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_SHOW_ALL }, INT_MIN, INT_MAX, V|D, "flags2"},
+#if FF_API_SUB_ID
{"sub_id", NULL, OFFSET(sub_id), AV_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX},
+#endif
{"me_method", "set motion estimation method", OFFSET(me_method), AV_OPT_TYPE_INT, {.dbl = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method"},
{"zero", "zero motion estimation (fastest)", 0, AV_OPT_TYPE_CONST, {.dbl = ME_ZERO }, INT_MIN, INT_MAX, V|E, "me_method" },
{"full", "full motion estimation (slowest)", 0, AV_OPT_TYPE_CONST, {.dbl = ME_FULL }, INT_MIN, INT_MAX, V|E, "me_method" },
diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c
index 7920d65ed1..16e64d1d08 100644
--- a/libavcodec/proresenc_kostya.c
+++ b/libavcodec/proresenc_kostya.c
@@ -42,6 +42,67 @@ enum {
PRORES_PROFILE_HQ,
};
+enum {
+ QUANT_MAT_PROXY = 0,
+ QUANT_MAT_LT,
+ QUANT_MAT_STANDARD,
+ QUANT_MAT_HQ,
+ QUANT_MAT_DEFAULT,
+};
+
+static const uint8_t prores_quant_matrices[][64] = {
+ { // proxy
+ 4, 7, 9, 11, 13, 14, 15, 63,
+ 7, 7, 11, 12, 14, 15, 63, 63,
+ 9, 11, 13, 14, 15, 63, 63, 63,
+ 11, 11, 13, 14, 63, 63, 63, 63,
+ 11, 13, 14, 63, 63, 63, 63, 63,
+ 13, 14, 63, 63, 63, 63, 63, 63,
+ 13, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ },
+ { // LT
+ 4, 5, 6, 7, 9, 11, 13, 15,
+ 5, 5, 7, 8, 11, 13, 15, 17,
+ 6, 7, 9, 11, 13, 15, 15, 17,
+ 7, 7, 9, 11, 13, 15, 17, 19,
+ 7, 9, 11, 13, 14, 16, 19, 23,
+ 9, 11, 13, 14, 16, 19, 23, 29,
+ 9, 11, 13, 15, 17, 21, 28, 35,
+ 11, 13, 16, 17, 21, 28, 35, 41,
+ },
+ { // standard
+ 4, 4, 5, 5, 6, 7, 7, 9,
+ 4, 4, 5, 6, 7, 7, 9, 9,
+ 5, 5, 6, 7, 7, 9, 9, 10,
+ 5, 5, 6, 7, 7, 9, 9, 10,
+ 5, 6, 7, 7, 8, 9, 10, 12,
+ 6, 7, 7, 8, 9, 10, 12, 15,
+ 6, 7, 7, 9, 10, 11, 14, 17,
+ 7, 7, 9, 10, 11, 14, 17, 21,
+ },
+ { // high quality
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 5,
+ 4, 4, 4, 4, 4, 4, 5, 5,
+ 4, 4, 4, 4, 4, 5, 5, 6,
+ 4, 4, 4, 4, 5, 5, 6, 7,
+ 4, 4, 4, 4, 5, 6, 7, 7,
+ },
+ { // codec default
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+};
+
#define NUM_MB_LIMITS 4
static const int prores_mb_limits[NUM_MB_LIMITS] = {
1620, // up to 720x576
@@ -56,7 +117,7 @@ static const struct prores_profile {
int min_quant;
int max_quant;
int br_tab[NUM_MB_LIMITS];
- uint8_t quant[64];
+ int quant;
} prores_profile_info[4] = {
{
.full_name = "proxy",
@@ -64,16 +125,7 @@ static const struct prores_profile {
.min_quant = 4,
.max_quant = 8,
.br_tab = { 300, 242, 220, 194 },
- .quant = {
- 4, 7, 9, 11, 13, 14, 15, 63,
- 7, 7, 11, 12, 14, 15, 63, 63,
- 9, 11, 13, 14, 15, 63, 63, 63,
- 11, 11, 13, 14, 63, 63, 63, 63,
- 11, 13, 14, 63, 63, 63, 63, 63,
- 13, 14, 63, 63, 63, 63, 63, 63,
- 13, 63, 63, 63, 63, 63, 63, 63,
- 63, 63, 63, 63, 63, 63, 63, 63,
- },
+ .quant = QUANT_MAT_PROXY,
},
{
.full_name = "LT",
@@ -81,16 +133,7 @@ static const struct prores_profile {
.min_quant = 1,
.max_quant = 9,
.br_tab = { 720, 560, 490, 440 },
- .quant = {
- 4, 5, 6, 7, 9, 11, 13, 15,
- 5, 5, 7, 8, 11, 13, 15, 17,
- 6, 7, 9, 11, 13, 15, 15, 17,
- 7, 7, 9, 11, 13, 15, 17, 19,
- 7, 9, 11, 13, 14, 16, 19, 23,
- 9, 11, 13, 14, 16, 19, 23, 29,
- 9, 11, 13, 15, 17, 21, 28, 35,
- 11, 13, 16, 17, 21, 28, 35, 41,
- },
+ .quant = QUANT_MAT_LT,
},
{
.full_name = "standard",
@@ -98,16 +141,7 @@ static const struct prores_profile {
.min_quant = 1,
.max_quant = 6,
.br_tab = { 1050, 808, 710, 632 },
- .quant = {
- 4, 4, 5, 5, 6, 7, 7, 9,
- 4, 4, 5, 6, 7, 7, 9, 9,
- 5, 5, 6, 7, 7, 9, 9, 10,
- 5, 5, 6, 7, 7, 9, 9, 10,
- 5, 6, 7, 7, 8, 9, 10, 12,
- 6, 7, 7, 8, 9, 10, 12, 15,
- 6, 7, 7, 9, 10, 11, 14, 17,
- 7, 7, 9, 10, 11, 14, 17, 21,
- },
+ .quant = QUANT_MAT_STANDARD,
},
{
.full_name = "high quality",
@@ -115,16 +149,7 @@ static const struct prores_profile {
.min_quant = 1,
.max_quant = 6,
.br_tab = { 1566, 1216, 1070, 950 },
- .quant = {
- 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 5,
- 4, 4, 4, 4, 4, 4, 5, 5,
- 4, 4, 4, 4, 4, 5, 5, 6,
- 4, 4, 4, 4, 5, 5, 6, 7,
- 4, 4, 4, 4, 5, 6, 7, 7,
- },
+ .quant = QUANT_MAT_HQ,
}
// for 4444 profile bitrate numbers are { 2350, 1828, 1600, 1425 }
};
@@ -147,6 +172,7 @@ typedef struct ProresContext {
DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
int16_t quants[MAX_STORED_Q][64];
int16_t custom_q[64];
+ const uint8_t *quant_mat;
ProresDSPContext dsp;
ScanTable scantable;
@@ -159,6 +185,9 @@ typedef struct ProresContext {
int num_planes;
int bits_per_mb;
+ char *vendor;
+ int quant_sel;
+
int frame_size;
int profile;
@@ -373,7 +402,7 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
} else {
qmat = ctx->custom_q;
for (i = 0; i < 64; i++)
- qmat[i] = ctx->profile_info->quant[i] * quant;
+ qmat[i] = ctx->quant_mat[i] * quant;
}
for (i = 0; i < ctx->num_planes; i++) {
@@ -591,7 +620,7 @@ static int find_slice_quant(AVCodecContext *avctx, const AVFrame *pic,
} else {
qmat = ctx->custom_q;
for (i = 0; i < 64; i++)
- qmat[i] = ctx->profile_info->quant[i] * q;
+ qmat[i] = ctx->quant_mat[i] * q;
}
for (i = 0; i < ctx->num_planes; i++) {
bits += estimate_slice_plane(ctx, &error, i,
@@ -684,7 +713,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
tmp = buf;
buf += 2; // frame header size will be stored here
bytestream_put_be16 (&buf, 0); // version 1
- bytestream_put_buffer(&buf, "Lavc", 4); // creator
+ bytestream_put_buffer(&buf, ctx->vendor, 4);
bytestream_put_be16 (&buf, avctx->width);
bytestream_put_be16 (&buf, avctx->height);
bytestream_put_byte (&buf, ctx->chroma_factor << 6); // frame flags
@@ -694,13 +723,17 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
bytestream_put_byte (&buf, avctx->colorspace);
bytestream_put_byte (&buf, 0x40); // source format and alpha information
bytestream_put_byte (&buf, 0); // reserved
- bytestream_put_byte (&buf, 0x03); // matrix flags - both matrices are present
- // luma quantisation matrix
- for (i = 0; i < 64; i++)
- bytestream_put_byte(&buf, ctx->profile_info->quant[i]);
- // chroma quantisation matrix
- for (i = 0; i < 64; i++)
- bytestream_put_byte(&buf, ctx->profile_info->quant[i]);
+ if (ctx->quant_sel != QUANT_MAT_DEFAULT) {
+ bytestream_put_byte (&buf, 0x03); // matrix flags - both matrices are present
+ // luma quantisation matrix
+ for (i = 0; i < 64; i++)
+ bytestream_put_byte(&buf, ctx->quant_mat[i]);
+ // chroma quantisation matrix
+ for (i = 0; i < 64; i++)
+ bytestream_put_byte(&buf, ctx->quant_mat[i]);
+ } else {
+ bytestream_put_byte (&buf, 0x00); // matrix flags - default matrices are used
+ }
bytestream_put_be16 (&tmp, buf - orig_buf); // write back frame header size
// picture header
@@ -816,10 +849,25 @@ static av_cold int encode_init(AVCodecContext *avctx)
ctx->slices_width += av_popcount(ctx->mb_width - ctx->slices_width * mps);
ctx->num_slices = ctx->mb_height * ctx->slices_width;
- for (i = 0; i < NUM_MB_LIMITS - 1; i++)
- if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height)
- break;
- ctx->bits_per_mb = ctx->profile_info->br_tab[i];
+ if (ctx->quant_sel == -1)
+ ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant];
+ else
+ ctx->quant_mat = prores_quant_matrices[ctx->quant_sel];
+
+ if (strlen(ctx->vendor) != 4) {
+ av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ if (!ctx->bits_per_mb) {
+ for (i = 0; i < NUM_MB_LIMITS - 1; i++)
+ if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height)
+ break;
+ ctx->bits_per_mb = ctx->profile_info->br_tab[i];
+ } else if (ctx->bits_per_mb < 128) {
+ av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n");
+ return AVERROR_INVALIDDATA;
+ }
ctx->frame_size = ctx->num_slices * (2 + 2 * ctx->num_planes
+ (2 * mps * ctx->bits_per_mb) / 8)
@@ -829,7 +877,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
max_quant = ctx->profile_info->max_quant;
for (i = min_quant; i < MAX_STORED_Q; i++) {
for (j = 0; j < 64; j++)
- ctx->quants[i][j] = ctx->profile_info->quant[j] * i;
+ ctx->quants[i][j] = ctx->quant_mat[j] * i;
}
avctx->codec_tag = ctx->profile_info->tag;
@@ -877,6 +925,24 @@ static const AVOption options[] = {
0, 0, VE, "profile" },
{ "hq", NULL, 0, AV_OPT_TYPE_CONST, { PRORES_PROFILE_HQ },
0, 0, VE, "profile" },
+ { "vendor", "vendor ID", OFFSET(vendor),
+ AV_OPT_TYPE_STRING, { .str = "Lavc" }, CHAR_MIN, CHAR_MAX, VE },
+ { "bits_per_mb", "desired bits per macroblock", OFFSET(bits_per_mb),
+ AV_OPT_TYPE_INT, { 0 }, 0, 8192, VE },
+ { "quant_mat", "quantiser matrix", OFFSET(quant_sel), AV_OPT_TYPE_INT,
+ { -1 }, -1, QUANT_MAT_DEFAULT, VE, "quant_mat" },
+ { "auto", NULL, 0, AV_OPT_TYPE_CONST, { -1 },
+ 0, 0, VE, "quant_mat" },
+ { "proxy", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_PROXY },
+ 0, 0, VE, "quant_mat" },
+ { "lt", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_LT },
+ 0, 0, VE, "quant_mat" },
+ { "standard", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_STANDARD },
+ 0, 0, VE, "quant_mat" },
+ { "hq", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_HQ },
+ 0, 0, VE, "quant_mat" },
+ { "default", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_DEFAULT },
+ 0, 0, VE, "quant_mat" },
{ NULL }
};
diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index 8b63288adc..c4e8aab952 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -416,7 +416,6 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src,
int err = 0;
if (dst != src) {
- dst->sub_id = src->sub_id;
dst->time_base = src->time_base;
dst->width = src->width;
dst->height = src->height;
diff --git a/libavcodec/qpeg.c b/libavcodec/qpeg.c
index 2d5ae690ca..a0ddfae647 100644
--- a/libavcodec/qpeg.c
+++ b/libavcodec/qpeg.c
@@ -25,15 +25,17 @@
*/
#include "avcodec.h"
+#include "bytestream.h"
typedef struct QpegContext{
AVCodecContext *avctx;
AVFrame pic, ref;
uint32_t pal[256];
+ GetByteContext buffer;
} QpegContext;
-static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
- int stride, int width, int height)
+static void qpeg_decode_intra(QpegContext *qctx, uint8_t *dst,
+ int stride, int width, int height)
{
int i;
int code;
@@ -46,31 +48,26 @@ static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
height--;
dst = dst + height * stride;
- while((size > 0) && (rows_to_go > 0)) {
- code = *src++;
- size--;
+ while ((bytestream2_get_bytes_left(&qctx->buffer) > 0) && (rows_to_go > 0)) {
+ code = bytestream2_get_byte(&qctx->buffer);
run = copy = 0;
if(code == 0xFC) /* end-of-picture code */
break;
if(code >= 0xF8) { /* very long run */
- c0 = *src++;
- c1 = *src++;
- size -= 2;
+ c0 = bytestream2_get_byte(&qctx->buffer);
+ c1 = bytestream2_get_byte(&qctx->buffer);
run = ((code & 0x7) << 16) + (c0 << 8) + c1 + 2;
} else if (code >= 0xF0) { /* long run */
- c0 = *src++;
- size--;
+ c0 = bytestream2_get_byte(&qctx->buffer);
run = ((code & 0xF) << 8) + c0 + 2;
} else if (code >= 0xE0) { /* short run */
run = (code & 0x1F) + 2;
} else if (code >= 0xC0) { /* very long copy */
- c0 = *src++;
- c1 = *src++;
- size -= 2;
+ c0 = bytestream2_get_byte(&qctx->buffer);
+ c1 = bytestream2_get_byte(&qctx->buffer);
copy = ((code & 0x3F) << 16) + (c0 << 8) + c1 + 1;
} else if (code >= 0x80) { /* long copy */
- c0 = *src++;
- size--;
+ c0 = bytestream2_get_byte(&qctx->buffer);
copy = ((code & 0x7F) << 8) + c0 + 1;
} else { /* short copy */
copy = code + 1;
@@ -80,8 +77,7 @@ static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
if(run) {
int p;
- p = *src++;
- size--;
+ p = bytestream2_get_byte(&qctx->buffer);
for(i = 0; i < run; i++) {
dst[filled++] = p;
if (filled >= width) {
@@ -93,11 +89,8 @@ static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
}
}
} else {
- size -= copy;
- if (size<0)
- return AVERROR_INVALIDDATA;
for(i = 0; i < copy; i++) {
- dst[filled++] = *src++;
+ dst[filled++] = bytestream2_get_byte(&qctx->buffer);
if (filled >= width) {
filled = 0;
dst -= stride;
@@ -108,7 +101,6 @@ static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size,
}
}
}
- return 0;
}
static const int qpeg_table_h[16] =
@@ -117,9 +109,10 @@ static const int qpeg_table_w[16] =
{ 0x00, 0x20, 0x18, 0x08, 0x18, 0x10, 0x20, 0x10, 0x08, 0x10, 0x20, 0x20, 0x08, 0x10, 0x18, 0x04};
/* Decodes delta frames */
-static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
- int stride, int width, int height,
- int delta, const uint8_t *ctable, uint8_t *refdata)
+static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
+ int stride, int width, int height,
+ int delta, const uint8_t *ctable,
+ uint8_t *refdata)
{
int i, j;
int code;
@@ -137,13 +130,12 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
height--;
dst = dst + height * stride;
- while((size > 0) && (height >= 0)) {
- code = *src++;
- size--;
+ while ((bytestream2_get_bytes_left(&qctx->buffer) > 0) && (height >= 0)) {
+ code = bytestream2_get_byte(&qctx->buffer);
if(delta) {
/* motion compensation */
- while(size > 0 && (code & 0xF0) == 0xF0) {
+ while(bytestream2_get_bytes_left(&qctx->buffer) > 0 && (code & 0xF0) == 0xF0) {
if(delta == 1) {
int me_idx;
int me_w, me_h, me_x, me_y;
@@ -156,8 +148,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
me_h = qpeg_table_h[me_idx];
/* extract motion vector */
- corr = *src++;
- size--;
+ corr = bytestream2_get_byte(&qctx->buffer);
val = corr >> 4;
if(val > 7)
@@ -184,8 +175,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
}
}
}
- code = *src++;
- size--;
+ code = bytestream2_get_byte(&qctx->buffer);
}
}
@@ -195,8 +185,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
int p;
code &= 0x1F;
- p = *src++;
- size--;
+ p = bytestream2_get_byte(&qctx->buffer);
for(i = 0; i <= code; i++) {
dst[filled++] = p;
if(filled >= width) {
@@ -210,11 +199,11 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
} else if(code >= 0xC0) { /* copy code: 0xC0..0xDF */
code &= 0x1F;
- if(code + 1 > size)
+ if(code + 1 > bytestream2_get_bytes_left(&qctx->buffer))
break;
for(i = 0; i <= code; i++) {
- dst[filled++] = *src++;
+ dst[filled++] = bytestream2_get_byte(&qctx->buffer);
if(filled >= width) {
filled = 0;
dst -= stride;
@@ -223,18 +212,17 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
break;
}
}
- size -= code + 1;
} else if(code >= 0x80) { /* skip code: 0x80..0xBF */
int skip;
code &= 0x3F;
/* codes 0x80 and 0x81 are actually escape codes,
skip value minus constant is in the next byte */
- if(!code) {
- skip = (*src++) + 64; size--;
- } else if(code == 1) {
- skip = (*src++) + 320; size--;
- } else
+ if(!code)
+ skip = bytestream2_get_byte(&qctx->buffer) + 64;
+ else if(code == 1)
+ skip = bytestream2_get_byte(&qctx->buffer) + 320;
+ else
skip = code;
filled += skip;
while( filled >= width) {
@@ -246,8 +234,9 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size,
}
} else {
/* zero code treated as one-pixel skip */
- if(code)
+ if(code) {
dst[filled++] = ctable[code & 0x7F];
+ }
else
filled++;
if(filled >= width) {
@@ -263,8 +252,7 @@ static int decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
AVPacket *avpkt)
{
- const uint8_t *buf = avpkt->data;
- int buf_size = avpkt->size;
+ uint8_t ctable[128];
QpegContext * const a = avctx->priv_data;
AVFrame * p = &a->pic;
AVFrame * ref= &a->ref;
@@ -272,6 +260,13 @@ static int decode_frame(AVCodecContext *avctx,
int delta, ret = 0;
const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
+ if (avpkt->size < 0x86) {
+ av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
+ return AVERROR_INVALIDDATA;
+ }
+
+ bytestream2_init(&a->buffer, avpkt->data, avpkt->size);
+
if(ref->data[0])
avctx->release_buffer(avctx, ref);
FFSWAP(AVFrame, *ref, *p);
@@ -282,16 +277,17 @@ static int decode_frame(AVCodecContext *avctx,
return -1;
}
outdata = a->pic.data[0];
- if(buf[0x85] == 0x10) {
- ret = qpeg_decode_intra(buf+0x86, outdata, buf_size - 0x86, a->pic.linesize[0], avctx->width, avctx->height);
+ bytestream2_skip(&a->buffer, 4);
+ bytestream2_get_buffer(&a->buffer, ctable, 128);
+ bytestream2_skip(&a->buffer, 1);
+
+ delta = bytestream2_get_byte(&a->buffer);
+ if(delta == 0x10) {
+ qpeg_decode_intra(a, outdata, a->pic.linesize[0], avctx->width, avctx->height);
} else {
- delta = buf[0x85];
- qpeg_decode_inter(buf+0x86, outdata, buf_size - 0x86, a->pic.linesize[0], avctx->width, avctx->height, delta, buf + 4, a->ref.data[0]);
+ qpeg_decode_inter(a, outdata, a->pic.linesize[0], avctx->width, avctx->height, delta, ctable, a->ref.data[0]);
}
- if (ret<0)
- return ret;
-
/* make the palette available on the way out */
if (pal) {
a->pic.palette_has_changed = 1;
@@ -302,7 +298,7 @@ static int decode_frame(AVCodecContext *avctx,
*data_size = sizeof(AVFrame);
*(AVFrame*)data = a->pic;
- return buf_size;
+ return avpkt->size;
}
static av_cold int decode_init(AVCodecContext *avctx){
diff --git a/libavcodec/ra144.h b/libavcodec/ra144.h
index 9665534f7b..83c0899cc8 100644
--- a/libavcodec/ra144.h
+++ b/libavcodec/ra144.h
@@ -36,6 +36,7 @@ typedef struct {
AVCodecContext *avctx;
AVFrame frame;
LPCContext lpc_ctx;
+ int last_frame;
unsigned int old_energy; ///< previous frame energy
diff --git a/libavcodec/ra144enc.c b/libavcodec/ra144enc.c
index b3710e871b..caa7d16b30 100644
--- a/libavcodec/ra144enc.c
+++ b/libavcodec/ra144enc.c
@@ -53,6 +53,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx)
return -1;
}
avctx->frame_size = NBLOCKS * BLOCKSIZE;
+ avctx->delay = avctx->frame_size;
avctx->bit_rate = 8000;
ractx = avctx->priv_data;
ractx->lpc_coef[0] = ractx->lpc_tables[0];
@@ -433,7 +434,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
{
static const uint8_t sizes[LPC_ORDER] = {64, 32, 32, 16, 16, 8, 8, 8, 8, 4};
static const uint8_t bit_sizes[LPC_ORDER] = {6, 5, 5, 4, 4, 3, 3, 3, 3, 2};
- RA144Context *ractx;
+ RA144Context *ractx = avctx->priv_data;
PutBitContext pb;
int32_t lpc_data[NBLOCKS * BLOCKSIZE];
int32_t lpc_coefs[LPC_ORDER][MAX_LPC_ORDER];
@@ -445,11 +446,13 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
int energy = 0;
int i, idx;
+ if (ractx->last_frame)
+ return 0;
+
if (buf_size < FRAMESIZE) {
av_log(avctx, AV_LOG_ERROR, "output buffer too small\n");
return 0;
}
- ractx = avctx->priv_data;
/**
* Since the LPC coefficients are calculated on a frame centered over the
@@ -462,11 +465,15 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
lpc_data[i] = ractx->curr_block[BLOCKSIZE + BLOCKSIZE / 2 + i];
energy += (lpc_data[i] * lpc_data[i]) >> 4;
}
- for (i = 2 * BLOCKSIZE + BLOCKSIZE / 2; i < NBLOCKS * BLOCKSIZE; i++) {
- lpc_data[i] = *((int16_t *)data + i - 2 * BLOCKSIZE - BLOCKSIZE / 2) >>
- 2;
- energy += (lpc_data[i] * lpc_data[i]) >> 4;
+ if (data) {
+ int j;
+ for (j = 0; j < avctx->frame_size && i < NBLOCKS * BLOCKSIZE; i++, j++) {
+ lpc_data[i] = samples[j] >> 2;
+ energy += (lpc_data[i] * lpc_data[i]) >> 4;
+ }
}
+ if (i < NBLOCKS * BLOCKSIZE)
+ memset(&lpc_data[i], 0, (NBLOCKS * BLOCKSIZE - i) * sizeof(*lpc_data));
energy = ff_energy_tab[quantize(ff_t_sqrt(energy >> 5) >> 10, ff_energy_tab,
32)];
@@ -515,8 +522,17 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
ractx->old_energy = energy;
ractx->lpc_refl_rms[1] = ractx->lpc_refl_rms[0];
FFSWAP(unsigned int *, ractx->lpc_coef[0], ractx->lpc_coef[1]);
- for (i = 0; i < NBLOCKS * BLOCKSIZE; i++)
- ractx->curr_block[i] = samples[i] >> 2;
+
+ /* copy input samples to current block for processing in next call */
+ i = 0;
+ if (data) {
+ for (; i < avctx->frame_size; i++)
+ ractx->curr_block[i] = samples[i] >> 2;
+ } else
+ ractx->last_frame = 1;
+ memset(&ractx->curr_block[i], 0,
+ (NBLOCKS * BLOCKSIZE - i) * sizeof(*ractx->curr_block));
+
return FRAMESIZE;
}
@@ -529,6 +545,7 @@ AVCodec ff_ra_144_encoder = {
.init = ra144_encode_init,
.encode = ra144_encode_frame,
.close = ra144_encode_close,
+ .capabilities = CODEC_CAP_DELAY | CODEC_CAP_SMALL_LAST_FRAME,
.sample_fmts = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
AV_SAMPLE_FMT_NONE },
.long_name = NULL_IF_CONFIG_SMALL("RealAudio 1.0 (14.4K)"),
diff --git a/libavcodec/rv10.c b/libavcodec/rv10.c
index bc86b69a2e..ab382ac7bc 100644
--- a/libavcodec/rv10.c
+++ b/libavcodec/rv10.c
@@ -40,6 +40,11 @@
#define DC_VLC_BITS 14 //FIXME find a better solution
+typedef struct RVDecContext {
+ MpegEncContext m;
+ int sub_id;
+} RVDecContext;
+
static const uint16_t rv_lum_code[256] =
{
0x3e7f, 0x0f00, 0x0f01, 0x0f02, 0x0f03, 0x0f04, 0x0f05, 0x0f06,
@@ -293,8 +298,9 @@ static int rv10_decode_picture_header(MpegEncContext *s)
return mb_count;
}
-static int rv20_decode_picture_header(MpegEncContext *s)
+static int rv20_decode_picture_header(RVDecContext *rv)
{
+ MpegEncContext *s = &rv->m;
int seq, mb_pos, i;
int rpr_bits;
@@ -342,10 +348,10 @@ static int rv20_decode_picture_header(MpegEncContext *s)
return -1;
}
- if(RV_GET_MINOR_VER(s->avctx->sub_id) >= 2)
+ if(RV_GET_MINOR_VER(rv->sub_id) >= 2)
s->loop_filter = get_bits1(&s->gb);
- if(RV_GET_MINOR_VER(s->avctx->sub_id) <= 1)
+ if(RV_GET_MINOR_VER(rv->sub_id) <= 1)
seq = get_bits(&s->gb, 8) << 7;
else
seq = get_bits(&s->gb, 13) << 2;
@@ -410,7 +416,7 @@ static int rv20_decode_picture_header(MpegEncContext *s)
av_log(s->avctx, AV_LOG_DEBUG, "\n");*/
s->no_rounding= get_bits1(&s->gb);
- if(RV_GET_MINOR_VER(s->avctx->sub_id) <= 1 && s->pict_type == AV_PICTURE_TYPE_B)
+ if(RV_GET_MINOR_VER(rv->sub_id) <= 1 && s->pict_type == AV_PICTURE_TYPE_B)
skip_bits(&s->gb, 5); // binary decoder reads 3+2 bits here but they don't seem to be used
s->f_code = 1;
@@ -435,7 +441,8 @@ av_log(s->avctx, AV_LOG_DEBUG, "\n");*/
static av_cold int rv10_decode_init(AVCodecContext *avctx)
{
- MpegEncContext *s = avctx->priv_data;
+ RVDecContext *rv = avctx->priv_data;
+ MpegEncContext *s = &rv->m;
static int done=0;
int major_ver, minor_ver, micro_ver;
@@ -454,11 +461,11 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
s->orig_height= s->height = avctx->coded_height;
s->h263_long_vectors= ((uint8_t*)avctx->extradata)[3] & 1;
- avctx->sub_id= AV_RB32((uint8_t*)avctx->extradata + 4);
+ rv->sub_id = AV_RB32((uint8_t*)avctx->extradata + 4);
- major_ver = RV_GET_MAJOR_VER(avctx->sub_id);
- minor_ver = RV_GET_MINOR_VER(avctx->sub_id);
- micro_ver = RV_GET_MICRO_VER(avctx->sub_id);
+ major_ver = RV_GET_MAJOR_VER(rv->sub_id);
+ minor_ver = RV_GET_MINOR_VER(rv->sub_id);
+ micro_ver = RV_GET_MICRO_VER(rv->sub_id);
s->low_delay = 1;
switch (major_ver) {
@@ -473,13 +480,13 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
}
break;
default:
- av_log(s->avctx, AV_LOG_ERROR, "unknown header %X\n", avctx->sub_id);
+ av_log(s->avctx, AV_LOG_ERROR, "unknown header %X\n", rv->sub_id);
av_log_missing_feature(avctx, "RV1/2 version", 1);
return AVERROR_PATCHWELCOME;
}
if(avctx->debug & FF_DEBUG_PICT_INFO){
- av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", avctx->sub_id, avctx->extradata_size >= 4 ? ((uint32_t*)avctx->extradata)[0] : -1);
+ av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", rv->sub_id, avctx->extradata_size >= 4 ? ((uint32_t*)avctx->extradata)[0] : -1);
}
avctx->pix_fmt = PIX_FMT_YUV420P;
@@ -514,7 +521,8 @@ static av_cold int rv10_decode_end(AVCodecContext *avctx)
static int rv10_decode_packet(AVCodecContext *avctx,
const uint8_t *buf, int buf_size, int buf_size2)
{
- MpegEncContext *s = avctx->priv_data;
+ RVDecContext *rv = avctx->priv_data;
+ MpegEncContext *s = &rv->m;
int mb_count, mb_pos, left, start_mb_x, active_bits_size;
active_bits_size = buf_size * 8;
@@ -522,7 +530,7 @@ static int rv10_decode_packet(AVCodecContext *avctx,
if(s->codec_id ==CODEC_ID_RV10)
mb_count = rv10_decode_picture_header(s);
else
- mb_count = rv20_decode_picture_header(s);
+ mb_count = rv20_decode_picture_header(rv);
if (mb_count < 0) {
av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n");
return -1;
@@ -733,7 +741,7 @@ AVCodec ff_rv10_decoder = {
.name = "rv10",
.type = AVMEDIA_TYPE_VIDEO,
.id = CODEC_ID_RV10,
- .priv_data_size = sizeof(MpegEncContext),
+ .priv_data_size = sizeof(RVDecContext),
.init = rv10_decode_init,
.close = rv10_decode_end,
.decode = rv10_decode_frame,
@@ -747,7 +755,7 @@ AVCodec ff_rv20_decoder = {
.name = "rv20",
.type = AVMEDIA_TYPE_VIDEO,
.id = CODEC_ID_RV20,
- .priv_data_size = sizeof(MpegEncContext),
+ .priv_data_size = sizeof(RVDecContext),
.init = rv10_decode_init,
.close = rv10_decode_end,
.decode = rv10_decode_frame,
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 47e29c9151..6c49905e65 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -118,12 +118,12 @@ static void avcodec_init(void)
ff_dsputil_static_init();
}
-static av_always_inline int codec_is_encoder(AVCodec *codec)
+int av_codec_is_encoder(AVCodec *codec)
{
return codec && (codec->encode || codec->encode2);
}
-static av_always_inline int codec_is_decoder(AVCodec *codec)
+int av_codec_is_decoder(AVCodec *codec)
{
return codec && codec->decode;
}
@@ -798,7 +798,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
/* if the decoder init function was already called previously,
free the already allocated subtitle_header before overwriting it */
- if (codec_is_decoder(codec))
+ if (av_codec_is_decoder(codec))
av_freep(&avctx->subtitle_header);
#define SANE_NB_CHANNELS 128U
@@ -845,7 +845,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
ret = AVERROR(EINVAL);
goto free_and_end;
}
- if (codec_is_encoder(avctx->codec)) {
+ if (av_codec_is_encoder(avctx->codec)) {
int i;
if (avctx->codec->sample_fmts) {
for (i = 0; avctx->codec->sample_fmts[i] != AV_SAMPLE_FMT_NONE; i++)
@@ -914,7 +914,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
}
}
- if (codec_is_decoder(avctx->codec) && !avctx->bit_rate)
+ if (av_codec_is_decoder(avctx->codec) && !avctx->bit_rate)
avctx->bit_rate = get_bit_rate(avctx);
ret=0;
@@ -1527,7 +1527,7 @@ av_cold int avcodec_close(AVCodecContext *avctx)
av_opt_free(avctx->priv_data);
av_opt_free(avctx);
av_freep(&avctx->priv_data);
- if (codec_is_encoder(avctx->codec))
+ if (av_codec_is_encoder(avctx->codec))
av_freep(&avctx->extradata);
avctx->codec = NULL;
avctx->active_thread_type = 0;
@@ -1556,7 +1556,7 @@ AVCodec *avcodec_find_encoder(enum CodecID id)
p = first_avcodec;
id= remap_deprecated_codec_id(id);
while (p) {
- if (codec_is_encoder(p) && p->id == id) {
+ if (av_codec_is_encoder(p) && p->id == id) {
if (p->capabilities & CODEC_CAP_EXPERIMENTAL && !experimental) {
experimental = p;
} else
@@ -1574,7 +1574,7 @@ AVCodec *avcodec_find_encoder_by_name(const char *name)
return NULL;
p = first_avcodec;
while (p) {
- if (codec_is_encoder(p) && strcmp(name,p->name) == 0)
+ if (av_codec_is_encoder(p) && strcmp(name,p->name) == 0)
return p;
p = p->next;
}
@@ -1587,7 +1587,7 @@ AVCodec *avcodec_find_decoder(enum CodecID id)
p = first_avcodec;
id= remap_deprecated_codec_id(id);
while (p) {
- if (codec_is_decoder(p) && p->id == id) {
+ if (av_codec_is_decoder(p) && p->id == id) {
if (p->capabilities & CODEC_CAP_EXPERIMENTAL && !experimental) {
experimental = p;
} else
@@ -1605,7 +1605,7 @@ AVCodec *avcodec_find_decoder_by_name(const char *name)
return NULL;
p = first_avcodec;
while (p) {
- if (codec_is_decoder(p) && strcmp(name,p->name) == 0)
+ if (av_codec_is_decoder(p) && strcmp(name,p->name) == 0)
return p;
p = p->next;
}
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 99691f21bd..b8d2bbf2b3 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -21,7 +21,7 @@
#define AVCODEC_VERSION_H
#define LIBAVCODEC_VERSION_MAJOR 54
-#define LIBAVCODEC_VERSION_MINOR 7
+#define LIBAVCODEC_VERSION_MINOR 8
#define LIBAVCODEC_VERSION_MICRO 100
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
@@ -69,5 +69,8 @@
#ifndef FF_API_INTER_THRESHOLD
#define FF_API_INTER_THRESHOLD (LIBAVCODEC_VERSION_MAJOR < 55)
#endif
+#ifndef FF_API_SUB_ID
+#define FF_API_SUB_ID (LIBAVCODEC_VERSION_MAJOR < 55)
+#endif
#endif /* AVCODEC_VERSION_H */
diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c
index e0c8d0a193..12f21553e5 100644
--- a/libavcodec/vorbisenc.c
+++ b/libavcodec/vorbisenc.c
@@ -137,13 +137,16 @@ typedef struct {
#define RESIDUE_PART_SIZE 32
#define NUM_RESIDUE_PARTITIONS (RESIDUE_SIZE/RESIDUE_PART_SIZE)
-static inline void put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb,
- int entry)
+static inline int put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb,
+ int entry)
{
assert(entry >= 0);
assert(entry < cb->nentries);
assert(cb->lens[entry]);
+ if (pb->size_in_bits - put_bits_count(pb) < cb->lens[entry])
+ return AVERROR(EINVAL);
put_bits(pb, cb->lens[entry], cb->codewords[entry]);
+ return 0;
}
static int cb_lookup_vals(int lookup, int dimentions, int entries)
@@ -751,14 +754,16 @@ static int render_point(int x0, int y0, int x1, int y1, int x)
return y0 + (x - x0) * (y1 - y0) / (x1 - x0);
}
-static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc,
- PutBitContext *pb, uint16_t *posts,
- float *floor, int samples)
+static int floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc,
+ PutBitContext *pb, uint16_t *posts,
+ float *floor, int samples)
{
int range = 255 / fc->multiplier + 1;
int coded[MAX_FLOOR_VALUES]; // first 2 values are unused
int i, counter;
+ if (pb->size_in_bits - put_bits_count(pb) < 1 + 2 * ilog(range - 1))
+ return AVERROR(EINVAL);
put_bits(pb, 1, 1); // non zero
put_bits(pb, ilog(range - 1), posts[0]);
put_bits(pb, ilog(range - 1), posts[1]);
@@ -816,7 +821,8 @@ static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc,
cval |= l << cshift;
cshift += c->subclass;
}
- put_codeword(pb, book, cval);
+ if (put_codeword(pb, book, cval))
+ return AVERROR(EINVAL);
}
for (k = 0; k < c->dim; k++) {
int book = c->books[cval & (csub-1)];
@@ -826,12 +832,15 @@ static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc,
continue;
if (entry == -1)
entry = 0;
- put_codeword(pb, &venc->codebooks[book], entry);
+ if (put_codeword(pb, &venc->codebooks[book], entry))
+ return AVERROR(EINVAL);
}
}
ff_vorbis_floor1_render_list(fc->list, fc->values, posts, coded,
fc->multiplier, floor, samples);
+
+ return 0;
}
static float *put_vector(vorbis_enc_codebook *book, PutBitContext *pb,
@@ -852,13 +861,14 @@ static float *put_vector(vorbis_enc_codebook *book, PutBitContext *pb,
distance = d;
}
}
- put_codeword(pb, book, entry);
+ if (put_codeword(pb, book, entry))
+ return NULL;
return &book->dimentions[entry * book->ndimentions];
}
-static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
- PutBitContext *pb, float *coeffs, int samples,
- int real_ch)
+static int residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
+ PutBitContext *pb, float *coeffs, int samples,
+ int real_ch)
{
int pass, i, j, p, k;
int psize = rc->partition_size;
@@ -894,7 +904,8 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
entry *= rc->classifications;
entry += classes[j][p + i];
}
- put_codeword(pb, book, entry);
+ if (put_codeword(pb, book, entry))
+ return AVERROR(EINVAL);
}
for (i = 0; i < classwords && p < partitions; i++, p++) {
for (j = 0; j < channels; j++) {
@@ -909,8 +920,10 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
if (rc->type == 0) {
for (k = 0; k < psize; k += book->ndimentions) {
- float *a = put_vector(book, pb, &buf[k]);
int l;
+ float *a = put_vector(book, pb, &buf[k]);
+ if (!a)
+ return AVERROR(EINVAL);
for (l = 0; l < book->ndimentions; l++)
buf[k + l] -= a[l];
}
@@ -930,6 +943,8 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
}
}
pv = put_vector(book, pb, vec);
+ if (!pv)
+ return AVERROR(EINVAL);
for (dim = book->ndimentions; dim--; ) {
coeffs[a1 + b1] -= *pv++;
if ((a1 += samples) == s) {
@@ -943,6 +958,7 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
}
}
}
+ return 0;
}
static int apply_window_and_mdct(vorbis_enc_context *venc, const signed short *audio,
@@ -1016,6 +1032,11 @@ static int vorbis_encode_frame(AVCodecContext *avccontext,
init_put_bits(&pb, packets, buf_size);
+ if (pb.size_in_bits - put_bits_count(&pb) < 1 + ilog(venc->nmodes - 1)) {
+ av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n");
+ return AVERROR(EINVAL);
+ }
+
put_bits(&pb, 1, 0); // magic bit
put_bits(&pb, ilog(venc->nmodes - 1), 0); // 0 bits, the mode
@@ -1031,7 +1052,10 @@ static int vorbis_encode_frame(AVCodecContext *avccontext,
vorbis_enc_floor *fc = &venc->floors[mapping->floor[mapping->mux[i]]];
uint16_t posts[MAX_FLOOR_VALUES];
floor_fit(venc, fc, &venc->coeffs[i * samples], posts, samples);
- floor_encode(venc, fc, &pb, posts, &venc->floor[i * samples], samples);
+ if (floor_encode(venc, fc, &pb, posts, &venc->floor[i * samples], samples)) {
+ av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n");
+ return AVERROR(EINVAL);
+ }
}
for (i = 0; i < venc->channels * samples; i++)
@@ -1051,8 +1075,11 @@ static int vorbis_encode_frame(AVCodecContext *avccontext,
}
}
- residue_encode(venc, &venc->residues[mapping->residue[mapping->mux[0]]],
- &pb, venc->coeffs, samples, venc->channels);
+ if (residue_encode(venc, &venc->residues[mapping->residue[mapping->mux[0]]],
+ &pb, venc->coeffs, samples, venc->channels)) {
+ av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n");
+ return AVERROR(EINVAL);
+ }
avccontext->coded_frame->pts = venc->sample_count;
venc->sample_count += avccontext->frame_size;
diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index 6ec6d7ce57..29baa10230 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -39,6 +39,12 @@ static int encode_init(AVCodecContext * avctx){
return AVERROR(EINVAL);
}
+ if (avctx->sample_rate > 48000) {
+ av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz",
+ avctx->sample_rate);
+ return AVERROR(EINVAL);
+ }
+
if(avctx->bit_rate < 24*1000) {
av_log(avctx, AV_LOG_ERROR, "bitrate too low: got %i, need 24000 or higher\n",
avctx->bit_rate);
@@ -64,6 +70,8 @@ static int encode_init(AVCodecContext * avctx){
s->use_exp_vlc = flags2 & 0x0001;
s->use_bit_reservoir = flags2 & 0x0002;
s->use_variable_block_len = flags2 & 0x0004;
+ if (avctx->channels == 2)
+ s->ms_stereo = 1;
ff_wma_init(avctx, flags2);
@@ -71,8 +79,12 @@ static int encode_init(AVCodecContext * avctx){
for(i = 0; i < s->nb_block_sizes; i++)
ff_mdct_init(&s->mdct_ctx[i], s->frame_len_bits - i + 1, 0, 1.0);
- avctx->block_align=
- s->block_align= avctx->bit_rate*(int64_t)s->frame_len / (avctx->sample_rate*8);
+ s->block_align = avctx->bit_rate * (int64_t)s->frame_len /
+ (avctx->sample_rate * 8);
+ s->block_align = FFMIN(s->block_align, MAX_CODED_SUPERFRAME_SIZE);
+ avctx->block_align = s->block_align;
+ avctx->bit_rate = avctx->block_align * 8LL * avctx->sample_rate /
+ s->frame_len;
//av_log(NULL, AV_LOG_ERROR, "%d %d %d %d\n", s->block_align, avctx->bit_rate, s->frame_len, avctx->sample_rate);
avctx->frame_size= s->frame_len;
@@ -181,7 +193,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
}
if (s->nb_channels == 2) {
- put_bits(&s->pb, 1, s->ms_stereo= 1);
+ put_bits(&s->pb, 1, !!s->ms_stereo);
}
for(ch = 0; ch < s->nb_channels; ch++) {
@@ -355,6 +367,11 @@ static int encode_superframe(AVCodecContext *avctx,
}
}
+ if (buf_size < 2 * MAX_CODED_SUPERFRAME_SIZE) {
+ av_log(avctx, AV_LOG_ERROR, "output buffer size is too small\n");
+ return AVERROR(EINVAL);
+ }
+
#if 1
total_gain= 128;
for(i=64; i; i>>=1){
@@ -379,15 +396,17 @@ static int encode_superframe(AVCodecContext *avctx,
}
#endif
- encode_frame(s, s->coefs, buf, buf_size, total_gain);
+ if ((i = encode_frame(s, s->coefs, buf, buf_size, total_gain)) >= 0) {
+ av_log(avctx, AV_LOG_ERROR, "required frame size too large. please "
+ "use a higher bit rate.\n");
+ return AVERROR(EINVAL);
+ }
assert((put_bits_count(&s->pb) & 7) == 0);
- i= s->block_align - (put_bits_count(&s->pb)+7)/8;
- assert(i>=0);
- while(i--)
+ while (i++)
put_bits(&s->pb, 8, 'N');
flush_put_bits(&s->pb);
- return put_bits_ptr(&s->pb) - s->pb.buf;
+ return s->block_align;
}
AVCodec ff_wmav1_encoder = {
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index e3b727d1b1..f4f6e92877 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -29,16 +29,16 @@
/*
* MC functions
*/
-extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_h4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
-extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_h6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
-extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_v4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
-extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_v6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
@@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
-extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_bilinear4_h_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
@@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
-extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_bilinear4_v_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
@@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
}
#if ARCH_X86_32
-TAP_W8 (mmxext, epel, h4)
-TAP_W8 (mmxext, epel, h6)
-TAP_W16(mmxext, epel, h6)
-TAP_W8 (mmxext, epel, v4)
-TAP_W8 (mmxext, epel, v6)
-TAP_W16(mmxext, epel, v6)
-TAP_W8 (mmxext, bilinear, h)
-TAP_W16(mmxext, bilinear, h)
-TAP_W8 (mmxext, bilinear, v)
-TAP_W16(mmxext, bilinear, v)
+TAP_W8 (mmx2, epel, h4)
+TAP_W8 (mmx2, epel, h6)
+TAP_W16(mmx2, epel, h6)
+TAP_W8 (mmx2, epel, v4)
+TAP_W8 (mmx2, epel, v6)
+TAP_W16(mmx2, epel, v6)
+TAP_W8 (mmx2, bilinear, h)
+TAP_W16(mmx2, bilinear, h)
+TAP_W8 (mmx2, bilinear, v)
+TAP_W16(mmx2, bilinear, v)
#endif
-TAP_W16(sse2, epel, h6)
-TAP_W16(sse2, epel, v6)
-TAP_W16(sse2, bilinear, h)
-TAP_W16(sse2, bilinear, v)
+TAP_W16(sse2, epel, h6)
+TAP_W16(sse2, epel, v6)
+TAP_W16(sse2, bilinear, h)
+TAP_W16(sse2, bilinear, v)
-TAP_W16(ssse3, epel, h6)
-TAP_W16(ssse3, epel, v6)
-TAP_W16(ssse3, bilinear, h)
-TAP_W16(ssse3, bilinear, v)
+TAP_W16(ssse3, epel, h6)
+TAP_W16(ssse3, epel, v6)
+TAP_W16(ssse3, bilinear, h)
+TAP_W16(ssse3, bilinear, v)
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
@@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
#if ARCH_X86_32
#define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y, 4, 8) \
-HVTAP(mmxext, 8, x, y, 8, 16)
+HVTAP(mmx2, 8, x, y, 4, 8) \
+HVTAP(mmx2, 8, x, y, 8, 16)
-HVTAP(mmxext, 8, 6, 6, 16, 16)
+HVTAP(mmx2, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y, 4, 8)
+HVTAP(mmx2, 8, x, y, 4, 8)
#endif
HVTAPMMX(4, 4)
@@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
-HVBILIN(mmxext, 8, 4, 8)
+HVBILIN(mmx2, 8, 4, 8)
#if ARCH_X86_32
-HVBILIN(mmxext, 8, 8, 16)
-HVBILIN(mmxext, 8, 16, 16)
+HVBILIN(mmx2, 8, 8, 16)
+HVBILIN(mmx2, 8, 16, 16)
#endif
-HVBILIN(sse2, 8, 8, 16)
-HVBILIN(sse2, 8, 16, 16)
-HVBILIN(ssse3, 8, 4, 8)
-HVBILIN(ssse3, 8, 8, 16)
-HVBILIN(ssse3, 8, 16, 16)
+HVBILIN(sse2, 8, 8, 16)
+HVBILIN(sse2, 8, 16, 16)
+HVBILIN(ssse3, 8, 4, 8)
+HVBILIN(ssse3, 8, 8, 16)
+HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
ptrdiff_t stride);
@@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
int e, int i, int hvt);
DECLARE_LOOP_FILTER(mmx)
-DECLARE_LOOP_FILTER(mmxext)
+DECLARE_LOOP_FILTER(mmx2)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
@@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (mm_flags & AV_CPU_FLAG_MMX2) {
- VP8_MC_FUNC(2, 4, mmxext);
- VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
+ VP8_MC_FUNC(2, 4, mmx2);
+ VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
#if ARCH_X86_32
- VP8_LUMA_MC_FUNC(0, 16, mmxext);
- VP8_MC_FUNC(1, 8, mmxext);
- VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
- VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
-
- c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
-
- c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
- c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
- c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
- c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
-
- c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
- c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
- c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
- c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
+ VP8_LUMA_MC_FUNC(0, 16, mmx2);
+ VP8_MC_FUNC(1, 8, mmx2);
+ VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
+ VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
+
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
#endif
}
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 140597031f..05a2a5712b 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -116,23 +116,25 @@ bilinear_filter_vb_m: times 8 db 7, 1
times 8 db 1, 7
%ifdef PIC
-%define fourtap_filter_hw r11
-%define sixtap_filter_hw r11
-%define fourtap_filter_hb r11
-%define sixtap_filter_hb r11
-%define fourtap_filter_v r11
-%define sixtap_filter_v r11
-%define bilinear_filter_vw r11
-%define bilinear_filter_vb r11
+%define fourtap_filter_hw picregq
+%define sixtap_filter_hw picregq
+%define fourtap_filter_hb picregq
+%define sixtap_filter_hb picregq
+%define fourtap_filter_v picregq
+%define sixtap_filter_v picregq
+%define bilinear_filter_vw picregq
+%define bilinear_filter_vb picregq
+%define npicregs 1
%else
-%define fourtap_filter_hw fourtap_filter_hw_m
-%define sixtap_filter_hw sixtap_filter_hw_m
-%define fourtap_filter_hb fourtap_filter_hb_m
-%define sixtap_filter_hb sixtap_filter_hb_m
-%define fourtap_filter_v fourtap_filter_v_m
-%define sixtap_filter_v sixtap_filter_v_m
+%define fourtap_filter_hw fourtap_filter_hw_m
+%define sixtap_filter_hw sixtap_filter_hw_m
+%define fourtap_filter_hb fourtap_filter_hb_m
+%define sixtap_filter_hb sixtap_filter_hb_m
+%define fourtap_filter_v fourtap_filter_v_m
+%define sixtap_filter_v sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
+%define npicregs 0
%endif
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
@@ -173,26 +175,26 @@ SECTION .text
; int height, int mx, int my);
;-----------------------------------------------------------------------------
-%macro FILTER_SSSE3 3
-cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
- lea r5d, [r5*3]
+%macro FILTER_SSSE3 1
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
%ifdef PIC
- lea r11, [sixtap_filter_hb_m]
+ lea picregq, [sixtap_filter_hb_m]
%endif
- mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
- mova m6, [sixtap_filter_hb+r5*8-32]
- mova m7, [sixtap_filter_hb+r5*8-16]
+ mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
+ mova m6, [sixtap_filter_hb+mxq*8-32]
+ mova m7, [sixtap_filter_hb+mxq*8-16]
.nextrow
- movu m0, [r2-2]
+ movu m0, [srcq-2]
mova m1, m0
mova m2, m0
-%ifidn %1, 4
+%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
- punpcklbw m0, [r2+3]
+ punpcklbw m0, [srcq+3]
%else
pshufb m0, [filter_h6_shuf1]
%endif
@@ -206,28 +208,28 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
paddsw m0, [pw_64]
psraw m0, 7
packuswb m0, m0
- movh [r0], m0 ; store
+ movh [dstq], m0 ; store
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
-cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
- shl r5d, 4
+cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
mova m2, [pw_64]
mova m3, [filter_h2_shuf]
mova m4, [filter_h4_shuf]
%ifdef PIC
- lea r11, [fourtap_filter_hb_m]
+ lea picregq, [fourtap_filter_hb_m]
%endif
- mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
- mova m6, [fourtap_filter_hb+r5]
+ mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
+ mova m6, [fourtap_filter_hb+mxq]
.nextrow
- movu m0, [r2-1]
+ movu m0, [srcq-1]
mova m1, m0
pshufb m0, m3
pshufb m1, m4
@@ -237,33 +239,33 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
paddsw m0, m1
psraw m0, 7
packuswb m0, m0
- movh [r0], m0 ; store
+ movh [dstq], m0 ; store
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
-cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
- shl r6d, 4
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
%ifdef PIC
- lea r11, [fourtap_filter_hb_m]
+ lea picregq, [fourtap_filter_hb_m]
%endif
- mova m5, [fourtap_filter_hb+r6-16]
- mova m6, [fourtap_filter_hb+r6]
+ mova m5, [fourtap_filter_hb+myq-16]
+ mova m6, [fourtap_filter_hb+myq]
mova m7, [pw_64]
; read 3 lines
- sub r2, r3
- movh m0, [r2]
- movh m1, [r2+ r3]
- movh m2, [r2+2*r3]
- add r2, r3
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+ srcstrideq]
+ movh m2, [srcq+2*srcstrideq]
+ add srcq, srcstrideq
.nextrow
- movh m3, [r2+2*r3] ; read new row
+ movh m3, [srcq+2*srcstrideq] ; read new row
mova m4, m0
mova m0, m1
punpcklbw m4, m1
@@ -276,44 +278,44 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
paddsw m4, m7
psraw m4, 7
packuswb m4, m4
- movh [r0], m4
+ movh [dstq], m4
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
-cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
- lea r6d, [r6*3]
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ lea myd, [myq*3]
%ifdef PIC
- lea r11, [sixtap_filter_hb_m]
+ lea picregq, [sixtap_filter_hb_m]
%endif
- lea r6, [sixtap_filter_hb+r6*8]
+ lea myq, [sixtap_filter_hb+myq*8]
; read 5 lines
- sub r2, r3
- sub r2, r3
- movh m0, [r2]
- movh m1, [r2+r3]
- movh m2, [r2+r3*2]
- lea r2, [r2+r3*2]
- add r2, r3
- movh m3, [r2]
- movh m4, [r2+r3]
+ sub srcq, srcstrideq
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
.nextrow
- movh m5, [r2+2*r3] ; read new row
+ movh m5, [srcq+2*srcstrideq] ; read new row
mova m6, m0
punpcklbw m6, m5
mova m0, m1
punpcklbw m1, m2
mova m7, m3
punpcklbw m7, m4
- pmaddubsw m6, [r6-48]
- pmaddubsw m1, [r6-32]
- pmaddubsw m7, [r6-16]
+ pmaddubsw m6, [myq-48]
+ pmaddubsw m1, [myq-32]
+ pmaddubsw m7, [myq-16]
paddsw m6, m1
paddsw m6, m7
mova m1, m2
@@ -323,34 +325,35 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
mova m3, m4
packuswb m6, m6
mova m4, m5
- movh [r0], m6
+ movh [dstq], m6
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
-INIT_MMX
-FILTER_SSSE3 4, 0, 0
-INIT_XMM
-FILTER_SSSE3 8, 8, 7
+INIT_MMX ssse3
+FILTER_SSSE3 4
+INIT_XMM ssse3
+FILTER_SSSE3 8
; 4x4 block, H-only 4-tap filter
-cglobal put_vp8_epel4_h4_mmxext, 6, 6
- shl r5d, 4
+INIT_MMX mmx2
+cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
%ifdef PIC
- lea r11, [fourtap_filter_hw_m]
+ lea picregq, [fourtap_filter_hw_m]
%endif
- movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
- movq mm5, [fourtap_filter_hw+r5]
+ movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
+ movq mm5, [fourtap_filter_hw+mxq]
movq mm7, [pw_64]
pxor mm6, mm6
.nextrow
- movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
+ movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
; first set of 2 pixels
movq mm2, mm1 ; byte ABCD..
@@ -376,29 +379,30 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6
paddsw mm3, mm7 ; rounding
psraw mm3, 7
packuswb mm3, mm6 ; clip and word->bytes
- movd [r0], mm3 ; store
+ movd [dstq], mm3 ; store
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
; 4x4 block, H-only 6-tap filter
-cglobal put_vp8_epel4_h6_mmxext, 6, 6
- lea r5d, [r5*3]
+INIT_MMX mmx2
+cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
%ifdef PIC
- lea r11, [sixtap_filter_hw_m]
+ lea picregq, [sixtap_filter_hw_m]
%endif
- movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words
- movq mm5, [sixtap_filter_hw+r5*8-32]
- movq mm6, [sixtap_filter_hw+r5*8-16]
+ movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
+ movq mm5, [sixtap_filter_hw+mxq*8-32]
+ movq mm6, [sixtap_filter_hw+mxq*8-16]
movq mm7, [pw_64]
pxor mm3, mm3
.nextrow
- movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
+ movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
; first set of 2 pixels
movq mm2, mm1 ; byte ABCD..
@@ -418,7 +422,7 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
paddd mm1, mm2 ; finish 1st 2px
; second set of 2 pixels, use backup of above
- movd mm2, [r2+3] ; byte FGHI (prevent overreads)
+ movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
paddd mm0, mm3 ; add to 2nd 2px cache
@@ -433,35 +437,35 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
paddsw mm1, mm7 ; rounding
psraw mm1, 7
packuswb mm1, mm3 ; clip and word->bytes
- movd [r0], mm1 ; store
+ movd [dstq], mm1 ; store
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
-INIT_XMM
-cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
- shl r5d, 5
+INIT_XMM sse2
+cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 5
%ifdef PIC
- lea r11, [fourtap_filter_v_m]
+ lea picregq, [fourtap_filter_v_m]
%endif
- lea r5, [fourtap_filter_v+r5-32]
+ lea mxq, [fourtap_filter_v+mxq-32]
pxor m7, m7
mova m4, [pw_64]
- mova m5, [r5+ 0]
- mova m6, [r5+16]
+ mova m5, [mxq+ 0]
+ mova m6, [mxq+16]
%ifdef m8
- mova m8, [r5+32]
- mova m9, [r5+48]
+ mova m8, [mxq+32]
+ mova m9, [mxq+48]
%endif
.nextrow
- movq m0, [r2-1]
- movq m1, [r2-0]
- movq m2, [r2+1]
- movq m3, [r2+2]
+ movq m0, [srcq-1]
+ movq m1, [srcq-0]
+ movq m2, [srcq+1]
+ movq m3, [srcq+2]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
@@ -472,8 +476,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
pmullw m2, m8
pmullw m3, m9
%else
- pmullw m2, [r5+32]
- pmullw m3, [r5+48]
+ pmullw m2, [mxq+32]
+ pmullw m3, [mxq+48]
%endif
paddsw m0, m1
paddsw m2, m3
@@ -481,39 +485,40 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
paddsw m0, m4
psraw m0, 7
packuswb m0, m7
- movh [r0], m0 ; store
+ movh [dstq], m0 ; store
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
-cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
- lea r5d, [r5*3]
- shl r5d, 4
+INIT_XMM sse2
+cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
+ shl mxd, 4
%ifdef PIC
- lea r11, [sixtap_filter_v_m]
+ lea picregq, [sixtap_filter_v_m]
%endif
- lea r5, [sixtap_filter_v+r5-96]
+ lea mxq, [sixtap_filter_v+mxq-96]
pxor m7, m7
mova m6, [pw_64]
%ifdef m8
- mova m8, [r5+ 0]
- mova m9, [r5+16]
- mova m10, [r5+32]
- mova m11, [r5+48]
- mova m12, [r5+64]
- mova m13, [r5+80]
+ mova m8, [mxq+ 0]
+ mova m9, [mxq+16]
+ mova m10, [mxq+32]
+ mova m11, [mxq+48]
+ mova m12, [mxq+64]
+ mova m13, [mxq+80]
%endif
.nextrow
- movq m0, [r2-2]
- movq m1, [r2-1]
- movq m2, [r2-0]
- movq m3, [r2+1]
- movq m4, [r2+2]
- movq m5, [r2+3]
+ movq m0, [srcq-2]
+ movq m1, [srcq-1]
+ movq m2, [srcq-0]
+ movq m3, [srcq+1]
+ movq m4, [srcq+2]
+ movq m5, [srcq+3]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
@@ -528,12 +533,12 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
pmullw m4, m12
pmullw m5, m13
%else
- pmullw m0, [r5+ 0]
- pmullw m1, [r5+16]
- pmullw m2, [r5+32]
- pmullw m3, [r5+48]
- pmullw m4, [r5+64]
- pmullw m5, [r5+80]
+ pmullw m0, [mxq+ 0]
+ pmullw m1, [mxq+16]
+ pmullw m2, [mxq+32]
+ pmullw m3, [mxq+48]
+ pmullw m4, [mxq+64]
+ pmullw m5, [mxq+80]
%endif
paddsw m1, m4
paddsw m0, m5
@@ -543,52 +548,52 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
paddsw m0, m6
psraw m0, 7
packuswb m0, m7
- movh [r0], m0 ; store
+ movh [dstq], m0 ; store
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
-%macro FILTER_V 3
+%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
-cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
- shl r6d, 5
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 5
%ifdef PIC
- lea r11, [fourtap_filter_v_m]
+ lea picregq, [fourtap_filter_v_m]
%endif
- lea r6, [fourtap_filter_v+r6-32]
+ lea myq, [fourtap_filter_v+myq-32]
mova m6, [pw_64]
pxor m7, m7
- mova m5, [r6+48]
+ mova m5, [myq+48]
; read 3 lines
- sub r2, r3
- movh m0, [r2]
- movh m1, [r2+ r3]
- movh m2, [r2+2*r3]
- add r2, r3
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+ srcstrideq]
+ movh m2, [srcq+2*srcstrideq]
+ add srcq, srcstrideq
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
.nextrow
; first calculate negative taps (to prevent losing positive overflows)
- movh m4, [r2+2*r3] ; read new row
+ movh m4, [srcq+2*srcstrideq] ; read new row
punpcklbw m4, m7
mova m3, m4
- pmullw m0, [r6+0]
+ pmullw m0, [myq+0]
pmullw m4, m5
paddsw m4, m0
; then calculate positive taps
mova m0, m1
- pmullw m1, [r6+16]
+ pmullw m1, [myq+16]
paddsw m4, m1
mova m1, m2
- pmullw m2, [r6+32]
+ pmullw m2, [myq+32]
paddsw m4, m2
mova m2, m3
@@ -596,36 +601,36 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
paddsw m4, m6
psraw m4, 7
packuswb m4, m7
- movh [r0], m4
+ movh [dstq], m4
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
; 4x4 block, V-only 6-tap filter
-cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
- shl r6d, 4
- lea r6, [r6*3]
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+ lea myq, [myq*3]
%ifdef PIC
- lea r11, [sixtap_filter_v_m]
+ lea picregq, [sixtap_filter_v_m]
%endif
- lea r6, [sixtap_filter_v+r6-96]
+ lea myq, [sixtap_filter_v+myq-96]
pxor m7, m7
; read 5 lines
- sub r2, r3
- sub r2, r3
- movh m0, [r2]
- movh m1, [r2+r3]
- movh m2, [r2+r3*2]
- lea r2, [r2+r3*2]
- add r2, r3
- movh m3, [r2]
- movh m4, [r2+r3]
+ sub srcq, srcstrideq
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
@@ -635,62 +640,61 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
.nextrow
; first calculate negative taps (to prevent losing positive overflows)
mova m5, m1
- pmullw m5, [r6+16]
+ pmullw m5, [myq+16]
mova m6, m4
- pmullw m6, [r6+64]
+ pmullw m6, [myq+64]
paddsw m6, m5
; then calculate positive taps
- movh m5, [r2+2*r3] ; read new row
+ movh m5, [srcq+2*srcstrideq] ; read new row
punpcklbw m5, m7
- pmullw m0, [r6+0]
+ pmullw m0, [myq+0]
paddsw m6, m0
mova m0, m1
mova m1, m2
- pmullw m2, [r6+32]
+ pmullw m2, [myq+32]
paddsw m6, m2
mova m2, m3
- pmullw m3, [r6+48]
+ pmullw m3, [myq+48]
paddsw m6, m3
mova m3, m4
mova m4, m5
- pmullw m5, [r6+80]
+ pmullw m5, [myq+80]
paddsw m6, m5
; round/clip/store
paddsw m6, [pw_64]
psraw m6, 7
packuswb m6, m7
- movh [r0], m6
+ movh [dstq], m6
; go to next line
- add r0, r1
- add r2, r3
- dec r4d ; next row
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
-INIT_MMX
-FILTER_V mmxext, 4, 0
-INIT_XMM
-FILTER_V sse2, 8, 8
+INIT_MMX mmx2
+FILTER_V 4
+INIT_XMM sse2
+FILTER_V 8
-%macro FILTER_BILINEAR 3
-cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
- mov r5d, 8*16
- shl r6d, 4
- sub r5d, r6d
+%macro FILTER_BILINEAR 1
+cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
%ifdef PIC
- lea r11, [bilinear_filter_vw_m]
+ lea picregq, [bilinear_filter_vw_m]
%endif
pxor m6, m6
- mova m4, [bilinear_filter_vw+r5-16]
- mova m5, [bilinear_filter_vw+r6-16]
+ mova m5, [bilinear_filter_vw+myq-1*16]
+ neg myq
+ mova m4, [bilinear_filter_vw+myq+7*16]
.nextrow
- movh m0, [r2+r3*0]
- movh m1, [r2+r3*1]
- movh m3, [r2+r3*2]
+ movh m0, [srcq+srcstrideq*0]
+ movh m1, [srcq+srcstrideq*1]
+ movh m3, [srcq+srcstrideq*2]
punpcklbw m0, m6
punpcklbw m1, m6
punpcklbw m3, m6
@@ -705,38 +709,37 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
-%ifidn %1, mmxext
+%if mmsize == 8
packuswb m0, m0
packuswb m2, m2
- movh [r0+r1*0], m0
- movh [r0+r1*1], m2
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m2
%else
packuswb m0, m2
- movh [r0+r1*0], m0
- movhps [r0+r1*1], m0
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
%endif
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- sub r4d, 2
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
jg .nextrow
REP_RET
-cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
- mov r6d, 8*16
- shl r5d, 4
- sub r6d, r5d
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
%ifdef PIC
- lea r11, [bilinear_filter_vw_m]
+ lea picregq, [bilinear_filter_vw_m]
%endif
pxor m6, m6
- mova m4, [bilinear_filter_vw+r6-16]
- mova m5, [bilinear_filter_vw+r5-16]
+ mova m5, [bilinear_filter_vw+mxq-1*16]
+ neg mxq
+ mova m4, [bilinear_filter_vw+mxq+7*16]
.nextrow
- movh m0, [r2+r3*0+0]
- movh m1, [r2+r3*0+1]
- movh m2, [r2+r3*1+0]
- movh m3, [r2+r3*1+1]
+ movh m0, [srcq+srcstrideq*0+0]
+ movh m1, [srcq+srcstrideq*0+1]
+ movh m2, [srcq+srcstrideq*1+0]
+ movh m3, [srcq+srcstrideq*1+1]
punpcklbw m0, m6
punpcklbw m1, m6
punpcklbw m2, m6
@@ -751,41 +754,41 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
-%ifidn %1, mmxext
+%if mmsize == 8
packuswb m0, m0
packuswb m2, m2
- movh [r0+r1*0], m0
- movh [r0+r1*1], m2
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m2
%else
packuswb m0, m2
- movh [r0+r1*0], m0
- movhps [r0+r1*1], m0
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
%endif
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- sub r4d, 2
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
jg .nextrow
REP_RET
%endmacro
-INIT_MMX
-FILTER_BILINEAR mmxext, 4, 0
-INIT_XMM
-FILTER_BILINEAR sse2, 8, 7
+INIT_MMX mmx2
+FILTER_BILINEAR 4
+INIT_XMM sse2
+FILTER_BILINEAR 8
%macro FILTER_BILINEAR_SSSE3 1
-cglobal put_vp8_bilinear%1_v_ssse3, 7,7
- shl r6d, 4
+cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
%ifdef PIC
- lea r11, [bilinear_filter_vb_m]
+ lea picregq, [bilinear_filter_vb_m]
%endif
pxor m4, m4
- mova m3, [bilinear_filter_vb+r6-16]
+ mova m3, [bilinear_filter_vb+myq-16]
.nextrow
- movh m0, [r2+r3*0]
- movh m1, [r2+r3*1]
- movh m2, [r2+r3*2]
+ movh m0, [srcq+srcstrideq*0]
+ movh m1, [srcq+srcstrideq*1]
+ movh m2, [srcq+srcstrideq*2]
punpcklbw m0, m1
punpcklbw m1, m2
pmaddubsw m0, m3
@@ -797,31 +800,31 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7
%if mmsize==8
packuswb m0, m0
packuswb m1, m1
- movh [r0+r1*0], m0
- movh [r0+r1*1], m1
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m1
%else
packuswb m0, m1
- movh [r0+r1*0], m0
- movhps [r0+r1*1], m0
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
%endif
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- sub r4d, 2
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
jg .nextrow
REP_RET
-cglobal put_vp8_bilinear%1_h_ssse3, 7,7
- shl r5d, 4
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
%ifdef PIC
- lea r11, [bilinear_filter_vb_m]
+ lea picregq, [bilinear_filter_vb_m]
%endif
pxor m4, m4
mova m2, [filter_h2_shuf]
- mova m3, [bilinear_filter_vb+r5-16]
+ mova m3, [bilinear_filter_vb+mxq-16]
.nextrow
- movu m0, [r2+r3*0]
- movu m1, [r2+r3*1]
+ movu m0, [srcq+srcstrideq*0]
+ movu m1, [srcq+srcstrideq*1]
pshufb m0, m2
pshufb m1, m2
pmaddubsw m0, m3
@@ -833,65 +836,68 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7
%if mmsize==8
packuswb m0, m0
packuswb m1, m1
- movh [r0+r1*0], m0
- movh [r0+r1*1], m1
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m1
%else
packuswb m0, m1
- movh [r0+r1*0], m0
- movhps [r0+r1*1], m0
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
%endif
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- sub r4d, 2
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
jg .nextrow
REP_RET
%endmacro
-INIT_MMX
+INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
-INIT_XMM
+INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8
-cglobal put_vp8_pixels8_mmx, 5,5
+INIT_MMX mmx
+cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
- movq mm0, [r2+r3*0]
- movq mm1, [r2+r3*1]
- lea r2, [r2+r3*2]
- movq [r0+r1*0], mm0
- movq [r0+r1*1], mm1
- lea r0, [r0+r1*2]
- sub r4d, 2
+ movq mm0, [srcq+srcstrideq*0]
+ movq mm1, [srcq+srcstrideq*1]
+ lea srcq, [srcq+srcstrideq*2]
+ movq [dstq+dststrideq*0], mm0
+ movq [dstq+dststrideq*1], mm1
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
jg .nextrow
REP_RET
%if ARCH_X86_32
-cglobal put_vp8_pixels16_mmx, 5,5
+INIT_MMX mmx
+cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
- movq mm0, [r2+r3*0+0]
- movq mm1, [r2+r3*0+8]
- movq mm2, [r2+r3*1+0]
- movq mm3, [r2+r3*1+8]
- lea r2, [r2+r3*2]
- movq [r0+r1*0+0], mm0
- movq [r0+r1*0+8], mm1
- movq [r0+r1*1+0], mm2
- movq [r0+r1*1+8], mm3
- lea r0, [r0+r1*2]
- sub r4d, 2
+ movq mm0, [srcq+srcstrideq*0+0]
+ movq mm1, [srcq+srcstrideq*0+8]
+ movq mm2, [srcq+srcstrideq*1+0]
+ movq mm3, [srcq+srcstrideq*1+8]
+ lea srcq, [srcq+srcstrideq*2]
+ movq [dstq+dststrideq*0+0], mm0
+ movq [dstq+dststrideq*0+8], mm1
+ movq [dstq+dststrideq*1+0], mm2
+ movq [dstq+dststrideq*1+8], mm3
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
jg .nextrow
REP_RET
%endif
-cglobal put_vp8_pixels16_sse, 5,5,2
+INIT_XMM sse
+cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
- movups xmm0, [r2+r3*0]
- movups xmm1, [r2+r3*1]
- lea r2, [r2+r3*2]
- movaps [r0+r1*0], xmm0
- movaps [r0+r1*1], xmm1
- lea r0, [r0+r1*2]
- sub r4d, 2
+ movups xmm0, [srcq+srcstrideq*0]
+ movups xmm1, [srcq+srcstrideq*1]
+ lea srcq, [srcq+srcstrideq*2]
+ movaps [dstq+dststrideq*0], xmm0
+ movaps [dstq+dststrideq*1], xmm1
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
jg .nextrow
REP_RET
@@ -900,10 +906,10 @@ cglobal put_vp8_pixels16_sse, 5,5,2
;-----------------------------------------------------------------------------
%macro ADD_DC 4
- %4 m2, [r0+%3]
- %4 m3, [r0+r2+%3]
- %4 m4, [r1+%3]
- %4 m5, [r1+r2+%3]
+ %4 m2, [dst1q+%3]
+ %4 m3, [dst1q+strideq+%3]
+ %4 m4, [dst2q+%3]
+ %4 m5, [dst2q+strideq+%3]
paddusb m2, %1
paddusb m3, %1
paddusb m4, %1
@@ -912,22 +918,22 @@ cglobal put_vp8_pixels16_sse, 5,5,2
psubusb m3, %2
psubusb m4, %2
psubusb m5, %2
- %4 [r0+%3], m2
- %4 [r0+r2+%3], m3
- %4 [r1+%3], m4
- %4 [r1+r2+%3], m5
+ %4 [dst1q+%3], m2
+ %4 [dst1q+strideq+%3], m3
+ %4 [dst2q+%3], m4
+ %4 [dst2q+strideq+%3], m5
%endmacro
-INIT_MMX
-cglobal vp8_idct_dc_add_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
; load data
- movd m0, [r1]
+ movd m0, [blockq]
; calculate DC
paddw m0, [pw_4]
pxor m1, m1
psraw m0, 3
- movd [r1], m1
+ movd [blockq], m1
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
@@ -937,24 +943,26 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
punpcklwd m1, m1
; add DC
- lea r1, [r0+r2*2]
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, movh
RET
-INIT_XMM
-cglobal vp8_idct_dc_add_sse4, 3, 3, 6
+INIT_XMM sse4
+cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
; load data
- movd m0, [r1]
+ movd m0, [blockq]
pxor m1, m1
; calculate DC
paddw m0, [pw_4]
- movd [r1], m1
- lea r1, [r0+r2*2]
- movd m2, [r0]
- movd m3, [r0+r2]
- movd m4, [r1]
- movd m5, [r1+r2]
+ movd [blockq], m1
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ movd m2, [dst1q]
+ movd m3, [dst1q+strideq]
+ movd m4, [dst2q]
+ movd m5, [dst2q+strideq]
psraw m0, 3
pshuflw m0, m0, 0
punpcklqdq m0, m0
@@ -965,10 +973,10 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
paddw m2, m0
paddw m4, m0
packuswb m2, m4
- movd [r0], m2
- pextrd [r0+r2], m2, 1
- pextrd [r1], m2, 2
- pextrd [r1+r2], m2, 3
+ movd [dst1q], m2
+ pextrd [dst1q+strideq], m2, 1
+ pextrd [dst2q], m2, 2
+ pextrd [dst2q+strideq], m2, 3
RET
;-----------------------------------------------------------------------------
@@ -976,22 +984,22 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
;-----------------------------------------------------------------------------
%if ARCH_X86_32
-INIT_MMX
-cglobal vp8_idct_dc_add4y_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
; load data
- movd m0, [r1+32*0] ; A
- movd m1, [r1+32*2] ; C
- punpcklwd m0, [r1+32*1] ; A B
- punpcklwd m1, [r1+32*3] ; C D
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m6, m6
; calculate DC
paddw m0, [pw_4]
- movd [r1+32*0], m6
- movd [r1+32*1], m6
- movd [r1+32*2], m6
- movd [r1+32*3], m6
+ movd [blockq+32*0], m6
+ movd [blockq+32*1], m6
+ movd [blockq+32*2], m6
+ movd [blockq+32*3], m6
psraw m0, 3
psubw m6, m0
packuswb m0, m0
@@ -1006,28 +1014,29 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3
punpckhbw m7, m7 ; CCCCDDDD
; add DC
- lea r1, [r0+r2*2]
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova
ADD_DC m1, m7, 8, mova
RET
%endif
-INIT_XMM
-cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
+INIT_XMM sse2
+cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
; load data
- movd m0, [r1+32*0] ; A
- movd m1, [r1+32*2] ; C
- punpcklwd m0, [r1+32*1] ; A B
- punpcklwd m1, [r1+32*3] ; C D
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m1, m1
; calculate DC
paddw m0, [pw_4]
- movd [r1+32*0], m1
- movd [r1+32*1], m1
- movd [r1+32*2], m1
- movd [r1+32*3], m1
+ movd [blockq+32*0], m1
+ movd [blockq+32*1], m1
+ movd [blockq+32*2], m1
+ movd [blockq+32*3], m1
psraw m0, 3
psubw m1, m0
packuswb m0, m0
@@ -1038,7 +1047,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
punpcklbw m1, m1
; add DC
- lea r1, [r0+r2*2]
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, mova
RET
@@ -1046,22 +1056,22 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal vp8_idct_dc_add4uv_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
; load data
- movd m0, [r1+32*0] ; A
- movd m1, [r1+32*2] ; C
- punpcklwd m0, [r1+32*1] ; A B
- punpcklwd m1, [r1+32*3] ; C D
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m6, m6
; calculate DC
paddw m0, [pw_4]
- movd [r1+32*0], m6
- movd [r1+32*1], m6
- movd [r1+32*2], m6
- movd [r1+32*3], m6
+ movd [blockq+32*0], m6
+ movd [blockq+32*1], m6
+ movd [blockq+32*2], m6
+ movd [blockq+32*3], m6
psraw m0, 3
psubw m6, m0
packuswb m0, m0
@@ -1076,10 +1086,11 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
punpckhbw m7, m7 ; CCCCDDDD
; add DC
- lea r1, [r0+r2*2]
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova
- lea r0, [r0+r2*4]
- lea r1, [r1+r2*4]
+ lea dst1q, [dst1q+strideq*4]
+ lea dst2q, [dst2q+strideq*4]
ADD_DC m1, m7, 0, mova
RET
@@ -1118,26 +1129,25 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
SWAP %4, %3
%endmacro
-INIT_MMX
-%macro VP8_IDCT_ADD 1
-cglobal vp8_idct_add_%1, 3, 3
+%macro VP8_IDCT_ADD 0
+cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
; load block data
- movq m0, [r1+ 0]
- movq m1, [r1+ 8]
- movq m2, [r1+16]
- movq m3, [r1+24]
+ movq m0, [blockq+ 0]
+ movq m1, [blockq+ 8]
+ movq m2, [blockq+16]
+ movq m3, [blockq+24]
movq m6, [pw_20091]
movq m7, [pw_17734]
-%ifidn %1, sse
+%if cpuflag(sse)
xorps xmm0, xmm0
- movaps [r1+ 0], xmm0
- movaps [r1+16], xmm0
+ movaps [blockq+ 0], xmm0
+ movaps [blockq+16], xmm0
%else
pxor m4, m4
- movq [r1+ 0], m4
- movq [r1+ 8], m4
- movq [r1+16], m4
- movq [r1+24], m4
+ movq [blockq+ 0], m4
+ movq [blockq+ 8], m4
+ movq [blockq+16], m4
+ movq [blockq+24], m4
%endif
; actual IDCT
@@ -1149,41 +1159,44 @@ cglobal vp8_idct_add_%1, 3, 3
; store
pxor m4, m4
- lea r1, [r0+2*r2]
- STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
- STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+2*strideq]
+ STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
+ STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
RET
%endmacro
%if ARCH_X86_32
-VP8_IDCT_ADD mmx
+INIT_MMX mmx
+VP8_IDCT_ADD
%endif
-VP8_IDCT_ADD sse
+INIT_MMX sse
+VP8_IDCT_ADD
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
%macro SCATTER_WHT 3
- movd r1d, m%1
- movd r2d, m%2
- mov [r0+2*16*(0+%3)], r1w
- mov [r0+2*16*(1+%3)], r2w
- shr r1d, 16
- shr r2d, 16
+ movd dc1d, m%1
+ movd dc2d, m%2
+ mov [blockq+2*16*(0+%3)], dc1w
+ mov [blockq+2*16*(1+%3)], dc2w
+ shr dc1d, 16
+ shr dc2d, 16
psrlq m%1, 32
psrlq m%2, 32
- mov [r0+2*16*(4+%3)], r1w
- mov [r0+2*16*(5+%3)], r2w
- movd r1d, m%1
- movd r2d, m%2
- mov [r0+2*16*(8+%3)], r1w
- mov [r0+2*16*(9+%3)], r2w
- shr r1d, 16
- shr r2d, 16
- mov [r0+2*16*(12+%3)], r1w
- mov [r0+2*16*(13+%3)], r2w
+ mov [blockq+2*16*(4+%3)], dc1w
+ mov [blockq+2*16*(5+%3)], dc2w
+ movd dc1d, m%1
+ movd dc2d, m%2
+ mov [blockq+2*16*(8+%3)], dc1w
+ mov [blockq+2*16*(9+%3)], dc2w
+ shr dc1d, 16
+ shr dc2d, 16
+ mov [blockq+2*16*(12+%3)], dc1w
+ mov [blockq+2*16*(13+%3)], dc2w
%endmacro
%macro HADAMARD4_1D 4
@@ -1192,22 +1205,22 @@ VP8_IDCT_ADD sse
SWAP %1, %4, %3
%endmacro
-%macro VP8_DC_WHT 1
-cglobal vp8_luma_dc_wht_%1, 2,3
- movq m0, [r1]
- movq m1, [r1+8]
- movq m2, [r1+16]
- movq m3, [r1+24]
-%ifidn %1, sse
+%macro VP8_DC_WHT 0
+cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
+ movq m0, [dc1q]
+ movq m1, [dc1q+8]
+ movq m2, [dc1q+16]
+ movq m3, [dc1q+24]
+%if cpuflag(sse)
xorps xmm0, xmm0
- movaps [r1+ 0], xmm0
- movaps [r1+16], xmm0
+ movaps [dc1q+ 0], xmm0
+ movaps [dc1q+16], xmm0
%else
pxor m4, m4
- movq [r1+ 0], m4
- movq [r1+ 8], m4
- movq [r1+16], m4
- movq [r1+24], m4
+ movq [dc1q+ 0], m4
+ movq [dc1q+ 8], m4
+ movq [dc1q+16], m4
+ movq [dc1q+24], m4
%endif
HADAMARD4_1D 0, 1, 2, 3
TRANSPOSE4x4W 0, 1, 2, 3, 4
@@ -1222,11 +1235,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3
RET
%endmacro
-INIT_MMX
%if ARCH_X86_32
-VP8_DC_WHT mmx
+INIT_MMX mmx
+VP8_DC_WHT
%endif
-VP8_DC_WHT sse
+INIT_MMX sse
+VP8_DC_WHT
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
@@ -1414,7 +1428,17 @@ VP8_DC_WHT sse
add %4, %5
%endmacro
-%macro WRITE_8W_SSE2 5
+%macro WRITE_8W 5
+%if cpuflag(sse4)
+ pextrw [%3+%4*4], %1, 0
+ pextrw [%2+%4*4], %1, 1
+ pextrw [%3+%4*2], %1, 2
+ pextrw [%3+%4 ], %1, 3
+ pextrw [%3 ], %1, 4
+ pextrw [%2 ], %1, 5
+ pextrw [%2+%5 ], %1, 6
+ pextrw [%2+%5*2], %1, 7
+%else
movd %2d, %1
psrldq %1, 4
mov [%3+%4*4], %2w
@@ -1440,79 +1464,68 @@ VP8_DC_WHT sse
mov [%3+%5 ], %2w
shr %2, 16
mov [%3+%5*2], %2w
+%endif
%endmacro
-%macro WRITE_8W_SSE4 5
- pextrw [%3+%4*4], %1, 0
- pextrw [%2+%4*4], %1, 1
- pextrw [%3+%4*2], %1, 2
- pextrw [%3+%4 ], %1, 3
- pextrw [%3 ], %1, 4
- pextrw [%2 ], %1, 5
- pextrw [%2+%5 ], %1, 6
- pextrw [%2+%5*2], %1, 7
-%endmacro
-
-%macro SPLATB_REG_MMX 2-3
+%macro SPLATB_REG 2-3
+%if cpuflag(ssse3)
+ movd %1, %2d
+ pshufb %1, %3
+%elif cpuflag(sse2)
movd %1, %2d
punpcklbw %1, %1
- punpcklwd %1, %1
- punpckldq %1, %1
-%endmacro
-
-%macro SPLATB_REG_MMXEXT 2-3
+ pshuflw %1, %1, 0x0
+ punpcklqdq %1, %1
+%elif cpuflag(mmx2)
movd %1, %2d
punpcklbw %1, %1
pshufw %1, %1, 0x0
-%endmacro
-
-%macro SPLATB_REG_SSE2 2-3
+%else
movd %1, %2d
punpcklbw %1, %1
- pshuflw %1, %1, 0x0
- punpcklqdq %1, %1
-%endmacro
-
-%macro SPLATB_REG_SSSE3 3
- movd %1, %2d
- pshufb %1, %3
+ punpcklwd %1, %1
+ punpckldq %1, %1
+%endif
%endmacro
-%macro SIMPLE_LOOPFILTER 4
-cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
+%macro SIMPLE_LOOPFILTER 2
+cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
- mov r3, 2
+ mov cntrq, 2
%endif
-%ifnidn %1, sse2
-%if mmsize == 16
+%if cpuflag(ssse3)
pxor m0, m0
%endif
-%endif
- SPLATB_REG m7, r2, m0 ; splat "flim" into register
+ SPLATB_REG m7, flim, m0 ; splat "flim" into register
; set up indexes to address 4 rows
- mov r2, r1
- neg r1
-%ifidn %2, h
- lea r0, [r0+4*r2-2]
+%if mmsize == 8
+ DEFINE_ARGS dst1, mstride, stride, cntr, dst2
+%else
+ DEFINE_ARGS dst1, mstride, stride, dst3, dst2
+%endif
+ mov strideq, mstrideq
+ neg mstrideq
+%ifidn %1, h
+ lea dst1q, [dst1q+4*strideq-2]
%endif
%if mmsize == 8 ; mmx / mmxext
.next8px
%endif
-%ifidn %2, v
+%ifidn %1, v
; read 4 half/full rows of pixels
- mova m0, [r0+r1*2] ; p1
- mova m1, [r0+r1] ; p0
- mova m2, [r0] ; q0
- mova m3, [r0+r2] ; q1
+ mova m0, [dst1q+mstrideq*2] ; p1
+ mova m1, [dst1q+mstrideq] ; p0
+ mova m2, [dst1q] ; q0
+ mova m3, [dst1q+ strideq] ; q1
%else ; h
- lea r4, [r0+r2]
+ lea dst2q, [dst1q+ strideq]
%if mmsize == 8 ; mmx/mmxext
- READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
+ READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
- READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
+ READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
@@ -1581,36 +1594,36 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
psubusb m6, m3 ; p0+f2
; store
-%ifidn %2, v
- mova [r0], m4
- mova [r0+r1], m6
+%ifidn %1, v
+ mova [dst1q], m4
+ mova [dst1q+mstrideq], m6
%else ; h
- inc r0
+ inc dst1q
SBUTTERFLY bw, 6, 4, 0
%if mmsize == 16 ; sse2
-%ifidn %1, sse4
- inc r4
+%if cpuflag(sse4)
+ inc dst2q
%endif
- WRITE_8W m6, r4, r0, r1, r2
- lea r4, [r3+r1+1]
-%ifidn %1, sse4
- inc r3
+ WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
+ lea dst2q, [dst3q+mstrideq+1]
+%if cpuflag(sse4)
+ inc dst3q
%endif
- WRITE_8W m4, r3, r4, r1, r2
+ WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
- WRITE_2x4W m6, m4, r4, r0, r1, r2
+ WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif
%if mmsize == 8 ; mmx/mmxext
; next 8 pixels
-%ifidn %2, v
- add r0, 8 ; advance 8 cols = pixels
+%ifidn %1, v
+ add dst1q, 8 ; advance 8 cols = pixels
%else ; h
- lea r0, [r0+r2*8-1] ; advance 8 rows = lines
+ lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
%endif
- dec r3
+ dec cntrq
jg .next8px
REP_RET
%else ; sse2
@@ -1619,41 +1632,38 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
%endmacro
%if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-SIMPLE_LOOPFILTER mmx, v, 4, 0
-SIMPLE_LOOPFILTER mmx, h, 5, 0
-%define SPLATB_REG SPLATB_REG_MMXEXT
-SIMPLE_LOOPFILTER mmxext, v, 4, 0
-SIMPLE_LOOPFILTER mmxext, h, 5, 0
-%endif
-
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-%define WRITE_8W WRITE_8W_SSE2
-SIMPLE_LOOPFILTER sse2, v, 3, 8
-SIMPLE_LOOPFILTER sse2, h, 5, 8
-%define SPLATB_REG SPLATB_REG_SSSE3
-SIMPLE_LOOPFILTER ssse3, v, 3, 8
-SIMPLE_LOOPFILTER ssse3, h, 5, 8
-%define WRITE_8W WRITE_8W_SSE4
-SIMPLE_LOOPFILTER sse4, h, 5, 8
+INIT_MMX mmx
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+INIT_MMX mmx2
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+%endif
+
+INIT_XMM sse2
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM ssse3
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM sse4
+SIMPLE_LOOPFILTER h, 5
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
; int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
-%macro INNER_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
+%macro INNER_LOOPFILTER 3
+%if %3 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_inner, 6, %2, 13
%define dst8_reg r1
%define mstride_reg r2
%define E_reg r3
%define I_reg r4
%define hev_thr_reg r5
%else ; luma
-cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
+cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
%define mstride_reg r1
%define E_reg r2
%define I_reg r3
@@ -1673,11 +1683,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define stack_reg hev_thr_reg
%endif
-%ifnidn %1, sse2
-%if mmsize == 16
+%if cpuflag(ssse3)
pxor m7, m7
%endif
-%endif
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
; splat function arguments
@@ -1688,7 +1696,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
; align stack
mov stack_reg, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
-%ifidn %2, v
+%ifidn %1, v
sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
; [3]=hev() result
%else ; h
@@ -1721,14 +1729,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
%endif
-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
+%if mmsize == 8 && %3 == 16 ; mmx/mmxext
mov cnt_reg, 2
%endif
mov stride_reg, mstride_reg
neg mstride_reg
-%ifidn %2, h
+%ifidn %1, h
lea dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
+%if %3 == 8
lea dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif
@@ -1738,8 +1746,8 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%endif
; read
lea dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+%ifidn %1, v
+%if %3 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
@@ -1750,7 +1758,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
movrow m5, [dst2_reg] ; q1
movrow m6, [dst2_reg+ stride_reg] ; q2
movrow m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
movhps m0, [dst8_reg+mstride_reg*4]
movhps m2, [dst8_reg+mstride_reg*2]
add dst8_reg, stride_reg
@@ -1787,7 +1795,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
SWAP 6, 3
SWAP 5, 3
%else ; sse2 (h)
-%if %4 == 16
+%if %3 == 16
lea dst8_reg, [dst_reg + stride_reg*8]
%endif
@@ -1874,7 +1882,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
psubusb m6, m5 ; q2-q1
por m6, m4 ; abs(q2-q1)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
mova m4, flim_I
pxor m3, m3
psubusb m0, m4
@@ -1896,9 +1904,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
; normal_limit and high_edge_variance for p1-p0, q1-q0
SWAP 7, 3 ; now m7 is zero
-%ifidn %2, v
+%ifidn %1, v
movrow m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
movhps m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
@@ -1914,7 +1922,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
psubusb m1, m3 ; p1-p0
psubusb m6, m2 ; p0-p1
por m1, m6 ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
mova m6, m1
psubusb m1, m4
psubusb m6, hev_thr
@@ -1928,9 +1936,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%endif
SWAP 6, 4 ; now m6 is I
-%ifidn %2, v
+%ifidn %1, v
movrow m4, [dst_reg] ; q0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
movhps m4, [dst8_reg]
%endif
%elifdef m8
@@ -1945,7 +1953,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
psubusb m1, m5 ; q0-q1
psubusb m7, m4 ; q1-q0
por m1, m7 ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
mova m7, m1
psubusb m1, m6
psubusb m7, hev_thr
@@ -2053,14 +2061,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%else
mova m6, mask_res
%endif
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
mova m7, [pb_1]
%else ; mmxext/sse2
pxor m7, m7
%endif
pand m0, m6
pand m1, m6
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
paddusb m0, m7
pand m1, [pb_FE]
pandn m7, m0
@@ -2078,12 +2086,12 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
paddusb m2, m0 ; p1+a
; store
-%ifidn %2, v
+%ifidn %1, v
movrow [dst_reg +mstride_reg*2], m2
movrow [dst_reg +mstride_reg ], m3
movrow [dst_reg], m4
movrow [dst_reg + stride_reg ], m5
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
movhps [dst8_reg+mstride_reg*2], m2
movhps [dst8_reg+mstride_reg ], m3
movhps [dst8_reg], m4
@@ -2100,20 +2108,20 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
%else ; sse2 (h)
lea dst8_reg, [dst8_reg+mstride_reg+2]
- WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
+ WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
%endif
%endif
%if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
+%if %3 == 8 ; chroma
+%ifidn %1, h
sub dst_reg, 2
%endif
cmp dst_reg, dst8_reg
mov dst_reg, dst8_reg
jnz .next8px
%else
-%ifidn %2, h
+%ifidn %1, h
lea dst_reg, [dst_reg + stride_reg*8-2]
%else ; v
add dst_reg, 8
@@ -2130,56 +2138,46 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%endmacro
%if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-INNER_LOOPFILTER mmx, v, 6, 16, 0
-INNER_LOOPFILTER mmx, h, 6, 16, 0
-INNER_LOOPFILTER mmx, v, 6, 8, 0
-INNER_LOOPFILTER mmx, h, 6, 8, 0
-
-%define SPLATB_REG SPLATB_REG_MMXEXT
-INNER_LOOPFILTER mmxext, v, 6, 16, 0
-INNER_LOOPFILTER mmxext, h, 6, 16, 0
-INNER_LOOPFILTER mmxext, v, 6, 8, 0
-INNER_LOOPFILTER mmxext, h, 6, 8, 0
-%endif
-
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-INNER_LOOPFILTER sse2, v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER sse2, h, 5, 16, 13
-%else
-INNER_LOOPFILTER sse2, h, 6, 16, 13
-%endif
-INNER_LOOPFILTER sse2, v, 6, 8, 13
-INNER_LOOPFILTER sse2, h, 6, 8, 13
-
-%define SPLATB_REG SPLATB_REG_SSSE3
-INNER_LOOPFILTER ssse3, v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER ssse3, h, 5, 16, 13
-%else
-INNER_LOOPFILTER ssse3, h, 6, 16, 13
-%endif
-INNER_LOOPFILTER ssse3, v, 6, 8, 13
-INNER_LOOPFILTER ssse3, h, 6, 8, 13
+INIT_MMX mmx
+INNER_LOOPFILTER v, 6, 16
+INNER_LOOPFILTER h, 6, 16
+INNER_LOOPFILTER v, 6, 8
+INNER_LOOPFILTER h, 6, 8
+
+INIT_MMX mmx2
+INNER_LOOPFILTER v, 6, 16
+INNER_LOOPFILTER h, 6, 16
+INNER_LOOPFILTER v, 6, 8
+INNER_LOOPFILTER h, 6, 8
+%endif
+
+INIT_XMM sse2
+INNER_LOOPFILTER v, 5, 16
+INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
+INNER_LOOPFILTER v, 6, 8
+INNER_LOOPFILTER h, 6, 8
+
+INIT_XMM ssse3
+INNER_LOOPFILTER v, 5, 16
+INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
+INNER_LOOPFILTER v, 6, 8
+INNER_LOOPFILTER h, 6, 8
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
; int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
-%macro MBEDGE_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
+%macro MBEDGE_LOOPFILTER 3
+%if %3 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, %2, 15
%define dst8_reg r1
%define mstride_reg r2
%define E_reg r3
%define I_reg r4
%define hev_thr_reg r5
%else ; luma
-cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
+cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15
%define mstride_reg r1
%define E_reg r2
%define I_reg r3
@@ -2199,14 +2197,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define stack_reg hev_thr_reg
%endif
-%define ssse3_or_higher 0
-%ifnidn %1, sse2
-%if mmsize == 16
-%define ssse3_or_higher 1
-%endif
-%endif
-
-%if ssse3_or_higher
+%if cpuflag(ssse3)
pxor m7, m7
%endif
@@ -2267,14 +2258,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
%endif
-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
+%if mmsize == 8 && %3 == 16 ; mmx/mmxext
mov cnt_reg, 2
%endif
mov stride_reg, mstride_reg
neg mstride_reg
-%ifidn %2, h
+%ifidn %1, h
lea dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
+%if %3 == 8
lea dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif
@@ -2284,8 +2275,8 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%endif
; read
lea dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+%ifidn %1, v
+%if %3 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
@@ -2296,7 +2287,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
movrow m5, [dst2_reg] ; q1
movrow m6, [dst2_reg+ stride_reg] ; q2
movrow m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
movhps m0, [dst8_reg+mstride_reg*4]
movhps m2, [dst8_reg+mstride_reg*2]
add dst8_reg, stride_reg
@@ -2333,7 +2324,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
SWAP 6, 3
SWAP 5, 3
%else ; sse2 (h)
-%if %4 == 16
+%if %3 == 16
lea dst8_reg, [dst_reg + stride_reg*8]
%endif
@@ -2422,7 +2413,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
psubusb m6, m5 ; q2-q1
por m6, m4 ; abs(q2-q1)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
mova m4, flim_I
pxor m3, m3
psubusb m0, m4
@@ -2444,9 +2435,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
; normal_limit and high_edge_variance for p1-p0, q1-q0
SWAP 7, 3 ; now m7 is zero
-%ifidn %2, v
+%ifidn %1, v
movrow m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
movhps m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
@@ -2462,7 +2453,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
psubusb m1, m3 ; p1-p0
psubusb m6, m2 ; p0-p1
por m1, m6 ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
mova m6, m1
psubusb m1, m4
psubusb m6, hev_thr
@@ -2476,9 +2467,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%endif
SWAP 6, 4 ; now m6 is I
-%ifidn %2, v
+%ifidn %1, v
movrow m4, [dst_reg] ; q0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
movhps m4, [dst8_reg]
%endif
%elifdef m8
@@ -2493,7 +2484,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
psubusb m1, m5 ; q0-q1
psubusb m7, m4 ; q1-q0
por m1, m7 ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
mova m7, m1
psubusb m1, m6
psubusb m7, hev_thr
@@ -2605,7 +2596,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
paddusb m4, m1 ; q0-f1
; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
-%if ssse3_or_higher
+%if cpuflag(ssse3)
mova m7, [pb_1]
%else
mova m7, [pw_63]
@@ -2618,7 +2609,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
pxor m0, m0
mova m6, m1
pcmpgtb m0, m1 ; which are negative
-%if ssse3_or_higher
+%if cpuflag(ssse3)
punpcklbw m6, m7 ; interleave with "1" for rounding
punpckhbw m1, m7
%else
@@ -2626,7 +2617,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
punpckhbw m1, m0
%endif
mova lim_sign, m0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
mova m7, [pb_27_63]
%ifndef m8
mova lim_res, m1
@@ -2659,7 +2650,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
psubb m1, m6
pand m1, m0 ; -a0
pandn m0, m6 ; +a0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
mova m6, [pb_18_63] ; pipelining
%endif
psubusb m3, m1
@@ -2667,7 +2658,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
paddusb m3, m0 ; p0+a0
psubusb m4, m0 ; q0-a0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
SWAP 6, 7
%ifdef m10
SWAP 1, 10
@@ -2699,7 +2690,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
psubb m1, m6
pand m1, m0 ; -a1
pandn m0, m6 ; +a1
-%if ssse3_or_higher
+%if cpuflag(ssse3)
mova m6, [pb_9_63]
%endif
psubusb m2, m1
@@ -2707,7 +2698,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
paddusb m2, m0 ; p1+a1
psubusb m5, m0 ; q1-a1
-%if ssse3_or_higher
+%if cpuflag(ssse3)
SWAP 6, 7
%ifdef m10
SWAP 1, 10
@@ -2757,14 +2748,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
psubusb m6, m7 ; q1-a1
; store
-%ifidn %2, v
+%ifidn %1, v
movrow [dst2_reg+mstride_reg*4], m1
movrow [dst_reg +mstride_reg*2], m2
movrow [dst_reg +mstride_reg ], m3
movrow [dst_reg], m4
movrow [dst2_reg], m5
movrow [dst2_reg+ stride_reg ], m6
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
add dst8_reg, mstride_reg
movhps [dst8_reg+mstride_reg*2], m1
movhps [dst8_reg+mstride_reg ], m2
@@ -2788,14 +2779,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
%else ; sse2 (h)
lea dst8_reg, [dst8_reg+mstride_reg+1]
- WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
+ WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
lea dst_reg, [dst2_reg+mstride_reg+4]
lea dst8_reg, [dst8_reg+mstride_reg+4]
-%ifidn %1, sse4
+%if cpuflag(sse4)
add dst2_reg, 4
%endif
WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg
-%ifidn %1, sse4
+%if cpuflag(sse4)
lea dst2_reg, [dst8_reg+ stride_reg]
%endif
WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
@@ -2803,15 +2794,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%endif
%if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
+%if %3 == 8 ; chroma
+%ifidn %1, h
sub dst_reg, 5
%endif
cmp dst_reg, dst8_reg
mov dst_reg, dst8_reg
jnz .next8px
%else
-%ifidn %2, h
+%ifidn %1, h
lea dst_reg, [dst_reg + stride_reg*8-5]
%else ; v
add dst_reg, 8
@@ -2828,46 +2819,31 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%endmacro
%if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-MBEDGE_LOOPFILTER mmx, v, 6, 16, 0
-MBEDGE_LOOPFILTER mmx, h, 6, 16, 0
-MBEDGE_LOOPFILTER mmx, v, 6, 8, 0
-MBEDGE_LOOPFILTER mmx, h, 6, 8, 0
-
-%define SPLATB_REG SPLATB_REG_MMXEXT
-MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
-MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
-%endif
-
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-%define WRITE_8W WRITE_8W_SSE2
-MBEDGE_LOOPFILTER sse2, v, 5, 16, 15
-%ifdef m8
-MBEDGE_LOOPFILTER sse2, h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER sse2, h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER sse2, v, 6, 8, 15
-MBEDGE_LOOPFILTER sse2, h, 6, 8, 15
-
-%define SPLATB_REG SPLATB_REG_SSSE3
-MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15
-%ifdef m8
-MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15
-MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15
-
-%define WRITE_8W WRITE_8W_SSE4
-%ifdef m8
-MBEDGE_LOOPFILTER sse4, h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER sse4, h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER sse4, h, 6, 8, 15
+INIT_MMX mmx
+MBEDGE_LOOPFILTER v, 6, 16
+MBEDGE_LOOPFILTER h, 6, 16
+MBEDGE_LOOPFILTER v, 6, 8
+MBEDGE_LOOPFILTER h, 6, 8
+
+INIT_MMX mmx2
+MBEDGE_LOOPFILTER v, 6, 16
+MBEDGE_LOOPFILTER h, 6, 16
+MBEDGE_LOOPFILTER v, 6, 8
+MBEDGE_LOOPFILTER h, 6, 8
+%endif
+
+INIT_XMM sse2
+MBEDGE_LOOPFILTER v, 5, 16
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER v, 6, 8
+MBEDGE_LOOPFILTER h, 6, 8
+
+INIT_XMM ssse3
+MBEDGE_LOOPFILTER v, 5, 16
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER v, 6, 8
+MBEDGE_LOOPFILTER h, 6, 8
+
+INIT_XMM sse4
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER h, 6, 8