diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-03-05 00:02:58 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-03-05 00:15:55 +0100 |
commit | 2af8f2cea6c94eba3a15820194cb7374b366976a (patch) | |
tree | 634d34b8adf1c35cc1bb7c3eb1f2b49775ffbb56 /libavcodec | |
parent | 33a183df46355e4b281517e14c9b3c7e2b558dcf (diff) | |
parent | 3faa141d15bf9945fa54331e51b3f10b9970d5d2 (diff) | |
download | ffmpeg-2af8f2cea6c94eba3a15820194cb7374b366976a.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master: (27 commits)
cmdutils: use new avcodec_is_decoder/encoder() functions.
lavc: make codec_is_decoder/encoder() public.
lavc: deprecate AVCodecContext.sub_id.
libcdio: add a forgotten AVClass to the private context.
swscale: remove "cpu flags" from -sws_flags description.
proresenc: give user a possibility to alter some encoding parameters
vorbisenc: add output buffer overwrite protection
libopencore-amrnbenc: fix end-of-stream handling
ra144enc: fix end-of-stream handling
nellymoserenc: zero any leftover packet bytes
nellymoserenc: use proper MDCT overlap delay
qpeg: Use bytestream2 functions to prevent buffer overreads.
swscale: make %rep unconditional.
vp8: convert simple loopfilter x86 assembly to use named arguments.
vp8: convert idct x86 assembly to use named arguments.
vp8: convert mc x86 assembly to use named arguments.
vp8: convert loopfilter x86 assembly to use cpuflags().
vp8: convert idct/mc x86 assembly to use cpuflags().
swscale: remove now unnecessary hack.
x86inc: don't "bake" stack_offset in named arguments.
...
Conflicts:
cmdutils.c
doc/APIchanges
libavcodec/mpeg12.c
libavcodec/options.c
libavcodec/qpeg.c
libavcodec/utils.c
libavcodec/version.h
libavdevice/libcdio.c
tests/lavf-regression.sh
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/avcodec.h | 21 | ||||
-rw-r--r-- | libavcodec/libopencore-amr.c | 28 | ||||
-rw-r--r-- | libavcodec/mpeg12.c | 5 | ||||
-rw-r--r-- | libavcodec/mpegaudiodec.c | 2 | ||||
-rw-r--r-- | libavcodec/mpegaudiodecheader.c | 1 | ||||
-rw-r--r-- | libavcodec/mpegvideo_parser.c | 2 | ||||
-rw-r--r-- | libavcodec/nellymoserenc.c | 49 | ||||
-rw-r--r-- | libavcodec/options.c | 2 | ||||
-rw-r--r-- | libavcodec/proresenc_kostya.c | 178 | ||||
-rw-r--r-- | libavcodec/pthread.c | 1 | ||||
-rw-r--r-- | libavcodec/qpeg.c | 104 | ||||
-rw-r--r-- | libavcodec/ra144.h | 1 | ||||
-rw-r--r-- | libavcodec/ra144enc.c | 33 | ||||
-rw-r--r-- | libavcodec/rv10.c | 38 | ||||
-rw-r--r-- | libavcodec/utils.c | 20 | ||||
-rw-r--r-- | libavcodec/version.h | 5 | ||||
-rw-r--r-- | libavcodec/vorbisenc.c | 59 | ||||
-rw-r--r-- | libavcodec/wmaenc.c | 35 | ||||
-rw-r--r-- | libavcodec/x86/vp8dsp-init.c | 112 | ||||
-rw-r--r-- | libavcodec/x86/vp8dsp.asm | 1246 |
20 files changed, 1037 insertions, 905 deletions
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 592c5341ea..397fcfb7b5 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -1280,15 +1280,12 @@ typedef struct AVCodecContext { */ unsigned int stream_codec_tag; +#if FF_API_SUB_ID /** - * Some codecs need additional format info. It is stored here. - * If any muxer uses this then ALL demuxers/parsers AND encoders for the - * specific codec MUST set it correctly otherwise stream copy breaks. - * In general use of this field by muxers is not recommended. - * - encoding: Set by libavcodec. - * - decoding: Set by libavcodec. (FIXME: Is this OK?) + * @deprecated this field is unused */ - int sub_id; + attribute_deprecated int sub_id; +#endif void *priv_data; @@ -4504,4 +4501,14 @@ const AVClass *avcodec_get_frame_class(void); */ int avcodec_is_open(AVCodecContext *s); +/** + * @return a non-zero number if codec is an encoder, zero otherwise + */ +int av_codec_is_encoder(AVCodec *codec); + +/** + * @return a non-zero number if codec is a decoder, zero otherwise + */ +int av_codec_is_decoder(AVCodec *codec); + #endif /* AVCODEC_AVCODEC_H */ diff --git a/libavcodec/libopencore-amr.c b/libavcodec/libopencore-amr.c index 7a0555e6c5..90a8c651e2 100644 --- a/libavcodec/libopencore-amr.c +++ b/libavcodec/libopencore-amr.c @@ -85,6 +85,7 @@ typedef struct AMRContext { int enc_bitrate; int enc_mode; int enc_dtx; + int enc_last_frame; } AMRContext; static const AVOption options[] = { @@ -195,6 +196,7 @@ static av_cold int amr_nb_encode_init(AVCodecContext *avctx) } avctx->frame_size = 160; + avctx->delay = 50; avctx->coded_frame = avcodec_alloc_frame(); if (!avctx->coded_frame) return AVERROR(ENOMEM); @@ -227,17 +229,40 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, { AMRContext *s = avctx->priv_data; int written; + int16_t *flush_buf = NULL; + const int16_t *samples = data; if (s->enc_bitrate != avctx->bit_rate) { s->enc_mode = get_bitrate_mode(avctx->bit_rate, avctx); s->enc_bitrate = avctx->bit_rate; } - written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, data, + if (data) { + if (avctx->frame_size < 160) { + flush_buf = av_mallocz(160 * sizeof(*flush_buf)); + if (!flush_buf) + return AVERROR(ENOMEM); + memcpy(flush_buf, samples, avctx->frame_size * sizeof(*flush_buf)); + samples = flush_buf; + if (avctx->frame_size < 110) + s->enc_last_frame = -1; + } + } else { + if (s->enc_last_frame < 0) + return 0; + flush_buf = av_mallocz(160 * sizeof(*flush_buf)); + if (!flush_buf) + return AVERROR(ENOMEM); + samples = flush_buf; + s->enc_last_frame = -1; + } + + written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, samples, frame, 0); av_dlog(avctx, "amr_nb_encode_frame encoded %u bytes, bitrate %u, first byte was %#02x\n", written, s->enc_mode, frame[0]); + av_freep(&flush_buf); return written; } @@ -249,6 +274,7 @@ AVCodec ff_libopencore_amrnb_encoder = { .init = amr_nb_encode_init, .encode = amr_nb_encode_frame, .close = amr_nb_encode_close, + .capabilities = CODEC_CAP_DELAY | CODEC_CAP_SMALL_LAST_FRAME, .sample_fmts = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_S16,AV_SAMPLE_FMT_NONE}, .long_name = NULL_IF_CONFIG_SMALL("OpenCORE Adaptive Multi-Rate (AMR) Narrow-Band"), .priv_class = &class, diff --git a/libavcodec/mpeg12.c b/libavcodec/mpeg12.c index 5c5e09ec2d..548e26d9bc 100644 --- a/libavcodec/mpeg12.c +++ b/libavcodec/mpeg12.c @@ -1276,7 +1276,6 @@ static int mpeg_decode_postinit(AVCodecContext *avctx) * that behave like P-frames. */ avctx->has_b_frames = !s->low_delay; - assert((avctx->sub_id == 1) == (avctx->codec_id == CODEC_ID_MPEG1VIDEO)); if (avctx->codec_id == CODEC_ID_MPEG1VIDEO) { //MPEG-1 fps avctx->time_base.den = avpriv_frame_rate_tab[s->frame_rate_index].num; @@ -1420,7 +1419,6 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1) av_dlog(s->avctx, "sequence extension\n"); s->codec_id = s->avctx->codec_id = CODEC_ID_MPEG2VIDEO; - s->avctx->sub_id = 2; /* indicates MPEG-2 found */ if (s->avctx->debug & FF_DEBUG_PICT_INFO) av_log(s->avctx, AV_LOG_DEBUG, "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n", @@ -2038,7 +2036,6 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, s->frame_pred_frame_dct = 1; s->chroma_format = 1; s->codec_id = s->avctx->codec_id = CODEC_ID_MPEG1VIDEO; - avctx->sub_id = 1; /* indicates MPEG-1 */ s->out_format = FMT_MPEG1; s->swap_uv = 0; // AFAIK VCR2 does not have SEQ_HEADER if (s->flags & CODEC_FLAG_LOW_DELAY) @@ -2097,12 +2094,10 @@ static int vcr2_init_sequence(AVCodecContext *avctx) s->chroma_format = 1; if (s->codec_tag == AV_RL32("BW10")) { s->codec_id = s->avctx->codec_id = CODEC_ID_MPEG1VIDEO; - avctx->sub_id = 1; /* indicates MPEG-1 */ } else { exchange_uv(s); // common init reset pblocks, so we swap them here s->swap_uv = 1; // in case of xvmc we need to swap uv for each MB s->codec_id = s->avctx->codec_id = CODEC_ID_MPEG2VIDEO; - avctx->sub_id = 2; /* indicates MPEG-2 */ } s1->save_width = s->width; s1->save_height = s->height; diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index b80d7c771a..51db72a177 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -1659,7 +1659,6 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr, avctx->channel_layout = s->nb_channels == 1 ? AV_CH_LAYOUT_MONO : AV_CH_LAYOUT_STEREO; if (!avctx->bit_rate) avctx->bit_rate = s->bit_rate; - avctx->sub_id = s->layer; if (s->frame_size <= 0 || s->frame_size > buf_size) { av_log(avctx, AV_LOG_ERROR, "incomplete frame\n"); @@ -1732,7 +1731,6 @@ static int decode_frame_adu(AVCodecContext *avctx, void *data, avctx->channels = s->nb_channels; if (!avctx->bit_rate) avctx->bit_rate = s->bit_rate; - avctx->sub_id = s->layer; s->frame_size = len; diff --git a/libavcodec/mpegaudiodecheader.c b/libavcodec/mpegaudiodecheader.c index 24919ab544..b00d804d0e 100644 --- a/libavcodec/mpegaudiodecheader.c +++ b/libavcodec/mpegaudiodecheader.c @@ -142,6 +142,5 @@ int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_r *sample_rate = s->sample_rate; *channels = s->nb_channels; *bit_rate = s->bit_rate; - avctx->sub_id = s->layer; return s->frame_size; } diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c index 7a905a9e3f..776052d252 100644 --- a/libavcodec/mpegvideo_parser.c +++ b/libavcodec/mpegvideo_parser.c @@ -69,7 +69,6 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s, pc->frame_rate.num = avctx->time_base.num = avpriv_frame_rate_tab[frame_rate_index].den; avctx->bit_rate = ((buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6))*400; avctx->codec_id = CODEC_ID_MPEG1VIDEO; - avctx->sub_id = 1; } break; case EXT_START_CODE: @@ -94,7 +93,6 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s, avctx->time_base.den = pc->frame_rate.den * (frame_rate_ext_n + 1) * 2; avctx->time_base.num = pc->frame_rate.num * (frame_rate_ext_d + 1); avctx->codec_id = CODEC_ID_MPEG2VIDEO; - avctx->sub_id = 2; /* forces MPEG2 */ } break; case 0x8: /* picture coding extension */ diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c index 8e018c1b7f..29ad7a2e26 100644 --- a/libavcodec/nellymoserenc.c +++ b/libavcodec/nellymoserenc.c @@ -52,13 +52,11 @@ typedef struct NellyMoserEncodeContext { AVCodecContext *avctx; int last_frame; - int bufsel; - int have_saved; DSPContext dsp; FFTContext mdct_ctx; DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES]; DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES]; - DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN]; ///< sample buffer + DECLARE_ALIGNED(32, float, buf)[3 * NELLY_BUF_LEN]; ///< sample buffer float (*opt )[NELLY_BANDS]; uint8_t (*path)[NELLY_BANDS]; } NellyMoserEncodeContext; @@ -115,16 +113,17 @@ static const uint8_t quant_lut_offset[8] = { 0, 0, 1, 4, 11, 32, 81, 230 }; static void apply_mdct(NellyMoserEncodeContext *s) { - s->dsp.vector_fmul(s->in_buff, s->buf[s->bufsel], ff_sine_128, NELLY_BUF_LEN); - s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, - NELLY_BUF_LEN); + float *in0 = s->buf; + float *in1 = s->buf + NELLY_BUF_LEN; + float *in2 = s->buf + 2 * NELLY_BUF_LEN; + + s->dsp.vector_fmul (s->in_buff, in0, ff_sine_128, NELLY_BUF_LEN); + s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN); s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff); - s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, - ff_sine_128, NELLY_BUF_LEN); - s->dsp.vector_fmul_reverse(s->buf[s->bufsel] + 2 * NELLY_BUF_LEN, s->buf[1 - s->bufsel], ff_sine_128, - NELLY_BUF_LEN); - s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN); + s->dsp.vector_fmul (s->in_buff, in1, ff_sine_128, NELLY_BUF_LEN); + s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN); + s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->in_buff); } static av_cold int encode_end(AVCodecContext *avctx) @@ -161,6 +160,7 @@ static av_cold int encode_init(AVCodecContext *avctx) } avctx->frame_size = NELLY_SAMPLES; + avctx->delay = NELLY_BUF_LEN; s->avctx = avctx; if ((ret = ff_mdct_init(&s->mdct_ctx, 8, 0, 32768.0)) < 0) goto error; @@ -363,38 +363,33 @@ static void encode_block(NellyMoserEncodeContext *s, unsigned char *output, int } flush_put_bits(&pb); + memset(put_bits_ptr(&pb), 0, output + output_size - put_bits_ptr(&pb)); } static int encode_frame(AVCodecContext *avctx, uint8_t *frame, int buf_size, void *data) { NellyMoserEncodeContext *s = avctx->priv_data; const float *samples = data; - int i; if (s->last_frame) return 0; + memcpy(s->buf, s->buf + NELLY_SAMPLES, NELLY_BUF_LEN * sizeof(*s->buf)); if (data) { - memcpy(s->buf[s->bufsel], samples, avctx->frame_size * sizeof(*samples)); - for (i = avctx->frame_size; i < NELLY_SAMPLES; i++) { - s->buf[s->bufsel][i] = 0; - } - s->bufsel = 1 - s->bufsel; - if (!s->have_saved) { - s->have_saved = 1; - return 0; + memcpy(s->buf + NELLY_BUF_LEN, samples, avctx->frame_size * sizeof(*s->buf)); + if (avctx->frame_size < NELLY_SAMPLES) { + memset(s->buf + NELLY_BUF_LEN + avctx->frame_size, 0, + (NELLY_SAMPLES - avctx->frame_size) * sizeof(*s->buf)); + if (avctx->frame_size >= NELLY_BUF_LEN) + s->last_frame = 1; } } else { - memset(s->buf[s->bufsel], 0, sizeof(s->buf[0][0]) * NELLY_BUF_LEN); - s->bufsel = 1 - s->bufsel; + memset(s->buf + NELLY_BUF_LEN, 0, NELLY_SAMPLES * sizeof(*s->buf)); s->last_frame = 1; } - if (s->have_saved) { - encode_block(s, frame, buf_size); - return NELLY_BLOCK_LEN; - } - return 0; + encode_block(s, frame, buf_size); + return NELLY_BLOCK_LEN; } AVCodec ff_nellymoser_encoder = { diff --git a/libavcodec/options.c b/libavcodec/options.c index 04277dba4d..9aae191150 100644 --- a/libavcodec/options.c +++ b/libavcodec/options.c @@ -111,7 +111,9 @@ static const AVOption options[]={ {"noout", "skip bitstream encoding", 0, AV_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_NO_OUTPUT }, INT_MIN, INT_MAX, V|E, "flags2"}, {"local_header", "place global headers at every keyframe instead of in extradata", 0, AV_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_LOCAL_HEADER }, INT_MIN, INT_MAX, V|E, "flags2"}, {"showall", "Show all frames before the first keyframe", 0, AV_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_SHOW_ALL }, INT_MIN, INT_MAX, V|D, "flags2"}, +#if FF_API_SUB_ID {"sub_id", NULL, OFFSET(sub_id), AV_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX}, +#endif {"me_method", "set motion estimation method", OFFSET(me_method), AV_OPT_TYPE_INT, {.dbl = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method"}, {"zero", "zero motion estimation (fastest)", 0, AV_OPT_TYPE_CONST, {.dbl = ME_ZERO }, INT_MIN, INT_MAX, V|E, "me_method" }, {"full", "full motion estimation (slowest)", 0, AV_OPT_TYPE_CONST, {.dbl = ME_FULL }, INT_MIN, INT_MAX, V|E, "me_method" }, diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c index 7920d65ed1..16e64d1d08 100644 --- a/libavcodec/proresenc_kostya.c +++ b/libavcodec/proresenc_kostya.c @@ -42,6 +42,67 @@ enum { PRORES_PROFILE_HQ, }; +enum { + QUANT_MAT_PROXY = 0, + QUANT_MAT_LT, + QUANT_MAT_STANDARD, + QUANT_MAT_HQ, + QUANT_MAT_DEFAULT, +}; + +static const uint8_t prores_quant_matrices[][64] = { + { // proxy + 4, 7, 9, 11, 13, 14, 15, 63, + 7, 7, 11, 12, 14, 15, 63, 63, + 9, 11, 13, 14, 15, 63, 63, 63, + 11, 11, 13, 14, 63, 63, 63, 63, + 11, 13, 14, 63, 63, 63, 63, 63, + 13, 14, 63, 63, 63, 63, 63, 63, + 13, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + }, + { // LT + 4, 5, 6, 7, 9, 11, 13, 15, + 5, 5, 7, 8, 11, 13, 15, 17, + 6, 7, 9, 11, 13, 15, 15, 17, + 7, 7, 9, 11, 13, 15, 17, 19, + 7, 9, 11, 13, 14, 16, 19, 23, + 9, 11, 13, 14, 16, 19, 23, 29, + 9, 11, 13, 15, 17, 21, 28, 35, + 11, 13, 16, 17, 21, 28, 35, 41, + }, + { // standard + 4, 4, 5, 5, 6, 7, 7, 9, + 4, 4, 5, 6, 7, 7, 9, 9, + 5, 5, 6, 7, 7, 9, 9, 10, + 5, 5, 6, 7, 7, 9, 9, 10, + 5, 6, 7, 7, 8, 9, 10, 12, + 6, 7, 7, 8, 9, 10, 12, 15, + 6, 7, 7, 9, 10, 11, 14, 17, + 7, 7, 9, 10, 11, 14, 17, 21, + }, + { // high quality + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 5, + 4, 4, 4, 4, 4, 4, 5, 5, + 4, 4, 4, 4, 4, 5, 5, 6, + 4, 4, 4, 4, 5, 5, 6, 7, + 4, 4, 4, 4, 5, 6, 7, 7, + }, + { // codec default + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + }, +}; + #define NUM_MB_LIMITS 4 static const int prores_mb_limits[NUM_MB_LIMITS] = { 1620, // up to 720x576 @@ -56,7 +117,7 @@ static const struct prores_profile { int min_quant; int max_quant; int br_tab[NUM_MB_LIMITS]; - uint8_t quant[64]; + int quant; } prores_profile_info[4] = { { .full_name = "proxy", @@ -64,16 +125,7 @@ static const struct prores_profile { .min_quant = 4, .max_quant = 8, .br_tab = { 300, 242, 220, 194 }, - .quant = { - 4, 7, 9, 11, 13, 14, 15, 63, - 7, 7, 11, 12, 14, 15, 63, 63, - 9, 11, 13, 14, 15, 63, 63, 63, - 11, 11, 13, 14, 63, 63, 63, 63, - 11, 13, 14, 63, 63, 63, 63, 63, - 13, 14, 63, 63, 63, 63, 63, 63, - 13, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 63, 63, 63, - }, + .quant = QUANT_MAT_PROXY, }, { .full_name = "LT", @@ -81,16 +133,7 @@ static const struct prores_profile { .min_quant = 1, .max_quant = 9, .br_tab = { 720, 560, 490, 440 }, - .quant = { - 4, 5, 6, 7, 9, 11, 13, 15, - 5, 5, 7, 8, 11, 13, 15, 17, - 6, 7, 9, 11, 13, 15, 15, 17, - 7, 7, 9, 11, 13, 15, 17, 19, - 7, 9, 11, 13, 14, 16, 19, 23, - 9, 11, 13, 14, 16, 19, 23, 29, - 9, 11, 13, 15, 17, 21, 28, 35, - 11, 13, 16, 17, 21, 28, 35, 41, - }, + .quant = QUANT_MAT_LT, }, { .full_name = "standard", @@ -98,16 +141,7 @@ static const struct prores_profile { .min_quant = 1, .max_quant = 6, .br_tab = { 1050, 808, 710, 632 }, - .quant = { - 4, 4, 5, 5, 6, 7, 7, 9, - 4, 4, 5, 6, 7, 7, 9, 9, - 5, 5, 6, 7, 7, 9, 9, 10, - 5, 5, 6, 7, 7, 9, 9, 10, - 5, 6, 7, 7, 8, 9, 10, 12, - 6, 7, 7, 8, 9, 10, 12, 15, - 6, 7, 7, 9, 10, 11, 14, 17, - 7, 7, 9, 10, 11, 14, 17, 21, - }, + .quant = QUANT_MAT_STANDARD, }, { .full_name = "high quality", @@ -115,16 +149,7 @@ static const struct prores_profile { .min_quant = 1, .max_quant = 6, .br_tab = { 1566, 1216, 1070, 950 }, - .quant = { - 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 5, - 4, 4, 4, 4, 4, 4, 5, 5, - 4, 4, 4, 4, 4, 5, 5, 6, - 4, 4, 4, 4, 5, 5, 6, 7, - 4, 4, 4, 4, 5, 6, 7, 7, - }, + .quant = QUANT_MAT_HQ, } // for 4444 profile bitrate numbers are { 2350, 1828, 1600, 1425 } }; @@ -147,6 +172,7 @@ typedef struct ProresContext { DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16]; int16_t quants[MAX_STORED_Q][64]; int16_t custom_q[64]; + const uint8_t *quant_mat; ProresDSPContext dsp; ScanTable scantable; @@ -159,6 +185,9 @@ typedef struct ProresContext { int num_planes; int bits_per_mb; + char *vendor; + int quant_sel; + int frame_size; int profile; @@ -373,7 +402,7 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, } else { qmat = ctx->custom_q; for (i = 0; i < 64; i++) - qmat[i] = ctx->profile_info->quant[i] * quant; + qmat[i] = ctx->quant_mat[i] * quant; } for (i = 0; i < ctx->num_planes; i++) { @@ -591,7 +620,7 @@ static int find_slice_quant(AVCodecContext *avctx, const AVFrame *pic, } else { qmat = ctx->custom_q; for (i = 0; i < 64; i++) - qmat[i] = ctx->profile_info->quant[i] * q; + qmat[i] = ctx->quant_mat[i] * q; } for (i = 0; i < ctx->num_planes; i++) { bits += estimate_slice_plane(ctx, &error, i, @@ -684,7 +713,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, tmp = buf; buf += 2; // frame header size will be stored here bytestream_put_be16 (&buf, 0); // version 1 - bytestream_put_buffer(&buf, "Lavc", 4); // creator + bytestream_put_buffer(&buf, ctx->vendor, 4); bytestream_put_be16 (&buf, avctx->width); bytestream_put_be16 (&buf, avctx->height); bytestream_put_byte (&buf, ctx->chroma_factor << 6); // frame flags @@ -694,13 +723,17 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, bytestream_put_byte (&buf, avctx->colorspace); bytestream_put_byte (&buf, 0x40); // source format and alpha information bytestream_put_byte (&buf, 0); // reserved - bytestream_put_byte (&buf, 0x03); // matrix flags - both matrices are present - // luma quantisation matrix - for (i = 0; i < 64; i++) - bytestream_put_byte(&buf, ctx->profile_info->quant[i]); - // chroma quantisation matrix - for (i = 0; i < 64; i++) - bytestream_put_byte(&buf, ctx->profile_info->quant[i]); + if (ctx->quant_sel != QUANT_MAT_DEFAULT) { + bytestream_put_byte (&buf, 0x03); // matrix flags - both matrices are present + // luma quantisation matrix + for (i = 0; i < 64; i++) + bytestream_put_byte(&buf, ctx->quant_mat[i]); + // chroma quantisation matrix + for (i = 0; i < 64; i++) + bytestream_put_byte(&buf, ctx->quant_mat[i]); + } else { + bytestream_put_byte (&buf, 0x00); // matrix flags - default matrices are used + } bytestream_put_be16 (&tmp, buf - orig_buf); // write back frame header size // picture header @@ -816,10 +849,25 @@ static av_cold int encode_init(AVCodecContext *avctx) ctx->slices_width += av_popcount(ctx->mb_width - ctx->slices_width * mps); ctx->num_slices = ctx->mb_height * ctx->slices_width; - for (i = 0; i < NUM_MB_LIMITS - 1; i++) - if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height) - break; - ctx->bits_per_mb = ctx->profile_info->br_tab[i]; + if (ctx->quant_sel == -1) + ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant]; + else + ctx->quant_mat = prores_quant_matrices[ctx->quant_sel]; + + if (strlen(ctx->vendor) != 4) { + av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n"); + return AVERROR_INVALIDDATA; + } + + if (!ctx->bits_per_mb) { + for (i = 0; i < NUM_MB_LIMITS - 1; i++) + if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height) + break; + ctx->bits_per_mb = ctx->profile_info->br_tab[i]; + } else if (ctx->bits_per_mb < 128) { + av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n"); + return AVERROR_INVALIDDATA; + } ctx->frame_size = ctx->num_slices * (2 + 2 * ctx->num_planes + (2 * mps * ctx->bits_per_mb) / 8) @@ -829,7 +877,7 @@ static av_cold int encode_init(AVCodecContext *avctx) max_quant = ctx->profile_info->max_quant; for (i = min_quant; i < MAX_STORED_Q; i++) { for (j = 0; j < 64; j++) - ctx->quants[i][j] = ctx->profile_info->quant[j] * i; + ctx->quants[i][j] = ctx->quant_mat[j] * i; } avctx->codec_tag = ctx->profile_info->tag; @@ -877,6 +925,24 @@ static const AVOption options[] = { 0, 0, VE, "profile" }, { "hq", NULL, 0, AV_OPT_TYPE_CONST, { PRORES_PROFILE_HQ }, 0, 0, VE, "profile" }, + { "vendor", "vendor ID", OFFSET(vendor), + AV_OPT_TYPE_STRING, { .str = "Lavc" }, CHAR_MIN, CHAR_MAX, VE }, + { "bits_per_mb", "desired bits per macroblock", OFFSET(bits_per_mb), + AV_OPT_TYPE_INT, { 0 }, 0, 8192, VE }, + { "quant_mat", "quantiser matrix", OFFSET(quant_sel), AV_OPT_TYPE_INT, + { -1 }, -1, QUANT_MAT_DEFAULT, VE, "quant_mat" }, + { "auto", NULL, 0, AV_OPT_TYPE_CONST, { -1 }, + 0, 0, VE, "quant_mat" }, + { "proxy", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_PROXY }, + 0, 0, VE, "quant_mat" }, + { "lt", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_LT }, + 0, 0, VE, "quant_mat" }, + { "standard", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_STANDARD }, + 0, 0, VE, "quant_mat" }, + { "hq", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_HQ }, + 0, 0, VE, "quant_mat" }, + { "default", NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_DEFAULT }, + 0, 0, VE, "quant_mat" }, { NULL } }; diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c index 8b63288adc..c4e8aab952 100644 --- a/libavcodec/pthread.c +++ b/libavcodec/pthread.c @@ -416,7 +416,6 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src, int err = 0; if (dst != src) { - dst->sub_id = src->sub_id; dst->time_base = src->time_base; dst->width = src->width; dst->height = src->height; diff --git a/libavcodec/qpeg.c b/libavcodec/qpeg.c index 2d5ae690ca..a0ddfae647 100644 --- a/libavcodec/qpeg.c +++ b/libavcodec/qpeg.c @@ -25,15 +25,17 @@ */ #include "avcodec.h" +#include "bytestream.h" typedef struct QpegContext{ AVCodecContext *avctx; AVFrame pic, ref; uint32_t pal[256]; + GetByteContext buffer; } QpegContext; -static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size, - int stride, int width, int height) +static void qpeg_decode_intra(QpegContext *qctx, uint8_t *dst, + int stride, int width, int height) { int i; int code; @@ -46,31 +48,26 @@ static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size, height--; dst = dst + height * stride; - while((size > 0) && (rows_to_go > 0)) { - code = *src++; - size--; + while ((bytestream2_get_bytes_left(&qctx->buffer) > 0) && (rows_to_go > 0)) { + code = bytestream2_get_byte(&qctx->buffer); run = copy = 0; if(code == 0xFC) /* end-of-picture code */ break; if(code >= 0xF8) { /* very long run */ - c0 = *src++; - c1 = *src++; - size -= 2; + c0 = bytestream2_get_byte(&qctx->buffer); + c1 = bytestream2_get_byte(&qctx->buffer); run = ((code & 0x7) << 16) + (c0 << 8) + c1 + 2; } else if (code >= 0xF0) { /* long run */ - c0 = *src++; - size--; + c0 = bytestream2_get_byte(&qctx->buffer); run = ((code & 0xF) << 8) + c0 + 2; } else if (code >= 0xE0) { /* short run */ run = (code & 0x1F) + 2; } else if (code >= 0xC0) { /* very long copy */ - c0 = *src++; - c1 = *src++; - size -= 2; + c0 = bytestream2_get_byte(&qctx->buffer); + c1 = bytestream2_get_byte(&qctx->buffer); copy = ((code & 0x3F) << 16) + (c0 << 8) + c1 + 1; } else if (code >= 0x80) { /* long copy */ - c0 = *src++; - size--; + c0 = bytestream2_get_byte(&qctx->buffer); copy = ((code & 0x7F) << 8) + c0 + 1; } else { /* short copy */ copy = code + 1; @@ -80,8 +77,7 @@ static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size, if(run) { int p; - p = *src++; - size--; + p = bytestream2_get_byte(&qctx->buffer); for(i = 0; i < run; i++) { dst[filled++] = p; if (filled >= width) { @@ -93,11 +89,8 @@ static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size, } } } else { - size -= copy; - if (size<0) - return AVERROR_INVALIDDATA; for(i = 0; i < copy; i++) { - dst[filled++] = *src++; + dst[filled++] = bytestream2_get_byte(&qctx->buffer); if (filled >= width) { filled = 0; dst -= stride; @@ -108,7 +101,6 @@ static int qpeg_decode_intra(const uint8_t *src, uint8_t *dst, int size, } } } - return 0; } static const int qpeg_table_h[16] = @@ -117,9 +109,10 @@ static const int qpeg_table_w[16] = { 0x00, 0x20, 0x18, 0x08, 0x18, 0x10, 0x20, 0x10, 0x08, 0x10, 0x20, 0x20, 0x08, 0x10, 0x18, 0x04}; /* Decodes delta frames */ -static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size, - int stride, int width, int height, - int delta, const uint8_t *ctable, uint8_t *refdata) +static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst, + int stride, int width, int height, + int delta, const uint8_t *ctable, + uint8_t *refdata) { int i, j; int code; @@ -137,13 +130,12 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size, height--; dst = dst + height * stride; - while((size > 0) && (height >= 0)) { - code = *src++; - size--; + while ((bytestream2_get_bytes_left(&qctx->buffer) > 0) && (height >= 0)) { + code = bytestream2_get_byte(&qctx->buffer); if(delta) { /* motion compensation */ - while(size > 0 && (code & 0xF0) == 0xF0) { + while(bytestream2_get_bytes_left(&qctx->buffer) > 0 && (code & 0xF0) == 0xF0) { if(delta == 1) { int me_idx; int me_w, me_h, me_x, me_y; @@ -156,8 +148,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size, me_h = qpeg_table_h[me_idx]; /* extract motion vector */ - corr = *src++; - size--; + corr = bytestream2_get_byte(&qctx->buffer); val = corr >> 4; if(val > 7) @@ -184,8 +175,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size, } } } - code = *src++; - size--; + code = bytestream2_get_byte(&qctx->buffer); } } @@ -195,8 +185,7 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size, int p; code &= 0x1F; - p = *src++; - size--; + p = bytestream2_get_byte(&qctx->buffer); for(i = 0; i <= code; i++) { dst[filled++] = p; if(filled >= width) { @@ -210,11 +199,11 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size, } else if(code >= 0xC0) { /* copy code: 0xC0..0xDF */ code &= 0x1F; - if(code + 1 > size) + if(code + 1 > bytestream2_get_bytes_left(&qctx->buffer)) break; for(i = 0; i <= code; i++) { - dst[filled++] = *src++; + dst[filled++] = bytestream2_get_byte(&qctx->buffer); if(filled >= width) { filled = 0; dst -= stride; @@ -223,18 +212,17 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size, break; } } - size -= code + 1; } else if(code >= 0x80) { /* skip code: 0x80..0xBF */ int skip; code &= 0x3F; /* codes 0x80 and 0x81 are actually escape codes, skip value minus constant is in the next byte */ - if(!code) { - skip = (*src++) + 64; size--; - } else if(code == 1) { - skip = (*src++) + 320; size--; - } else + if(!code) + skip = bytestream2_get_byte(&qctx->buffer) + 64; + else if(code == 1) + skip = bytestream2_get_byte(&qctx->buffer) + 320; + else skip = code; filled += skip; while( filled >= width) { @@ -246,8 +234,9 @@ static void qpeg_decode_inter(const uint8_t *src, uint8_t *dst, int size, } } else { /* zero code treated as one-pixel skip */ - if(code) + if(code) { dst[filled++] = ctable[code & 0x7F]; + } else filled++; if(filled >= width) { @@ -263,8 +252,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) { - const uint8_t *buf = avpkt->data; - int buf_size = avpkt->size; + uint8_t ctable[128]; QpegContext * const a = avctx->priv_data; AVFrame * p = &a->pic; AVFrame * ref= &a->ref; @@ -272,6 +260,13 @@ static int decode_frame(AVCodecContext *avctx, int delta, ret = 0; const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL); + if (avpkt->size < 0x86) { + av_log(avctx, AV_LOG_ERROR, "Packet is too small\n"); + return AVERROR_INVALIDDATA; + } + + bytestream2_init(&a->buffer, avpkt->data, avpkt->size); + if(ref->data[0]) avctx->release_buffer(avctx, ref); FFSWAP(AVFrame, *ref, *p); @@ -282,16 +277,17 @@ static int decode_frame(AVCodecContext *avctx, return -1; } outdata = a->pic.data[0]; - if(buf[0x85] == 0x10) { - ret = qpeg_decode_intra(buf+0x86, outdata, buf_size - 0x86, a->pic.linesize[0], avctx->width, avctx->height); + bytestream2_skip(&a->buffer, 4); + bytestream2_get_buffer(&a->buffer, ctable, 128); + bytestream2_skip(&a->buffer, 1); + + delta = bytestream2_get_byte(&a->buffer); + if(delta == 0x10) { + qpeg_decode_intra(a, outdata, a->pic.linesize[0], avctx->width, avctx->height); } else { - delta = buf[0x85]; - qpeg_decode_inter(buf+0x86, outdata, buf_size - 0x86, a->pic.linesize[0], avctx->width, avctx->height, delta, buf + 4, a->ref.data[0]); + qpeg_decode_inter(a, outdata, a->pic.linesize[0], avctx->width, avctx->height, delta, ctable, a->ref.data[0]); } - if (ret<0) - return ret; - /* make the palette available on the way out */ if (pal) { a->pic.palette_has_changed = 1; @@ -302,7 +298,7 @@ static int decode_frame(AVCodecContext *avctx, *data_size = sizeof(AVFrame); *(AVFrame*)data = a->pic; - return buf_size; + return avpkt->size; } static av_cold int decode_init(AVCodecContext *avctx){ diff --git a/libavcodec/ra144.h b/libavcodec/ra144.h index 9665534f7b..83c0899cc8 100644 --- a/libavcodec/ra144.h +++ b/libavcodec/ra144.h @@ -36,6 +36,7 @@ typedef struct { AVCodecContext *avctx; AVFrame frame; LPCContext lpc_ctx; + int last_frame; unsigned int old_energy; ///< previous frame energy diff --git a/libavcodec/ra144enc.c b/libavcodec/ra144enc.c index b3710e871b..caa7d16b30 100644 --- a/libavcodec/ra144enc.c +++ b/libavcodec/ra144enc.c @@ -53,6 +53,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx) return -1; } avctx->frame_size = NBLOCKS * BLOCKSIZE; + avctx->delay = avctx->frame_size; avctx->bit_rate = 8000; ractx = avctx->priv_data; ractx->lpc_coef[0] = ractx->lpc_tables[0]; @@ -433,7 +434,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame, { static const uint8_t sizes[LPC_ORDER] = {64, 32, 32, 16, 16, 8, 8, 8, 8, 4}; static const uint8_t bit_sizes[LPC_ORDER] = {6, 5, 5, 4, 4, 3, 3, 3, 3, 2}; - RA144Context *ractx; + RA144Context *ractx = avctx->priv_data; PutBitContext pb; int32_t lpc_data[NBLOCKS * BLOCKSIZE]; int32_t lpc_coefs[LPC_ORDER][MAX_LPC_ORDER]; @@ -445,11 +446,13 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame, int energy = 0; int i, idx; + if (ractx->last_frame) + return 0; + if (buf_size < FRAMESIZE) { av_log(avctx, AV_LOG_ERROR, "output buffer too small\n"); return 0; } - ractx = avctx->priv_data; /** * Since the LPC coefficients are calculated on a frame centered over the @@ -462,11 +465,15 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame, lpc_data[i] = ractx->curr_block[BLOCKSIZE + BLOCKSIZE / 2 + i]; energy += (lpc_data[i] * lpc_data[i]) >> 4; } - for (i = 2 * BLOCKSIZE + BLOCKSIZE / 2; i < NBLOCKS * BLOCKSIZE; i++) { - lpc_data[i] = *((int16_t *)data + i - 2 * BLOCKSIZE - BLOCKSIZE / 2) >> - 2; - energy += (lpc_data[i] * lpc_data[i]) >> 4; + if (data) { + int j; + for (j = 0; j < avctx->frame_size && i < NBLOCKS * BLOCKSIZE; i++, j++) { + lpc_data[i] = samples[j] >> 2; + energy += (lpc_data[i] * lpc_data[i]) >> 4; + } } + if (i < NBLOCKS * BLOCKSIZE) + memset(&lpc_data[i], 0, (NBLOCKS * BLOCKSIZE - i) * sizeof(*lpc_data)); energy = ff_energy_tab[quantize(ff_t_sqrt(energy >> 5) >> 10, ff_energy_tab, 32)]; @@ -515,8 +522,17 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame, ractx->old_energy = energy; ractx->lpc_refl_rms[1] = ractx->lpc_refl_rms[0]; FFSWAP(unsigned int *, ractx->lpc_coef[0], ractx->lpc_coef[1]); - for (i = 0; i < NBLOCKS * BLOCKSIZE; i++) - ractx->curr_block[i] = samples[i] >> 2; + + /* copy input samples to current block for processing in next call */ + i = 0; + if (data) { + for (; i < avctx->frame_size; i++) + ractx->curr_block[i] = samples[i] >> 2; + } else + ractx->last_frame = 1; + memset(&ractx->curr_block[i], 0, + (NBLOCKS * BLOCKSIZE - i) * sizeof(*ractx->curr_block)); + return FRAMESIZE; } @@ -529,6 +545,7 @@ AVCodec ff_ra_144_encoder = { .init = ra144_encode_init, .encode = ra144_encode_frame, .close = ra144_encode_close, + .capabilities = CODEC_CAP_DELAY | CODEC_CAP_SMALL_LAST_FRAME, .sample_fmts = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE }, .long_name = NULL_IF_CONFIG_SMALL("RealAudio 1.0 (14.4K)"), diff --git a/libavcodec/rv10.c b/libavcodec/rv10.c index bc86b69a2e..ab382ac7bc 100644 --- a/libavcodec/rv10.c +++ b/libavcodec/rv10.c @@ -40,6 +40,11 @@ #define DC_VLC_BITS 14 //FIXME find a better solution +typedef struct RVDecContext { + MpegEncContext m; + int sub_id; +} RVDecContext; + static const uint16_t rv_lum_code[256] = { 0x3e7f, 0x0f00, 0x0f01, 0x0f02, 0x0f03, 0x0f04, 0x0f05, 0x0f06, @@ -293,8 +298,9 @@ static int rv10_decode_picture_header(MpegEncContext *s) return mb_count; } -static int rv20_decode_picture_header(MpegEncContext *s) +static int rv20_decode_picture_header(RVDecContext *rv) { + MpegEncContext *s = &rv->m; int seq, mb_pos, i; int rpr_bits; @@ -342,10 +348,10 @@ static int rv20_decode_picture_header(MpegEncContext *s) return -1; } - if(RV_GET_MINOR_VER(s->avctx->sub_id) >= 2) + if(RV_GET_MINOR_VER(rv->sub_id) >= 2) s->loop_filter = get_bits1(&s->gb); - if(RV_GET_MINOR_VER(s->avctx->sub_id) <= 1) + if(RV_GET_MINOR_VER(rv->sub_id) <= 1) seq = get_bits(&s->gb, 8) << 7; else seq = get_bits(&s->gb, 13) << 2; @@ -410,7 +416,7 @@ static int rv20_decode_picture_header(MpegEncContext *s) av_log(s->avctx, AV_LOG_DEBUG, "\n");*/ s->no_rounding= get_bits1(&s->gb); - if(RV_GET_MINOR_VER(s->avctx->sub_id) <= 1 && s->pict_type == AV_PICTURE_TYPE_B) + if(RV_GET_MINOR_VER(rv->sub_id) <= 1 && s->pict_type == AV_PICTURE_TYPE_B) skip_bits(&s->gb, 5); // binary decoder reads 3+2 bits here but they don't seem to be used s->f_code = 1; @@ -435,7 +441,8 @@ av_log(s->avctx, AV_LOG_DEBUG, "\n");*/ static av_cold int rv10_decode_init(AVCodecContext *avctx) { - MpegEncContext *s = avctx->priv_data; + RVDecContext *rv = avctx->priv_data; + MpegEncContext *s = &rv->m; static int done=0; int major_ver, minor_ver, micro_ver; @@ -454,11 +461,11 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx) s->orig_height= s->height = avctx->coded_height; s->h263_long_vectors= ((uint8_t*)avctx->extradata)[3] & 1; - avctx->sub_id= AV_RB32((uint8_t*)avctx->extradata + 4); + rv->sub_id = AV_RB32((uint8_t*)avctx->extradata + 4); - major_ver = RV_GET_MAJOR_VER(avctx->sub_id); - minor_ver = RV_GET_MINOR_VER(avctx->sub_id); - micro_ver = RV_GET_MICRO_VER(avctx->sub_id); + major_ver = RV_GET_MAJOR_VER(rv->sub_id); + minor_ver = RV_GET_MINOR_VER(rv->sub_id); + micro_ver = RV_GET_MICRO_VER(rv->sub_id); s->low_delay = 1; switch (major_ver) { @@ -473,13 +480,13 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx) } break; default: - av_log(s->avctx, AV_LOG_ERROR, "unknown header %X\n", avctx->sub_id); + av_log(s->avctx, AV_LOG_ERROR, "unknown header %X\n", rv->sub_id); av_log_missing_feature(avctx, "RV1/2 version", 1); return AVERROR_PATCHWELCOME; } if(avctx->debug & FF_DEBUG_PICT_INFO){ - av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", avctx->sub_id, avctx->extradata_size >= 4 ? ((uint32_t*)avctx->extradata)[0] : -1); + av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", rv->sub_id, avctx->extradata_size >= 4 ? ((uint32_t*)avctx->extradata)[0] : -1); } avctx->pix_fmt = PIX_FMT_YUV420P; @@ -514,7 +521,8 @@ static av_cold int rv10_decode_end(AVCodecContext *avctx) static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf, int buf_size, int buf_size2) { - MpegEncContext *s = avctx->priv_data; + RVDecContext *rv = avctx->priv_data; + MpegEncContext *s = &rv->m; int mb_count, mb_pos, left, start_mb_x, active_bits_size; active_bits_size = buf_size * 8; @@ -522,7 +530,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, if(s->codec_id ==CODEC_ID_RV10) mb_count = rv10_decode_picture_header(s); else - mb_count = rv20_decode_picture_header(s); + mb_count = rv20_decode_picture_header(rv); if (mb_count < 0) { av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n"); return -1; @@ -733,7 +741,7 @@ AVCodec ff_rv10_decoder = { .name = "rv10", .type = AVMEDIA_TYPE_VIDEO, .id = CODEC_ID_RV10, - .priv_data_size = sizeof(MpegEncContext), + .priv_data_size = sizeof(RVDecContext), .init = rv10_decode_init, .close = rv10_decode_end, .decode = rv10_decode_frame, @@ -747,7 +755,7 @@ AVCodec ff_rv20_decoder = { .name = "rv20", .type = AVMEDIA_TYPE_VIDEO, .id = CODEC_ID_RV20, - .priv_data_size = sizeof(MpegEncContext), + .priv_data_size = sizeof(RVDecContext), .init = rv10_decode_init, .close = rv10_decode_end, .decode = rv10_decode_frame, diff --git a/libavcodec/utils.c b/libavcodec/utils.c index 47e29c9151..6c49905e65 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -118,12 +118,12 @@ static void avcodec_init(void) ff_dsputil_static_init(); } -static av_always_inline int codec_is_encoder(AVCodec *codec) +int av_codec_is_encoder(AVCodec *codec) { return codec && (codec->encode || codec->encode2); } -static av_always_inline int codec_is_decoder(AVCodec *codec) +int av_codec_is_decoder(AVCodec *codec) { return codec && codec->decode; } @@ -798,7 +798,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD /* if the decoder init function was already called previously, free the already allocated subtitle_header before overwriting it */ - if (codec_is_decoder(codec)) + if (av_codec_is_decoder(codec)) av_freep(&avctx->subtitle_header); #define SANE_NB_CHANNELS 128U @@ -845,7 +845,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD ret = AVERROR(EINVAL); goto free_and_end; } - if (codec_is_encoder(avctx->codec)) { + if (av_codec_is_encoder(avctx->codec)) { int i; if (avctx->codec->sample_fmts) { for (i = 0; avctx->codec->sample_fmts[i] != AV_SAMPLE_FMT_NONE; i++) @@ -914,7 +914,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD } } - if (codec_is_decoder(avctx->codec) && !avctx->bit_rate) + if (av_codec_is_decoder(avctx->codec) && !avctx->bit_rate) avctx->bit_rate = get_bit_rate(avctx); ret=0; @@ -1527,7 +1527,7 @@ av_cold int avcodec_close(AVCodecContext *avctx) av_opt_free(avctx->priv_data); av_opt_free(avctx); av_freep(&avctx->priv_data); - if (codec_is_encoder(avctx->codec)) + if (av_codec_is_encoder(avctx->codec)) av_freep(&avctx->extradata); avctx->codec = NULL; avctx->active_thread_type = 0; @@ -1556,7 +1556,7 @@ AVCodec *avcodec_find_encoder(enum CodecID id) p = first_avcodec; id= remap_deprecated_codec_id(id); while (p) { - if (codec_is_encoder(p) && p->id == id) { + if (av_codec_is_encoder(p) && p->id == id) { if (p->capabilities & CODEC_CAP_EXPERIMENTAL && !experimental) { experimental = p; } else @@ -1574,7 +1574,7 @@ AVCodec *avcodec_find_encoder_by_name(const char *name) return NULL; p = first_avcodec; while (p) { - if (codec_is_encoder(p) && strcmp(name,p->name) == 0) + if (av_codec_is_encoder(p) && strcmp(name,p->name) == 0) return p; p = p->next; } @@ -1587,7 +1587,7 @@ AVCodec *avcodec_find_decoder(enum CodecID id) p = first_avcodec; id= remap_deprecated_codec_id(id); while (p) { - if (codec_is_decoder(p) && p->id == id) { + if (av_codec_is_decoder(p) && p->id == id) { if (p->capabilities & CODEC_CAP_EXPERIMENTAL && !experimental) { experimental = p; } else @@ -1605,7 +1605,7 @@ AVCodec *avcodec_find_decoder_by_name(const char *name) return NULL; p = first_avcodec; while (p) { - if (codec_is_decoder(p) && strcmp(name,p->name) == 0) + if (av_codec_is_decoder(p) && strcmp(name,p->name) == 0) return p; p = p->next; } diff --git a/libavcodec/version.h b/libavcodec/version.h index 99691f21bd..b8d2bbf2b3 100644 --- a/libavcodec/version.h +++ b/libavcodec/version.h @@ -21,7 +21,7 @@ #define AVCODEC_VERSION_H #define LIBAVCODEC_VERSION_MAJOR 54 -#define LIBAVCODEC_VERSION_MINOR 7 +#define LIBAVCODEC_VERSION_MINOR 8 #define LIBAVCODEC_VERSION_MICRO 100 #define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \ @@ -69,5 +69,8 @@ #ifndef FF_API_INTER_THRESHOLD #define FF_API_INTER_THRESHOLD (LIBAVCODEC_VERSION_MAJOR < 55) #endif +#ifndef FF_API_SUB_ID +#define FF_API_SUB_ID (LIBAVCODEC_VERSION_MAJOR < 55) +#endif #endif /* AVCODEC_VERSION_H */ diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c index e0c8d0a193..12f21553e5 100644 --- a/libavcodec/vorbisenc.c +++ b/libavcodec/vorbisenc.c @@ -137,13 +137,16 @@ typedef struct { #define RESIDUE_PART_SIZE 32 #define NUM_RESIDUE_PARTITIONS (RESIDUE_SIZE/RESIDUE_PART_SIZE) -static inline void put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb, - int entry) +static inline int put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb, + int entry) { assert(entry >= 0); assert(entry < cb->nentries); assert(cb->lens[entry]); + if (pb->size_in_bits - put_bits_count(pb) < cb->lens[entry]) + return AVERROR(EINVAL); put_bits(pb, cb->lens[entry], cb->codewords[entry]); + return 0; } static int cb_lookup_vals(int lookup, int dimentions, int entries) @@ -751,14 +754,16 @@ static int render_point(int x0, int y0, int x1, int y1, int x) return y0 + (x - x0) * (y1 - y0) / (x1 - x0); } -static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc, - PutBitContext *pb, uint16_t *posts, - float *floor, int samples) +static int floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc, + PutBitContext *pb, uint16_t *posts, + float *floor, int samples) { int range = 255 / fc->multiplier + 1; int coded[MAX_FLOOR_VALUES]; // first 2 values are unused int i, counter; + if (pb->size_in_bits - put_bits_count(pb) < 1 + 2 * ilog(range - 1)) + return AVERROR(EINVAL); put_bits(pb, 1, 1); // non zero put_bits(pb, ilog(range - 1), posts[0]); put_bits(pb, ilog(range - 1), posts[1]); @@ -816,7 +821,8 @@ static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc, cval |= l << cshift; cshift += c->subclass; } - put_codeword(pb, book, cval); + if (put_codeword(pb, book, cval)) + return AVERROR(EINVAL); } for (k = 0; k < c->dim; k++) { int book = c->books[cval & (csub-1)]; @@ -826,12 +832,15 @@ static void floor_encode(vorbis_enc_context *venc, vorbis_enc_floor *fc, continue; if (entry == -1) entry = 0; - put_codeword(pb, &venc->codebooks[book], entry); + if (put_codeword(pb, &venc->codebooks[book], entry)) + return AVERROR(EINVAL); } } ff_vorbis_floor1_render_list(fc->list, fc->values, posts, coded, fc->multiplier, floor, samples); + + return 0; } static float *put_vector(vorbis_enc_codebook *book, PutBitContext *pb, @@ -852,13 +861,14 @@ static float *put_vector(vorbis_enc_codebook *book, PutBitContext *pb, distance = d; } } - put_codeword(pb, book, entry); + if (put_codeword(pb, book, entry)) + return NULL; return &book->dimentions[entry * book->ndimentions]; } -static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc, - PutBitContext *pb, float *coeffs, int samples, - int real_ch) +static int residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc, + PutBitContext *pb, float *coeffs, int samples, + int real_ch) { int pass, i, j, p, k; int psize = rc->partition_size; @@ -894,7 +904,8 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc, entry *= rc->classifications; entry += classes[j][p + i]; } - put_codeword(pb, book, entry); + if (put_codeword(pb, book, entry)) + return AVERROR(EINVAL); } for (i = 0; i < classwords && p < partitions; i++, p++) { for (j = 0; j < channels; j++) { @@ -909,8 +920,10 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc, if (rc->type == 0) { for (k = 0; k < psize; k += book->ndimentions) { - float *a = put_vector(book, pb, &buf[k]); int l; + float *a = put_vector(book, pb, &buf[k]); + if (!a) + return AVERROR(EINVAL); for (l = 0; l < book->ndimentions; l++) buf[k + l] -= a[l]; } @@ -930,6 +943,8 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc, } } pv = put_vector(book, pb, vec); + if (!pv) + return AVERROR(EINVAL); for (dim = book->ndimentions; dim--; ) { coeffs[a1 + b1] -= *pv++; if ((a1 += samples) == s) { @@ -943,6 +958,7 @@ static void residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc, } } } + return 0; } static int apply_window_and_mdct(vorbis_enc_context *venc, const signed short *audio, @@ -1016,6 +1032,11 @@ static int vorbis_encode_frame(AVCodecContext *avccontext, init_put_bits(&pb, packets, buf_size); + if (pb.size_in_bits - put_bits_count(&pb) < 1 + ilog(venc->nmodes - 1)) { + av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n"); + return AVERROR(EINVAL); + } + put_bits(&pb, 1, 0); // magic bit put_bits(&pb, ilog(venc->nmodes - 1), 0); // 0 bits, the mode @@ -1031,7 +1052,10 @@ static int vorbis_encode_frame(AVCodecContext *avccontext, vorbis_enc_floor *fc = &venc->floors[mapping->floor[mapping->mux[i]]]; uint16_t posts[MAX_FLOOR_VALUES]; floor_fit(venc, fc, &venc->coeffs[i * samples], posts, samples); - floor_encode(venc, fc, &pb, posts, &venc->floor[i * samples], samples); + if (floor_encode(venc, fc, &pb, posts, &venc->floor[i * samples], samples)) { + av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n"); + return AVERROR(EINVAL); + } } for (i = 0; i < venc->channels * samples; i++) @@ -1051,8 +1075,11 @@ static int vorbis_encode_frame(AVCodecContext *avccontext, } } - residue_encode(venc, &venc->residues[mapping->residue[mapping->mux[0]]], - &pb, venc->coeffs, samples, venc->channels); + if (residue_encode(venc, &venc->residues[mapping->residue[mapping->mux[0]]], + &pb, venc->coeffs, samples, venc->channels)) { + av_log(avccontext, AV_LOG_ERROR, "output buffer is too small\n"); + return AVERROR(EINVAL); + } avccontext->coded_frame->pts = venc->sample_count; venc->sample_count += avccontext->frame_size; diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c index 6ec6d7ce57..29baa10230 100644 --- a/libavcodec/wmaenc.c +++ b/libavcodec/wmaenc.c @@ -39,6 +39,12 @@ static int encode_init(AVCodecContext * avctx){ return AVERROR(EINVAL); } + if (avctx->sample_rate > 48000) { + av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz", + avctx->sample_rate); + return AVERROR(EINVAL); + } + if(avctx->bit_rate < 24*1000) { av_log(avctx, AV_LOG_ERROR, "bitrate too low: got %i, need 24000 or higher\n", avctx->bit_rate); @@ -64,6 +70,8 @@ static int encode_init(AVCodecContext * avctx){ s->use_exp_vlc = flags2 & 0x0001; s->use_bit_reservoir = flags2 & 0x0002; s->use_variable_block_len = flags2 & 0x0004; + if (avctx->channels == 2) + s->ms_stereo = 1; ff_wma_init(avctx, flags2); @@ -71,8 +79,12 @@ static int encode_init(AVCodecContext * avctx){ for(i = 0; i < s->nb_block_sizes; i++) ff_mdct_init(&s->mdct_ctx[i], s->frame_len_bits - i + 1, 0, 1.0); - avctx->block_align= - s->block_align= avctx->bit_rate*(int64_t)s->frame_len / (avctx->sample_rate*8); + s->block_align = avctx->bit_rate * (int64_t)s->frame_len / + (avctx->sample_rate * 8); + s->block_align = FFMIN(s->block_align, MAX_CODED_SUPERFRAME_SIZE); + avctx->block_align = s->block_align; + avctx->bit_rate = avctx->block_align * 8LL * avctx->sample_rate / + s->frame_len; //av_log(NULL, AV_LOG_ERROR, "%d %d %d %d\n", s->block_align, avctx->bit_rate, s->frame_len, avctx->sample_rate); avctx->frame_size= s->frame_len; @@ -181,7 +193,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE], } if (s->nb_channels == 2) { - put_bits(&s->pb, 1, s->ms_stereo= 1); + put_bits(&s->pb, 1, !!s->ms_stereo); } for(ch = 0; ch < s->nb_channels; ch++) { @@ -355,6 +367,11 @@ static int encode_superframe(AVCodecContext *avctx, } } + if (buf_size < 2 * MAX_CODED_SUPERFRAME_SIZE) { + av_log(avctx, AV_LOG_ERROR, "output buffer size is too small\n"); + return AVERROR(EINVAL); + } + #if 1 total_gain= 128; for(i=64; i; i>>=1){ @@ -379,15 +396,17 @@ static int encode_superframe(AVCodecContext *avctx, } #endif - encode_frame(s, s->coefs, buf, buf_size, total_gain); + if ((i = encode_frame(s, s->coefs, buf, buf_size, total_gain)) >= 0) { + av_log(avctx, AV_LOG_ERROR, "required frame size too large. please " + "use a higher bit rate.\n"); + return AVERROR(EINVAL); + } assert((put_bits_count(&s->pb) & 7) == 0); - i= s->block_align - (put_bits_count(&s->pb)+7)/8; - assert(i>=0); - while(i--) + while (i++) put_bits(&s->pb, 8, 'N'); flush_put_bits(&s->pb); - return put_bits_ptr(&s->pb) - s->pb.buf; + return s->block_align; } AVCodec ff_wmav1_encoder = { diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c index e3b727d1b1..f4f6e92877 100644 --- a/libavcodec/x86/vp8dsp-init.c +++ b/libavcodec/x86/vp8dsp-init.c @@ -29,16 +29,16 @@ /* * MC functions */ -extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_epel4_h4_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_epel4_h6_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_epel4_v4_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_epel4_v6_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_bilinear4_h_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, @@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_bilinear4_v_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, @@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ } #if ARCH_X86_32 -TAP_W8 (mmxext, epel, h4) -TAP_W8 (mmxext, epel, h6) -TAP_W16(mmxext, epel, h6) -TAP_W8 (mmxext, epel, v4) -TAP_W8 (mmxext, epel, v6) -TAP_W16(mmxext, epel, v6) -TAP_W8 (mmxext, bilinear, h) -TAP_W16(mmxext, bilinear, h) -TAP_W8 (mmxext, bilinear, v) -TAP_W16(mmxext, bilinear, v) +TAP_W8 (mmx2, epel, h4) +TAP_W8 (mmx2, epel, h6) +TAP_W16(mmx2, epel, h6) +TAP_W8 (mmx2, epel, v4) +TAP_W8 (mmx2, epel, v6) +TAP_W16(mmx2, epel, v6) +TAP_W8 (mmx2, bilinear, h) +TAP_W16(mmx2, bilinear, h) +TAP_W8 (mmx2, bilinear, v) +TAP_W16(mmx2, bilinear, v) #endif -TAP_W16(sse2, epel, h6) -TAP_W16(sse2, epel, v6) -TAP_W16(sse2, bilinear, h) -TAP_W16(sse2, bilinear, v) +TAP_W16(sse2, epel, h6) +TAP_W16(sse2, epel, v6) +TAP_W16(sse2, bilinear, h) +TAP_W16(sse2, bilinear, v) -TAP_W16(ssse3, epel, h6) -TAP_W16(ssse3, epel, v6) -TAP_W16(ssse3, bilinear, h) -TAP_W16(ssse3, bilinear, v) +TAP_W16(ssse3, epel, h6) +TAP_W16(ssse3, epel, v6) +TAP_W16(ssse3, bilinear, h) +TAP_W16(ssse3, bilinear, v) #define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \ @@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT #if ARCH_X86_32 #define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) \ -HVTAP(mmxext, 8, x, y, 8, 16) +HVTAP(mmx2, 8, x, y, 4, 8) \ +HVTAP(mmx2, 8, x, y, 8, 16) -HVTAP(mmxext, 8, 6, 6, 16, 16) +HVTAP(mmx2, 8, 6, 6, 16, 16) #else #define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) +HVTAP(mmx2, 8, x, y, 4, 8) #endif HVTAPMMX(4, 4) @@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ dst, dststride, tmp, SIZE, height, mx, my); \ } -HVBILIN(mmxext, 8, 4, 8) +HVBILIN(mmx2, 8, 4, 8) #if ARCH_X86_32 -HVBILIN(mmxext, 8, 8, 16) -HVBILIN(mmxext, 8, 16, 16) +HVBILIN(mmx2, 8, 8, 16) +HVBILIN(mmx2, 8, 16, 16) #endif -HVBILIN(sse2, 8, 8, 16) -HVBILIN(sse2, 8, 16, 16) -HVBILIN(ssse3, 8, 4, 8) -HVBILIN(ssse3, 8, 8, 16) -HVBILIN(ssse3, 8, 16, 16) +HVBILIN(sse2, 8, 8, 16) +HVBILIN(sse2, 8, 16, 16) +HVBILIN(ssse3, 8, 4, 8) +HVBILIN(ssse3, 8, 8, 16) +HVBILIN(ssse3, 8, 16, 16) extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride); @@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ int e, int i, int hvt); DECLARE_LOOP_FILTER(mmx) -DECLARE_LOOP_FILTER(mmxext) +DECLARE_LOOP_FILTER(mmx2) DECLARE_LOOP_FILTER(sse2) DECLARE_LOOP_FILTER(ssse3) DECLARE_LOOP_FILTER(sse4) @@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) /* note that 4-tap width=16 functions are missing because w=16 * is only used for luma, and luma is always a copy or sixtap. */ if (mm_flags & AV_CPU_FLAG_MMX2) { - VP8_MC_FUNC(2, 4, mmxext); - VP8_BILINEAR_MC_FUNC(2, 4, mmxext); + VP8_MC_FUNC(2, 4, mmx2); + VP8_BILINEAR_MC_FUNC(2, 4, mmx2); #if ARCH_X86_32 - VP8_LUMA_MC_FUNC(0, 16, mmxext); - VP8_MC_FUNC(1, 8, mmxext); - VP8_BILINEAR_MC_FUNC(0, 16, mmxext); - VP8_BILINEAR_MC_FUNC(1, 8, mmxext); - - c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext; - c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext; - - c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext; - c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext; - c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext; - c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext; - - c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext; - c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; - c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext; - c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; + VP8_LUMA_MC_FUNC(0, 16, mmx2); + VP8_MC_FUNC(1, 8, mmx2); + VP8_BILINEAR_MC_FUNC(0, 16, mmx2); + VP8_BILINEAR_MC_FUNC(1, 8, mmx2); + + c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2; + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2; + + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2; + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2; + + c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2; + c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2; + c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2; + c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2; #endif } diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 140597031f..05a2a5712b 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -116,23 +116,25 @@ bilinear_filter_vb_m: times 8 db 7, 1 times 8 db 1, 7 %ifdef PIC -%define fourtap_filter_hw r11 -%define sixtap_filter_hw r11 -%define fourtap_filter_hb r11 -%define sixtap_filter_hb r11 -%define fourtap_filter_v r11 -%define sixtap_filter_v r11 -%define bilinear_filter_vw r11 -%define bilinear_filter_vb r11 +%define fourtap_filter_hw picregq +%define sixtap_filter_hw picregq +%define fourtap_filter_hb picregq +%define sixtap_filter_hb picregq +%define fourtap_filter_v picregq +%define sixtap_filter_v picregq +%define bilinear_filter_vw picregq +%define bilinear_filter_vb picregq +%define npicregs 1 %else -%define fourtap_filter_hw fourtap_filter_hw_m -%define sixtap_filter_hw sixtap_filter_hw_m -%define fourtap_filter_hb fourtap_filter_hb_m -%define sixtap_filter_hb sixtap_filter_hb_m -%define fourtap_filter_v fourtap_filter_v_m -%define sixtap_filter_v sixtap_filter_v_m +%define fourtap_filter_hw fourtap_filter_hw_m +%define sixtap_filter_hw sixtap_filter_hw_m +%define fourtap_filter_hb fourtap_filter_hb_m +%define sixtap_filter_hb sixtap_filter_hb_m +%define fourtap_filter_v fourtap_filter_v_m +%define sixtap_filter_v sixtap_filter_v_m %define bilinear_filter_vw bilinear_filter_vw_m %define bilinear_filter_vb bilinear_filter_vb_m +%define npicregs 0 %endif filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 @@ -173,26 +175,26 @@ SECTION .text ; int height, int mx, int my); ;----------------------------------------------------------------------------- -%macro FILTER_SSSE3 3 -cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 - lea r5d, [r5*3] +%macro FILTER_SSSE3 1 +cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] mova m3, [filter_h6_shuf2] mova m4, [filter_h6_shuf3] %ifdef PIC - lea r11, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_hb_m] %endif - mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes - mova m6, [sixtap_filter_hb+r5*8-32] - mova m7, [sixtap_filter_hb+r5*8-16] + mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes + mova m6, [sixtap_filter_hb+mxq*8-32] + mova m7, [sixtap_filter_hb+mxq*8-16] .nextrow - movu m0, [r2-2] + movu m0, [srcq-2] mova m1, m0 mova m2, m0 -%ifidn %1, 4 +%if mmsize == 8 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the ; shuffle with a memory operand - punpcklbw m0, [r2+3] + punpcklbw m0, [srcq+3] %else pshufb m0, [filter_h6_shuf1] %endif @@ -206,28 +208,28 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 paddsw m0, [pw_64] psraw m0, 7 packuswb m0, m0 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 - shl r5d, 4 +cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 mova m2, [pw_64] mova m3, [filter_h2_shuf] mova m4, [filter_h4_shuf] %ifdef PIC - lea r11, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_hb_m] %endif - mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes - mova m6, [fourtap_filter_hb+r5] + mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes + mova m6, [fourtap_filter_hb+mxq] .nextrow - movu m0, [r2-1] + movu m0, [srcq-1] mova m1, m0 pshufb m0, m3 pshufb m1, m4 @@ -237,33 +239,33 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 paddsw m0, m1 psraw m0, 7 packuswb m0, m0 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 - shl r6d, 4 +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_hb_m] %endif - mova m5, [fourtap_filter_hb+r6-16] - mova m6, [fourtap_filter_hb+r6] + mova m5, [fourtap_filter_hb+myq-16] + mova m6, [fourtap_filter_hb+myq] mova m7, [pw_64] ; read 3 lines - sub r2, r3 - movh m0, [r2] - movh m1, [r2+ r3] - movh m2, [r2+2*r3] - add r2, r3 + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq .nextrow - movh m3, [r2+2*r3] ; read new row + movh m3, [srcq+2*srcstrideq] ; read new row mova m4, m0 mova m0, m1 punpcklbw m4, m1 @@ -276,44 +278,44 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 paddsw m4, m7 psraw m4, 7 packuswb m4, m4 - movh [r0], m4 + movh [dstq], m4 ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 - lea r6d, [r6*3] +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + lea myd, [myq*3] %ifdef PIC - lea r11, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_hb_m] %endif - lea r6, [sixtap_filter_hb+r6*8] + lea myq, [sixtap_filter_hb+myq*8] ; read 5 lines - sub r2, r3 - sub r2, r3 - movh m0, [r2] - movh m1, [r2+r3] - movh m2, [r2+r3*2] - lea r2, [r2+r3*2] - add r2, r3 - movh m3, [r2] - movh m4, [r2+r3] + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] .nextrow - movh m5, [r2+2*r3] ; read new row + movh m5, [srcq+2*srcstrideq] ; read new row mova m6, m0 punpcklbw m6, m5 mova m0, m1 punpcklbw m1, m2 mova m7, m3 punpcklbw m7, m4 - pmaddubsw m6, [r6-48] - pmaddubsw m1, [r6-32] - pmaddubsw m7, [r6-16] + pmaddubsw m6, [myq-48] + pmaddubsw m1, [myq-32] + pmaddubsw m7, [myq-16] paddsw m6, m1 paddsw m6, m7 mova m1, m2 @@ -323,34 +325,35 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 mova m3, m4 packuswb m6, m6 mova m4, m5 - movh [r0], m6 + movh [dstq], m6 ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET %endmacro -INIT_MMX -FILTER_SSSE3 4, 0, 0 -INIT_XMM -FILTER_SSSE3 8, 8, 7 +INIT_MMX ssse3 +FILTER_SSSE3 4 +INIT_XMM ssse3 +FILTER_SSSE3 8 ; 4x4 block, H-only 4-tap filter -cglobal put_vp8_epel4_h4_mmxext, 6, 6 - shl r5d, 4 +INIT_MMX mmx2 +cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [fourtap_filter_hw_m] + lea picregq, [fourtap_filter_hw_m] %endif - movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words - movq mm5, [fourtap_filter_hw+r5] + movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words + movq mm5, [fourtap_filter_hw+mxq] movq mm7, [pw_64] pxor mm6, mm6 .nextrow - movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels + movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels ; first set of 2 pixels movq mm2, mm1 ; byte ABCD.. @@ -376,29 +379,30 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6 paddsw mm3, mm7 ; rounding psraw mm3, 7 packuswb mm3, mm6 ; clip and word->bytes - movd [r0], mm3 ; store + movd [dstq], mm3 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET ; 4x4 block, H-only 6-tap filter -cglobal put_vp8_epel4_h6_mmxext, 6, 6 - lea r5d, [r5*3] +INIT_MMX mmx2 +cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] %ifdef PIC - lea r11, [sixtap_filter_hw_m] + lea picregq, [sixtap_filter_hw_m] %endif - movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words - movq mm5, [sixtap_filter_hw+r5*8-32] - movq mm6, [sixtap_filter_hw+r5*8-16] + movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words + movq mm5, [sixtap_filter_hw+mxq*8-32] + movq mm6, [sixtap_filter_hw+mxq*8-16] movq mm7, [pw_64] pxor mm3, mm3 .nextrow - movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels + movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels ; first set of 2 pixels movq mm2, mm1 ; byte ABCD.. @@ -418,7 +422,7 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6 paddd mm1, mm2 ; finish 1st 2px ; second set of 2 pixels, use backup of above - movd mm2, [r2+3] ; byte FGHI (prevent overreads) + movd mm2, [srcq+3] ; byte FGHI (prevent overreads) pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 paddd mm0, mm3 ; add to 2nd 2px cache @@ -433,35 +437,35 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6 paddsw mm1, mm7 ; rounding psraw mm1, 7 packuswb mm1, mm3 ; clip and word->bytes - movd [r0], mm1 ; store + movd [dstq], mm1 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -INIT_XMM -cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 - shl r5d, 5 +INIT_XMM sse2 +cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 5 %ifdef PIC - lea r11, [fourtap_filter_v_m] + lea picregq, [fourtap_filter_v_m] %endif - lea r5, [fourtap_filter_v+r5-32] + lea mxq, [fourtap_filter_v+mxq-32] pxor m7, m7 mova m4, [pw_64] - mova m5, [r5+ 0] - mova m6, [r5+16] + mova m5, [mxq+ 0] + mova m6, [mxq+16] %ifdef m8 - mova m8, [r5+32] - mova m9, [r5+48] + mova m8, [mxq+32] + mova m9, [mxq+48] %endif .nextrow - movq m0, [r2-1] - movq m1, [r2-0] - movq m2, [r2+1] - movq m3, [r2+2] + movq m0, [srcq-1] + movq m1, [srcq-0] + movq m2, [srcq+1] + movq m3, [srcq+2] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 @@ -472,8 +476,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 pmullw m2, m8 pmullw m3, m9 %else - pmullw m2, [r5+32] - pmullw m3, [r5+48] + pmullw m2, [mxq+32] + pmullw m3, [mxq+48] %endif paddsw m0, m1 paddsw m2, m3 @@ -481,39 +485,40 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 paddsw m0, m4 psraw m0, 7 packuswb m0, m7 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 - lea r5d, [r5*3] - shl r5d, 4 +INIT_XMM sse2 +cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] + shl mxd, 4 %ifdef PIC - lea r11, [sixtap_filter_v_m] + lea picregq, [sixtap_filter_v_m] %endif - lea r5, [sixtap_filter_v+r5-96] + lea mxq, [sixtap_filter_v+mxq-96] pxor m7, m7 mova m6, [pw_64] %ifdef m8 - mova m8, [r5+ 0] - mova m9, [r5+16] - mova m10, [r5+32] - mova m11, [r5+48] - mova m12, [r5+64] - mova m13, [r5+80] + mova m8, [mxq+ 0] + mova m9, [mxq+16] + mova m10, [mxq+32] + mova m11, [mxq+48] + mova m12, [mxq+64] + mova m13, [mxq+80] %endif .nextrow - movq m0, [r2-2] - movq m1, [r2-1] - movq m2, [r2-0] - movq m3, [r2+1] - movq m4, [r2+2] - movq m5, [r2+3] + movq m0, [srcq-2] + movq m1, [srcq-1] + movq m2, [srcq-0] + movq m3, [srcq+1] + movq m4, [srcq+2] + movq m5, [srcq+3] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 @@ -528,12 +533,12 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 pmullw m4, m12 pmullw m5, m13 %else - pmullw m0, [r5+ 0] - pmullw m1, [r5+16] - pmullw m2, [r5+32] - pmullw m3, [r5+48] - pmullw m4, [r5+64] - pmullw m5, [r5+80] + pmullw m0, [mxq+ 0] + pmullw m1, [mxq+16] + pmullw m2, [mxq+32] + pmullw m3, [mxq+48] + pmullw m4, [mxq+64] + pmullw m5, [mxq+80] %endif paddsw m1, m4 paddsw m0, m5 @@ -543,52 +548,52 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 paddsw m0, m6 psraw m0, 7 packuswb m0, m7 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -%macro FILTER_V 3 +%macro FILTER_V 1 ; 4x4 block, V-only 4-tap filter -cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 - shl r6d, 5 +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 5 %ifdef PIC - lea r11, [fourtap_filter_v_m] + lea picregq, [fourtap_filter_v_m] %endif - lea r6, [fourtap_filter_v+r6-32] + lea myq, [fourtap_filter_v+myq-32] mova m6, [pw_64] pxor m7, m7 - mova m5, [r6+48] + mova m5, [myq+48] ; read 3 lines - sub r2, r3 - movh m0, [r2] - movh m1, [r2+ r3] - movh m2, [r2+2*r3] - add r2, r3 + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 .nextrow ; first calculate negative taps (to prevent losing positive overflows) - movh m4, [r2+2*r3] ; read new row + movh m4, [srcq+2*srcstrideq] ; read new row punpcklbw m4, m7 mova m3, m4 - pmullw m0, [r6+0] + pmullw m0, [myq+0] pmullw m4, m5 paddsw m4, m0 ; then calculate positive taps mova m0, m1 - pmullw m1, [r6+16] + pmullw m1, [myq+16] paddsw m4, m1 mova m1, m2 - pmullw m2, [r6+32] + pmullw m2, [myq+32] paddsw m4, m2 mova m2, m3 @@ -596,36 +601,36 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 paddsw m4, m6 psraw m4, 7 packuswb m4, m7 - movh [r0], m4 + movh [dstq], m4 ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET ; 4x4 block, V-only 6-tap filter -cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 - shl r6d, 4 - lea r6, [r6*3] +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 + lea myq, [myq*3] %ifdef PIC - lea r11, [sixtap_filter_v_m] + lea picregq, [sixtap_filter_v_m] %endif - lea r6, [sixtap_filter_v+r6-96] + lea myq, [sixtap_filter_v+myq-96] pxor m7, m7 ; read 5 lines - sub r2, r3 - sub r2, r3 - movh m0, [r2] - movh m1, [r2+r3] - movh m2, [r2+r3*2] - lea r2, [r2+r3*2] - add r2, r3 - movh m3, [r2] - movh m4, [r2+r3] + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 @@ -635,62 +640,61 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 .nextrow ; first calculate negative taps (to prevent losing positive overflows) mova m5, m1 - pmullw m5, [r6+16] + pmullw m5, [myq+16] mova m6, m4 - pmullw m6, [r6+64] + pmullw m6, [myq+64] paddsw m6, m5 ; then calculate positive taps - movh m5, [r2+2*r3] ; read new row + movh m5, [srcq+2*srcstrideq] ; read new row punpcklbw m5, m7 - pmullw m0, [r6+0] + pmullw m0, [myq+0] paddsw m6, m0 mova m0, m1 mova m1, m2 - pmullw m2, [r6+32] + pmullw m2, [myq+32] paddsw m6, m2 mova m2, m3 - pmullw m3, [r6+48] + pmullw m3, [myq+48] paddsw m6, m3 mova m3, m4 mova m4, m5 - pmullw m5, [r6+80] + pmullw m5, [myq+80] paddsw m6, m5 ; round/clip/store paddsw m6, [pw_64] psraw m6, 7 packuswb m6, m7 - movh [r0], m6 + movh [dstq], m6 ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET %endmacro -INIT_MMX -FILTER_V mmxext, 4, 0 -INIT_XMM -FILTER_V sse2, 8, 8 +INIT_MMX mmx2 +FILTER_V 4 +INIT_XMM sse2 +FILTER_V 8 -%macro FILTER_BILINEAR 3 -cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 - mov r5d, 8*16 - shl r6d, 4 - sub r5d, r6d +%macro FILTER_BILINEAR 1 +cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [bilinear_filter_vw_m] + lea picregq, [bilinear_filter_vw_m] %endif pxor m6, m6 - mova m4, [bilinear_filter_vw+r5-16] - mova m5, [bilinear_filter_vw+r6-16] + mova m5, [bilinear_filter_vw+myq-1*16] + neg myq + mova m4, [bilinear_filter_vw+myq+7*16] .nextrow - movh m0, [r2+r3*0] - movh m1, [r2+r3*1] - movh m3, [r2+r3*2] + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m3, [srcq+srcstrideq*2] punpcklbw m0, m6 punpcklbw m1, m6 punpcklbw m3, m6 @@ -705,38 +709,37 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%ifidn %1, mmxext +%if mmsize == 8 packuswb m0, m0 packuswb m2, m2 - movh [r0+r1*0], m0 - movh [r0+r1*1], m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 %else packuswb m0, m2 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4d, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET -cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 - mov r6d, 8*16 - shl r5d, 4 - sub r6d, r5d +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [bilinear_filter_vw_m] + lea picregq, [bilinear_filter_vw_m] %endif pxor m6, m6 - mova m4, [bilinear_filter_vw+r6-16] - mova m5, [bilinear_filter_vw+r5-16] + mova m5, [bilinear_filter_vw+mxq-1*16] + neg mxq + mova m4, [bilinear_filter_vw+mxq+7*16] .nextrow - movh m0, [r2+r3*0+0] - movh m1, [r2+r3*0+1] - movh m2, [r2+r3*1+0] - movh m3, [r2+r3*1+1] + movh m0, [srcq+srcstrideq*0+0] + movh m1, [srcq+srcstrideq*0+1] + movh m2, [srcq+srcstrideq*1+0] + movh m3, [srcq+srcstrideq*1+1] punpcklbw m0, m6 punpcklbw m1, m6 punpcklbw m2, m6 @@ -751,41 +754,41 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%ifidn %1, mmxext +%if mmsize == 8 packuswb m0, m0 packuswb m2, m2 - movh [r0+r1*0], m0 - movh [r0+r1*1], m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 %else packuswb m0, m2 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4d, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET %endmacro -INIT_MMX -FILTER_BILINEAR mmxext, 4, 0 -INIT_XMM -FILTER_BILINEAR sse2, 8, 7 +INIT_MMX mmx2 +FILTER_BILINEAR 4 +INIT_XMM sse2 +FILTER_BILINEAR 8 %macro FILTER_BILINEAR_SSSE3 1 -cglobal put_vp8_bilinear%1_v_ssse3, 7,7 - shl r6d, 4 +cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [bilinear_filter_vb_m] + lea picregq, [bilinear_filter_vb_m] %endif pxor m4, m4 - mova m3, [bilinear_filter_vb+r6-16] + mova m3, [bilinear_filter_vb+myq-16] .nextrow - movh m0, [r2+r3*0] - movh m1, [r2+r3*1] - movh m2, [r2+r3*2] + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m2, [srcq+srcstrideq*2] punpcklbw m0, m1 punpcklbw m1, m2 pmaddubsw m0, m3 @@ -797,31 +800,31 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7 %if mmsize==8 packuswb m0, m0 packuswb m1, m1 - movh [r0+r1*0], m0 - movh [r0+r1*1], m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 %else packuswb m0, m1 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4d, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET -cglobal put_vp8_bilinear%1_h_ssse3, 7,7 - shl r5d, 4 +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [bilinear_filter_vb_m] + lea picregq, [bilinear_filter_vb_m] %endif pxor m4, m4 mova m2, [filter_h2_shuf] - mova m3, [bilinear_filter_vb+r5-16] + mova m3, [bilinear_filter_vb+mxq-16] .nextrow - movu m0, [r2+r3*0] - movu m1, [r2+r3*1] + movu m0, [srcq+srcstrideq*0] + movu m1, [srcq+srcstrideq*1] pshufb m0, m2 pshufb m1, m2 pmaddubsw m0, m3 @@ -833,65 +836,68 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7 %if mmsize==8 packuswb m0, m0 packuswb m1, m1 - movh [r0+r1*0], m0 - movh [r0+r1*1], m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 %else packuswb m0, m1 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4d, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET %endmacro -INIT_MMX +INIT_MMX ssse3 FILTER_BILINEAR_SSSE3 4 -INIT_XMM +INIT_XMM ssse3 FILTER_BILINEAR_SSSE3 8 -cglobal put_vp8_pixels8_mmx, 5,5 +INIT_MMX mmx +cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height .nextrow: - movq mm0, [r2+r3*0] - movq mm1, [r2+r3*1] - lea r2, [r2+r3*2] - movq [r0+r1*0], mm0 - movq [r0+r1*1], mm1 - lea r0, [r0+r1*2] - sub r4d, 2 + movq mm0, [srcq+srcstrideq*0] + movq mm1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movq [dstq+dststrideq*0], mm0 + movq [dstq+dststrideq*1], mm1 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET %if ARCH_X86_32 -cglobal put_vp8_pixels16_mmx, 5,5 +INIT_MMX mmx +cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height .nextrow: - movq mm0, [r2+r3*0+0] - movq mm1, [r2+r3*0+8] - movq mm2, [r2+r3*1+0] - movq mm3, [r2+r3*1+8] - lea r2, [r2+r3*2] - movq [r0+r1*0+0], mm0 - movq [r0+r1*0+8], mm1 - movq [r0+r1*1+0], mm2 - movq [r0+r1*1+8], mm3 - lea r0, [r0+r1*2] - sub r4d, 2 + movq mm0, [srcq+srcstrideq*0+0] + movq mm1, [srcq+srcstrideq*0+8] + movq mm2, [srcq+srcstrideq*1+0] + movq mm3, [srcq+srcstrideq*1+8] + lea srcq, [srcq+srcstrideq*2] + movq [dstq+dststrideq*0+0], mm0 + movq [dstq+dststrideq*0+8], mm1 + movq [dstq+dststrideq*1+0], mm2 + movq [dstq+dststrideq*1+8], mm3 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET %endif -cglobal put_vp8_pixels16_sse, 5,5,2 +INIT_XMM sse +cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height .nextrow: - movups xmm0, [r2+r3*0] - movups xmm1, [r2+r3*1] - lea r2, [r2+r3*2] - movaps [r0+r1*0], xmm0 - movaps [r0+r1*1], xmm1 - lea r0, [r0+r1*2] - sub r4d, 2 + movups xmm0, [srcq+srcstrideq*0] + movups xmm1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movaps [dstq+dststrideq*0], xmm0 + movaps [dstq+dststrideq*1], xmm1 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET @@ -900,10 +906,10 @@ cglobal put_vp8_pixels16_sse, 5,5,2 ;----------------------------------------------------------------------------- %macro ADD_DC 4 - %4 m2, [r0+%3] - %4 m3, [r0+r2+%3] - %4 m4, [r1+%3] - %4 m5, [r1+r2+%3] + %4 m2, [dst1q+%3] + %4 m3, [dst1q+strideq+%3] + %4 m4, [dst2q+%3] + %4 m5, [dst2q+strideq+%3] paddusb m2, %1 paddusb m3, %1 paddusb m4, %1 @@ -912,22 +918,22 @@ cglobal put_vp8_pixels16_sse, 5,5,2 psubusb m3, %2 psubusb m4, %2 psubusb m5, %2 - %4 [r0+%3], m2 - %4 [r0+r2+%3], m3 - %4 [r1+%3], m4 - %4 [r1+r2+%3], m5 + %4 [dst1q+%3], m2 + %4 [dst1q+strideq+%3], m3 + %4 [dst2q+%3], m4 + %4 [dst2q+strideq+%3], m5 %endmacro -INIT_MMX -cglobal vp8_idct_dc_add_mmx, 3, 3 +INIT_MMX mmx +cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride ; load data - movd m0, [r1] + movd m0, [blockq] ; calculate DC paddw m0, [pw_4] pxor m1, m1 psraw m0, 3 - movd [r1], m1 + movd [blockq], m1 psubw m1, m0 packuswb m0, m0 packuswb m1, m1 @@ -937,24 +943,26 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 punpcklwd m1, m1 ; add DC - lea r1, [r0+r2*2] + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] ADD_DC m0, m1, 0, movh RET -INIT_XMM -cglobal vp8_idct_dc_add_sse4, 3, 3, 6 +INIT_XMM sse4 +cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride ; load data - movd m0, [r1] + movd m0, [blockq] pxor m1, m1 ; calculate DC paddw m0, [pw_4] - movd [r1], m1 - lea r1, [r0+r2*2] - movd m2, [r0] - movd m3, [r0+r2] - movd m4, [r1] - movd m5, [r1+r2] + movd [blockq], m1 + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] + movd m2, [dst1q] + movd m3, [dst1q+strideq] + movd m4, [dst2q] + movd m5, [dst2q+strideq] psraw m0, 3 pshuflw m0, m0, 0 punpcklqdq m0, m0 @@ -965,10 +973,10 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 paddw m2, m0 paddw m4, m0 packuswb m2, m4 - movd [r0], m2 - pextrd [r0+r2], m2, 1 - pextrd [r1], m2, 2 - pextrd [r1+r2], m2, 3 + movd [dst1q], m2 + pextrd [dst1q+strideq], m2, 1 + pextrd [dst2q], m2, 2 + pextrd [dst2q+strideq], m2, 3 RET ;----------------------------------------------------------------------------- @@ -976,22 +984,22 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 ;----------------------------------------------------------------------------- %if ARCH_X86_32 -INIT_MMX -cglobal vp8_idct_dc_add4y_mmx, 3, 3 +INIT_MMX mmx +cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride ; load data - movd m0, [r1+32*0] ; A - movd m1, [r1+32*2] ; C - punpcklwd m0, [r1+32*1] ; A B - punpcklwd m1, [r1+32*3] ; C D + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D punpckldq m0, m1 ; A B C D pxor m6, m6 ; calculate DC paddw m0, [pw_4] - movd [r1+32*0], m6 - movd [r1+32*1], m6 - movd [r1+32*2], m6 - movd [r1+32*3], m6 + movd [blockq+32*0], m6 + movd [blockq+32*1], m6 + movd [blockq+32*2], m6 + movd [blockq+32*3], m6 psraw m0, 3 psubw m6, m0 packuswb m0, m0 @@ -1006,28 +1014,29 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3 punpckhbw m7, m7 ; CCCCDDDD ; add DC - lea r1, [r0+r2*2] + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] ADD_DC m0, m6, 0, mova ADD_DC m1, m7, 8, mova RET %endif -INIT_XMM -cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 +INIT_XMM sse2 +cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride ; load data - movd m0, [r1+32*0] ; A - movd m1, [r1+32*2] ; C - punpcklwd m0, [r1+32*1] ; A B - punpcklwd m1, [r1+32*3] ; C D + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D punpckldq m0, m1 ; A B C D pxor m1, m1 ; calculate DC paddw m0, [pw_4] - movd [r1+32*0], m1 - movd [r1+32*1], m1 - movd [r1+32*2], m1 - movd [r1+32*3], m1 + movd [blockq+32*0], m1 + movd [blockq+32*1], m1 + movd [blockq+32*2], m1 + movd [blockq+32*3], m1 psraw m0, 3 psubw m1, m0 packuswb m0, m0 @@ -1038,7 +1047,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 punpcklbw m1, m1 ; add DC - lea r1, [r0+r2*2] + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] ADD_DC m0, m1, 0, mova RET @@ -1046,22 +1056,22 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal vp8_idct_dc_add4uv_mmx, 3, 3 +INIT_MMX mmx +cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride ; load data - movd m0, [r1+32*0] ; A - movd m1, [r1+32*2] ; C - punpcklwd m0, [r1+32*1] ; A B - punpcklwd m1, [r1+32*3] ; C D + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D punpckldq m0, m1 ; A B C D pxor m6, m6 ; calculate DC paddw m0, [pw_4] - movd [r1+32*0], m6 - movd [r1+32*1], m6 - movd [r1+32*2], m6 - movd [r1+32*3], m6 + movd [blockq+32*0], m6 + movd [blockq+32*1], m6 + movd [blockq+32*2], m6 + movd [blockq+32*3], m6 psraw m0, 3 psubw m6, m0 packuswb m0, m0 @@ -1076,10 +1086,11 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3 punpckhbw m7, m7 ; CCCCDDDD ; add DC - lea r1, [r0+r2*2] + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] ADD_DC m0, m6, 0, mova - lea r0, [r0+r2*4] - lea r1, [r1+r2*4] + lea dst1q, [dst1q+strideq*4] + lea dst2q, [dst2q+strideq*4] ADD_DC m1, m7, 0, mova RET @@ -1118,26 +1129,25 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3 SWAP %4, %3 %endmacro -INIT_MMX -%macro VP8_IDCT_ADD 1 -cglobal vp8_idct_add_%1, 3, 3 +%macro VP8_IDCT_ADD 0 +cglobal vp8_idct_add, 3, 3, 0, dst, block, stride ; load block data - movq m0, [r1+ 0] - movq m1, [r1+ 8] - movq m2, [r1+16] - movq m3, [r1+24] + movq m0, [blockq+ 0] + movq m1, [blockq+ 8] + movq m2, [blockq+16] + movq m3, [blockq+24] movq m6, [pw_20091] movq m7, [pw_17734] -%ifidn %1, sse +%if cpuflag(sse) xorps xmm0, xmm0 - movaps [r1+ 0], xmm0 - movaps [r1+16], xmm0 + movaps [blockq+ 0], xmm0 + movaps [blockq+16], xmm0 %else pxor m4, m4 - movq [r1+ 0], m4 - movq [r1+ 8], m4 - movq [r1+16], m4 - movq [r1+24], m4 + movq [blockq+ 0], m4 + movq [blockq+ 8], m4 + movq [blockq+16], m4 + movq [blockq+24], m4 %endif ; actual IDCT @@ -1149,41 +1159,44 @@ cglobal vp8_idct_add_%1, 3, 3 ; store pxor m4, m4 - lea r1, [r0+2*r2] - STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 - STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+2*strideq] + STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq + STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq RET %endmacro %if ARCH_X86_32 -VP8_IDCT_ADD mmx +INIT_MMX mmx +VP8_IDCT_ADD %endif -VP8_IDCT_ADD sse +INIT_MMX sse +VP8_IDCT_ADD ;----------------------------------------------------------------------------- ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) ;----------------------------------------------------------------------------- %macro SCATTER_WHT 3 - movd r1d, m%1 - movd r2d, m%2 - mov [r0+2*16*(0+%3)], r1w - mov [r0+2*16*(1+%3)], r2w - shr r1d, 16 - shr r2d, 16 + movd dc1d, m%1 + movd dc2d, m%2 + mov [blockq+2*16*(0+%3)], dc1w + mov [blockq+2*16*(1+%3)], dc2w + shr dc1d, 16 + shr dc2d, 16 psrlq m%1, 32 psrlq m%2, 32 - mov [r0+2*16*(4+%3)], r1w - mov [r0+2*16*(5+%3)], r2w - movd r1d, m%1 - movd r2d, m%2 - mov [r0+2*16*(8+%3)], r1w - mov [r0+2*16*(9+%3)], r2w - shr r1d, 16 - shr r2d, 16 - mov [r0+2*16*(12+%3)], r1w - mov [r0+2*16*(13+%3)], r2w + mov [blockq+2*16*(4+%3)], dc1w + mov [blockq+2*16*(5+%3)], dc2w + movd dc1d, m%1 + movd dc2d, m%2 + mov [blockq+2*16*(8+%3)], dc1w + mov [blockq+2*16*(9+%3)], dc2w + shr dc1d, 16 + shr dc2d, 16 + mov [blockq+2*16*(12+%3)], dc1w + mov [blockq+2*16*(13+%3)], dc2w %endmacro %macro HADAMARD4_1D 4 @@ -1192,22 +1205,22 @@ VP8_IDCT_ADD sse SWAP %1, %4, %3 %endmacro -%macro VP8_DC_WHT 1 -cglobal vp8_luma_dc_wht_%1, 2,3 - movq m0, [r1] - movq m1, [r1+8] - movq m2, [r1+16] - movq m3, [r1+24] -%ifidn %1, sse +%macro VP8_DC_WHT 0 +cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2 + movq m0, [dc1q] + movq m1, [dc1q+8] + movq m2, [dc1q+16] + movq m3, [dc1q+24] +%if cpuflag(sse) xorps xmm0, xmm0 - movaps [r1+ 0], xmm0 - movaps [r1+16], xmm0 + movaps [dc1q+ 0], xmm0 + movaps [dc1q+16], xmm0 %else pxor m4, m4 - movq [r1+ 0], m4 - movq [r1+ 8], m4 - movq [r1+16], m4 - movq [r1+24], m4 + movq [dc1q+ 0], m4 + movq [dc1q+ 8], m4 + movq [dc1q+16], m4 + movq [dc1q+24], m4 %endif HADAMARD4_1D 0, 1, 2, 3 TRANSPOSE4x4W 0, 1, 2, 3, 4 @@ -1222,11 +1235,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3 RET %endmacro -INIT_MMX %if ARCH_X86_32 -VP8_DC_WHT mmx +INIT_MMX mmx +VP8_DC_WHT %endif -VP8_DC_WHT sse +INIT_MMX sse +VP8_DC_WHT ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); @@ -1414,7 +1428,17 @@ VP8_DC_WHT sse add %4, %5 %endmacro -%macro WRITE_8W_SSE2 5 +%macro WRITE_8W 5 +%if cpuflag(sse4) + pextrw [%3+%4*4], %1, 0 + pextrw [%2+%4*4], %1, 1 + pextrw [%3+%4*2], %1, 2 + pextrw [%3+%4 ], %1, 3 + pextrw [%3 ], %1, 4 + pextrw [%2 ], %1, 5 + pextrw [%2+%5 ], %1, 6 + pextrw [%2+%5*2], %1, 7 +%else movd %2d, %1 psrldq %1, 4 mov [%3+%4*4], %2w @@ -1440,79 +1464,68 @@ VP8_DC_WHT sse mov [%3+%5 ], %2w shr %2, 16 mov [%3+%5*2], %2w +%endif %endmacro -%macro WRITE_8W_SSE4 5 - pextrw [%3+%4*4], %1, 0 - pextrw [%2+%4*4], %1, 1 - pextrw [%3+%4*2], %1, 2 - pextrw [%3+%4 ], %1, 3 - pextrw [%3 ], %1, 4 - pextrw [%2 ], %1, 5 - pextrw [%2+%5 ], %1, 6 - pextrw [%2+%5*2], %1, 7 -%endmacro - -%macro SPLATB_REG_MMX 2-3 +%macro SPLATB_REG 2-3 +%if cpuflag(ssse3) + movd %1, %2d + pshufb %1, %3 +%elif cpuflag(sse2) movd %1, %2d punpcklbw %1, %1 - punpcklwd %1, %1 - punpckldq %1, %1 -%endmacro - -%macro SPLATB_REG_MMXEXT 2-3 + pshuflw %1, %1, 0x0 + punpcklqdq %1, %1 +%elif cpuflag(mmx2) movd %1, %2d punpcklbw %1, %1 pshufw %1, %1, 0x0 -%endmacro - -%macro SPLATB_REG_SSE2 2-3 +%else movd %1, %2d punpcklbw %1, %1 - pshuflw %1, %1, 0x0 - punpcklqdq %1, %1 -%endmacro - -%macro SPLATB_REG_SSSE3 3 - movd %1, %2d - pshufb %1, %3 + punpcklwd %1, %1 + punpckldq %1, %1 +%endif %endmacro -%macro SIMPLE_LOOPFILTER 4 -cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4 +%macro SIMPLE_LOOPFILTER 2 +cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr %if mmsize == 8 ; mmx/mmxext - mov r3, 2 + mov cntrq, 2 %endif -%ifnidn %1, sse2 -%if mmsize == 16 +%if cpuflag(ssse3) pxor m0, m0 %endif -%endif - SPLATB_REG m7, r2, m0 ; splat "flim" into register + SPLATB_REG m7, flim, m0 ; splat "flim" into register ; set up indexes to address 4 rows - mov r2, r1 - neg r1 -%ifidn %2, h - lea r0, [r0+4*r2-2] +%if mmsize == 8 + DEFINE_ARGS dst1, mstride, stride, cntr, dst2 +%else + DEFINE_ARGS dst1, mstride, stride, dst3, dst2 +%endif + mov strideq, mstrideq + neg mstrideq +%ifidn %1, h + lea dst1q, [dst1q+4*strideq-2] %endif %if mmsize == 8 ; mmx / mmxext .next8px %endif -%ifidn %2, v +%ifidn %1, v ; read 4 half/full rows of pixels - mova m0, [r0+r1*2] ; p1 - mova m1, [r0+r1] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r2] ; q1 + mova m0, [dst1q+mstrideq*2] ; p1 + mova m1, [dst1q+mstrideq] ; p0 + mova m2, [dst1q] ; q0 + mova m3, [dst1q+ strideq] ; q1 %else ; h - lea r4, [r0+r2] + lea dst2q, [dst1q+ strideq] %if mmsize == 8 ; mmx/mmxext - READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 + READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq %else ; sse2 - READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 + READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q %endif TRANSPOSE4x4W 0, 1, 2, 3, 4 %endif @@ -1581,36 +1594,36 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4 psubusb m6, m3 ; p0+f2 ; store -%ifidn %2, v - mova [r0], m4 - mova [r0+r1], m6 +%ifidn %1, v + mova [dst1q], m4 + mova [dst1q+mstrideq], m6 %else ; h - inc r0 + inc dst1q SBUTTERFLY bw, 6, 4, 0 %if mmsize == 16 ; sse2 -%ifidn %1, sse4 - inc r4 +%if cpuflag(sse4) + inc dst2q %endif - WRITE_8W m6, r4, r0, r1, r2 - lea r4, [r3+r1+1] -%ifidn %1, sse4 - inc r3 + WRITE_8W m6, dst2q, dst1q, mstrideq, strideq + lea dst2q, [dst3q+mstrideq+1] +%if cpuflag(sse4) + inc dst3q %endif - WRITE_8W m4, r3, r4, r1, r2 + WRITE_8W m4, dst3q, dst2q, mstrideq, strideq %else ; mmx/mmxext - WRITE_2x4W m6, m4, r4, r0, r1, r2 + WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq %endif %endif %if mmsize == 8 ; mmx/mmxext ; next 8 pixels -%ifidn %2, v - add r0, 8 ; advance 8 cols = pixels +%ifidn %1, v + add dst1q, 8 ; advance 8 cols = pixels %else ; h - lea r0, [r0+r2*8-1] ; advance 8 rows = lines + lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines %endif - dec r3 + dec cntrq jg .next8px REP_RET %else ; sse2 @@ -1619,41 +1632,38 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4 %endmacro %if ARCH_X86_32 -INIT_MMX -%define SPLATB_REG SPLATB_REG_MMX -SIMPLE_LOOPFILTER mmx, v, 4, 0 -SIMPLE_LOOPFILTER mmx, h, 5, 0 -%define SPLATB_REG SPLATB_REG_MMXEXT -SIMPLE_LOOPFILTER mmxext, v, 4, 0 -SIMPLE_LOOPFILTER mmxext, h, 5, 0 -%endif - -INIT_XMM -%define SPLATB_REG SPLATB_REG_SSE2 -%define WRITE_8W WRITE_8W_SSE2 -SIMPLE_LOOPFILTER sse2, v, 3, 8 -SIMPLE_LOOPFILTER sse2, h, 5, 8 -%define SPLATB_REG SPLATB_REG_SSSE3 -SIMPLE_LOOPFILTER ssse3, v, 3, 8 -SIMPLE_LOOPFILTER ssse3, h, 5, 8 -%define WRITE_8W WRITE_8W_SSE4 -SIMPLE_LOOPFILTER sse4, h, 5, 8 +INIT_MMX mmx +SIMPLE_LOOPFILTER v, 4 +SIMPLE_LOOPFILTER h, 5 +INIT_MMX mmx2 +SIMPLE_LOOPFILTER v, 4 +SIMPLE_LOOPFILTER h, 5 +%endif + +INIT_XMM sse2 +SIMPLE_LOOPFILTER v, 3 +SIMPLE_LOOPFILTER h, 5 +INIT_XMM ssse3 +SIMPLE_LOOPFILTER v, 3 +SIMPLE_LOOPFILTER h, 5 +INIT_XMM sse4 +SIMPLE_LOOPFILTER h, 5 ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, ; int flimE, int flimI, int hev_thr); ;----------------------------------------------------------------------------- -%macro INNER_LOOPFILTER 5 -%if %4 == 8 ; chroma -cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 +%macro INNER_LOOPFILTER 3 +%if %3 == 8 ; chroma +cglobal vp8_%1_loop_filter8uv_inner, 6, %2, 13 %define dst8_reg r1 %define mstride_reg r2 %define E_reg r3 %define I_reg r4 %define hev_thr_reg r5 %else ; luma -cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 +cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 %define mstride_reg r1 %define E_reg r2 %define I_reg r3 @@ -1673,11 +1683,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 %define stack_reg hev_thr_reg %endif -%ifnidn %1, sse2 -%if mmsize == 16 +%if cpuflag(ssse3) pxor m7, m7 %endif -%endif %ifndef m8 ; mmx/mmxext or sse2 on x86-32 ; splat function arguments @@ -1688,7 +1696,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 ; align stack mov stack_reg, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack -%ifidn %2, v +%ifidn %1, v sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr ; [3]=hev() result %else ; h @@ -1721,14 +1729,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh %endif -%if mmsize == 8 && %4 == 16 ; mmx/mmxext +%if mmsize == 8 && %3 == 16 ; mmx/mmxext mov cnt_reg, 2 %endif mov stride_reg, mstride_reg neg mstride_reg -%ifidn %2, h +%ifidn %1, h lea dst_reg, [dst_reg + stride_reg*4-4] -%if %4 == 8 +%if %3 == 8 lea dst8_reg, [dst8_reg+ stride_reg*4-4] %endif %endif @@ -1738,8 +1746,8 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 %endif ; read lea dst2_reg, [dst_reg + stride_reg] -%ifidn %2, v -%if %4 == 8 && mmsize == 16 +%ifidn %1, v +%if %3 == 8 && mmsize == 16 %define movrow movh %else %define movrow mova @@ -1750,7 +1758,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 movrow m5, [dst2_reg] ; q1 movrow m6, [dst2_reg+ stride_reg] ; q2 movrow m7, [dst2_reg+ stride_reg*2] ; q3 -%if mmsize == 16 && %4 == 8 +%if mmsize == 16 && %3 == 8 movhps m0, [dst8_reg+mstride_reg*4] movhps m2, [dst8_reg+mstride_reg*2] add dst8_reg, stride_reg @@ -1787,7 +1795,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 SWAP 6, 3 SWAP 5, 3 %else ; sse2 (h) -%if %4 == 16 +%if %3 == 16 lea dst8_reg, [dst_reg + stride_reg*8] %endif @@ -1874,7 +1882,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 psubusb m6, m5 ; q2-q1 por m6, m4 ; abs(q2-q1) -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m4, flim_I pxor m3, m3 psubusb m0, m4 @@ -1896,9 +1904,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 ; normal_limit and high_edge_variance for p1-p0, q1-q0 SWAP 7, 3 ; now m7 is zero -%ifidn %2, v +%ifidn %1, v movrow m3, [dst_reg +mstride_reg] ; p0 -%if mmsize == 16 && %4 == 8 +%if mmsize == 16 && %3 == 8 movhps m3, [dst8_reg+mstride_reg] %endif %elifdef m12 @@ -1914,7 +1922,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 psubusb m1, m3 ; p1-p0 psubusb m6, m2 ; p0-p1 por m1, m6 ; abs(p1-p0) -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m6, m1 psubusb m1, m4 psubusb m6, hev_thr @@ -1928,9 +1936,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 %endif SWAP 6, 4 ; now m6 is I -%ifidn %2, v +%ifidn %1, v movrow m4, [dst_reg] ; q0 -%if mmsize == 16 && %4 == 8 +%if mmsize == 16 && %3 == 8 movhps m4, [dst8_reg] %endif %elifdef m8 @@ -1945,7 +1953,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 psubusb m1, m5 ; q0-q1 psubusb m7, m4 ; q1-q0 por m1, m7 ; abs(q1-q0) -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m7, m1 psubusb m1, m6 psubusb m7, hev_thr @@ -2053,14 +2061,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 %else mova m6, mask_res %endif -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m7, [pb_1] %else ; mmxext/sse2 pxor m7, m7 %endif pand m0, m6 pand m1, m6 -%ifidn %1, mmx +%if notcpuflag(mmx2) paddusb m0, m7 pand m1, [pb_FE] pandn m7, m0 @@ -2078,12 +2086,12 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 paddusb m2, m0 ; p1+a ; store -%ifidn %2, v +%ifidn %1, v movrow [dst_reg +mstride_reg*2], m2 movrow [dst_reg +mstride_reg ], m3 movrow [dst_reg], m4 movrow [dst_reg + stride_reg ], m5 -%if mmsize == 16 && %4 == 8 +%if mmsize == 16 && %3 == 8 movhps [dst8_reg+mstride_reg*2], m2 movhps [dst8_reg+mstride_reg ], m3 movhps [dst8_reg], m4 @@ -2100,20 +2108,20 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg %else ; sse2 (h) lea dst8_reg, [dst8_reg+mstride_reg+2] - WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 + WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3 %endif %endif %if mmsize == 8 -%if %4 == 8 ; chroma -%ifidn %2, h +%if %3 == 8 ; chroma +%ifidn %1, h sub dst_reg, 2 %endif cmp dst_reg, dst8_reg mov dst_reg, dst8_reg jnz .next8px %else -%ifidn %2, h +%ifidn %1, h lea dst_reg, [dst_reg + stride_reg*8-2] %else ; v add dst_reg, 8 @@ -2130,56 +2138,46 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 %endmacro %if ARCH_X86_32 -INIT_MMX -%define SPLATB_REG SPLATB_REG_MMX -INNER_LOOPFILTER mmx, v, 6, 16, 0 -INNER_LOOPFILTER mmx, h, 6, 16, 0 -INNER_LOOPFILTER mmx, v, 6, 8, 0 -INNER_LOOPFILTER mmx, h, 6, 8, 0 - -%define SPLATB_REG SPLATB_REG_MMXEXT -INNER_LOOPFILTER mmxext, v, 6, 16, 0 -INNER_LOOPFILTER mmxext, h, 6, 16, 0 -INNER_LOOPFILTER mmxext, v, 6, 8, 0 -INNER_LOOPFILTER mmxext, h, 6, 8, 0 -%endif - -INIT_XMM -%define SPLATB_REG SPLATB_REG_SSE2 -INNER_LOOPFILTER sse2, v, 5, 16, 13 -%ifdef m8 -INNER_LOOPFILTER sse2, h, 5, 16, 13 -%else -INNER_LOOPFILTER sse2, h, 6, 16, 13 -%endif -INNER_LOOPFILTER sse2, v, 6, 8, 13 -INNER_LOOPFILTER sse2, h, 6, 8, 13 - -%define SPLATB_REG SPLATB_REG_SSSE3 -INNER_LOOPFILTER ssse3, v, 5, 16, 13 -%ifdef m8 -INNER_LOOPFILTER ssse3, h, 5, 16, 13 -%else -INNER_LOOPFILTER ssse3, h, 6, 16, 13 -%endif -INNER_LOOPFILTER ssse3, v, 6, 8, 13 -INNER_LOOPFILTER ssse3, h, 6, 8, 13 +INIT_MMX mmx +INNER_LOOPFILTER v, 6, 16 +INNER_LOOPFILTER h, 6, 16 +INNER_LOOPFILTER v, 6, 8 +INNER_LOOPFILTER h, 6, 8 + +INIT_MMX mmx2 +INNER_LOOPFILTER v, 6, 16 +INNER_LOOPFILTER h, 6, 16 +INNER_LOOPFILTER v, 6, 8 +INNER_LOOPFILTER h, 6, 8 +%endif + +INIT_XMM sse2 +INNER_LOOPFILTER v, 5, 16 +INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16 +INNER_LOOPFILTER v, 6, 8 +INNER_LOOPFILTER h, 6, 8 + +INIT_XMM ssse3 +INNER_LOOPFILTER v, 5, 16 +INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16 +INNER_LOOPFILTER v, 6, 8 +INNER_LOOPFILTER h, 6, 8 ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, ; int flimE, int flimI, int hev_thr); ;----------------------------------------------------------------------------- -%macro MBEDGE_LOOPFILTER 5 -%if %4 == 8 ; chroma -cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 +%macro MBEDGE_LOOPFILTER 3 +%if %3 == 8 ; chroma +cglobal vp8_%1_loop_filter8uv_mbedge, 6, %2, 15 %define dst8_reg r1 %define mstride_reg r2 %define E_reg r3 %define I_reg r4 %define hev_thr_reg r5 %else ; luma -cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 +cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %define mstride_reg r1 %define E_reg r2 %define I_reg r3 @@ -2199,14 +2197,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 %define stack_reg hev_thr_reg %endif -%define ssse3_or_higher 0 -%ifnidn %1, sse2 -%if mmsize == 16 -%define ssse3_or_higher 1 -%endif -%endif - -%if ssse3_or_higher +%if cpuflag(ssse3) pxor m7, m7 %endif @@ -2267,14 +2258,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh %endif -%if mmsize == 8 && %4 == 16 ; mmx/mmxext +%if mmsize == 8 && %3 == 16 ; mmx/mmxext mov cnt_reg, 2 %endif mov stride_reg, mstride_reg neg mstride_reg -%ifidn %2, h +%ifidn %1, h lea dst_reg, [dst_reg + stride_reg*4-4] -%if %4 == 8 +%if %3 == 8 lea dst8_reg, [dst8_reg+ stride_reg*4-4] %endif %endif @@ -2284,8 +2275,8 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 %endif ; read lea dst2_reg, [dst_reg + stride_reg] -%ifidn %2, v -%if %4 == 8 && mmsize == 16 +%ifidn %1, v +%if %3 == 8 && mmsize == 16 %define movrow movh %else %define movrow mova @@ -2296,7 +2287,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 movrow m5, [dst2_reg] ; q1 movrow m6, [dst2_reg+ stride_reg] ; q2 movrow m7, [dst2_reg+ stride_reg*2] ; q3 -%if mmsize == 16 && %4 == 8 +%if mmsize == 16 && %3 == 8 movhps m0, [dst8_reg+mstride_reg*4] movhps m2, [dst8_reg+mstride_reg*2] add dst8_reg, stride_reg @@ -2333,7 +2324,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 SWAP 6, 3 SWAP 5, 3 %else ; sse2 (h) -%if %4 == 16 +%if %3 == 16 lea dst8_reg, [dst_reg + stride_reg*8] %endif @@ -2422,7 +2413,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 psubusb m6, m5 ; q2-q1 por m6, m4 ; abs(q2-q1) -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m4, flim_I pxor m3, m3 psubusb m0, m4 @@ -2444,9 +2435,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 ; normal_limit and high_edge_variance for p1-p0, q1-q0 SWAP 7, 3 ; now m7 is zero -%ifidn %2, v +%ifidn %1, v movrow m3, [dst_reg +mstride_reg] ; p0 -%if mmsize == 16 && %4 == 8 +%if mmsize == 16 && %3 == 8 movhps m3, [dst8_reg+mstride_reg] %endif %elifdef m12 @@ -2462,7 +2453,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 psubusb m1, m3 ; p1-p0 psubusb m6, m2 ; p0-p1 por m1, m6 ; abs(p1-p0) -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m6, m1 psubusb m1, m4 psubusb m6, hev_thr @@ -2476,9 +2467,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 %endif SWAP 6, 4 ; now m6 is I -%ifidn %2, v +%ifidn %1, v movrow m4, [dst_reg] ; q0 -%if mmsize == 16 && %4 == 8 +%if mmsize == 16 && %3 == 8 movhps m4, [dst8_reg] %endif %elifdef m8 @@ -2493,7 +2484,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 psubusb m1, m5 ; q0-q1 psubusb m7, m4 ; q1-q0 por m1, m7 ; abs(q1-q0) -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m7, m1 psubusb m1, m6 psubusb m7, hev_thr @@ -2605,7 +2596,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 paddusb m4, m1 ; q0-f1 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) -%if ssse3_or_higher +%if cpuflag(ssse3) mova m7, [pb_1] %else mova m7, [pw_63] @@ -2618,7 +2609,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 pxor m0, m0 mova m6, m1 pcmpgtb m0, m1 ; which are negative -%if ssse3_or_higher +%if cpuflag(ssse3) punpcklbw m6, m7 ; interleave with "1" for rounding punpckhbw m1, m7 %else @@ -2626,7 +2617,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 punpckhbw m1, m0 %endif mova lim_sign, m0 -%if ssse3_or_higher +%if cpuflag(ssse3) mova m7, [pb_27_63] %ifndef m8 mova lim_res, m1 @@ -2659,7 +2650,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 psubb m1, m6 pand m1, m0 ; -a0 pandn m0, m6 ; +a0 -%if ssse3_or_higher +%if cpuflag(ssse3) mova m6, [pb_18_63] ; pipelining %endif psubusb m3, m1 @@ -2667,7 +2658,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 paddusb m3, m0 ; p0+a0 psubusb m4, m0 ; q0-a0 -%if ssse3_or_higher +%if cpuflag(ssse3) SWAP 6, 7 %ifdef m10 SWAP 1, 10 @@ -2699,7 +2690,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 psubb m1, m6 pand m1, m0 ; -a1 pandn m0, m6 ; +a1 -%if ssse3_or_higher +%if cpuflag(ssse3) mova m6, [pb_9_63] %endif psubusb m2, m1 @@ -2707,7 +2698,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 paddusb m2, m0 ; p1+a1 psubusb m5, m0 ; q1-a1 -%if ssse3_or_higher +%if cpuflag(ssse3) SWAP 6, 7 %ifdef m10 SWAP 1, 10 @@ -2757,14 +2748,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 psubusb m6, m7 ; q1-a1 ; store -%ifidn %2, v +%ifidn %1, v movrow [dst2_reg+mstride_reg*4], m1 movrow [dst_reg +mstride_reg*2], m2 movrow [dst_reg +mstride_reg ], m3 movrow [dst_reg], m4 movrow [dst2_reg], m5 movrow [dst2_reg+ stride_reg ], m6 -%if mmsize == 16 && %4 == 8 +%if mmsize == 16 && %3 == 8 add dst8_reg, mstride_reg movhps [dst8_reg+mstride_reg*2], m1 movhps [dst8_reg+mstride_reg ], m2 @@ -2788,14 +2779,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg %else ; sse2 (h) lea dst8_reg, [dst8_reg+mstride_reg+1] - WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 + WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3 lea dst_reg, [dst2_reg+mstride_reg+4] lea dst8_reg, [dst8_reg+mstride_reg+4] -%ifidn %1, sse4 +%if cpuflag(sse4) add dst2_reg, 4 %endif WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg -%ifidn %1, sse4 +%if cpuflag(sse4) lea dst2_reg, [dst8_reg+ stride_reg] %endif WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg @@ -2803,15 +2794,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 %endif %if mmsize == 8 -%if %4 == 8 ; chroma -%ifidn %2, h +%if %3 == 8 ; chroma +%ifidn %1, h sub dst_reg, 5 %endif cmp dst_reg, dst8_reg mov dst_reg, dst8_reg jnz .next8px %else -%ifidn %2, h +%ifidn %1, h lea dst_reg, [dst_reg + stride_reg*8-5] %else ; v add dst_reg, 8 @@ -2828,46 +2819,31 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 %endmacro %if ARCH_X86_32 -INIT_MMX -%define SPLATB_REG SPLATB_REG_MMX -MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 -MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 -MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 -MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 - -%define SPLATB_REG SPLATB_REG_MMXEXT -MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 -MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 -MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 -MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 -%endif - -INIT_XMM -%define SPLATB_REG SPLATB_REG_SSE2 -%define WRITE_8W WRITE_8W_SSE2 -MBEDGE_LOOPFILTER sse2, v, 5, 16, 15 -%ifdef m8 -MBEDGE_LOOPFILTER sse2, h, 5, 16, 15 -%else -MBEDGE_LOOPFILTER sse2, h, 6, 16, 15 -%endif -MBEDGE_LOOPFILTER sse2, v, 6, 8, 15 -MBEDGE_LOOPFILTER sse2, h, 6, 8, 15 - -%define SPLATB_REG SPLATB_REG_SSSE3 -MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15 -%ifdef m8 -MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15 -%else -MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15 -%endif -MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 -MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 - -%define WRITE_8W WRITE_8W_SSE4 -%ifdef m8 -MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 -%else -MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 -%endif -MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 +INIT_MMX mmx +MBEDGE_LOOPFILTER v, 6, 16 +MBEDGE_LOOPFILTER h, 6, 16 +MBEDGE_LOOPFILTER v, 6, 8 +MBEDGE_LOOPFILTER h, 6, 8 + +INIT_MMX mmx2 +MBEDGE_LOOPFILTER v, 6, 16 +MBEDGE_LOOPFILTER h, 6, 16 +MBEDGE_LOOPFILTER v, 6, 8 +MBEDGE_LOOPFILTER h, 6, 8 +%endif + +INIT_XMM sse2 +MBEDGE_LOOPFILTER v, 5, 16 +MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16 +MBEDGE_LOOPFILTER v, 6, 8 +MBEDGE_LOOPFILTER h, 6, 8 + +INIT_XMM ssse3 +MBEDGE_LOOPFILTER v, 5, 16 +MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16 +MBEDGE_LOOPFILTER v, 6, 8 +MBEDGE_LOOPFILTER h, 6, 8 + +INIT_XMM sse4 +MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16 +MBEDGE_LOOPFILTER h, 6, 8 |