diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2011-05-12 04:51:24 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-05-12 04:51:24 +0200 |
commit | 612122b187d711257eecd517e4049cef3bb0b7f0 (patch) | |
tree | 2e0ed86f6f73bbc993a0e7787f331e21d1c7c064 /libavcodec | |
parent | 4ea216e761e02d3f6973b316feaf3484be91a14f (diff) | |
parent | 5705b02079449c685a3dd337fcc3a8b440dca4a0 (diff) | |
download | ffmpeg-612122b187d711257eecd517e4049cef3bb0b7f0.tar.gz |
Merge remote branch 'qatar/master'
* qatar/master: (32 commits)
10-bit H.264 x86 chroma v loopfilter asm
Port SMPTE S302M audio decoder from FFmbc 0.3. [Copyright headers corrected]
Fix crash of interlaced MPEG2 decoding
h264pred: fix one more aliasing violation.
doc/APIchanges: fill in missing hashes and dates.
flacenc: use proper initializers for AVOption default values.
lavc: deprecate named constants for deprecated antialias_algo.
aac: workaround for compilation on cygwin
swscale: extend YUV422p support to 10bits depth
tiff: add support for inverted FillOrder for uncompressed data
Remove unused softfloat implementation.
h264pred: fix aliasing violations.
rotozoom: Eliminate French variable name.
rotozoom: Check return value of fread().
rotozoom: Return an error value instead of calling exit().
rotozoom: Make init_demo() return int and check for errors on invocation.
rotozoom: Drop silly UINT8 typedef.
rotozoom: Drop some unnecessary parentheses.
rotozoom: K&R coding style cosmetics
rtsp: Only do keepalive using GET_PARAMETER if the server supports it
...
Conflicts:
Changelog
cmdutils.c
doc/APIchanges
doc/general.texi
ffmpeg.c
ffplay.c
libavcodec/h264pred_template.c
libavcodec/resample.c
libavutil/pixfmt.h
libavutil/softfloat.c
libavutil/softfloat.h
tests/rotozoom.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/Makefile | 1 | ||||
-rw-r--r-- | libavcodec/aaccoder.c | 3 | ||||
-rw-r--r-- | libavcodec/allcodecs.c | 1 | ||||
-rw-r--r-- | libavcodec/avcodec.h | 1 | ||||
-rw-r--r-- | libavcodec/flacenc.c | 32 | ||||
-rw-r--r-- | libavcodec/options.c | 2 | ||||
-rw-r--r-- | libavcodec/resample.c | 176 | ||||
-rw-r--r-- | libavcodec/s302m.c | 141 | ||||
-rw-r--r-- | libavcodec/tiff.c | 8 | ||||
-rw-r--r-- | libavcodec/x86/Makefile | 1 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 1 | ||||
-rw-r--r-- | libavcodec/x86/h264_deblock.asm | 354 | ||||
-rw-r--r-- | libavcodec/x86/h264_deblock_10bit.asm | 910 | ||||
-rw-r--r-- | libavcodec/x86/h264dsp_mmx.c | 139 | ||||
-rw-r--r-- | libavcodec/x86/x86util.asm | 24 |
15 files changed, 1479 insertions, 315 deletions
diff --git a/libavcodec/Makefile b/libavcodec/Makefile index a4c4d19824..e293438e45 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -329,6 +329,7 @@ OBJS-$(CONFIG_RV30_DECODER) += rv30.o rv34.o rv30dsp.o \ mpegvideo.o error_resilience.o OBJS-$(CONFIG_RV40_DECODER) += rv40.o rv34.o rv40dsp.o \ mpegvideo.o error_resilience.o +OBJS-$(CONFIG_S302M_DECODER) += s302m.o OBJS-$(CONFIG_SGI_DECODER) += sgidec.o OBJS-$(CONFIG_SGI_ENCODER) += sgienc.o rle.o OBJS-$(CONFIG_SHORTEN_DECODER) += shorten.o diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c index 6d55acbc45..187b4ad972 100644 --- a/libavcodec/aaccoder.c +++ b/libavcodec/aaccoder.c @@ -30,6 +30,8 @@ * add sane pulse detection ***********************************/ +#include "libavutil/libm.h" // brought forward to work around cygwin header breakage + #include <float.h> #include <math.h> #include "avcodec.h" @@ -37,7 +39,6 @@ #include "aac.h" #include "aacenc.h" #include "aactab.h" -#include "libavutil/libm.h" /** bits needed to code codebook run value for long windows */ static const uint8_t run_value_bits_long[64] = { diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index c321596d4a..fc74eeaf8c 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -184,6 +184,7 @@ void avcodec_register_all(void) REGISTER_ENCDEC (RV20, rv20); REGISTER_DECODER (RV30, rv30); REGISTER_DECODER (RV40, rv40); + REGISTER_DECODER (S302M, s302m); REGISTER_ENCDEC (SGI, sgi); REGISTER_DECODER (SMACKER, smacker); REGISTER_DECODER (SMC, smc); diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 00e9dd5e2f..2fbf9cfc2a 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -232,6 +232,7 @@ enum CodecID { CODEC_ID_PCM_F64LE, CODEC_ID_PCM_BLURAY, CODEC_ID_PCM_LXF, + CODEC_ID_S302M, /* various ADPCM codecs */ CODEC_ID_ADPCM_IMA_QT= 0x11000, diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c index 811a75a06f..c8d3af5546 100644 --- a/libavcodec/flacenc.c +++ b/libavcodec/flacenc.c @@ -1352,22 +1352,22 @@ static av_cold int flac_encode_close(AVCodecContext *avctx) #define FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM static const AVOption options[] = { -{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, 15, 0, MAX_LPC_PRECISION, FLAGS }, -{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" }, -{ "none", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_NONE, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "fixed", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_FIXED, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_LEVINSON, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_CHOLESKY, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, FLAGS }, -{ "min_partition_order", NULL, offsetof(FlacEncodeContext, options.min_partition_order), FF_OPT_TYPE_INT, -1, -1, MAX_PARTITION_ORDER, FLAGS }, -{ "max_partition_order", NULL, offsetof(FlacEncodeContext, options.max_partition_order), FF_OPT_TYPE_INT, -1, -1, MAX_PARTITION_ORDER, FLAGS }, -{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, -1, -1, ORDER_METHOD_LOG, FLAGS, "predm" }, -{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_EST, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "2level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_2LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "4level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_4LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "8level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_8LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "search", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_SEARCH, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "log", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_LOG, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, {.dbl = 15 }, 0, MAX_LPC_PRECISION, FLAGS }, +{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, {.dbl = FF_LPC_TYPE_DEFAULT }, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" }, +{ "none", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_NONE }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "fixed", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_FIXED }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, FLAGS }, +{ "min_partition_order", NULL, offsetof(FlacEncodeContext, options.min_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, MAX_PARTITION_ORDER, FLAGS }, +{ "max_partition_order", NULL, offsetof(FlacEncodeContext, options.max_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, MAX_PARTITION_ORDER, FLAGS }, +{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, ORDER_METHOD_LOG, FLAGS, "predm" }, +{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_EST }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "2level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_2LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "4level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_4LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "8level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_8LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "search", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_SEARCH }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "log", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_LOG }, INT_MIN, INT_MAX, FLAGS, "predm" }, { NULL }, }; diff --git a/libavcodec/options.c b/libavcodec/options.c index de25635161..a2dbb0ba73 100644 --- a/libavcodec/options.c +++ b/libavcodec/options.c @@ -305,11 +305,11 @@ static const AVOption options[]={ {"error", NULL, OFFSET(error_rate), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|E}, #if FF_API_ANTIALIAS_ALGO {"antialias", "MP3 antialias algorithm", OFFSET(antialias_algo), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|D, "aa"}, -#endif {"auto", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_AUTO }, INT_MIN, INT_MAX, V|D, "aa"}, {"fastint", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_FASTINT }, INT_MIN, INT_MAX, V|D, "aa"}, {"int", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_INT }, INT_MIN, INT_MAX, V|D, "aa"}, {"float", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_FLOAT }, INT_MIN, INT_MAX, V|D, "aa"}, +#endif {"qns", "quantizer noise shaping", OFFSET(quantizer_noise_shaping), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|E}, {"threads", NULL, OFFSET(thread_count), FF_OPT_TYPE_INT, {.dbl = 1 }, INT_MIN, INT_MAX, V|E|D}, {"me_threshold", "motion estimaton threshold", OFFSET(me_threshold), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX}, diff --git a/libavcodec/resample.c b/libavcodec/resample.c index d3c12f6354..9e6defefdf 100644 --- a/libavcodec/resample.c +++ b/libavcodec/resample.c @@ -29,6 +29,8 @@ #include "libavutil/opt.h" #include "libavutil/samplefmt.h" +#define MAX_CHANNELS 8 + struct AVResampleContext; static const char *context_to_name(void *ptr) @@ -37,20 +39,22 @@ static const char *context_to_name(void *ptr) } static const AVOption options[] = {{NULL}}; -static const AVClass audioresample_context_class = { "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT }; +static const AVClass audioresample_context_class = { + "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT +}; struct ReSampleContext { struct AVResampleContext *resample_context; - short *temp[2]; + short *temp[MAX_CHANNELS]; int temp_len; float ratio; /* channel convert */ int input_channels, output_channels, filter_channels; AVAudioConvert *convert_ctx[2]; enum AVSampleFormat sample_fmt[2]; ///< input and output sample format - unsigned sample_size[2]; ///< size of one sample in sample_fmt - short *buffer[2]; ///< buffers used for conversion to S16 - unsigned buffer_size[2]; ///< sizes of allocated buffers + unsigned sample_size[2]; ///< size of one sample in sample_fmt + short *buffer[2]; ///< buffers used for conversion to S16 + unsigned buffer_size[2]; ///< sizes of allocated buffers }; /* n1: number of samples */ @@ -104,41 +108,42 @@ static void mono_to_stereo(short *output, short *input, int n1) } } -/* XXX: should use more abstract 'N' channels system */ -static void stereo_split(short *output1, short *output2, short *input, int n) +static void deinterleave(short **output, short *input, int channels, int samples) { - int i; + int i, j; - for(i=0;i<n;i++) { - *output1++ = *input++; - *output2++ = *input++; + for (i = 0; i < samples; i++) { + for (j = 0; j < channels; j++) { + *output[j]++ = *input++; + } } } -static void stereo_mux(short *output, short *input1, short *input2, int n) +static void interleave(short *output, short **input, int channels, int samples) { - int i; + int i, j; - for(i=0;i<n;i++) { - *output++ = *input1++; - *output++ = *input2++; + for (i = 0; i < samples; i++) { + for (j = 0; j < channels; j++) { + *output++ = *input[j]++; + } } } static void ac3_5p1_mux(short *output, short *input1, short *input2, int n) { int i; - short l,r; - - for(i=0;i<n;i++) { - l=*input1++; - r=*input2++; - *output++ = l; /* left */ - *output++ = (l/2)+(r/2); /* center */ - *output++ = r; /* right */ - *output++ = 0; /* left surround */ - *output++ = 0; /* right surroud */ - *output++ = 0; /* low freq */ + short l, r; + + for (i = 0; i < n; i++) { + l = *input1++; + r = *input2++; + *output++ = l; /* left */ + *output++ = (l / 2) + (r / 2); /* center */ + *output++ = r; /* right */ + *output++ = 0; /* left surround */ + *output++ = 0; /* right surroud */ + *output++ = 0; /* low freq */ } } @@ -151,18 +156,25 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, { ReSampleContext *s; - if ( input_channels > 2) - { - av_log(NULL, AV_LOG_ERROR, "Resampling with input channels greater than 2 unsupported.\n"); + if (input_channels > MAX_CHANNELS) { + av_log(NULL, AV_LOG_ERROR, + "Resampling with input channels greater than %d is unsupported.\n", + MAX_CHANNELS); + return NULL; + } + if (output_channels > 2 && + !(output_channels == 6 && input_channels == 2) && + output_channels != input_channels) { + av_log(NULL, AV_LOG_ERROR, + "Resampling output channel count must be 1 or 2 for mono input; 1, 2 or 6 for stereo input; or N for N channel input.\n"); return NULL; - } + } s = av_mallocz(sizeof(ReSampleContext)); - if (!s) - { + if (!s) { av_log(NULL, AV_LOG_ERROR, "Can't allocate memory for resample context.\n"); return NULL; - } + } s->ratio = (float)output_rate / (float)input_rate; @@ -173,10 +185,10 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, if (s->output_channels < s->filter_channels) s->filter_channels = s->output_channels; - s->sample_fmt [0] = sample_fmt_in; - s->sample_fmt [1] = sample_fmt_out; - s->sample_size[0] = av_get_bits_per_sample_fmt(s->sample_fmt[0])>>3; - s->sample_size[1] = av_get_bits_per_sample_fmt(s->sample_fmt[1])>>3; + s->sample_fmt[0] = sample_fmt_in; + s->sample_fmt[1] = sample_fmt_out; + s->sample_size[0] = av_get_bits_per_sample_fmt(s->sample_fmt[0]) >> 3; + s->sample_size[1] = av_get_bits_per_sample_fmt(s->sample_fmt[1]) >> 3; if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) { if (!(s->convert_ctx[0] = av_audio_convert_alloc(AV_SAMPLE_FMT_S16, 1, @@ -201,17 +213,10 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, } } -/* - * AC-3 output is the only case where filter_channels could be greater than 2. - * input channels can't be greater than 2, so resample the 2 channels and then - * expand to 6 channels after the resampling. - */ - if(s->filter_channels>2) - s->filter_channels = 2; - #define TAPS 16 - s->resample_context= av_resample_init(output_rate, input_rate, - filter_length, log2_phase_count, linear, cutoff); + s->resample_context = av_resample_init(output_rate, input_rate, + filter_length, log2_phase_count, + linear, cutoff); *(const AVClass**)s->resample_context = &audioresample_context_class; @@ -223,9 +228,9 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples) { int i, nb_samples1; - short *bufin[2]; - short *bufout[2]; - short *buftmp2[2], *buftmp3[2]; + short *bufin[MAX_CHANNELS]; + short *bufout[MAX_CHANNELS]; + short *buftmp2[MAX_CHANNELS], *buftmp3[MAX_CHANNELS]; short *output_bak = NULL; int lenout; @@ -240,7 +245,7 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl int ostride[1] = { 2 }; const void *ibuf[1] = { input }; void *obuf[1]; - unsigned input_size = nb_samples*s->input_channels*2; + unsigned input_size = nb_samples * s->input_channels * 2; if (!s->buffer_size[0] || s->buffer_size[0] < input_size) { av_free(s->buffer[0]); @@ -255,12 +260,13 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl obuf[0] = s->buffer[0]; if (av_audio_convert(s->convert_ctx[0], obuf, ostride, - ibuf, istride, nb_samples*s->input_channels) < 0) { - av_log(s->resample_context, AV_LOG_ERROR, "Audio sample format conversion failed\n"); + ibuf, istride, nb_samples * s->input_channels) < 0) { + av_log(s->resample_context, AV_LOG_ERROR, + "Audio sample format conversion failed\n"); return 0; } - input = s->buffer[0]; + input = s->buffer[0]; } lenout= 2*s->output_channels*nb_samples * s->ratio + 16; @@ -282,52 +288,50 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl } /* XXX: move those malloc to resample init code */ - for(i=0; i<s->filter_channels; i++){ - bufin[i]= av_malloc( (nb_samples + s->temp_len) * sizeof(short) ); + for (i = 0; i < s->filter_channels; i++) { + bufin[i] = av_malloc((nb_samples + s->temp_len) * sizeof(short)); memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short)); buftmp2[i] = bufin[i] + s->temp_len; + bufout[i] = av_malloc(lenout * sizeof(short)); } - /* make some zoom to avoid round pb */ - bufout[0]= av_malloc( lenout * sizeof(short) ); - bufout[1]= av_malloc( lenout * sizeof(short) ); - - if (s->input_channels == 2 && - s->output_channels == 1) { + if (s->input_channels == 2 && s->output_channels == 1) { buftmp3[0] = output; stereo_to_mono(buftmp2[0], input, nb_samples); } else if (s->output_channels >= 2 && s->input_channels == 1) { buftmp3[0] = bufout[0]; - memcpy(buftmp2[0], input, nb_samples*sizeof(short)); - } else if (s->output_channels >= 2) { - buftmp3[0] = bufout[0]; - buftmp3[1] = bufout[1]; - stereo_split(buftmp2[0], buftmp2[1], input, nb_samples); + memcpy(buftmp2[0], input, nb_samples * sizeof(short)); + } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) { + for (i = 0; i < s->input_channels; i++) { + buftmp3[i] = bufout[i]; + } + deinterleave(buftmp2, input, s->input_channels, nb_samples); } else { buftmp3[0] = output; - memcpy(buftmp2[0], input, nb_samples*sizeof(short)); + memcpy(buftmp2[0], input, nb_samples * sizeof(short)); } nb_samples += s->temp_len; /* resample each channel */ nb_samples1 = 0; /* avoid warning */ - for(i=0;i<s->filter_channels;i++) { + for (i = 0; i < s->filter_channels; i++) { int consumed; - int is_last= i+1 == s->filter_channels; + int is_last = i + 1 == s->filter_channels; - nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i], &consumed, nb_samples, lenout, is_last); - s->temp_len= nb_samples - consumed; - s->temp[i]= av_realloc(s->temp[i], s->temp_len*sizeof(short)); - memcpy(s->temp[i], bufin[i] + consumed, s->temp_len*sizeof(short)); + nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i], + &consumed, nb_samples, lenout, is_last); + s->temp_len = nb_samples - consumed; + s->temp[i] = av_realloc(s->temp[i], s->temp_len * sizeof(short)); + memcpy(s->temp[i], bufin[i] + consumed, s->temp_len * sizeof(short)); } if (s->output_channels == 2 && s->input_channels == 1) { mono_to_stereo(output, buftmp3[0], nb_samples1); - } else if (s->output_channels == 2) { - stereo_mux(output, buftmp3[0], buftmp3[1], nb_samples1); - } else if (s->output_channels == 6) { + } else if (s->output_channels == 6 && s->input_channels == 2) { ac3_5p1_mux(output, buftmp3[0], buftmp3[1], nb_samples1); + } else if (s->output_channels == s->input_channels && s->input_channels >= 2) { + interleave(output, buftmp3, s->output_channels, nb_samples1); } if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) { @@ -337,25 +341,27 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl void *obuf[1] = { output_bak }; if (av_audio_convert(s->convert_ctx[1], obuf, ostride, - ibuf, istride, nb_samples1*s->output_channels) < 0) { - av_log(s->resample_context, AV_LOG_ERROR, "Audio sample format convertion failed\n"); + ibuf, istride, nb_samples1 * s->output_channels) < 0) { + av_log(s->resample_context, AV_LOG_ERROR, + "Audio sample format convertion failed\n"); return 0; } } - for(i=0; i<s->filter_channels; i++) + for (i = 0; i < s->filter_channels; i++) { av_free(bufin[i]); + av_free(bufout[i]); + } - av_free(bufout[0]); - av_free(bufout[1]); return nb_samples1; } void audio_resample_close(ReSampleContext *s) { + int i; av_resample_close(s->resample_context); - av_freep(&s->temp[0]); - av_freep(&s->temp[1]); + for (i = 0; i < s->filter_channels; i++) + av_freep(&s->temp[i]); av_freep(&s->buffer[0]); av_freep(&s->buffer[1]); av_audio_convert_free(s->convert_ctx[0]); diff --git a/libavcodec/s302m.c b/libavcodec/s302m.c new file mode 100644 index 0000000000..dd0ec2ee19 --- /dev/null +++ b/libavcodec/s302m.c @@ -0,0 +1,141 @@ +/* + * SMPTE 302M decoder + * Copyright (c) 2008 Laurent Aimar <fenrir@videolan.org> + * Copyright (c) 2009 Baptiste Coudurier <baptiste.coudurier@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/intreadwrite.h" +#include "avcodec.h" + +#define AES3_HEADER_LEN 4 + +static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf, + int buf_size) +{ + uint32_t h; + int frame_size, channels, id, bits; + + if (buf_size <= AES3_HEADER_LEN) { + av_log(avctx, AV_LOG_ERROR, "frame is too short\n"); + return AVERROR_INVALIDDATA; + } + + /* + * AES3 header : + * size: 16 + * number channels 2 + * channel_id 8 + * bits per samples 2 + * alignments 4 + */ + + h = AV_RB32(buf); + frame_size = (h >> 16) & 0xffff; + channels = ((h >> 14) & 0x0003) * 2 + 2; + id = (h >> 6) & 0x00ff; + bits = ((h >> 4) & 0x0003) * 4 + 16; + + if (AES3_HEADER_LEN + frame_size != buf_size || bits > 24) { + av_log(avctx, AV_LOG_ERROR, "frame has invalid header\n"); + return AVERROR_INVALIDDATA; + } + + /* Set output properties */ + avctx->bits_per_coded_sample = bits; + if (bits > 16) + avctx->sample_fmt = SAMPLE_FMT_S32; + else + avctx->sample_fmt = SAMPLE_FMT_S16; + + avctx->channels = channels; + avctx->sample_rate = 48000; + avctx->bit_rate = 48000 * avctx->channels * (avctx->bits_per_coded_sample + 4) + + 32 * (48000 / (buf_size * 8 / + (avctx->channels * + (avctx->bits_per_coded_sample + 4)))); + + return frame_size; +} + +static int s302m_decode_frame(AVCodecContext *avctx, void *data, + int *data_size, AVPacket *avpkt) +{ + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; + + int frame_size = s302m_parse_frame_header(avctx, buf, buf_size); + if (frame_size < 0) + return frame_size; + + buf_size -= AES3_HEADER_LEN; + buf += AES3_HEADER_LEN; + + if (*data_size < 4 * buf_size * 8 / (avctx->bits_per_coded_sample + 4)) + return -1; + + if (avctx->bits_per_coded_sample == 24) { + uint32_t *o = data; + for (; buf_size > 6; buf_size -= 7) { + *o++ = (av_reverse[buf[2]] << 24) | + (av_reverse[buf[1]] << 16) | + (av_reverse[buf[0]] << 8); + *o++ = (av_reverse[buf[6] & 0xf0] << 28) | + (av_reverse[buf[5]] << 20) | + (av_reverse[buf[4]] << 12) | + (av_reverse[buf[3] & 0x0f] << 8); + buf += 7; + } + *data_size = (uint8_t*) o - (uint8_t*) data; + } else if (avctx->bits_per_coded_sample == 20) { + uint32_t *o = data; + for (; buf_size > 5; buf_size -= 6) { + *o++ = (av_reverse[buf[2] & 0xf0] << 28) | + (av_reverse[buf[1]] << 20) | + (av_reverse[buf[0]] << 12); + *o++ = (av_reverse[buf[5] & 0xf0] << 28) | + (av_reverse[buf[4]] << 20) | + (av_reverse[buf[3]] << 12); + buf += 6; + } + *data_size = (uint8_t*) o - (uint8_t*) data; + } else { + uint16_t *o = data; + for (; buf_size > 4; buf_size -= 5) { + *o++ = (av_reverse[buf[1]] << 8) | + av_reverse[buf[0]]; + *o++ = (av_reverse[buf[4] & 0xf0] << 12) | + (av_reverse[buf[3]] << 4) | + av_reverse[buf[2] & 0x0f]; + buf += 5; + } + *data_size = (uint8_t*) o - (uint8_t*) data; + } + + return buf - avpkt->data; +} + + +AVCodec ff_s302m_decoder = { + .name = "s302m", + .type = AVMEDIA_TYPE_AUDIO, + .id = CODEC_ID_S302M, + .priv_data_size = 0, + .decode = s302m_decode_frame, + .long_name = NULL_IF_CONFIG_SMALL("SMPTE 302M"), +}; diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c index f252913b62..7e7ae0f007 100644 --- a/libavcodec/tiff.c +++ b/libavcodec/tiff.c @@ -168,7 +168,13 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t* dst, int stride, const uin } switch(s->compr){ case TIFF_RAW: - memcpy(dst, src, width); + if (!s->fill_order) { + memcpy(dst, src, width); + } else { + int i; + for (i = 0; i < width; i++) + dst[i] = av_reverse[src[i]]; + } src += width; break; case TIFF_PACKBITS: diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 6416e600a3..4775853697 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -9,6 +9,7 @@ YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ + x86/h264_deblock_10bit.o \ x86/h264_weight.o \ x86/h264_idct.o \ diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 1a5413bda4..d867dc3e6a 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -43,6 +43,7 @@ DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = {0x8000000080000000ULL, 0x8000000080000000ULL}; DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index fb9cacfd11..dbf3daa02e 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -1,10 +1,11 @@ ;***************************************************************************** -;* MMX/SSE2-optimized H.264 deblocking code +;* MMX/SSE2/AVX-optimized H.264 deblocking code ;***************************************************************************** -;* Copyright (C) 2005-2008 x264 project +;* Copyright (C) 2005-2011 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Oskar Arvidsson <oskar@irock.se> ;* ;* This file is part of FFmpeg. ;* @@ -26,96 +27,94 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION .text cextern pb_0 cextern pb_1 cextern pb_3 cextern pb_A1 -SECTION .text - ; expands to [base],...,[base+7*stride] %define PASS8ROWS(base, base3, stride, stride3) \ [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] -; in: 8 rows of 4 bytes in %1..%8 +%define PASS8ROWS(base, base3, stride, stride3, offset) \ + PASS8ROWS(base+offset, base3+offset, stride, stride3) + +; in: 8 rows of 4 bytes in %4..%11 ; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 +%macro TRANSPOSE4x8_LOAD 11 + movh m0, %4 + movh m2, %5 + movh m1, %6 + movh m3, %7 + punpckl%1 m0, m2 + punpckl%1 m1, m3 + mova m2, m0 + punpckl%2 m0, m1 + punpckh%2 m2, m1 + + movh m4, %8 + movh m6, %9 + movh m5, %10 + movh m7, %11 + punpckl%1 m4, m6 + punpckl%1 m5, m7 + mova m6, m4 + punpckl%2 m4, m5 + punpckh%2 m6, m5 + + punpckh%3 m1, m0, m4 + punpckh%3 m3, m2, m6 + punpckl%3 m0, m4 + punpckl%3 m2, m6 %endmacro ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 - punpckhdq m4, m4 - punpckhdq m5, m5 - punpckhdq m6, m6 +%macro TRANSPOSE8x4B_STORE 8 + punpckhdq m4, m0, m0 + punpckhdq m5, m1, m1 + punpckhdq m6, m2, m2 punpcklbw m0, m1 punpcklbw m2, m3 - movq m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd %1, m0 - punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + movh %1, m1 punpckhdq m1, m1 - movd %4, m1 + movh %2, m1 + movh %3, m0 + punpckhdq m0, m0 + movh %4, m0 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 - movq m5, m4 - punpcklwd m4, m6 - punpckhwd m5, m6 - movd %5, m4 - punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + movh %5, m5 punpckhdq m5, m5 - movd %8, m5 + movh %6, m5 + movh %7, m4 + punpckhdq m4, m4 + movh %8, m4 +%endmacro + +%macro TRANSPOSE4x8B_LOAD 8 + TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 %endmacro %macro SBUTTERFLY3 4 - movq %4, %2 + punpckh%1 %4, %2, %3 punpckl%1 %2, %3 - punpckh%1 %4, %3 %endmacro ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] %macro TRANSPOSE6x8_MEM 9 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -123,30 +122,32 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY3 bw, m6, %8, m5 - SBUTTERFLY3 wd, m0, m2, m1 - SBUTTERFLY3 wd, m4, m6, m2 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + movq [%9+0x10], m3 + SBUTTERFLY3 bw, m6, %8, m7 + SBUTTERFLY wd, 0, 2, 3 + SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY3 wd, m7, [%9+0x10], m6 - SBUTTERFLY3 wd, m3, m5, m4 - SBUTTERFLY3 dq, m7, m3, m0 - SBUTTERFLY3 dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 + SBUTTERFLY3 wd, m1, [%9+0x10], m3 + SBUTTERFLY wd, 5, 7, 0 + SBUTTERFLY dq, 1, 5, 0 + SBUTTERFLY dq, 2, 6, 0 + punpckldq m3, m7 + movq [%9+0x10], m2 + movq [%9+0x20], m6 + movq [%9+0x30], m1 + movq [%9+0x40], m5 + movq [%9+0x50], m3 + RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -154,38 +155,44 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - SBUTTERFLY3 bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY3 wd, m0, m2, m3 - SBUTTERFLY3 wd, m4, m6, m2 - SBUTTERFLY3 wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY3 wd, m2, m5, m1 - SBUTTERFLY3 dq, m0, m4, m5 - SBUTTERFLY3 dq, m7, m2, m4 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + SBUTTERFLY3 bw, m6, %8, m7 + movq %9, m5 + SBUTTERFLY wd, 0, 2, 5 + SBUTTERFLY wd, 4, 6, 5 + SBUTTERFLY wd, 1, 3, 5 + movq %11, m6 + movq m6, %9 + SBUTTERFLY wd, 6, 7, 5 + SBUTTERFLY dq, 0, 4, 5 + SBUTTERFLY dq, 1, 6, 5 movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY3 dq, m3, %11, m0 - SBUTTERFLY3 dq, m6, m1, m5 - movq %11, m3 + movq %10, m4 + movq %13, m1 + movq %14, m6 + SBUTTERFLY3 dq, m2, %11, m0 + SBUTTERFLY dq, 3, 7, 4 + movq %11, m2 movq %12, m0 - movq %15, m6 - movq %16, m5 + movq %15, m3 + movq %16, m7 + RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT 5 +%if avx_enabled == 0 mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 +%else + psubusb %5, %2, %1 + psubusb %4, %1, %2 +%endif por %4, %5 psubusb %4, %3 %endmacro @@ -193,32 +200,28 @@ SECTION .text ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT2 5 +%ifdef ARCH_X86_64 + psubusb %5, %2, %1 + psubusb %4, %1, %2 +%else mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 +%endif psubusb %5, %3 psubusb %4, %3 pcmpeqb %4, %5 %endmacro -%macro SPLATW 1 -%ifidn m0, xmm0 - pshuflw %1, %1, 0 - punpcklqdq %1, %1 -%else - pshufw %1, %1, 0 -%endif -%endmacro - ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 ; out: m5=beta-1, m7=mask, %3=alpha-1 ; clobbers: m4,m6 %macro LOAD_MASK 2-3 movd m4, %1 movd m5, %2 - SPLATW m4 - SPLATW m5 + SPLATW m4, m4 + SPLATW m5, m5 packuswb m4, m4 ; 16x alpha-1 packuswb m5, m5 ; 16x beta-1 %if %0>2 @@ -237,8 +240,7 @@ SECTION .text ; out: m1=p0' m2=q0' ; clobbers: m0,3-6 %macro DEBLOCK_P0_Q0 0 - mova m5, m1 - pxor m5, m2 ; p0^q0 + pxor m5, m1, m2 ; p0^q0 pand m5, [pb_1] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 @@ -264,14 +266,12 @@ SECTION .text ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) ; clobbers: q2, tmp, tc0 %macro LUMA_Q1 6 - mova %6, m1 - pavgb %6, m2 + pavgb %6, m1, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 pand %6, [pb_1] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - mova %6, %1 - psubusb %6, %5 + psubusb %6, %1, %5 paddusb %5, %1 pmaxub %2, %6 pminub %2, %5 @@ -280,10 +280,10 @@ SECTION .text %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -INIT_XMM -cglobal x264_deblock_v_luma_sse2, 5,5,10 +%macro DEBLOCK_LUMA 1 +cglobal deblock_v_luma_8_%1, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -307,8 +307,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 movdqa m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 - mova m7, m8 - psubb m7, m6 + psubb m7, m8, m6 pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -326,10 +325,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_sse2, 5,7 +cglobal deblock_h_luma_8_%1, 5,7 movsxd r10, r1d lea r11, [r10+r10*2] lea r6, [r0-4] @@ -350,13 +349,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %ifdef WIN64 mov [rsp+0x20], r4 %endif - call x264_deblock_v_luma_sse2 + call deblock_v_luma_8_%1 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 @@ -365,7 +364,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -375,7 +374,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -383,14 +382,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7 add rsp, 0x68 %endif RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA sse2 +INIT_AVX +DEBLOCK_LUMA avx %else %macro DEBLOCK_LUMA 3 ;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_%1, 5,5 +cglobal deblock_%2_luma_8_%1, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 @@ -419,8 +424,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m4 pand m4, [esp+%3] ; tc - mova m7, m4 - psubb m7, m6 + psubb m7, m4, m6 pand m6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -441,10 +445,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_%1, 0,5 +cglobal deblock_h_luma_8_%1, 0,5 mov r0, r0mp mov r3, r1m lea r4, [r3*3] @@ -467,11 +471,11 @@ cglobal x264_deblock_h_luma_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_8_%1 %ifidn %2, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_8_%1 %endif ADD esp, 20 @@ -484,7 +488,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -492,7 +496,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -502,22 +506,34 @@ INIT_MMX DEBLOCK_LUMA mmxext, v8, 8 INIT_XMM DEBLOCK_LUMA sse2, v, 16 +INIT_AVX +DEBLOCK_LUMA avx, v, 16 %endif ; ARCH %macro LUMA_INTRA_P012 4 ; p0..p3 in memory +%ifdef ARCH_X86_64 + pavgb t0, p2, p1 + pavgb t1, p0, q0 +%else mova t0, p2 mova t1, p0 pavgb t0, p1 pavgb t1, q0 +%endif pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 mova t5, t1 +%ifdef ARCH_X86_64 + paddb t2, p2, p1 + paddb t3, p0, q0 +%else mova t2, p2 mova t3, p0 paddb t2, p1 paddb t3, q0 +%endif paddb t2, t3 mova t3, t2 mova t4, t2 @@ -527,10 +543,15 @@ DEBLOCK_LUMA sse2, v, 16 pand t2, mpb_1 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; +%ifdef ARCH_X86_64 + pavgb t1, p2, q1 + psubb t2, p2, q1 +%else mova t1, p2 mova t2, p2 pavgb t1, q1 psubb t2, q1 +%endif paddb t3, t3 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 pand t2, mpb_1 @@ -543,10 +564,8 @@ DEBLOCK_LUMA sse2, v, 16 pand t3, mpb_1 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 - mova t3, p0 - mova t2, p0 - pxor t3, q1 - pavgb t2, q1 + pxor t3, p0, q1 + pavgb t2, p0, q1 pand t3, mpb_1 psubb t2, t3 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 @@ -560,9 +579,8 @@ DEBLOCK_LUMA sse2, v, 16 mova %1, t1 ; store p0 mova t1, %4 ; p3 - mova t2, t1 + paddb t2, t1, p2 pavgb t1, p2 - paddb t2, p2 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 paddb t2, t2 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 @@ -624,9 +642,9 @@ DEBLOCK_LUMA sse2, v, 16 %endif ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 +cglobal deblock_%2_luma_intra_8_%1, 4,6,16 %ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -686,9 +704,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 INIT_MMX %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1, 4,7 +cglobal deblock_h_luma_intra_8_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] lea r6, [r0-4] @@ -704,7 +722,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 lea r0, [pix_tmp+0x40] mov r1, 0x10 - call x264_deblock_v_luma_intra_%1 + call deblock_v_luma_intra_8_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r11] @@ -717,7 +735,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 add rsp, 0x88 RET %else -cglobal x264_deblock_h_luma_intra_%1, 2,4 +cglobal deblock_h_luma_intra_8_%1, 2,4 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] @@ -736,10 +754,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 PUSH dword r2m PUSH dword 16 PUSH r0 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_8_%1 %ifidn %2, v8 add dword [rsp], 8 ; pix_tmp+8 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_8_%1 %endif ADD esp, 16 @@ -760,13 +778,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 INIT_XMM DEBLOCK_LUMA_INTRA sse2, v +INIT_AVX +DEBLOCK_LUMA_INTRA avx , v %ifndef ARCH_X86_64 INIT_MMX DEBLOCK_LUMA_INTRA mmxext, v8 %endif - - INIT_MMX %macro CHROMA_V_START 0 @@ -790,23 +808,23 @@ INIT_MMX %define t6 r6 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_8_mmxext, 5,6 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 +cglobal deblock_h_chroma_8_mmxext, 5,7 %ifdef ARCH_X86_64 %define buf0 [rsp-24] %define buf1 [rsp-16] @@ -815,17 +833,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7 %define buf1 r2m %endif CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) movq buf0, m0 movq buf1, m3 - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq m0, buf0 movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -x264_chroma_inter_body_mmxext: +ff_chroma_inter_body_mmxext: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 @@ -850,31 +868,31 @@ x264_chroma_inter_body_mmxext: %define t6 r5 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 +cglobal deblock_v_chroma_intra_8_mmxext, 4,5 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_intra_body_mmxext + call ff_chroma_intra_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 +cglobal deblock_h_chroma_intra_8_mmxext, 4,6 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - call x264_chroma_intra_body_mmxext - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) + call ff_chroma_intra_body_mmxext + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -x264_chroma_intra_body_mmxext: +ff_chroma_intra_body_mmxext: LOAD_MASK r2d, r3d movq m5, m1 movq m6, m2 diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm new file mode 100644 index 0000000000..c253d02954 --- /dev/null +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -0,0 +1,910 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Oskar Arvidsson <oskar@irock.se> +;* Loren Merritt <lorenm@u.washington.edu> +;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +pw_pixel_max: times 8 dw ((1 << 10)-1) + +SECTION .text + +cextern pw_2 +cextern pw_3 +cextern pw_4 + +; out: %4 = |%1-%2|-%3 +; clobbers: %5 +%macro ABS_SUB 5 + psubusw %5, %2, %1 + psubusw %4, %1, %2 + por %4, %5 + psubw %4, %3 +%endmacro + +; out: %4 = |%1-%2|<%3 +%macro DIFF_LT 5 + psubusw %4, %2, %1 + psubusw %5, %1, %2 + por %5, %4 ; |%1-%2| + pxor %4, %4 + psubw %5, %3 ; |%1-%2|-%3 + pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 +%endmacro + +%macro LOAD_AB 4 + movd %1, %3 + movd %2, %4 + SPLATW %1, %1 + SPLATW %2, %2 +%endmacro + +; in: %2=tc reg +; out: %1=splatted tc +%macro LOAD_TC 2 + movd %1, [%2] + punpcklbw %1, %1 +%if mmsize == 8 + pshufw %1, %1, 0 +%else + pshuflw %1, %1, 01010000b + pshufd %1, %1, 01010000b +%endif + psraw %1, 6 +%endmacro + +; in: %1=p1, %2=p0, %3=q0, %4=q1 +; %5=alpha, %6=beta, %7-%9=tmp +; out: %7=mask +%macro LOAD_MASK 9 + ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha + ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta + pand %8, %9 + ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta + pxor %7, %7 + pand %8, %9 + pcmpgtw %7, %8 +%endmacro + +; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp +; out: %1=p0', m2=q0' +%macro DEBLOCK_P0_Q0 7 + psubw %3, %4 + pxor %7, %7 + paddw %3, [pw_4] + psubw %7, %5 + psubw %6, %2, %1 + psllw %6, 2 + paddw %3, %6 + psraw %3, 3 + mova %6, [pw_pixel_max] + CLIPW %3, %7, %5 + pxor %7, %7 + paddw %1, %3 + psubw %2, %3 + CLIPW %1, %7, %6 + CLIPW %2, %7, %6 +%endmacro + +; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp +%macro LUMA_Q1 6 + pavgw %6, %3, %4 ; (p0+q0+1)>>1 + paddw %1, %6 + pxor %6, %6 + psraw %1, 1 + psubw %6, %5 + psubw %1, %2 + CLIPW %1, %6, %5 + paddw %1, %2 +%endmacro + +%macro LUMA_DEBLOCK_ONE 3 + DIFF_LT m5, %1, bm, m4, m6 + pxor m6, m6 + mova %3, m4 + pcmpgtw m6, tcm + pand m4, tcm + pandn m6, m7 + pand m4, m6 + LUMA_Q1 m5, %2, m1, m2, m4, m6 +%endmacro + +%macro LUMA_H_STORE 2 +%if mmsize == 8 + movq [r0-4], m0 + movq [r0+r1-4], m1 + movq [r0+r1*2-4], m2 + movq [r0+%2-4], m3 +%else + movq [r0-4], m0 + movhps [r0+r1-4], m0 + movq [r0+r1*2-4], m1 + movhps [%1-4], m1 + movq [%1+r1-4], m2 + movhps [%1+r1*2-4], m2 + movq [%1+%2-4], m3 + movhps [%1+r1*4-4], m3 +%endif +%endmacro + +%macro DEBLOCK_LUMA 1 +;----------------------------------------------------------------------------- +; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16) + %assign pad 5*mmsize+12-(stack_offset&15) + %define tcm [rsp] + %define ms1 [rsp+mmsize] + %define ms2 [rsp+mmsize*2] + %define am [rsp+mmsize*3] + %define bm [rsp+mmsize*4] + SUB rsp, pad + shl r2d, 2 + shl r3d, 2 + LOAD_AB m4, m5, r2, r3 + mov r3, 32/mmsize + mov r2, r0 + sub r0, r1 + mova am, m4 + sub r0, r1 + mova bm, m5 + sub r0, r1 +.loop: + mova m0, [r0+r1] + mova m1, [r0+r1*2] + mova m2, [r2] + mova m3, [r2+r1] + + LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 + LOAD_TC m6, r4 + mova tcm, m6 + + mova m5, [r0] + LUMA_DEBLOCK_ONE m1, m0, ms1 + mova [r0+r1], m5 + + mova m5, [r2+r1*2] + LUMA_DEBLOCK_ONE m2, m3, ms2 + mova [r2+r1], m5 + + pxor m5, m5 + mova m6, tcm + pcmpgtw m5, tcm + psubw m6, ms1 + pandn m5, m7 + psubw m6, ms2 + pand m5, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 + mova [r0+r1*2], m1 + mova [r2], m2 + + add r0, mmsize + add r2, mmsize + add r4, mmsize/8 + dec r3 + jg .loop + ADD rsp, pad + RET + +cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16) + %assign pad 7*mmsize+12-(stack_offset&15) + %define tcm [rsp] + %define ms1 [rsp+mmsize] + %define ms2 [rsp+mmsize*2] + %define p1m [rsp+mmsize*3] + %define p2m [rsp+mmsize*4] + %define am [rsp+mmsize*5] + %define bm [rsp+mmsize*6] + SUB rsp, pad + shl r2d, 2 + shl r3d, 2 + LOAD_AB m4, m5, r2, r3 + mov r3, r1 + mova am, m4 + add r3, r1 + mov r5, 32/mmsize + mova bm, m5 + add r3, r1 +%if mmsize == 16 + mov r2, r0 + add r2, r3 +%endif +.loop: +%if mmsize == 8 + movq m2, [r0-8] ; y q2 q1 q0 + movq m7, [r0+0] + movq m5, [r0+r1-8] + movq m3, [r0+r1+0] + movq m0, [r0+r1*2-8] + movq m6, [r0+r1*2+0] + movq m1, [r0+r3-8] + TRANSPOSE4x4W 2, 5, 0, 1, 4 + SWAP 2, 7 + movq m7, [r0+r3] + TRANSPOSE4x4W 2, 3, 6, 7, 4 +%else + movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x + movu m0, [r0+r1-8] + movu m2, [r0+r1*2-8] + movu m3, [r2-8] + TRANSPOSE4x4W 5, 0, 2, 3, 6 + mova tcm, m3 + + movu m4, [r2+r1-8] + movu m1, [r2+r1*2-8] + movu m3, [r2+r3-8] + movu m7, [r2+r1*4-8] + TRANSPOSE4x4W 4, 1, 3, 7, 6 + + mova m6, tcm + punpcklqdq m6, m7 + punpckhqdq m5, m4 + SBUTTERFLY qdq, 0, 1, 7 + SBUTTERFLY qdq, 2, 3, 7 +%endif + + mova p2m, m6 + LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 + LOAD_TC m6, r4 + mova tcm, m6 + + LUMA_DEBLOCK_ONE m1, m0, ms1 + mova p1m, m5 + + mova m5, p2m + LUMA_DEBLOCK_ONE m2, m3, ms2 + mova p2m, m5 + + pxor m5, m5 + mova m6, tcm + pcmpgtw m5, tcm + psubw m6, ms1 + pandn m5, m7 + psubw m6, ms2 + pand m5, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 + mova m0, p1m + mova m3, p2m + TRANSPOSE4x4W 0, 1, 2, 3, 4 + LUMA_H_STORE r2, r3 + + add r4, mmsize/8 + lea r0, [r0+r1*(mmsize/2)] + lea r2, [r2+r1*(mmsize/2)] + dec r5 + jg .loop + ADD rsp, pad + RET +%endmacro + +INIT_XMM +%ifdef ARCH_X86_64 +; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 +; m12=alpha, m13=beta +; out: m0=p1', m3=q1', m1=p0', m2=q0' +; clobbers: m4, m5, m6, m7, m10, m11, m14 +%macro DEBLOCK_LUMA_INTER_SSE2 0 + LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 + LOAD_TC m6, r4 + DIFF_LT m8, m1, m13, m10, m4 + DIFF_LT m9, m2, m13, m11, m4 + pand m6, m7 + + mova m14, m6 + pxor m4, m4 + pcmpgtw m6, m4 + pand m6, m14 + + mova m5, m10 + pand m5, m6 + LUMA_Q1 m8, m0, m1, m2, m5, m4 + + mova m5, m11 + pand m5, m6 + LUMA_Q1 m9, m3, m1, m2, m5, m4 + + pxor m4, m4 + psubw m6, m10 + pcmpgtw m4, m14 + pandn m4, m7 + psubw m6, m11 + pand m4, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 + + SWAP 0, 8 + SWAP 3, 9 +%endmacro + +%macro DEBLOCK_LUMA_64 1 +cglobal deblock_v_luma_10_%1, 5,5,15 + %define p2 m8 + %define p1 m0 + %define p0 m1 + %define q0 m2 + %define q1 m3 + %define q2 m9 + %define mask0 m7 + %define mask1 m10 + %define mask2 m11 + shl r2d, 2 + shl r3d, 2 + LOAD_AB m12, m13, r2, r3 + mov r2, r0 + sub r0, r1 + sub r0, r1 + sub r0, r1 + mov r3, 2 +.loop: + mova p2, [r0] + mova p1, [r0+r1] + mova p0, [r0+r1*2] + mova q0, [r2] + mova q1, [r2+r1] + mova q2, [r2+r1*2] + DEBLOCK_LUMA_INTER_SSE2 + mova [r0+r1], p1 + mova [r0+r1*2], p0 + mova [r2], q0 + mova [r2+r1], q1 + add r0, mmsize + add r2, mmsize + add r4, 2 + dec r3 + jg .loop + REP_RET + +cglobal deblock_h_luma_10_%1, 5,7,15 + shl r2d, 2 + shl r3d, 2 + LOAD_AB m12, m13, r2, r3 + mov r2, r1 + add r2, r1 + add r2, r1 + mov r5, r0 + add r5, r2 + mov r6, 2 +.loop: + movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x + movu m0, [r0+r1-8] + movu m2, [r0+r1*2-8] + movu m9, [r5-8] + movu m5, [r5+r1-8] + movu m1, [r5+r1*2-8] + movu m3, [r5+r2-8] + movu m7, [r5+r1*4-8] + + TRANSPOSE4x4W 8, 0, 2, 9, 10 + TRANSPOSE4x4W 5, 1, 3, 7, 10 + + punpckhqdq m8, m5 + SBUTTERFLY qdq, 0, 1, 10 + SBUTTERFLY qdq, 2, 3, 10 + punpcklqdq m9, m7 + + DEBLOCK_LUMA_INTER_SSE2 + + TRANSPOSE4x4W 0, 1, 2, 3, 4 + LUMA_H_STORE r5, r2 + add r4, 2 + lea r0, [r0+r1*8] + lea r5, [r5+r1*8] + dec r6 + jg .loop + REP_RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA_64 sse2 +INIT_AVX +DEBLOCK_LUMA_64 avx +%endif + +%macro SWAPMOVA 2 +%ifid %1 + SWAP %1, %2 +%else + mova %1, %2 +%endif +%endmacro + +; in: t0-t2: tmp registers +; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 +; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' +%macro LUMA_INTRA_P012 12 ; p0..p3 in memory +%ifdef ARCH_X86_64 + paddw t0, %3, %2 + mova t2, %4 + paddw t2, %3 +%else + mova t0, %3 + mova t2, %4 + paddw t0, %2 + paddw t2, %3 +%endif + paddw t0, %1 + paddw t2, t2 + paddw t0, %5 + paddw t2, %9 + paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) + paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) + + psrlw t2, 3 + psrlw t1, t0, 2 + psubw t2, %3 + psubw t1, %2 + pand t2, %8 + pand t1, %8 + paddw t2, %3 + paddw t1, %2 + SWAPMOVA %11, t1 + + psubw t1, t0, %3 + paddw t0, t0 + psubw t1, %5 + psubw t0, %3 + paddw t1, %6 + paddw t1, %2 + paddw t0, %6 + psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 + psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 + + pxor t0, t1 + pxor t1, %1 + pand t0, %8 + pand t1, %7 + pxor t0, t1 + pxor t0, %1 + SWAPMOVA %10, t0 + SWAPMOVA %12, t2 +%endmacro + +%macro LUMA_INTRA_INIT 1 + %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) + %define t0 m4 + %define t1 m5 + %define t2 m6 + %define t3 m7 + %assign i 4 +%rep %1 + CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] + %assign i i+1 +%endrep + SUB rsp, pad +%endmacro + +; in: %1-%3=tmp, %4=p2, %5=q2 +%macro LUMA_INTRA_INTER 5 + LOAD_AB t0, t1, r2d, r3d + mova %1, t0 + LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 +%ifdef ARCH_X86_64 + mova %2, t0 ; mask0 + psrlw t3, %1, 2 +%else + mova t3, %1 + mova %2, t0 ; mask0 + psrlw t3, 2 +%endif + paddw t3, [pw_2] ; alpha/4+2 + DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 + pand t2, %2 + mova t3, %5 ; q2 + mova %1, t2 ; mask1 + DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta + pand t2, %1 + mova t3, %4 ; p2 + mova %3, t2 ; mask1q + DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta + pand t2, %1 + mova %1, t2 ; mask1p +%endmacro + +%macro LUMA_H_INTRA_LOAD 0 +%if mmsize == 8 + movu t0, [r0-8] + movu t1, [r0+r1-8] + movu m0, [r0+r1*2-8] + movu m1, [r0+r4-8] + TRANSPOSE4x4W 4, 5, 0, 1, 2 + mova t4, t0 ; p3 + mova t5, t1 ; p2 + + movu m2, [r0] + movu m3, [r0+r1] + movu t0, [r0+r1*2] + movu t1, [r0+r4] + TRANSPOSE4x4W 2, 3, 4, 5, 6 + mova t6, t0 ; q2 + mova t7, t1 ; q3 +%else + movu t0, [r0-8] + movu t1, [r0+r1-8] + movu m0, [r0+r1*2-8] + movu m1, [r0+r5-8] + movu m2, [r4-8] + movu m3, [r4+r1-8] + movu t2, [r4+r1*2-8] + movu t3, [r4+r5-8] + TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 + mova t4, t0 ; p3 + mova t5, t1 ; p2 + mova t6, t2 ; q2 + mova t7, t3 ; q3 +%endif +%endmacro + +; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp +%macro LUMA_H_INTRA_STORE 9 +%if mmsize == 8 + TRANSPOSE4x4W %1, %2, %3, %4, %9 + movq [r0-8], m%1 + movq [r0+r1-8], m%2 + movq [r0+r1*2-8], m%3 + movq [r0+r4-8], m%4 + movq m%1, %8 + TRANSPOSE4x4W %5, %6, %7, %1, %9 + movq [r0], m%5 + movq [r0+r1], m%6 + movq [r0+r1*2], m%7 + movq [r0+r4], m%1 +%else + TRANSPOSE2x4x4W %1, %2, %3, %4, %9 + movq [r0-8], m%1 + movq [r0+r1-8], m%2 + movq [r0+r1*2-8], m%3 + movq [r0+r5-8], m%4 + movhps [r4-8], m%1 + movhps [r4+r1-8], m%2 + movhps [r4+r1*2-8], m%3 + movhps [r4+r5-8], m%4 +%ifnum %8 + SWAP %1, %8 +%else + mova m%1, %8 +%endif + TRANSPOSE2x4x4W %5, %6, %7, %1, %9 + movq [r0], m%5 + movq [r0+r1], m%6 + movq [r0+r1*2], m%7 + movq [r0+r5], m%1 + movhps [r4], m%5 + movhps [r4+r1], m%6 + movhps [r4+r1*2], m%7 + movhps [r4+r5], m%1 +%endif +%endmacro + +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +%macro DEBLOCK_LUMA_INTRA_64 1 +cglobal deblock_v_luma_intra_10_%1, 4,7,16 + %define t0 m1 + %define t1 m2 + %define t2 m4 + %define p2 m8 + %define p1 m9 + %define p0 m10 + %define q0 m11 + %define q1 m12 + %define q2 m13 + %define aa m5 + %define bb m14 + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + neg r4 + add r4, r0 ; pix-4*stride + mov r6, 2 + mova m0, [pw_2] + shl r2d, 2 + shl r3d, 2 + LOAD_AB aa, bb, r2d, r3d +.loop + mova p2, [r4+r1] + mova p1, [r4+2*r1] + mova p0, [r4+r5] + mova q0, [r0] + mova q1, [r0+r1] + mova q2, [r0+2*r1] + + LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 + mova t2, aa + psrlw t2, 2 + paddw t2, m0 ; alpha/4+2 + DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 + DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta + DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta + pand m6, m3 + pand m7, m6 + pand m6, t1 + LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] + LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] + add r0, mmsize + add r4, mmsize + dec r6 + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_luma_intra_10_%1, 4,7,16 + %define t0 m15 + %define t1 m14 + %define t2 m2 + %define q3 m5 + %define q2 m8 + %define q1 m9 + %define q0 m10 + %define p0 m11 + %define p1 m12 + %define p2 m13 + %define p3 m4 + %define spill [rsp] + %assign pad 24-(stack_offset&15) + SUB rsp, pad + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + add r4, r0 ; pix+4*stride + mov r6, 2 + mova m0, [pw_2] + shl r2d, 2 + shl r3d, 2 +.loop + movu q3, [r0-8] + movu q2, [r0+r1-8] + movu q1, [r0+r1*2-8] + movu q0, [r0+r5-8] + movu p0, [r4-8] + movu p1, [r4+r1-8] + movu p2, [r4+r1*2-8] + movu p3, [r4+r5-8] + TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 + + LOAD_AB m1, m2, r2d, r3d + LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 + psrlw m1, 2 + paddw m1, m0 ; alpha/4+2 + DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 + DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta + DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta + pand m6, m3 + pand m7, m6 + pand m6, t1 + + mova spill, q3 + LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 + LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 + mova m7, spill + + LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 + + lea r0, [r0+r1*8] + lea r4, [r4+r1*8] + dec r6 + jg .loop + ADD rsp, pad + RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA_INTRA_64 sse2 +INIT_AVX +DEBLOCK_LUMA_INTRA_64 avx + +%endif + +%macro DEBLOCK_LUMA_INTRA 1 +;----------------------------------------------------------------------------- +; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16) + LUMA_INTRA_INIT 3 + lea r4, [r1*4] + lea r5, [r1*3] + neg r4 + add r4, r0 + mov r6, 32/mmsize + shl r2d, 2 + shl r3d, 2 +.loop: + mova m0, [r4+r1*2] ; p1 + mova m1, [r4+r5] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 + LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] + LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] + mova t3, [r0+r1*2] ; q2 + LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] + add r0, mmsize + add r4, mmsize + dec r6 + jg .loop + ADD rsp, pad + RET + +;----------------------------------------------------------------------------- +; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16) + LUMA_INTRA_INIT 8 +%if mmsize == 8 + lea r4, [r1*3] + mov r5, 32/mmsize +%else + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + add r4, r0 ; pix+4*stride + mov r6, 32/mmsize +%endif + shl r2d, 2 + shl r3d, 2 +.loop: + LUMA_H_INTRA_LOAD + LUMA_INTRA_INTER t8, t9, t10, t5, t6 + + LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 + mova t3, t6 ; q2 + LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 + + mova m2, t4 + mova m0, t11 + mova m1, t5 + mova m3, t8 + mova m6, t6 + + LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 + + lea r0, [r0+r1*(mmsize/2)] +%if mmsize == 8 + dec r5 +%else + lea r4, [r4+r1*(mmsize/2)] + dec r6 +%endif + jg .loop + ADD rsp, pad + RET +%endmacro + +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_LUMA mmxext +DEBLOCK_LUMA_INTRA mmxext +INIT_XMM +DEBLOCK_LUMA sse2 +DEBLOCK_LUMA_INTRA sse2 +INIT_AVX +DEBLOCK_LUMA avx +DEBLOCK_LUMA_INTRA avx +%endif + +; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp +; out: %1=p0', %2=q0' +%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 + mova %6, [pw_2] + paddw %6, %3 + paddw %6, %4 + paddw %7, %6, %2 + paddw %6, %1 + paddw %6, %3 + paddw %7, %4 + psraw %6, 2 + psraw %7, 2 + psubw %6, %1 + psubw %7, %2 + pand %6, %5 + pand %7, %5 + paddw %1, %6 + paddw %2, %7 +%endmacro + +%macro CHROMA_V_LOAD 1 + mova m0, [r0] ; p1 + mova m1, [r0+r1] ; p0 + mova m2, [%1] ; q0 + mova m3, [%1+r1] ; q1 +%endmacro + +%macro CHROMA_V_STORE 0 + mova [r0+1*r1], m1 + mova [r0+2*r1], m2 +%endmacro + +%macro DEBLOCK_CHROMA 1 +;----------------------------------------------------------------------------- +; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16) + mov r5, r0 + sub r0, r1 + sub r0, r1 + shl r2d, 2 + shl r3d, 2 +%if mmsize < 16 + mov r6, 16/mmsize +.loop: +%endif + CHROMA_V_LOAD r5 + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + pxor m4, m4 + LOAD_TC m6, r4 + psubw m6, [pw_3] + pmaxsw m6, m4 + pand m7, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 + CHROMA_V_STORE +%if mmsize < 16 + add r0, mmsize + add r5, mmsize + add r4, mmsize/8 + dec r6 + jg .loop + REP_RET +%else + RET +%endif + +;----------------------------------------------------------------------------- +; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16) + mov r4, r0 + sub r0, r1 + sub r0, r1 + shl r2d, 2 + shl r3d, 2 +%if mmsize < 16 + mov r5, 16/mmsize +.loop: +%endif + CHROMA_V_LOAD r4 + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 + CHROMA_V_STORE +%if mmsize < 16 + add r0, mmsize + add r4, mmsize + dec r5 + jg .loop + REP_RET +%else + RET +%endif +%endmacro + +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA mmxext +%endif +INIT_XMM +DEBLOCK_CHROMA sse2 +INIT_AVX +DEBLOCK_CHROMA avx diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 7657a85890..b331f94b5e 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -218,41 +218,57 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40] ); } -#define LF_FUNC(DIR, TYPE, OPT) \ -void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta, int8_t *tc0); -#define LF_IFUNC(DIR, TYPE, OPT) \ -void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta); - -LF_FUNC (h, chroma, mmxext) -LF_IFUNC(h, chroma_intra, mmxext) -LF_FUNC (v, chroma, mmxext) -LF_IFUNC(v, chroma_intra, mmxext) - -LF_FUNC (h, luma, mmxext) -LF_IFUNC(h, luma_intra, mmxext) -#if HAVE_YASM && ARCH_X86_32 -LF_FUNC (v8, luma, mmxext) -static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \ +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta, int8_t *tc0); +#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \ +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta); + +#define LF_FUNCS(type, depth)\ +LF_FUNC (h, chroma, depth, mmxext)\ +LF_IFUNC(h, chroma_intra, depth, mmxext)\ +LF_FUNC (v, chroma, depth, mmxext)\ +LF_IFUNC(v, chroma_intra, depth, mmxext)\ +LF_FUNC (h, luma, depth, mmxext)\ +LF_IFUNC(h, luma_intra, depth, mmxext)\ +LF_FUNC (h, luma, depth, sse2)\ +LF_IFUNC(h, luma_intra, depth, sse2)\ +LF_FUNC (v, luma, depth, sse2)\ +LF_IFUNC(v, luma_intra, depth, sse2)\ +LF_FUNC (h, chroma, depth, sse2)\ +LF_IFUNC(h, chroma_intra, depth, sse2)\ +LF_FUNC (v, chroma, depth, sse2)\ +LF_IFUNC(v, chroma_intra, depth, sse2)\ +LF_FUNC (h, luma, depth, avx)\ +LF_IFUNC(h, luma_intra, depth, avx)\ +LF_FUNC (v, luma, depth, avx)\ +LF_IFUNC(v, luma_intra, depth, avx)\ +LF_FUNC (h, chroma, depth, avx)\ +LF_IFUNC(h, chroma_intra, depth, avx)\ +LF_FUNC (v, chroma, depth, avx)\ +LF_IFUNC(v, chroma_intra, depth, avx) + +LF_FUNCS( uint8_t, 8) +LF_FUNCS(uint16_t, 10) + +LF_FUNC (v8, luma, 8, mmxext) +static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { if((tc0[0] & tc0[1]) >= 0) - ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); + ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0); if((tc0[2] & tc0[3]) >= 0) - ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); + ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2); } -LF_IFUNC(v8, luma_intra, mmxext) -static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) +LF_IFUNC(v8, luma_intra, 8, mmxext) +static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta) { - ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); - ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta); } -#endif -LF_FUNC (h, luma, sse2) -LF_IFUNC(h, luma_intra, sse2) -LF_FUNC (v, luma, sse2) -LF_IFUNC(v, luma_intra, sse2) +LF_FUNC (v, luma, 10, mmxext) +LF_IFUNC(v, luma_intra, 10, mmxext) /***********************************/ /* weighted prediction */ @@ -314,15 +330,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct_add8 = ff_h264_idct_add8_mmx2; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext; - c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext; - c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext; - c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext; + c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext; + c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext; #if ARCH_X86_32 - c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext; - c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; #endif c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; @@ -360,10 +376,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; #if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; - c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; #endif c->h264_idct_add16 = ff_h264_idct_add16_sse2; @@ -377,6 +393,49 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; } + if (mm_flags&AV_CPU_FLAG_AVX) { +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; +#endif + } + } + } +#endif + } else if (bit_depth == 10) { +#if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX2) { +#if ARCH_X86_32 + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; +#endif + if (mm_flags&AV_CPU_FLAG_SSE2) { + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; +#endif + } + if (mm_flags&AV_CPU_FLAG_AVX) { + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx; +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; +#endif + } } } #endif diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm index b28a6198f7..a96b065509 100644 --- a/libavcodec/x86/x86util.asm +++ b/libavcodec/x86/x86util.asm @@ -24,16 +24,20 @@ ;****************************************************************************** %macro SBUTTERFLY 4 +%if avx_enabled == 0 mova m%4, m%2 punpckl%1 m%2, m%3 punpckh%1 m%4, m%3 +%else + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 +%endif SWAP %3, %4 %endmacro %macro SBUTTERFLY2 4 - mova m%4, m%2 - punpckh%1 m%2, m%3 - punpckl%1 m%4, m%3 + punpckl%1 m%4, m%2, m%3 + punpckh%1 m%2, m%2, m%3 SWAP %2, %4, %3 %endmacro @@ -444,3 +448,17 @@ %macro PMINUB_MMXEXT 3 ; dst, src, ignored pminub %1, %2 %endmacro + +%macro SPLATW 2-3 0 +%if mmsize == 16 + pshuflw %1, %2, (%3)*0x55 + punpcklqdq %1, %1 +%else + pshufw %1, %2, (%3)*0x55 +%endif +%endmacro + +%macro CLIPW 3 ;(dst, min, max) + pmaxsw %1, %2 + pminsw %1, %3 +%endmacro |