diff options
author | Justin Ruggles <justin.ruggles@gmail.com> | 2011-02-13 14:49:50 -0500 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2011-02-13 16:49:39 -0500 |
commit | fbb6b49dabc3398440c6dfa838aa090a7a6ebc0d (patch) | |
tree | 050f0baf5915823f816682340bde83417541854c | |
parent | 1a973feb45826a1998b4286ecfe1fa7a602b8780 (diff) | |
download | ffmpeg-fbb6b49dabc3398440c6dfa838aa090a7a6ebc0d.tar.gz |
ac3enc: Add x86-optimized function to speed up log2_tab().
AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute
value of each element in an array of int16_t.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
-rw-r--r-- | libavcodec/ac3dsp.c | 9 |
-rw-r--r-- | libavcodec/ac3dsp.h | 11 |
-rw-r--r-- | libavcodec/ac3enc_fixed.c | 11 |
-rw-r--r-- | libavcodec/x86/ac3dsp.asm | 69 |
-rw-r--r-- | libavcodec/x86/ac3dsp_mmx.c | 11 |
5 files changed, 103 insertions, 8 deletions
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index f688e6a72b..da3a123e9b 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -42,9 +42,18 @@ static void ac3_exponent_min_c(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
     }
 }
 
+static int ac3_max_msb_abs_int16_c(const int16_t *src, int len)
+{
+    int i, v = 0;
+    for (i = 0; i < len; i++)
+        v |= abs(src[i]);
+    return v;
+}
+
 av_cold void ff_ac3dsp_init(AC3DSPContext *c)
 {
     c->ac3_exponent_min = ac3_exponent_min_c;
+    c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
 
     if (HAVE_MMX)
         ff_ac3dsp_init_x86(c);
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 7f13b11f3b..a4f141f2f5 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -35,6 +35,17 @@ typedef struct AC3DSPContext {
      * @param nb_coefs number of frequency coefficients.
      */
     void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+
+    /**
+     * Calculate the maximum MSB of the absolute value of each element in an
+     * array of int16_t.
+     * @param src input array
+     *            constraints: align 16. values must be in range [-32767,32767]
+     * @param len number of values in the array
+     *            constraints: multiple of 16 greater than 0
+     * @return a value with the same MSB as max(abs(src[]))
+     */
+    int (*ac3_max_msb_abs_int16)(const int16_t *src, int len);
 } AC3DSPContext;
 
 void ff_ac3dsp_init (AC3DSPContext *c);
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index 0db41dff2d..3de00ee484 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -270,14 +270,9 @@ static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
  * @param n number of values in the array
  * @return log2(max(abs(tab[])))
  */
-static int log2_tab(int16_t *tab, int n)
+static int log2_tab(AC3EncodeContext *s, int16_t *src, int len)
 {
-    int i, v;
-
-    v = 0;
-    for (i = 0; i < n; i++)
-        v |= abs(tab[i]);
-
+    int v = s->ac3dsp.ac3_max_msb_abs_int16(src, len);
     return av_log2(v);
 }
 
@@ -308,7 +303,7 @@ static void lshift_tab(int16_t *tab, int n, unsigned int lshift)
  */
 static int normalize_samples(AC3EncodeContext *s)
 {
-    int v = 14 - log2_tab(s->windowed_samples, AC3_WINDOW_SIZE);
+    int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);
     lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
     return v - 9;
 }
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index e71c51cf33..dc71ccf91c 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -65,3 +65,72 @@ AC3_EXPONENT_MIN sse2
 %endif
 %undef PMINUB
 %undef LOOP_ALIGN
+
+;-----------------------------------------------------------------------------
+; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
+;
+; This function uses 2 different methods to calculate a valid result.
+; 1) logical 'or' of abs of each element
+;        This is used for ssse3 because of the pabsw instruction.
+;        It is also used for mmx because of the lack of min/max instructions.
+; 2) calculate min/max for the array, then or(abs(min),abs(max))
+;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
+;-----------------------------------------------------------------------------
+
+%macro AC3_MAX_MSB_ABS_INT16 2
+cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
+    pxor        m2, m2
+    pxor        m3, m3
+.loop:
+%ifidn %2, min_max
+    mova        m0, [srcq]
+    mova        m1, [srcq+mmsize]
+    pminsw      m2, m0
+    pminsw      m2, m1
+    pmaxsw      m3, m0
+    pmaxsw      m3, m1
+%else ; or_abs
+%ifidn %1, mmx
+    mova        m0, [srcq]
+    mova        m1, [srcq+mmsize]
+    ABS2        m0, m1, m3, m4
+%else ; ssse3
+    ; using memory args is faster for ssse3
+    pabsw       m0, [srcq]
+    pabsw       m1, [srcq+mmsize]
+%endif
+    por         m2, m0
+    por         m2, m1
+%endif
+    add         srcq, mmsize*2
+    sub         lend, mmsize
+    ja .loop
+%ifidn %2, min_max
+    ABS2        m2, m3, m0, m1
+    por         m2, m3
+%endif
+%ifidn mmsize, 16
+    mova        m0, m2
+    punpckhqdq  m0, m0
+    por         m2, m0
+%endif
+    PSHUFLW     m0, m2, 0xe
+    por         m2, m0
+    PSHUFLW     m0, m2, 0x1
+    por         m2, m0
+    movd        eax, m2
+    and         eax, 0xFFFF
+    RET
+%endmacro
+
+INIT_MMX
+%define ABS2 ABS2_MMX
+%define PSHUFLW pshufw
+AC3_MAX_MSB_ABS_INT16 mmx, or_abs
+%define ABS2 ABS2_MMX2
+AC3_MAX_MSB_ABS_INT16 mmxext, min_max
+INIT_XMM
+%define PSHUFLW pshuflw
+AC3_MAX_MSB_ABS_INT16 sse2, min_max
+%define ABS2 ABS2_SSSE3
+AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 7ce3aa358d..d8af59ce8d 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -27,6 +27,11 @@ extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int n
 extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 extern void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 
+extern int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);
+
 av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
 {
     int mm_flags = av_get_cpu_flags();
@@ -34,12 +39,18 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
 #if HAVE_YASM
     if (mm_flags & AV_CPU_FLAG_MMX) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
     }
     if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
     }
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
+    }
+    if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
     }
 #endif
 }