diff options
author | Justin Ruggles <justin.ruggles@gmail.com> | 2012-05-03 15:23:32 -0400 |
---|---|---|
committer | Justin Ruggles <justin.ruggles@gmail.com> | 2012-05-09 16:17:59 -0400 |
commit | 5cc6d5244d4ec89b3ac855abff4a3d19caee22f1 (patch) | |
tree | 7c3643f535a67203b60bd001f1382b9b1282d3af /libavresample/x86 | |
parent | 0b45334a5880d6e2a4b3642adcd5feab8a27a150 (diff) | |
download | ffmpeg-5cc6d5244d4ec89b3ac855abff4a3d19caee22f1.tar.gz |
lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX
The current SSE version is slower than the MMX version on Athlon64 and Sandy
Bridge, but the SSE4 and AVX versions are faster on Sandy Bridge.
Diffstat (limited to 'libavresample/x86')
-rw-r--r-- | libavresample/x86/audio_convert.asm | 30 | ||||
-rw-r--r-- | libavresample/x86/audio_convert_init.c | 13 |
2 files changed, 25 insertions, 18 deletions
```diff
diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm
index 809c5d1378..ba59f3314f 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
     mova m3, [srcq+src3q]
     mova m4, [srcq+src4q]
     mova m5, [srcq+src5q]
-%if cpuflag(sse)
+%if cpuflag(sse4)
     SBUTTERFLYPS 0, 1, 6
     SBUTTERFLYPS 2, 3, 6
     SBUTTERFLYPS 4, 5, 6
-    movaps m6, m4
-    shufps m4, m0, q3210
+    blendps m6, m4, m0, 1100b
     movlhps m0, m2
-    movhlps m6, m2
-    movaps [dstq ], m0
-    movaps [dstq+16], m4
-    movaps [dstq+32], m6
-
-    movaps m6, m5
-    shufps m5, m1, q3210
+    movhlps m4, m2
+    blendps m2, m5, m1, 1100b
     movlhps m1, m3
-    movhlps m6, m3
+    movhlps m5, m3
+
+    movaps [dstq ], m0
+    movaps [dstq+16], m6
+    movaps [dstq+32], m4
     movaps [dstq+48], m1
-    movaps [dstq+64], m5
-    movaps [dstq+80], m6
+    movaps [dstq+64], m2
+    movaps [dstq+80], m5
 %else ; mmx
     SBUTTERFLY dq, 0, 1, 6
     SBUTTERFLY dq, 2, 3, 6
@@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
 INIT_MMX mmx
 CONV_FLTP_TO_FLT_6CH
-INIT_XMM sse
+INIT_XMM sse4
+CONV_FLTP_TO_FLT_6CH
+%if HAVE_AVX
+INIT_XMM avx
 CONV_FLTP_TO_FLT_6CH
+%endif
diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c
index 6883f10a21..206aede751 100644
--- a/libavresample/x86/audio_convert_init.c
+++ b/libavresample/x86/audio_convert_init.c
@@ -22,8 +22,9 @@
 #include "libavutil/cpu.h"
 #include "libavresample/audio_convert.h"
-extern void ff_conv_fltp_to_flt_6ch_mmx(float *dst, float *const *src, int len);
-extern void ff_conv_fltp_to_flt_6ch_sse(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
 av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
 {
@@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
                                   6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
     }
-    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
-                                  6, 16, 4, "SSE", ff_conv_fltp_to_flt_6ch_sse);
+                                  6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
+    }
+    if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
+        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
+                                  6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
     }
 #endif
 }
```