about | summary | refs | log | tree | commit | diff | stats
path: root/libavresample/x86
diff options
context:
space:
mode:
author: Justin Ruggles <justin.ruggles@gmail.com> 2012-05-03 15:23:32 -0400
committer: Justin Ruggles <justin.ruggles@gmail.com> 2012-05-09 16:17:59 -0400
commit: 5cc6d5244d4ec89b3ac855abff4a3d19caee22f1 (patch)
tree: 7c3643f535a67203b60bd001f1382b9b1282d3af /libavresample/x86
parent: 0b45334a5880d6e2a4b3642adcd5feab8a27a150 (diff)
download: ffmpeg-5cc6d5244d4ec89b3ac855abff4a3d19caee22f1.tar.gz
lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX
The current SSE version is slower than the MMX version on Athlon64 and Sandy Bridge, but the SSE4 and AVX versions are faster on Sandy Bridge.
Diffstat (limited to 'libavresample/x86')
-rw-r--r-- libavresample/x86/audio_convert.asm 30
-rw-r--r-- libavresample/x86/audio_convert_init.c 13
2 files changed, 25 insertions, 18 deletions
diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm
index 809c5d1378..ba59f3314f 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
mova m3, [srcq+src3q]
mova m4, [srcq+src4q]
mova m5, [srcq+src5q]
-%if cpuflag(sse)
+%if cpuflag(sse4)
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
- movaps m6, m4
- shufps m4, m0, q3210
+ blendps m6, m4, m0, 1100b
movlhps m0, m2
- movhlps m6, m2
- movaps [dstq ], m0
- movaps [dstq+16], m4
- movaps [dstq+32], m6
-
- movaps m6, m5
- shufps m5, m1, q3210
+ movhlps m4, m2
+ blendps m2, m5, m1, 1100b
movlhps m1, m3
- movhlps m6, m3
+ movhlps m5, m3
+
+ movaps [dstq ], m0
+ movaps [dstq+16], m6
+ movaps [dstq+32], m4
movaps [dstq+48], m1
- movaps [dstq+64], m5
- movaps [dstq+80], m6
+ movaps [dstq+64], m2
+ movaps [dstq+80], m5
%else ; mmx
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
@@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
-INIT_XMM sse
+INIT_XMM sse4
+CONV_FLTP_TO_FLT_6CH
+%if HAVE_AVX
+INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
+%endif
diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c
index 6883f10a21..206aede751 100644
--- a/libavresample/x86/audio_convert_init.c
+++ b/libavresample/x86/audio_convert_init.c
@@ -22,8 +22,9 @@
#include "libavutil/cpu.h"
#include "libavresample/audio_convert.h"
-extern void ff_conv_fltp_to_flt_6ch_mmx(float *dst, float *const *src, int len);
-extern void ff_conv_fltp_to_flt_6ch_sse(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
{
@@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
}
- if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+ if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
- 6, 16, 4, "SSE", ff_conv_fltp_to_flt_6ch_sse);
+ 6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
+ }
+ if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
+ ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
+ 6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
}
#endif
}