aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJustin Ruggles <justin.ruggles@gmail.com>2012-05-03 15:23:32 -0400
committerJustin Ruggles <justin.ruggles@gmail.com>2012-05-09 16:17:59 -0400
commit5cc6d5244d4ec89b3ac855abff4a3d19caee22f1 (patch)
tree7c3643f535a67203b60bd001f1382b9b1282d3af
parent0b45334a5880d6e2a4b3642adcd5feab8a27a150 (diff)
downloadffmpeg-5cc6d5244d4ec89b3ac855abff4a3d19caee22f1.tar.gz
lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX
The current SSE version is slower than the MMX version on Athlon64 and Sandy Bridge, but the SSE4 and AVX versions are faster on Sandy Bridge.
-rw-r--r--libavresample/x86/audio_convert.asm30
-rw-r--r--libavresample/x86/audio_convert_init.c13
-rw-r--r--libavutil/x86/x86util.asm7
3 files changed, 28 insertions, 22 deletions
diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm
index 809c5d1378..ba59f3314f 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
mova m3, [srcq+src3q]
mova m4, [srcq+src4q]
mova m5, [srcq+src5q]
-%if cpuflag(sse)
+%if cpuflag(sse4)
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
- movaps m6, m4
- shufps m4, m0, q3210
+ blendps m6, m4, m0, 1100b
movlhps m0, m2
- movhlps m6, m2
- movaps [dstq ], m0
- movaps [dstq+16], m4
- movaps [dstq+32], m6
-
- movaps m6, m5
- shufps m5, m1, q3210
+ movhlps m4, m2
+ blendps m2, m5, m1, 1100b
movlhps m1, m3
- movhlps m6, m3
+ movhlps m5, m3
+
+ movaps [dstq ], m0
+ movaps [dstq+16], m6
+ movaps [dstq+32], m4
movaps [dstq+48], m1
- movaps [dstq+64], m5
- movaps [dstq+80], m6
+ movaps [dstq+64], m2
+ movaps [dstq+80], m5
%else ; mmx
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
@@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
-INIT_XMM sse
+INIT_XMM sse4
+CONV_FLTP_TO_FLT_6CH
+%if HAVE_AVX
+INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
+%endif
diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c
index 6883f10a21..206aede751 100644
--- a/libavresample/x86/audio_convert_init.c
+++ b/libavresample/x86/audio_convert_init.c
@@ -22,8 +22,9 @@
#include "libavutil/cpu.h"
#include "libavresample/audio_convert.h"
-extern void ff_conv_fltp_to_flt_6ch_mmx(float *dst, float *const *src, int len);
-extern void ff_conv_fltp_to_flt_6ch_sse(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
{
@@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
}
- if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+ if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
- 6, 16, 4, "SSE", ff_conv_fltp_to_flt_6ch_sse);
+ 6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
+ }
+ if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
+ ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
+ 6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
}
#endif
}
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 55f4a936e2..508f24e2b5 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -42,10 +42,9 @@
%endmacro
%macro SBUTTERFLYPS 3
- movaps m%3, m%1
- unpcklps m%1, m%2
- unpckhps m%3, m%2
- SWAP %2, %3
+ unpcklps m%3, m%1, m%2
+ unpckhps m%1, m%1, m%2
+ SWAP %1, %3, %2
%endmacro
%macro TRANSPOSE4x4B 5