dct32: Add SSE2 ASM optimizations

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
author: Vitor Sessak <vitor1001@gmail.com> 2011-07-30 18:39:25 +0200
committer: Ronald S. Bultje <rsbultje@gmail.com> 2011-08-02 10:17:29 -0700
commit: 18b131de0473a3110c63966cd7c6cd2ab118d401 (patch)
tree: 822a1fd96cdd780477b3a22622b72a9245735e97
parent: 6f7fe4723b9bfbb52341568906e6168966f486b3 (diff)
download: ffmpeg-18b131de0473a3110c63966cd7c6cd2ab118d401.tar.gz
3 files changed, 33 insertions, 9 deletions
diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index 46daa43d8c..720a061078 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -63,6 +63,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
     mulps  %1, %3
 %endmacro
 
+%macro BUTTERFLY0_SSE2 5
+    pshufd %4, %1, %5
+    xorps  %1, %2
+    addps  %1, %4
+    mulps  %1, %3
+%endmacro
+
 %macro BUTTERFLY0_AVX 5
     vshufps %4, %1, %1, %5
     vxorps  %1, %1, %2
@@ -405,18 +412,17 @@ INIT_XMM
 
 
 INIT_XMM
+%macro DCT32_FUNC 1
 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
-cglobal dct32_float_sse, 2,3,16, out, in, tmp
+cglobal dct32_float_%1, 2,3,16, out, in, tmp
     ; pass 1
 
     movaps      m0, [inq+0]
-    movaps      m1, [inq+112]
-    shufps      m1, m1, 0x1b
+    LOAD_INV    m1, [inq+112]
     BUTTERFLY   m0, m1, [ps_cos_vec], m3
 
     movaps      m7, [inq+64]
-    movaps      m4, [inq+48]
-    shufps      m4, m4, 0x1b
+    LOAD_INV    m4, [inq+48]
     BUTTERFLY   m7, m4, [ps_cos_vec+32], m3
 
     ; pass 2
@@ -427,13 +433,11 @@ cglobal dct32_float_sse, 2,3,16, out, in, tmp
 
     ; pass 1
     movaps      m1, [inq+16]
-    movaps      m6, [inq+96]
-    shufps      m6, m6, 0x1b
+    LOAD_INV    m6, [inq+96]
     BUTTERFLY   m1, m6, [ps_cos_vec+16], m3
 
     movaps      m4, [inq+80]
-    movaps      m5, [inq+32]
-    shufps      m5, m5, 0x1b
+    LOAD_INV    m5, [inq+32]
     BUTTERFLY   m4, m5, [ps_cos_vec+48], m3
 
     ; pass 2
@@ -492,3 +496,20 @@ cglobal dct32_float_sse, 2,3,16, out, in, tmp
     PASS5
     PASS6
     RET
+%endmacro
+
+%macro LOAD_INV_SSE 2
+    movaps      %1, %2
+    shufps      %1, %1, 0x1b
+%endmacro
+
+%define LOAD_INV LOAD_INV_SSE
+DCT32_FUNC sse
+
+%macro LOAD_INV_SSE2 2
+    pshufd      %1, %2, 0x1b
+%endmacro
+
+%define LOAD_INV LOAD_INV_SSE2
+%define BUTTERFLY0 BUTTERFLY0_SSE2
+DCT32_FUNC sse2
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 899f0f7ad5..f7308cca32 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -60,6 +60,8 @@ av_cold void ff_dct_init_mmx(DCTContext *s)
     int has_vectors = av_get_cpu_flags();
     if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
         s->dct32 = ff_dct32_float_avx;
+    else if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE)
+        s->dct32 = ff_dct32_float_sse2;
     else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
         s->dct32 = ff_dct32_float_sse;
 #endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 0ade2b2e7b..9d68d5b219 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -35,6 +35,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
 void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
 
 #endif /* AVCODEC_X86_FFT_H */
author	Vitor Sessak <vitor1001@gmail.com>	2011-07-30 18:39:25 +0200
committer	Ronald S. Bultje <rsbultje@gmail.com>	2011-08-02 10:17:29 -0700
commit	18b131de0473a3110c63966cd7c6cd2ab118d401 (patch)
tree	822a1fd96cdd780477b3a22622b72a9245735e97
parent	6f7fe4723b9bfbb52341568906e6168966f486b3 (diff)
download	ffmpeg-18b131de0473a3110c63966cd7c6cd2ab118d401.tar.gz