exploit mdct symmetry

2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Loren Merritt <lorenm@u.washington.edu> 2008-07-13 15:03:58 +0000
committer: Loren Merritt <lorenm@u.washington.edu> 2008-07-13 15:03:58 +0000
commit: b9fa32082c71013e90eab9e9997967d2939cf4a6 (patch)
tree: 83edd135988c73a75b017fbd12396e156de5e0a4 /libavcodec/i386/fft_sse.c
parent: eb2cd99c73df74cba8ce0173f9ee2b70313adaa6 (diff)
download: ffmpeg-b9fa32082c71013e90eab9e9997967d2939cf4a6.tar.gz
1 files changed, 54 insertions, 4 deletions
diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/i386/fft_sse.c
index 83cbd87088..305f44a0ce 100644
--- a/libavcodec/i386/fft_sse.c
+++ b/libavcodec/i386/fft_sse.c
@@ -142,11 +142,10 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
     } while (nblocks != 0);
 }
 
-void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
-                       const FFTSample *input, FFTSample *tmp)
+static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
 {
     x86_reg k;
-    long n8, n4, n2, n;
+    long n4, n2, n;
     const uint16_t *revtab = s->fft.revtab;
     const FFTSample *tcos = s->tcos;
     const FFTSample *tsin = s->tsin;
@@ -156,7 +155,6 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
     n = 1 << s->nbits;
     n2 = n >> 1;
     n4 = n >> 2;
-    n8 = n >> 3;
 
 #ifdef ARCH_X86_64
     asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1));
@@ -260,6 +258,20 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
 #endif
         );
     }
+}
+
+void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
+                       const FFTSample *input, FFTSample *tmp)
+{
+    x86_reg k;
+    long n8, n2, n;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n8 = n >> 3;
+
+    imdct_sse(s, input, tmp);
 
     /*
        Mnemonics:
@@ -301,3 +313,41 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
     );
 }
 
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, /* s supplies nbits; output receives the stores made below */
+                       const FFTSample *input, FFTSample *tmp) /* input and tmp are handed to imdct_sse(); tmp is then read back through z */
+{ /* Runs imdct_sse(), then copies its result from tmp into output with an SSE shuffle/xor loop. */
+    x86_reg j, k; /* unscaled (byte) offsets used as index registers in the asm below */
+    long n8, n4, n;
+    FFTComplex *z = (FFTComplex *)tmp; /* same buffer as tmp, addressed in FFTComplex units */
+
+    n = 1 << s->nbits; /* n = 2^nbits */
+    n4 = n >> 2; /* n/4, element offset applied to output */
+    n8 = n >> 3; /* n/8, element offset applied to z */
+
+    imdct_sse(s, input, tmp); /* shared helper (body not fully visible here); the loop below consumes tmp via z */
+
+    j = -n; /* ascending offset: starts at -n, the loop ends when it reaches 0 */
+    k = n-16; /* descending offset: starts one 16-byte vector below +n */
+    asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1)); /* xmm7 = 16 bytes at m1m1m1m1 (defined outside this view; presumably a sign mask - confirm). NOTE(review): the next asm statement relies on xmm7 surviving; no clobber is declared. */
+    asm volatile(
+        "1: \n\t" /* loop head; lanes below are listed lowest first */
+        "movaps     (%3,%1), %%xmm0 \n\t" /* xmm0 = A = 4 floats at (z+n8) + k */
+        "movaps     (%3,%0), %%xmm1 \n\t" /* xmm1 = B = 4 floats at (z+n8) + j */
+        "xorps       %%xmm7, %%xmm0 \n\t" /* xmm0 = A' = A ^ xmm7 */
+        "movaps      %%xmm0, %%xmm2 \n\t" /* xmm2 = A' (second copy for the other output vector) */
+        "shufps $141,%%xmm1, %%xmm0 \n\t" /* xmm0 = {A'1, A'3, B0, B2} */
+        "shufps $216,%%xmm1, %%xmm2 \n\t" /* xmm2 = {A'0, A'2, B1, B3} */
+        "shufps $54, %%xmm0, %%xmm0 \n\t" /* xmm0 = {B0, A'3, B2, A'1} */
+        "shufps $156,%%xmm2, %%xmm2 \n\t" /* xmm2 = {A'0, B3, A'2, B1} */
+        "xorps       %%xmm7, %%xmm0 \n\t" /* xmm0 ^= xmm7 again, applied to all four lanes */
+        "movaps      %%xmm2, (%2,%1) \n\t" /* store xmm2 at (output+n4) + k */
+        "movaps      %%xmm0, (%2,%0) \n\t" /* store xmm0 at (output+n4) + j */
+        "sub $16, %1 \n\t" /* k -= 16: step down one vector */
+        "add $16, %0 \n\t" /* j += 16: step up one vector; this add sets the flags tested by jl */
+        "jl 1b \n\t" /* repeat while j is still negative */
+        :"+r"(j), "+r"(k) /* %0 = j, %1 = k, both read and written */
+        :"r"(output+n4), "r"(z+n8) /* %2 = store base, %3 = load base; j and k index either side of each base */
+        :"memory" /* the asm reads and writes memory the compiler cannot see through the operands */
+    );
+}
+
author	Loren Merritt <lorenm@u.washington.edu>	2008-07-13 15:03:58 +0000
committer	Loren Merritt <lorenm@u.washington.edu>	2008-07-13 15:03:58 +0000
commit	b9fa32082c71013e90eab9e9997967d2939cf4a6 (patch)
tree	83edd135988c73a75b017fbd12396e156de5e0a4 /libavcodec/i386/fft_sse.c
parent	eb2cd99c73df74cba8ce0173f9ee2b70313adaa6 (diff)
download	ffmpeg-b9fa32082c71013e90eab9e9997967d2939cf4a6.tar.gz