aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/i386/fft_3dn2.c
diff options
context:
space:
mode:
authorLoren Merritt <lorenm@u.washington.edu>2008-07-13 15:03:58 +0000
committerLoren Merritt <lorenm@u.washington.edu>2008-07-13 15:03:58 +0000
commitb9fa32082c71013e90eab9e9997967d2939cf4a6 (patch)
tree83edd135988c73a75b017fbd12396e156de5e0a4 /libavcodec/i386/fft_3dn2.c
parenteb2cd99c73df74cba8ce0173f9ee2b70313adaa6 (diff)
downloadffmpeg-b9fa32082c71013e90eab9e9997967d2939cf4a6.tar.gz
exploit mdct symmetry
2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386/fft_3dn2.c')
-rw-r--r--libavcodec/i386/fft_3dn2.c57
1 files changed, 53 insertions, 4 deletions
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c
index 32c4be369b..9068dff24b 100644
--- a/libavcodec/i386/fft_3dn2.c
+++ b/libavcodec/i386/fft_3dn2.c
@@ -124,10 +124,9 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
asm volatile("femms");
}
-void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
- const FFTSample *input, FFTSample *tmp)
+static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
{
- long n8, n4, n2, n;
+ long n4, n2, n;
x86_reg k;
const uint16_t *revtab = s->fft.revtab;
const FFTSample *tcos = s->tcos;
@@ -138,7 +137,6 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
n = 1 << s->nbits;
n2 = n >> 1;
n4 = n >> 2;
- n8 = n >> 3;
/* pre rotation */
in1 = input;
@@ -182,6 +180,20 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
:"m"(tcos[k]), "m"(tsin[k])
);
}
+}
+
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+ const FFTSample *input, FFTSample *tmp)
+{
+ x86_reg k;
+ long n8, n2, n;
+ FFTComplex *z = (FFTComplex *)tmp;
+
+ n = 1 << s->nbits;
+ n2 = n >> 1;
+ n8 = n >> 3;
+
+ imdct_3dn2(s, input, tmp);
k = n-8;
asm volatile("movd %0, %%mm7" ::"r"(1<<31));
@@ -212,3 +224,40 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
asm volatile("femms");
}
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
+ const FFTSample *input, FFTSample *tmp)
+{
+ x86_reg j, k;
+ long n8, n4, n;
+ FFTComplex *z = (FFTComplex *)tmp;
+
+ n = 1 << s->nbits;
+ n4 = n >> 2;
+ n8 = n >> 3;
+
+ imdct_3dn2(s, input, tmp);
+
+ j = -n;
+ k = n-8;
+ asm volatile("movd %0, %%mm7" ::"r"(1<<31));
+ asm volatile(
+ "1: \n\t"
+ "movq (%3,%1), %%mm0 \n\t" // z[n8+k]
+ "pswapd (%3,%0), %%mm1 \n\t" // z[n8-1-k]
+ "movq %%mm0, %%mm2 \n\t"
+ "punpckldq %%mm1, %%mm0 \n\t"
+ "punpckhdq %%mm2, %%mm1 \n\t"
+ "pxor %%mm7, %%mm0 \n\t"
+ "pxor %%mm7, %%mm1 \n\t"
+ "movq %%mm0, (%2,%1) \n\t" // output[n4+2*k] = { -z[n8+k].re, z[n8-1-k].im }
+ "movq %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im }
+ "sub $8, %1 \n\t"
+ "add $8, %0 \n\t"
+ "jl 1b \n\t"
+ :"+r"(j), "+r"(k)
+ :"r"(output+n4), "r"(z+n8)
+ :"memory"
+ );
+ asm volatile("femms");
+}
+