diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2008-07-13 15:03:58 +0000 |
---|---|---|
committer | Loren Merritt <lorenm@u.washington.edu> | 2008-07-13 15:03:58 +0000 |
commit | b9fa32082c71013e90eab9e9997967d2939cf4a6 (patch) | |
tree | 83edd135988c73a75b017fbd12396e156de5e0a4 /libavcodec/i386/fft_sse.c | |
parent | eb2cd99c73df74cba8ce0173f9ee2b70313adaa6 (diff) | |
download | ffmpeg-b9fa32082c71013e90eab9e9997967d2939cf4a6.tar.gz |
exploit mdct symmetry
2% faster vorbis on conroe, k8. 7% on celeron.
Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386/fft_sse.c')
-rw-r--r-- | libavcodec/i386/fft_sse.c | 58 |
1 file changed, 54 insertions, 4 deletions
diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/i386/fft_sse.c index 83cbd87088..305f44a0ce 100644 --- a/libavcodec/i386/fft_sse.c +++ b/libavcodec/i386/fft_sse.c @@ -142,11 +142,10 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) } while (nblocks != 0); } -void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp) +static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp) { x86_reg k; - long n8, n4, n2, n; + long n4, n2, n; const uint16_t *revtab = s->fft.revtab; const FFTSample *tcos = s->tcos; const FFTSample *tsin = s->tsin; @@ -156,7 +155,6 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, n = 1 << s->nbits; n2 = n >> 1; n4 = n >> 2; - n8 = n >> 3; #ifdef ARCH_X86_64 asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1)); @@ -260,6 +258,20 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, #endif ); } +} + +void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + x86_reg k; + long n8, n2, n; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n8 = n >> 3; + + imdct_sse(s, input, tmp); /* Mnemonics: @@ -301,3 +313,41 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, ); } +void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + x86_reg j, k; + long n8, n4, n; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n4 = n >> 2; + n8 = n >> 3; + + imdct_sse(s, input, tmp); + + j = -n; + k = n-16; + asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1)); + asm volatile( + "1: \n\t" + "movaps (%3,%1), %%xmm0 \n\t" + "movaps (%3,%0), %%xmm1 \n\t" + "xorps %%xmm7, %%xmm0 \n\t" + "movaps %%xmm0, %%xmm2 \n\t" + "shufps $141,%%xmm1, %%xmm0 \n\t" + "shufps $216,%%xmm1, %%xmm2 \n\t" + "shufps $54, %%xmm0, %%xmm0 \n\t" + "shufps $156,%%xmm2, %%xmm2 \n\t" + "xorps %%xmm7, %%xmm0 \n\t" + "movaps %%xmm2, (%2,%1) \n\t" + "movaps %%xmm0, (%2,%0) \n\t" + 
"sub $16, %1 \n\t" + "add $16, %0 \n\t" + "jl 1b \n\t" + :"+r"(j), "+r"(k) + :"r"(output+n4), "r"(z+n8) + :"memory" + ); +} + |