author | Loren Merritt <lorenm@u.washington.edu> | 2006-08-08 04:01:04 +0000
---|---|---
committer | Loren Merritt <lorenm@u.washington.edu> | 2006-08-08 04:01:04 +0000
commit | bcfa3e58ee0ec7f8a739867ea66f9acb834e498a (patch) | |
tree | a7aec9632c7157f03266a46861d4a898ee2340f5 /libavcodec/i386 | |
parent | 2c5ad5fd74a44145459e74acdf486c084f8de4b4 (diff) | |
download | ffmpeg-bcfa3e58ee0ec7f8a739867ea66f9acb834e498a.tar.gz | |
3dnow2 implementation of imdct.
6% faster vorbis and wma.
Originally committed as revision 5954 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386')
-rw-r--r-- | libavcodec/i386/fft_3dn2.c | 83 |
1 file changed, 82 insertions, 1 deletion
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c
index aa8f0aee2e..40ec9d8eb1 100644
--- a/libavcodec/i386/fft_3dn2.c
+++ b/libavcodec/i386/fft_3dn2.c
@@ -1,6 +1,6 @@
 /*
  * FFT/MDCT transform with Extended 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie.
+ * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
  * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
  *
  * This library is free software; you can redistribute it and/or
@@ -134,3 +134,84 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
 }
 
 #endif
+
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    int k, n8, n4, n2, n;
+    const uint16_t *revtab = s->fft.revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation */
+    in1 = input;
+    in2 = input + n2 - 1;
+    for(k = 0; k < n4; k++) {
+        asm volatile(
+            "movd       %1, %%mm0 \n\t"
+            "movd       %3, %%mm1 \n\t"
+            "punpckldq  %2, %%mm0 \n\t"
+            "punpckldq  %4, %%mm1 \n\t"
+            "movq    %%mm0, %%mm2 \n\t"
+            "pfmul   %%mm1, %%mm0 \n\t"
+            "pswapd  %%mm1, %%mm1 \n\t"
+            "pfmul   %%mm1, %%mm2 \n\t"
+            "pfpnacc %%mm2, %%mm0 \n\t"
+            "movq    %%mm0, %0    \n\t"
+            :"=m"(z[revtab[k]])
+            :"m"(in2[-2*k]), "m"(in1[2*k]),
+             "m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    ff_fft_calc(&s->fft, z);
+
+    /* post rotation + reordering */
+    for(k = 0; k < n4; k++) {
+        asm volatile(
+            "movq       %0, %%mm0 \n\t"
+            "movd       %1, %%mm1 \n\t"
+            "punpckldq  %2, %%mm1 \n\t"
+            "movq    %%mm0, %%mm2 \n\t"
+            "pfmul   %%mm1, %%mm0 \n\t"
+            "pswapd  %%mm1, %%mm1 \n\t"
+            "pfmul   %%mm1, %%mm2 \n\t"
+            "pfpnacc %%mm2, %%mm0 \n\t"
+            "movq    %%mm0, %0    \n\t"
+            :"+m"(z[k])
+            :"m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    asm volatile("movd %0, %%mm7" ::"r"(1<<31));
+    for(k = 0; k < n8; k++) {
+        asm volatile(
+            "movq         %4, %%mm0 \n\t"
+            "pswapd       %5, %%mm1 \n\t"
+            "movq      %%mm0, %%mm2 \n\t"
+            "pxor      %%mm7, %%mm2 \n\t"
+            "punpckldq %%mm1, %%mm2 \n\t"
+            "pswapd    %%mm2, %%mm3 \n\t"
+            "punpckhdq %%mm1, %%mm0 \n\t"
+            "pswapd    %%mm0, %%mm4 \n\t"
+            "pxor      %%mm7, %%mm0 \n\t"
+            "pxor      %%mm7, %%mm4 \n\t"
+            "movq      %%mm0, %0    \n\t" // { -z[n8+k].im, z[n8-1-k].re }
+            "movq      %%mm4, %1    \n\t" // { -z[n8-1-k].re, z[n8+k].im }
+            "movq      %%mm2, %2    \n\t" // { -z[n8+k].re, z[n8-1-k].im }
+            "movq      %%mm3, %3    \n\t" // { z[n8-1-k].im, -z[n8+k].re }
+            :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
+             "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
+            :"m"(z[n8+k]), "m"(z[n8-1-k])
+            :"memory"
+        );
+    }
+    asm volatile("emms");
+}
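
For readers who do not speak 3DNow!: each pfmul/pswapd/pfpnacc block in the added function performs one complex multiply, and the final loop only shuffles and sign-flips the FFT results into the IMDCT output order. Below is a scalar C sketch of that same data flow, derived from the asm above purely as an illustration; it is not code from this commit, and `imdct_scalar_sketch`, its parameter list, and the `fft` callback are made-up stand-ins.

```c
#include <stdint.h>

/* Illustrative types (assumptions): FFTSample is float in the build this asm
 * targets, and FFTComplex mirrors libavcodec's { re, im } pair. */
typedef float FFTSample;
typedef struct { FFTSample re, im; } FFTComplex;

/* (a_re + i*a_im) * (b_re + i*b_im) -> (p_re, p_im) */
#define CMUL(p_re, p_im, a_re, a_im, b_re, b_im) do { \
    (p_re) = (a_re) * (b_re) - (a_im) * (b_im);       \
    (p_im) = (a_re) * (b_im) + (a_im) * (b_re);       \
} while (0)

/* Hypothetical scalar walk-through of the three loops in ff_imdct_calc_3dn2. */
static void imdct_scalar_sketch(int nbits, const uint16_t *revtab,
                                const FFTSample *tcos, const FFTSample *tsin,
                                const FFTSample *input, FFTSample *output,
                                FFTComplex *z, void (*fft)(FFTComplex *))
{
    int k;
    int n  = 1 << nbits;
    int n2 = n >> 1, n4 = n >> 2, n8 = n >> 3;
    const FFTSample *in1 = input, *in2 = input + n2 - 1;

    /* pre rotation: read the input from both ends, apply the twiddle factors,
     * and store in bit-reversed order so the FFT can run in place */
    for (k = 0; k < n4; k++)
        CMUL(z[revtab[k]].re, z[revtab[k]].im,
             in2[-2*k], in1[2*k], tcos[k], tsin[k]);

    fft(z);    /* stands in for ff_fft_calc(&s->fft, z) */

    /* post rotation: multiply each FFT bin by its twiddle factor */
    for (k = 0; k < n4; k++) {
        FFTSample re = z[k].re, im = z[k].im;
        CMUL(z[k].re, z[k].im, re, im, tcos[k], tsin[k]);
    }

    /* reordering: the four 64-bit stores per iteration of the last asm loop */
    for (k = 0; k < n8; k++) {
        output[2*k]      = -z[n8+k].im;
        output[2*k+1]    =  z[n8-1-k].re;
        output[n2-2-2*k] = -z[n8-1-k].re;
        output[n2-1-2*k] =  z[n8+k].im;
        output[n2+2*k]   = -z[n8+k].re;
        output[n2+2*k+1] =  z[n8-1-k].im;
        output[n-2-2*k]  =  z[n8-1-k].im;
        output[n-1-2*k]  = -z[n8+k].re;
    }
}
```

The sign flips in the last loop fall out of the `pxor` against the `1<<31` mask held in %mm7, which negates only the low float of each register pair before the four stores.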