diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2006-08-18 23:53:49 +0000 |
---|---|---|
committer | Loren Merritt <lorenm@u.washington.edu> | 2006-08-18 23:53:49 +0000 |
commit | 1e4ecf26f599c894c24aaf9b5918630656c352c2 (patch) | |
tree | e5c94934edc5384c18ade4771ec87bb3b0c7dce0 /libavcodec/i386/fft_3dn2.c | |
parent | 57060f8936e1d28ef328ed7a3999ec478bddc2a6 (diff) | |
download | ffmpeg-1e4ecf26f599c894c24aaf9b5918630656c352c2.tar.gz |
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
2.5% faster fft, 0.5% faster vorbis.
Originally committed as revision 6023 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386/fft_3dn2.c')
-rw-r--r-- | libavcodec/i386/fft_3dn2.c | 177 |
1 files changed, 81 insertions, 96 deletions
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c index b03fae62ee..5f066d175a 100644 --- a/libavcodec/i386/fft_3dn2.c +++ b/libavcodec/i386/fft_3dn2.c @@ -18,11 +18,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "../dsputil.h" -#include <math.h> - -#ifdef HAVE_MM3DNOW - -#include <mm3dnow.h> static const int p1m1[2] __attribute__((aligned(8))) = { 0, 1 << 31 }; @@ -33,108 +28,97 @@ static const int m1p1[2] __attribute__((aligned(8))) = void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) { int ln = s->nbits; - int j, np, np2; - int nblocks, nloops; - register FFTComplex *p, *q; - FFTComplex *cptr, *cptr1; - int k; - - np = 1 << ln; - /* FEMMS is not a must here but recommended by AMD */ - _m_femms(); - - { - __m64 *r, a0, a1, b0, b1, c; - - r = (__m64 *)&z[0]; - if (s->inverse) - c = *(__m64 *)m1p1; - else - c = *(__m64 *)p1m1; - - j = (np >> 2); - do { - /* do the pass 0 butterfly */ - a0 = _m_pfadd(r[0], r[1]); - a1 = _m_pfsub(r[0], r[1]); - - /* do the pass 0 butterfly */ - b0 = _m_pfadd(r[2], r[3]); - b1 = _m_pfsub(r[2], r[3]); - - /* multiply third by -i */ - b1 = _m_pswapd(b1); - b1 = _m_pxor(b1, c); - - r[0] = _m_pfadd(a0, b0); - r[1] = _m_pfadd(a1, b1); - r[2] = _m_pfsub(a0, b0); - r[3] = _m_pfsub(a1, b1); - r += 4; - } while (--j != 0); - } + long i, j; + long nblocks, nloops; + FFTComplex *p, *cptr; + + asm volatile( + /* FEMMS is not a must here but recommended by AMD */ + "femms \n\t" + "movq %0, %%mm7 \n\t" + ::"m"(*(s->inverse ? m1p1 : p1m1)) + ); + + i = 8 << ln; + asm volatile( + "1: \n\t" + "sub $32, %0 \n\t" + "movq (%0,%1), %%mm0 \n\t" + "movq 16(%0,%1), %%mm1 \n\t" + "movq 8(%0,%1), %%mm2 \n\t" + "movq 24(%0,%1), %%mm3 \n\t" + "movq %%mm0, %%mm4 \n\t" + "movq %%mm1, %%mm5 \n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" + "pfsub %%mm2, %%mm4 \n\t" + "pfsub %%mm3, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "pswapd %%mm5, %%mm5 \n\t" + "movq %%mm4, %%mm3 \n\t" + "pxor %%mm7, %%mm5 \n\t" + "pfadd %%mm1, %%mm0 \n\t" + "pfadd %%mm5, %%mm4 \n\t" + "pfsub %%mm1, %%mm2 \n\t" + "pfsub %%mm5, %%mm3 \n\t" + "movq %%mm0, (%0,%1) \n\t" + "movq %%mm4, 8(%0,%1) \n\t" + "movq %%mm2, 16(%0,%1) \n\t" + "movq %%mm3, 24(%0,%1) \n\t" + "jg 1b \n\t" + :"+r"(i) + :"r"(z) + ); /* pass 2 .. ln-1 */ - nblocks = np >> 3; + nblocks = 1 << (ln-3); nloops = 1 << 2; - np2 = np >> 1; - - cptr1 = s->exptab1; + cptr = s->exptab1; do { p = z; - q = z + nloops; j = nblocks; do { - cptr = cptr1; - k = nloops >> 1; - do { - __m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21; - - a0 = *(__m64 *)&p[0]; - a1 = *(__m64 *)&p[1]; - b0 = *(__m64 *)&q[0]; - b1 = *(__m64 *)&q[1]; - - /* complex mul */ - c0 = *(__m64 *)&cptr[0]; - c1 = *(__m64 *)&cptr[1]; - /* cre*re cim*im */ - t10 = _m_pfmul(c0, b0); - t11 = _m_pfmul(c1, b1); - /* no need to access cptr[2] & cptr[3] */ - c0 = _m_pswapd(c0); - c1 = _m_pswapd(c1); - /* cim*re cre*im */ - t20 = _m_pfmul(c0, b0); - t21 = _m_pfmul(c1, b1); - - /* cre*re-cim*im cim*re+cre*im */ - b0 = _m_pfpnacc(t10, t20); - b1 = _m_pfpnacc(t11, t21); - - /* butterfly */ - *(__m64 *)&p[0] = _m_pfadd(a0, b0); - *(__m64 *)&p[1] = _m_pfadd(a1, b1); - *(__m64 *)&q[0] = _m_pfsub(a0, b0); - *(__m64 *)&q[1] = _m_pfsub(a1, b1); - - p += 2; - q += 2; - cptr += 4; - } while (--k); - - p += nloops; - q += nloops; + i = nloops*8; + asm volatile( + "1: \n\t" + "sub $16, %0 \n\t" + "movq (%1,%0), %%mm0 \n\t" + "movq 8(%1,%0), %%mm1 \n\t" + "movq (%2,%0), %%mm2 \n\t" + "movq 8(%2,%0), %%mm3 \n\t" + "movq (%3,%0,2), %%mm4 \n\t" + "movq 8(%3,%0,2), %%mm5 \n\t" + "pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3] + "pswapd %%mm5, %%mm7 \n\t" + "pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im + "pfmul %%mm3, %%mm5 \n\t" + "pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im + "pfmul %%mm3, %%mm7 \n\t" + "pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im + "pfpnacc %%mm7, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "pfadd %%mm4, %%mm0 \n\t" + "pfadd %%mm5, %%mm1 \n\t" + "pfsub %%mm4, %%mm2 \n\t" + "pfsub %%mm5, %%mm3 \n\t" + "movq %%mm0, (%1,%0) \n\t" + "movq %%mm1, 8(%1,%0) \n\t" + "movq %%mm2, (%2,%0) \n\t" + "movq %%mm3, 8(%2,%0) \n\t" + "jg 1b \n\t" + :"+r"(i) + :"r"(p), "r"(p + nloops), "r"(cptr) + ); + p += nloops*2; } while (--j); - cptr1 += nloops * 2; - nblocks = nblocks >> 1; - nloops = nloops << 1; + cptr += nloops*2; + nblocks >>= 1; + nloops <<= 1; } while (nblocks != 0); - _m_femms(); + asm volatile("femms"); } -#endif - void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input, FFTSample *tmp) { @@ -221,3 +205,4 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, } asm volatile("femms"); } + |