diff options
author | Nicolas George <nicolas.george@normalesup.org> | 2012-06-26 13:00:14 +0200 |
---|---|---|
committer | Nicolas George <nicolas.george@normalesup.org> | 2012-06-26 13:00:14 +0200 |
commit | fd91a3ec44de38251b2c15e03e26d14e983c4e44 (patch) | |
tree | 3d6c86d41df62399306894879645ea9a21ef6bfe | |
parent | f767658414fc85dea4006cb82969b6a925fdd380 (diff) | |
download | ffmpeg-fd91a3ec44de38251b2c15e03e26d14e983c4e44.tar.gz |
Revert "x86: fft: convert sse inline asm to yasm"
This reverts commit 82992604706144910f4a2f875d48cfc66c1b70d7.
It breaks shared builds on x86_64.
-rw-r--r-- | libavcodec/x86/Makefile | 1 |
-rw-r--r-- | libavcodec/x86/fft_mmx.asm | 139 |
-rw-r--r-- | libavcodec/x86/fft_sse.c | 110 |
3 files changed, 121 insertions, 129 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 8acbd0774c..f633cf63e9 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -43,6 +43,7 @@ YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o +YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ $(YASM-OBJS-FFT-yes) diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 7d046df18e..e1b485b697 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -47,10 +47,6 @@ struc FFTContext .mdctbits: resd 1 .tcos: pointer 1 .tsin: pointer 1 - .fftperm: pointer 1 - .fftcalc: pointer 1 - .imdctcalc:pointer 1 - .imdcthalf:pointer 1 endstruc %define M_SQRT1_2 0.70710678118654752440 @@ -69,7 +65,6 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 -ps_m1m1m1m1: times 4 dd 1<<31 ps_m1p1: dd 1<<31, 0 %assign i 16 @@ -538,16 +533,6 @@ DEFINE_ARGS z, w, n, o1, o3 rep ret %endmacro -%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs - lea r2, [dispatch_tab%1] - mov r2, [r2 + (%2q-2)*gprsize] -%ifdef PIC - lea r3, [$$] - add r2, r3 -%endif - call r2 -%endmacro ; FFT_DISPATCH - INIT_YMM avx %if HAVE_AVX @@ -564,14 +549,6 @@ INIT_YMM avx DECL_PASS pass_avx, PASS_BIG 1 DECL_PASS pass_interleave_avx, PASS_BIG 0 - -cglobal fft_calc, 2,5,8 - mov r3d, [r0 + FFTContext.nbits] - mov r0, r1 - mov r1, r3 - FFT_DISPATCH _interleave %+ SUFFIX, r1 - REP_RET - %endif INIT_XMM sse @@ -589,112 +566,6 @@ INIT_XMM sse DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_interleave_sse, PASS_BIG 0 -cglobal fft_calc, 2,5,8 - mov r3d, [r0 + FFTContext.nbits] - PUSH r1 
- PUSH r3 - mov r0, r1 - mov r1, r3 - FFT_DISPATCH _interleave %+ SUFFIX, r1 - POP rcx - POP r4 - cmp rcx, 4 - jg .end - mov r2, -1 - add rcx, 3 - shl r2, cl - sub r4, r2 -.loop - movaps xmm0, [r4 + r2] - movaps xmm1, xmm0 - unpcklps xmm0, [r4 + r2 + 16] - unpckhps xmm1, [r4 + r2 + 16] - movaps [r4 + r2], xmm0 - movaps [r4 + r2 + 16], xmm1 - add r2, 32 - jl .loop -.end: - REP_RET - -cextern_naked memcpy - -cglobal fft_permute, 2,7,1 - mov r4, [r0 + FFTContext.revtab] - mov r5, [r0 + FFTContext.tmpbuf] - mov ecx, [r0 + FFTContext.nbits] - mov r2, 1 - shl r2, cl - xor r0, r0 -%if ARCH_X86_32 - mov r1, r1m -%endif -.loop: - movaps xmm0, [r1 + 8*r0] - movzx r6, word [r4 + 2*r0] - movzx r3, word [r4 + 2*r0 + 2] - movlps [r5 + 8*r6], xmm0 - movhps [r5 + 8*r3], xmm0 - add r0, 2 - cmp r0, r2 - jl .loop - shl r2, 3 -%if ARCH_X86_64 - mov r0, r1 - mov r1, r5 -%else - push r2 - push r5 - push r1 -%endif -%if ARCH_X86_64 && WIN64 == 0 - jmp memcpy -%else - call memcpy -%if ARCH_X86_32 - add esp, 12 -%endif - REP_RET -%endif - -cglobal imdct_calc, 3,5,3 - mov r3d, [r0 + FFTContext.mdctsize] - mov r4, [r0 + FFTContext.imdcthalf] - add r1, r3 - PUSH r3 - PUSH r1 -%if ARCH_X86_32 - push r2 - push r1 - push r0 -%else - sub rsp, 8 -%endif - call r4 -%if ARCH_X86_32 - add esp, 12 -%else - add rsp, 8 -%endif - POP r1 - POP r3 - lea r0, [r1 + 2*r3] - mov r2, r3 - sub r3, 16 - neg r2 - movaps xmm2, [ps_m1m1m1m1] -.loop: - movaps xmm0, [r1 + r3] - movaps xmm1, [r0 + r2] - shufps xmm0, xmm0, 0x1b - shufps xmm1, xmm1, 0x1b - xorps xmm0, xmm2 - movaps [r0 + r3], xmm1 - movaps [r1 + r2], xmm0 - sub r3, 16 - add r2, 16 - jl .loop - REP_RET - INIT_MMX 3dnow %define mulps pfmul %define addps pfadd @@ -712,6 +583,16 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0 %define SECTION_REL %endif +%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs + lea r2, [dispatch_tab%1] + mov r2, [r2 + (%2q-2)*gprsize] +%ifdef PIC + lea r3, [$$] + add r2, r3 +%endif + call r2 +%endmacro ; FFT_DISPATCH + %macro DECL_FFT 
1-2 ; nbits, suffix %ifidn %0, 1 %xdefine fullsuffix SUFFIX diff --git a/libavcodec/x86/fft_sse.c b/libavcodec/x86/fft_sse.c new file mode 100644 index 0000000000..13b992f47a --- /dev/null +++ b/libavcodec/x86/fft_sse.c @@ -0,0 +1,110 @@ +/* + * FFT/MDCT transform with SSE optimizations + * Copyright (c) 2008 Loren Merritt + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" +#include "fft.h" +#include "config.h" + +DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] = + { 1U << 31, 1U << 31, 1U << 31, 1U << 31 }; + +void ff_fft_dispatch_sse(FFTComplex *z, int nbits); +void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); +void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits); + +#if HAVE_AVX +void ff_fft_calc_avx(FFTContext *s, FFTComplex *z) +{ + ff_fft_dispatch_interleave_avx(z, s->nbits); +} +#endif + +void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) +{ + int n = 1 << s->nbits; + + ff_fft_dispatch_interleave_sse(z, s->nbits); + + if(n <= 16) { + x86_reg i = -8*n; + __asm__ volatile( + "1: \n" + "movaps (%0,%1), %%xmm0 \n" + "movaps %%xmm0, %%xmm1 \n" + "unpcklps 16(%0,%1), %%xmm0 \n" + "unpckhps 16(%0,%1), %%xmm1 \n" + "movaps %%xmm0, (%0,%1) \n" + "movaps %%xmm1, 
16(%0,%1) \n" + "add $32, %0 \n" + "jl 1b \n" + :"+r"(i) + :"r"(z+n) + :"memory" + ); + } +} + +void ff_fft_permute_sse(FFTContext *s, FFTComplex *z) +{ + int n = 1 << s->nbits; + int i; + for(i=0; i<n; i+=2) { + __asm__ volatile( + "movaps %2, %%xmm0 \n" + "movlps %%xmm0, %0 \n" + "movhps %%xmm0, %1 \n" + :"=m"(s->tmp_buf[s->revtab[i]]), + "=m"(s->tmp_buf[s->revtab[i+1]]) + :"m"(z[i]) + ); + } + memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); +} + +void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + x86_reg j, k; + long n = s->mdct_size; + long n4 = n >> 2; + + s->imdct_half(s, output + n4, input); + + j = -n; + k = n-16; + __asm__ volatile( + "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n" + "1: \n" + "movaps (%2,%1), %%xmm0 \n" + "movaps (%3,%0), %%xmm1 \n" + "shufps $0x1b, %%xmm0, %%xmm0 \n" + "shufps $0x1b, %%xmm1, %%xmm1 \n" + "xorps %%xmm7, %%xmm0 \n" + "movaps %%xmm1, (%3,%1) \n" + "movaps %%xmm0, (%2,%0) \n" + "sub $16, %1 \n" + "add $16, %0 \n" + "jl 1b \n" + :"+r"(j), "+r"(k) + :"r"(output+n4), "r"(output+n4*3) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7") + ); +} |