diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2012-06-26 16:10:33 +0200 |
---|---|---|
committer | Mans Rullgard <mans@mansr.com> | 2012-06-27 12:49:33 +0100 |
commit | a5bfa66df516b7be55fd08fc62c2b012fc18e340 (patch) | |
tree | ae39590342cc74781342dcb819164037b3b0bc7b | |
parent | 75d339e044f9b87dd9aa4bdaee73b1a8323d4a15 (diff) | |
download | ffmpeg-a5bfa66df516b7be55fd08fc62c2b012fc18e340.tar.gz |
x86: fft: replace call to memcpy by a loop
The function call was a mess to handle, and memcpy cannot make
the assumptions we do in the new code.
Tested on an IMC sample: 430c -> 370c.
Signed-off-by: Mans Rullgard <mans@mansr.com>
-rw-r--r-- | libavcodec/x86/fft_mmx.asm | 37 |
1 files changed, 12 insertions, 25 deletions
```diff
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 007f5caf77..1a430b9c2c 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -615,8 +615,6 @@ cglobal fft_calc, 2,5,8
 .end:
     REP_RET
 
-cextern_naked memcpy
-
 cglobal fft_permute, 2,7,1
     mov     r4, [r0 + FFTContext.revtab]
     mov     r5, [r0 + FFTContext.tmpbuf]
@@ -637,29 +635,18 @@ cglobal fft_permute, 2,7,1
     cmp     r0, r2
     jl      .loop
     shl     r2, 3
-%if ARCH_X86_64
-    mov     r0, r1
-    mov     r1, r5
-%endif
-%if WIN64
-    sub     rsp, 8
-    call    memcpy
-    add     rsp, 8
-    RET
-%elif ARCH_X86_64
-%ifdef PIC
-    jmp     memcpy wrt ..plt
-%else
-    jmp     memcpy
-%endif
-%else
-    push    r2
-    push    r5
-    push    r1
-    call    memcpy
-    add     esp, 12
-    RET
-%endif
+    add     r1, r2
+    add     r5, r2
+    neg     r2
+; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
+.loopcopy:
+    movaps  xmm0, [r5 + r2]
+    movaps  xmm1, [r5 + r2 + 16]
+    movaps  [r1 + r2], xmm0
+    movaps  [r1 + r2 + 16], xmm1
+    add     r2, 32
+    jl      .loopcopy
+    REP_RET
 
 cglobal imdct_calc, 3,5,3
     mov     r3d, [r0 + FFTContext.mdctsize]
```