aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mans Rullgard <mans@mansr.com> 2012-06-23 19:08:11 +0100
committer Mans Rullgard <mans@mansr.com> 2012-06-25 13:31:00 +0100
commit 82992604706144910f4a2f875d48cfc66c1b70d7 (patch)
tree 967292e4dffd4590c06b589cc2560e864b7a4fed
parent 8123e0901fc7faa3d7dcf80af9ed0d874f8e7a06 (diff)
download ffmpeg-82992604706144910f4a2f875d48cfc66c1b70d7.tar.gz
x86: fft: convert sse inline asm to yasm
-rw-r--r-- libavcodec/x86/Makefile 1
-rw-r--r-- libavcodec/x86/fft_mmx.asm 139
-rw-r--r-- libavcodec/x86/fft_sse.c 110
3 files changed, 129 insertions, 121 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 6602cceea6..6464739d03 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
-YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
$(YASM-OBJS-FFT-yes)
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index b60d8b0a47..1cacfb7bd6 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -45,6 +45,10 @@ struc FFTContext
.mdctbits: resd 1
.tcos: pointer 1
.tsin: pointer 1
+ .fftperm: pointer 1
+ .fftcalc: pointer 1
+ .imdctcalc:pointer 1
+ .imdcthalf:pointer 1
endstruc
SECTION_RODATA
@@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
+ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0
%assign i 16
@@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
rep ret
%endmacro
+%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs; %1 = dispatch_tab name suffix, %2 = register holding the index (table entry 0 <=> value 2)
+ lea r2, [dispatch_tab%1] ; r2 = base of the dispatch table selected by %1
+ mov r2, [r2 + (%2q-2)*gprsize] ; load entry (%2 - 2); entries are gprsize bytes apart
+%ifdef PIC
+ lea r3, [$$] ; PIC builds add the section start address; NOTE(review): implies entries are stored section-relative - confirm against the table definition
+ add r2, r3
+%endif
+ call r2 ; indirect call to the selected routine
+%endmacro ; FFT_DISPATCH
+
INIT_YMM avx
%if HAVE_AVX
@@ -548,6 +563,14 @@ INIT_YMM avx
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
+
+cglobal fft_calc, 2,5,8 ; AVX fft_calc(s, z): asm replacement for the C wrapper that called the interleaving dispatcher with (z, s->nbits)
+ mov r3d, [r0 + FFTContext.nbits] ; r3d = s->nbits (r0 = s on entry)
+ mov r0, r1 ; r0 = z (second argument on entry)
+ mov r1, r3 ; r1 = nbits, used as the dispatch index
+ FFT_DISPATCH _interleave %+ SUFFIX, r1 ; call through dispatch_tab_interleave<SUFFIX>[nbits - 2]
+ REP_RET
+
%endif
INIT_XMM sse
@@ -565,6 +588,112 @@ INIT_XMM sse
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0
+cglobal fft_calc, 2,5,8 ; SSE fft_calc(s, z): dispatched FFT, followed by an extra interleave pass over z when nbits <= 4
+ mov r3d, [r0 + FFTContext.nbits] ; r3d = s->nbits (r0 = s on entry)
+ PUSH r1 ; save z across the dispatched call
+ PUSH r3 ; save nbits across the dispatched call
+ mov r0, r1 ; r0 = z
+ mov r1, r3 ; r1 = nbits, used as the dispatch index
+ FFT_DISPATCH _interleave %+ SUFFIX, r1 ; call through dispatch_tab_interleave<SUFFIX>[nbits - 2]
+ POP rcx ; rcx = nbits; restored into rcx because cl is the shift count below
+ POP r4 ; r4 = z
+ cmp rcx, 4
+ jg .end ; skip the pass below when nbits > 4 (n > 16)
+ mov r2, -1
+ add rcx, 3
+ shl r2, cl ; r2 = -1 << (nbits + 3) = -8*n, a negative byte offset
+ sub r4, r2 ; r4 = z + 8*n bytes, so [r4 + r2] starts at z
+.loop: ; each iteration interleaves one pair of adjacent 16-byte vectors in place
+ movaps xmm0, [r4 + r2]
+ movaps xmm1, xmm0 ; keep a copy: unpcklps overwrites xmm0
+ unpcklps xmm0, [r4 + r2 + 16] ; xmm0 = low halves of the two vectors, interleaved
+ unpckhps xmm1, [r4 + r2 + 16] ; xmm1 = high halves of the two vectors, interleaved
+ movaps [r4 + r2], xmm0
+ movaps [r4 + r2 + 16], xmm1
+ add r2, 32
+ jl .loop ; r2 counts up from -8*n; loop while it is still negative
+.end:
+ REP_RET
+
+cextern_naked memcpy
+
+cglobal fft_permute, 2,7,1 ; fft_permute(s, z): scatter z into s->tmpbuf using s->revtab, then memcpy tmpbuf back over z
+ mov r4, [r0 + FFTContext.revtab] ; r4 = revtab (read below as 16-bit entries)
+ mov r5, [r0 + FFTContext.tmpbuf] ; r5 = tmpbuf
+ mov ecx, [r0 + FFTContext.nbits] ; ecx = nbits, so cl can serve as the shift count
+ mov r2, 1
+ shl r2, cl ; r2 = n = 1 << nbits
+ xor r0, r0 ; r0 = i = 0 (s is no longer needed)
+%if ARCH_X86_32
+ mov r1, r1m ; reload z from its stack slot; NOTE(review): presumably because r1 aliases ecx on x86-32 - confirm against x86inc
+%endif
+.loop:
+ movaps xmm0, [r1 + 8*r0] ; load z[i] and z[i+1] (8 bytes each)
+ movzx r6, word [r4 + 2*r0] ; r6 = revtab[i]
+ movzx r3, word [r4 + 2*r0 + 2] ; r3 = revtab[i+1]
+ movlps [r5 + 8*r6], xmm0 ; tmpbuf[revtab[i]] = z[i]
+ movhps [r5 + 8*r3], xmm0 ; tmpbuf[revtab[i+1]] = z[i+1]
+ add r0, 2
+ cmp r0, r2
+ jl .loop ; while i < n
+ shl r2, 3 ; r2 = 8*n = byte count for memcpy
+%if ARCH_X86_64
+ mov r0, r1 ; memcpy arguments in registers: destination z,
+ mov r1, r5 ; source tmpbuf; the byte count is already in r2
+%else
+ push r2 ; memcpy arguments on the stack, last first: byte count,
+ push r5 ; source tmpbuf,
+ push r1 ; destination z
+%endif
+%if ARCH_X86_64 && WIN64 == 0
+ jmp memcpy ; tail call: memcpy returns straight to our caller
+%else
+ call memcpy ; NOTE(review): no 32-byte shadow space is reserved here for WIN64 - confirm whether the cglobal prologue provides it
+%if ARCH_X86_32
+ add esp, 12 ; drop the three pushed arguments
+%endif
+ REP_RET
+%endif
+
+cglobal imdct_calc, 3,5,3 ; imdct_calc(s, output, input): call s->imdcthalf on (output + n bytes), then fill the regions on either side by reversed copies
+ mov r3d, [r0 + FFTContext.mdctsize] ; r3 = n = s->mdctsize
+ mov r4, [r0 + FFTContext.imdcthalf] ; r4 = s->imdcthalf function pointer
+ add r1, r3 ; r1 = output + n bytes
+ PUSH r3 ; keep n and the adjusted output pointer across the call
+ PUSH r1
+%if ARCH_X86_32
+ push r2 ; stack arguments for imdcthalf, last first: input,
+ push r1 ; output + n bytes,
+ push r0 ; s
+%else
+ sub rsp, 8 ; NOTE(review): presumably padding to keep the stack 16-byte aligned at the call - confirm per ABI
+%endif
+ call r4 ; imdcthalf(s, output + n bytes, input)
+%if ARCH_X86_32
+ add esp, 12 ; drop the three pushed arguments
+%else
+ add rsp, 8 ; undo the padding
+%endif
+ POP r1 ; r1 = output + n bytes
+ POP r3 ; r3 = n
+ lea r0, [r1 + 2*r3] ; r0 = output + 3*n bytes
+ mov r2, r3
+ sub r3, 16 ; r3 = n - 16, descending byte offset
+ neg r2 ; r2 = -n, ascending byte offset
+ movaps xmm2, [ps_m1m1m1m1] ; sign-bit mask (1<<31 in each of the four lanes)
+.loop:
+ movaps xmm0, [r1 + r3] ; read above r1, walking downwards
+ movaps xmm1, [r0 + r2] ; read below r0, walking upwards
+ shufps xmm0, xmm0, 0x1b ; reverse the order of the four floats
+ shufps xmm1, xmm1, 0x1b
+ xorps xmm0, xmm2 ; negate xmm0 by flipping the sign bits
+ movaps [r0 + r3], xmm1 ; reversed copy goes above r0
+ movaps [r1 + r2], xmm0 ; reversed, negated copy goes below r1
+ sub r3, 16
+ add r2, 16
+ jl .loop ; loop while the ascending offset r2 is still negative
+ REP_RET
+
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
@@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define SECTION_REL
%endif
-%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
- lea r2, [dispatch_tab%1]
- mov r2, [r2 + (%2q-2)*gprsize]
-%ifdef PIC
- lea r3, [$$]
- add r2, r3
-%endif
- call r2
-%endmacro ; FFT_DISPATCH
-
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
diff --git a/libavcodec/x86/fft_sse.c b/libavcodec/x86/fft_sse.c
deleted file mode 100644
index 13b992f47a..0000000000
--- a/libavcodec/x86/fft_sse.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * FFT/MDCT transform with SSE optimizations
- * Copyright (c) 2008 Loren Merritt
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-#include "fft.h"
-#include "config.h"
-
-DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
- { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
-
-void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
-void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
-void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
-
-#if HAVE_AVX
-void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
-{
- ff_fft_dispatch_interleave_avx(z, s->nbits);
-}
-#endif
-
-void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
-{
- int n = 1 << s->nbits;
-
- ff_fft_dispatch_interleave_sse(z, s->nbits);
-
- if(n <= 16) {
- x86_reg i = -8*n;
- __asm__ volatile(
- "1: \n"
- "movaps (%0,%1), %%xmm0 \n"
- "movaps %%xmm0, %%xmm1 \n"
- "unpcklps 16(%0,%1), %%xmm0 \n"
- "unpckhps 16(%0,%1), %%xmm1 \n"
- "movaps %%xmm0, (%0,%1) \n"
- "movaps %%xmm1, 16(%0,%1) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(z+n)
- :"memory"
- );
- }
-}
-
-void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
-{
- int n = 1 << s->nbits;
- int i;
- for(i=0; i<n; i+=2) {
- __asm__ volatile(
- "movaps %2, %%xmm0 \n"
- "movlps %%xmm0, %0 \n"
- "movhps %%xmm0, %1 \n"
- :"=m"(s->tmp_buf[s->revtab[i]]),
- "=m"(s->tmp_buf[s->revtab[i+1]])
- :"m"(z[i])
- );
- }
- memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
-}
-
-void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
-{
- x86_reg j, k;
- long n = s->mdct_size;
- long n4 = n >> 2;
-
- s->imdct_half(s, output + n4, input);
-
- j = -n;
- k = n-16;
- __asm__ volatile(
- "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
- "1: \n"
- "movaps (%2,%1), %%xmm0 \n"
- "movaps (%3,%0), %%xmm1 \n"
- "shufps $0x1b, %%xmm0, %%xmm0 \n"
- "shufps $0x1b, %%xmm1, %%xmm1 \n"
- "xorps %%xmm7, %%xmm0 \n"
- "movaps %%xmm1, (%3,%1) \n"
- "movaps %%xmm0, (%2,%0) \n"
- "sub $16, %1 \n"
- "add $16, %0 \n"
- "jl 1b \n"
- :"+r"(j), "+r"(k)
- :"r"(output+n4), "r"(output+n4*3)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
- );
-}