author     James Almer <jamrial@gmail.com>    2016-02-05 16:45:04 -0300
committer  James Almer <jamrial@gmail.com>    2016-02-06 01:36:55 -0300
commit     8ae744794101e0bfb66f350b7ded38c0711cb717
tree       e90d81fbc902128aaa8339c51aa782e87b3e2022 /libavcodec/x86/dcadsp.asm
parent     3e9b8ffc9bfe47a9198866d4846f872645bbc420
x86/dcadec: add ff_lfe_fir0_float_{sse,sse2,avx,fma3}
Up to ~4 times faster on x86_64, and ~8 times faster on x86_32 when
compiled with x87 fp math.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86/dcadsp.asm')
-rw-r--r--  libavcodec/x86/dcadsp.asm  203
1 file changed, 203 insertions(+), 0 deletions(-)
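
For context, the scalar loop these SIMD routines replace amounts to a bank
of 8-tap FIR dot products: each decimated LFE sample is interpolated into
64 PCM samples, with the 256-entry coefficient table read forward for the
first 32 outputs and mirrored for the last 32. The sketch below paraphrases
the fir0 case of the float LFE FIR in libavcodec/dcadsp.c; the function and
parameter names are illustrative, not the exact C entry point.

#include <stdint.h>
#include <stddef.h>

/* Paraphrased scalar sketch of the fir0 interpolation; nblocks counts
 * 32-sample PCM blocks, so each LFE sample yields two of them. */
static void lfe_fir0_float_ref(float *samples, int32_t *lfe,
                               const float *coeff, ptrdiff_t nblocks)
{
    ptrdiff_t nlfesamples = nblocks >> 1;

    for (ptrdiff_t i = 0; i < nlfesamples; i++) {
        for (int j = 0; j < 32; j++) {
            float a = 0.0f, b = 0.0f;
            /* 8-tap FIR over the current and 7 preceding LFE samples */
            for (int k = 0; k < 8; k++) {
                a += coeff[      j * 8 + k] * (float)lfe[-k];
                b += coeff[255 - j * 8 - k] * (float)lfe[-k];
            }
            samples[     j] = a; /* first half: coefficients read forward */
            samples[32 + j] = b; /* second half: coefficients mirrored    */
        }
        lfe++;         /* consume one decimated sample...  */
        samples += 64; /* ...and emit 64 interpolated ones */
    }
}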
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
new file mode 100644
index 0000000000..fb13957044
--- /dev/null
+++ b/libavcodec/x86/dcadsp.asm
@@ -0,0 +1,203 @@
+;******************************************************************************
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%define sizeof_float 4
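+; The x86_64 FMA3 version computes twice as many output samples per
+; inner loop iteration, so its counters step by 16 bytes instead of 8.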
+%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64)
+
+%macro LFE_FIR0_FLOAT 0
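+; lfe_fir0_float(float *samples, int32_t *lfe, const float *coeff,
+;                ptrdiff_t nblocks)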
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+ shr nblocksd, 1
+ sub lfeq, 7*sizeof_float
+ mov cnt1d, 32*sizeof_float
+ mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+ lea coeffq, [coeffq+cnt1q*8]
+ add samplesq, cnt1q
+ neg cnt1q
+
+.loop:
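+    ; Load the current LFE sample and the 7 preceding ones, converted to
+    ; float: m5/m4 hold them in ascending order, m7/m6 in descending order.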
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq+16]
+ cvtdq2ps m5, [lfeq ]
+ shufps m7, m4, m4, q0123
+ shufps m6, m5, m5, q0123
+%elif cpuflag(sse2)
+ movu m4, [lfeq+16]
+ movu m5, [lfeq ]
+ cvtdq2ps m4, m4
+ cvtdq2ps m5, m5
+ pshufd m7, m4, q0123
+ pshufd m6, m5, q0123
+%else
+ cvtpi2ps m4, [lfeq+16]
+ cvtpi2ps m0, [lfeq+24]
+ cvtpi2ps m5, [lfeq ]
+ cvtpi2ps m1, [lfeq+8 ]
+ shufps m4, m0, q1010
+ shufps m5, m1, q1010
+ shufps m7, m4, m4, q0123
+ shufps m6, m5, m5, q0123
+%endif
+
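+; Each iteration computes 8-tap dot products for two output samples at the
+; front of the block (stored forward at cnt1, using the reversed sample
+; copies m7/m6) and two at the back (stored backward at cnt2, using m5/m4);
+; the x86_64 FMA3 version produces four of each.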
+.inner_loop:
+%if ARCH_X86_64
+ movaps m8, [coeffq+cnt1q*8 ]
+ movaps m9, [coeffq+cnt1q*8+16]
+ movaps m10, [coeffq+cnt1q*8+32]
+ movaps m11, [coeffq+cnt1q*8+48]
+%if cpuflag(fma3)
+ movaps m12, [coeffq+cnt1q*8+64]
+ movaps m13, [coeffq+cnt1q*8+80]
+ movaps m14, [coeffq+cnt1q*8+96]
+ movaps m15, [coeffq+cnt1q*8+112]
+ mulps m0, m7, m8
+ mulps m1, m7, m10
+ mulps m2, m7, m12
+ mulps m3, m7, m14
+ fmaddps m0, m6, m9, m0
+ fmaddps m1, m6, m11, m1
+ fmaddps m2, m6, m13, m2
+ fmaddps m3, m6, m15, m3
+
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
+%else
+ mulps m0, m7, m8
+ mulps m1, m6, m9
+ mulps m2, m7, m10
+ mulps m3, m6, m11
+ addps m0, m1
+ addps m2, m3
+
+ unpckhps m3, m0, m2
+ unpcklps m0, m2
+ addps m3, m0
+ movhlps m2, m3
+ addps m2, m3
+ movlps [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ movaps m1, [coeffq+cnt1q*8+16]
+ mulps m2, m7, [coeffq+cnt1q*8+32]
+ fmaddps m0, m6, m1, m0
+ fmaddps m2, m6, [coeffq+cnt1q*8+48], m2
+%else
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ mulps m1, m6, [coeffq+cnt1q*8+16]
+ mulps m2, m7, [coeffq+cnt1q*8+32]
+ mulps m3, m6, [coeffq+cnt1q*8+48]
+ addps m0, m1
+ addps m2, m3
+%endif
+ unpckhps m3, m0, m2
+ unpcklps m0, m2
+ addps m3, m0
+ movhlps m2, m3
+ addps m2, m3
+ movlps [samplesq+cnt1q], m2
+%endif ; ARCH
+
+%if ARCH_X86_64
+%if cpuflag(fma3)
+ mulps m8, m5
+ mulps m10, m5
+ mulps m12, m5
+ mulps m14, m5
+ fmaddps m8, m4, m9, m8
+ fmaddps m10, m4, m11, m10
+ fmaddps m12, m4, m13, m12
+ fmaddps m14, m4, m15, m14
+
+ haddps m10, m8
+ haddps m14, m12
+ haddps m14, m10
+ movaps [samplesq+cnt2q], m14
+%else
+ mulps m8, m5
+ mulps m9, m4
+ mulps m10, m5
+ mulps m11, m4
+ addps m8, m9
+ addps m10, m11
+
+ unpckhps m11, m10, m8
+ unpcklps m10, m8
+ addps m11, m10
+ movhlps m8, m11
+ addps m8, m11
+ movlps [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m2, m5, [coeffq+cnt1q*8+32]
+ fmaddps m0, m4, m1, m0
+ fmaddps m2, m4, [coeffq+cnt1q*8+48], m2
+%else
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m1, m4, [coeffq+cnt1q*8+16]
+ mulps m2, m5, [coeffq+cnt1q*8+32]
+ mulps m3, m4, [coeffq+cnt1q*8+48]
+ addps m0, m1
+ addps m2, m3
+%endif
+ unpckhps m3, m2, m0
+ unpcklps m2, m0
+ addps m3, m2
+ movhlps m0, m3
+ addps m0, m3
+ movlps [samplesq+cnt2q], m0
+%endif ; ARCH
+
+ sub cnt2d, 8 + FMA3_OFFSET
+ add cnt1q, 8 + FMA3_OFFSET
+ jl .inner_loop
+
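+    ; Advance to the next decimated LFE sample; each one produces 64
+    ; interpolated PCM samples.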
+ add lfeq, 4
+ add samplesq, 64*sizeof_float
+ mov cnt1q, -32*sizeof_float
+ mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+ sub nblocksd, 1
+ jg .loop
+ RET
+%endmacro
+
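+; x86_64 implies SSE2, so the plain SSE version is only built for x86_32.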
+%if ARCH_X86_32
+INIT_XMM sse
+LFE_FIR0_FLOAT
+%endif
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
+%endif