author    | James Almer <jamrial@gmail.com> | 2016-02-05 16:45:04 -0300
committer | James Almer <jamrial@gmail.com> | 2016-02-06 01:36:55 -0300
commit    | 8ae744794101e0bfb66f350b7ded38c0711cb717 (patch)
tree      | e90d81fbc902128aaa8339c51aa782e87b3e2022 /libavcodec/x86/dcadsp.asm
parent    | 3e9b8ffc9bfe47a9198866d4846f872645bbc420 (diff)
download  | ffmpeg-8ae744794101e0bfb66f350b7ded38c0711cb717.tar.gz
x86/dcadec: add ff_lfe_fir0_float_{sse,sse2,avx,fma3}
Up to ~4 times faster on x86_64, and ~8 times faster on x86_32 when compiled with x87 floating-point math.
Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
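
For context on what the new routine computes: each LFE sample is interpolated into 64 PCM samples, where every output is an 8-tap dot product between the last eight decimated LFE samples and one row of the coefficient table; filter symmetry lets the second half of each 64-sample block reuse the same rows with the taps reversed, which is why it is stored backwards. The plain-C sketch below is reconstructed from the assembly's data flow to illustrate this; the function and argument names mirror the `cglobal` declaration, but the body is an illustration, not FFmpeg's actual C reference implementation.

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of ff_lfe_fir0_float, reconstructed from the
 * assembly's data flow (not FFmpeg's actual C reference). "nblocks" counts
 * 32-sample PCM blocks, so the assembly halves it up front to get the
 * number of LFE input samples; lfe[] must have 7 samples of history
 * available behind the pointer. */
static void lfe_fir0_float_ref(float *samples, const int32_t *lfe,
                               const float *coeff, ptrdiff_t nblocks)
{
    nblocks >>= 1;                       /* 64 PCM samples per LFE sample */
    while (nblocks--) {
        for (int j = 0; j < 32; j++) {
            float a = 0.0f, b = 0.0f;
            for (int k = 0; k < 8; k++) {
                a += coeff[j * 8 + k] * (float)lfe[-k];    /* forward taps  */
                b += coeff[j * 8 + k] * (float)lfe[k - 7]; /* reversed taps */
            }
            samples[j]      = a;         /* first half, ascending   */
            samples[63 - j] = b;         /* second half, descending */
        }
        lfe++;                           /* consume one LFE sample  */
        samples += 64;
    }
}
```

The SIMD versions vectorize the 8-tap dot products four at a time (eight with FMA3 on x86_64, hence the `FMA3_OFFSET` stride bump in the code below) and fold the reversed-tap case into reversed broadcasts of the LFE samples.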
Diffstat (limited to 'libavcodec/x86/dcadsp.asm')
-rw-r--r-- | libavcodec/x86/dcadsp.asm | 203
1 file changed, 203 insertions(+), 0 deletions(-)
```diff
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
new file mode 100644
index 0000000000..fb13957044
--- /dev/null
+++ b/libavcodec/x86/dcadsp.asm
@@ -0,0 +1,203 @@
+;******************************************************************************
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64)
+
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr     nblocksd, 1
+    sub     lfeq, 7*sizeof_float
+    mov     cnt1d, 32*sizeof_float
+    mov     cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+    lea     coeffq, [coeffq+cnt1q*8]
+    add     samplesq, cnt1q
+    neg     cnt1q
+
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps m4, [lfeq+16]
+    cvtdq2ps m5, [lfeq   ]
+    shufps   m7, m4, m4, q0123
+    shufps   m6, m5, m5, q0123
+%elif cpuflag(sse2)
+    movu     m4, [lfeq+16]
+    movu     m5, [lfeq   ]
+    cvtdq2ps m4, m4
+    cvtdq2ps m5, m5
+    pshufd   m7, m4, q0123
+    pshufd   m6, m5, q0123
+%else
+    cvtpi2ps m4, [lfeq+16]
+    cvtpi2ps m0, [lfeq+24]
+    cvtpi2ps m5, [lfeq   ]
+    cvtpi2ps m1, [lfeq+8 ]
+    shufps   m4, m0, q1010
+    shufps   m5, m1, q1010
+    shufps   m7, m4, m4, q0123
+    shufps   m6, m5, m5, q0123
+%endif
+
+.inner_loop:
+%if ARCH_X86_64
+    movaps   m8, [coeffq+cnt1q*8    ]
+    movaps   m9, [coeffq+cnt1q*8+16 ]
+    movaps  m10, [coeffq+cnt1q*8+32 ]
+    movaps  m11, [coeffq+cnt1q*8+48 ]
+%if cpuflag(fma3)
+    movaps  m12, [coeffq+cnt1q*8+64 ]
+    movaps  m13, [coeffq+cnt1q*8+80 ]
+    movaps  m14, [coeffq+cnt1q*8+96 ]
+    movaps  m15, [coeffq+cnt1q*8+112]
+    mulps    m0, m7, m8
+    mulps    m1, m7, m10
+    mulps    m2, m7, m12
+    mulps    m3, m7, m14
+    fmaddps  m0, m6, m9, m0
+    fmaddps  m1, m6, m11, m1
+    fmaddps  m2, m6, m13, m2
+    fmaddps  m3, m6, m15, m3
+
+    haddps   m0, m1
+    haddps   m2, m3
+    haddps   m0, m2
+    movaps   [samplesq+cnt1q], m0
+%else
+    mulps    m0, m7, m8
+    mulps    m1, m6, m9
+    mulps    m2, m7, m10
+    mulps    m3, m6, m11
+    addps    m0, m1
+    addps    m2, m3
+
+    unpckhps m3, m0, m2
+    unpcklps m0, m2
+    addps    m3, m0
+    movhlps  m2, m3
+    addps    m2, m3
+    movlps   [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps    m0, m7, [coeffq+cnt1q*8   ]
+    movaps   m1, [coeffq+cnt1q*8+16]
+    mulps    m2, m7, [coeffq+cnt1q*8+32]
+    fmaddps  m0, m6, m1, m0
+    fmaddps  m2, m6, [coeffq+cnt1q*8+48], m2
+%else
+    mulps    m0, m7, [coeffq+cnt1q*8   ]
+    mulps    m1, m6, [coeffq+cnt1q*8+16]
+    mulps    m2, m7, [coeffq+cnt1q*8+32]
+    mulps    m3, m6, [coeffq+cnt1q*8+48]
+    addps    m0, m1
+    addps    m2, m3
+%endif
+    unpckhps m3, m0, m2
+    unpcklps m0, m2
+    addps    m3, m0
+    movhlps  m2, m3
+    addps    m2, m3
+    movlps   [samplesq+cnt1q], m2
+%endif ; ARCH
+
+%if ARCH_X86_64
+%if cpuflag(fma3)
+    mulps    m8, m5
+    mulps   m10, m5
+    mulps   m12, m5
+    mulps   m14, m5
+    fmaddps  m8, m4, m9, m8
+    fmaddps m10, m4, m11, m10
+    fmaddps m12, m4, m13, m12
+    fmaddps m14, m4, m15, m14
+
+    haddps  m10, m8
+    haddps  m14, m12
+    haddps  m14, m10
+    movaps  [samplesq+cnt2q], m14
+%else
+    mulps    m8, m5
+    mulps    m9, m4
+    mulps   m10, m5
+    mulps   m11, m4
+    addps    m8, m9
+    addps   m10, m11
+
+    unpckhps m11, m10, m8
+    unpcklps m10, m8
+    addps   m11, m10
+    movhlps  m8, m11
+    addps    m8, m11
+    movlps  [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps    m0, m5, [coeffq+cnt1q*8   ]
+    mulps    m2, m5, [coeffq+cnt1q*8+32]
+    fmaddps  m0, m4, m1, m0
+    fmaddps  m2, m4, [coeffq+cnt1q*8+48], m2
+%else
+    mulps    m0, m5, [coeffq+cnt1q*8   ]
+    mulps    m1, m4, [coeffq+cnt1q*8+16]
+    mulps    m2, m5, [coeffq+cnt1q*8+32]
+    mulps    m3, m4, [coeffq+cnt1q*8+48]
+    addps    m0, m1
+    addps    m2, m3
+%endif
+    unpckhps m3, m2, m0
+    unpcklps m2, m0
+    addps    m3, m2
+    movhlps  m0, m3
+    addps    m0, m3
+    movlps  [samplesq+cnt2q], m0
+%endif ; ARCH
+
+    sub     cnt2d, 8 + FMA3_OFFSET
+    add     cnt1q, 8 + FMA3_OFFSET
+    jl .inner_loop
+
+    add     lfeq, 4
+    add     samplesq, 64*sizeof_float
+    mov     cnt1q, -32*sizeof_float
+    mov     cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+    sub     nblocksd, 1
+    jg .loop
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+LFE_FIR0_FLOAT
+%endif
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
+%endif
```
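
The `INIT_XMM`/`LFE_FIR0_FLOAT` pairs at the bottom instantiate the macro once per instruction set, yielding the four symbols named in the commit title; the plain SSE build exists only on x86_32, since x86_64 guarantees SSE2. Selecting between them happens at runtime from C. The sketch below shows the usual FFmpeg-style CPU-flag dispatch, assuming a build inside the FFmpeg tree: `av_get_cpu_flags()` and the `EXTERNAL_*` macros are real libavutil helpers, the prototypes follow the `cglobal` declaration's four arguments, but the context struct and init function are hypothetical stand-ins for the real wiring in libavcodec/x86/dcadsp_init.c.

```c
#include <stddef.h>
#include <stdint.h>
#include "config.h"              /* ARCH_X86_32, from FFmpeg's configure */
#include "libavutil/cpu.h"       /* av_get_cpu_flags() */
#include "libavutil/x86/cpu.h"   /* EXTERNAL_* cpuflag test macros */

/* Prototypes implied by the cglobal declaration (4 arguments). */
void ff_lfe_fir0_float_sse (float *samples, int32_t *lfe,
                            const float *coeff, ptrdiff_t nblocks);
void ff_lfe_fir0_float_sse2(float *samples, int32_t *lfe,
                            const float *coeff, ptrdiff_t nblocks);
void ff_lfe_fir0_float_avx (float *samples, int32_t *lfe,
                            const float *coeff, ptrdiff_t nblocks);
void ff_lfe_fir0_float_fma3(float *samples, int32_t *lfe,
                            const float *coeff, ptrdiff_t nblocks);

/* Hypothetical holder for the function pointer; a stand-in for the
 * real DSP context declared in libavcodec/dcadsp.h. */
typedef struct LFEFir0Context {
    void (*lfe_fir0_float)(float *samples, int32_t *lfe,
                           const float *coeff, ptrdiff_t nblocks);
} LFEFir0Context;

static void lfe_fir0_init_x86(LFEFir0Context *s)
{
    int cpu_flags = av_get_cpu_flags();

    /* Checks run from slowest to fastest so the best available wins.
     * The plain SSE version is only assembled for x86_32. */
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags))
        s->lfe_fir0_float = ff_lfe_fir0_float_sse;
#endif
    if (EXTERNAL_SSE2(cpu_flags))
        s->lfe_fir0_float = ff_lfe_fir0_float_sse2;
    if (EXTERNAL_AVX(cpu_flags))
        s->lfe_fir0_float = ff_lfe_fir0_float_avx;
    if (EXTERNAL_FMA3(cpu_flags))
        s->lfe_fir0_float = ff_lfe_fir0_float_fma3;
}
```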