diff options
author | James Almer <jamrial@gmail.com> | 2016-02-05 16:45:04 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2016-02-06 01:36:55 -0300 |
commit | 8ae744794101e0bfb66f350b7ded38c0711cb717 (patch) | |
tree | e90d81fbc902128aaa8339c51aa782e87b3e2022 | |
parent | 3e9b8ffc9bfe47a9198866d4846f872645bbc420 (diff) | |
download | ffmpeg-8ae744794101e0bfb66f350b7ded38c0711cb717.tar.gz |
x86/dcadec: add ff_lfe_fir0_float_{sse,sse2,avx,fma3}
Up to ~4 times faster on x86_64, and up to ~8 times faster on x86_32 when compiling with x87 FP math.
Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/dcadsp.c | 3 | ||||
-rw-r--r-- | libavcodec/dcadsp.h | 1 | ||||
-rw-r--r-- | libavcodec/x86/Makefile | 4 | ||||
-rw-r--r-- | libavcodec/x86/dcadsp.asm | 203 | ||||
-rw-r--r-- | libavcodec/x86/dcadsp_init.c | 45 |
5 files changed, 254 insertions, 2 deletions
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c index 6acfe0b7d9..09faee51fb 100644 --- a/libavcodec/dcadsp.c +++ b/libavcodec/dcadsp.c @@ -410,4 +410,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s) s->dmix_scale_inv = dmix_scale_inv_c; s->assemble_freq_bands = assemble_freq_bands_c; + + if (ARCH_X86) + ff_dcadsp_init_x86(s); } diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h index f594abf111..c82b7b1095 100644 --- a/libavcodec/dcadsp.h +++ b/libavcodec/dcadsp.h @@ -87,5 +87,6 @@ typedef struct DCADSPContext { } DCADSPContext; av_cold void ff_dcadsp_init(DCADSPContext *s); +av_cold void ff_dcadsp_init_x86(DCADSPContext *s); #endif diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index bfdf0b3b7c..668a9befd9 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -46,7 +46,7 @@ OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o -OBJS-$(CONFIG_DCA_DECODER) += x86/synth_filter_init.o +OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o @@ -133,7 +133,7 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o YASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o -YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/synth_filter.o +YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ x86/dirac_dwt.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm new file mode 100644 index 0000000000..fb13957044 --- /dev/null +++ b/libavcodec/x86/dcadsp.asm @@ -0,0 +1,203 @@ 
+;****************************************************************************** +;* SIMD-optimized functions for the DCA decoder +;* Copyright (C) 2016 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%define sizeof_float 4 +%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64) + +%macro LFE_FIR0_FLOAT 0 +cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2 + shr nblocksd, 1 + sub lfeq, 7*sizeof_float + mov cnt1d, 32*sizeof_float + mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET + lea coeffq, [coeffq+cnt1q*8] + add samplesq, cnt1q + neg cnt1q + +.loop: +%if cpuflag(avx) + cvtdq2ps m4, [lfeq+16] + cvtdq2ps m5, [lfeq ] + shufps m7, m4, m4, q0123 + shufps m6, m5, m5, q0123 +%elif cpuflag(sse2) + movu m4, [lfeq+16] + movu m5, [lfeq ] + cvtdq2ps m4, m4 + cvtdq2ps m5, m5 + pshufd m7, m4, q0123 + pshufd m6, m5, q0123 +%else + cvtpi2ps m4, [lfeq+16] + cvtpi2ps m0, [lfeq+24] + cvtpi2ps m5, [lfeq ] + cvtpi2ps m1, [lfeq+8 ] + shufps m4, m0, q1010 + shufps m5, m1, q1010 + shufps m7, m4, m4, q0123 + shufps m6, m5, m5, q0123 +%endif + +.inner_loop: +%if ARCH_X86_64 + movaps m8, 
[coeffq+cnt1q*8 ] + movaps m9, [coeffq+cnt1q*8+16] + movaps m10, [coeffq+cnt1q*8+32] + movaps m11, [coeffq+cnt1q*8+48] +%if cpuflag(fma3) + movaps m12, [coeffq+cnt1q*8+64] + movaps m13, [coeffq+cnt1q*8+80] + movaps m14, [coeffq+cnt1q*8+96] + movaps m15, [coeffq+cnt1q*8+112] + mulps m0, m7, m8 + mulps m1, m7, m10 + mulps m2, m7, m12 + mulps m3, m7, m14 + fmaddps m0, m6, m9, m0 + fmaddps m1, m6, m11, m1 + fmaddps m2, m6, m13, m2 + fmaddps m3, m6, m15, m3 + + haddps m0, m1 + haddps m2, m3 + haddps m0, m2 + movaps [samplesq+cnt1q], m0 +%else + mulps m0, m7, m8 + mulps m1, m6, m9 + mulps m2, m7, m10 + mulps m3, m6, m11 + addps m0, m1 + addps m2, m3 + + unpckhps m3, m0, m2 + unpcklps m0, m2 + addps m3, m0 + movhlps m2, m3 + addps m2, m3 + movlps [samplesq+cnt1q], m2 +%endif +%else ; ARCH_X86_32 +%if cpuflag(fma3) + mulps m0, m7, [coeffq+cnt1q*8 ] + movaps m1, [coeffq+cnt1q*8+16] + mulps m2, m7, [coeffq+cnt1q*8+32] + fmaddps m0, m6, m1, m0 + fmaddps m2, m6, [coeffq+cnt1q*8+48], m2 +%else + mulps m0, m7, [coeffq+cnt1q*8 ] + mulps m1, m6, [coeffq+cnt1q*8+16] + mulps m2, m7, [coeffq+cnt1q*8+32] + mulps m3, m6, [coeffq+cnt1q*8+48] + addps m0, m1 + addps m2, m3 +%endif + unpckhps m3, m0, m2 + unpcklps m0, m2 + addps m3, m0 + movhlps m2, m3 + addps m2, m3 + movlps [samplesq+cnt1q], m2 +%endif; ARCH + +%if ARCH_X86_64 +%if cpuflag(fma3) + mulps m8, m5 + mulps m10, m5 + mulps m12, m5 + mulps m14, m5 + fmaddps m8, m4, m9, m8 + fmaddps m10, m4, m11, m10 + fmaddps m12, m4, m13, m12 + fmaddps m14, m4, m15, m14 + + haddps m10, m8 + haddps m14, m12 + haddps m14, m10 + movaps [samplesq+cnt2q], m14 +%else + mulps m8, m5 + mulps m9, m4 + mulps m10, m5 + mulps m11, m4 + addps m8, m9 + addps m10, m11 + + unpckhps m11, m10, m8 + unpcklps m10, m8 + addps m11, m10 + movhlps m8, m11 + addps m8, m11 + movlps [samplesq+cnt2q], m8 +%endif +%else ; ARCH_X86_32 +%if cpuflag(fma3) + mulps m0, m5, [coeffq+cnt1q*8 ] + mulps m2, m5, [coeffq+cnt1q*8+32] + fmaddps m0, m4, m1, m0 + fmaddps m2, m4, 
[coeffq+cnt1q*8+48], m2 +%else + mulps m0, m5, [coeffq+cnt1q*8 ] + mulps m1, m4, [coeffq+cnt1q*8+16] + mulps m2, m5, [coeffq+cnt1q*8+32] + mulps m3, m4, [coeffq+cnt1q*8+48] + addps m0, m1 + addps m2, m3 +%endif + unpckhps m3, m2, m0 + unpcklps m2, m0 + addps m3, m2 + movhlps m0, m3 + addps m0, m3 + movlps [samplesq+cnt2q], m0 +%endif; ARCH + + sub cnt2d, 8 + FMA3_OFFSET + add cnt1q, 8 + FMA3_OFFSET + jl .inner_loop + + add lfeq, 4 + add samplesq, 64*sizeof_float + mov cnt1q, -32*sizeof_float + mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET + sub nblocksd, 1 + jg .loop + RET +%endmacro + +%if ARCH_X86_32 +INIT_XMM sse +LFE_FIR0_FLOAT +%endif +INIT_XMM sse2 +LFE_FIR0_FLOAT +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +LFE_FIR0_FLOAT +%endif +%if HAVE_FMA3_EXTERNAL +INIT_XMM fma3 +LFE_FIR0_FLOAT +%endif diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c new file mode 100644 index 0000000000..bfe13e5a71 --- /dev/null +++ b/libavcodec/x86/dcadsp_init.c @@ -0,0 +1,45 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/dcadsp.h" + +#define LFE_FIR_FLOAT_FUNC(opt) \ +void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \ + const float *filter_coeff, ptrdiff_t npcmblocks); + +LFE_FIR_FLOAT_FUNC(sse) +LFE_FIR_FLOAT_FUNC(sse2) +LFE_FIR_FLOAT_FUNC(avx) +LFE_FIR_FLOAT_FUNC(fma3) + +av_cold void ff_dcadsp_init_x86(DCADSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags)) + s->lfe_fir_float[0] = ff_lfe_fir0_float_sse; + if (EXTERNAL_SSE2(cpu_flags)) + s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2; + if (EXTERNAL_AVX(cpu_flags)) + s->lfe_fir_float[0] = ff_lfe_fir0_float_avx; + if (EXTERNAL_FMA3(cpu_flags)) + s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3; +} |