diff options
author | James Almer <jamrial@gmail.com> | 2014-10-02 20:38:01 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2014-10-02 22:11:55 -0300 |
commit | 0de1d6287ec384e795f5ef1d006589dd18ff95e9 (patch) | |
tree | e3a6433fbeddac7f59dbe92e8e007b2a70f6a132 | |
parent | ddb813b0ef8f135679020240fdca41c29976c23a (diff) | |
download | ffmpeg-0de1d6287ec384e795f5ef1d006589dd18ff95e9.tar.gz |
x86/mlpdec: add ff_mlp_rematrix_channel_{sse4,avx2}
2x to 2.5x faster than the C version.
Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/mlpdec.c | 4 | ||||
-rw-r--r-- | libavcodec/x86/Makefile | 6 | ||||
-rw-r--r-- | libavcodec/x86/mlpdsp.asm | 196 | ||||
-rw-r--r-- | libavcodec/x86/mlpdsp_init.c (renamed from libavcodec/x86/mlpdsp.c) | 22 |
4 files changed, 223 insertions, 5 deletions
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c index 2c5426c242..d26c27777e 100644 --- a/libavcodec/mlpdec.c +++ b/libavcodec/mlpdec.c @@ -105,7 +105,7 @@ typedef struct SubStream { /// Whether the LSBs of the matrix output are encoded in the bitstream. uint8_t lsb_bypass[MAX_MATRICES]; /// Matrix coefficients, stored as 2.14 fixed point. - int32_t matrix_coeff[MAX_MATRICES][MAX_CHANNELS]; + DECLARE_ALIGNED(32, int32_t, matrix_coeff)[MAX_MATRICES][MAX_CHANNELS]; /// Left shift to apply to noise values in 0x31eb substreams. uint8_t matrix_noise_shift[MAX_MATRICES]; //@} @@ -159,7 +159,7 @@ typedef struct MLPDecodeContext { int8_t noise_buffer[MAX_BLOCKSIZE_POW2]; int8_t bypassed_lsbs[MAX_BLOCKSIZE][MAX_CHANNELS]; - int32_t sample_buffer[MAX_BLOCKSIZE][MAX_CHANNELS]; + DECLARE_ALIGNED(32, int32_t, sample_buffer)[MAX_BLOCKSIZE][MAX_CHANNELS]; MLPDSPContext dsp; } MLPDecodeContext; diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 9f34abddde..2fa56b9aab 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -41,7 +41,7 @@ OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o -OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o +OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o @@ -52,7 +52,7 @@ OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \ OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o -OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o +OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o @@ -132,6 +132,7 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_mc.o \ x86/hevc_deblock.o \ x86/hevc_idct.o \ x86/hevc_res_add.o +YASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o @@ -139,6 +140,7 @@ YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ x86/rv40dsp.o YASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o +YASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o YASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm new file mode 100644 index 0000000000..ce656af145 --- /dev/null +++ b/libavcodec/x86/mlpdsp.asm @@ -0,0 +1,196 @@ +;****************************************************************************** +;* SIMD-optimized MLP DSP functions +;* Copyright (c) 2014 James Almer <jamrial@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +%if ARCH_X86_64 + +%macro SHLX 2 +%if cpuflag(bmi2) + shlx %1, %1, %2q +%else + shl %1, %2b +%endif +%endmacro + +%macro REMATRIX 0 + movdqa m0, [samplesq] + movdqa m1, [coeffsq ] + pshufd m2, m0, q2301 + pshufd m3, m1, q2301 + pmuldq m0, m1 + pmuldq m3, m2 + paddq m0, m3 +%if notcpuflag(avx2) + movdqa m1, [samplesq + 16] + movdqa m2, [coeffsq + 16] + pshufd m3, m1, q2301 + pshufd m4, m2, q2301 + pmuldq m1, m2 + pmuldq m4, m3 + paddq m0, m1 + paddq m0, m4 +%else + vextracti128 xm1, m0, 1 + paddq xm0, xm1 +%endif +%endmacro + +%macro LOOP_END 0 + pshufd xm1, xm0, q0032 + paddq xm0, xm1 + movq accumq, xm0 + movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs + sar accumq, 14 ; accum >>= 14 + and accumd, maskd ; accum &= mask + add accumd, blsbsd ; accum += *bypassed_lsbs + mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum + add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS; + add samplesq, 32 ; samples += MAX_CHANNELS; + cmp blsbs_ptrq, cntq +%endmacro + +%macro LOOP_SHIFT_END 0 + pshufd xm1, xm0, q0032 + paddq xm0, xm1 + movq accumq, xm0 + and indexd, auspd ; index &= access_unit_size_pow2; + movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index] + add indexd, index2d ; index += index2 + SHLX noiseq, mns ; noise_buffer[index] <<= matrix_noise_shift + add accumq, noiseq ; accum += noise_buffer[index] + movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register) + sar accumq, 14 ; accum >>= 14 + and accumd, maskd ; accum &= mask + add accumd, noised ; accum += *bypassed_lsbs + mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum + add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS; + add samplesq, 32 ; samples += MAX_CHANNELS; + cmp blsbs_ptrq, cntq +%endmacro + +;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs, +; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer, +; int index, unsigned int dest_ch, uint16_t blockpos, +; unsigned int maxchan, int matrix_noise_shift, +; int access_unit_size_pow2, int32_t mask) +%macro MLP_REMATRIX_CHANNEL 0 +cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \ + index, dest_ch, blockpos, maxchan, mns, \ + accum, mask, cnt + mov mnsd, mnsm ; load matrix_noise_shift + movzx blockposq, word blockposm ; load and zero extend blockpos (16bit) + mov maxchand, maxchanm ; load maxchan + mov maskd, maskm ; load mask +%if WIN64 + mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64) +%endif + shl dest_chd, 2 + lea cntq, [blsbs_ptrq + blockposq*8] + test mnsd, mnsd ; is matrix_noise_shift != 0? + jne .shift ; jump if true + cmp maxchand, 4 ; is maxchan < 4? + jl .loop4 ; jump if true + +align 16 +.loop8: + ; Process 5 or more channels + REMATRIX + LOOP_END + jne .loop8 + RET + +align 16 +.loop4: + ; Process up to 4 channels + movdqa xm0, [samplesq] + movdqa xm1, [coeffsq ] + pshufd xm2, xm0, q2301 + pshufd xm3, xm1, q2301 + pmuldq xm0, xm1 + pmuldq xm3, xm2 + paddq xm0, xm3 + LOOP_END + jne .loop4 + RET + +.shift: +%if WIN64 + mov indexd, indexm ; load index (not needed on UNIX64) +%endif + mov r9d, r9m ; load access_unit_size_pow2 +%if cpuflag(bmi2) + ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place. + DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \ + index, dest_ch, accum, index2, mns, \ + ausp, mask, cnt, noise + add mnsd, 7 ; matrix_noise_shift += 7 +%else ; sse4 + mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift +%if WIN64 + ; r0 = rcx + DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \ + index2, accum, ausp, mask, cnt, noise +%else ; UNIX64 + ; r3 = rcx + DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \ + index2, accum, ausp, mask, cnt, noise +%endif + lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7 +%endif ; cpuflag + sub auspd, 1 ; access_unit_size_pow2 -= 1 + cmp r7d, 4 ; is maxchan < 4? + lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; + jl .loop4_shift ; jump if maxchan < 4 + +align 16 +.loop8_shift: + ; Process 5 or more channels + REMATRIX + LOOP_SHIFT_END + jne .loop8_shift + RET + +align 16 +.loop4_shift: + ; Process up to 4 channels + movdqa xm0, [samplesq] + movdqa xm1, [coeffsq ] + pshufd xm2, xm0, q2301 + pshufd xm3, xm1, q2301 + pmuldq xm0, xm1 + pmuldq xm3, xm2 + paddq xm0, xm3 + LOOP_SHIFT_END + jne .loop4_shift + RET +%endmacro + +INIT_XMM sse4 +MLP_REMATRIX_CHANNEL +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2, bmi2 +MLP_REMATRIX_CHANNEL +%endif + +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c index f090fd79b4..dc0bc585c7 100644 --- a/libavcodec/x86/mlpdsp.c +++ b/libavcodec/x86/mlpdsp_init.c @@ -26,6 +26,22 @@ #include "libavcodec/mlpdsp.h" #include "libavcodec/mlp.h" +#define REMATRIX_CHANNEL_FUNC(opt) \ +void ff_mlp_rematrix_channel_##opt(int32_t *samples, \ + const int32_t *coeffs, \ + const uint8_t *bypassed_lsbs, \ + const int8_t *noise_buffer, \ + int index, \ + unsigned int dest_ch, \ + uint16_t blockpos, \ + unsigned int maxchan, \ + int matrix_noise_shift, \ + int access_unit_size_pow2, \ + int32_t mask); + +REMATRIX_CHANNEL_FUNC(sse4) +REMATRIX_CHANNEL_FUNC(avx2_bmi2) + #if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS extern char ff_mlp_firorder_8; @@ -178,9 +194,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c) { -#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS int cpu_flags = av_get_cpu_flags(); +#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS if (INLINE_MMX(cpu_flags)) c->mlp_filter_channel = mlp_filter_channel_x86; #endif + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags)) + c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4; + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2) + c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2; } |