diff options
author | James Almer <jamrial@gmail.com> | 2014-10-02 20:38:01 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2014-10-02 22:11:55 -0300 |
commit | 0de1d6287ec384e795f5ef1d006589dd18ff95e9 (patch) | |
tree | e3a6433fbeddac7f59dbe92e8e007b2a70f6a132 /libavcodec/x86/mlpdsp.asm | |
parent | ddb813b0ef8f135679020240fdca41c29976c23a (diff) | |
download | ffmpeg-0de1d6287ec384e795f5ef1d006589dd18ff95e9.tar.gz |
x86/mlpdec: add ff_mlp_rematrix_channel_{sse4,avx2}
2x to 2.5x faster than the C version.
Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86/mlpdsp.asm')
-rw-r--r-- | libavcodec/x86/mlpdsp.asm | 196 |
1 files changed, 196 insertions, 0 deletions
diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm new file mode 100644 index 0000000000..ce656af145 --- /dev/null +++ b/libavcodec/x86/mlpdsp.asm @@ -0,0 +1,196 @@ +;****************************************************************************** +;* SIMD-optimized MLP DSP functions +;* Copyright (c) 2014 James Almer <jamrial@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +%if ARCH_X86_64 + +%macro SHLX 2 +%if cpuflag(bmi2) + shlx %1, %1, %2q +%else + shl %1, %2b +%endif +%endmacro + +%macro REMATRIX 0 + movdqa m0, [samplesq] + movdqa m1, [coeffsq ] + pshufd m2, m0, q2301 + pshufd m3, m1, q2301 + pmuldq m0, m1 + pmuldq m3, m2 + paddq m0, m3 +%if notcpuflag(avx2) + movdqa m1, [samplesq + 16] + movdqa m2, [coeffsq + 16] + pshufd m3, m1, q2301 + pshufd m4, m2, q2301 + pmuldq m1, m2 + pmuldq m4, m3 + paddq m0, m1 + paddq m0, m4 +%else + vextracti128 xm1, m0, 1 + paddq xm0, xm1 +%endif +%endmacro + +%macro LOOP_END 0 + pshufd xm1, xm0, q0032 + paddq xm0, xm1 + movq accumq, xm0 + movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs + sar accumq, 14 ; accum >>= 14 + and accumd, maskd ; accum &= mask + add accumd, blsbsd ; accum += *bypassed_lsbs + mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum + add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS; + add samplesq, 32 ; samples += MAX_CHANNELS; + cmp blsbs_ptrq, cntq +%endmacro + +%macro LOOP_SHIFT_END 0 + pshufd xm1, xm0, q0032 + paddq xm0, xm1 + movq accumq, xm0 + and indexd, auspd ; index &= access_unit_size_pow2; + movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index] + add indexd, index2d ; index += index2 + SHLX noiseq, mns ; noise_buffer[index] <<= matrix_noise_shift + add accumq, noiseq ; accum += noise_buffer[index] + movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register) + sar accumq, 14 ; accum >>= 14 + and accumd, maskd ; accum &= mask + add accumd, noised ; accum += *bypassed_lsbs + mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum + add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS; + add samplesq, 32 ; samples += MAX_CHANNELS; + cmp blsbs_ptrq, cntq +%endmacro + +;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs, +; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer, +; int index, unsigned int dest_ch, uint16_t blockpos, +; unsigned int maxchan, int matrix_noise_shift, +; int access_unit_size_pow2, int32_t mask) +%macro MLP_REMATRIX_CHANNEL 0 +cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \ + index, dest_ch, blockpos, maxchan, mns, \ + accum, mask, cnt + mov mnsd, mnsm ; load matrix_noise_shift + movzx blockposq, word blockposm ; load and zero extend blockpos (16bit) + mov maxchand, maxchanm ; load maxchan + mov maskd, maskm ; load mask +%if WIN64 + mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64) +%endif + shl dest_chd, 2 + lea cntq, [blsbs_ptrq + blockposq*8] + test mnsd, mnsd ; is matrix_noise_shift != 0? + jne .shift ; jump if true + cmp maxchand, 4 ; is maxchan < 4? + jl .loop4 ; jump if true + +align 16 +.loop8: + ; Process 5 or more channels + REMATRIX + LOOP_END + jne .loop8 + RET + +align 16 +.loop4: + ; Process up to 4 channels + movdqa xm0, [samplesq] + movdqa xm1, [coeffsq ] + pshufd xm2, xm0, q2301 + pshufd xm3, xm1, q2301 + pmuldq xm0, xm1 + pmuldq xm3, xm2 + paddq xm0, xm3 + LOOP_END + jne .loop4 + RET + +.shift: +%if WIN64 + mov indexd, indexm ; load index (not needed on UNIX64) +%endif + mov r9d, r9m ; load access_unit_size_pow2 +%if cpuflag(bmi2) + ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place. + DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \ + index, dest_ch, accum, index2, mns, \ + ausp, mask, cnt, noise + add mnsd, 7 ; matrix_noise_shift += 7 +%else ; sse4 + mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift +%if WIN64 + ; r0 = rcx + DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \ + index2, accum, ausp, mask, cnt, noise +%else ; UNIX64 + ; r3 = rcx + DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \ + index2, accum, ausp, mask, cnt, noise +%endif + lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7 +%endif ; cpuflag + sub auspd, 1 ; access_unit_size_pow2 -= 1 + cmp r7d, 4 ; is maxchan < 4? + lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; + jl .loop4_shift ; jump if maxchan < 4 + +align 16 +.loop8_shift: + ; Process 5 or more channels + REMATRIX + LOOP_SHIFT_END + jne .loop8_shift + RET + +align 16 +.loop4_shift: + ; Process up to 4 channels + movdqa xm0, [samplesq] + movdqa xm1, [coeffsq ] + pshufd xm2, xm0, q2301 + pshufd xm3, xm1, q2301 + pmuldq xm0, xm1 + pmuldq xm3, xm2 + paddq xm0, xm3 + LOOP_SHIFT_END + jne .loop4_shift + RET +%endmacro + +INIT_XMM sse4 +MLP_REMATRIX_CHANNEL +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2, bmi2 +MLP_REMATRIX_CHANNEL +%endif + +%endif ; ARCH_X86_64 |