diff options
author | Rostislav Pehlivanov <atomnuker@gmail.com> | 2016-10-08 15:59:14 +0100 |
---|---|---|
committer | Rostislav Pehlivanov <atomnuker@gmail.com> | 2016-10-18 21:41:18 +0100 |
commit | d2ae5f77c61a29c3c63cc4c41c74ccfca4167649 (patch) | |
tree | bd615f22e6807e025da052c411e12e9a7be0852e /libavcodec/x86/aacencdsp.asm | |
parent | 3b02f6dd7be880fd6c1bcaf2fd0c1314dcee7aa6 (diff) | |
download | ffmpeg-d2ae5f77c61a29c3c63cc4c41c74ccfca4167649.tar.gz |
aacenc: add SIMD optimizations for abs_pow34 and quantization
Performance improvements:
quant_bands:
with: 681 decicycles in quant_bands, 8388453 runs, 155 skips
without: 1190 decicycles in quant_bands, 8388386 runs, 222 skips
Around 42% for the function
Twoloop coder:
abs_pow34:
with/without: 7.82s/8.17s
Around 4% for the entire encoder
Both:
with/without: 7.15s/8.17s
Around 12% for the entire encoder
Fast coder:
abs_pow34:
with/without: 3.40s/3.77s
Around 10% for the entire encoder
Both:
with/without: 3.02s/3.77s
Around 20% faster for the entire encoder
Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com>
Tested-by: Michael Niedermayer <michael@niedermayer.cc>
Reviewed-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86/aacencdsp.asm')
-rw-r--r-- | libavcodec/x86/aacencdsp.asm | 86 |
1 files changed, 86 insertions, 0 deletions
diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
new file mode 100644
index 0000000000..97af571ec8
--- /dev/null
+++ b/libavcodec/x86/aacencdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD optimized AAC encoder DSP functions
+;*
+;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+float_abs_mask: times 4 dd 0x7fffffff ; every bit except the IEEE-754 sign bit, one copy per lane
+
+SECTION .text
+
+;*******************************************************************
+;void ff_abs_pow34(float *out, const float *in, const int size);
+;*******************************************************************
+INIT_XMM sse                            ; out[i] = |in[i]|^(3/4), one XMM vector of floats per iteration
+cglobal abs_pow34, 3, 3, 3, out, in, size
+    mova   m2, [float_abs_mask]         ; m2 = mask that clears the sign bit of each float
+    shl    sized, 2                     ; floats -> bytes; 32-bit op, so stale upper bits of the int arg are zeroed
+    add    inq, sizeq                   ; point in/out one past the end of the data ...
+    add    outq, sizeq
+    neg    sizeq                        ; ... and walk a negative byte offset up towards 0
+.loop:
+    andps  m0, m2, [inq+sizeq]          ; m0 = |in|
+    sqrtps m1, m0                       ; m1 = |in|^(1/2)
+    mulps  m0, m1                       ; m0 = |in|^(3/2)
+    sqrtps m0, m0                       ; m0 = |in|^(3/4)
+    mova   [outq+sizeq], m0
+    add    sizeq, mmsize                ; advance by one vector
+    jl     .loop                        ; the add left the offset negative: more data remains
+    RET
+
+;*******************************************************************
+;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
+;                           int size, int is_signed, int maxval, const float Q34,
+;                           const float rounding)
+;*******************************************************************
+INIT_XMM sse2                           ; out[i] = (int)min(scaled[i]*Q34 + rounding, maxval), sign taken from in[i] if is_signed
+cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
+%if UNIX64 == 0
+    movss     m0, Q34m                  ; not UNIX64: fetch both float arguments from their argument slots
+    movss     m1, roundingm
+    cvtsi2ss  m3, dword maxvalm         ; m3 = (float)maxval
+%else
+    cvtsi2ss  m3, maxvald               ; UNIX64: m0/m1 are used as they arrive (presumably Q34/rounding per the ABI)
+%endif
+    shufps    m0, m0, 0                 ; broadcast Q34 to every lane
+    shufps    m1, m1, 0                 ; broadcast rounding
+    shufps    m3, m3, 0                 ; broadcast (float)maxval
+    shl       is_signedd, 31            ; move bit 0 of is_signed into the float sign-bit position
+    movd      m4, is_signedd
+    shufps    m4, m4, 0                 ; m4 = per-lane sign mask: 0x80000000 or 0
+    shl       sized, 2                  ; floats -> bytes
+    add       inq, sizeq                ; point all three buffers one past the end ...
+    add       outq, sizeq
+    add       scaledq, sizeq
+    neg       sizeq                     ; ... and walk a negative byte offset up towards 0
+.loop:
+    mulps     m2, m0, [scaledq+sizeq]   ; scaled * Q34
+    addps     m2, m1                    ; + rounding
+    minps     m2, m3                    ; clamp to maxval
+    andps     m5, m4, [inq+sizeq]       ; sign bit of in[], kept only when is_signed is set
+    orps      m2, m5                    ; OR that sign bit into the clamped value
+    cvttps2dq m2, m2                    ; convert to int32, truncating toward zero
+    mova      [outq+sizeq], m2
+    add       sizeq, mmsize             ; advance by one vector
+    jl        .loop                     ; the add left the offset negative: more data remains
+    RET