aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/aacencdsp.asm
diff options
context:
space:
mode:
author Rostislav Pehlivanov <atomnuker@gmail.com> 2016-10-08 15:59:14 +0100
committer Rostislav Pehlivanov <atomnuker@gmail.com> 2016-10-18 21:41:18 +0100
commitd2ae5f77c61a29c3c63cc4c41c74ccfca4167649 (patch)
treebd615f22e6807e025da052c411e12e9a7be0852e /libavcodec/x86/aacencdsp.asm
parent3b02f6dd7be880fd6c1bcaf2fd0c1314dcee7aa6 (diff)
downloadffmpeg-d2ae5f77c61a29c3c63cc4c41c74ccfca4167649.tar.gz
aacenc: add SIMD optimizations for abs_pow34 and quantization
Performance improvements:

quant_bands:
with: 681 decicycles in quant_bands, 8388453 runs, 155 skips
without: 1190 decicycles in quant_bands, 8388386 runs, 222 skips
Around 42% for the function

Twoloop coder:

abs_pow34:
with/without: 7.82s/8.17s
Around 4% for the entire encoder

Both:
with/without: 7.15s/8.17s
Around 12% for the entire encoder

Fast coder:

abs_pow34:
with/without: 3.40s/3.77s
Around 10% for the entire encoder

Both:
with/without: 3.02s/3.77s
Around 20% faster for the entire encoder

Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com>
Tested-by: Michael Niedermayer <michael@niedermayer.cc>
Reviewed-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86/aacencdsp.asm')
-rw-r--r--libavcodec/x86/aacencdsp.asm86
1 files changed, 86 insertions, 0 deletions
diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
new file mode 100644
index 0000000000..97af571ec8
--- /dev/null
+++ b/libavcodec/x86/aacencdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD optimized AAC encoder DSP functions
+;*
+;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+float_abs_mask: times 4 dd 0x7fffffff
+
+SECTION .text
+
+;*******************************************************************
+;void ff_abs_pow34(float *out, const float *in, const int size);
+;*******************************************************************
+INIT_XMM sse                            ; out[i] = |in[i]|^(3/4), one 16-byte vector (4 floats) per pass
+cglobal abs_pow34, 3, 3, 3, out, in, size
+    mova      m2, [float_abs_mask]      ; m2 = 0x7fffffff in every lane: clears the float sign bit
+    shl       sized, 2                  ; floats -> bytes; 32-bit op, so undefined upper bits of the int arg are zeroed
+    add       inq, sizeq                ; move both pointers to the end of their buffers ...
+    add       outq, sizeq
+    neg       sizeq                     ; ... and index them with a negative byte offset counting up to 0
+.loop:                                  ; body runs before the test: presumably size > 0 and a multiple of 4 (confirm callers)
+    andps     m0, m2, [inq+sizeq]       ; m0 = |x|  (memory operand: in presumably 16-byte aligned)
+    sqrtps    m1, m0                    ; m1 = |x|^(1/2)
+    mulps     m0, m1                    ; m0 = |x|^(3/2)
+    sqrtps    m0, m0                    ; m0 = |x|^(3/4)
+    mova      [outq+sizeq], m0          ; aligned store: out presumably 16-byte aligned as well
+    add       sizeq, mmsize             ; step to the next vector; sets the flags consumed by jl
+    jl        .loop                     ; continue while the offset is still negative
+    RET
+
+;*******************************************************************
+;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
+; int size, int is_signed, int maxval, const float Q34,
+; const float rounding)
+;*******************************************************************
+INIT_XMM sse2                           ; out[i] = (int)min(scaled[i]*Q34 + rounding, maxval), sign taken from in[i] if is_signed
+cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
+%if UNIX64
+    cvtsi2ss  m3, maxvald               ; maxval read from its register; Q34/rounding are not loaded here (used as-is from m0/m1)
+%else
+    cvtsi2ss  m3, dword maxvalm         ; maxval read from its memory slot, converted to float
+    movss     m1, roundingm             ; m1 (lane 0) = rounding
+    movss     m0, Q34m                  ; m0 (lane 0) = Q34
+%endif
+    shl       sized, 2                  ; element count -> byte count (32-bit op zero-extends into sizeq)
+    add       scaledq, sizeq            ; advance all three pointers to the end of their buffers ...
+    add       outq, sizeq
+    add       inq, sizeq
+    neg       sizeq                     ; ... so the loop can use a negative offset that counts up to 0
+    shl       is_signedd, 31            ; bit 0 of is_signed moves to the float sign-bit position
+    movd      m4, is_signedd
+    shufps    m4, m4, 0                 ; m4 = sign mask in all lanes (0x80000000 for is_signed == 1, 0 for 0)
+    shufps    m3, m3, 0                 ; m3 = (float)maxval in all lanes
+    shufps    m1, m1, 0                 ; m1 = rounding in all lanes
+    shufps    m0, m0, 0                 ; m0 = Q34 in all lanes
+.loop:                                  ; one 16-byte vector per pass; body runs at least once
+    andps     m5, m4, [inq+sizeq]       ; m5 = sign bits of in[] kept by the mask
+    mulps     m2, m0, [scaledq+sizeq]   ; m2 = scaled * Q34
+    addps     m2, m1                    ; m2 += rounding
+    minps     m2, m3                    ; clamp to maxval
+    orps      m2, m5                    ; apply the extracted sign to the clamped magnitude
+    cvttps2dq m2, m2                    ; truncate toward zero to int32
+    mova      [outq+sizeq], m2          ; aligned store of four ints
+    add       sizeq, mmsize             ; next vector; sets the flags consumed by jl
+    jl        .loop                     ; continue while the offset is still negative
+    RET