diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-02-23 19:48:58 +0100 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-02-23 15:50:06 -0800 |
commit | 34454c761f01275d4adaf40df6d70a59011c4a6c (patch) | |
tree | a25a23c028ddee97c1195567f855ce064bdbe916 /libavcodec/x86/sbrdsp.asm | |
parent | 2e74a5abc2fda6cfbc86589852d6194d502332cb (diff) | |
download | ffmpeg-34454c761f01275d4adaf40df6d70a59011c4a6c.tar.gz |
SBR DSP x86: implement SSE sbr_sum_square_sse
The 32bits targets have been compiled with -mfpmath=sse for proper reference.
sbr_sum_square C /32bits: 82c (unrolled)/102c
C /64bits: 69c (unrolled)/82c
SSE/32bits: 42c
SSE/64bits: 31c
Use of SSE4.1 dpps to perform the final sum is slower.
Not unrolling to perform 8 operations in a loop yields 10 more cycles.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec/x86/sbrdsp.asm')
-rw-r--r-- | libavcodec/x86/sbrdsp.asm | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm new file mode 100644 index 0000000000..71471bd5ab --- /dev/null +++ b/libavcodec/x86/sbrdsp.asm @@ -0,0 +1,74 @@ +;****************************************************************************** +;* AAC Spectral Band Replication decoding functions +;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +;SECTION_RODATA +SECTION .text + +INIT_XMM sse +cglobal sbr_sum_square, 2, 3, 6 + mov r2, r1 + xorps m0, m0 + xorps m1, m1 + sar r2, 3 + jz .prepare +.loop: + movu m2, [r0 + 0] + movu m3, [r0 + 16] + movu m4, [r0 + 32] + movu m5, [r0 + 48] + mulps m2, m2 + mulps m3, m3 + mulps m4, m4 + mulps m5, m5 + addps m0, m2 + addps m1, m3 + addps m0, m4 + addps m1, m5 + add r0, 64 + dec r2 + jnz .loop +.prepare: + and r1, 7 + sar r1, 1 + jz .end +; len is a multiple of 2, thus there are at least 4 elements to process +.endloop: + movu m2, [r0] + add r0, 16 + mulps m2, m2 + dec r1 + addps m0, m2 + jnz .endloop +.end: + addps m0, m1 + movhlps m2, m0 + addps m0, m2 + movss m1, m0 + shufps m0, m0, 1 + addss m0, m1 +%if ARCH_X86_64 == 0 + movd r0m, m0 + fld dword r0m +%endif + RET |