aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/sbrdsp.asm
diff options
context:
space:
mode:
authorMichael Niedermayer <michaelni@gmx.at>2012-02-25 04:00:43 +0100
committerMichael Niedermayer <michaelni@gmx.at>2012-02-25 04:00:43 +0100
commitb008ac18bb6072acb355445436a999c940538d84 (patch)
tree29d0042d7a4d0bc64f452440c2060a13a1e00e51 /libavcodec/x86/sbrdsp.asm
parent7b9d8703f35585b065c32194b52131b7dd90c710 (diff)
parentd6a77e2b97f3968b99798faeb70e873eb5910849 (diff)
downloadffmpeg-b008ac18bb6072acb355445436a999c940538d84.tar.gz
Merge remote-tracking branch 'qatar/master'
* qatar/master: docs: use -bsf:[vas] instead of -[vas]bsf. mpegaudiodec: Prevent premature clipping of mp3 input buffer. lavf: move the packet keyframe setting code. oggenc: free comment header for all codecs lcl: error out if uncompressed input buffer is smaller than framesize. mjpeg: abort decoding if packet is too large. golomb: use HAVE_BITS_REMAINING() macro to prevent infloop on EOF. get_bits: add HAVE_BITS_REMAINING macro. lavf/output-example: use new audio encoding API correctly. lavf/output-example: more proper usage of the new API. tiff: Prevent overreads in the type_sizes array. tiff: Make the TIFF_LONG and TIFF_SHORT types unsigned. apetag: do not leak memory if avio_read() fails apetag: propagate errors. SBR DSP x86: implement SSE sbr_hf_g_filt SBR DSP x86: implement SSE sbr_sum_square_sse SBR DSP: use intptr_t for the ixh parameter. Conflicts: doc/bitstream_filters.texi doc/examples/muxing.c doc/ffmpeg.texi libavcodec/golomb.h libavcodec/x86/Makefile libavformat/oggenc.c Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/sbrdsp.asm')
-rw-r--r--libavcodec/x86/sbrdsp.asm114
1 files changed, 114 insertions, 0 deletions
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
new file mode 100644
index 0000000000..c165c52ca4
--- /dev/null
+++ b/libavcodec/x86/sbrdsp.asm
@@ -0,0 +1,114 @@
+;******************************************************************************
+;* AAC Spectral Band Replication decoding functions
+;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+; Neither function in this file references a data label, so the read-only
+; data section is left disabled and only the code section is opened.
+;SECTION_RODATA
+SECTION .text
+
+INIT_XMM sse
+;------------------------------------------------------------------------------
+; sbr_sum_square: sum of the squares of an array of floats (SSE1 only).
+;
+; Expected C prototype (declared outside this file -- confirm):
+;     float sbr_sum_square(float (*x)[2], int n)
+;
+; In:   r0  = x, start of the float data (only unaligned loads are used)
+;       r1d = n, number of float pairs. n must be even: the tail loop works
+;             on two pairs at a time, so an odd trailing pair is not summed.
+; Out:  the sum, in the low lane of m0; on x86-32 it is also loaded onto the
+;       x87 stack, overwriting the stack slot of argument 0 on the way.
+; Uses: r2, m0-m5
+;------------------------------------------------------------------------------
+cglobal sbr_sum_square, 2, 3, 6
+    mov         r2d, r1d            ; n is a 32-bit int: the upper half of r1 is
+                                    ; undefined on x86-64, so only use r1d/r2d
+    xorps       m0, m0              ; m0 and m1 are two independent accumulators
+    xorps       m1, m1
+    sar         r2d, 3              ; r2d = n / 8 = number of 16-float iterations
+    jz          .prepare
+.loop:
+    movu        m2, [r0 +  0]
+    movu        m3, [r0 + 16]
+    movu        m4, [r0 + 32]
+    movu        m5, [r0 + 48]
+    mulps       m2, m2
+    mulps       m3, m3
+    mulps       m4, m4
+    mulps       m5, m5
+    addps       m0, m2
+    addps       m1, m3
+    addps       m0, m4
+    addps       m1, m5
+    add         r0, 64
+    dec         r2d
+    jnz         .loop
+.prepare:
+    and         r1d, 7              ; pairs left over after the unrolled loop
+    sar         r1d, 1              ; r1d = number of 2-pair (4-float) iterations
+    jz          .end
+; n is even, so every remaining iteration has a full 4 floats to read
+.endloop:
+    movu        m2, [r0]
+    add         r0, 16
+    mulps       m2, m2
+    dec         r1d                 ; flags from dec survive the addps below
+    addps       m0, m2
+    jnz         .endloop
+.end:
+    addps       m0, m1              ; merge the two accumulators
+    movhlps     m2, m0              ; fold the upper two lanes onto the lower two
+    addps       m0, m2
+    movss       m1, m0              ; m1[0] = lane 0
+    shufps      m0, m0, 1           ; m0[0] = lane 1
+    addss       m0, m1              ; m0[0] = total
+%if ARCH_X86_64 == 0
+    ; 32-bit ABIs return floats in st0. The store uses movss rather than movd:
+    ; movd with an XMM source is an SSE2 instruction and would fault on the
+    ; SSE1-only CPUs this function is meant to support.
+    movss       r0m, m0
+    fld         dword r0m
+%endif
+    RET
+
+;------------------------------------------------------------------------------
+; sbr_hf_g_filt: multiply one float pair per X_high row by a per-row gain
+; (SSE1 only).
+;
+; Expected C prototype (declared outside this file -- confirm):
+;     void sbr_hf_g_filt(float (*Y)[2], const float (*X_high)[40][2],
+;                        const float *g_filt, int m_max, intptr_t ixh)
+;
+; For each m in [0, m_max):
+;     Y[m][0] = X_high[m][ixh][0] * g_filt[m]
+;     Y[m][1] = X_high[m][ixh][1] * g_filt[m]
+;
+; In:   r0 = Y, r1 = X_high, r2 = g_filt, r3 = m_max, r4 = ixh
+; Note: m_max is masked with 0xFC and 3 below, so only its low 8 bits are
+;       honoured; this also discards undefined upper register bits on x86-64.
+; Uses: r5, m0-m3
+;------------------------------------------------------------------------------
+%define STEP 40*4*2                 ; byte distance between consecutive rows read
+                                    ; from r1 (presumably 40 pairs of 4-byte floats)
+cglobal sbr_hf_g_filt, 5, 6, 5
+    lea         r1, [r1 + 8*r4]     ; offset by ixh elements into X_high
+    mov         r5, r3              ; keep m_max for the tail count
+    and         r3, 0xFC            ; r3 = elements handled 4 at a time
+    lea         r2, [r2 + r3*4]     ; point g_filt and Y at the end of that part,
+    lea         r0, [r0 + r3*8]     ; then index backwards with a negative r3
+    neg         r3
+    jz          .tail               ; fewer than 4 elements: entering .loop4 with
+                                    ; r3 == 0 would never terminate
+.loop4:
+    ; movlps/unpcklps are the SSE1 equivalents of the SSE2 movq/punpckldq forms;
+    ; the lanes left undefined by movlps are overwritten or discarded below.
+    movlps      m0, [r2 + 4*r3 + 0] ; g0 g1
+    movlps      m1, [r2 + 4*r3 + 8] ; g2 g3
+    movlps      m2, [r1 + 0*STEP]   ; row 0 pair in the low half
+    movlps      m3, [r1 + 2*STEP]   ; row 2 pair in the low half
+    movhps      m2, [r1 + 1*STEP]   ; row 1 pair in the high half
+    movhps      m3, [r1 + 3*STEP]   ; row 3 pair in the high half
+    unpcklps    m0, m0              ; g0 g0 g1 g1
+    unpcklps    m1, m1              ; g2 g2 g3 g3
+    mulps       m0, m2
+    mulps       m1, m3
+    movu        [r0 + 8*r3 +  0], m0
+    movu        [r0 + 8*r3 + 16], m1
+    add         r1, 4*STEP
+    add         r3, 4
+    jnz         .loop4
+.tail:
+    and         r5, 3               ; number of single element loops
+    jz          .end
+.loop1:                             ; both floats of one pair are computed at once
+    movss       m0, [r2]            ; g 0 0 0
+    movlps      m2, [r1]
+    unpcklps    m0, m0              ; g g 0 0
+    mulps       m2, m0
+    movlps      [r0], m2            ; only the low pair is stored
+    add         r0, 8
+    add         r2, 4
+    add         r1, STEP
+    dec         r5
+    jnz         .loop1
+.end:
+    RET