aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/aacpsdsp.asm
diff options
context:
space:
mode:
authorJames Almer <jamrial@gmail.com>2017-06-18 22:33:27 -0300
committerJames Almer <jamrial@gmail.com>2017-06-18 22:33:27 -0300
commite229df9478b2c6d476ea44d0e492609d4ab4e0c5 (patch)
tree77ffac0e550eb344d49c505ddda0af39d4a94a4a /libavcodec/x86/aacpsdsp.asm
parent3c5a53cdfa099bba8bd951f95b85727b4b3b5d68 (diff)
downloadffmpeg-e229df9478b2c6d476ea44d0e492609d4ab4e0c5.tar.gz
x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}
About 2x faster than the c version.
Diffstat (limited to 'libavcodec/x86/aacpsdsp.asm')
-rw-r--r--libavcodec/x86/aacpsdsp.asm123
1 files changed, 123 insertions, 0 deletions
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index 67d3bd4e07..66bbbf4e44 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -166,6 +166,129 @@ align 16
jl .loop
REP_RET
+;***********************************************************
+;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
+; float (*in)[32][2],
+; int i, int len)
+;***********************************************************
+%macro HYBRID_SYNTHESIS_DEINT 0
+cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
+%if cpuflag(sse4)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+ movsxdifnidn iq, id
+ mov lend, 32 << 3
+ lea outq, [outq+iq*4]
+ mov tmpd, id
+ shl tmpd, 8
+ add inq, tmpq
+ mov tmpd, 64
+ sub tmpd, id
+ mov id, tmpd
+
+ test id, 1
+ jne .loop4
+ test id, 2
+ jne .loop8
+
+align 16
+.loop16:
+ mov out0q, outq
+ mov out1q, 38*64*4
+ add out1q, out0q
+ mov tmpd, lend
+
+.inner_loop16:
+ movaps m0, [inq]
+ movaps m1, [inq+lenq]
+ movaps m2, [inq+lenq*2]
+ movaps m3, [inq+3*32*2*4]
+ TRANSPOSE4x4PS 0, 1, 2, 3, 4
+ movaps [out0q], m0
+ movaps [out1q], m1
+ movaps [out0q+lenq], m2
+ movaps [out1q+lenq], m3
+ lea out0q, [out0q+lenq*2]
+ lea out1q, [out1q+lenq*2]
+ add inq, mmsize
+ sub tmpd, mmsize
+ jg .inner_loop16
+ add outq, 16
+ add inq, 3*32*2*4
+ sub id, 4
+ jg .loop16
+ RET
+
+align 16
+.loop8:
+ mov out0q, outq
+ mov out1q, 38*64*4
+ add out1q, out0q
+ mov tmpd, lend
+
+.inner_loop8:
+ movaps m0, [inq]
+ movaps m1, [inq+lenq]
+ SBUTTERFLYPS 0, 1, 2
+ SBUTTERFLYPD 0, 1, 2
+ MOVH [out0q], m0
+ MOVH [out1q], m1
+ movhps [out0q+lenq], m0
+ movhps [out1q+lenq], m1
+ lea out0q, [out0q+lenq*2]
+ lea out1q, [out1q+lenq*2]
+ add inq, mmsize
+ sub tmpd, mmsize
+ jg .inner_loop8
+ add outq, 8
+ add inq, lenq
+ sub id, 2
+ jg .loop16
+ RET
+
+align 16
+.loop4:
+ mov out0q, outq
+ mov out1q, 38*64*4
+ add out1q, out0q
+ mov tmpd, lend
+
+.inner_loop4:
+ movaps m0, [inq]
+ movss [out0q], m0
+%if cpuflag(sse4)
+ extractps [out1q], m0, 1
+ extractps [out0q+lenq], m0, 2
+ extractps [out1q+lenq], m0, 3
+%else
+ movhlps m1, m0
+ movss [out0q+lenq], m1
+ shufps m0, m0, 0xb1
+ movss [out1q], m0
+ movhlps m1, m0
+ movss [out1q+lenq], m1
+%endif
+ lea out0q, [out0q+lenq*2]
+ lea out1q, [out1q+lenq*2]
+ add inq, mmsize
+ sub tmpd, mmsize
+ jg .inner_loop4
+ add outq, 4
+ sub id, 1
+ test id, 2
+ jne .loop8
+ cmp id, 4
+ jge .loop16
+ RET
+%endmacro
+
+INIT_XMM sse
+HYBRID_SYNTHESIS_DEINT
+INIT_XMM sse4
+HYBRID_SYNTHESIS_DEINT
+
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],