diff options
author | James Almer <jamrial@gmail.com> | 2017-06-18 22:33:27 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2017-06-18 22:33:27 -0300 |
commit | e229df9478b2c6d476ea44d0e492609d4ab4e0c5 (patch) | |
tree | 77ffac0e550eb344d49c505ddda0af39d4a94a4a /libavcodec/x86/aacpsdsp.asm | |
parent | 3c5a53cdfa099bba8bd951f95b85727b4b3b5d68 (diff) | |
download | ffmpeg-e229df9478b2c6d476ea44d0e492609d4ab4e0c5.tar.gz |
x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}
About 2x faster than the c version.
Diffstat (limited to 'libavcodec/x86/aacpsdsp.asm')
-rw-r--r-- | libavcodec/x86/aacpsdsp.asm | 123 |
1 files changed, 123 insertions, 0 deletions
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm index 67d3bd4e07..66bbbf4e44 100644 --- a/libavcodec/x86/aacpsdsp.asm +++ b/libavcodec/x86/aacpsdsp.asm @@ -166,6 +166,129 @@ align 16 jl .loop REP_RET +;*********************************************************** +;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64], +; float (*in)[32][2], +; int i, int len) +;*********************************************************** +%macro HYBRID_SYNTHESIS_DEINT 0 +cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp +%if cpuflag(sse4) +%define MOVH movsd +%else +%define MOVH movlps +%endif + movsxdifnidn iq, id + mov lend, 32 << 3 + lea outq, [outq+iq*4] + mov tmpd, id + shl tmpd, 8 + add inq, tmpq + mov tmpd, 64 + sub tmpd, id + mov id, tmpd + + test id, 1 + jne .loop4 + test id, 2 + jne .loop8 + +align 16 +.loop16: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop16: + movaps m0, [inq] + movaps m1, [inq+lenq] + movaps m2, [inq+lenq*2] + movaps m3, [inq+3*32*2*4] + TRANSPOSE4x4PS 0, 1, 2, 3, 4 + movaps [out0q], m0 + movaps [out1q], m1 + movaps [out0q+lenq], m2 + movaps [out1q+lenq], m3 + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop16 + add outq, 16 + add inq, 3*32*2*4 + sub id, 4 + jg .loop16 + RET + +align 16 +.loop8: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop8: + movaps m0, [inq] + movaps m1, [inq+lenq] + SBUTTERFLYPS 0, 1, 2 + SBUTTERFLYPD 0, 1, 2 + MOVH [out0q], m0 + MOVH [out1q], m1 + movhps [out0q+lenq], m0 + movhps [out1q+lenq], m1 + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop8 + add outq, 8 + add inq, lenq + sub id, 2 + jg .loop16 + RET + +align 16 +.loop4: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop4: + movaps m0, [inq] + movss [out0q], m0 +%if cpuflag(sse4) + extractps [out1q], m0, 1 + extractps [out0q+lenq], m0, 2 + extractps [out1q+lenq], m0, 3 +%else + movhlps m1, m0 + movss [out0q+lenq], m1 + shufps m0, m0, 0xb1 + movss [out1q], m0 + movhlps m1, m0 + movss [out1q+lenq], m1 +%endif + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop4 + add outq, 4 + sub id, 1 + test id, 2 + jne .loop8 + cmp id, 4 + jge .loop16 + RET +%endmacro + +INIT_XMM sse +HYBRID_SYNTHESIS_DEINT +INIT_XMM sse4 +HYBRID_SYNTHESIS_DEINT + ;******************************************************************* ;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2], ; const float (*filter)[8][2], |