x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}

About 2x faster than the c version.
author: James Almer <jamrial@gmail.com> 2017-06-18 22:33:27 -0300
committer: James Almer <jamrial@gmail.com> 2017-06-18 22:33:27 -0300
commit: e229df9478b2c6d476ea44d0e492609d4ab4e0c5 (patch)
tree: 77ffac0e550eb344d49c505ddda0af39d4a94a4a /libavcodec/x86/aacpsdsp.asm
parent: 3c5a53cdfa099bba8bd951f95b85727b4b3b5d68 (diff)
download: ffmpeg-e229df9478b2c6d476ea44d0e492609d4ab4e0c5.tar.gz
1 files changed, 123 insertions, 0 deletions
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index 67d3bd4e07..66bbbf4e44 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -166,6 +166,129 @@ align 16
     jl .loop
     REP_RET
 
+;***********************************************************
+;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
+;                                    float (*in)[32][2],
+;                                    int i, int len)
+;***********************************************************
+%macro HYBRID_SYNTHESIS_DEINT 0
+cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
+%if cpuflag(sse4)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+    movsxdifnidn        iq, id
+    mov               lend, 32 << 3
+    lea               outq, [outq+iq*4]
+    mov               tmpd, id
+    shl               tmpd, 8
+    add                inq, tmpq
+    mov               tmpd, 64
+    sub               tmpd, id
+    mov                 id, tmpd
+
+    test                id, 1
+    jne .loop4
+    test                id, 2
+    jne .loop8
+
+align 16
+.loop16:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop16:
+    movaps              m0, [inq]
+    movaps              m1, [inq+lenq]
+    movaps              m2, [inq+lenq*2]
+    movaps              m3, [inq+3*32*2*4]
+    TRANSPOSE4x4PS 0, 1, 2, 3, 4
+    movaps         [out0q], m0
+    movaps         [out1q], m1
+    movaps    [out0q+lenq], m2
+    movaps    [out1q+lenq], m3
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop16
+    add               outq, 16
+    add                inq, 3*32*2*4
+    sub                 id, 4
+    jg .loop16
+    RET
+
+align 16
+.loop8:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop8:
+    movaps              m0, [inq]
+    movaps              m1, [inq+lenq]
+    SBUTTERFLYPS 0, 1, 2
+    SBUTTERFLYPD 0, 1, 2
+    MOVH           [out0q], m0
+    MOVH           [out1q], m1
+    movhps    [out0q+lenq], m0
+    movhps    [out1q+lenq], m1
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop8
+    add               outq, 8
+    add                inq, lenq
+    sub                 id, 2
+    jg .loop16
+    RET
+
+align 16
+.loop4:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop4:
+    movaps              m0, [inq]
+    movss          [out0q], m0
+%if cpuflag(sse4)
+    extractps      [out1q], m0, 1
+    extractps [out0q+lenq], m0, 2
+    extractps [out1q+lenq], m0, 3
+%else
+    movhlps             m1, m0
+    movss     [out0q+lenq], m1
+    shufps              m0, m0, 0xb1
+    movss          [out1q], m0
+    movhlps             m1, m0
+    movss     [out1q+lenq], m1
+%endif
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop4
+    add               outq, 4
+    sub                 id, 1
+    test                id, 2
+    jne .loop8
+    cmp                 id, 4
+    jge .loop16
+    RET
+%endmacro
+
+INIT_XMM sse
+HYBRID_SYNTHESIS_DEINT
+INIT_XMM sse4
+HYBRID_SYNTHESIS_DEINT
+
 ;*******************************************************************
 ;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
 ;                                 const float (*filter)[8][2],
author	James Almer <jamrial@gmail.com>	2017-06-18 22:33:27 -0300
committer	James Almer <jamrial@gmail.com>	2017-06-18 22:33:27 -0300
commit	e229df9478b2c6d476ea44d0e492609d4ab4e0c5 (patch)
tree	77ffac0e550eb344d49c505ddda0af39d4a94a4a /libavcodec/x86/aacpsdsp.asm
parent	3c5a53cdfa099bba8bd951f95b85727b4b3b5d68 (diff)
download	ffmpeg-e229df9478b2c6d476ea44d0e492609d4ab4e0c5.tar.gz