x86/sbrdsp: add ff_sbr_autocorrelate_{sse,sse3}

2 to 2.5 times faster.

Signed-off-by: James Almer <jamrial@gmail.com>
author: James Almer <jamrial@gmail.com> 2015-01-25 18:06:28 -0300
committer: James Almer <jamrial@gmail.com> 2015-01-25 18:20:39 -0300
commit: 449b21bfab25beafe673971dced5e812f531e157 (patch)
tree: 7a0fa1162568c4035b2696f3f242c85951391c45 /libavcodec/x86
parent: 961353d842bc33ac81cf3cf3e66dc37db613fac5 (diff)
download: ffmpeg-449b21bfab25beafe673971dced5e812f531e157.tar.gz
2 files changed, 122 insertions, 0 deletions
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index 6f2e4f48d9..a8ec7ed408 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -25,6 +25,7 @@ SECTION_RODATA
 ; mask equivalent for multiply by -1.0 1.0
 ps_mask         times 2 dd 1<<31, 0
 ps_mask2        times 2 dd 0, 1<<31
+ps_mask3        dd  0, 0, 0, 1<<31 ; sign bit set in lane 3 only: xorps with this negates the fourth float of a vector
 ps_noise0       times 2 dd  1.0,  0.0,
 ps_noise2       times 2 dd -1.0,  0.0
 ps_noise13      dd  0.0,  1.0, 0.0, -1.0
@@ -445,3 +446,116 @@ cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
     add        cq, mmsize
     jl      .loop
     REP_RET
+
+%macro SBR_AUTOCORRELATE 0 ; emits sbr_autocorrelate(x, phi); cpuflag(sse3) picks the movddup/movshdup/haddps variants
+cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt ; 2 args, 3 GPRs, 8 XMM regs, 32 bytes of stack scratch (used at [rsp] and [rsp+16])
+    mov   cntq, 37*8 ; byte offset of x[37]; each x[i] is a pair of floats (8 bytes)
+    add     xq, cntq ; xq = &x[37]
+    neg   cntq ; xq+cntq = &x[0]; cnt is negative and counts up toward 0
+
+%if cpuflag(sse3)
+    movddup m5, [xq+cntq] ; m5 = x[0][0], x[0][1], x[0][0], x[0][1]
+%else
+    movlps  m5, [xq+cntq] ; low half of m5 = x[0][0], x[0][1]
+    movlhps m5, m5 ; copy low half to high half: m5 = x[0][0], x[0][1], x[0][0], x[0][1]
+%endif
+    movlps  m7, [xq+cntq+8 ] ; low half of m7 = x[1]
+    movlps  m1, [xq+cntq+16] ; low half of m1 = x[2]
+    shufps  m7, m7, q0110 ; m7 = x[1][0], x[1][1], x[1][1], x[1][0]
+    shufps  m1, m1, q0110 ; m1 = x[2][0], x[2][1], x[2][1], x[2][0]
+    mulps   m3, m5, m7   ;              x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
+    mulps   m4, m5, m5   ;              x[0][0] * x[0][0], x[0][1] * x[0][1];
+    mulps   m5, m1       ; real_sum2  = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
+    movaps  [rsp   ], m3 ; stash the x[0] * x[1] products; they are added to sum1 only after the loop
+    movaps  [rsp+16], m4 ; stash the x[0] * x[0] products; they are added to sum0 only after the loop
+    add   cntq, 8 ; advance by one complex sample
+
+    movlps  m2, [xq+cntq+16] ; low half of m2 = x[3]
+    movlhps m7, m7 ; m7 = x[1][0], x[1][1], x[1][0], x[1][1]
+    shufps  m2, m2, q0110 ; m2 = x[3][0], x[3][1], x[3][1], x[3][0]
+    mulps   m6, m7, m1   ; real_sum1  = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1  = x[1][0] * x[2][1], x[1][1] * x[2][0]
+    mulps   m4, m7, m2 ; x[1] * x[3] products
+    mulps   m7, m7       ; real_sum0  = x[1][0] * x[1][0], x[1][1] * x[1][1];
+    addps   m5, m4       ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
+
+align 16
+.loop: ; unrolled three samples per pass; m1/m2/m0 rotate through the roles x[i], x[i + 1], x[i + 2]
+    add   cntq, 8
+    movlps  m0, [xq+cntq+16] ; low half of m0 = x[i + 2]
+    movlhps m1, m1 ; m1 = x[i] as re, im, re, im
+    shufps  m0, m0, q0110 ; m0 = x[i + 2] as re, im, im, re
+    mulps   m3, m1, m2 ; x[i] * x[i + 1] products
+    mulps   m4, m1, m0 ; x[i] * x[i + 2] products
+    mulps   m1, m1 ; x[i] * x[i] products
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m1       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    add   cntq, 8
+    movlps  m1, [xq+cntq+16] ; low half of m1 = x[i + 2]
+    movlhps m2, m2 ; m2 = x[i] as re, im, re, im
+    shufps  m1, m1, q0110 ; m1 = x[i + 2] as re, im, im, re
+    mulps   m3, m2, m0 ; x[i] * x[i + 1] products
+    mulps   m4, m2, m1 ; x[i] * x[i + 2] products
+    mulps   m2, m2 ; x[i] * x[i] products
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m2       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    add   cntq, 8 ; last flag-setting instruction before jl; the SSE instructions below leave EFLAGS untouched
+    movlps  m2, [xq+cntq+16] ; low half of m2 = x[i + 2] (x[39] on the final pass, when cnt reaches 0)
+    movlhps m0, m0 ; m0 = x[i] as re, im, re, im
+    shufps  m2, m2, q0110 ; m2 = x[i + 2] as re, im, im, re
+    mulps   m3, m0, m1 ; x[i] * x[i + 1] products
+    mulps   m4, m0, m2 ; x[i] * x[i + 2] products
+    mulps   m0, m0 ; x[i] * x[i] products
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m0       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    jl .loop ; repeat while cnt < 0, using the flags of the preceding add cntq, 8
+
+    movlhps m1, m1 ; m1 = x[38] as re, im, re, im
+    mulps   m4, m1, m2 ; x[38] * x[39] products (m2 still holds x[39] as re, im, im, re)
+    mulps   m1, m1 ; x[38] * x[38] products
+    addps   m4, m6       ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
+    addps   m1, m7       ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
+    addps   m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
+    addps   m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
+
+    xorps   m4, [ps_mask3] ; negate lane 3 so that lane 2 + lane 3 below yields the imaginary part as a difference
+    xorps   m5, [ps_mask3] ; same sign flip for the sum2 vector
+    xorps   m6, [ps_mask3] ; same sign flip for the sum1 vector that includes the x[0] * x[1] term
+%if cpuflag(sse3)
+    movshdup m2, m1 ; lane 0 of m2 = lane 1 of m1
+    haddps  m4, m5 ; m4 = phi[0][0][0], phi[0][0][1], phi[0][1][0], phi[0][1][1]
+    haddps  m7, m6 ; lane 0 = phi[2][1][0]; lanes 2-3 = phi[1][1][0], phi[1][1][1]
+    addss   m1, m2 ; lane 0 of m1 = phi[1][0][0]
+%else
+    movaps  m3, m4 ; the copies below emulate haddps with shuffles and adds
+    movaps  m2, m5
+    movaps  m0, m6
+    shufps  m3, m3, q0301 ; lane 0 <- lane 1, lane 2 <- lane 3
+    shufps  m2, m2, q0301 ; lane 0 <- lane 1, lane 2 <- lane 3
+    shufps  m0, m0, q0301 ; lane 0 <- lane 1, lane 2 <- lane 3
+    addps   m4, m3 ; lanes 0 and 2 now hold the pairwise sums (real, imag)
+    addps   m5, m2 ; lanes 0 and 2 now hold the pairwise sums (real, imag)
+    addps   m6, m0 ; lanes 0 and 2 now hold the pairwise sums (real, imag)
+
+    movss   m2, m7 ; keep lane 0 of m7
+    movss   m3, m1 ; keep lane 0 of m1
+    shufps  m7, m7, q0001 ; move lane 1 into lane 0
+    shufps  m1, m1, q0001 ; move lane 1 into lane 0
+    addss   m7, m2 ; lane 0 of m7 = phi[2][1][0]
+    addss   m1, m3 ; lane 0 of m1 = phi[1][0][0]
+    shufps  m4, m5, q2020 ; m4 = m4[0], m4[2], m5[0], m5[2]: same layout as the haddps result
+    shufps  m7, m6, q2020 ; m7 = m7[0], m7[2], m6[0], m6[2]: lanes 2-3 = phi[1][1][0], phi[1][1][1]
+%endif
+    movaps  [phiq     ], m4 ; phi[0][0][0..1] and phi[0][1][0..1]; aligned store, so phi is assumed 16-byte aligned (confirm against callers)
+    movhps  [phiq+0x18], m7 ; phi[1][1][0..1]
+    movss   [phiq+0x28], m7 ; phi[2][1][0]
+    movss   [phiq+0x10], m1 ; phi[1][0][0]
+    RET
+%endmacro
+
+INIT_XMM sse ; XMM registers, SSE only: the %else branches of the macro are assembled
+SBR_AUTOCORRELATE
+INIT_XMM sse3 ; XMM registers with SSE3: the movddup/movshdup/haddps branches are assembled
+SBR_AUTOCORRELATE
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index a2aca742cf..6911a1a515 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -53,6 +53,9 @@ void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
 
 void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
 
+void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
+void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);
+
 av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -66,6 +69,7 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
         s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
         s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse;
         s->qmf_deint_neg    = ff_sbr_qmf_deint_neg_sse;
+        s->autocorrelate    = ff_sbr_autocorrelate_sse;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
@@ -76,4 +80,8 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
         s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
         s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
     }
+
+    if (EXTERNAL_SSE3(cpu_flags)) {
+        s->autocorrelate = ff_sbr_autocorrelate_sse3;
+    }
 }
author	James Almer <jamrial@gmail.com>	2015-01-25 18:06:28 -0300
committer	James Almer <jamrial@gmail.com>	2015-01-25 18:20:39 -0300
commit	449b21bfab25beafe673971dced5e812f531e157 (patch)
tree	7a0fa1162568c4035b2696f3f242c85951391c45 /libavcodec/x86
parent	961353d842bc33ac81cf3cf3e66dc37db613fac5 (diff)
download	ffmpeg-449b21bfab25beafe673971dced5e812f531e157.tar.gz