diff options
author | James Almer <jamrial@gmail.com> | 2017-06-18 22:33:27 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2017-06-18 22:33:27 -0300 |
commit | e229df9478b2c6d476ea44d0e492609d4ab4e0c5 (patch) | |
tree | 77ffac0e550eb344d49c505ddda0af39d4a94a4a | |
parent | 3c5a53cdfa099bba8bd951f95b85727b4b3b5d68 (diff) | |
download | ffmpeg-e229df9478b2c6d476ea44d0e492609d4ab4e0c5.tar.gz |
x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}
About 2x faster than the c version.
-rw-r--r-- | libavcodec/x86/aacpsdsp.asm | 123 | ||||
-rw-r--r-- | libavcodec/x86/aacpsdsp_init.c | 8 | ||||
-rw-r--r-- | libavutil/x86/x86util.asm | 15 |
3 files changed, 140 insertions, 6 deletions
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm index 67d3bd4e07..66bbbf4e44 100644 --- a/libavcodec/x86/aacpsdsp.asm +++ b/libavcodec/x86/aacpsdsp.asm @@ -166,6 +166,129 @@ align 16 jl .loop REP_RET +;*********************************************************** +;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64], +; float (*in)[32][2], +; int i, int len) +;*********************************************************** +%macro HYBRID_SYNTHESIS_DEINT 0 +cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp +%if cpuflag(sse4) +%define MOVH movsd +%else +%define MOVH movlps +%endif + movsxdifnidn iq, id + mov lend, 32 << 3 + lea outq, [outq+iq*4] + mov tmpd, id + shl tmpd, 8 + add inq, tmpq + mov tmpd, 64 + sub tmpd, id + mov id, tmpd + + test id, 1 + jne .loop4 + test id, 2 + jne .loop8 + +align 16 +.loop16: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop16: + movaps m0, [inq] + movaps m1, [inq+lenq] + movaps m2, [inq+lenq*2] + movaps m3, [inq+3*32*2*4] + TRANSPOSE4x4PS 0, 1, 2, 3, 4 + movaps [out0q], m0 + movaps [out1q], m1 + movaps [out0q+lenq], m2 + movaps [out1q+lenq], m3 + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop16 + add outq, 16 + add inq, 3*32*2*4 + sub id, 4 + jg .loop16 + RET + +align 16 +.loop8: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop8: + movaps m0, [inq] + movaps m1, [inq+lenq] + SBUTTERFLYPS 0, 1, 2 + SBUTTERFLYPD 0, 1, 2 + MOVH [out0q], m0 + MOVH [out1q], m1 + movhps [out0q+lenq], m0 + movhps [out1q+lenq], m1 + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop8 + add outq, 8 + add inq, lenq + sub id, 2 + jg .loop16 + RET + +align 16 +.loop4: + mov out0q, outq + mov out1q, 38*64*4 + add out1q, out0q + mov tmpd, lend + +.inner_loop4: + movaps m0, [inq] + movss [out0q], m0 +%if cpuflag(sse4) + extractps [out1q], m0, 1 + extractps [out0q+lenq], m0, 2 + extractps [out1q+lenq], m0, 3 +%else + movhlps m1, m0 + movss [out0q+lenq], m1 + shufps m0, m0, 0xb1 + movss [out1q], m0 + movhlps m1, m0 + movss [out1q+lenq], m1 +%endif + lea out0q, [out0q+lenq*2] + lea out1q, [out1q+lenq*2] + add inq, mmsize + sub tmpd, mmsize + jg .inner_loop4 + add outq, 4 + sub id, 1 + test id, 2 + jne .loop8 + cmp id, 4 + jge .loop16 + RET +%endmacro + +INIT_XMM sse +HYBRID_SYNTHESIS_DEINT +INIT_XMM sse4 +HYBRID_SYNTHESIS_DEINT + ;******************************************************************* ;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2], ; const float (*filter)[8][2], diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c index 767ae6588e..25e089c395 100644 --- a/libavcodec/x86/aacpsdsp_init.c +++ b/libavcodec/x86/aacpsdsp_init.c @@ -40,6 +40,10 @@ void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2], void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2], float h[2][4], float h_step[2][4], int len); +void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2], + int i, int len); +void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2], + int i, int len); av_cold void ff_psdsp_init_x86(PSDSPContext *s) { @@ -48,6 +52,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s) if (EXTERNAL_SSE(cpu_flags)) { s->add_squares = ff_ps_add_squares_sse; s->mul_pair_single = ff_ps_mul_pair_single_sse; + s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse; s->hybrid_analysis = ff_ps_hybrid_analysis_sse; } if (EXTERNAL_SSE3(cpu_flags)) { @@ -56,4 +61,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s) s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3; s->hybrid_analysis = ff_ps_hybrid_analysis_sse3; } + if (EXTERNAL_SSE4(cpu_flags)) { + s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4; + } } diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index fe9a727e22..cc7d272cad 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -71,6 +71,12 @@ SWAP %1, %3, %2 %endmacro +%macro SBUTTERFLYPD 3 + movlhps m%3, m%1, m%2 + movhlps m%2, m%2, m%1 + SWAP %1, %3 +%endmacro + %macro TRANSPOSE4x4B 5 SBUTTERFLY bw, %1, %2, %5 SBUTTERFLY bw, %3, %4, %5 @@ -117,12 +123,9 @@ %macro TRANSPOSE4x4PS 5 SBUTTERFLYPS %1, %2, %5 SBUTTERFLYPS %3, %4, %5 - movlhps m%5, m%1, m%3 - movhlps m%3, m%1 - SWAP %5, %1 - movlhps m%5, m%2, m%4 - movhlps m%4, m%2 - SWAP %5, %2, %3 + SBUTTERFLYPD %1, %3, %5 + SBUTTERFLYPD %2, %4, %5 + SWAP %2, %3 %endmacro %macro TRANSPOSE8x4D 9-11 |