diff options
author | James Almer <jamrial@gmail.com> | 2017-06-04 23:29:56 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2017-06-04 23:29:56 -0300 |
commit | 933dd62288ba9e73145932f229f355c985862641 (patch) | |
tree | f3705efed8f4bdd0f473b10ba3d99a131820803d | |
parent | caf7d6178a4d5f24c915da48410a9790b21703aa (diff) | |
download | ffmpeg-933dd62288ba9e73145932f229f355c985862641.tar.gz |
x86/aacpsdsp: optimize ff_ps_mul_pair_single_sse
~2% faster.
-rw-r--r-- | libavcodec/x86/aacpsdsp.asm | 21 |
1 files changed, 12 insertions, 9 deletions
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index 4548bb4257..22a03f4f76 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -62,24 +62,27 @@ PS_ADD_SQUARES 3
 ; float *src1, int n);
 ;*******************************************************************
 INIT_XMM sse
-cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n
-    xor r4q, r4q
+cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
+    shl nd, 3
+    add src1q, nq
+    add dstq, nq
+    neg nq
 
+align 16
 .loop:
-    movu m0, [src1q+r4q]
-    movu m1, [src1q+r4q+mmsize]
+    movu m0, [src1q+nq]
+    movu m1, [src1q+nq+mmsize]
     mova m2, [src2q]
     mova m3, m2
     unpcklps m2, m2
     unpckhps m3, m3
     mulps m0, m2
     mulps m1, m3
-    mova [dstq+r4q], m0
-    mova [dstq+r4q+mmsize], m1
+    mova [dstq+nq], m0
+    mova [dstq+nq+mmsize], m1
     add src2q, mmsize
-    add r4q, mmsize*2
-    sub nd, mmsize/4
-    jg .loop
+    add nq, mmsize*2
+    jl .loop
     REP_RET
 
 ;***********************************************************************