diff options
author | James Almer <jamrial@gmail.com> | 2017-04-10 12:17:03 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2017-04-10 12:18:55 -0300 |
commit | ed9b25a148f228433d65c31ec8d65d5ad1983215 (patch) | |
tree | 25a99d505e48e2dd27fcc350396ceba04b78ef4a /libavutil/x86/float_dsp.asm | |
parent | 128e1fbf1335d23425609cfae3fae8d63170f875 (diff) | |
download | ffmpeg-ed9b25a148f228433d65c31ec8d65d5ad1983215.tar.gz |
x86/float_dsp: add ff_vector_dmac_scalar_{sse2,avx,fma3}
Diffstat (limited to 'libavutil/x86/float_dsp.asm')
-rw-r--r-- | libavutil/x86/float_dsp.asm | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 021ff03c87..9affacb72b 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -149,6 +149,69 @@ INIT_XMM sse
 VECTOR_FMUL_SCALAR
 
 ;------------------------------------------------------------------------------
+; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
+;                            int len)
+;------------------------------------------------------------------------------
+
+%macro VECTOR_DMAC_SCALAR 0                 ; emits one function computing dst[i] += src[i] * mul over len doubles
+%if ARCH_X86_32
+cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr ; NOTE(review): the double "mul" presumably spans two 32-bit stack slots (named mul and len), making "lenaddr" the slot of the real len - confirm
+    mov          lenq, lenaddrm             ; load the value of the lenaddr argument slot into the register named len
+    VBROADCASTSD m0, mulm                   ; m0 = mul in every double lane, going by the macro name (macro is defined outside this hunk)
+%else
+%if UNIX64
+cglobal vector_dmac_scalar, 3,3,5, dst, src, len ; mul is not a named argument here; NOTE(review): presumably it already sits in xmm0 per the SysV ABI - confirm
+%else
+cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
+    SWAP 0, 2                               ; NOTE(review): presumably renames registers so m0 is the one holding mul (third argument slot, xmm2 on Win64) - confirm
+%endif
+    movlhps     xm0, xm0                    ; copy the low 64 bits of xm0 into its high 64 bits: both double lanes now hold mul
+%if cpuflag(avx)
+    vinsertf128  m0, m0, xm0, 1             ; replicate the low 128 bits into the upper half of the ymm register
+%endif
+%endif
+    lea    lenq, [lend*8-mmsize*4]          ; lenq = byte size of the array (len * 8) minus one chunk = offset of the last 4-vector chunk
+.loop:                                      ; each pass handles 4 vectors (4*mmsize bytes), walking from the end towards offset 0; there is no tail handling, so len is presumably a multiple of the chunk size - TODO confirm
+%if cpuflag(fma3)
+    movaps m1, [dstq+lenq]                  ; load four vectors of dst accumulators (movaps: address must be mmsize-aligned)
+    movaps m2, [dstq+lenq+1*mmsize]
+    movaps m3, [dstq+lenq+2*mmsize]
+    movaps m4, [dstq+lenq+3*mmsize]
+    fmaddpd m1, m0, [srcq+lenq], m1         ; m1 = mul * src + dst as a fused multiply-add
+    fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2
+    fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3
+    fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4
+%else ; cpuflag
+    mulpd  m1, m0, [srcq+lenq]              ; without FMA: m1 = mul * src ...
+    mulpd  m2, m0, [srcq+lenq+1*mmsize]
+    mulpd  m3, m0, [srcq+lenq+2*mmsize]
+    mulpd  m4, m0, [srcq+lenq+3*mmsize]
+    addpd  m1, m1, [dstq+lenq]              ; ... then m1 += dst as a separate, separately rounded step
+    addpd  m2, m2, [dstq+lenq+1*mmsize]
+    addpd  m3, m3, [dstq+lenq+2*mmsize]
+    addpd  m4, m4, [dstq+lenq+3*mmsize]
+%endif ; cpuflag
+    movaps [dstq+lenq], m1                  ; write the four result vectors back over dst
+    movaps [dstq+lenq+1*mmsize], m2
+    movaps [dstq+lenq+2*mmsize], m3
+    movaps [dstq+lenq+3*mmsize], m4
+    sub    lenq, mmsize*4                   ; step the byte offset back by one chunk
+    jge .loop                               ; keep looping while the offset is still >= 0 (flags come from the sub)
+    REP_RET                                 ; return through the REP_RET macro (defined outside this hunk)
+%endmacro
+
+INIT_XMM sse2                               ; expand the macro with xmm registers for the sse2 variant
+VECTOR_DMAC_SCALAR
+%if HAVE_AVX_EXTERNAL                       ; the avx variant is only assembled when this build flag is set
+INIT_YMM avx
+VECTOR_DMAC_SCALAR
+%endif
+%if HAVE_FMA3_EXTERNAL                      ; the fma3 variant is only assembled when this build flag is set
+INIT_YMM fma3
+VECTOR_DMAC_SCALAR
+%endif
+
+;------------------------------------------------------------------------------
 ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
 ;                            int len)
 ;------------------------------------------------------------------------------