aboutsummaryrefslogtreecommitdiffstats
path: root/libavutil/x86/float_dsp.asm
diff options
context:
space:
mode:
author: James Almer <jamrial@gmail.com>, 2014-03-10 17:09:20 -0300
committer: Michael Niedermayer <michaelni@gmx.at>, 2014-03-13 04:34:05 +0100
commit: 7d7487e85c066bf3f4e5821a49081f520b6bc1e7 (patch)
tree: 435ea72c626de74760ed86b026f4396f242687ac /libavutil/x86/float_dsp.asm
parent: 12ce58bebdff6bfae9c56dc785e3003968f93277 (diff)
download: ffmpeg-7d7487e85c066bf3f4e5821a49081f520b6bc1e7.tar.gz
x86/float_dsp: add ff_vector_{fmul_add, fmac_scalar}_fma3
~7% faster than AVX.

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavutil/x86/float_dsp.asm')
-rw-r--r-- libavutil/x86/float_dsp.asm 24
1 files changed, 23 insertions, 1 deletions
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index f762e34f6a..d0f4be8c53 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -80,10 +80,17 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
.loop:
%assign a 0
%rep 32/mmsize
+%if cpuflag(fma3)
+ mova m1, [dstq+lenq+(a+0)*mmsize]
+ mova m2, [dstq+lenq+(a+1)*mmsize]
+ fmaddps m1, m0, [srcq+lenq+(a+0)*mmsize], m1
+ fmaddps m2, m0, [srcq+lenq+(a+1)*mmsize], m2
+%else
mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
addps m1, m1, [dstq+lenq+(a+0)*mmsize]
addps m2, m2, [dstq+lenq+(a+1)*mmsize]
+%endif
mova [dstq+lenq+(a+0)*mmsize], m1
mova [dstq+lenq+(a+1)*mmsize], m2
%assign a a+2
@@ -99,6 +106,10 @@ VECTOR_FMAC_SCALAR
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMAC_SCALAR
+%endif
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
@@ -182,16 +193,23 @@ VECTOR_DMUL_SCALAR
; const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
-cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
+cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
+%if cpuflag(fma3)
+ mova m2, [src2q + lenq]
+ mova m3, [src2q + lenq + mmsize]
+ fmaddps m0, m0, [src1q + lenq], m2
+ fmaddps m1, m1, [src1q + lenq + mmsize], m3
+%else
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
addps m0, m0, [src2q + lenq]
addps m1, m1, [src2q + lenq + mmsize]
+%endif
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
@@ -206,6 +224,10 @@ VECTOR_FMUL_ADD
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMUL_ADD
+%endif
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,