aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
author James Almer <jamrial@gmail.com> 2016-07-03 18:48:22 -0300
committer James Almer <jamrial@gmail.com> 2016-07-05 17:48:20 -0300
commit 645489cf90b09a3bd1ed57b2c13b37b1512450bb (patch)
tree 0bdd6b66eceb397b88599e051b3b9f50eddef5f9 /libavcodec/x86
parent d915b6e592309310c83fa27f4cec5b69ebbd51f2 (diff)
download ffmpeg-645489cf90b09a3bd1ed57b2c13b37b1512450bb.tar.gz
x86/dcadsp: optimize lfe_fir0_float_fma3 on x86_32
About 10% faster. Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/dcadsp.asm 43
1 files changed, 31 insertions, 12 deletions
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index c5bf21aebd..055361a765 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -24,7 +24,7 @@
SECTION .text
%define sizeof_float 4
-%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64)
+%define FMA3_OFFSET (8 * cpuflag(fma3))
%macro LFE_FIR0_FLOAT 0
cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
@@ -101,11 +101,19 @@ cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
- mulps m0, m7, [coeffq+cnt1q*8 ]
- movaps m1, [coeffq+cnt1q*8+16]
- mulps m2, m7, [coeffq+cnt1q*8+32]
- fmaddps m0, m6, m1, m0
- fmaddps m2, m6, [coeffq+cnt1q*8+48], m2
+ mulps m0, m7, [coeffq+cnt1q*8 ]
+ mulps m1, m7, [coeffq+cnt1q*8+32 ]
+ mulps m2, m7, [coeffq+cnt1q*8+64 ]
+ mulps m3, m7, [coeffq+cnt1q*8+96 ]
+ fmaddps m0, m6, [coeffq+cnt1q*8+16 ], m0
+ fmaddps m1, m6, [coeffq+cnt1q*8+48 ], m1
+ fmaddps m2, m6, [coeffq+cnt1q*8+80 ], m2
+ fmaddps m3, m6, [coeffq+cnt1q*8+112], m3
+
+ haddps m0, m1
+ haddps m2, m3
+ haddps m0, m2
+ movaps [samplesq+cnt1q], m0
%else
mulps m0, m7, [coeffq+cnt1q*8 ]
mulps m1, m6, [coeffq+cnt1q*8+16]
@@ -113,13 +121,14 @@ cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks
mulps m3, m6, [coeffq+cnt1q*8+48]
addps m0, m1
addps m2, m3
-%endif
+
unpckhps m3, m0, m2
unpcklps m0, m2
addps m3, m0
movhlps m2, m3
addps m2, m3
movlps [samplesq+cnt1q], m2
+%endif
%endif; ARCH
%if ARCH_X86_64
@@ -154,10 +163,19 @@ cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
- mulps m0, m5, [coeffq+cnt1q*8 ]
- mulps m2, m5, [coeffq+cnt1q*8+32]
- fmaddps m0, m4, m1, m0
- fmaddps m2, m4, [coeffq+cnt1q*8+48], m2
+ mulps m0, m5, [coeffq+cnt1q*8 ]
+ mulps m1, m5, [coeffq+cnt1q*8+32 ]
+ mulps m2, m5, [coeffq+cnt1q*8+64 ]
+ mulps m3, m5, [coeffq+cnt1q*8+96 ]
+ fmaddps m0, m4, [coeffq+cnt1q*8+16 ], m0
+ fmaddps m1, m4, [coeffq+cnt1q*8+48 ], m1
+ fmaddps m2, m4, [coeffq+cnt1q*8+80 ], m2
+ fmaddps m3, m4, [coeffq+cnt1q*8+112], m3
+
+ haddps m1, m0
+ haddps m3, m2
+ haddps m3, m1
+ movaps [samplesq+cnt2q], m3
%else
mulps m0, m5, [coeffq+cnt1q*8 ]
mulps m1, m4, [coeffq+cnt1q*8+16]
@@ -165,13 +183,14 @@ cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks
mulps m3, m4, [coeffq+cnt1q*8+48]
addps m0, m1
addps m2, m3
-%endif
+
unpckhps m3, m2, m0
unpcklps m2, m0
addps m3, m2
movhlps m0, m3
addps m0, m3
movlps [samplesq+cnt2q], m0
+%endif
%endif; ARCH
sub cnt2d, 8 + FMA3_OFFSET