aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristophe Gisquet <christophe.gisquet@gmail.com>2014-02-14 15:03:12 +0000
committerJanne Grunau <janne-libav@jannau.net>2014-02-20 14:18:05 +0100
commit996697e266c8adc0ad9b7fc7568406c7529c97cf (patch)
tree4b143794e0a28c92722d81a62b8c55b9cbd00cc1
parentef010f08ae53479c54e2f16be5a7e1a809a9e268 (diff)
downloadffmpeg-996697e266c8adc0ad9b7fc7568406c7529c97cf.tar.gz
x86: float dsp: unroll SSE versions
vector_fmul and vector_fmac_scalar are guaranteed to be able to process their input in batches of 16 elements, but their SSE versions only do 8 at a time. Therefore, unroll them a bit. This goes from 299 to 261 cycles for 256 elements in vector_fmac_scalar on Arrandale/Win64. Signed-off-by: Janne Grunau <janne-libav@jannau.net>
-rw-r--r--libavutil/x86/float_dsp.asm40
1 files changed, 24 insertions, 16 deletions
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 10330ff336..d96249978a 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -27,17 +27,21 @@ SECTION .text
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
- lea lenq, [lend*4 - 2*mmsize]
+ lea lenq, [lend*4 - 64]
ALIGN 16
.loop:
- mova m0, [src0q + lenq]
- mova m1, [src0q + lenq + mmsize]
- mulps m0, m0, [src1q + lenq]
- mulps m1, m1, [src1q + lenq + mmsize]
- mova [dstq + lenq], m0
- mova [dstq + lenq + mmsize], m1
+%assign a 0
+%rep 32/mmsize
+ mova m0, [src0q + lenq + (a+0)*mmsize]
+ mova m1, [src0q + lenq + (a+1)*mmsize]
+ mulps m0, m0, [src1q + lenq + (a+0)*mmsize]
+ mulps m1, m1, [src1q + lenq + (a+1)*mmsize]
+ mova [dstq + lenq + (a+0)*mmsize], m0
+ mova [dstq + lenq + (a+1)*mmsize], m1
+%assign a a+2
+%endrep
- sub lenq, 2*mmsize
+ sub lenq, 64
jge .loop
REP_RET
%endmacro
@@ -68,15 +72,19 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
vinsertf128 m0, m0, xmm0, 1
%endif
%endif
- lea lenq, [lend*4-2*mmsize]
+ lea lenq, [lend*4-64]
.loop:
- mulps m1, m0, [srcq+lenq ]
- mulps m2, m0, [srcq+lenq+mmsize]
- addps m1, m1, [dstq+lenq ]
- addps m2, m2, [dstq+lenq+mmsize]
- mova [dstq+lenq ], m1
- mova [dstq+lenq+mmsize], m2
- sub lenq, 2*mmsize
+%assign a 0
+%rep 32/mmsize
+ mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
+ mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
+ addps m1, m1, [dstq+lenq+(a+0)*mmsize]
+ addps m2, m2, [dstq+lenq+(a+1)*mmsize]
+ mova [dstq+lenq+(a+0)*mmsize], m1
+ mova [dstq+lenq+(a+1)*mmsize], m2
+%assign a a+2
+%endrep
+ sub lenq, 64
jge .loop
REP_RET
%endmacro