diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2014-02-14 15:03:12 +0000 |
---|---|---|
committer | Janne Grunau <janne-libav@jannau.net> | 2014-02-20 14:18:05 +0100 |
commit | 996697e266c8adc0ad9b7fc7568406c7529c97cf (patch) | |
tree | 4b143794e0a28c92722d81a62b8c55b9cbd00cc1 | |
parent | ef010f08ae53479c54e2f16be5a7e1a809a9e268 (diff) | |
download | ffmpeg-996697e266c8adc0ad9b7fc7568406c7529c97cf.tar.gz |
x86: float dsp: unroll SSE versions
vector_fmul and vector_fmac_scalar are guaranteed to be able to process
their input in batches of 16 elements, but their SSE versions only do 8 at a time.
Therefore, unroll them a bit.
This goes from 299 to 261 cycles for 256 elements in vector_fmac_scalar on Arrandale/Win64.
Signed-off-by: Janne Grunau <janne-libav@jannau.net>
-rw-r--r-- | libavutil/x86/float_dsp.asm | 40 |
1 files changed, 24 insertions, 16 deletions
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 10330ff336..d96249978a 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -27,17 +27,21 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 %macro VECTOR_FMUL 0
 cglobal vector_fmul, 4,4,2, dst, src0, src1, len
-    lea       lenq, [lend*4 - 2*mmsize]
+    lea       lenq, [lend*4 - 64]
 ALIGN 16
 .loop:
-    mova      m0,   [src0q + lenq]
-    mova      m1,   [src0q + lenq + mmsize]
-    mulps     m0, m0, [src1q + lenq]
-    mulps     m1, m1, [src1q + lenq + mmsize]
-    mova      [dstq + lenq], m0
-    mova      [dstq + lenq + mmsize], m1
+%assign a 0
+%rep 32/mmsize
+    mova      m0,   [src0q + lenq + (a+0)*mmsize]
+    mova      m1,   [src0q + lenq + (a+1)*mmsize]
+    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
+    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
+    mova      [dstq + lenq + (a+0)*mmsize], m0
+    mova      [dstq + lenq + (a+1)*mmsize], m1
+%assign a a+2
+%endrep
 
-    sub       lenq, 2*mmsize
+    sub       lenq, 64
     jge       .loop
     REP_RET
 %endmacro
@@ -68,15 +72,19 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
     vinsertf128 m0, m0, xmm0, 1
 %endif
 %endif
-    lea    lenq, [lend*4-2*mmsize]
+    lea    lenq, [lend*4-64]
 .loop:
-    mulps    m1, m0, [srcq+lenq       ]
-    mulps    m2, m0, [srcq+lenq+mmsize]
-    addps    m1, m1, [dstq+lenq       ]
-    addps    m2, m2, [dstq+lenq+mmsize]
-    mova  [dstq+lenq       ], m1
-    mova  [dstq+lenq+mmsize], m2
-    sub    lenq, 2*mmsize
+%assign a 0
+%rep 32/mmsize
+    mulps    m1, m0, [srcq+lenq+(a+0)*mmsize]
+    mulps    m2, m0, [srcq+lenq+(a+1)*mmsize]
+    addps    m1, m1, [dstq+lenq+(a+0)*mmsize]
+    addps    m2, m2, [dstq+lenq+(a+1)*mmsize]
+    mova  [dstq+lenq+(a+0)*mmsize], m1
+    mova  [dstq+lenq+(a+1)*mmsize], m2
+%assign a a+2
+%endrep
+    sub    lenq, 64
     jge .loop
     REP_RET
 %endmacro |