diff options
author | James Darnley <james.darnley@gmail.com> | 2014-08-12 23:22:03 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-08-13 03:09:26 +0200 |
commit | 54a51d384055a771ba1eeef3c2f399bd03fa2663 (patch) | |
tree | 2d7e770fb0fcfebaba0b4ef02f9714e07109f4c2 | |
parent | a8592db9bb787e6cd3aece69ce211cb97bd718cd (diff) | |
download | ffmpeg-54a51d384055a771ba1eeef3c2f399bd03fa2663.tar.gz |
lavc/flacenc: partially unroll loop in flac_enc_lpc_16
It now does 12 samples per iteration, up from 4.
This makes the function a further 1.8 to 3.2 times faster, and 3.6 to 5.7 times faster overall.
Runtime is reduced by a further 2 to 18%. Overall runtime is reduced by
4 to 50%.
Same conditions as before apply.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/flacenc.c | 2 | ||||
-rw-r--r-- | libavcodec/x86/flac_dsp_gpl.asm | 26 |
2 files changed, 22 insertions, 6 deletions
```diff
diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index f37bab8f3e..3b72888966 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -80,7 +80,7 @@ typedef struct FlacSubframe {
     int shift;
     RiceContext rc;
     int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+3];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
 } FlacSubframe;
 
 typedef struct FlacFrame {
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 1f28be132a..cedf0837a7 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -26,13 +26,13 @@ SECTION_TEXT
 
 INIT_XMM sse4
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 5, 6
     %define length r2d
 
     movsxd orderq, orderd
 %else
-    cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 2, 5
     %define length r2mp
 %endif
@@ -59,6 +59,8 @@ neg orderq
 
 .looplen:
     pxor m0, m0
+    pxor m4, m4
+    pxor m6, m6
     mov posj, orderq
     xor negj, negj
 
@@ -66,20 +68,34 @@ neg orderq
         movd m2, [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
         movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu m5, [smpq+negj*4-4+mmsize]
+        movu m7, [smpq+negj*4-4+mmsize*2]
         pmulld m1, m2
+        pmulld m5, m2
+        pmulld m7, m2
         paddd m0, m1 ; p += c * s
+        paddd m4, m5
+        paddd m6, m7
 
         dec negj
         inc posj
     jnz .looporder
 
     psrad m0, m3 ; p >>= shift
+    psrad m4, m3
+    psrad m6, m3
     movu m1, [smpq]
+    movu m5, [smpq+mmsize]
+    movu m7, [smpq+mmsize*2]
     psubd m1, m0 ; smp[i] - p
+    psubd m5, m4
+    psubd m7, m6
     movu [resq], m1 ; res[i] = smp[i] - (p >> shift)
+    movu [resq+mmsize], m5
+    movu [resq+mmsize*2], m7
 
-    add resq, mmsize
-    add smpq, mmsize
-    sub length, mmsize/4
+    add resq, 3*mmsize
+    add smpq, 3*mmsize
+    sub length, (3*mmsize)/4
 jg .looplen
 RET
```