diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-01-18 22:34:29 +0100 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-02-15 20:11:54 -0800 |
commit | f3e084909bff422f0c853507a82f92ff2efc0d28 (patch) | |
tree | f40d51982942666861616d941aebcf910f1b6341 /libavcodec/x86/mpegaudiodec_mmx.c | |
parent | ae591aeea58d64399b8281be31dacec0de85ae04 (diff) | |
download | ffmpeg-f3e084909bff422f0c853507a82f92ff2efc0d28.tar.gz |
mpegaudio: replace memcpy by SIMD code
Replacing the memcpy with an unrolled SIMD copy that takes advantage of the
known alignment of the buffer yields a small speedup.
Before (gcc 4.6.1): ~400 cycles
After: ~370 cycles
Overall, around 2% speed increase when decoding a 2400s mp3 to f32le.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec/x86/mpegaudiodec_mmx.c')
-rw-r--r-- | libavcodec/x86/mpegaudiodec_mmx.c | 21 |
1 files changed, 20 insertions, 1 deletions
diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c index 06ffbca90a..f51a06d14c 100644 --- a/libavcodec/x86/mpegaudiodec_mmx.c +++ b/libavcodec/x86/mpegaudiodec_mmx.c @@ -106,7 +106,26 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out, float sum; /* copy to avoid wrap */ - memcpy(in + 512, in, 32 * sizeof(*in)); + __asm__ volatile( + "movaps 0(%0), %%xmm0 \n\t" \ + "movaps 16(%0), %%xmm1 \n\t" \ + "movaps 32(%0), %%xmm2 \n\t" \ + "movaps 48(%0), %%xmm3 \n\t" \ + "movaps %%xmm0, 0(%1) \n\t" \ + "movaps %%xmm1, 16(%1) \n\t" \ + "movaps %%xmm2, 32(%1) \n\t" \ + "movaps %%xmm3, 48(%1) \n\t" \ + "movaps 64(%0), %%xmm0 \n\t" \ + "movaps 80(%0), %%xmm1 \n\t" \ + "movaps 96(%0), %%xmm2 \n\t" \ + "movaps 112(%0), %%xmm3 \n\t" \ + "movaps %%xmm0, 64(%1) \n\t" \ + "movaps %%xmm1, 80(%1) \n\t" \ + "movaps %%xmm2, 96(%1) \n\t" \ + "movaps %%xmm3, 112(%1) \n\t" + ::"r"(in), "r"(in+512) + :"memory" + ); apply_window(in + 16, win , win + 512, suma, sumc, 16); apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); |