diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2006-02-10 06:55:25 +0000 |
---|---|---|
committer | Loren Merritt <lorenm@u.washington.edu> | 2006-02-10 06:55:25 +0000 |
commit | ef9d1d15751c6a2e4c570727c198854ce8b44603 (patch) | |
tree | c2ed9fe8f2bf17d05109a494357d40737dd16146 /libavcodec/i386/h264dsp_mmx.c | |
parent | a283db3962c07f9dfab87dc7553b61cbc4e6efb8 (diff) | |
download | ffmpeg-ef9d1d15751c6a2e4c570727c198854ce8b44603.tar.gz |
h264: special case dc-only idct. ~1% faster overall
Originally committed as revision 4971 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386/h264dsp_mmx.c')
-rw-r--r-- | libavcodec/i386/h264dsp_mmx.c | 81 |
1 files changed, 81 insertions, 0 deletions
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c index 44a4718e9f..6debfd9fc8 100644 --- a/libavcodec/i386/h264dsp_mmx.c +++ b/libavcodec/i386/h264dsp_mmx.c @@ -104,6 +104,87 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride) ); } +void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + asm volatile( + "movd %0, %%mm0 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "pmaxsw %%mm7, %%mm0 \n\t" + "pmaxsw %%mm7, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + asm volatile( + "movd %0, %%mm2 \n\t" + "movd %1, %%mm3 \n\t" + "movd %2, %%mm4 \n\t" + "movd %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movd %%mm2, %0 \n\t" + "movd %%mm3, %1 \n\t" + "movd %%mm4, %2 \n\t" + "movd %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dst+0*stride)), + "+m"(*(uint32_t*)(dst+1*stride)), + "+m"(*(uint32_t*)(dst+2*stride)), + "+m"(*(uint32_t*)(dst+3*stride)) + ); +} + +void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + int y; + asm volatile( + "movd %0, %%mm0 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "pmaxsw %%mm7, %%mm0 \n\t" + "pmaxsw %%mm7, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + for(y=2; y--; dst += 4*stride){ + asm volatile( + "movq %0, %%mm2 \n\t" + "movq %1, %%mm3 \n\t" + "movq %2, %%mm4 \n\t" + "movq %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movq %%mm2, %0 \n\t" + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %3 \n\t" + :"+m"(*(uint64_t*)(dst+0*stride)), + "+m"(*(uint64_t*)(dst+1*stride)), + "+m"(*(uint64_t*)(dst+2*stride)), + "+m"(*(uint64_t*)(dst+3*stride)) + ); + } +} + /***********************************/ /* deblocking */ |