aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/i386/h264dsp_mmx.c
diff options
context:
space:
mode:
author Loren Merritt <lorenm@u.washington.edu> 2006-02-10 06:55:25 +0000
committer Loren Merritt <lorenm@u.washington.edu> 2006-02-10 06:55:25 +0000
commit ef9d1d15751c6a2e4c570727c198854ce8b44603 (patch)
tree c2ed9fe8f2bf17d05109a494357d40737dd16146 /libavcodec/i386/h264dsp_mmx.c
parent a283db3962c07f9dfab87dc7553b61cbc4e6efb8 (diff)
download ffmpeg-ef9d1d15751c6a2e4c570727c198854ce8b44603.tar.gz
h264: special case dc-only idct. ~1% faster overall
Originally committed as revision 4971 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386/h264dsp_mmx.c')
-rw-r--r-- libavcodec/i386/h264dsp_mmx.c 81
1 files changed, 81 insertions, 0 deletions
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
index 44a4718e9f..6debfd9fc8 100644
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -104,6 +104,87 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
);
}
+void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) /* DC-only add: adds the rounded DC term to 4 rows of 4 bytes at dst, saturating each byte to 0..255. */
+{ /* dst: first of 4 rows, consecutive rows are stride bytes apart; block: only block[0] is read here. */
+ int dc = (block[0] + 32) >> 6; /* round-to-nearest divide by 64 of the single coefficient */
+ asm volatile( /* first statement: build the byte vectors +dc (mm0) and -dc (mm1) */
+ "movd %0, %%mm0 \n\t" /* mm0 low dword = dc */
+ "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
+ "pshufw $0, %%mm0, %%mm0 \n\t" /* replicate the low 16-bit word of dc into all four words */
+ "pxor %%mm1, %%mm1 \n\t" /* mm1 = 0 */
+ "psubw %%mm0, %%mm1 \n\t" /* mm1 = -dc in every word */
+ "pmaxsw %%mm7, %%mm0 \n\t" /* mm0 = max(dc, 0): the positive part */
+ "pmaxsw %%mm7, %%mm1 \n\t" /* mm1 = max(-dc, 0): the negative part */
+ "packuswb %%mm0, %%mm0 \n\t" /* words to bytes with unsigned saturation: 8 copies of the positive part */
+ "packuswb %%mm1, %%mm1 \n\t" /* likewise, 8 copies of the negative part */
+ ::"r"(dc) /* input only; no outputs or clobbers are declared for mm0, mm1, mm7 */
+ ); /* NOTE(review): the next asm statement reads mm0 and mm1 set above; nothing tells the compiler about this dependency - confirm it is safe with the compilers in use. */
+ asm volatile( /* second statement: apply the offset to four 32-bit rows */
+ "movd %0, %%mm2 \n\t" /* load row 0 (4 pixels) */
+ "movd %1, %%mm3 \n\t" /* load row 1 */
+ "movd %2, %%mm4 \n\t" /* load row 2 */
+ "movd %3, %%mm5 \n\t" /* load row 3 */
+ "paddusb %%mm0, %%mm2 \n\t" /* add positive part, saturating at 255 */
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t" /* subtract negative part, saturating at 0 */
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t" /* store the low 32 bits back to row 0 */
+ "movd %%mm3, %1 \n\t" /* store row 1 */
+ "movd %%mm4, %2 \n\t" /* store row 2 */
+ "movd %%mm5, %3 \n\t" /* store row 3 */
+ :"+m"(*(uint32_t*)(dst+0*stride)), /* read-write memory operands, one 32-bit word per row */
+ "+m"(*(uint32_t*)(dst+1*stride)),
+ "+m"(*(uint32_t*)(dst+2*stride)),
+ "+m"(*(uint32_t*)(dst+3*stride))
+ ); /* no MMX state reset (emms) is issued in this function */
+}
+
+void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) /* DC-only add: adds the rounded DC term to 8 rows of 8 bytes at dst, saturating each byte to 0..255. */
+{ /* dst: first of 8 rows, consecutive rows are stride bytes apart; block: only block[0] is read here. */
+ int dc = (block[0] + 32) >> 6; /* round-to-nearest divide by 64 of the single coefficient */
+ int y; /* countdown over the two groups of 4 rows */
+ asm volatile( /* build the byte vectors +dc (mm0) and -dc (mm1) once, before the loop */
+ "movd %0, %%mm0 \n\t" /* mm0 low dword = dc */
+ "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
+ "pshufw $0, %%mm0, %%mm0 \n\t" /* replicate the low 16-bit word of dc into all four words */
+ "pxor %%mm1, %%mm1 \n\t" /* mm1 = 0 */
+ "psubw %%mm0, %%mm1 \n\t" /* mm1 = -dc in every word */
+ "pmaxsw %%mm7, %%mm0 \n\t" /* mm0 = max(dc, 0): the positive part */
+ "pmaxsw %%mm7, %%mm1 \n\t" /* mm1 = max(-dc, 0): the negative part */
+ "packuswb %%mm0, %%mm0 \n\t" /* words to bytes with unsigned saturation: 8 copies of the positive part */
+ "packuswb %%mm1, %%mm1 \n\t" /* likewise, 8 copies of the negative part */
+ ::"r"(dc) /* input only; no outputs or clobbers are declared for mm0, mm1, mm7 */
+ ); /* NOTE(review): mm0 and mm1 must survive into the loop below, across compiler-generated loop code; nothing tells the compiler about this - confirm it is safe with the compilers in use. */
+ for(y=2; y--; dst += 4*stride){ /* two iterations of 4 rows each; dst advances by 4 rows per iteration */
+ asm volatile(
+ "movq %0, %%mm2 \n\t" /* load row 0 of this group (8 pixels) */
+ "movq %1, %%mm3 \n\t" /* load row 1 */
+ "movq %2, %%mm4 \n\t" /* load row 2 */
+ "movq %3, %%mm5 \n\t" /* load row 3 */
+ "paddusb %%mm0, %%mm2 \n\t" /* add positive part, saturating at 255 */
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t" /* subtract negative part, saturating at 0 */
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t" /* store all 64 bits back to row 0 */
+ "movq %%mm3, %1 \n\t" /* store row 1 */
+ "movq %%mm4, %2 \n\t" /* store row 2 */
+ "movq %%mm5, %3 \n\t" /* store row 3 */
+ :"+m"(*(uint64_t*)(dst+0*stride)), /* read-write memory operands, one 64-bit word per row */
+ "+m"(*(uint64_t*)(dst+1*stride)),
+ "+m"(*(uint64_t*)(dst+2*stride)),
+ "+m"(*(uint64_t*)(dst+3*stride))
+ );
+ }
+} /* no MMX state reset (emms) is issued in this function */
+
/***********************************/
/* deblocking */