aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/vp3dsp_mmx.c
diff options
context:
space:
mode:
author David Conrad <lessen42@gmail.com> 2010-04-17 02:04:30 +0000
committer David Conrad <lessen42@gmail.com> 2010-04-17 02:04:30 +0000
commit eb6a6cd788a172f146534c5fab9b98d6cbf59520 (patch)
tree 23225d7976eefaf0292342e6ee8b4ac946efcb8e /libavcodec/x86/vp3dsp_mmx.c
parent f32f7d8b24d1228df447be85046b9346292d936e (diff)
download ffmpeg-eb6a6cd788a172f146534c5fab9b98d6cbf59520.tar.gz
vp3: DC-only IDCT
2-4% faster overall decode. Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/vp3dsp_mmx.c')
-rw-r--r-- libavcodec/x86/vp3dsp_mmx.c 41
1 files changed, 41 insertions, 0 deletions
diff --git a/libavcodec/x86/vp3dsp_mmx.c b/libavcodec/x86/vp3dsp_mmx.c
index fead8e8cef..309dd4aa5d 100644
--- a/libavcodec/x86/vp3dsp_mmx.c
+++ b/libavcodec/x86/vp3dsp_mmx.c
@@ -395,3 +395,44 @@ void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
ff_vp3_idct_mmx(block);
add_pixels_clamped_mmx(block, dest, line_size);
}
+
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
+{   /* DC-only IDCT + add: offsets 8 rows of 8 bytes at dest (row stride linesize) by a value derived from block[0], saturating to 0..255. */
+    int dc = block[0];                 /* only the first coefficient is read */
+    dc = (46341*dc)>>16;               /* scale by 46341/65536 (~0.7071) */
+    dc = (46341*dc + (8<<16))>>20;     /* same scale again, then +8 and a further >>4 (rounded divide by 16) */
+    /* No emms is issued in this function. NOTE(review): presumably the caller restores the FPU/MMX state - confirm. */
+    __asm__ volatile(
+        "movd %3, %%mm0 \n\t"              /* mm0 = dc */
+        "pshufw $0, %%mm0, %%mm0 \n\t"     /* replicate the low word into all four words */
+        "pxor %%mm1, %%mm1 \n\t"
+        "psubw %%mm0, %%mm1 \n\t"          /* mm1 = -dc in every word */
+        "packuswb %%mm0, %%mm0 \n\t"       /* mm0 = 8 bytes of dc clamped to 0..255 (all zero when dc < 0) */
+        "packuswb %%mm1, %%mm1 \n\t"       /* mm1 = 8 bytes of -dc clamped to 0..255 (all zero when dc > 0) */
+        /* DC_ADD: for the rows at %0, %0+%1, %0+2*%1 and %0+%2, load 8 bytes, saturating-add mm0, saturating-subtract mm1, store back. */
+#define DC_ADD \
+        "movq (%0), %%mm2 \n\t" \
+        "movq (%0,%1), %%mm3 \n\t" \
+        "paddusb %%mm0, %%mm2 \n\t" \
+        "movq (%0,%1,2), %%mm4 \n\t" \
+        "paddusb %%mm0, %%mm3 \n\t" \
+        "movq (%0,%2), %%mm5 \n\t" \
+        "paddusb %%mm0, %%mm4 \n\t" \
+        "paddusb %%mm0, %%mm5 \n\t" \
+        "psubusb %%mm1, %%mm2 \n\t" \
+        "psubusb %%mm1, %%mm3 \n\t" \
+        "movq %%mm2, (%0) \n\t" \
+        "psubusb %%mm1, %%mm4 \n\t" \
+        "movq %%mm3, (%0,%1) \n\t" \
+        "psubusb %%mm1, %%mm5 \n\t" \
+        "movq %%mm4, (%0,%1,2) \n\t" \
+        "movq %%mm5, (%0,%2) \n\t"
+        /* Rows 0-3, then advance dest by 4*linesize and repeat for rows 4-7. */
+        DC_ADD
+        "lea (%0,%1,4), %0 \n\t"
+        DC_ADD
+        /* Operands: %0 = dest (updated by the lea), %1 = linesize, %2 = 3*linesize, %3 = dc. */
+        : "+r"(dest)
+        : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
+        : "memory"); /* the stores through dest are not expressed as output operands, so declare that memory is modified */
+}