aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
author: David Conrad <lessen42@gmail.com> 2010-04-17 02:04:30 +0000
committer: David Conrad <lessen42@gmail.com> 2010-04-17 02:04:30 +0000
commit: eb6a6cd788a172f146534c5fab9b98d6cbf59520 (patch)
tree: 23225d7976eefaf0292342e6ee8b4ac946efcb8e /libavcodec/x86
parent: f32f7d8b24d1228df447be85046b9346292d936e (diff)
download: ffmpeg-eb6a6cd788a172f146534c5fab9b98d6cbf59520.tar.gz
vp3: DC-only IDCT
2-4% faster overall decode
Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/dsputil_mmx.c 3
-rw-r--r-- libavcodec/x86/vp3dsp_mmx.c 41
-rw-r--r-- libavcodec/x86/vp3dsp_mmx.h 1
3 files changed, 45 insertions, 0 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 31387ebb30..cc2f881303 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2653,6 +2653,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
}
}
+ if (CONFIG_VP3_DECODER) {
+ c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
+ }
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
diff --git a/libavcodec/x86/vp3dsp_mmx.c b/libavcodec/x86/vp3dsp_mmx.c
index fead8e8cef..309dd4aa5d 100644
--- a/libavcodec/x86/vp3dsp_mmx.c
+++ b/libavcodec/x86/vp3dsp_mmx.c
@@ -395,3 +395,44 @@ void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
ff_vp3_idct_mmx(block);
add_pixels_clamped_mmx(block, dest, line_size);
}
+
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
+{
+ int dc = block[0];
+ dc = (46341*dc)>>16;
+ dc = (46341*dc + (8<<16))>>20;
+
+ __asm__ volatile(
+ "movd %3, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+
+#define DC_ADD \
+ "movq (%0), %%mm2 \n\t" \
+ "movq (%0,%1), %%mm3 \n\t" \
+ "paddusb %%mm0, %%mm2 \n\t" \
+ "movq (%0,%1,2), %%mm4 \n\t" \
+ "paddusb %%mm0, %%mm3 \n\t" \
+ "movq (%0,%2), %%mm5 \n\t" \
+ "paddusb %%mm0, %%mm4 \n\t" \
+ "paddusb %%mm0, %%mm5 \n\t" \
+ "psubusb %%mm1, %%mm2 \n\t" \
+ "psubusb %%mm1, %%mm3 \n\t" \
+ "movq %%mm2, (%0) \n\t" \
+ "psubusb %%mm1, %%mm4 \n\t" \
+ "movq %%mm3, (%0,%1) \n\t" \
+ "psubusb %%mm1, %%mm5 \n\t" \
+ "movq %%mm4, (%0,%1,2) \n\t" \
+ "movq %%mm5, (%0,%2) \n\t"
+
+ DC_ADD
+ "lea (%0,%1,4), %0 \n\t"
+ DC_ADD
+
+ : "+r"(dest)
+ : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
+ );
+}
diff --git a/libavcodec/x86/vp3dsp_mmx.h b/libavcodec/x86/vp3dsp_mmx.h
index e565a33023..e0ebf0b0f4 100644
--- a/libavcodec/x86/vp3dsp_mmx.h
+++ b/libavcodec/x86/vp3dsp_mmx.h
@@ -28,6 +28,7 @@
void ff_vp3_idct_mmx(int16_t *data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);