diff options
author | David Conrad <lessen42@gmail.com> | 2010-04-17 02:04:30 +0000 |
---|---|---|
committer | David Conrad <lessen42@gmail.com> | 2010-04-17 02:04:30 +0000 |
commit | eb6a6cd788a172f146534c5fab9b98d6cbf59520 (patch) | |
tree | 23225d7976eefaf0292342e6ee8b4ac946efcb8e /libavcodec/x86 | |
parent | f32f7d8b24d1228df447be85046b9346292d936e (diff) | |
download | ffmpeg-eb6a6cd788a172f146534c5fab9b98d6cbf59520.tar.gz |
vp3: DC-only IDCT
2-4% faster overall decode
Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 3 | ||||
-rw-r--r-- | libavcodec/x86/vp3dsp_mmx.c | 41 | ||||
-rw-r--r-- | libavcodec/x86/vp3dsp_mmx.h | 1 |
3 files changed, 45 insertions, 0 deletions
```diff
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 31387ebb30..cc2f881303 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2653,6 +2653,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
                 }
             }
+            if (CONFIG_VP3_DECODER) {
+                c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
+            }
 
 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
             c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
diff --git a/libavcodec/x86/vp3dsp_mmx.c b/libavcodec/x86/vp3dsp_mmx.c
index fead8e8cef..309dd4aa5d 100644
--- a/libavcodec/x86/vp3dsp_mmx.c
+++ b/libavcodec/x86/vp3dsp_mmx.c
@@ -395,3 +395,44 @@ void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
     ff_vp3_idct_mmx(block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
+
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
+{
+    int dc = block[0];
+    dc = (46341*dc)>>16;
+    dc = (46341*dc + (8<<16))>>20;
+
+    __asm__ volatile(
+        "movd %3, %%mm0 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t"
+        "pxor %%mm1, %%mm1 \n\t"
+        "psubw %%mm0, %%mm1 \n\t"
+        "packuswb %%mm0, %%mm0 \n\t"
+        "packuswb %%mm1, %%mm1 \n\t"
+
+#define DC_ADD \
+        "movq (%0), %%mm2 \n\t" \
+        "movq (%0,%1), %%mm3 \n\t" \
+        "paddusb %%mm0, %%mm2 \n\t" \
+        "movq (%0,%1,2), %%mm4 \n\t" \
+        "paddusb %%mm0, %%mm3 \n\t" \
+        "movq (%0,%2), %%mm5 \n\t" \
+        "paddusb %%mm0, %%mm4 \n\t" \
+        "paddusb %%mm0, %%mm5 \n\t" \
+        "psubusb %%mm1, %%mm2 \n\t" \
+        "psubusb %%mm1, %%mm3 \n\t" \
+        "movq %%mm2, (%0) \n\t" \
+        "psubusb %%mm1, %%mm4 \n\t" \
+        "movq %%mm3, (%0,%1) \n\t" \
+        "psubusb %%mm1, %%mm5 \n\t" \
+        "movq %%mm4, (%0,%1,2) \n\t" \
+        "movq %%mm5, (%0,%2) \n\t"
+
+        DC_ADD
+        "lea (%0,%1,4), %0 \n\t"
+        DC_ADD
+
+        : "+r"(dest)
+        : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
+    );
+}
diff --git a/libavcodec/x86/vp3dsp_mmx.h b/libavcodec/x86/vp3dsp_mmx.h
index e565a33023..e0ebf0b0f4 100644
--- a/libavcodec/x86/vp3dsp_mmx.h
+++ b/libavcodec/x86/vp3dsp_mmx.h
@@ -28,6 +28,7 @@
 void ff_vp3_idct_mmx(int16_t *data);
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
 
 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
```