aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/vp8dsp.c
diff options
context:
space:
mode:
author: Jason Garrett-Glaser <darkshikari@gmail.com> 2010-07-23 02:58:27 +0000
committer: Jason Garrett-Glaser <darkshikari@gmail.com> 2010-07-23 02:58:27 +0000
commit: 8a467b2d44d20c1a0b731dce9edeff772732a558 (patch)
tree: 259b69473524a9db002d79b90009817f84af4113 /libavcodec/vp8dsp.c
parent: ef38842f0bc97ce5b158f51f3e65aae4164fc6a5 (diff)
download: ffmpeg-8a467b2d44d20c1a0b731dce9edeff772732a558.tar.gz
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations. Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks? Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/vp8dsp.c')
-rw-r--r-- libavcodec/vp8dsp.c 26
1 files changed, 23 insertions, 3 deletions
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index 3e6463d598..64b09d52ee 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -109,6 +109,25 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
}
}
+static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride) // Applies the first coefficient of each of four blocks to four 4x4 pixel areas at dst, dst+4, dst+8 and dst+12.
+{
+ int i, j;
+ for (j = 0; j < 4; j++) { // one iteration per block
+ uint8_t *pix = dst+j*4; // block j starts 4 bytes after block j-1 on the same row
+ int dc = (block[j][0] + 4) >> 3; // add 4, then shift right by 3: rounded division of the coefficient by 8
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; // cm[p] reads ff_cropTbl[MAX_NEG_CROP + dc + p]; presumably p+dc clamped to 0..255 -- table is defined elsewhere, TODO confirm
+ block[j][0] = 0; // clear the consumed coefficient, even when dc turns out to be 0
+ if (!dc) // nothing to apply for this block, leave its pixels untouched
+ continue;
+ for (i = 0; i < 4; i++) { // four rows, four pixels per row
+ pix[0] = cm[pix[0]];
+ pix[1] = cm[pix[1]];
+ pix[2] = cm[pix[2]];
+ pix[3] = cm[pix[3]];
+ pix += stride; // advance to the next row; stride is used as the byte distance between rows
+ }
+ }
+}
// because I like only having two parameters to pass functions...
#define LOAD_PIXELS\
@@ -460,9 +479,10 @@ VP8_BILINEAR(4)
av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
{
- dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
- dsp->vp8_idct_add = vp8_idct_add_c;
- dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+ dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
+ dsp->vp8_idct_add = vp8_idct_add_c;
+ dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+ dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;
dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;