aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/vp8.c
diff options
context:
space:
mode:
authorJason Garrett-Glaser <darkshikari@gmail.com>2010-07-23 02:58:27 +0000
committerJason Garrett-Glaser <darkshikari@gmail.com>2010-07-23 02:58:27 +0000
commit8a467b2d44d20c1a0b731dce9edeff772732a558 (patch)
tree259b69473524a9db002d79b90009817f84af4113 /libavcodec/vp8.c
parentef38842f0bc97ce5b158f51f3e65aae4164fc6a5 (diff)
downloadffmpeg-8a467b2d44d20c1a0b731dce9edeff772732a558.tar.gz
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations. Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks? Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/vp8.c')
-rw-r--r--libavcodec/vp8.c64
1 files changed, 34 insertions, 30 deletions
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 18b69b8de1..92de1bc605 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1186,45 +1186,49 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
}
}
-static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
- VP8Macroblock *mb)
+static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
- int x, y, nnz;
+ int x, y, ch, nnz;
- if (mb->mode != MODE_I4x4)
+ if (mb->mode != MODE_I4x4) {
+ uint8_t *y_dst = dst[0];
for (y = 0; y < 4; y++) {
- for (x = 0; x < 4; x++) {
- nnz = s->non_zero_count_cache[y][x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
- else
- s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+ uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]);
+ if (nnz) {
+ if (nnz&~0x01010101) {
+ for (x = 0; x < 4; x++) {
+ nnz = s->non_zero_count_cache[y][x];
+ if (nnz) {
+ if (nnz == 1)
+ s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
+ else
+ s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+ }
+ }
+ } else {
+ s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
}
}
y_dst += 4*s->linesize;
}
+ }
- for (y = 0; y < 2; y++) {
- for (x = 0; x < 2; x++) {
- nnz = s->non_zero_count_cache[4][(y<<1)+x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
- else
- s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
- }
-
- nnz = s->non_zero_count_cache[5][(y<<1)+x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
- else
- s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
+ for (ch = 0; ch < 2; ch++) {
+ if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
+ uint8_t *ch_dst = dst[1+ch];
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+ if (nnz) {
+ if (nnz == 1)
+ s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+ else
+ s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+ }
+ }
+ ch_dst += 4*s->uvlinesize;
}
}
- u_dst += 4*s->uvlinesize;
- v_dst += 4*s->uvlinesize;
}
}
@@ -1511,7 +1515,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
if (!mb->skip) {
- idct_mb(s, dst[0], dst[1], dst[2], mb);
+ idct_mb(s, dst, mb);
} else {
AV_ZERO64(s->left_nnz);
AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned