VP8: 30% faster idct_mb

Take shortcuts based on statistically common situations. Add a 4-at-a-time idct_dc function (MMX and SSE2), since rows of four DC-only DCT blocks are common. TODO: consider tying this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks. Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Jason Garrett-Glaser <darkshikari@gmail.com> 2010-07-23 02:58:27 +0000
committer: Jason Garrett-Glaser <darkshikari@gmail.com> 2010-07-23 02:58:27 +0000
commit: 8a467b2d44d20c1a0b731dce9edeff772732a558 (patch)
tree: 259b69473524a9db002d79b90009817f84af4113 /libavcodec/vp8.c
parent: ef38842f0bc97ce5b158f51f3e65aae4164fc6a5 (diff)
download: ffmpeg-8a467b2d44d20c1a0b731dce9edeff772732a558.tar.gz
1 file changed, 34 insertions, 30 deletions
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 18b69b8de1..92de1bc605 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1186,45 +1186,49 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
     }
 }
 
-static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
-                    VP8Macroblock *mb)
+static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
 {
-    int x, y, nnz;
+    int x, y, ch, nnz;
 
-    if (mb->mode != MODE_I4x4)
+    if (mb->mode != MODE_I4x4) {
+        uint8_t *y_dst = dst[0];
         for (y = 0; y < 4; y++) {
-            for (x = 0; x < 4; x++) {
-                nnz = s->non_zero_count_cache[y][x];
-                if (nnz) {
-                    if (nnz == 1)
-                        s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
-                    else
-                        s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+            uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]);
+            if (nnz) {
+                if (nnz&~0x01010101) {
+                    for (x = 0; x < 4; x++) {
+                        nnz = s->non_zero_count_cache[y][x];
+                        if (nnz) {
+                            if (nnz == 1)
+                                s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
+                            else
+                                s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+                        }
+                    }
+                } else {
+                    s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
                 }
             }
             y_dst += 4*s->linesize;
         }
+    }
 
-    for (y = 0; y < 2; y++) {
-        for (x = 0; x < 2; x++) {
-            nnz = s->non_zero_count_cache[4][(y<<1)+x];
-            if (nnz) {
-                if (nnz == 1)
-                    s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
-                else
-                    s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
-            }
-
-            nnz = s->non_zero_count_cache[5][(y<<1)+x];
-            if (nnz) {
-                if (nnz == 1)
-                    s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
-                else
-                    s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
+    for (ch = 0; ch < 2; ch++) {
+        if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
+            uint8_t *ch_dst = dst[1+ch];
+            for (y = 0; y < 2; y++) {
+                for (x = 0; x < 2; x++) {
+                    nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+                    if (nnz) {
+                        if (nnz == 1)
+                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                        else
+                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                    }
+                }
+                ch_dst += 4*s->uvlinesize;
             }
         }
-        u_dst += 4*s->uvlinesize;
-        v_dst += 4*s->uvlinesize;
     }
 }
 
@@ -1511,7 +1515,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
 
             if (!mb->skip) {
-                idct_mb(s, dst[0], dst[1], dst[2], mb);
+                idct_mb(s, dst, mb);
             } else {
                 AV_ZERO64(s->left_nnz);
                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
author	Jason Garrett-Glaser <darkshikari@gmail.com>	2010-07-23 02:58:27 +0000
committer	Jason Garrett-Glaser <darkshikari@gmail.com>	2010-07-23 02:58:27 +0000
commit	8a467b2d44d20c1a0b731dce9edeff772732a558 (patch)
tree	259b69473524a9db002d79b90009817f84af4113 /libavcodec/vp8.c
parent	ef38842f0bc97ce5b158f51f3e65aae4164fc6a5 (diff)
download	ffmpeg-8a467b2d44d20c1a0b731dce9edeff772732a558.tar.gz