hap: Decode using optimal slice sizes

Enjoy some cache locality and use fewer threads. About 5x speedup (from 60ms to 12ms to decode a 4k frame). Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
author: Luca Barbato <lu_zero@gentoo.org> 2015-07-17 03:07:07 +0200
committer: Luca Barbato <lu_zero@gentoo.org> 2015-07-21 12:14:25 +0200
commit: 977105407cae55876041dddbf4ce0934cdd4cd6c (patch)
tree: 306fb9cc7fa9c055347193456a8e0e2128a84afd
parent: 219b39a71a5694b1c14a07b86477f665a5b6849b (diff)
download: ffmpeg-977105407cae55876041dddbf4ce0934cdd4cd6c.tar.gz
2 files changed, 27 insertions, 8 deletions
diff --git a/libavcodec/hap.h b/libavcodec/hap.h
index 1250a6f683..75299fda74 100644
--- a/libavcodec/hap.h
+++ b/libavcodec/hap.h
@@ -46,6 +46,8 @@ typedef struct HapContext {
     uint8_t *snappied;       /* Buffer interacting with snappy */
     size_t max_snappy;       /* Maximum compressed size for snappy buffer */
 
+    int slice_size;          /* Optimal slice size */
+
     /* Pointer to the selected compress or decompress function */
     int (*tex_fun)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
 } HapContext;
diff --git a/libavcodec/hapdec.c b/libavcodec/hapdec.c
index 72db9f4702..5133a51323 100644
--- a/libavcodec/hapdec.c
+++ b/libavcodec/hapdec.c
@@ -137,16 +137,30 @@ static int setup_texture(AVCodecContext *avctx, size_t length)
 }
 
 static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
-                                     int block_nb, int thread_nb)
+                                     int slice, int thread_nb)
 {
     HapContext *ctx = avctx->priv_data;
     AVFrame *frame = arg;
-    int x = (TEXTURE_BLOCK_W * block_nb) % avctx->coded_width;
-    int y = TEXTURE_BLOCK_H * (TEXTURE_BLOCK_W * block_nb / avctx->coded_width);
-    uint8_t *p = frame->data[0] + x * 4 + y * frame->linesize[0];
-    const uint8_t *d = ctx->tex_data + block_nb * ctx->tex_rat;
+    const uint8_t *d = ctx->tex_data;
+    int w_block = avctx->coded_width / TEXTURE_BLOCK_W;
+    int x, y;
+    int start_slice, end_slice;
+
+    start_slice = slice * ctx->slice_size;
+    end_slice   = FFMIN(start_slice + ctx->slice_size, avctx->coded_height);
+
+    start_slice /= TEXTURE_BLOCK_H;
+    end_slice   /= TEXTURE_BLOCK_H;
+
+    for (y = start_slice; y < end_slice; y++) {
+        uint8_t *p = frame->data[0] + y * frame->linesize[0] * TEXTURE_BLOCK_H;
+        int off  = y * w_block;
+        for (x = 0; x < w_block; x++) {
+            ctx->tex_fun(p + x * 16, frame->linesize[0],
+                         d + (off + x) * ctx->tex_rat);
+        }
+    }
 
-    ctx->tex_fun(p, frame->linesize[0], d);
     return 0;
 }
 
@@ -156,7 +170,10 @@ static int hap_decode(AVCodecContext *avctx, void *data,
     HapContext *ctx = avctx->priv_data;
     ThreadFrame tframe;
     int ret, length;
-    int blocks = avctx->coded_width * avctx->coded_height / (TEXTURE_BLOCK_W * TEXTURE_BLOCK_H);
+    int slices = FFMIN(avctx->thread_count,
+                       avctx->coded_height / TEXTURE_BLOCK_H);
+
+    ctx->slice_size = avctx->coded_height / slices;
 
     bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
 
@@ -180,7 +197,7 @@ static int hap_decode(AVCodecContext *avctx, void *data,
     ff_thread_finish_setup(avctx);
 
     /* Use the decompress function on the texture, one block per thread */
-    avctx->execute2(avctx, decompress_texture_thread, tframe.f, NULL, blocks);
+    avctx->execute2(avctx, decompress_texture_thread, tframe.f, NULL, slices);
 
     /* Frame is ready to be output */
     tframe.f->pict_type = AV_PICTURE_TYPE_I;
author	Luca Barbato <lu_zero@gentoo.org>	2015-07-17 03:07:07 +0200
committer	Luca Barbato <lu_zero@gentoo.org>	2015-07-21 12:14:25 +0200
commit	977105407cae55876041dddbf4ce0934cdd4cd6c (patch)
tree	306fb9cc7fa9c055347193456a8e0e2128a84afd
parent	219b39a71a5694b1c14a07b86477f665a5b6849b (diff)
download	ffmpeg-977105407cae55876041dddbf4ce0934cdd4cd6c.tar.gz