avcodec/vvcdec: factor out the deblock boundary strength stage

Deriving the deblock boundary strength takes ~5% of CPU time for 8K clips, so it is worth running it as a standalone task stage. The new stage is scheduled right after the parse stage, which lets it reuse the CUs and TUs before they are released.
author: Nuo Mi <nuomi2021@gmail.com> 2024-10-04 22:31:12 +0800
committer: Nuo Mi <nuomi2021@gmail.com> 2024-10-16 20:28:09 +0800
commit: 634780f3cf7298401366c41e738ad728d26fee47 (patch)
tree: ec73cd63134bb7ce4e5a2c9b753d8366210cde52 /libavcodec
parent: 48a1a12968345bf673db1e1cbb5c64bd3529c50c (diff)
download: ffmpeg-634780f3cf7298401366c41e738ad728d26fee47.tar.gz
3 files changed, 45 insertions, 15 deletions
diff --git a/libavcodec/vvc/filter.c b/libavcodec/vvc/filter.c
index 25bef45eed..707fc24203 100644
--- a/libavcodec/vvc/filter.c
+++ b/libavcodec/vvc/filter.c
@@ -678,12 +678,14 @@ static void vvc_deblock_bs_chroma(const VVCLocalContext *lc,
 typedef void (*deblock_bs_fn)(const VVCLocalContext *lc, const int x0, const int y0,
     const int width, const int height, const int rs, const int vertical);
 
-static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0, const int rs, const int vertical)
+void ff_vvc_deblock_bs(VVCLocalContext *lc, const int rx, const int ry, const int rs)
 {
     const VVCFrameContext *fc  = lc->fc;
     const VVCSPS *sps          = fc->ps.sps;
     const VVCPPS *pps          = fc->ps.pps;
     const int ctb_size         = sps->ctb_size_y;
+    const int x0               = rx << sps->ctb_log2_size_y;
+    const int y0               = ry << sps->ctb_log2_size_y;
     const int x_end            = FFMIN(x0 + ctb_size, pps->width) >> MIN_TU_LOG2;
     const int y_end            = FFMIN(y0 + ctb_size, pps->height) >> MIN_TU_LOG2;
     const int has_chroma       = !!sps->r->sps_chroma_format_idc;
@@ -691,15 +693,18 @@ static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0
         vvc_deblock_bs_luma, vvc_deblock_bs_chroma
     };
 
-    for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) {
-        const int hs = sps->hshift[is_chroma];
-        const int vs = sps->vshift[is_chroma];
-        for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) {
-            for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) {
-                const int off = y * fc->ps.pps->min_tu_width + x;
-                if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) {
-                    deblock_bs[is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2,
-                        fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs, rs, vertical);
+    ff_vvc_decode_neighbour(lc, x0, y0, rx, ry, rs);
+    for (int vertical = 0; vertical <= 1; vertical++) {
+        for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) {
+            const int hs = sps->hshift[is_chroma];
+            const int vs = sps->vshift[is_chroma];
+            for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) {
+                for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) {
+                    const int off = y * fc->ps.pps->min_tu_width + x;
+                    if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) {
+                        deblock_bs[is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2,
+                            fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs, rs, vertical);
+                    }
                 }
             }
         }
@@ -795,8 +800,6 @@ static void vvc_deblock(const VVCLocalContext *lc, int x0, int y0, const int rs,
     const uint8_t no_p[4]  = { 0 };
     const uint8_t no_q[4]  = { 0 } ;
 
-    vvc_deblock_bs(lc, x0, y0, rs, vertical);
-
     if (!vertical) {
         FFSWAP(int, x_end, y_end);
         FFSWAP(int, x0, y0);
diff --git a/libavcodec/vvc/filter.h b/libavcodec/vvc/filter.h
index 03cc74e071..29abbd98ce 100644
--- a/libavcodec/vvc/filter.h
+++ b/libavcodec/vvc/filter.h
@@ -34,6 +34,15 @@
 void ff_vvc_lmcs_filter(const VVCLocalContext *lc, const int x0, const int y0);
 
 /**
+ * derive boundary strength for the CTU
+ * @param lc local context for CTU
+ * @param rx raster x position for the CTU
+ * @param ry raster y position for the CTU
+ * @param rs raster position for the CTU
+ */
+void ff_vvc_deblock_bs(VVCLocalContext *lc, const int rx, const int ry, const int rs);
+
+/**
  * vertical deblock filter for the CTU
  * @param lc local context for CTU
  * @param x0 x position for the CTU
diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c
index d75784e242..82c00dd4c9 100644
--- a/libavcodec/vvc/thread.c
+++ b/libavcodec/vvc/thread.c
@@ -42,6 +42,7 @@ typedef struct ProgressListener {
 typedef enum VVCTaskStage {
     VVC_TASK_STAGE_INIT,                    // for CTU(0, 0) only
     VVC_TASK_STAGE_PARSE,
+    VVC_TASK_STAGE_DEBLOCK_BS,
     VVC_TASK_STAGE_INTER,
     VVC_TASK_STAGE_RECON,
     VVC_TASK_STAGE_LMCS,
@@ -111,6 +112,7 @@ static void add_task(VVCContext *s, VVCTask *t)
     const int priorities[] = {
         0,                  // VVC_TASK_STAGE_INIT,
         0,                  // VVC_TASK_STAGE_PARSE,
+        1,                  // VVC_TASK_STAGE_DEBLOCK_BS
         // For an 8K clip, a CTU line completed in the reference frame may trigger 64 and more inter tasks.
         // We assign these tasks the lowest priority to avoid being overwhelmed with inter tasks.
         PRIORITY_LOWEST,    // VVC_TASK_STAGE_INTER
@@ -181,6 +183,8 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin
     // l:left, r:right, t: top, b: bottom
     static const uint8_t target_score[] =
     {
+        2,          //VVC_TASK_STAGE_DEBLOCK_BS,need l + t parse
+        0,          //VVC_TASK_STAGE_INTER,     not used
         2,          //VVC_TASK_STAGE_RECON,     need l + rt recon
         3,          //VVC_TASK_STAGE_LMCS,      need r + b + rb recon
         1,          //VVC_TASK_STAGE_DEBLOCK_V, need l deblock v
@@ -202,7 +206,7 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin
     } else if (stage == VVC_TASK_STAGE_INTER) {
         target = atomic_load(&t->target_inter_score);
     } else {
-        target = target_score[stage - VVC_TASK_STAGE_RECON];
+        target = target_score[stage - VVC_TASK_STAGE_DEBLOCK_BS];
     }
 
     //+1 for previous stage
@@ -348,6 +352,10 @@ static void task_stage_done(const VVCTask *t, VVCContext *s)
 
     //this is a reserve map of ready_score, ordered by zigzag
     if (stage == VVC_TASK_STAGE_PARSE) {
+        ADD( 0,  1, VVC_TASK_STAGE_DEBLOCK_BS);
+        ADD( 1,  0, VVC_TASK_STAGE_DEBLOCK_BS);
+        if (t->rx < 0 || t->rx >= ft->ctu_width || t->ry < 0 || t->ry >= ft->ctu_height)
+            return;
         parse_task_done(s, fc, t->rx, t->ry);
     } else if (stage == VVC_TASK_STAGE_RECON) {
         ADD(-1,  1, VVC_TASK_STAGE_RECON);
@@ -481,6 +489,14 @@ static int run_parse(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
     return 0;
 }
 
+static int run_deblock_bs(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
+{
+    if (!lc->sc->sh.r->sh_deblocking_filter_disabled_flag)
+        ff_vvc_deblock_bs(lc, t->rx, t->ry, t->rs);
+
+    return 0;
+}
+
 static int run_inter(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     VVCFrameContext *fc = lc->fc;
@@ -590,6 +606,7 @@ static int run_alf(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 const static char* task_name[] = {
     "INIT",
     "P",
+    "B",
     "I",
     "R",
     "L",
@@ -611,6 +628,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s, VVCLocalContext *lc)
     static const run_func run[] = {
         run_init,
         run_parse,
+        run_deblock_bs,
         run_inter,
         run_recon,
         run_lmcs,
@@ -701,9 +719,9 @@ static void frame_thread_init_score(VVCFrameContext *fc)
     const VVCFrameThread *ft = fc->ft;
     VVCTask task;
 
-    task_init(&task, VVC_TASK_STAGE_RECON, fc, 0, 0);
+    task_init(&task, VVC_TASK_STAGE_PARSE, fc, 0, 0);
 
-    for (int i = VVC_TASK_STAGE_RECON; i < VVC_TASK_STAGE_LAST; i++) {
+    for (int i = VVC_TASK_STAGE_PARSE; i < VVC_TASK_STAGE_LAST; i++) {
         task.stage = i;
 
         for (task.rx = -1; task.rx <= ft->ctu_width; task.rx++) {
author	Nuo Mi <nuomi2021@gmail.com>	2024-10-04 22:31:12 +0800
committer	Nuo Mi <nuomi2021@gmail.com>	2024-10-16 20:28:09 +0800
commit	634780f3cf7298401366c41e738ad728d26fee47 (patch)
tree	ec73cd63134bb7ce4e5a2c9b753d8366210cde52 /libavcodec
parent	48a1a12968345bf673db1e1cbb5c64bd3529c50c (diff)
download	ffmpeg-634780f3cf7298401366c41e738ad728d26fee47.tar.gz