diff options
author | Nuo Mi <nuomi2021@gmail.com> | 2024-10-04 22:31:12 +0800 |
---|---|---|
committer | Nuo Mi <nuomi2021@gmail.com> | 2024-10-16 20:28:09 +0800 |
commit | 634780f3cf7298401366c41e738ad728d26fee47 (patch) | |
tree | ec73cd63134bb7ce4e5a2c9b753d8366210cde52 /libavcodec | |
parent | 48a1a12968345bf673db1e1cbb5c64bd3529c50c (diff) | |
download | ffmpeg-634780f3cf7298401366c41e738ad728d26fee47.tar.gz |
avcodec/vvcdec: factor out deblock boundary strength stage
The deblock boundary strength stage uses ~5% of CPU resources for 8K clips,
so it is worth making it a standalone stage. The stage now runs right after
the parse stage, which lets us reuse the CUs and TUs before they are released.
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/vvc/filter.c | 27 | ||||
-rw-r--r-- | libavcodec/vvc/filter.h | 9 | ||||
-rw-r--r-- | libavcodec/vvc/thread.c | 24 |
3 files changed, 45 insertions, 15 deletions
diff --git a/libavcodec/vvc/filter.c b/libavcodec/vvc/filter.c index 25bef45eed..707fc24203 100644 --- a/libavcodec/vvc/filter.c +++ b/libavcodec/vvc/filter.c @@ -678,12 +678,14 @@ static void vvc_deblock_bs_chroma(const VVCLocalContext *lc, typedef void (*deblock_bs_fn)(const VVCLocalContext *lc, const int x0, const int y0, const int width, const int height, const int rs, const int vertical); -static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0, const int rs, const int vertical) +void ff_vvc_deblock_bs(VVCLocalContext *lc, const int rx, const int ry, const int rs) { const VVCFrameContext *fc = lc->fc; const VVCSPS *sps = fc->ps.sps; const VVCPPS *pps = fc->ps.pps; const int ctb_size = sps->ctb_size_y; + const int x0 = rx << sps->ctb_log2_size_y; + const int y0 = ry << sps->ctb_log2_size_y; const int x_end = FFMIN(x0 + ctb_size, pps->width) >> MIN_TU_LOG2; const int y_end = FFMIN(y0 + ctb_size, pps->height) >> MIN_TU_LOG2; const int has_chroma = !!sps->r->sps_chroma_format_idc; @@ -691,15 +693,18 @@ static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0 vvc_deblock_bs_luma, vvc_deblock_bs_chroma }; - for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) { - const int hs = sps->hshift[is_chroma]; - const int vs = sps->vshift[is_chroma]; - for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) { - for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) { - const int off = y * fc->ps.pps->min_tu_width + x; - if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) { - deblock_bs[is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2, - fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs, rs, vertical); + ff_vvc_decode_neighbour(lc, x0, y0, rx, ry, rs); + for (int vertical = 0; vertical <= 1; vertical++) { + for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) { + const int hs = sps->hshift[is_chroma]; + const int vs = 
sps->vshift[is_chroma]; + for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) { + for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) { + const int off = y * fc->ps.pps->min_tu_width + x; + if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) { + deblock_bs[is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2, + fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs, rs, vertical); + } } } } @@ -795,8 +800,6 @@ static void vvc_deblock(const VVCLocalContext *lc, int x0, int y0, const int rs, const uint8_t no_p[4] = { 0 }; const uint8_t no_q[4] = { 0 } ; - vvc_deblock_bs(lc, x0, y0, rs, vertical); - if (!vertical) { FFSWAP(int, x_end, y_end); FFSWAP(int, x0, y0); diff --git a/libavcodec/vvc/filter.h b/libavcodec/vvc/filter.h index 03cc74e071..29abbd98ce 100644 --- a/libavcodec/vvc/filter.h +++ b/libavcodec/vvc/filter.h @@ -34,6 +34,15 @@ void ff_vvc_lmcs_filter(const VVCLocalContext *lc, const int x0, const int y0); /** + * derive boundary strength for the CTU + * @param lc local context for CTU + * @param rx raster x position for the CTU + * @param ry raster y position for the CTU + * @param rs raster position for the CTU + */ +void ff_vvc_deblock_bs(VVCLocalContext *lc, const int rx, const int ry, const int rs); + +/** * vertical deblock filter for the CTU * @param lc local context for CTU * @param x0 x position for the CTU diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c index d75784e242..82c00dd4c9 100644 --- a/libavcodec/vvc/thread.c +++ b/libavcodec/vvc/thread.c @@ -42,6 +42,7 @@ typedef struct ProgressListener { typedef enum VVCTaskStage { VVC_TASK_STAGE_INIT, // for CTU(0, 0) only VVC_TASK_STAGE_PARSE, + VVC_TASK_STAGE_DEBLOCK_BS, VVC_TASK_STAGE_INTER, VVC_TASK_STAGE_RECON, VVC_TASK_STAGE_LMCS, @@ -111,6 +112,7 @@ static void add_task(VVCContext *s, VVCTask *t) const int priorities[] = { 0, // VVC_TASK_STAGE_INIT, 0, // VVC_TASK_STAGE_PARSE, + 1, // 
VVC_TASK_STAGE_DEBLOCK_BS // For an 8K clip, a CTU line completed in the reference frame may trigger 64 and more inter tasks. // We assign these tasks the lowest priority to avoid being overwhelmed with inter tasks. PRIORITY_LOWEST, // VVC_TASK_STAGE_INTER @@ -181,6 +183,8 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin // l:left, r:right, t: top, b: bottom static const uint8_t target_score[] = { + 2, //VVC_TASK_STAGE_DEBLOCK_BS,need l + t parse + 0, //VVC_TASK_STAGE_INTER, not used 2, //VVC_TASK_STAGE_RECON, need l + rt recon 3, //VVC_TASK_STAGE_LMCS, need r + b + rb recon 1, //VVC_TASK_STAGE_DEBLOCK_V, need l deblock v @@ -202,7 +206,7 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin } else if (stage == VVC_TASK_STAGE_INTER) { target = atomic_load(&t->target_inter_score); } else { - target = target_score[stage - VVC_TASK_STAGE_RECON]; + target = target_score[stage - VVC_TASK_STAGE_DEBLOCK_BS]; } //+1 for previous stage @@ -348,6 +352,10 @@ static void task_stage_done(const VVCTask *t, VVCContext *s) //this is a reserve map of ready_score, ordered by zigzag if (stage == VVC_TASK_STAGE_PARSE) { + ADD( 0, 1, VVC_TASK_STAGE_DEBLOCK_BS); + ADD( 1, 0, VVC_TASK_STAGE_DEBLOCK_BS); + if (t->rx < 0 || t->rx >= ft->ctu_width || t->ry < 0 || t->ry >= ft->ctu_height) + return; parse_task_done(s, fc, t->rx, t->ry); } else if (stage == VVC_TASK_STAGE_RECON) { ADD(-1, 1, VVC_TASK_STAGE_RECON); @@ -481,6 +489,14 @@ static int run_parse(VVCContext *s, VVCLocalContext *lc, VVCTask *t) return 0; } +static int run_deblock_bs(VVCContext *s, VVCLocalContext *lc, VVCTask *t) +{ + if (!lc->sc->sh.r->sh_deblocking_filter_disabled_flag) + ff_vvc_deblock_bs(lc, t->rx, t->ry, t->rs); + + return 0; +} + static int run_inter(VVCContext *s, VVCLocalContext *lc, VVCTask *t) { VVCFrameContext *fc = lc->fc; @@ -590,6 +606,7 @@ static int run_alf(VVCContext *s, VVCLocalContext *lc, VVCTask *t) const static char* task_name[] = { 
"INIT", "P", + "B", "I", "R", "L", @@ -611,6 +628,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s, VVCLocalContext *lc) static const run_func run[] = { run_init, run_parse, + run_deblock_bs, run_inter, run_recon, run_lmcs, @@ -701,9 +719,9 @@ static void frame_thread_init_score(VVCFrameContext *fc) const VVCFrameThread *ft = fc->ft; VVCTask task; - task_init(&task, VVC_TASK_STAGE_RECON, fc, 0, 0); + task_init(&task, VVC_TASK_STAGE_PARSE, fc, 0, 0); - for (int i = VVC_TASK_STAGE_RECON; i < VVC_TASK_STAGE_LAST; i++) { + for (int i = VVC_TASK_STAGE_PARSE; i < VVC_TASK_STAGE_LAST; i++) { task.stage = i; for (task.rx = -1; task.rx <= ft->ctu_width; task.rx++) { |