avcodec/vp3: Share coefficient VLCs between threads

These VLCs are very big: The VP3 ones have 164382 elements, but due to overallocation enough memory for 313344 elements is allocated (1.195 MiB with sizeof(VLCElem) == 4); for VP4 the numbers are very similar, namely 164392 elements used and 311296 elements allocated. Since 1f4cf92cfbd3accbae582ac63126ed5570ddfd37, each frame thread has had its own copy of these VLCs. This commit fixes this by sharing these VLCs across threads. The approach used here will also make it easier to support stream reconfigurations with frame-multithreading in the future. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
author: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> 2023-09-21 01:28:54 +0200
committer: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> 2023-10-31 20:47:00 +0100
commit: 6c7a344b65cb7476d1575cb1504e3a53bcbc83e7 (patch)
tree: a4346325fa1d54710003143781fb8b0dc0a379bb
parent: 7fee90efac005b6f8f5878c9d004f3d2da0d2ead (diff)
download: ffmpeg-6c7a344b65cb7476d1575cb1504e3a53bcbc83e7.tar.gz
1 files changed, 65 insertions, 34 deletions
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index e6ec644cb0..f2b9203a04 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -45,8 +45,10 @@
 #include "decode.h"
 #include "get_bits.h"
 #include "hpeldsp.h"
+#include "internal.h"
 #include "jpegquanttables.h"
 #include "mathops.h"
+#include "refstruct.h"
 #include "thread.h"
 #include "threadframe.h"
 #include "videodsp.h"
@@ -187,6 +189,10 @@ typedef struct HuffTable {
     uint8_t   nb_entries;
 } HuffTable;
 
+typedef struct CoeffVLCs {
+    VLC vlcs[80];
+} CoeffVLCs;
+
 typedef struct Vp3DecodeContext {
     AVCodecContext *avctx;
     int theora, theora_tables, theora_header;
@@ -289,9 +295,12 @@ typedef struct Vp3DecodeContext {
     int *nkf_coded_fragment_list;
     int num_kf_coded_fragment[3];
 
-    /* The first 16 of the following VLCs are for the dc coefficients;
-       the others are four groups of 16 VLCs each for ac coefficients. */
-    VLC coeff_vlc[5 * 16];
+    /**
+     * The first 16 of the following VLCs are for the dc coefficients;
+     * the others are four groups of 16 VLCs each for ac coefficients.
+     * This is a RefStruct reference to share these VLCs between threads.
+     */
+    CoeffVLCs *coeff_vlc;
 
     /* these arrays need to be on 16-byte boundaries since SSE2 operations
      * index into them */
@@ -365,8 +374,7 @@ static av_cold int vp3_decode_end(AVCodecContext *avctx)
     av_frame_free(&s->last_frame.f);
     av_frame_free(&s->golden_frame.f);
 
-    for (int i = 0; i < FF_ARRAY_ELEMS(s->coeff_vlc); i++)
-        ff_vlc_free(&s->coeff_vlc[i]);
+    ff_refstruct_unref(&s->coeff_vlc);
 
     return 0;
 }
@@ -1295,13 +1303,14 @@ static void reverse_dc_prediction(Vp3DecodeContext *s,
  */
 static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
 {
+    const VLC *coeff_vlc = s->coeff_vlc->vlcs;
     int dc_y_table;
     int dc_c_table;
     int ac_y_table;
     int ac_c_table;
     int residual_eob_run = 0;
-    VLC *y_tables[64];
-    VLC *c_tables[64];
+    const VLC *y_tables[64];
+    const VLC *c_tables[64];
 
     s->dct_tokens[0][0] = s->dct_tokens_base;
 
@@ -1313,7 +1322,7 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
     dc_c_table = get_bits(gb, 4);
 
     /* unpack the Y plane DC coefficients */
-    residual_eob_run = unpack_vlcs(s, gb, &s->coeff_vlc[dc_y_table], 0,
+    residual_eob_run = unpack_vlcs(s, gb, &coeff_vlc[dc_y_table], 0,
                                    0, residual_eob_run);
     if (residual_eob_run < 0)
         return residual_eob_run;
@@ -1324,11 +1333,11 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
     reverse_dc_prediction(s, 0, s->fragment_width[0], s->fragment_height[0]);
 
     /* unpack the C plane DC coefficients */
-    residual_eob_run = unpack_vlcs(s, gb, &s->coeff_vlc[dc_c_table], 0,
+    residual_eob_run = unpack_vlcs(s, gb, &coeff_vlc[dc_c_table], 0,
                                    1, residual_eob_run);
     if (residual_eob_run < 0)
         return residual_eob_run;
-    residual_eob_run = unpack_vlcs(s, gb, &s->coeff_vlc[dc_c_table], 0,
+    residual_eob_run = unpack_vlcs(s, gb, &coeff_vlc[dc_c_table], 0,
                                    2, residual_eob_run);
     if (residual_eob_run < 0)
         return residual_eob_run;
@@ -1350,23 +1359,23 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
     /* build tables of AC VLC tables */
     for (int i = 1; i <= 5; i++) {
         /* AC VLC table group 1 */
-        y_tables[i] = &s->coeff_vlc[ac_y_table + 16];
-        c_tables[i] = &s->coeff_vlc[ac_c_table + 16];
+        y_tables[i] = &coeff_vlc[ac_y_table + 16];
+        c_tables[i] = &coeff_vlc[ac_c_table + 16];
     }
     for (int i = 6; i <= 14; i++) {
         /* AC VLC table group 2 */
-        y_tables[i] = &s->coeff_vlc[ac_y_table + 32];
-        c_tables[i] = &s->coeff_vlc[ac_c_table + 32];
+        y_tables[i] = &coeff_vlc[ac_y_table + 32];
+        c_tables[i] = &coeff_vlc[ac_c_table + 32];
     }
     for (int i = 15; i <= 27; i++) {
         /* AC VLC table group 3 */
-        y_tables[i] = &s->coeff_vlc[ac_y_table + 48];
-        c_tables[i] = &s->coeff_vlc[ac_c_table + 48];
+        y_tables[i] = &coeff_vlc[ac_y_table + 48];
+        c_tables[i] = &coeff_vlc[ac_c_table + 48];
     }
     for (int i = 28; i <= 63; i++) {
         /* AC VLC table group 4 */
-        y_tables[i] = &s->coeff_vlc[ac_y_table + 64];
-        c_tables[i] = &s->coeff_vlc[ac_c_table + 64];
+        y_tables[i] = &coeff_vlc[ac_y_table + 64];
+        c_tables[i] = &coeff_vlc[ac_c_table + 64];
     }
 
     /* decode all AC coefficients */
@@ -1517,6 +1526,7 @@ static void vp4_set_tokens_base(Vp3DecodeContext *s)
 
 static int vp4_unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
 {
+    const VLC *coeff_vlc = s->coeff_vlc->vlcs;
     int dc_y_table;
     int dc_c_table;
     int ac_y_table;
@@ -1539,27 +1549,27 @@ static int vp4_unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
     /* build tables of DC/AC VLC tables */
 
     /* DC table group */
-    tables[0][0] = &s->coeff_vlc[dc_y_table];
-    tables[1][0] = &s->coeff_vlc[dc_c_table];
+    tables[0][0] = &coeff_vlc[dc_y_table];
+    tables[1][0] = &coeff_vlc[dc_c_table];
     for (int i = 1; i <= 5; i++) {
         /* AC VLC table group 1 */
-        tables[0][i] = &s->coeff_vlc[ac_y_table + 16];
-        tables[1][i] = &s->coeff_vlc[ac_c_table + 16];
+        tables[0][i] = &coeff_vlc[ac_y_table + 16];
+        tables[1][i] = &coeff_vlc[ac_c_table + 16];
     }
     for (int i = 6; i <= 14; i++) {
         /* AC VLC table group 2 */
-        tables[0][i] = &s->coeff_vlc[ac_y_table + 32];
-        tables[1][i] = &s->coeff_vlc[ac_c_table + 32];
+        tables[0][i] = &coeff_vlc[ac_y_table + 32];
+        tables[1][i] = &coeff_vlc[ac_c_table + 32];
     }
     for (int i = 15; i <= 27; i++) {
         /* AC VLC table group 3 */
-        tables[0][i] = &s->coeff_vlc[ac_y_table + 48];
-        tables[1][i] = &s->coeff_vlc[ac_c_table + 48];
+        tables[0][i] = &coeff_vlc[ac_y_table + 48];
+        tables[1][i] = &coeff_vlc[ac_c_table + 48];
     }
     for (int i = 28; i <= 63; i++) {
         /* AC VLC table group 4 */
-        tables[0][i] = &s->coeff_vlc[ac_y_table + 64];
-        tables[1][i] = &s->coeff_vlc[ac_c_table + 64];
+        tables[0][i] = &coeff_vlc[ac_y_table + 64];
+        tables[1][i] = &coeff_vlc[ac_c_table + 64];
     }
 
     vp4_set_tokens_base(s);
@@ -2355,6 +2365,14 @@ static av_cold int init_frames(Vp3DecodeContext *s)
     return 0;
 }
 
+static av_cold void free_vlc_tables(FFRefStructOpaque unused, void *obj)
+{
+    CoeffVLCs *vlcs = obj;
+
+    for (int i = 0; i < FF_ARRAY_ELEMS(vlcs->vlcs); i++)
+        ff_vlc_free(&vlcs->vlcs[i]);
+}
+
 static av_cold int vp3_decode_init(AVCodecContext *avctx)
 {
     static AVOnce init_static_once = AV_ONCE_INIT;
@@ -2443,8 +2461,6 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     s->fragment_start[2] = y_fragment_count + c_fragment_count;
 
     if (!s->theora_tables) {
-        const uint8_t (*bias_tabs)[32][2];
-
         for (int i = 0; i < 64; i++) {
             s->coded_dc_scale_factor[0][i] = s->version < 2 ? vp31_dc_scale_factor[i] : vp4_y_dc_scale_factor[i];
             s->coded_dc_scale_factor[1][i] = s->version < 2 ? vp31_dc_scale_factor[i] : vp4_uv_dc_scale_factor[i];
@@ -2463,11 +2479,23 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
                 s->qr_base[inter][plane][1] = 2 * inter + (!!plane) * !inter;
             }
         }
+    }
+
+    if (!avctx->internal->is_copy) {
+        CoeffVLCs *vlcs = ff_refstruct_alloc_ext(sizeof(*s->coeff_vlc), 0,
+                                                 NULL, free_vlc_tables);
+        if (!vlcs)
+            return AVERROR(ENOMEM);
+
+        s->coeff_vlc = vlcs;
+
+    if (!s->theora_tables) {
+        const uint8_t (*bias_tabs)[32][2];
 
         /* init VLC tables */
         bias_tabs = CONFIG_VP4_DECODER && s->version >= 2 ? vp4_bias : vp3_bias;
-        for (int i = 0; i < FF_ARRAY_ELEMS(s->coeff_vlc); i++) {
-            ret = ff_vlc_init_from_lengths(&s->coeff_vlc[i], 11, 32,
+        for (int i = 0; i < FF_ARRAY_ELEMS(vlcs->vlcs); i++) {
+            ret = ff_vlc_init_from_lengths(&vlcs->vlcs[i], 11, 32,
                                            &bias_tabs[i][0][1], 2,
                                            &bias_tabs[i][0][0], 2, 1,
                                            0, 0, avctx);
@@ -2475,10 +2503,10 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
                 return ret;
         }
     } else {
-        for (int i = 0; i < FF_ARRAY_ELEMS(s->coeff_vlc); i++) {
+        for (int i = 0; i < FF_ARRAY_ELEMS(vlcs->vlcs); i++) {
             const HuffTable *tab = &s->huffman_table[i];
 
-            ret = ff_vlc_init_from_lengths(&s->coeff_vlc[i], 11, tab->nb_entries,
+            ret = ff_vlc_init_from_lengths(&vlcs->vlcs[i], 11, tab->nb_entries,
                                            &tab->entries[0].len, sizeof(*tab->entries),
                                            &tab->entries[0].sym, sizeof(*tab->entries), 1,
                                            0, 0, avctx);
@@ -2486,6 +2514,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
                 return ret;
         }
     }
+    }
 
     ff_thread_once(&init_static_once, init_tables_once);
 
@@ -2534,6 +2563,8 @@ static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *
     const Vp3DecodeContext *s1 = src->priv_data;
     int qps_changed = 0, err;
 
+    ff_refstruct_replace(&s->coeff_vlc, s1->coeff_vlc);
+
     if (!s1->current_frame.f->data[0] ||
         s->width != s1->width || s->height != s1->height) {
         if (s != s1)
author	Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2023-09-21 01:28:54 +0200
committer	Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2023-10-31 20:47:00 +0100
commit	6c7a344b65cb7476d1575cb1504e3a53bcbc83e7 (patch)
tree	a4346325fa1d54710003143781fb8b0dc0a379bb
parent	7fee90efac005b6f8f5878c9d004f3d2da0d2ead (diff)
download	ffmpeg-6c7a344b65cb7476d1575cb1504e3a53bcbc83e7.tar.gz