diff options
author | Andreas Rheinhardt <andreas.rheinhardt@outlook.com> | 2023-09-21 01:28:54 +0200 |
---|---|---|
committer | Andreas Rheinhardt <andreas.rheinhardt@outlook.com> | 2023-10-31 20:47:00 +0100 |
commit | 6c7a344b65cb7476d1575cb1504e3a53bcbc83e7 (patch) | |
tree | a4346325fa1d54710003143781fb8b0dc0a379bb | |
parent | 7fee90efac005b6f8f5878c9d004f3d2da0d2ead (diff) | |
download | ffmpeg-6c7a344b65cb7476d1575cb1504e3a53bcbc83e7.tar.gz |
avcodec/vp3: Share coefficient VLCs between threads
These VLCs are very big: The VP3 ones have 164382 elements,
but due to overallocation, enough memory for 313344 elements
is allocated (1.195 MiB with sizeof(VLCElem) == 4);
for VP4 the numbers are very similar, namely 311296 allocated for 164392
elements. Since 1f4cf92cfbd3accbae582ac63126ed5570ddfd37, each
frame thread has its own copy of these VLCs.
This commit fixes this by sharing these VLCs across threads.
The approach used here will also make it easier to support
stream reconfigurations in case of frame-multithreading
in the future.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
-rw-r--r-- | libavcodec/vp3.c | 99 |
1 files changed, 65 insertions, 34 deletions
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c index e6ec644cb0..f2b9203a04 100644 --- a/libavcodec/vp3.c +++ b/libavcodec/vp3.c @@ -45,8 +45,10 @@ #include "decode.h" #include "get_bits.h" #include "hpeldsp.h" +#include "internal.h" #include "jpegquanttables.h" #include "mathops.h" +#include "refstruct.h" #include "thread.h" #include "threadframe.h" #include "videodsp.h" @@ -187,6 +189,10 @@ typedef struct HuffTable { uint8_t nb_entries; } HuffTable; +typedef struct CoeffVLCs { + VLC vlcs[80]; +} CoeffVLCs; + typedef struct Vp3DecodeContext { AVCodecContext *avctx; int theora, theora_tables, theora_header; @@ -289,9 +295,12 @@ typedef struct Vp3DecodeContext { int *nkf_coded_fragment_list; int num_kf_coded_fragment[3]; - /* The first 16 of the following VLCs are for the dc coefficients; - the others are four groups of 16 VLCs each for ac coefficients. */ - VLC coeff_vlc[5 * 16]; + /** + * The first 16 of the following VLCs are for the dc coefficients; + * the others are four groups of 16 VLCs each for ac coefficients. + * This is a RefStruct reference to share these VLCs between threads. 
+ */ + CoeffVLCs *coeff_vlc; /* these arrays need to be on 16-byte boundaries since SSE2 operations * index into them */ @@ -365,8 +374,7 @@ static av_cold int vp3_decode_end(AVCodecContext *avctx) av_frame_free(&s->last_frame.f); av_frame_free(&s->golden_frame.f); - for (int i = 0; i < FF_ARRAY_ELEMS(s->coeff_vlc); i++) - ff_vlc_free(&s->coeff_vlc[i]); + ff_refstruct_unref(&s->coeff_vlc); return 0; } @@ -1295,13 +1303,14 @@ static void reverse_dc_prediction(Vp3DecodeContext *s, */ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb) { + const VLC *coeff_vlc = s->coeff_vlc->vlcs; int dc_y_table; int dc_c_table; int ac_y_table; int ac_c_table; int residual_eob_run = 0; - VLC *y_tables[64]; - VLC *c_tables[64]; + const VLC *y_tables[64]; + const VLC *c_tables[64]; s->dct_tokens[0][0] = s->dct_tokens_base; @@ -1313,7 +1322,7 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb) dc_c_table = get_bits(gb, 4); /* unpack the Y plane DC coefficients */ - residual_eob_run = unpack_vlcs(s, gb, &s->coeff_vlc[dc_y_table], 0, + residual_eob_run = unpack_vlcs(s, gb, &coeff_vlc[dc_y_table], 0, 0, residual_eob_run); if (residual_eob_run < 0) return residual_eob_run; @@ -1324,11 +1333,11 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb) reverse_dc_prediction(s, 0, s->fragment_width[0], s->fragment_height[0]); /* unpack the C plane DC coefficients */ - residual_eob_run = unpack_vlcs(s, gb, &s->coeff_vlc[dc_c_table], 0, + residual_eob_run = unpack_vlcs(s, gb, &coeff_vlc[dc_c_table], 0, 1, residual_eob_run); if (residual_eob_run < 0) return residual_eob_run; - residual_eob_run = unpack_vlcs(s, gb, &s->coeff_vlc[dc_c_table], 0, + residual_eob_run = unpack_vlcs(s, gb, &coeff_vlc[dc_c_table], 0, 2, residual_eob_run); if (residual_eob_run < 0) return residual_eob_run; @@ -1350,23 +1359,23 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb) /* build tables of AC VLC tables */ for (int i = 1; i <= 5; i++) { /* AC 
VLC table group 1 */ - y_tables[i] = &s->coeff_vlc[ac_y_table + 16]; - c_tables[i] = &s->coeff_vlc[ac_c_table + 16]; + y_tables[i] = &coeff_vlc[ac_y_table + 16]; + c_tables[i] = &coeff_vlc[ac_c_table + 16]; } for (int i = 6; i <= 14; i++) { /* AC VLC table group 2 */ - y_tables[i] = &s->coeff_vlc[ac_y_table + 32]; - c_tables[i] = &s->coeff_vlc[ac_c_table + 32]; + y_tables[i] = &coeff_vlc[ac_y_table + 32]; + c_tables[i] = &coeff_vlc[ac_c_table + 32]; } for (int i = 15; i <= 27; i++) { /* AC VLC table group 3 */ - y_tables[i] = &s->coeff_vlc[ac_y_table + 48]; - c_tables[i] = &s->coeff_vlc[ac_c_table + 48]; + y_tables[i] = &coeff_vlc[ac_y_table + 48]; + c_tables[i] = &coeff_vlc[ac_c_table + 48]; } for (int i = 28; i <= 63; i++) { /* AC VLC table group 4 */ - y_tables[i] = &s->coeff_vlc[ac_y_table + 64]; - c_tables[i] = &s->coeff_vlc[ac_c_table + 64]; + y_tables[i] = &coeff_vlc[ac_y_table + 64]; + c_tables[i] = &coeff_vlc[ac_c_table + 64]; } /* decode all AC coefficients */ @@ -1517,6 +1526,7 @@ static void vp4_set_tokens_base(Vp3DecodeContext *s) static int vp4_unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb) { + const VLC *coeff_vlc = s->coeff_vlc->vlcs; int dc_y_table; int dc_c_table; int ac_y_table; @@ -1539,27 +1549,27 @@ static int vp4_unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb) /* build tables of DC/AC VLC tables */ /* DC table group */ - tables[0][0] = &s->coeff_vlc[dc_y_table]; - tables[1][0] = &s->coeff_vlc[dc_c_table]; + tables[0][0] = &coeff_vlc[dc_y_table]; + tables[1][0] = &coeff_vlc[dc_c_table]; for (int i = 1; i <= 5; i++) { /* AC VLC table group 1 */ - tables[0][i] = &s->coeff_vlc[ac_y_table + 16]; - tables[1][i] = &s->coeff_vlc[ac_c_table + 16]; + tables[0][i] = &coeff_vlc[ac_y_table + 16]; + tables[1][i] = &coeff_vlc[ac_c_table + 16]; } for (int i = 6; i <= 14; i++) { /* AC VLC table group 2 */ - tables[0][i] = &s->coeff_vlc[ac_y_table + 32]; - tables[1][i] = &s->coeff_vlc[ac_c_table + 32]; + tables[0][i] = 
&coeff_vlc[ac_y_table + 32]; + tables[1][i] = &coeff_vlc[ac_c_table + 32]; } for (int i = 15; i <= 27; i++) { /* AC VLC table group 3 */ - tables[0][i] = &s->coeff_vlc[ac_y_table + 48]; - tables[1][i] = &s->coeff_vlc[ac_c_table + 48]; + tables[0][i] = &coeff_vlc[ac_y_table + 48]; + tables[1][i] = &coeff_vlc[ac_c_table + 48]; } for (int i = 28; i <= 63; i++) { /* AC VLC table group 4 */ - tables[0][i] = &s->coeff_vlc[ac_y_table + 64]; - tables[1][i] = &s->coeff_vlc[ac_c_table + 64]; + tables[0][i] = &coeff_vlc[ac_y_table + 64]; + tables[1][i] = &coeff_vlc[ac_c_table + 64]; } vp4_set_tokens_base(s); @@ -2355,6 +2365,14 @@ static av_cold int init_frames(Vp3DecodeContext *s) return 0; } +static av_cold void free_vlc_tables(FFRefStructOpaque unused, void *obj) +{ + CoeffVLCs *vlcs = obj; + + for (int i = 0; i < FF_ARRAY_ELEMS(vlcs->vlcs); i++) + ff_vlc_free(&vlcs->vlcs[i]); +} + static av_cold int vp3_decode_init(AVCodecContext *avctx) { static AVOnce init_static_once = AV_ONCE_INIT; @@ -2443,8 +2461,6 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx) s->fragment_start[2] = y_fragment_count + c_fragment_count; if (!s->theora_tables) { - const uint8_t (*bias_tabs)[32][2]; - for (int i = 0; i < 64; i++) { s->coded_dc_scale_factor[0][i] = s->version < 2 ? vp31_dc_scale_factor[i] : vp4_y_dc_scale_factor[i]; s->coded_dc_scale_factor[1][i] = s->version < 2 ? vp31_dc_scale_factor[i] : vp4_uv_dc_scale_factor[i]; @@ -2463,11 +2479,23 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx) s->qr_base[inter][plane][1] = 2 * inter + (!!plane) * !inter; } } + } + + if (!avctx->internal->is_copy) { + CoeffVLCs *vlcs = ff_refstruct_alloc_ext(sizeof(*s->coeff_vlc), 0, + NULL, free_vlc_tables); + if (!vlcs) + return AVERROR(ENOMEM); + + s->coeff_vlc = vlcs; + + if (!s->theora_tables) { + const uint8_t (*bias_tabs)[32][2]; /* init VLC tables */ bias_tabs = CONFIG_VP4_DECODER && s->version >= 2 ? 
vp4_bias : vp3_bias; - for (int i = 0; i < FF_ARRAY_ELEMS(s->coeff_vlc); i++) { - ret = ff_vlc_init_from_lengths(&s->coeff_vlc[i], 11, 32, + for (int i = 0; i < FF_ARRAY_ELEMS(vlcs->vlcs); i++) { + ret = ff_vlc_init_from_lengths(&vlcs->vlcs[i], 11, 32, &bias_tabs[i][0][1], 2, &bias_tabs[i][0][0], 2, 1, 0, 0, avctx); @@ -2475,10 +2503,10 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx) return ret; } } else { - for (int i = 0; i < FF_ARRAY_ELEMS(s->coeff_vlc); i++) { + for (int i = 0; i < FF_ARRAY_ELEMS(vlcs->vlcs); i++) { const HuffTable *tab = &s->huffman_table[i]; - ret = ff_vlc_init_from_lengths(&s->coeff_vlc[i], 11, tab->nb_entries, + ret = ff_vlc_init_from_lengths(&vlcs->vlcs[i], 11, tab->nb_entries, &tab->entries[0].len, sizeof(*tab->entries), &tab->entries[0].sym, sizeof(*tab->entries), 1, 0, 0, avctx); @@ -2486,6 +2514,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx) return ret; } } + } ff_thread_once(&init_static_once, init_tables_once); @@ -2534,6 +2563,8 @@ static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext * const Vp3DecodeContext *s1 = src->priv_data; int qps_changed = 0, err; + ff_refstruct_replace(&s->coeff_vlc, s1->coeff_vlc); + if (!s1->current_frame.f->data[0] || s->width != s1->width || s->height != s1->height) { if (s != s1) |