diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2014-02-16 16:36:51 -0500 |
---|---|---|
committer | Clément Bœsch <u@pkh.me> | 2014-02-17 13:39:00 +0100 |
commit | 21a0451167bb40035688859302a10d337e2bae1b (patch) | |
tree | 6495ca8e3f4ce23449ccfeb36d554c25920f0602 /libavcodec | |
parent | fdb093c4e43c2145f344c41bbe9a3d1e613c5709 (diff) | |
download | ffmpeg-21a0451167bb40035688859302a10d337e2bae1b.tar.gz |
vp9: split decode_coeffs_b loop inside txsz branch.
The advantage of this is that the is_tx32x32 division branch in
decode_coeffs_b is moved out of the inner loop, to outside the block
coef decoding loop in decode_coeffs. It also allows us to merge the
txsz branches of the block coef decoding loop, the context merge
and the context split.
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/vp9.c | 155 |
1 files changed, 99 insertions, 56 deletions
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c index 341bccd0da..cbc885b77d 100644 --- a/libavcodec/vp9.c +++ b/libavcodec/vp9.c @@ -2011,12 +2011,13 @@ static void decode_mode(AVCodecContext *ctx) } } -// FIXME remove tx argument, and merge cnt/eob arguments? -static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs, - enum TxfmMode tx, unsigned (*cnt)[6][3], - unsigned (*eob)[6][2], uint8_t (*p)[6][11], - int nnz, const int16_t *scan, const int16_t (*nb)[2], - const int16_t *band_counts, const int16_t *qmul) +// FIXME merge cnt/eob arguments? +static av_always_inline int +decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs, + int is_tx32x32, unsigned (*cnt)[6][3], + unsigned (*eob)[6][2], uint8_t (*p)[6][11], + int nnz, const int16_t *scan, const int16_t (*nb)[2], + const int16_t *band_counts, const int16_t *qmul) { int i = 0, band = 0, band_left = band_counts[band]; uint8_t *tp = p[0][nnz]; @@ -2108,7 +2109,7 @@ static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs, } if (!--band_left) band_left = band_counts[++band]; - if (tx == TX_32X32) // FIXME slow + if (is_tx32x32) coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2; else coef[rc] = (vp8_rac_get(c) ? 
-val : val) * qmul[!!i]; @@ -2119,6 +2120,26 @@ static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs, return i; } +static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs, + unsigned (*cnt)[6][3], unsigned (*eob)[6][2], + uint8_t (*p)[6][11], int nnz, const int16_t *scan, + const int16_t (*nb)[2], const int16_t *band_counts, + const int16_t *qmul) +{ + return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p, + nnz, scan, nb, band_counts, qmul); +} + +static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs, + unsigned (*cnt)[6][3], unsigned (*eob)[6][2], + uint8_t (*p)[6][11], int nnz, const int16_t *scan, + const int16_t (*nb)[2], const int16_t *band_counts, + const int16_t *qmul) +{ + return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p, + nnz, scan, nb, band_counts, qmul); +} + static void decode_coeffs(AVCodecContext *ctx) { VP9Context *s = ctx->priv_data; @@ -2130,8 +2151,7 @@ static void decode_coeffs(AVCodecContext *ctx) int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1; int end_x = FFMIN(2 * (s->cols - col), w4); int end_y = FFMIN(2 * (s->rows - row), h4); - int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2); - int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), res; + int n, pl, x, y, res; int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul; int tx = 4 * s->lossless + b->tx; const int16_t * const *yscans = vp9_scans[tx]; @@ -2158,29 +2178,22 @@ static void decode_coeffs(AVCodecContext *ctx) MERGE(a, end_x, step, rd); \ } while (0) - /* y tokens */ - switch (b->tx) { - case TX_8X8: MERGE_CTX(2, AV_RN16A); break; - case TX_16X16: MERGE_CTX(4, AV_RN32A); break; - case TX_32X32: MERGE_CTX(8, AV_RN64A); break; - } - for (n = 0, y = 0; y < end_y; y += step1d) { - for (x = 0; x < end_x; x += step1d, n += step) { - enum TxfmType txtp = vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 && - b->bs > BS_8x8 ? 
- n : 0]]; - int nnz = a[x] + l[y]; - res = decode_coeffs_b(&s->c, s->block + 16 * n, 16 * step, - b->tx, c, e, p, nnz, yscans[txtp], - ynbs[txtp], y_band_counts, qmul[0]); - a[x] = l[y] = !!res; - if (b->tx > TX_8X8) { - AV_WN16A(&s->eob[n], res); - } else { - s->eob[n] = res; - } - } +#define DECODE_Y_COEF_LOOP(step, mode_index, v) \ + for (n = 0, y = 0; y < end_y; y += step) { \ + for (x = 0; x < end_x; x += step, n += step * step) { \ + enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \ + res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \ + c, e, p, a[x] + l[y], yscans[txtp], \ + ynbs[txtp], y_band_counts, qmul[0]); \ + a[x] = l[y] = !!res; \ + if (step >= 4) { \ + AV_WN16A(&s->eob[n], res); \ + } else { \ + s->eob[n] = res; \ + } \ + } \ } + #define SPLAT(la, end, step, cond) \ if (step == 2) { \ for (n = 1; n < end; n += step) \ @@ -2215,10 +2228,38 @@ static void decode_coeffs(AVCodecContext *ctx) SPLAT(a, end_x, step, end_x == w4); \ SPLAT(l, end_y, step, end_y == h4); \ } while (0) + + /* y tokens */ switch (b->tx) { - case TX_8X8: SPLAT_CTX(2); break; - case TX_16X16: SPLAT_CTX(4); break; - case TX_32X32: SPLAT_CTX(8); break; + case TX_4X4: + DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? 
n : 0,); + break; + case TX_8X8: + MERGE_CTX(2, AV_RN16A); + DECODE_Y_COEF_LOOP(2, 0,); + SPLAT_CTX(2); + break; + case TX_16X16: + MERGE_CTX(4, AV_RN32A); + DECODE_Y_COEF_LOOP(4, 0,); + SPLAT_CTX(4); + break; + case TX_32X32: + MERGE_CTX(8, AV_RN64A); + DECODE_Y_COEF_LOOP(8, 0, 32); + SPLAT_CTX(8); + break; + } + +#define DECODE_UV_COEF_LOOP(step) \ + for (n = 0, y = 0; y < end_y; y += step) { \ + for (x = 0; x < end_x; x += step, n += step * step) { \ + res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \ + 16 * step * step, c, e, p, a[x] + l[y], \ + uvscan, uvnb, uv_band_counts, qmul[1]); \ + a[x] = l[y] = !!res; \ + s->uveob[pl][n] = res; \ + } \ } p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra]; @@ -2232,28 +2273,30 @@ static void decode_coeffs(AVCodecContext *ctx) a = &s->above_uv_nnz_ctx[pl][col]; l = &s->left_uv_nnz_ctx[pl][row & 7]; switch (b->uvtx) { - case TX_8X8: MERGE_CTX(2, AV_RN16A); break; - case TX_16X16: MERGE_CTX(4, AV_RN32A); break; - case TX_32X32: MERGE_CTX(8, AV_RN64A); break; - } - for (n = 0, y = 0; y < end_y; y += uvstep1d) { - for (x = 0; x < end_x; x += uvstep1d, n += uvstep) { - int nnz = a[x] + l[y]; - res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, - 16 * uvstep, b->uvtx, c, e, p, nnz, - uvscan, uvnb, uv_band_counts, qmul[1]); - a[x] = l[y] = !!res; - if (b->uvtx > TX_8X8) { - AV_WN16A(&s->uveob[pl][n], res); - } else { - s->uveob[pl][n] = res; - } - } - } - switch (b->uvtx) { - case TX_8X8: SPLAT_CTX(2); break; - case TX_16X16: SPLAT_CTX(4); break; - case TX_32X32: SPLAT_CTX(8); break; + case TX_4X4: + DECODE_UV_COEF_LOOP(1); + break; + case TX_8X8: + MERGE_CTX(2, AV_RN16A); + DECODE_UV_COEF_LOOP(2); + SPLAT_CTX(2); + break; + case TX_16X16: + MERGE_CTX(4, AV_RN32A); + DECODE_UV_COEF_LOOP(4); + SPLAT_CTX(4); + break; + case TX_32X32: + MERGE_CTX(8, AV_RN64A); + // a 64x64 (max) uv block can ever only contain 1 tx32x32 block + // so there is no need to loop + res = decode_coeffs_b32(&s->c, s->uvblock[pl], + 1024, c, e, p, 
a[0] + l[0], + uvscan, uvnb, uv_band_counts, qmul[1]); + a[0] = l[0] = !!res; + AV_WN16A(&s->uveob[pl][0], res); + SPLAT_CTX(8); + break; } } } |