diff options
author | Lynne <dev@lynne.ee> | 2022-08-16 01:11:40 +0200 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-08-16 01:22:38 +0200 |
commit | ae66a9db7bc19f00daaad96b3c15cbffe6280a93 (patch) | |
tree | 1c044b8f6d388b624308d8f1be559dfc9ded6708 /libavutil/tx_template.c | |
parent | 412922cc6fa790897ef6bb2be5d6f9a5f030754d (diff) | |
download | ffmpeg-ae66a9db7bc19f00daaad96b3c15cbffe6280a93.tar.gz |
lavu/tx: optimize and simplify inverse MDCTs
Convert the input from a scatter to a gather instead,
which is faster and better for SIMD.
Also, add a pre-shuffled exptab version to avoid
gathering there at all. This doubles the exptab size,
but the speedup makes it worth it. In SIMD, the
exptab will likely be purged to a higher cache
anyway because of the FFT in the middle, and
the amount of loads stays identical.
For a 960-point inverse MDCT, the speedup is 10%.
This makes it possible to write sane and fast SIMD
versions of inverse MDCTs.
Diffstat (limited to 'libavutil/tx_template.c')
-rw-r--r-- | libavutil/tx_template.c | 50 |
1 files changed, 37 insertions, 13 deletions
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 1e4354580b..35b61fa477 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -948,7 +948,7 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s, const void *scale) { int ret; - FFTXCodeletOptions sub_opts = { .invert_lookup = 0 }; + FFTXCodeletOptions sub_opts = { .invert_lookup = inv }; s->scale_d = *((SCALE_TYPE *)scale); s->scale_f = s->scale_d; @@ -961,9 +961,14 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s, inv, scale))) return ret; - if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s))) + if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->sub->map : NULL))) return ret; + /* Saves a multiply in a hot path. */ + if (inv) + for (int i = 0; i < (s->len >> 1); i++) + s->sub->map[i] <<= 1; + return 0; } @@ -1020,12 +1025,14 @@ static void TX_NAME(ff_tx_mdct_sr_inv)(AVTXContext *s, void *_dst, void *_src, in2 = src + ((len2*2) - 1) * stride; for (int i = 0; i < len2; i++) { - TXComplex tmp = { in2[-2*i*stride], in1[2*i*stride] }; - CMUL3(z[sub_map[i]], tmp, exp[i]); + int k = sub_map[i]; + TXComplex tmp = { in2[-k*stride], in1[k*stride] }; + CMUL3(z[i], tmp, exp[i]); } s->fn[0](&s->sub[0], z, z, sizeof(TXComplex)); + exp += len2; for (int i = 0; i < len4; i++) { const int i0 = len4 + i, i1 = len4 - i - 1; TXComplex src1 = { z[i1].im, z[i1].re }; @@ -1141,9 +1148,13 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s, if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len))) return ret; - if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s))) + if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL))) return ret; + /* Saves multiplies in loops. */ + for (int i = 0; i < len; i++) + s->map[i] <<= 1; + if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) return AVERROR(ENOMEM); @@ -1160,6 +1171,7 @@ static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \ TXComplex *z = _dst, *exp = s->exp; \ const TXSample *src = _src, *in1, *in2; \ const int len4 = s->len >> 2; \ + const int len2 = s->len >> 1; \ const int m = s->sub->len; \ const int *in_map = s->map, *out_map = in_map + N*m; \ const int *sub_map = s->sub->map; \ @@ -1168,13 +1180,15 @@ static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \ in1 = src; \ in2 = src + ((N*m*2) - 1) * stride; \ \ - for (int i = 0; i < m; i++) { \ + for (int i = 0; i < len2; i += N) { \ for (int j = 0; j < N; j++) { \ - const int k = in_map[i*N + j]; \ + const int k = in_map[j]; \ TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \ - CMUL3(fft##N##in[j], tmp, exp[k >> 1]); \ + CMUL3(fft##N##in[j], tmp, exp[j]); \ } \ - fft##N(s->tmp + sub_map[i], fft##N##in, m); \ + fft##N(s->tmp + *(sub_map++), fft##N##in, m); \ + exp += N; \ + in_map += N; \ } \ \ for (int i = 0; i < N; i++) \ @@ -1405,22 +1419,32 @@ static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = { .prio = FF_TX_PRIO_BASE, }; -int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s) +int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab) { + int off = 0; int len4 = s->len >> 1; double scale = s->scale_d; const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0; + size_t alloc = pre_tab ? 2*len4 : len4; - if (!(s->exp = av_malloc_array(len4, sizeof(*s->exp)))) + if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp)))) return AVERROR(ENOMEM); scale = sqrt(fabs(scale)); + + if (pre_tab) + off = len4; + for (int i = 0; i < len4; i++) { const double alpha = M_PI_2 * (i + theta) / len4; - s->exp[i].re = RESCALE(cos(alpha) * scale); - s->exp[i].im = RESCALE(sin(alpha) * scale); + s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale), + RESCALE(sin(alpha) * scale) }; } + if (pre_tab) + for (int i = 0; i < len4; i++) + s->exp[i] = s->exp[len4 + pre_tab[i]]; + return 0; } |