diff options
author | Lynne <dev@lynne.ee> | 2020-02-08 23:13:28 +0000 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2020-02-13 17:10:34 +0000 |
commit | e8f054b095baa194623b3852f06fc507ae697503 (patch) | |
tree | 32fc6a24fb5747e5a11ee74fb8aaf6d62a517882 /libavutil/tx_template.c | |
parent | e007059d6617ca966380810fe03c571566ecd9c3 (diff) | |
download | ffmpeg-e8f054b095baa194623b3852f06fc507ae697503.tar.gz |
lavu/tx: implement 32 bit fixed point FFT and MDCT
Required minimal changes to the code so made sense to implement.
FFT and MDCT tested, the output of both was properly rounded.
Fun fact: the non-power-of-two fixed-point FFT and MDCT are the fastest ever
non-power-of-two fixed-point FFT and MDCT written.
This can replace the power of two integer MDCTs in aac and ac3 if the
MIPS optimizations are ported across.
Unfortunately the ac3 encoder uses a 16-bit fixed point forward transform,
unlike the encoder which uses a 32bit inverse transform, so some modifications
might be required there.
The 3-point FFT is somewhat less accurate than it otherwise could be,
having minor rounding errors with bigger transforms. However, this
could be improved later, and the way its currently written is the way one
would write assembly for it.
Similar rounding errors can also be found throughout the power of two FFTs
as well, though those are more difficult to correct.
Despite this, the integer transforms are more than accurate enough.
Diffstat (limited to 'libavutil/tx_template.c')
-rw-r--r-- | libavutil/tx_template.c | 139 |
1 files changed, 57 insertions, 82 deletions
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 9196ee383d..d33c9ce351 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -66,7 +66,7 @@ static av_always_inline void init_cos_tabs_idx(int index) double freq = 2*M_PI/m; FFTSample *tab = cos_tabs[index]; for(int i = 0; i <= m/4; i++) - tab[i] = cos(i*freq); + tab[i] = RESCALE(cos(i*freq)); for(int i = 1; i < m/4; i++) tab[m/2 - i] = tab[i]; } @@ -94,10 +94,10 @@ INIT_FF_COS_TABS_FUNC(17, 131072) static av_cold void ff_init_53_tabs(void) { - TX_NAME(ff_cos_53)[0] = (FFTComplex){ cos(2 * M_PI / 12), cos(2 * M_PI / 12) }; - TX_NAME(ff_cos_53)[1] = (FFTComplex){ 0.5, 0.5 }; - TX_NAME(ff_cos_53)[2] = (FFTComplex){ cos(2 * M_PI / 5), sin(2 * M_PI / 5) }; - TX_NAME(ff_cos_53)[3] = (FFTComplex){ cos(2 * M_PI / 10), sin(2 * M_PI / 10) }; + TX_NAME(ff_cos_53)[0] = (FFTComplex){ RESCALE(cos(2 * M_PI / 12)), RESCALE(cos(2 * M_PI / 12)) }; + TX_NAME(ff_cos_53)[1] = (FFTComplex){ RESCALE(cos(2 * M_PI / 6)), RESCALE(cos(2 * M_PI / 6)) }; + TX_NAME(ff_cos_53)[2] = (FFTComplex){ RESCALE(cos(2 * M_PI / 5)), RESCALE(sin(2 * M_PI / 5)) }; + TX_NAME(ff_cos_53)[3] = (FFTComplex){ RESCALE(cos(2 * M_PI / 10)), RESCALE(sin(2 * M_PI / 10)) }; } static CosTabsInitOnce cos_tabs_init_once[] = { @@ -132,18 +132,16 @@ static av_always_inline void fft3(FFTComplex *out, FFTComplex *in, { FFTComplex tmp[2]; - tmp[0].re = in[1].im - in[2].im; - tmp[0].im = in[1].re - in[2].re; - tmp[1].re = in[1].re + in[2].re; - tmp[1].im = in[1].im + in[2].im; + BF(tmp[0].re, tmp[1].im, in[1].im, in[2].im); + BF(tmp[0].im, tmp[1].re, in[1].re, in[2].re); out[0*stride].re = in[0].re + tmp[1].re; out[0*stride].im = in[0].im + tmp[1].im; - tmp[0].re *= TX_NAME(ff_cos_53)[0].re; - tmp[0].im *= TX_NAME(ff_cos_53)[0].im; - tmp[1].re *= TX_NAME(ff_cos_53)[1].re; - tmp[1].im *= TX_NAME(ff_cos_53)[1].re; + tmp[0].re = MUL(TX_NAME(ff_cos_53)[0].re, tmp[0].re); + tmp[0].im = MUL(TX_NAME(ff_cos_53)[0].im, tmp[0].im); + tmp[1].re = MUL(TX_NAME(ff_cos_53)[1].re, tmp[1].re); + tmp[1].im = MUL(TX_NAME(ff_cos_53)[1].re, tmp[1].im); out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re; out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im; @@ -151,61 +149,38 @@ static av_always_inline void fft3(FFTComplex *out, FFTComplex *in, out[2*stride].im = in[0].im - tmp[1].im + tmp[0].im; } -#define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \ -static av_always_inline void NAME(FFTComplex *out, FFTComplex *in, \ - ptrdiff_t stride) \ -{ \ - FFTComplex z0[4], t[6]; \ - \ - t[0].re = in[1].re + in[4].re; \ - t[0].im = in[1].im + in[4].im; \ - t[1].im = in[1].re - in[4].re; \ - t[1].re = in[1].im - in[4].im; \ - t[2].re = in[2].re + in[3].re; \ - t[2].im = in[2].im + in[3].im; \ - t[3].im = in[2].re - in[3].re; \ - t[3].re = in[2].im - in[3].im; \ - \ - out[D0*stride].re = in[0].re + in[1].re + in[2].re + \ - in[3].re + in[4].re; \ - out[D0*stride].im = in[0].im + in[1].im + in[2].im + \ - in[3].im + in[4].im; \ - \ - t[4].re = TX_NAME(ff_cos_53)[2].re * t[2].re; \ - t[4].im = TX_NAME(ff_cos_53)[2].re * t[2].im; \ - t[4].re -= TX_NAME(ff_cos_53)[3].re * t[0].re; \ - t[4].im -= TX_NAME(ff_cos_53)[3].re * t[0].im; \ - t[0].re = TX_NAME(ff_cos_53)[2].re * t[0].re; \ - t[0].im = TX_NAME(ff_cos_53)[2].re * t[0].im; \ - t[0].re -= TX_NAME(ff_cos_53)[3].re * t[2].re; \ - t[0].im -= TX_NAME(ff_cos_53)[3].re * t[2].im; \ - t[5].re = TX_NAME(ff_cos_53)[2].im * t[3].re; \ - t[5].im = TX_NAME(ff_cos_53)[2].im * t[3].im; \ - t[5].re -= TX_NAME(ff_cos_53)[3].im * t[1].re; \ - t[5].im -= TX_NAME(ff_cos_53)[3].im * t[1].im; \ - t[1].re = TX_NAME(ff_cos_53)[2].im * t[1].re; \ - t[1].im = TX_NAME(ff_cos_53)[2].im * t[1].im; \ - t[1].re += TX_NAME(ff_cos_53)[3].im * t[3].re; \ - t[1].im += TX_NAME(ff_cos_53)[3].im * t[3].im; \ - \ - z0[0].re = t[0].re - t[1].re; \ - z0[0].im = t[0].im - t[1].im; \ - z0[1].re = t[4].re + t[5].re; \ - z0[1].im = t[4].im + t[5].im; \ - \ - z0[2].re = t[4].re - t[5].re; \ - z0[2].im = t[4].im - t[5].im; \ - z0[3].re = t[0].re + t[1].re; \ - z0[3].im = t[0].im + t[1].im; \ - \ - out[D1*stride].re = in[0].re + z0[3].re; \ - out[D1*stride].im = in[0].im + z0[0].im; \ - out[D2*stride].re = in[0].re + z0[2].re; \ - out[D2*stride].im = in[0].im + z0[1].im; \ - out[D3*stride].re = in[0].re + z0[1].re; \ - out[D3*stride].im = in[0].im + z0[2].im; \ - out[D4*stride].re = in[0].re + z0[0].re; \ - out[D4*stride].im = in[0].im + z0[3].im; \ +#define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \ +static av_always_inline void NAME(FFTComplex *out, FFTComplex *in, \ + ptrdiff_t stride) \ +{ \ + FFTComplex z0[4], t[6]; \ + \ + BF(t[1].im, t[0].re, in[1].re, in[4].re); \ + BF(t[1].re, t[0].im, in[1].im, in[4].im); \ + BF(t[3].im, t[2].re, in[2].re, in[3].re); \ + BF(t[3].re, t[2].im, in[2].im, in[3].im); \ + \ + out[D0*stride].re = in[0].re + in[1].re + in[2].re + in[3].re + in[4].re; \ + out[D0*stride].im = in[0].im + in[1].im + in[2].im + in[3].im + in[4].im; \ + \ + SMUL(t[4].re, t[0].re, TX_NAME(ff_cos_53)[2].re, TX_NAME(ff_cos_53)[3].re, t[2].re, t[0].re); \ + SMUL(t[4].im, t[0].im, TX_NAME(ff_cos_53)[2].re, TX_NAME(ff_cos_53)[3].re, t[2].im, t[0].im); \ + CMUL(t[5].re, t[1].re, TX_NAME(ff_cos_53)[2].im, TX_NAME(ff_cos_53)[3].im, t[3].re, t[1].re); \ + CMUL(t[5].im, t[1].im, TX_NAME(ff_cos_53)[2].im, TX_NAME(ff_cos_53)[3].im, t[3].im, t[1].im); \ + \ + BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \ + BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \ + BF(z0[2].re, z0[1].re, t[4].re, t[5].re); \ + BF(z0[2].im, z0[1].im, t[4].im, t[5].im); \ + \ + out[D1*stride].re = in[0].re + z0[3].re; \ + out[D1*stride].im = in[0].im + z0[0].im; \ + out[D2*stride].re = in[0].re + z0[2].re; \ + out[D2*stride].im = in[0].im + z0[1].im; \ + out[D3*stride].re = in[0].re + z0[1].re; \ + out[D3*stride].im = in[0].im + z0[2].im; \ + out[D4*stride].re = in[0].re + z0[0].re; \ + out[D4*stride].im = in[0].im + z0[3].im; \ } DECL_FFT5(fft5, 0, 1, 2, 3, 4) @@ -324,7 +299,7 @@ static void fft8(FFTComplex *z) BF(t6, z[7].im, z[6].im, -z[7].im); BUTTERFLIES(z[0],z[2],z[4],z[6]); - TRANSFORM(z[1],z[3],z[5],z[7],M_SQRT1_2,M_SQRT1_2); + TRANSFORM(z[1],z[3],z[5],z[7],RESCALE(M_SQRT1_2),RESCALE(M_SQRT1_2)); } static void fft16(FFTComplex *z) @@ -338,7 +313,7 @@ static void fft16(FFTComplex *z) fft4(z+12); TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); - TRANSFORM(z[2],z[6],z[10],z[14],M_SQRT1_2,M_SQRT1_2); + TRANSFORM(z[2],z[6],z[10],z[14],RESCALE(M_SQRT1_2),RESCALE(M_SQRT1_2)); TRANSFORM(z[1],z[5],z[9],z[13],cos_16_1,cos_16_3); TRANSFORM(z[3],z[7],z[11],z[15],cos_16_3,cos_16_1); } @@ -459,11 +434,11 @@ static void compound_mdct_##N##xM(AVTXContext *s, void *_dst, void *_src, \ for (int j = 0; j < N; j++) { \ const int k = in_map[i*N + j]; \ if (k < len4) { \ - tmp.re = -src[ len4 + k] + src[1*len4 - 1 - k]; \ - tmp.im = -src[ len3 + k] - src[1*len3 - 1 - k]; \ + tmp.re = FOLD(-src[ len4 + k], src[1*len4 - 1 - k]); \ + tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); \ } else { \ - tmp.re = -src[ len4 + k] - src[5*len4 - 1 - k]; \ - tmp.im = src[-len4 + k] - src[1*len3 - 1 - k]; \ + tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); \ + tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); \ } \ CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im, \ exp[k >> 1].re, exp[k >> 1].im); \ @@ -533,11 +508,11 @@ static void monolithic_mdct(AVTXContext *s, void *_dst, void *_src, for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */ const int k = 2*i; if (k < len4) { - tmp.re = -src[ len4 + k] + src[1*len4 - 1 - k]; - tmp.im = -src[ len3 + k] - src[1*len3 - 1 - k]; + tmp.re = FOLD(-src[ len4 + k], src[1*len4 - 1 - k]); + tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); } else { - tmp.re = -src[ len4 + k] - src[5*len4 - 1 - k]; - tmp.im = src[-len4 + k] - src[1*len3 - 1 - k]; + tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); + tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); } CMUL(z[s->revtab[i]].im, z[s->revtab[i]].re, tmp.re, tmp.im, exp[i].re, exp[i].im); @@ -567,8 +542,8 @@ static int gen_mdct_exptab(AVTXContext *s, int len4, double scale) scale = sqrt(fabs(scale)); for (int i = 0; i < len4; i++) { const double alpha = M_PI_2 * (i + theta) / len4; - s->exptab[i].re = cos(alpha) * scale; - s->exptab[i].im = sin(alpha) * scale; + s->exptab[i].re = RESCALE(cos(alpha) * scale); + s->exptab[i].im = RESCALE(sin(alpha) * scale); } return 0; @@ -578,7 +553,7 @@ int TX_NAME(ff_tx_init_mdct_fft)(AVTXContext *s, av_tx_fn *tx, enum AVTXType type, int inv, int len, const void *scale, uint64_t flags) { - const int is_mdct = type == AV_TX_FLOAT_MDCT || type == AV_TX_DOUBLE_MDCT; + const int is_mdct = ff_tx_type_is_mdct(type); int err, n = 1, m = 1, max_ptwo = 1 << (FF_ARRAY_ELEMS(fft_dispatch) + 1); if (is_mdct) @@ -637,7 +612,7 @@ int TX_NAME(ff_tx_init_mdct_fft)(AVTXContext *s, av_tx_fn *tx, } if (is_mdct) - return gen_mdct_exptab(s, n*m, *((FFTSample *)scale)); + return gen_mdct_exptab(s, n*m, *((SCALE_TYPE *)scale)); return 0; } |