lavu/tx: implement 32 bit fixed point FFT and MDCT

Required minimal changes to the code so made sense to implement. FFT and MDCT tested, the output of both was properly rounded. Fun fact: the non-power-of-two fixed-point FFT and MDCT are the fastest ever non-power-of-two fixed-point FFT and MDCT written. This can replace the power of two integer MDCTs in aac and ac3 if the MIPS optimizations are ported across. Unfortunately the ac3 encoder uses a 16-bit fixed point forward transform, unlike the encoder which uses a 32bit inverse transform, so some modifications might be required there. The 3-point FFT is somewhat less accurate than it otherwise could be, having minor rounding errors with bigger transforms. However, this could be improved later, and the way its currently written is the way one would write assembly for it. Similar rounding errors can also be found throughout the power of two FFTs as well, though those are more difficult to correct. Despite this, the integer transforms are more than accurate enough.
author: Lynne <dev@lynne.ee> 2020-02-08 23:13:28 +0000
committer: Lynne <dev@lynne.ee> 2020-02-13 17:10:34 +0000
commit: e8f054b095baa194623b3852f06fc507ae697503 (patch)
tree: 32fc6a24fb5747e5a11ee74fb8aaf6d62a517882 /libavutil/tx_template.c
parent: e007059d6617ca966380810fe03c571566ecd9c3 (diff)
download: ffmpeg-e8f054b095baa194623b3852f06fc507ae697503.tar.gz
1 files changed, 57 insertions, 82 deletions
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 9196ee383d..d33c9ce351 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -66,7 +66,7 @@ static av_always_inline void init_cos_tabs_idx(int index)
     double freq = 2*M_PI/m;
     FFTSample *tab = cos_tabs[index];
     for(int i = 0; i <= m/4; i++)
-        tab[i] = cos(i*freq);
+        tab[i] = RESCALE(cos(i*freq));
     for(int i = 1; i < m/4; i++)
         tab[m/2 - i] = tab[i];
 }
@@ -94,10 +94,10 @@ INIT_FF_COS_TABS_FUNC(17, 131072)
 
 static av_cold void ff_init_53_tabs(void)
 {
-    TX_NAME(ff_cos_53)[0] = (FFTComplex){ cos(2 * M_PI / 12), cos(2 * M_PI / 12) };
-    TX_NAME(ff_cos_53)[1] = (FFTComplex){ 0.5, 0.5 };
-    TX_NAME(ff_cos_53)[2] = (FFTComplex){ cos(2 * M_PI /  5), sin(2 * M_PI /  5) };
-    TX_NAME(ff_cos_53)[3] = (FFTComplex){ cos(2 * M_PI / 10), sin(2 * M_PI / 10) };
+    TX_NAME(ff_cos_53)[0] = (FFTComplex){ RESCALE(cos(2 * M_PI / 12)), RESCALE(cos(2 * M_PI / 12)) };
+    TX_NAME(ff_cos_53)[1] = (FFTComplex){ RESCALE(cos(2 * M_PI /  6)), RESCALE(cos(2 * M_PI /  6)) };
+    TX_NAME(ff_cos_53)[2] = (FFTComplex){ RESCALE(cos(2 * M_PI /  5)), RESCALE(sin(2 * M_PI /  5)) };
+    TX_NAME(ff_cos_53)[3] = (FFTComplex){ RESCALE(cos(2 * M_PI / 10)), RESCALE(sin(2 * M_PI / 10)) };
 }
 
 static CosTabsInitOnce cos_tabs_init_once[] = {
@@ -132,18 +132,16 @@ static av_always_inline void fft3(FFTComplex *out, FFTComplex *in,
 {
     FFTComplex tmp[2];
 
-    tmp[0].re = in[1].im - in[2].im;
-    tmp[0].im = in[1].re - in[2].re;
-    tmp[1].re = in[1].re + in[2].re;
-    tmp[1].im = in[1].im + in[2].im;
+    BF(tmp[0].re, tmp[1].im, in[1].im, in[2].im);
+    BF(tmp[0].im, tmp[1].re, in[1].re, in[2].re);
 
     out[0*stride].re = in[0].re + tmp[1].re;
     out[0*stride].im = in[0].im + tmp[1].im;
 
-    tmp[0].re *= TX_NAME(ff_cos_53)[0].re;
-    tmp[0].im *= TX_NAME(ff_cos_53)[0].im;
-    tmp[1].re *= TX_NAME(ff_cos_53)[1].re;
-    tmp[1].im *= TX_NAME(ff_cos_53)[1].re;
+    tmp[0].re = MUL(TX_NAME(ff_cos_53)[0].re, tmp[0].re);
+    tmp[0].im = MUL(TX_NAME(ff_cos_53)[0].im, tmp[0].im);
+    tmp[1].re = MUL(TX_NAME(ff_cos_53)[1].re, tmp[1].re);
+    tmp[1].im = MUL(TX_NAME(ff_cos_53)[1].re, tmp[1].im);
 
     out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re;
     out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im;
@@ -151,61 +149,38 @@ static av_always_inline void fft3(FFTComplex *out, FFTComplex *in,
     out[2*stride].im = in[0].im - tmp[1].im + tmp[0].im;
 }
 
-#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)                                    \
-static av_always_inline void NAME(FFTComplex *out, FFTComplex *in,             \
-                                  ptrdiff_t stride)                            \
-{                                                                              \
-    FFTComplex z0[4], t[6];                                                    \
-                                                                               \
-    t[0].re = in[1].re + in[4].re;                                             \
-    t[0].im = in[1].im + in[4].im;                                             \
-    t[1].im = in[1].re - in[4].re;                                             \
-    t[1].re = in[1].im - in[4].im;                                             \
-    t[2].re = in[2].re + in[3].re;                                             \
-    t[2].im = in[2].im + in[3].im;                                             \
-    t[3].im = in[2].re - in[3].re;                                             \
-    t[3].re = in[2].im - in[3].im;                                             \
-                                                                               \
-    out[D0*stride].re = in[0].re + in[1].re + in[2].re +                       \
-                        in[3].re + in[4].re;                                   \
-    out[D0*stride].im = in[0].im + in[1].im + in[2].im +                       \
-                        in[3].im + in[4].im;                                   \
-                                                                               \
-    t[4].re  = TX_NAME(ff_cos_53)[2].re * t[2].re;                             \
-    t[4].im  = TX_NAME(ff_cos_53)[2].re * t[2].im;                             \
-    t[4].re -= TX_NAME(ff_cos_53)[3].re * t[0].re;                             \
-    t[4].im -= TX_NAME(ff_cos_53)[3].re * t[0].im;                             \
-    t[0].re  = TX_NAME(ff_cos_53)[2].re * t[0].re;                             \
-    t[0].im  = TX_NAME(ff_cos_53)[2].re * t[0].im;                             \
-    t[0].re -= TX_NAME(ff_cos_53)[3].re * t[2].re;                             \
-    t[0].im -= TX_NAME(ff_cos_53)[3].re * t[2].im;                             \
-    t[5].re  = TX_NAME(ff_cos_53)[2].im * t[3].re;                             \
-    t[5].im  = TX_NAME(ff_cos_53)[2].im * t[3].im;                             \
-    t[5].re -= TX_NAME(ff_cos_53)[3].im * t[1].re;                             \
-    t[5].im -= TX_NAME(ff_cos_53)[3].im * t[1].im;                             \
-    t[1].re  = TX_NAME(ff_cos_53)[2].im * t[1].re;                             \
-    t[1].im  = TX_NAME(ff_cos_53)[2].im * t[1].im;                             \
-    t[1].re += TX_NAME(ff_cos_53)[3].im * t[3].re;                             \
-    t[1].im += TX_NAME(ff_cos_53)[3].im * t[3].im;                             \
-                                                                               \
-    z0[0].re = t[0].re - t[1].re;                                              \
-    z0[0].im = t[0].im - t[1].im;                                              \
-    z0[1].re = t[4].re + t[5].re;                                              \
-    z0[1].im = t[4].im + t[5].im;                                              \
-                                                                               \
-    z0[2].re = t[4].re - t[5].re;                                              \
-    z0[2].im = t[4].im - t[5].im;                                              \
-    z0[3].re = t[0].re + t[1].re;                                              \
-    z0[3].im = t[0].im + t[1].im;                                              \
-                                                                               \
-    out[D1*stride].re = in[0].re + z0[3].re;                                   \
-    out[D1*stride].im = in[0].im + z0[0].im;                                   \
-    out[D2*stride].re = in[0].re + z0[2].re;                                   \
-    out[D2*stride].im = in[0].im + z0[1].im;                                   \
-    out[D3*stride].re = in[0].re + z0[1].re;                                   \
-    out[D3*stride].im = in[0].im + z0[2].im;                                   \
-    out[D4*stride].re = in[0].re + z0[0].re;                                   \
-    out[D4*stride].im = in[0].im + z0[3].im;                                   \
+#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)                                                       \
+static av_always_inline void NAME(FFTComplex *out, FFTComplex *in,                                \
+                                  ptrdiff_t stride)                                               \
+{                                                                                                 \
+    FFTComplex z0[4], t[6];                                                                       \
+                                                                                                  \
+    BF(t[1].im, t[0].re, in[1].re, in[4].re);                                                     \
+    BF(t[1].re, t[0].im, in[1].im, in[4].im);                                                     \
+    BF(t[3].im, t[2].re, in[2].re, in[3].re);                                                     \
+    BF(t[3].re, t[2].im, in[2].im, in[3].im);                                                     \
+                                                                                                  \
+    out[D0*stride].re = in[0].re + in[1].re + in[2].re + in[3].re + in[4].re;                     \
+    out[D0*stride].im = in[0].im + in[1].im + in[2].im + in[3].im + in[4].im;                     \
+                                                                                                  \
+    SMUL(t[4].re, t[0].re, TX_NAME(ff_cos_53)[2].re, TX_NAME(ff_cos_53)[3].re, t[2].re, t[0].re); \
+    SMUL(t[4].im, t[0].im, TX_NAME(ff_cos_53)[2].re, TX_NAME(ff_cos_53)[3].re, t[2].im, t[0].im); \
+    CMUL(t[5].re, t[1].re, TX_NAME(ff_cos_53)[2].im, TX_NAME(ff_cos_53)[3].im, t[3].re, t[1].re); \
+    CMUL(t[5].im, t[1].im, TX_NAME(ff_cos_53)[2].im, TX_NAME(ff_cos_53)[3].im, t[3].im, t[1].im); \
+                                                                                                  \
+    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                                                     \
+    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                                                     \
+    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                                                     \
+    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                                                     \
+                                                                                                  \
+    out[D1*stride].re = in[0].re + z0[3].re;                                                      \
+    out[D1*stride].im = in[0].im + z0[0].im;                                                      \
+    out[D2*stride].re = in[0].re + z0[2].re;                                                      \
+    out[D2*stride].im = in[0].im + z0[1].im;                                                      \
+    out[D3*stride].re = in[0].re + z0[1].re;                                                      \
+    out[D3*stride].im = in[0].im + z0[2].im;                                                      \
+    out[D4*stride].re = in[0].re + z0[0].re;                                                      \
+    out[D4*stride].im = in[0].im + z0[3].im;                                                      \
 }
 
 DECL_FFT5(fft5,     0,  1,  2,  3,  4)
@@ -324,7 +299,7 @@ static void fft8(FFTComplex *z)
     BF(t6, z[7].im, z[6].im, -z[7].im);
 
     BUTTERFLIES(z[0],z[2],z[4],z[6]);
-    TRANSFORM(z[1],z[3],z[5],z[7],M_SQRT1_2,M_SQRT1_2);
+    TRANSFORM(z[1],z[3],z[5],z[7],RESCALE(M_SQRT1_2),RESCALE(M_SQRT1_2));
 }
 
 static void fft16(FFTComplex *z)
@@ -338,7 +313,7 @@ static void fft16(FFTComplex *z)
     fft4(z+12);
 
     TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
-    TRANSFORM(z[2],z[6],z[10],z[14],M_SQRT1_2,M_SQRT1_2);
+    TRANSFORM(z[2],z[6],z[10],z[14],RESCALE(M_SQRT1_2),RESCALE(M_SQRT1_2));
     TRANSFORM(z[1],z[5],z[9],z[13],cos_16_1,cos_16_3);
     TRANSFORM(z[3],z[7],z[11],z[15],cos_16_3,cos_16_1);
 }
@@ -459,11 +434,11 @@ static void compound_mdct_##N##xM(AVTXContext *s, void *_dst, void *_src,      \
         for (int j = 0; j < N; j++) {                                          \
             const int k = in_map[i*N + j];                                     \
             if (k < len4) {                                                    \
-                tmp.re = -src[ len4 + k] + src[1*len4 - 1 - k];                \
-                tmp.im = -src[ len3 + k] - src[1*len3 - 1 - k];                \
+                tmp.re = FOLD(-src[ len4 + k],  src[1*len4 - 1 - k]);          \
+                tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);          \
             } else {                                                           \
-                tmp.re = -src[ len4 + k] - src[5*len4 - 1 - k];                \
-                tmp.im =  src[-len4 + k] - src[1*len3 - 1 - k];                \
+                tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]);          \
+                tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]);          \
             }                                                                  \
             CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im,           \
                  exp[k >> 1].re, exp[k >> 1].im);                              \
@@ -533,11 +508,11 @@ static void monolithic_mdct(AVTXContext *s, void *_dst, void *_src,
     for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */
         const int k = 2*i;
         if (k < len4) {
-            tmp.re = -src[ len4 + k] + src[1*len4 - 1 - k];
-            tmp.im = -src[ len3 + k] - src[1*len3 - 1 - k];
+            tmp.re = FOLD(-src[ len4 + k],  src[1*len4 - 1 - k]);
+            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
         } else {
-            tmp.re = -src[ len4 + k] - src[5*len4 - 1 - k];
-            tmp.im =  src[-len4 + k] - src[1*len3 - 1 - k];
+            tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]);
+            tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]);
         }
         CMUL(z[s->revtab[i]].im, z[s->revtab[i]].re, tmp.re, tmp.im,
              exp[i].re, exp[i].im);
@@ -567,8 +542,8 @@ static int gen_mdct_exptab(AVTXContext *s, int len4, double scale)
     scale = sqrt(fabs(scale));
     for (int i = 0; i < len4; i++) {
         const double alpha = M_PI_2 * (i + theta) / len4;
-        s->exptab[i].re = cos(alpha) * scale;
-        s->exptab[i].im = sin(alpha) * scale;
+        s->exptab[i].re = RESCALE(cos(alpha) * scale);
+        s->exptab[i].im = RESCALE(sin(alpha) * scale);
     }
 
     return 0;
@@ -578,7 +553,7 @@ int TX_NAME(ff_tx_init_mdct_fft)(AVTXContext *s, av_tx_fn *tx,
                                  enum AVTXType type, int inv, int len,
                                  const void *scale, uint64_t flags)
 {
-    const int is_mdct = type == AV_TX_FLOAT_MDCT || type == AV_TX_DOUBLE_MDCT;
+    const int is_mdct = ff_tx_type_is_mdct(type);
     int err, n = 1, m = 1, max_ptwo = 1 << (FF_ARRAY_ELEMS(fft_dispatch) + 1);
 
     if (is_mdct)
@@ -637,7 +612,7 @@ int TX_NAME(ff_tx_init_mdct_fft)(AVTXContext *s, av_tx_fn *tx,
     }
 
     if (is_mdct)
-        return gen_mdct_exptab(s, n*m, *((FFTSample *)scale));
+        return gen_mdct_exptab(s, n*m, *((SCALE_TYPE *)scale));
 
     return 0;
 }
author	Lynne <dev@lynne.ee>	2020-02-08 23:13:28 +0000
committer	Lynne <dev@lynne.ee>	2020-02-13 17:10:34 +0000
commit	e8f054b095baa194623b3852f06fc507ae697503 (patch)
tree	32fc6a24fb5747e5a11ee74fb8aaf6d62a517882 /libavutil/tx_template.c
parent	e007059d6617ca966380810fe03c571566ecd9c3 (diff)
download	ffmpeg-e8f054b095baa194623b3852f06fc507ae697503.tar.gz