libavcodec: Implementation of 32 bit fixed point FFT

Iterative implementation of 32 bit fixed point split-radix FFT. Max FFT that can be calculated currently is 2^12. Signed-off-by: Nedeljko Babic <nbabic@mips.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
author: Nedeljko Babic <nbabic@mips.com> 2013-06-03 16:11:12 +0200
committer: Michael Niedermayer <michaelni@gmx.at> 2013-08-04 14:01:41 +0200
commit: 18d7074b4e5a1112cfa6a53dde0faa25d2bd0b15 (patch)
tree: 0d23cbed3eb59d91ba5c734a7222a6ae02c72cba /libavcodec/fft.c
parent: 27cc3e72f8502d6239dcd0f1dd3fe73f1c85355d (diff)
download: ffmpeg-18d7074b4e5a1112cfa6a53dde0faa25d2bd0b15.tar.gz
1 files changed, 179 insertions, 2 deletions
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index 84d236d558..5244dcaa8e 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -32,6 +32,10 @@
 #include "fft.h"
 #include "fft-internal.h"
 
+#if CONFIG_FFT_FIXED_32
+#include "fft_table.h"
+#else /* CONFIG_FFT_FIXED_32 */
+
 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
 #if !CONFIG_HARDCODED_TABLES
 COSTABLE(16);
@@ -65,6 +69,8 @@ COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
     FFT_NAME(ff_cos_65536),
 };
 
+#endif /* CONFIG_FFT_FIXED_32 */
+
 static void fft_permute_c(FFTContext *s, FFTComplex *z);
 static void fft_calc_c(FFTContext *s, FFTComplex *z);
 
@@ -81,7 +87,7 @@ static int split_radix_permutation(int i, int n, int inverse)
 
 av_cold void ff_init_ff_cos_tabs(int index)
 {
-#if !CONFIG_HARDCODED_TABLES
+#if (!CONFIG_HARDCODED_TABLES) && (!CONFIG_FFT_FIXED_32)
     int i;
     int m = 1<<index;
     double freq = 2*M_PI/m;
@@ -157,6 +163,12 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
     s->mdct_calc   = ff_mdct_calc_c;
 #endif
 
+#if CONFIG_FFT_FIXED_32
+    {
+        int n=0;
+        ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
+    }
+#else /* CONFIG_FFT_FIXED_32 */
 #if CONFIG_FFT_FLOAT
     if (ARCH_ARM)     ff_fft_init_arm(s);
     if (ARCH_PPC)     ff_fft_init_ppc(s);
@@ -167,10 +179,11 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
     if (CONFIG_MDCT)  s->mdct_calcw = ff_mdct_calcw_c;
     if (ARCH_ARM)     ff_fft_fixed_init_arm(s);
 #endif
-
     for(j=4; j<=nbits; j++) {
         ff_init_ff_cos_tabs(j);
     }
+#endif /* CONFIG_FFT_FIXED_32 */
+
 
     if (s->fft_permutation == FF_FFT_PERM_AVX) {
         fft_perm_avx(s);
@@ -206,6 +219,169 @@ av_cold void ff_fft_end(FFTContext *s)
     av_freep(&s->tmp_buf);
 }
 
+#if CONFIG_FFT_FIXED_32
+
+static void fft_calc_c(FFTContext *s, FFTComplex *z) {
+
+    int nbits, i, n, num_transforms, offset, step;
+    int n4, n2, n34;
+    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    FFTComplex *tmpz;
+    FFTSample w_re, w_im;
+    FFTSample *w_re_ptr, *w_im_ptr;
+    const int fft_size = (1 << s->nbits);
+    int64_t accu;
+
+    num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
+
+    for (n=0; n<num_transforms; n++){
+        offset = fft_offsets_lut[n] << 2;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[0].re + tmpz[1].re;
+        tmp5 = tmpz[2].re + tmpz[3].re;
+        tmp2 = tmpz[0].im + tmpz[1].im;
+        tmp6 = tmpz[2].im + tmpz[3].im;
+        tmp3 = tmpz[0].re - tmpz[1].re;
+        tmp8 = tmpz[2].im - tmpz[3].im;
+        tmp4 = tmpz[0].im - tmpz[1].im;
+        tmp7 = tmpz[2].re - tmpz[3].re;
+
+        tmpz[0].re = tmp1 + tmp5;
+        tmpz[2].re = tmp1 - tmp5;
+        tmpz[0].im = tmp2 + tmp6;
+        tmpz[2].im = tmp2 - tmp6;
+        tmpz[1].re = tmp3 + tmp8;
+        tmpz[3].re = tmp3 - tmp8;
+        tmpz[1].im = tmp4 - tmp7;
+        tmpz[3].im = tmp4 + tmp7;
+    }
+
+    if (fft_size < 8)
+        return;
+
+    num_transforms = (num_transforms >> 1) | 1;
+
+    for (n=0; n<num_transforms; n++){
+        offset = fft_offsets_lut[n] << 3;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[4].re + tmpz[5].re;
+        tmp3 = tmpz[6].re + tmpz[7].re;
+        tmp2 = tmpz[4].im + tmpz[5].im;
+        tmp4 = tmpz[6].im + tmpz[7].im;
+        tmp5 = tmp1 + tmp3;
+        tmp7 = tmp1 - tmp3;
+        tmp6 = tmp2 + tmp4;
+        tmp8 = tmp2 - tmp4;
+
+        tmp1 = tmpz[4].re - tmpz[5].re;
+        tmp2 = tmpz[4].im - tmpz[5].im;
+        tmp3 = tmpz[6].re - tmpz[7].re;
+        tmp4 = tmpz[6].im - tmpz[7].im;
+
+        tmpz[4].re = tmpz[0].re - tmp5;
+        tmpz[0].re = tmpz[0].re + tmp5;
+        tmpz[4].im = tmpz[0].im - tmp6;
+        tmpz[0].im = tmpz[0].im + tmp6;
+        tmpz[6].re = tmpz[2].re - tmp8;
+        tmpz[2].re = tmpz[2].re + tmp8;
+        tmpz[6].im = tmpz[2].im + tmp7;
+        tmpz[2].im = tmpz[2].im - tmp7;
+
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp1 + tmp2);
+        tmp5 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp3 - tmp4);
+        tmp7 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp2 - tmp1);
+        tmp6 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp3 + tmp4);
+        tmp8 = (int32_t)((accu + 0x40000000) >> 31);
+        tmp1 = tmp5 + tmp7;
+        tmp3 = tmp5 - tmp7;
+        tmp2 = tmp6 + tmp8;
+        tmp4 = tmp6 - tmp8;
+
+        tmpz[5].re = tmpz[1].re - tmp1;
+        tmpz[1].re = tmpz[1].re + tmp1;
+        tmpz[5].im = tmpz[1].im - tmp2;
+        tmpz[1].im = tmpz[1].im + tmp2;
+        tmpz[7].re = tmpz[3].re - tmp4;
+        tmpz[3].re = tmpz[3].re + tmp4;
+        tmpz[7].im = tmpz[3].im + tmp3;
+        tmpz[3].im = tmpz[3].im - tmp3;
+    }
+
+    step = 1 << ((MAX_LOG2_NFFT-4) - 4);
+    n4 = 4;
+
+    for (nbits=4; nbits<=s->nbits; nbits++){
+        n2  = 2*n4;
+        n34 = 3*n4;
+        num_transforms = (num_transforms >> 1) | 1;
+
+        for (n=0; n<num_transforms; n++){
+            offset = fft_offsets_lut[n] << nbits;
+            tmpz = z + offset;
+
+            tmp5 = tmpz[ n2].re + tmpz[n34].re;
+            tmp1 = tmpz[ n2].re - tmpz[n34].re;
+            tmp6 = tmpz[ n2].im + tmpz[n34].im;
+            tmp2 = tmpz[ n2].im - tmpz[n34].im;
+
+            tmpz[ n2].re = tmpz[ 0].re - tmp5;
+            tmpz[  0].re = tmpz[ 0].re + tmp5;
+            tmpz[ n2].im = tmpz[ 0].im - tmp6;
+            tmpz[  0].im = tmpz[ 0].im + tmp6;
+            tmpz[n34].re = tmpz[n4].re - tmp2;
+            tmpz[ n4].re = tmpz[n4].re + tmp2;
+            tmpz[n34].im = tmpz[n4].im + tmp1;
+            tmpz[ n4].im = tmpz[n4].im - tmp1;
+
+            w_re_ptr = w_tab_sr + step;
+            w_im_ptr = w_tab_sr + MAX_FFT_SIZE/(4*16) - step;
+
+            for (i=1; i<n4; i++){
+                w_re = w_re_ptr[0];
+                w_im = w_im_ptr[0];
+                accu  = (int64_t)w_re*tmpz[ n2+i].re;
+                accu += (int64_t)w_im*tmpz[ n2+i].im;
+                tmp1 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[ n2+i].im;
+                accu -= (int64_t)w_im*tmpz[ n2+i].re;
+                tmp2 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[n34+i].re;
+                accu -= (int64_t)w_im*tmpz[n34+i].im;
+                tmp3 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[n34+i].im;
+                accu += (int64_t)w_im*tmpz[n34+i].re;
+                tmp4 = (int32_t)((accu + 0x40000000) >> 31);
+
+                tmp5 = tmp1 + tmp3;
+                tmp1 = tmp1 - tmp3;
+                tmp6 = tmp2 + tmp4;
+                tmp2 = tmp2 - tmp4;
+
+                tmpz[ n2+i].re = tmpz[   i].re - tmp5;
+                tmpz[    i].re = tmpz[   i].re + tmp5;
+                tmpz[ n2+i].im = tmpz[   i].im - tmp6;
+                tmpz[    i].im = tmpz[   i].im + tmp6;
+                tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+                tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+                tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+                tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+
+                w_re_ptr += step;
+                w_im_ptr -= step;
+            }
+        }
+        step >>= 1;
+        n4   <<= 1;
+    }
+}
+
+#else /* CONFIG_FFT_FIXED_32 */
+
 #define BUTTERFLIES(a0,a1,a2,a3) {\
     BF(t3, t5, t5, t1);\
     BF(a2.re, a0.re, a0.re, t5);\
@@ -351,3 +527,4 @@ static void fft_calc_c(FFTContext *s, FFTComplex *z)
 {
     fft_dispatch[s->nbits-2](z);
 }
+#endif /* CONFIG_FFT_FIXED_32 */
author	Nedeljko Babic <nbabic@mips.com>	2013-06-03 16:11:12 +0200
committer	Michael Niedermayer <michaelni@gmx.at>	2013-08-04 14:01:41 +0200
commit	18d7074b4e5a1112cfa6a53dde0faa25d2bd0b15 (patch)
tree	0d23cbed3eb59d91ba5c734a7222a6ae02c72cba /libavcodec/fft.c
parent	27cc3e72f8502d6239dcd0f1dd3fe73f1c85355d (diff)
download	ffmpeg-18d7074b4e5a1112cfa6a53dde0faa25d2bd0b15.tar.gz