aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/fft.c
diff options
context:
space:
mode:
authorNedeljko Babic <nbabic@mips.com>2013-06-03 16:11:12 +0200
committerMichael Niedermayer <michaelni@gmx.at>2013-08-04 14:01:41 +0200
commit18d7074b4e5a1112cfa6a53dde0faa25d2bd0b15 (patch)
tree0d23cbed3eb59d91ba5c734a7222a6ae02c72cba /libavcodec/fft.c
parent27cc3e72f8502d6239dcd0f1dd3fe73f1c85355d (diff)
downloadffmpeg-18d7074b4e5a1112cfa6a53dde0faa25d2bd0b15.tar.gz
libavcodec: Implementation of 32 bit fixed point FFT
Iterative implementation of 32 bit fixed point split-radix FFT. Max FFT that can be calculated currently is 2^12. Signed-off-by: Nedeljko Babic <nbabic@mips.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/fft.c')
-rw-r--r--libavcodec/fft.c181
1 files changed, 179 insertions, 2 deletions
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index 84d236d558..5244dcaa8e 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -32,6 +32,10 @@
#include "fft.h"
#include "fft-internal.h"
+#if CONFIG_FFT_FIXED_32
+#include "fft_table.h"
+#else /* CONFIG_FFT_FIXED_32 */
+
/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
#if !CONFIG_HARDCODED_TABLES
COSTABLE(16);
@@ -65,6 +69,8 @@ COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
FFT_NAME(ff_cos_65536),
};
+#endif /* CONFIG_FFT_FIXED_32 */
+
static void fft_permute_c(FFTContext *s, FFTComplex *z);
static void fft_calc_c(FFTContext *s, FFTComplex *z);
@@ -81,7 +87,7 @@ static int split_radix_permutation(int i, int n, int inverse)
av_cold void ff_init_ff_cos_tabs(int index)
{
-#if !CONFIG_HARDCODED_TABLES
+#if (!CONFIG_HARDCODED_TABLES) && (!CONFIG_FFT_FIXED_32)
int i;
int m = 1<<index;
double freq = 2*M_PI/m;
@@ -157,6 +163,12 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
s->mdct_calc = ff_mdct_calc_c;
#endif
+#if CONFIG_FFT_FIXED_32
+ {
+ int n=0;
+ ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
+ }
+#else /* CONFIG_FFT_FIXED_32 */
#if CONFIG_FFT_FLOAT
if (ARCH_ARM) ff_fft_init_arm(s);
if (ARCH_PPC) ff_fft_init_ppc(s);
@@ -167,10 +179,11 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
if (CONFIG_MDCT) s->mdct_calcw = ff_mdct_calcw_c;
if (ARCH_ARM) ff_fft_fixed_init_arm(s);
#endif
-
for(j=4; j<=nbits; j++) {
ff_init_ff_cos_tabs(j);
}
+#endif /* CONFIG_FFT_FIXED_32 */
+
if (s->fft_permutation == FF_FFT_PERM_AVX) {
fft_perm_avx(s);
@@ -206,6 +219,169 @@ av_cold void ff_fft_end(FFTContext *s)
av_freep(&s->tmp_buf);
}
+#if CONFIG_FFT_FIXED_32
+
+static void fft_calc_c(FFTContext *s, FFTComplex *z) {
+
+ int nbits, i, n, num_transforms, offset, step;
+ int n4, n2, n34;
+ FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ FFTComplex *tmpz;
+ FFTSample w_re, w_im;
+ FFTSample *w_re_ptr, *w_im_ptr;
+ const int fft_size = (1 << s->nbits);
+ int64_t accu;
+
+ num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
+
+ for (n=0; n<num_transforms; n++){
+ offset = fft_offsets_lut[n] << 2;
+ tmpz = z + offset;
+
+ tmp1 = tmpz[0].re + tmpz[1].re;
+ tmp5 = tmpz[2].re + tmpz[3].re;
+ tmp2 = tmpz[0].im + tmpz[1].im;
+ tmp6 = tmpz[2].im + tmpz[3].im;
+ tmp3 = tmpz[0].re - tmpz[1].re;
+ tmp8 = tmpz[2].im - tmpz[3].im;
+ tmp4 = tmpz[0].im - tmpz[1].im;
+ tmp7 = tmpz[2].re - tmpz[3].re;
+
+ tmpz[0].re = tmp1 + tmp5;
+ tmpz[2].re = tmp1 - tmp5;
+ tmpz[0].im = tmp2 + tmp6;
+ tmpz[2].im = tmp2 - tmp6;
+ tmpz[1].re = tmp3 + tmp8;
+ tmpz[3].re = tmp3 - tmp8;
+ tmpz[1].im = tmp4 - tmp7;
+ tmpz[3].im = tmp4 + tmp7;
+ }
+
+ if (fft_size < 8)
+ return;
+
+ num_transforms = (num_transforms >> 1) | 1;
+
+ for (n=0; n<num_transforms; n++){
+ offset = fft_offsets_lut[n] << 3;
+ tmpz = z + offset;
+
+ tmp1 = tmpz[4].re + tmpz[5].re;
+ tmp3 = tmpz[6].re + tmpz[7].re;
+ tmp2 = tmpz[4].im + tmpz[5].im;
+ tmp4 = tmpz[6].im + tmpz[7].im;
+ tmp5 = tmp1 + tmp3;
+ tmp7 = tmp1 - tmp3;
+ tmp6 = tmp2 + tmp4;
+ tmp8 = tmp2 - tmp4;
+
+ tmp1 = tmpz[4].re - tmpz[5].re;
+ tmp2 = tmpz[4].im - tmpz[5].im;
+ tmp3 = tmpz[6].re - tmpz[7].re;
+ tmp4 = tmpz[6].im - tmpz[7].im;
+
+ tmpz[4].re = tmpz[0].re - tmp5;
+ tmpz[0].re = tmpz[0].re + tmp5;
+ tmpz[4].im = tmpz[0].im - tmp6;
+ tmpz[0].im = tmpz[0].im + tmp6;
+ tmpz[6].re = tmpz[2].re - tmp8;
+ tmpz[2].re = tmpz[2].re + tmp8;
+ tmpz[6].im = tmpz[2].im + tmp7;
+ tmpz[2].im = tmpz[2].im - tmp7;
+
+ accu = (int64_t)Q31(M_SQRT1_2)*(tmp1 + tmp2);
+ tmp5 = (int32_t)((accu + 0x40000000) >> 31);
+ accu = (int64_t)Q31(M_SQRT1_2)*(tmp3 - tmp4);
+ tmp7 = (int32_t)((accu + 0x40000000) >> 31);
+ accu = (int64_t)Q31(M_SQRT1_2)*(tmp2 - tmp1);
+ tmp6 = (int32_t)((accu + 0x40000000) >> 31);
+ accu = (int64_t)Q31(M_SQRT1_2)*(tmp3 + tmp4);
+ tmp8 = (int32_t)((accu + 0x40000000) >> 31);
+ tmp1 = tmp5 + tmp7;
+ tmp3 = tmp5 - tmp7;
+ tmp2 = tmp6 + tmp8;
+ tmp4 = tmp6 - tmp8;
+
+ tmpz[5].re = tmpz[1].re - tmp1;
+ tmpz[1].re = tmpz[1].re + tmp1;
+ tmpz[5].im = tmpz[1].im - tmp2;
+ tmpz[1].im = tmpz[1].im + tmp2;
+ tmpz[7].re = tmpz[3].re - tmp4;
+ tmpz[3].re = tmpz[3].re + tmp4;
+ tmpz[7].im = tmpz[3].im + tmp3;
+ tmpz[3].im = tmpz[3].im - tmp3;
+ }
+
+ step = 1 << ((MAX_LOG2_NFFT-4) - 4);
+ n4 = 4;
+
+ for (nbits=4; nbits<=s->nbits; nbits++){
+ n2 = 2*n4;
+ n34 = 3*n4;
+ num_transforms = (num_transforms >> 1) | 1;
+
+ for (n=0; n<num_transforms; n++){
+ offset = fft_offsets_lut[n] << nbits;
+ tmpz = z + offset;
+
+ tmp5 = tmpz[ n2].re + tmpz[n34].re;
+ tmp1 = tmpz[ n2].re - tmpz[n34].re;
+ tmp6 = tmpz[ n2].im + tmpz[n34].im;
+ tmp2 = tmpz[ n2].im - tmpz[n34].im;
+
+ tmpz[ n2].re = tmpz[ 0].re - tmp5;
+ tmpz[ 0].re = tmpz[ 0].re + tmp5;
+ tmpz[ n2].im = tmpz[ 0].im - tmp6;
+ tmpz[ 0].im = tmpz[ 0].im + tmp6;
+ tmpz[n34].re = tmpz[n4].re - tmp2;
+ tmpz[ n4].re = tmpz[n4].re + tmp2;
+ tmpz[n34].im = tmpz[n4].im + tmp1;
+ tmpz[ n4].im = tmpz[n4].im - tmp1;
+
+ w_re_ptr = w_tab_sr + step;
+ w_im_ptr = w_tab_sr + MAX_FFT_SIZE/(4*16) - step;
+
+ for (i=1; i<n4; i++){
+ w_re = w_re_ptr[0];
+ w_im = w_im_ptr[0];
+ accu = (int64_t)w_re*tmpz[ n2+i].re;
+ accu += (int64_t)w_im*tmpz[ n2+i].im;
+ tmp1 = (int32_t)((accu + 0x40000000) >> 31);
+ accu = (int64_t)w_re*tmpz[ n2+i].im;
+ accu -= (int64_t)w_im*tmpz[ n2+i].re;
+ tmp2 = (int32_t)((accu + 0x40000000) >> 31);
+ accu = (int64_t)w_re*tmpz[n34+i].re;
+ accu -= (int64_t)w_im*tmpz[n34+i].im;
+ tmp3 = (int32_t)((accu + 0x40000000) >> 31);
+ accu = (int64_t)w_re*tmpz[n34+i].im;
+ accu += (int64_t)w_im*tmpz[n34+i].re;
+ tmp4 = (int32_t)((accu + 0x40000000) >> 31);
+
+ tmp5 = tmp1 + tmp3;
+ tmp1 = tmp1 - tmp3;
+ tmp6 = tmp2 + tmp4;
+ tmp2 = tmp2 - tmp4;
+
+ tmpz[ n2+i].re = tmpz[ i].re - tmp5;
+ tmpz[ i].re = tmpz[ i].re + tmp5;
+ tmpz[ n2+i].im = tmpz[ i].im - tmp6;
+ tmpz[ i].im = tmpz[ i].im + tmp6;
+ tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+ tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+ tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+ tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+
+ w_re_ptr += step;
+ w_im_ptr -= step;
+ }
+ }
+ step >>= 1;
+ n4 <<= 1;
+ }
+}
+
+#else /* CONFIG_FFT_FIXED_32 */
+
#define BUTTERFLIES(a0,a1,a2,a3) {\
BF(t3, t5, t5, t1);\
BF(a2.re, a0.re, a0.re, t5);\
@@ -351,3 +527,4 @@ static void fft_calc_c(FFTContext *s, FFTComplex *z)
{
fft_dispatch[s->nbits-2](z);
}
+#endif /* CONFIG_FFT_FIXED_32 */