diff options
author | Lynne <dev@lynne.ee> | 2022-09-19 05:53:01 +0200 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-09-23 12:35:27 +0200 |
commit | ace42cf581f8c06872bfb58cf575d9e8bd398c0a (patch) | |
tree | 217d6653d5664d47f95c327fdb09d63e01dffcb3 /libavutil/tx_template.c | |
parent | 3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023 (diff) | |
download | ffmpeg-ace42cf581f8c06872bfb58cf575d9e8bd398c0a.tar.gz |
x86/tx_float: add 15xN PFA FFT AVX SIMD
~4x faster than the C version.
The shuffles in the 15pt dim1 are seriously expensive. Not happy with it,
but I'm content.
Can be easily converted to pure AVX by removing all vpermpd/vpermps
instructions.
Diffstat (limited to 'libavutil/tx_template.c')
-rw-r--r-- | libavutil/tx_template.c | 59 |
1 file changed, 33 insertions, 26 deletions
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 2c9682ffb7..6b63cc575f 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -48,9 +48,9 @@ SR_TABLE(65536); SR_TABLE(131072); /* Other factors' tables */ -TABLE_DEF(53, 8); -TABLE_DEF( 7, 6); -TABLE_DEF( 9, 8); +TABLE_DEF(53, 12); +TABLE_DEF( 7, 6); +TABLE_DEF( 9, 8); typedef struct FFSRTabsInitOnce { void (*func)(void); @@ -104,19 +104,26 @@ static FFSRTabsInitOnce sr_tabs_init_once[] = { { TX_TAB(ff_tx_init_tab_131072), AV_ONCE_INIT }, }; -static void TX_TAB(ff_tx_init_tab_53)(void) +static av_cold void TX_TAB(ff_tx_init_tab_53)(void) { - TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 12)); - TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 12)); - TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 6)); - TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(8 * M_PI / 6)); - TX_TAB(ff_tx_tab_53)[4] = RESCALE(cos(2 * M_PI / 5)); - TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(8 * M_PI / 5)); - TX_TAB(ff_tx_tab_53)[6] = RESCALE(cos(2 * M_PI / 10)); - TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(6 * M_PI / 5)); + /* 5pt, doubled to eliminate AVX lane shuffles */ + TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5)); + TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5)); + TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10)); + TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10)); + TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5)); + TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5)); + TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10)); + TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10)); + + /* 3pt */ + TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12)); + TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12)); + TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6)); + TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6)); } -static void TX_TAB(ff_tx_init_tab_7)(void) +static av_cold void TX_TAB(ff_tx_init_tab_7)(void) { TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7)); 
TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7)); @@ -126,7 +133,7 @@ static void TX_TAB(ff_tx_init_tab_7)(void) TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14)); } -static void TX_TAB(ff_tx_init_tab_9)(void) +static av_cold void TX_TAB(ff_tx_init_tab_9)(void) { TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3)); TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3)); @@ -189,19 +196,19 @@ static av_always_inline void fft3(TXComplex *out, TXComplex *in, out[0*stride].im = in[0].im + tmp[1].im; #ifdef TX_INT32 - mtmp[0] = (int64_t)tab[0] * tmp[0].re; - mtmp[1] = (int64_t)tab[1] * tmp[0].im; - mtmp[2] = (int64_t)tab[2] * tmp[1].re; - mtmp[3] = (int64_t)tab[2] * tmp[1].im; + mtmp[0] = (int64_t)tab[ 8] * tmp[0].re; + mtmp[1] = (int64_t)tab[ 9] * tmp[0].im; + mtmp[2] = (int64_t)tab[10] * tmp[1].re; + mtmp[3] = (int64_t)tab[10] * tmp[1].im; out[1*stride].re = in[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31); out[1*stride].im = in[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31); out[2*stride].re = in[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31); out[2*stride].im = in[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31); #else - tmp[0].re = tab[0] * tmp[0].re; - tmp[0].im = tab[1] * tmp[0].im; - tmp[1].re = tab[2] * tmp[1].re; - tmp[1].im = tab[2] * tmp[1].im; + tmp[0].re = tab[ 8] * tmp[0].re; + tmp[0].im = tab[ 9] * tmp[0].im; + tmp[1].re = tab[10] * tmp[1].re; + tmp[1].im = tab[10] * tmp[1].im; out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re; out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im; out[2*stride].re = in[0].re - tmp[1].re - tmp[0].re; @@ -224,10 +231,10 @@ static av_always_inline void NAME(TXComplex *out, TXComplex *in, \ out[D0*stride].re = in[0].re + t[0].re + t[2].re; \ out[D0*stride].im = in[0].im + t[0].im + t[2].im; \ \ - SMUL(t[4].re, t[0].re, tab[4], tab[6], t[2].re, t[0].re); \ - SMUL(t[4].im, t[0].im, tab[4], tab[6], t[2].im, t[0].im); \ - CMUL(t[5].re, t[1].re, -tab[5], -tab[7], t[3].re, t[1].re); \ - CMUL(t[5].im, t[1].im, -tab[5], -tab[7], 
t[3].im, t[1].im); \ + SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \ + SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \ + CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \ + CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \ \ BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \ BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \ |