diff options
author | Lynne <dev@lynne.ee> | 2022-09-19 05:53:01 +0200 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-09-23 12:35:27 +0200 |
commit | ace42cf581f8c06872bfb58cf575d9e8bd398c0a (patch) | |
tree | 217d6653d5664d47f95c327fdb09d63e01dffcb3 /libavutil | |
parent | 3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023 (diff) | |
download | ffmpeg-ace42cf581f8c06872bfb58cf575d9e8bd398c0a.tar.gz |
x86/tx_float: add 15xN PFA FFT AVX SIMD
~4x faster than the C version.
The shuffles in the 15pt dim1 are seriously expensive. Not happy with it,
but I'm contempt.
Can be easily converted to pure AVX by removing all vpermpd/vpermps
instructions.
Diffstat (limited to 'libavutil')
-rw-r--r-- | libavutil/tx_template.c | 59 | ||||
-rw-r--r-- | libavutil/x86/tx_float.asm | 280 | ||||
-rw-r--r-- | libavutil/x86/tx_float_init.c | 68 |
3 files changed, 381 insertions, 26 deletions
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 2c9682ffb7..6b63cc575f 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -48,9 +48,9 @@ SR_TABLE(65536); SR_TABLE(131072); /* Other factors' tables */ -TABLE_DEF(53, 8); -TABLE_DEF( 7, 6); -TABLE_DEF( 9, 8); +TABLE_DEF(53, 12); +TABLE_DEF( 7, 6); +TABLE_DEF( 9, 8); typedef struct FFSRTabsInitOnce { void (*func)(void); @@ -104,19 +104,26 @@ static FFSRTabsInitOnce sr_tabs_init_once[] = { { TX_TAB(ff_tx_init_tab_131072), AV_ONCE_INIT }, }; -static void TX_TAB(ff_tx_init_tab_53)(void) +static av_cold void TX_TAB(ff_tx_init_tab_53)(void) { - TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 12)); - TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 12)); - TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 6)); - TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(8 * M_PI / 6)); - TX_TAB(ff_tx_tab_53)[4] = RESCALE(cos(2 * M_PI / 5)); - TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(8 * M_PI / 5)); - TX_TAB(ff_tx_tab_53)[6] = RESCALE(cos(2 * M_PI / 10)); - TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(6 * M_PI / 5)); + /* 5pt, doubled to eliminate AVX lane shuffles */ + TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5)); + TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5)); + TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10)); + TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10)); + TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5)); + TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5)); + TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10)); + TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10)); + + /* 3pt */ + TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12)); + TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12)); + TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6)); + TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6)); } -static void TX_TAB(ff_tx_init_tab_7)(void) +static av_cold void TX_TAB(ff_tx_init_tab_7)(void) { TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7)); TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7)); @@ -126,7 +133,7 @@ static void TX_TAB(ff_tx_init_tab_7)(void) TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14)); } -static void TX_TAB(ff_tx_init_tab_9)(void) +static av_cold void TX_TAB(ff_tx_init_tab_9)(void) { TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3)); TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3)); @@ -189,19 +196,19 @@ static av_always_inline void fft3(TXComplex *out, TXComplex *in, out[0*stride].im = in[0].im + tmp[1].im; #ifdef TX_INT32 - mtmp[0] = (int64_t)tab[0] * tmp[0].re; - mtmp[1] = (int64_t)tab[1] * tmp[0].im; - mtmp[2] = (int64_t)tab[2] * tmp[1].re; - mtmp[3] = (int64_t)tab[2] * tmp[1].im; + mtmp[0] = (int64_t)tab[ 8] * tmp[0].re; + mtmp[1] = (int64_t)tab[ 9] * tmp[0].im; + mtmp[2] = (int64_t)tab[10] * tmp[1].re; + mtmp[3] = (int64_t)tab[10] * tmp[1].im; out[1*stride].re = in[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31); out[1*stride].im = in[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31); out[2*stride].re = in[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31); out[2*stride].im = in[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31); #else - tmp[0].re = tab[0] * tmp[0].re; - tmp[0].im = tab[1] * tmp[0].im; - tmp[1].re = tab[2] * tmp[1].re; - tmp[1].im = tab[2] * tmp[1].im; + tmp[0].re = tab[ 8] * tmp[0].re; + tmp[0].im = tab[ 9] * tmp[0].im; + tmp[1].re = tab[10] * tmp[1].re; + tmp[1].im = tab[10] * tmp[1].im; out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re; out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im; out[2*stride].re = in[0].re - tmp[1].re - tmp[0].re; @@ -224,10 +231,10 @@ static av_always_inline void NAME(TXComplex *out, TXComplex *in, \ out[D0*stride].re = in[0].re + t[0].re + t[2].re; \ out[D0*stride].im = in[0].im + t[0].im + t[2].im; \ \ - SMUL(t[4].re, t[0].re, tab[4], tab[6], t[2].re, t[0].re); \ - SMUL(t[4].im, t[0].im, tab[4], tab[6], t[2].im, t[0].im); \ - CMUL(t[5].re, t[1].re, -tab[5], -tab[7], t[3].re, t[1].re); \ - CMUL(t[5].im, t[1].im, -tab[5], -tab[7], t[3].im, t[1].im); \ + SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \ + SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \ + CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \ + CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \ \ BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \ BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \ diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index 5e0c438b9c..67f363fc01 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -51,6 +51,8 @@ cextern tab_ %+ i %+ _float ; ff_tab_i_float... %assign i (i << 1) %endrep +cextern tab_53_float + struc AVTXContext .len: resd 1 ; Length .inv resd 1 ; Inverse flag @@ -87,6 +89,9 @@ s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, CO s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1 s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2 +s15_perm: dd 0, 6, 5, 3, 2, 4, 7, 1 + +mask_mmmmmmpp: dd NEG, NEG, NEG, NEG, NEG, NEG, POS, POS mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS @@ -1565,3 +1570,278 @@ cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp, %if ARCH_X86_64 && HAVE_AVX2_EXTERNAL IMDCT_FN avx2 %endif + +%macro PFA_15_FN 2 +INIT_YMM %1 +%if %2 +cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ + tgt5, stride3, stride5, btmp +%else +cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ + tgt5, stride3, stride5, btmp +%endif + +%if %2 + PUSH inq + PUSH tgt5q + PUSH stride3q + PUSH stride5q + PUSH btmpq +%endif + + mov btmpq, outq + + mov outq, [ctxq + AVTXContext.tmp] +%if !%2 + movsxd lenq, dword [ctxq + AVTXContext.len] + mov lutq, [ctxq + AVTXContext.map] +%endif + + ; Load stride (second transform's length) and second transform's LUT + mov tmpq, [ctxq + AVTXContext.sub] + movsxd strideq, dword [tmpq + AVTXContext.len] + mov mapq, [tmpq + AVTXContext.map] + + shl strideq, 3 + imul stride3q, strideq, 3 + imul stride5q, strideq, 5 + + movaps m13, [mask_mmmmmmpp] ; mmmmmmpp + vpermpd m12, m13, q0033 ; ppppmmmm + vextractf128 xm11, m13, 1 ; mmpp + movaps m10, [ff_tx_tab_53_float] ; tab5 + movaps xm9, [ff_tx_tab_53_float + 32] ; tab3 + movaps m8, [s15_perm] + +.dim1: + mov tmpd, [mapq] + lea tgtq, [outq + tmpq*8] + +%if %2 + movups xm0, [inq] +%else + LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 ; in[0,1].reim +%endif + + shufps xm1, xm0, xm0, q3223 ; in[1].imrereim + shufps xm0, xm0, xm0, q1001 ; in[0].imrereim + + xorps xm1, xm11 + addps xm1, xm0 ; pc[0,1].imre + +%if %2 + movddup xm14, [inq + 16] ; in[2].reimreim +%else + mov tmpd, [lutq + 8] + movddup xm14, [inq + tmpq*8] ; in[2].reimreim +%endif + shufps xm0, xm1, xm1, q3232 ; pc[1].reimreim + addps xm0, xm14 ; dc[0].reimreim + + mulps xm1, xm9 ; tab[0123]*pc[01] + + shufpd xm5, xm1, xm1, 01b ; pc[1,0].reim + xorps xm1, xm11 + addps xm1, xm1, xm5 + addsubps xm1, xm14, xm1 ; dc[1,2].reim + +%if %2 + movups m2, [inq + mmsize*0 + 24] + movups m3, [inq + mmsize*1 + 24] +%else + LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m14, m15 + LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15 +%endif + + subps m7, m2, m3 ; q[0-3].imre + addps m6, m2, m3 ; q[4-7] + shufps m7, m7, m7, q2301 ; q[0-3].reim + +%if %2 + movups m4, [inq + mmsize*2 + 24] +%else + LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m14, m15 +%endif + + addps m5, m4, m6 ; y[0-3] + + vpermpd m14, m9, q1111 ; tab[23232323] + vbroadcastsd m15, xm9 ; tab[01010101] + + mulps m6, m14 + mulps m7, m15 + + subps m2, m6, m7 ; k[0-3] + addps m3, m6, m7 ; k[4-7] + + addsubps m6, m4, m2 ; k[0-3] + addsubps m7, m4, m3 ; k[4-7] + + ; 15pt from here on + vpermpd m2, m5, q0123 ; y[3-0] + vpermpd m3, m6, q0123 ; k[3-0] + vpermpd m4, m7, q0123 ; k[7-4] + + xorps m5, m12 + xorps m6, m12 + xorps m7, m12 + + addps m2, m5 ; t[0-3] + addps m3, m6 ; t[4-7] + addps m4, m7 ; t[8-11] + + movlhps xm14, xm2 ; out[0] + unpcklpd xm7, xm3, xm4 ; out[10,5] + unpckhpd xm5, xm3, xm4 ; out[10,5] + + addps xm14, xm2 ; out[0] + addps xm7, xm5 ; out[10,5] + addps xm14, xm0 ; out[0] + addps xm7, xm1 ; out[10,5] + + movhps [tgtq], xm14 ; out[0] + movhps [tgtq + stride5q*1], xm7 ; out[5] + movlps [tgtq + stride5q*2], xm7 ; out[10] + + shufps m14, m10, m10, q3232 ; tab5 4 5 4 5 8 9 8 9 + shufps m15, m10, m10, q1010 ; tab5 6 7 6 7 10 11 10 11 + + mulps m5, m2, m14 ; t[0-3] + mulps m6, m3, m14 ; t[4-7] + mulps m7, m4, m14 ; t[8-11] + + mulps m2, m15 ; r[0-3] + mulps m3, m15 ; r[4-7] + mulps m4, m15 ; r[8-11] + + shufps m5, m5, m5, q1032 ; t[1,0,3,2].reim + shufps m6, m6, m6, q1032 ; t[5,4,7,6].reim + shufps m7, m7, m7, q1032 ; t[9,8,11,10].reim + + lea tgt5q, [tgtq + stride5q] + lea tmpq, [tgtq + stride5q*2] + + xorps m5, m13 + xorps m6, m13 + xorps m7, m13 + + addps m2, m5 ; r[0,1,2,3] + addps m3, m6 ; r[4,5,6,7] + addps m4, m7 ; r[8,9,10,11] + + shufps m5, m2, m2, q2301 + shufps m6, m3, m3, q2301 + shufps m7, m4, m4, q2301 + + xorps m2, m12 + xorps m3, m12 + xorps m4, m12 + + vpermpd m5, m5, q0123 + vpermpd m6, m6, q0123 + vpermpd m7, m7, q0123 + + addps m5, m2 + addps m6, m3 + addps m7, m4 + + vpermps m5, m8, m5 + vpermps m6, m8, m6 + vpermps m7, m8, m7 + + vbroadcastsd m0, xm0 ; dc[0] + vpermpd m2, m1, q1111 ; dc[2] + vbroadcastsd m1, xm1 ; dc[1] + + addps m0, m5 + addps m1, m6 + addps m2, m7 + + vextractf128 xm3, m0, 1 + vextractf128 xm4, m1, 1 + vextractf128 xm5, m2, 1 + + movlps [tgtq + strideq*1], xm1 + movhps [tgtq + strideq*2], xm2 + movlps [tgtq + stride3q*1], xm3 + movhps [tgtq + strideq*4], xm4 + movlps [tgtq + stride3q*2], xm0 + movlps [tgtq + strideq*8], xm5 + movhps [tgtq + stride3q*4], xm0 + movhps [tgt5q + strideq*2], xm1 + movhps [tgt5q + strideq*4], xm3 + movlps [tmpq + strideq*1], xm2 + movlps [tmpq + stride3q*1], xm4 + movhps [tmpq + strideq*4], xm5 + +%if %2 + add inq, mmsize*3 + 24 +%else + add lutq, (mmsize/2)*3 + 12 +%endif + add mapq, 4 + sub lenq, 15 + jg .dim1 + + ; Second transform setup + mov stride5q, ctxq ; backup original context + movsxd stride3q, dword [ctxq + AVTXContext.len] ; full length + mov tgt5q, [ctxq + AVTXContext.fn] ; subtransform's jump point + + mov inq, outq ; in-place transform + mov ctxq, [ctxq + AVTXContext.sub] ; load subtransform's context + mov lutq, [ctxq + AVTXContext.map] ; load subtransform's map + movsxd lenq, dword [ctxq + AVTXContext.len] ; load subtransform's length + +.dim2: + call tgt5q ; call the FFT + lea inq, [inq + lenq*8] + lea outq, [outq + lenq*8] + sub stride3q, lenq + jg .dim2 + + mov ctxq, stride5q ; restore original context + mov lutq, [ctxq + AVTXContext.map] + mov inq, [ctxq + AVTXContext.tmp] + movsxd lenq, dword [ctxq + AVTXContext.len] ; full length + + lea stride3q, [lutq + lenq*4] ; second part of the LUT + mov stride5q, lenq + mov tgt5q, btmpq + +.post: + LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9 + movups [tgt5q], m0 + + add tgt5q, mmsize + add stride3q, mmsize/2 + sub stride5q, mmsize/8 + jg .post + +%if %2 + mov outq, btmpq + POP btmpq + POP stride5q + POP stride3q + POP tgt5q + POP inq + ret +%else + RET +%endif + +%if %2 +cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ + tgt5, stride3, stride5, btmp + movsxd lenq, dword [ctxq + AVTXContext.len] + mov lutq, [ctxq + AVTXContext.map] + + call mangle(fft_pfa_15xM_asm_float) + RET +%endif +%endmacro + +%if ARCH_X86_64 +PFA_15_FN avx2, 0 +PFA_15_FN avx2, 1 +%endif diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 06df749fa9..c6695d9afb 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -43,6 +43,9 @@ TX_DECL_FN(fft_sr_ns, fma3) TX_DECL_FN(fft_sr, avx2) TX_DECL_FN(fft_sr_ns, avx2) +TX_DECL_FN(fft_pfa_15xM, avx2) +TX_DECL_FN(fft_pfa_15xM_ns, avx2) + TX_DECL_FN(mdct_sr_inv, avx2) TX_DECL_FN(fft2_asm, sse3) @@ -57,6 +60,8 @@ TX_DECL_FN(fft32_asm, fma3) TX_DECL_FN(fft_sr_asm, fma3) TX_DECL_FN(fft_sr_asm, avx2) +TX_DECL_FN(fft_pfa_15xM_asm, avx2) + #define DECL_INIT_FN(basis, interleave) \ static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \ const FFTXCodelet *cd, \ @@ -102,6 +107,61 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd, return 0; } +static av_cold int fft_pfa_init(AVTXContext *s, + const FFTXCodelet *cd, + uint64_t flags, + FFTXCodeletOptions *opts, + int len, int inv, + const void *scale) +{ + int ret; + int sub_len = len / cd->factors[0]; + FFTXCodeletOptions sub_opts = { .invert_lookup = 0 }; + + flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ + flags |= AV_TX_INPLACE; /* in-place */ + flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ + flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */ + + if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, + sub_len, inv, scale))) + return ret; + + if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len))) + return ret; + + if (cd->factors[0] == 15) { + for (int k = 0; k < s->sub[0].len; k++) { + int cnt = 0; + int tmp[15]; + memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp)); + for (int i = 1; i < 15; i += 3) { + s->map[k*15 + cnt] = tmp[i]; + cnt++; + } + for (int i = 2; i < 15; i += 3) { + s->map[k*15 + cnt] = tmp[i]; + cnt++; + } + for (int i = 0; i < 15; i += 3) { + s->map[k*15 + cnt] = tmp[i]; + cnt++; + } + memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int)); + memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int)); + s->map[k*15 + 1] = tmp[2]; + s->map[k*15 + 2] = tmp[0]; + } + } + + if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) + return AVERROR(ENOMEM); + + TX_TAB(ff_tx_init_tabs)(len / sub_len); + + return 0; +} + const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0), TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, @@ -150,6 +210,7 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), + #if HAVE_AVX2_EXTERNAL TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), @@ -158,6 +219,13 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 320, fft_pfa_init, avx2, AVX2, + AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 384, fft_pfa_init, avx2, AVX2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 384, fft_pfa_init, avx2, AVX2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(mdct_sr_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2, FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), #endif |