author     Lynne <dev@lynne.ee>                      2022-09-23 10:38:29 +0200
committer  Lynne <dev@lynne.ee>                      2022-09-23 12:35:28 +0200
commit     74e8541bab8e885a0e819b8298ae6bfb72042be9 (patch)
tree       f46aab664669479681371a8d555c8b563f475f7e /libavutil/x86
parent     ace42cf581f8c06872bfb58cf575d9e8bd398c0a (diff)
x86/tx_float: generalize iMDCT
To support non-aligned buffers during the post-transform step, just iterate
backwards over the array.
This allows using the 15xN-point FFT, which makes the transform 2.1 times
faster than our old libavcodec implementation.
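
The change is easiest to see in scalar form. Below is a minimal C sketch of a
dual-ended iMDCT post-rotation loop of the kind this commit makes
alignment-agnostic: one index walks the low half of the buffer backwards while
the other walks the high half forwards, so every access is a plain indexed
load/store and the output pointer never needs to be pre-biased to an aligned
base (the asm pairs this with movups). The names, types, and exact rotation
below are illustrative placeholders, not the committed code:

    /* hypothetical sketch; mirrors the loop shape, not the asm's math */
    typedef struct { float re, im; } cplx;

    static void imdct_post(cplx *z, const cplx *exp, int len4)
    {
        for (int i = 0; i < len4; i++) {
            const int lo = len4 - i - 1;   /* low half, walked backwards */
            const int hi = len4 + i;       /* high half, walked forwards */
            cplx l = z[lo], h = z[hi];     /* read both ends before writing */

            /* rotate each end by its twiddle factor; results are written
             * back crosswise, like the asm's swapped stores */
            z[hi].re = l.im * exp[lo].im - l.re * exp[lo].re;
            z[lo].im = l.im * exp[lo].re + l.re * exp[lo].im;
            z[lo].re = h.im * exp[hi].im - h.re * exp[hi].re;
            z[hi].im = h.im * exp[hi].re + h.re * exp[hi].im;
        }
    }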
Diffstat (limited to 'libavutil/x86')
-rw-r--r--  libavutil/x86/tx_float.asm     | 52
-rw-r--r--  libavutil/x86/tx_float_init.c  | 17
2 files changed, 40 insertions(+), 29 deletions(-)
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 67f363fc01..c3b1375bc4 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -1389,17 +1389,16 @@ FFT_SPLIT_RADIX_FN avx2, 1
 
 %macro IMDCT_FN 1
 INIT_YMM %1
-cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp, t1, t2, t3, t4, t5, bctx
+cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \
+                                        t4, t5, btmp
     movsxd lenq, dword [ctxq + AVTXContext.len]
     mov expq, [ctxq + AVTXContext.exp]
 
     lea t1d, [lend - 1]
     imul t1d, strided
 
-    mov bctxq, ctxq                    ; backup original context
-    mov t5q, [ctxq + AVTXContext.fn]   ; subtransform's jump point
-    mov ctxq, [ctxq + AVTXContext.sub] ; load subtransform's context
-    mov lutq, [ctxq + AVTXContext.map] ; load subtransform's map
+    mov btmpq, ctxq                    ; backup original context
+    mov lutq, [ctxq + AVTXContext.map] ; load map
 
     cmp strideq, 4
     je .stride4
@@ -1444,8 +1443,8 @@ cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp,
     fmaddsubps m10, m12, m2, m10
     fmaddsubps m11, m13, m3, m11
 
-    mova [t2q + 0*mmsize], m10
-    mova [t2q + 1*mmsize], m11
+    movups [t2q + 0*mmsize], m10
+    movups [t2q + 1*mmsize], m11
 
     add expq, mmsize*2
     add lutq, mmsize
@@ -1462,16 +1461,16 @@ cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp,
     lea t2q, [lenq*2 - mmsize/2]
 
 .stride4_pre:
-    movaps m4, [inq]
-    movaps m3, [t1q]
+    movups m4, [inq]
+    movups m3, [t1q]
 
     movsldup m1, m4 ; im im, im im
     movshdup m0, m3 ; re re, re re
     movshdup m4, m4 ; re re, re re (2)
     movsldup m3, m3 ; im im, im im (2)
 
-    movaps m2, [expq]         ; tab
-    movaps m5, [expq + 2*t2q] ; tab (2)
+    movups m2, [expq]         ; tab
+    movups m5, [expq + 2*t2q] ; tab (2)
 
     vpermpd m0, m0, q0123 ; flip
     shufps m7, m2, m2, q2301
@@ -1513,29 +1512,31 @@ cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp,
     add inq, mmsize
     sub t1q, mmsize
     sub t2q, mmsize
-    jg .stride4_pre
+    jge .stride4_pre
 
 .transform:
+    mov t4q, ctxq                      ; backup original context
+    mov t5q, [ctxq + AVTXContext.fn]   ; subtransform's jump point
+    mov ctxq, [ctxq + AVTXContext.sub]
+    mov lutq, [ctxq + AVTXContext.map]
     movsxd lenq, dword [ctxq + AVTXContext.len]
 
+    mov inq, outq ; in-place transform
     call t5q ; call the FFT
 
-    mov ctxq, bctxq ; restore original context
+    mov ctxq, t4q ; restore original context
     movsxd lenq, dword [ctxq + AVTXContext.len]
     mov expq, [ctxq + AVTXContext.exp]
     lea expq, [expq + lenq*4]
 
-    lea t1q, [lenq*2]          ; high
-    lea t2q, [lenq*2 - mmsize] ; low
-
-    neg lenq
-    lea outq, [outq + lenq*4]
+    xor t1q, t1q               ; low
+    lea t2q, [lenq*4 - mmsize] ; high
 
 .post:
-    movaps m2, [expq + t1q] ; tab h
-    movaps m3, [expq + t2q] ; tab l
-    movaps m0, [outq + t1q] ; in h
-    movaps m1, [outq + t2q] ; in l
+    movaps m2, [expq + t2q] ; tab h
+    movaps m3, [expq + t1q] ; tab l
+    movups m0, [outq + t2q] ; in h
+    movups m1, [outq + t1q] ; in l
 
     movshdup m4, m2 ; tab h imim
     movshdup m5, m3 ; tab l imim
@@ -1557,12 +1558,13 @@ cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp,
     blendps m1, m2, m5, 01010101b
     blendps m0, m3, m4, 01010101b
 
-    movaps [outq + t2q], m1
-    movaps [outq + t1q], m0
+    movups [outq + t2q], m0
+    movups [outq + t1q], m1
 
     add t1q, mmsize
     sub t2q, mmsize
-    jge .post
+    sub lenq, mmsize/2
+    jg .post
 
     RET
 %endmacro
diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c
index c6695d9afb..20c1ad6869 100644
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -46,7 +46,7 @@ TX_DECL_FN(fft_sr_ns, avx2)
 TX_DECL_FN(fft_pfa_15xM, avx2)
 TX_DECL_FN(fft_pfa_15xM_ns, avx2)
 
-TX_DECL_FN(mdct_sr_inv, avx2)
+TX_DECL_FN(mdct_inv, avx2)
 
 TX_DECL_FN(fft2_asm, sse3)
 TX_DECL_FN(fft4_fwd_asm, sse2)
@@ -87,7 +87,7 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
                               int len, int inv, const void *scale)
 {
     int ret;
-    FFTXCodeletOptions sub_opts = { .invert_lookup = -1 };
+    FFTXCodeletOptions sub_opts = { .invert_lookup = 1 };
 
     s->scale_d = *((SCALE_TYPE *)scale);
     s->scale_f = s->scale_d;
@@ -101,7 +101,16 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
                             inv, scale)))
         return ret;
 
-    if ((ret = ff_tx_mdct_gen_exp_float(s, s->sub->map)))
+    s->map = av_malloc(len*sizeof(*s->map));
+    if (!s->map)
+        return AVERROR(ENOMEM);
+
+    memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
+    /* Invert lookup table for unstrided path */
+    for (int i = 0; i < (len >> 1); i++)
+        s->map[(len >> 1) + s->map[i]] = i;
+
+    if ((ret = ff_tx_mdct_gen_exp_float(s, s->map)))
         return ret;
 
     return 0;
@@ -226,7 +235,7 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
     TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 384, fft_pfa_init, avx2, AVX2,
            AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
 
-    TX_DEF(mdct_sr_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
+    TX_DEF(mdct_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
            FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
 #endif
 #endif
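
Note on the new lookup table built in m_inv_init above: the first len/2
entries hold the subtransform's map and the second len/2 entries hold its
inverse permutation, so map[len/2 + map[i]] == i for every i. A toy example of
that inversion loop, using a made-up 4-point map (only the indexing matches
the diff; the values are hypothetical):

    #include <stdio.h>

    int main(void)
    {
        int map[8] = { 2, 0, 3, 1 };     /* forward half (assumed values)   */
        for (int i = 0; i < 4; i++)      /* same inversion as in m_inv_init */
            map[4 + map[i]] = i;
        for (int i = 0; i < 8; i++)
            printf("%d ", map[i]);       /* prints: 2 0 3 1 1 3 0 2 */
        printf("\n");
        return 0;
    }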