diff options
author | Lynne <dev@lynne.ee> | 2022-05-20 10:00:43 +0200 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-05-20 10:12:34 +0200 |
commit | 82a68a8771ca39564f6a74e0f875d6852e7a0c2a (patch) | |
tree | 92992c7067cf22f88b701ec6a143496592cac03c /libavutil | |
parent | 41a558fea06cc0a23b8d2d0dfb03ef6a25cf5100 (diff) | |
download | ffmpeg-82a68a8771ca39564f6a74e0f875d6852e7a0c2a.tar.gz |
x86/tx_float: remove vgatherdpd usage
Its performance ranges from being just as fast as individual loads
(Skylake), to a few percent slower (Alderlake), 8% slower (Zen 3), to completely
disastrous (older/other CPUs).
Sadly, gathers never turned out to be fast on x86, even with the benefit of time and
implementation experience.
This also saves a register, as there's no need to fill out an additional
register mask.
Zen 3 (16384-point transform):
Before: 1561050 decicycles in av_tx (fft), 131072 runs, 0 skips
After: 1449621 decicycles in av_tx (fft), 131072 runs, 0 skips
Alderlake:
2% slower on big transforms (65536), to 1% (131072), to a few percent for smaller
sizes.
Diffstat (limited to 'libavutil')
-rw-r--r-- | libavutil/x86/tx_float.asm | 63 | ||||
-rw-r--r-- | libavutil/x86/tx_float_init.c | 11 |
2 files changed, 31 insertions, 43 deletions
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index bab44b0947..88d0164c3a 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -97,13 +97,7 @@ SECTION .text ; %4 - LUT offset ; %5 - temporary GPR (only used if vgather is not used) ; %6 - temporary register (for avx only) -; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set) -%macro LOAD64_LUT 5-7 -%if %0 > 6 && cpuflag(avx2) - pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25 - movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction - vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args -%else +%macro LOAD64_LUT 5-6 mov %5d, [%3 + %4 + 0] movsd xmm%1, [%2 + %5q*8] %if mmsize == 32 @@ -117,7 +111,6 @@ SECTION .text movhps xmm%6, [%2 + %5q*8] vinsertf128 %1, %1, xmm%6, 1 %endif -%endif %endmacro ; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode) @@ -820,10 +813,10 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp movaps m7, [inq + 7*mmsize] %else mov ctxq, [ctxq + AVTXContext.map] - LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9 - LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11 - LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m12, m13 - LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m14, m15 + LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8 + LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9 + LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10 + LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11 %endif FFT8 m4, m5, m6, m7, m8, m9 @@ -834,10 +827,10 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp movaps m2, [inq + 2*mmsize] movaps m3, [inq + 3*mmsize] %else - LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m9 - LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m10, m11 - LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13 - LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15 + LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8 + 
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9 + LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10 + LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11 %endif movaps m8, [tab_32_float] @@ -939,10 +932,10 @@ ALIGN 16 movaps m6, [inq + 6*mmsize] movaps m7, [inq + 7*mmsize] %else - LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m9 - LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m10, m11 - LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m12, m13 - LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m14, m15 + LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8 + LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m9 + LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10 + LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11 %endif FFT8 m4, m5, m6, m7, m8, m9 @@ -953,10 +946,10 @@ ALIGN 16 movaps m2, [inq + 2*mmsize] movaps m3, [inq + 3*mmsize] %else - LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m9 - LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m10, m11 - LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13 - LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15 + LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8 + LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m9 + LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10 + LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11 %endif movaps m8, [tab_32_float] @@ -1013,10 +1006,10 @@ ALIGN 16 movaps tx1_o0, [inq + 2*mmsize] movaps tx1_o1, [inq + 3*mmsize] %else - LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tw_o - LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tmp1, tmp2 - LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tw_o - LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp1, tmp2 + LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e + LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o + LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tmp1 + LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp2 %endif FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1 @@ -1027,10 +1020,10 @@ ALIGN 16 movaps 
tx2_o0, [inq + 6*mmsize] movaps tx2_o1, [inq + 7*mmsize] %else - LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1, tmp2 - LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_e, tw_o - LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tmp1, tmp2 - LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e, tw_o + LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1 + LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tmp2 + LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_o + LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e %endif FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o @@ -1287,8 +1280,6 @@ FFT_SPLIT_RADIX_DEF 131072 %if ARCH_X86_64 FFT_SPLIT_RADIX_FN avx, float, 0 FFT_SPLIT_RADIX_FN avx, ns_float, 1 -%if HAVE_AVX2_EXTERNAL -FFT_SPLIT_RADIX_FN avx2, float, 0 -FFT_SPLIT_RADIX_FN avx2, ns_float, 1 -%endif +FFT_SPLIT_RADIX_FN fma3, float, 0 +FFT_SPLIT_RADIX_FN fma3, ns_float, 1 %endif diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index e60faf7fa6..08038b8a74 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -40,8 +40,8 @@ TX_DECL_FN(fft32, fma3) TX_DECL_FN(fft32_ns, fma3) TX_DECL_FN(fft_sr, avx) TX_DECL_FN(fft_sr_ns, avx) -TX_DECL_FN(fft_sr, avx2) -TX_DECL_FN(fft_sr_ns, avx2) +TX_DECL_FN(fft_sr, fma3) +TX_DECL_FN(fft_sr_ns, fma3) #define DECL_INIT_FN(basis, interleave) \ static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \ @@ -83,13 +83,10 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, 0), TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), -#if HAVE_AVX2_EXTERNAL TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 256, b8_i2, avx, AVX, 0, 0), TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), - 
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, avx2, AVX2, 0, AV_CPU_FLAG_AVXSLOW), - TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, - AV_CPU_FLAG_AVXSLOW), -#endif + TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, fma3, FMA3, 0, 0), + TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), #endif NULL, |