diff options
author | Lynne <dev@lynne.ee> | 2022-09-03 03:34:57 +0200 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-09-06 04:21:41 +0200 |
commit | 2425d5cd7e37387305f85bef63f7441c8b1cc147 (patch) | |
tree | 83c67a7dd2a300607a6cca3df4928ab842c6bb81 /libavutil/x86/tx_float_init.c | |
parent | b881d2db8892e88a625dfa4ac2d5b8ca53ab9595 (diff) | |
download | ffmpeg-2425d5cd7e37387305f85bef63f7441c8b1cc147.tar.gz |
x86/tx_float: add support for calling assembly functions from assembly
Needed for the next patch.
We get this for the extremely small cost of a branch on _ns functions,
which wouldn't be used anyway with assembly.
Diffstat (limited to 'libavutil/x86/tx_float_init.c')
-rw-r--r-- | libavutil/x86/tx_float_init.c | 25 |
1 files changed, 25 insertions, 0 deletions
diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 5db0b57d13..9fd98103c8 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -43,6 +43,15 @@ TX_DECL_FN(fft_sr_ns, fma3) TX_DECL_FN(fft_sr, avx2) TX_DECL_FN(fft_sr_ns, avx2) +TX_DECL_FN(fft8_asm, sse3) +TX_DECL_FN(fft8_asm, avx) +TX_DECL_FN(fft16_asm, avx) +TX_DECL_FN(fft16_asm, fma3) +TX_DECL_FN(fft32_asm, avx) +TX_DECL_FN(fft32_asm, fma3) +TX_DECL_FN(fft_sr_asm, fma3) +TX_DECL_FN(fft_sr_asm, avx2) + #define DECL_INIT_FN(basis, interleave) \ static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \ const FFTXCodelet *cd, \ @@ -70,30 +79,46 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0), TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0), + TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), #if ARCH_X86_64 TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), #if HAVE_AVX2_EXTERNAL TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), + TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), #endif |