aboutsummaryrefslogtreecommitdiffstats
path: root/libavutil/x86/tx_float_init.c
diff options
context:
space:
mode:
authorLynne <dev@lynne.ee>2022-09-03 03:34:57 +0200
committerLynne <dev@lynne.ee>2022-09-06 04:21:41 +0200
commit2425d5cd7e37387305f85bef63f7441c8b1cc147 (patch)
tree83c67a7dd2a300607a6cca3df4928ab842c6bb81 /libavutil/x86/tx_float_init.c
parentb881d2db8892e88a625dfa4ac2d5b8ca53ab9595 (diff)
downloadffmpeg-2425d5cd7e37387305f85bef63f7441c8b1cc147.tar.gz
x86/tx_float: add support for calling assembly functions from assembly
Needed for the next patch. We get this for the extremely small cost of a branch on _ns functions, which wouldn't be used anyway with assembly.
Diffstat (limited to 'libavutil/x86/tx_float_init.c')
-rw-r--r--libavutil/x86/tx_float_init.c25
1 files changed, 25 insertions, 0 deletions
diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c
index 5db0b57d13..9fd98103c8 100644
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -43,6 +43,15 @@ TX_DECL_FN(fft_sr_ns, fma3)
TX_DECL_FN(fft_sr, avx2)
TX_DECL_FN(fft_sr_ns, avx2)
+TX_DECL_FN(fft8_asm, sse3)
+TX_DECL_FN(fft8_asm, avx)
+TX_DECL_FN(fft16_asm, avx)
+TX_DECL_FN(fft16_asm, fma3)
+TX_DECL_FN(fft32_asm, avx)
+TX_DECL_FN(fft32_asm, fma3)
+TX_DECL_FN(fft_sr_asm, fma3)
+TX_DECL_FN(fft_sr_asm, avx2)
+
#define DECL_INIT_FN(basis, interleave) \
static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
const FFTXCodelet *cd, \
@@ -70,30 +79,46 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
+ TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3,
+ AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+ TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX,
+ AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+ TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX,
+ AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+ TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3,
+ AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW),
#if ARCH_X86_64
TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+ TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX,
+ AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+ TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3,
+ AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW),
+ TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3,
+ AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW),
#if HAVE_AVX2_EXTERNAL
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0,
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
+ TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2,
+ AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
#endif