author     Lynne <dev@lynne.ee>    2022-09-03 03:34:57 +0200
committer  Lynne <dev@lynne.ee>    2022-09-06 04:21:41 +0200
commit     2425d5cd7e37387305f85bef63f7441c8b1cc147 (patch)
tree       83c67a7dd2a300607a6cca3df4928ab842c6bb81 /libavutil/x86/tx_float.asm
parent     b881d2db8892e88a625dfa4ac2d5b8ca53ab9595 (diff)
download   ffmpeg-2425d5cd7e37387305f85bef63f7441c8b1cc147.tar.gz
x86/tx_float: add support for calling assembly functions from assembly
Needed for the next patch.
We get this at the extremely small cost of a branch in the _ns functions,
which wouldn't be used anyway when calling from assembly.
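
In practice the change boils down to the pattern below (an illustrative sketch condensed from the diff that follows, not the literal patch): each transform gains an _asm entry point declared with zero GPR/XMM counts so x86inc emits no prologue, takes its inputs directly rather than through the C ABI, and returns with a bare ret; the C-callable _ns entry point becomes a thin wrapper that performs the normal prologue and then calls the _asm symbol.

; Illustrative sketch (condensed from the diff below, not the literal patch);
; %1 selects the assembly-callable variant, and the macro name is made up.
%macro FFT8_SKETCH_FN 1
INIT_XMM sse3
%if %1
cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, tmp ; 0/0/0: no prologue emitted
    ; ...transform body, loading inputs directly from [inq]...
    ret                         ; bare ret: assembly callers need no epilogue
%else
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp     ; normal C-callable entry
    ; ...transform body with LUT-based loads...
    RET                         ; x86inc RET: full C ABI epilogue
%endif

%if %1
; C-callable wrapper around the asm version: this is the "cost of a branch"
cglobal fft8_ns_float, 4, 4, 6, ctx, out, in, tmp
    call ff_tx_fft8_asm_float_sse3
    RET
%endif
%endmacro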
Diffstat (limited to 'libavutil/x86/tx_float.asm')
-rw-r--r--  libavutil/x86/tx_float.asm | 160
1 file changed, 113 insertions(+), 47 deletions(-)
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 191af7d68f..791ad7b322 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -707,20 +707,21 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
 FFT4 fwd, 0
 FFT4 inv, 1
 
-%macro FFT8_SSE_FN 2
+%macro FFT8_SSE_FN 1
 INIT_XMM sse3
-cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
-%if %2
+%if %1
+cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, tmp
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+    movaps m2, [inq + 2*mmsize]
+    movaps m3, [inq + 3*mmsize]
+%else
+cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
     LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
     LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
-%else
-    movaps m0, [inq + 0*mmsize]
-    movaps m1, [inq + 1*mmsize]
-    movaps m2, [inq + 2*mmsize]
-    movaps m3, [inq + 3*mmsize]
 %endif
 
     FFT8 m0, m1, m2, m3, m4, m5
@@ -735,22 +736,33 @@ cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
     movups [outq + 2*mmsize], m5
     movups [outq + 3*mmsize], m1
 
+%if %1
+    ret
+%else
     RET
+%endif
+
+%if %1
+cglobal fft8_ns_float, 4, 4, 6, ctx, out, in, tmp
+    call ff_tx_fft8_asm_float_sse3
+    RET
+%endif
 %endmacro
 
-FFT8_SSE_FN float, 1
-FFT8_SSE_FN ns_float, 0
+FFT8_SSE_FN 0
+FFT8_SSE_FN 1
 
-%macro FFT8_AVX_FN 2
+%macro FFT8_AVX_FN 1
 INIT_YMM avx
-cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
-%if %2
+%if %1
+cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, tmp
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+%else
+cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
-%else
-    movaps m0, [inq + 0*mmsize]
-    movaps m1, [inq + 1*mmsize]
 %endif
 
     FFT8_AVX m0, m1, m2, m3
@@ -764,21 +776,32 @@ cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
     vextractf128 [outq + 16*2], m2, 1
     vextractf128 [outq + 16*3], m0, 1
 
+%if %1
+    ret
+%else
     RET
+%endif
+
+%if %1
+cglobal fft8_ns_float, 4, 4, 4, ctx, out, in, tmp
+    call ff_tx_fft8_asm_float_avx
+    RET
+%endif
 %endmacro
 
-FFT8_AVX_FN float, 1
-FFT8_AVX_FN ns_float, 0
+FFT8_AVX_FN 0
+FFT8_AVX_FN 1
 
-%macro FFT16_FN 3
+%macro FFT16_FN 2
 INIT_YMM %1
-cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
-%if %3
+%if %2
+cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, tmp
     movaps m0, [inq + 0*mmsize]
     movaps m1, [inq + 1*mmsize]
     movaps m2, [inq + 2*mmsize]
     movaps m3, [inq + 3*mmsize]
 %else
+cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
@@ -802,23 +825,34 @@ cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
     vextractf128 [outq + 16*6], m5, 1
     vextractf128 [outq + 16*7], m1, 1
 
+%if %2
+    ret
+%else
     RET
+%endif
+
+%if %2
+cglobal fft16_ns_float, 4, 4, 8, ctx, out, in, tmp
+    call ff_tx_fft16_asm_float_ %+ %1
+    RET
+%endif
 %endmacro
 
-FFT16_FN avx, float, 0
-FFT16_FN avx, ns_float, 1
-FFT16_FN fma3, float, 0
-FFT16_FN fma3, ns_float, 1
+FFT16_FN avx, 0
+FFT16_FN avx, 1
+FFT16_FN fma3, 0
+FFT16_FN fma3, 1
 
-%macro FFT32_FN 3
+%macro FFT32_FN 2
 INIT_YMM %1
-cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
-%if %3
+%if %2
+cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, tmp
     movaps m4, [inq + 4*mmsize]
     movaps m5, [inq + 5*mmsize]
     movaps m6, [inq + 6*mmsize]
     movaps m7, [inq + 7*mmsize]
 %else
+cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m12
     LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9, m13
@@ -828,7 +862,7 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
 
     FFT8 m4, m5, m6, m7, m8, m9
 
-%if %3
+%if %2
     movaps m0, [inq + 0*mmsize]
     movaps m1, [inq + 1*mmsize]
     movaps m2, [inq + 2*mmsize]
@@ -875,14 +909,24 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
     vextractf128 [outq + 16*14], m10, 1
     vextractf128 [outq + 16*15], m5, 1
 
+%if %2
+    ret
+%else
     RET
+%endif
+
+%if %2
+cglobal fft32_ns_float, 4, 4, 16, ctx, out, in, tmp
+    call ff_tx_fft32_asm_float_ %+ %1
+    RET
+%endif
 %endmacro
 
 %if ARCH_X86_64
-FFT32_FN avx, float, 0
-FFT32_FN avx, ns_float, 1
-FFT32_FN fma3, float, 0
-FFT32_FN fma3, ns_float, 1
+FFT32_FN avx, 0
+FFT32_FN avx, 1
+FFT32_FN fma3, 0
+FFT32_FN fma3, 1
 %endif
 
 %macro FFT_SPLIT_RADIX_DEF 1-2
@@ -923,17 +967,21 @@ ALIGN 16
 %endif
 %endmacro
 
-%macro FFT_SPLIT_RADIX_FN 3
+%macro FFT_SPLIT_RADIX_FN 2
 INIT_YMM %1
-cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
-    movsxd lenq, dword [lutq + AVTXContext.len]
-    mov lutq, [lutq + AVTXContext.map]
+%if %2
+cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+%else
+cglobal fft_sr_float, 4, 9, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+    movsxd lenq, dword [ctxq + AVTXContext.len]
+    mov lutq, [ctxq + AVTXContext.map]
     mov tgtq, lenq
+%endif
 
 ; Bottom-most/32-point transform ===============================================
 ALIGN 16
 .32pt:
-%if %3
+%if %2
     movaps m4, [inq + 4*mmsize]
     movaps m5, [inq + 5*mmsize]
     movaps m6, [inq + 6*mmsize]
     movaps m7, [inq + 7*mmsize]
@@ -947,7 +995,7 @@ ALIGN 16
 
     FFT8 m4, m5, m6, m7, m8, m9
 
-%if %3
+%if %2
     movaps m0, [inq + 0*mmsize]
     movaps m1, [inq + 1*mmsize]
     movaps m2, [inq + 2*mmsize]
@@ -972,7 +1020,7 @@ ALIGN 16
     movaps [outq + 5*mmsize], m5
     movaps [outq + 7*mmsize], m7
 
-%if %3
+%if %2
     add inq, 8*mmsize
 %else
     add lutq, (mmsize/2)*8
@@ -1007,7 +1055,7 @@ ALIGN 16
     SWAP m4, m1
     SWAP m6, m3
 
-%if %3
+%if %2
     movaps tx1_e0, [inq + 0*mmsize]
     movaps tx1_e1, [inq + 1*mmsize]
     movaps tx1_o0, [inq + 2*mmsize]
@@ -1021,7 +1069,7 @@ ALIGN 16
 
     FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
 
-%if %3
+%if %2
     movaps tx2_e0, [inq + 4*mmsize]
     movaps tx2_e1, [inq + 5*mmsize]
     movaps tx2_o0, [inq + 6*mmsize]
@@ -1038,7 +1086,7 @@ ALIGN 16
     movaps tw_e, [tab_64_float]
     vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
 
-%if %3
+%if %2
     add inq, 8*mmsize
 %else
     add lutq, (mmsize/2)*8
@@ -1201,7 +1249,11 @@ FFT_SPLIT_RADIX_DEF 131072
     sub lenq, 4*mmsize
     jg .synth_deinterleave
 
+%if %2
+    ret
+%else
     RET
+%endif
 
 ; 64-point deinterleave which only has to load 4 registers =====================
 .64pt_deint:
@@ -1278,14 +1330,28 @@ FFT_SPLIT_RADIX_DEF 131072
     vextractf128 [outq + 15*mmsize + 0], tw_o, 1
     vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
 
+%if %2
+    ret
+%else
     RET
+%endif
+
+%if %2
+cglobal fft_sr_ns_float, 4, 9, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+    movsxd lenq, dword [ctxq + AVTXContext.len]
+    mov lutq, [ctxq + AVTXContext.map]
+    mov tgtq, lenq
+
+    call ff_tx_fft_sr_asm_float_ %+ %1
+    RET
+%endif
 %endmacro
 
 %if ARCH_X86_64
-FFT_SPLIT_RADIX_FN fma3, float, 0
-FFT_SPLIT_RADIX_FN fma3, ns_float, 1
+FFT_SPLIT_RADIX_FN fma3, 0
+FFT_SPLIT_RADIX_FN fma3, 1
 %if HAVE_AVX2_EXTERNAL
-FFT_SPLIT_RADIX_FN avx2, float, 0
-FFT_SPLIT_RADIX_FN avx2, ns_float, 1
+FFT_SPLIT_RADIX_FN avx2, 0
+FFT_SPLIT_RADIX_FN avx2, 1
 %endif
 %endif
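
For context on how these entry points are meant to be used (a hypothetical sketch, not the follow-up patch; the outer function name is made up): an assembly transform can now preload the registers named in the _asm signature and call the inner FFT directly, exactly as the fft_sr_ns_float wrapper above does, instead of going through a C-ABI function pointer.

; Hypothetical assembly caller, mirroring the fft_sr_ns_float wrapper above.
INIT_YMM fma3
cglobal example_outer_float, 4, 9, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]
    mov tgtq, lenq

    call ff_tx_fft_sr_asm_float_fma3 ; inner FFT: plain call, no ABI round-trip
    ; ...post-processing of the transform output at [outq] would go here...
    RET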