author      Lynne <dev@lynne.ee>                        2022-09-23 10:34:08 +0200
committer   Lynne <dev@lynne.ee>                        2022-09-23 12:33:35 +0200
commit      3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023 (patch)
tree        3e3b555c251837688b7664c50ef0148693d49cf7
parent      7e7baf8ab86c4ae715f12d2c0babf831a5b18c39 (diff)
download    ffmpeg-3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023.tar.gz
x86/tx_float: adjust internal ASM call ABI again
There are many ways to go about it, and this one seems optimal for both
MDCTs and PFA FFTs without requiring excessive instructions or stack usage.
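For illustration only (not part of this patch, and the function name is made up), a minimal callee under the new intra-asm convention might look like the sketch below, assuming the x86inc.asm/x86util.asm environment that libavutil's asm already uses. The point is that ctx, in, out, len and lut are left exactly as the caller set them, so the old "advance in/out to the end" epilogue disappears:

; Hypothetical internal entry point under the new intra-asm ABI (sketch, not FFmpeg code).
; Assumes the usual x86util.asm/x86inc.asm includes and an SSE-level INIT_XMM block.
cglobal fft_dummy_asm_float, 0, 0, 0, ctx, out, in, stride
    movaps m0, [inq]        ; read one vector of input
    movaps [outq], m0       ; store it at the same offset
                            ; (the old ABI would advance inq/outq by mmsize here)
    ret                     ; plain ret: internal entry points skip the cglobal epilogue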
-rw-r--r--   libavutil/x86/tx_float.asm | 28
1 file changed, 8 insertions(+), 20 deletions(-)
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index b3a85a7cb9..5e0c438b9c 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -22,11 +22,10 @@
 ; based upon and compare.
 
 ; Intra-asm call convention:
-;       272 bytes of stack available
-;       First 10 GPRs available
+;       320 bytes of stack available
+;       14 GPRs available (last 4 must not be clobbered)
+;       Additionally, don't clobber ctx, in, out, len, lut
 ;       All vector regs available
-;       Don't clobber ctx, len, lut
-;       in and out must point to the end
 
 ; TODO:
 ;       carry over registers from smaller transforms to save on ~8 loads/stores
@@ -686,8 +685,6 @@ cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
     movaps m0, [inq]
     FFT2 m0, m1
     movaps [outq], m0
-    add inq, mmsize*1
-    add outq, mmsize*1
     ret
 
 cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
@@ -721,8 +718,6 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
     movaps [outq + 1*mmsize], m0
 
 %if %3
-    add inq, mmsize*2
-    add outq, mmsize*2
     ret
 %else
     RET
@@ -764,8 +759,6 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
     movups [outq + 3*mmsize], m1
 
 %if %1
-    add inq, mmsize*4
-    add outq, mmsize*4
     ret
 %else
     RET
@@ -806,8 +799,6 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
     vextractf128 [outq + 16*3], m0, 1
 
 %if %1
-    add inq, mmsize*2
-    add outq, mmsize*2
     ret
 %else
     RET
@@ -857,8 +848,6 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
     vextractf128 [outq + 16*7], m1, 1
 
 %if %2
-    add inq, mmsize*4
-    add outq, mmsize*4
     ret
 %else
     RET
@@ -943,8 +932,6 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
     vextractf128 [outq + 16*15], m5, 1
 
 %if %2
-    add inq, mmsize*8
-    add outq, mmsize*8
     ret
 %else
     RET
@@ -1282,12 +1269,13 @@ FFT_SPLIT_RADIX_DEF 131072
     add outq, 8*mmsize
     add rtabq, 4*mmsize
     sub itabq, 4*mmsize
-    sub lenq, 4*mmsize
+    sub tgtq, 4*mmsize
     jg .synth_deinterleave
 
 %if %2
-    mov lenq, tgtq
-    add outq, offq
+    sub outq, tmpq
+    neg tmpq
+    lea inq, [inq + tmpq*4]
     ret
 %else
     RET
@@ -1369,7 +1357,7 @@ FFT_SPLIT_RADIX_DEF 131072
     vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
 
 %if %2
-    add outq, 16*mmsize
     ret
 %else
     RET
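For comparison, a hypothetical caller-side sketch (not from this patch; the sub-transform name is made up): since callees no longer advance the pointers under the new convention, a composite transform that runs a sub-transform twice does the offsetting itself and undoes it before its own ret, so in/out stay unclobbered for its own caller as well.

; Hypothetical composite transform body under the new intra-asm ABI (sketch, not FFmpeg code)
    call fft_half_asm           ; made-up internal sub-transform; leaves inq/outq untouched
    add inq, mmsize*4           ; caller steps to the second half itself
    add outq, mmsize*4
    call fft_half_asm
    sub inq, mmsize*4           ; restore: in/out must not be clobbered on return
    sub outq, mmsize*4
    ret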