author    Lynne <dev@lynne.ee>    2022-01-20 07:14:46 +0100
committer Lynne <dev@lynne.ee>    2022-01-26 04:12:44 +0100
commit    ef4bd8161575a79f0ac247ad0aa2f05b8c20052b (patch)
tree      cf8488b2f2e9b0b88dd04b511113289d79852486 /libavutil/x86/tx_float.asm
parent    c14976be045f3fe658c12d7e30946cdb380452ec (diff)
lavu/tx: rewrite internal code as a tree-based codelet constructor
This commit rewrites the internal transform code into a constructor that stitches transforms (codelets) together. This allows transforms to reuse arbitrary parts of other transforms and to be stacked onto one another (such as a full iMDCT using a half-iMDCT, which in turn uses an FFT). It also permits each step to be individually replaced by assembly or a custom implementation (such as an ASIC).
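As a rough illustration of the tree/codelet idea described above (a minimal sketch only; the type and function names below are hypothetical and do not reflect the actual lavu/tx API):

/* Hypothetical sketch of a tree-based codelet constructor: each node is a
 * codelet that may delegate part of its work to sub-transforms, so an iMDCT
 * node can be stacked on a half-iMDCT node, which in turn wraps an FFT node. */
typedef struct TXNode TXNode;
typedef void (*tx_fn)(TXNode *node, float *dst, const float *src);

struct TXNode {
    tx_fn   fn;      /* codelet for this step (C, assembly, or custom) */
    TXNode *sub[4];  /* sub-transforms this step is stitched from      */
    int     nb_sub;  /* number of sub-transforms                       */
    int     len;     /* transform length handled by this node          */
};

/* A compound codelet simply calls into its subcontext for the shared part. */
static void imdct_half_codelet(TXNode *node, float *dst, const float *src)
{
    /* pre-rotation/shuffle would go here ... */
    node->sub[0]->fn(node->sub[0], dst, src);  /* reuse the FFT codelet */
    /* post-rotation would go here ... */
}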
Diffstat (limited to 'libavutil/x86/tx_float.asm')
-rw-r--r--  libavutil/x86/tx_float.asm  85
1 file changed, 41 insertions(+), 44 deletions(-)
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 4d2283fae1..e3b48d7c1f 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -31,6 +31,8 @@
%include "x86util.asm"
+%define private_prefix ff_tx
+
%if ARCH_X86_64
%define ptr resq
%else
@@ -39,25 +41,22 @@
%assign i 16
%rep 14
-cextern cos_ %+ i %+ _float ; ff_cos_i_float...
+cextern tab_ %+ i %+ _float ; ff_tab_i_float...
%assign i (i << 1)
%endrep
struc AVTXContext
- .n: resd 1 ; Non-power-of-two part
- .m: resd 1 ; Power-of-two part
- .inv: resd 1 ; Is inverse
- .type: resd 1 ; Type
- .flags: resq 1 ; Flags
- .scale: resq 1 ; Scale
-
- .exptab: ptr 1 ; MDCT exptab
- .tmp: ptr 1 ; Temporary buffer needed for all compound transforms
- .pfatab: ptr 1 ; Input/Output mapping for compound transforms
- .revtab: ptr 1 ; Input mapping for power of two transforms
- .inplace_idx: ptr 1 ; Required indices to revtab for in-place transforms
-
- .top_tx ptr 1 ; Used for transforms derived from other transforms
+ .len: resd 1 ; Length
+ .inv resd 1 ; Inverse flag
+ .map: ptr 1 ; Lookup table(s)
+ .exp: ptr 1 ; Exponentiation factors
+ .tmp: ptr 1 ; Temporary data
+
+ .sub: ptr 1 ; Subcontexts
+ .fn: ptr 4 ; Subcontext functions
+ .nb_sub: resd 1 ; Subcontext count
+
+ ; Everything else is inaccessible
endstruc
SECTION_RODATA 32
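For reference, the assembly struc above only mirrors the leading fields of the C-side AVTXContext (kept in libavutil/tx_priv.h), and its offsets have to stay in sync with that layout. A simplified, hedged sketch of the matching C fields follows (not the full definition; the exact member types are an assumption here):

/* Simplified sketch of the C-side context mirrored by the asm struc; the real
 * struct carries further fields after nb_sub, which is why the struc ends with
 * "everything else is inaccessible". */
typedef struct AVTXContext AVTXContext;
struct AVTXContext {
    int          len;     /* .len:    transform length               */
    int          inv;     /* .inv:    inverse-transform flag         */
    int         *map;     /* .map:    input/output lookup table(s)   */
    void        *exp;     /* .exp:    twiddle/exponentiation factors */
    void        *tmp;     /* .tmp:    temporary data                 */
    AVTXContext *sub;     /* .sub:    subcontexts                    */
    void        *fn[4];   /* .fn:     subcontext functions           */
    int          nb_sub;  /* .nb_sub: subcontext count               */
    /* ... remaining fields are not accessed from assembly ... */
};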
@@ -485,8 +484,8 @@ SECTION .text
movaps [outq + 10*mmsize], tx1_o0
movaps [outq + 14*mmsize], tx2_o0
- movaps tw_e, [cos_64_float + mmsize]
- vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
+ movaps tw_e, [tab_64_float + mmsize]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
movaps m0, [outq + 1*mmsize]
movaps m1, [outq + 3*mmsize]
@@ -710,8 +709,7 @@ FFT4 inv, 1
INIT_XMM sse3
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
- mov ctxq, [ctxq + AVTXContext.revtab]
-
+ mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
@@ -733,8 +731,7 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
INIT_YMM avx
cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
- mov ctxq, [ctxq + AVTXContext.revtab]
-
+ mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
@@ -754,7 +751,7 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
%macro FFT16_FN 1
INIT_YMM %1
cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
- mov ctxq, [ctxq + AVTXContext.revtab]
+ mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
@@ -786,7 +783,7 @@ FFT16_FN fma3
%macro FFT32_FN 1
INIT_YMM %1
cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
- mov ctxq, [ctxq + AVTXContext.revtab]
+ mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
@@ -800,8 +797,8 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
- movaps m8, [cos_32_float]
- vperm2f128 m9, m9, [cos_32_float + 4*8 - 4*7], 0x23
+ movaps m8, [tab_32_float]
+ vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23
FFT16 m0, m1, m2, m3, m10, m11, m12, m13
@@ -858,8 +855,8 @@ ALIGN 16
POP lenq
sub outq, (%1*4) + (%1*2) + (%1/2)
- lea rtabq, [cos_ %+ %1 %+ _float]
- lea itabq, [cos_ %+ %1 %+ _float + %1 - 4*7]
+ lea rtabq, [tab_ %+ %1 %+ _float]
+ lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]
%if %0 > 1
cmp tgtq, %1
@@ -883,9 +880,9 @@ ALIGN 16
%macro FFT_SPLIT_RADIX_FN 1
INIT_YMM %1
-cglobal split_radix_fft_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
- movsxd lenq, dword [lutq + AVTXContext.m]
- mov lutq, [lutq + AVTXContext.revtab]
+cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
+ movsxd lenq, dword [lutq + AVTXContext.len]
+ mov lutq, [lutq + AVTXContext.map]
mov tgtq, lenq
; Bottom-most/32-point transform ===============================================
@@ -903,8 +900,8 @@ ALIGN 16
LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
- movaps m8, [cos_32_float]
- vperm2f128 m9, m9, [cos_32_float + 32 - 4*7], 0x23
+ movaps m8, [tab_32_float]
+ vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23
FFT16 m0, m1, m2, m3, m10, m11, m12, m13
@@ -961,8 +958,8 @@ ALIGN 16
FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
- movaps tw_e, [cos_64_float]
- vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7], 0x23
+ movaps tw_e, [tab_64_float]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
add lutq, (mmsize/2)*8
cmp tgtq, 64
@@ -989,8 +986,8 @@ ALIGN 16
POP lenq
sub outq, 24*mmsize
- lea rtabq, [cos_128_float]
- lea itabq, [cos_128_float + 128 - 4*7]
+ lea rtabq, [tab_128_float]
+ lea itabq, [tab_128_float + 128 - 4*7]
cmp tgtq, 128
je .deinterleave
@@ -1016,8 +1013,8 @@ ALIGN 16
POP lenq
sub outq, 48*mmsize
- lea rtabq, [cos_256_float]
- lea itabq, [cos_256_float + 256 - 4*7]
+ lea rtabq, [tab_256_float]
+ lea itabq, [tab_256_float + 256 - 4*7]
cmp tgtq, 256
je .deinterleave
@@ -1044,8 +1041,8 @@ ALIGN 16
POP lenq
sub outq, 96*mmsize
- lea rtabq, [cos_512_float]
- lea itabq, [cos_512_float + 512 - 4*7]
+ lea rtabq, [tab_512_float]
+ lea itabq, [tab_512_float + 512 - 4*7]
cmp tgtq, 512
je .deinterleave
@@ -1079,8 +1076,8 @@ ALIGN 16
POP lenq
sub outq, 192*mmsize
- lea rtabq, [cos_1024_float]
- lea itabq, [cos_1024_float + 1024 - 4*7]
+ lea rtabq, [tab_1024_float]
+ lea itabq, [tab_1024_float + 1024 - 4*7]
cmp tgtq, 1024
je .deinterleave
@@ -1160,8 +1157,8 @@ FFT_SPLIT_RADIX_DEF 131072
vextractf128 [outq + 13*mmsize + 0], tw_e, 1
vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1
- movaps tw_e, [cos_64_float + mmsize]
- vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
+ movaps tw_e, [tab_64_float + mmsize]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
movaps m0, [outq + 1*mmsize]
movaps m1, [outq + 3*mmsize]