author    Mark Reid <mindmark@gmail.com>    2021-10-05 20:58:30 -0700
committer Paul B Mahol <onemda@gmail.com>   2021-10-10 22:23:48 +0200
commit    716b39674059d5b416faef92afd41654a6d9469b (patch)
tree      25652c77af1ac70c439ba3e1a7f879d4b08cfb5b /libavfilter/x86/vf_lut3d.asm
parent    5133f4c2c1149feef3248ba2cb29537e8d8fbe38 (diff)
avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation
I spotted an interesting pattern I hadn't noticed before that makes the implementation faster. The bit-shifting table I was using previously is no longer needed, and I was able to remove quite a few lines. I also added use of FMA in the AVX2 version.

f32 1920x1080 1 thread with prelut

c impl
1434012700 UNITS in lut3d->interp,       1 runs,      0 skips
1434035335 UNITS in lut3d->interp,       2 runs,      0 skips
1423615347 UNITS in lut3d->interp,       4 runs,      0 skips
1426268863 UNITS in lut3d->interp,       8 runs,      0 skips

sse2
905484420 UNITS in lut3d->interp,       1 runs,      0 skips
905659010 UNITS in lut3d->interp,       2 runs,      0 skips
915167140 UNITS in lut3d->interp,       4 runs,      0 skips
915834222 UNITS in lut3d->interp,       8 runs,      0 skips

avx
574794860 UNITS in lut3d->interp,       1 runs,      0 skips
581035090 UNITS in lut3d->interp,       2 runs,      0 skips
584116720 UNITS in lut3d->interp,       4 runs,      0 skips
581460290 UNITS in lut3d->interp,       8 runs,      0 skips

avx2
301698880 UNITS in lut3d->interp,       1 runs,      0 skips
301982880 UNITS in lut3d->interp,       2 runs,      0 skips
306962430 UNITS in lut3d->interp,       4 runs,      0 skips
305472025 UNITS in lut3d->interp,       8 runs,      0 skips

gbrap16 1920x1080 1 thread with prelut

c impl
1480894840 UNITS in lut3d->interp,       1 runs,      0 skips
1502922990 UNITS in lut3d->interp,       2 runs,      0 skips
1496114307 UNITS in lut3d->interp,       4 runs,      0 skips
1492554551 UNITS in lut3d->interp,       8 runs,      0 skips

sse2
980777180 UNITS in lut3d->interp,       1 runs,      0 skips
986121520 UNITS in lut3d->interp,       2 runs,      0 skips
986489840 UNITS in lut3d->interp,       4 runs,      0 skips
998832248 UNITS in lut3d->interp,       8 runs,      0 skips

avx
622212360 UNITS in lut3d->interp,       1 runs,      0 skips
622981160 UNITS in lut3d->interp,       2 runs,      0 skips
645396315 UNITS in lut3d->interp,       4 runs,      0 skips
641057075 UNITS in lut3d->interp,       8 runs,      0 skips

avx2
321336400 UNITS in lut3d->interp,       1 runs,      0 skips
321268920 UNITS in lut3d->interp,       2 runs,      0 skips
323459895 UNITS in lut3d->interp,       4 runs,      0 skips
324949967 UNITS in lut3d->interp,       8 runs,      0 skips
Diffstat (limited to 'libavfilter/x86/vf_lut3d.asm')
-rw-r--r--  libavfilter/x86/vf_lut3d.asm  662
1 files changed, 662 insertions, 0 deletions
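
For reference, here is a rough scalar sketch of the corner-selection pattern the patch vectorizes (the three delta comparisons that replace the old bit-shifting table). The function and helper names are illustrative only, not FFmpeg's actual C implementation; it assumes the LUT is packed as size^3 RGB float triplets and that r/g/b are already scaled to [0, size-1], as in the asm below.

    #include <math.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static void tetrahedral_sketch(const float *lut, int size,
                                   float r, float g, float b, float out[3])
    {
        const int s2 = size * size;
        int pr = (int)r, pg = (int)g, pb = (int)b;   /* prev indices */
        int nr = MIN(pr + 1, size - 1);              /* next indices, clamped */
        int ng = MIN(pg + 1, size - 1);
        int nb = MIN(pb + 1, size - 1);
        float dr = r - pr, dg = g - pg, db = b - pb; /* deltas */

        /* the three comparisons that drive the corner selection */
        int rg = dr > dg, gb = dg > db, br = db > dr;

        /* mixed corners: next where the delta is the largest (cxxxa),
         * prev where it is the smallest (cxxxb) */
        int c000 = 3 * (pr * s2 + pg * size + pb);
        int c111 = 3 * (nr * s2 + ng * size + nb);
        int ca   = 3 * ((!br && rg ? nr : pr) * s2 +
                        (!rg && gb ? ng : pg) * size +
                        (!gb && br ? nb : pb));
        int cb   = 3 * ((!rg && br ? pr : nr) * s2 +
                        (!gb && rg ? pg : ng) * size +
                        (!br && gb ? pb : nb));

        /* sort the deltas so that x0 >= x1 >= x2 */
        float x0 = fmaxf(fmaxf(dr, dg), db);
        float x2 = fminf(fminf(dr, dg), db);
        float x1 = dr + dg + db - x0 - x2;

        for (int i = 0; i < 3; i++)
            out[i] = (1.0f - x0) * lut[c000 + i] + (x0 - x1) * lut[ca + i]
                   + (x1 - x2)  * lut[cb + i]   + x2        * lut[c111 + i];
    }

The SIMD version computes exactly these masks with cmpps and selects corners with blends, so no per-pixel branch or lookup table is needed.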
diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm
new file mode 100644
index 0000000000..31a46e327d
--- /dev/null
+++ b/libavfilter/x86/vf_lut3d.asm
@@ -0,0 +1,662 @@
+;*****************************************************************************
+;* x86-optimized functions for lut3d filter
+;*
+;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pd_1f: times 8 dd 1.0
+pd_3f: times 8 dd 3.0
+pd_65535f: times 8 dd 65535.0
+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
+
+pb_shuffle16: db 0, 1, 0x80, 0x80, \
+ 2, 3, 0x80, 0x80, \
+ 4, 5, 0x80, 0x80, \
+ 6, 7, 0x80, 0x80
+
+pb_lo_pack_shuffle16: db 0, 1, 4, 5, \
+ 8, 9, 12, 13, \
+ 0x80, 0x80, 0x80, 0x80, \
+ 0x80, 0x80, 0x80, 0x80
+
+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
+ 0x80, 0x80, 0x80, 0x80, \
+ 0, 1, 4, 5, \
+ 8, 9, 12, 13
+
+SECTION .text
+
+struc Lut3DPreLut
+ .size: resd 1
+ .min: resd 3
+ .max: resd 3
+ .scale: resd 3
+ .lut: resq 3
+endstruc
+
+struc LUT3DContext
+ .class: resq 1
+ .lut: resq 1
+ .lutsize: resd 1
+ .lutsize2: resd 1
+ .scale: resd 3
+endstruc
+
+%define AV_NUM_DATA_POINTERS 8
+
+struc AVFrame
+ .data: resq AV_NUM_DATA_POINTERS
+ .linesize: resd AV_NUM_DATA_POINTERS
+ .extended_data: resq 1
+ .width: resd 1
+ .height: resd 1
+endstruc
+
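+; stack scratch layout: three gather spill areas (rm/gm/bm), then broadcast
+; constants, then saved plane pointers; mmsize*16 + 8*8 bytes, allocated by
+; cglobal below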
+%define rm rsp
+%define gm rsp+mmsize
+%define bm rsp+(mmsize*2)
+
+%define lut3dsizem [rsp+mmsize*3]
+%define lut3dsize2m [rsp+mmsize*4]
+%define lut3dmaxm [rsp+mmsize*5]
+%define prelutmaxm [rsp+mmsize*6]
+
+%define scalerm [rsp+mmsize*7]
+%define scalegm [rsp+mmsize*8]
+%define scalebm [rsp+mmsize*9]
+
+%define prelutminrm [rsp+mmsize*10]
+%define prelutmingm [rsp+mmsize*11]
+%define prelutminbm [rsp+mmsize*12]
+
+%define prelutscalerm [rsp+mmsize*13]
+%define prelutscalegm [rsp+mmsize*14]
+%define prelutscalebm [rsp+mmsize*15]
+
+; data pointers
+%define srcrm [rsp+mmsize*16 + 0]
+%define srcgm [rsp+mmsize*16 + 8]
+%define srcbm [rsp+mmsize*16 + 16]
+%define srcam [rsp+mmsize*16 + 24]
+
+%define dstrm [rsp+mmsize*16 + 32]
+%define dstgm [rsp+mmsize*16 + 40]
+%define dstbm [rsp+mmsize*16 + 48]
+%define dstam [rsp+mmsize*16 + 56]
+
+; 1 - prev
+; 2 - next
+; 3 - offset
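+; scalar fallback for vgatherdps: fetches one prev/next prelut entry per call,
+; using the integer indices previously spilled to rm/gm, and stores the
+; gathered values back in place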
+%macro FETCH_PRELUT_PN 3
+ mov tmp2d, [rm + %3]
+ mov tmp3d, [gm + %3]
+ movss xm%1, [tmpq + tmp2q*4]
+ movss xm%2, [tmpq + tmp3q*4]
+ movss [rm + %3], xm%1
+ movss [gm + %3], xm%2
+%endmacro
+
+; 1 - p
+; 2 - n
+; 3 - p indices
+; 4 - n indices
+%macro GATHER_PRELUT 4
+ %if cpuflag(avx2)
+ vpcmpeqb m7, m7
+ vgatherdps m%1, [tmpq + m%3*4], m7 ; p
+ vpcmpeqb m9, m9
+ vgatherdps m%2, [tmpq + m%4*4], m9 ; n
+ %else
+ mova [rm], m%3
+ mova [gm], m%4
+ FETCH_PRELUT_PN %1, %2, 0
+ FETCH_PRELUT_PN %1, %2, 4
+ FETCH_PRELUT_PN %1, %2, 8
+ FETCH_PRELUT_PN %1, %2, 12
+ %if mmsize > 16
+ FETCH_PRELUT_PN %1, %2, 16
+ FETCH_PRELUT_PN %1, %2, 20
+ FETCH_PRELUT_PN %1, %2, 24
+ FETCH_PRELUT_PN %1, %2, 28
+ %endif
+ movu m%1, [rm]
+ movu m%2, [gm]
+ %endif
+%endmacro
+
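+; floor of packed floats; the SSE2 fallback truncates, which matches floor
+; here because the inputs are already clamped to be non-negative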
+%macro FLOORPS 2
+ %if mmsize > 16
+ vroundps %1, %2, 0x01
+ %else
+ cvttps2dq %1, %2
+ cvtdq2ps %1, %1
+ %endif
+%endmacro
+
+; %1 = %2 * %3 + %1
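+; (note: the non-FMA fallback clobbers %2)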
+%macro MADD3 3
+%if cpuflag(avx2)
+ vfmadd231ps %1, %2, %3
+%else
+ mulps %2, %2, %3
+ addps %1, %1, %2
+%endif
+%endmacro
+
+; 1 - dst
+; 2 - index
+; 3 - min
+; 4 - scale
+; assumes lut max m13, m14 1.0f, zero m15
+%macro APPLY_PRELUT 4
+ ; scale
+ subps m5, m%1, %3 ; v - min
+ mulps m5, m5, %4 ; v * scale
+ ; clamp
+ maxps m5, m5, m15 ; lower-clamp to zero (maxps first so NaN becomes zero)
+ minps m5, m5, m13 ; min lut max
+
+ FLOORPS m3, m5 ; prev index
+ subps m5, m5, m3 ; d
+ addps m4, m3, m14 ; p+1 = n index
+ minps m4, m4, m13 ; clamp n index
+
+ mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8]
+ cvttps2dq m6, m3
+ cvttps2dq m10, m4
+ GATHER_PRELUT %1, 4, 6, 10
+
+ ; lerp
+ subps m8, m4, m%1
+ MADD3 m%1, m8, m5
+
+%endmacro
+
+; 1 - dst
+; 2 - scale
+; assumes lut max m13, zero m15
+%macro APPLY_SCALE 2
+ mulps m%1, m%1, %2
+ maxps m%1, m%1, m15 ; maxps first so NaN becomes zero
+ minps m%1, m%1, m13
+%endmacro
+
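+; 1 - dst
+; 2 - a
+; 3 - b
+; 4 - mask
+; per-element dst = mask ? b : a; the SSE2 path uses the xor-blend identity
+; a ^ ((a ^ b) & mask), which is why dst must not alias a or b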
+%macro BLEND 4
+%if mmsize > 16
+ vblendvps %1, %2, %3, %4
+%else
+ %ifidni %1,%2
+ %error operand 1 must not equal operand 2
+ %endif
+ %ifidni %1,%3
+ %error operand 1 must not equal operand 3
+ %endif
+ mova %1, %2
+ xorps %1, %3
+ andps %1, %4
+ xorps %1, %2
+%endif
+%endmacro
+
+%macro ADD3 4
+ addps %1, %2, %3
+ addps %1, %1, %4
+%endmacro
+
+%macro FETCH_LUT3D_RGB 4
+ mov tmp2d, [rm + %4]
+ movss xm%1, [tmpq + tmp2q*4 + 0]
+ movss xm%2, [tmpq + tmp2q*4 + 4]
+ movss xm%3, [tmpq + tmp2q*4 + 8]
+ movss [rm + %4], xm%1
+ movss [gm + %4], xm%2
+ movss [bm + %4], xm%3
+%endmacro
+
+; 1 - dstr
+; 2 - dstg
+; 3 - dstb
+; 4 - indices
+%macro GATHER_LUT3D_INDICES 4
+%if cpuflag(avx2)
+ vpcmpeqb m3, m3
+ vgatherdps m%1, [tmpq + m%4*4 + 0], m3
+ vpcmpeqb m14, m14
+ vgatherdps m%2, [tmpq + m%4*4 + 4], m14
+ vpcmpeqb m15, m15
+ vgatherdps m%3, [tmpq + m%4*4 + 8], m15
+%else
+ movu [rm], m%4
+ FETCH_LUT3D_RGB %1, %2, %3, 0
+ FETCH_LUT3D_RGB %1, %2, %3, 4
+ FETCH_LUT3D_RGB %1, %2, %3, 8
+ FETCH_LUT3D_RGB %1, %2, %3, 12
+%if mmsize > 16
+ FETCH_LUT3D_RGB %1, %2, %3, 16
+ FETCH_LUT3D_RGB %1, %2, %3, 20
+ FETCH_LUT3D_RGB %1, %2, %3, 24
+ FETCH_LUT3D_RGB %1, %2, %3, 28
+%endif
+ movu m%1, [rm]
+ movu m%2, [gm]
+ movu m%3, [bm]
+%endif
+%endmacro
+
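+; tetrahedral interpolation of m0..m2 (r,g,b), already scaled to lut indices:
+; out = (1-x0)*lut[c000] + (x0-x1)*lut[cxxxa] + (x1-x2)*lut[cxxxb] + x2*lut[c111]
+; where x0 >= x1 >= x2 are the deltas d_r, d_g, d_b sorted in decreasing order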
+%macro interp_tetrahedral 0
+ %define d_r m0
+ %define d_g m1
+ %define d_b m2
+
+ %define prev_r m3
+ %define prev_g m4
+ %define prev_b m5
+
+ %define next_r m6
+ %define next_g m7
+ %define next_b m8
+
+ %define x0 m4
+ %define x1 m5
+ %define x2 m6
+
+ ; setup prev index
+ FLOORPS prev_r, m0
+ FLOORPS prev_g, m1
+ FLOORPS prev_b, m2
+
+ ; setup deltas
+ subps d_r, m0, prev_r
+ subps d_g, m1, prev_g
+ subps d_b, m2, prev_b
+
+ ; setup next index
+ addps next_r, prev_r, m14 ; +1
+ minps next_r, next_r, m13 ; clamp lutmax
+
+ addps next_g, prev_g, m14 ; +1
+ minps next_g, next_g, m13 ; clamp lutmax
+
+ addps next_b, prev_b, m14 ; +1
+ minps next_b, next_b, m13 ; clamp lutmax
+
+ ; prescale indices
+ mulps prev_r, prev_r, lut3dsize2m
+ mulps next_r, next_r, lut3dsize2m
+
+ mulps prev_g, prev_g, lut3dsizem
+ mulps next_g, next_g, lut3dsizem
+
+ mulps prev_b, prev_b, [pd_3f]
+ mulps next_b, next_b, [pd_3f]
+
+ ; cxxxa m10
+ ; 1 is the delta that is the largest
+ ; r> == c100 == (r>g && r>b)
+ ; g> == c010 == (g>r && g>b)
+ ; b> == c001 == (b>r && b>g)
+ ; if delta > other 2 use next else prev
+
+ ; cxxxb m11;
+ ; 0 is the delta that is the smallest
+ ; r< == c011 == (r<=g && r<=b)
+ ; g< == c101 == (g<=r && g<=b)
+ ; b< == c110 == (b<=r && b<=g)
+ ; if delta <= other 2 use prev else next
+
+ cmpps m13, d_r, d_g, 0x1E ; r>g
+ cmpps m14, d_g, d_b, 0x1E ; g>b
+ cmpps m15, d_b, d_r, 0x1E ; b>r
+
+ ; r> !b>r && r>g
+ andnps m9, m15, m13
+ BLEND m10, prev_r, next_r, m9
+
+ ; r< !r>g && b>r
+ andnps m9, m13, m15
+ BLEND m11, next_r, prev_r, m9
+
+ ; g> !r>g && g>b
+ andnps m9, m13, m14
+ BLEND m12, prev_g, next_g, m9
+ addps m10, m12
+
+ ; g< !g>b && r>g
+ andnps m9, m14, m13
+ BLEND m12, next_g, prev_g, m9
+ addps m11, m12
+
+ ; b> !g>b && b>r
+ andnps m9, m14, m15
+ BLEND m12, prev_b, next_b, m9
+ addps m10, m12
+
+ ; b< !b>r && g>b
+ andnps m9, m15, m14
+ BLEND m12, next_b, prev_b, m9
+ addps m11, m12
+
+ ; c000 m12;
+ ADD3 m12, prev_r, prev_g, prev_b
+
+ ; c111 m13;
+ ADD3 m13, next_r, next_g, next_b
+
+ ; sort delta r,g,b x0 >= x1 >= x2
+ minps m7, d_r, d_g
+ maxps m8, d_r, d_g
+
+ minps x2, m7, d_b
+ maxps m7, m7, d_b
+
+ maxps x0, m8, d_b
+ minps x1, m8, m7
+
+ ; convert indices to integer
+ cvttps2dq m12, m12
+ cvttps2dq m10, m10
+ cvttps2dq m11, m11
+ cvttps2dq m13, m13
+
+ ; now the gathering festival
+ mov tmpq, [ctxq + LUT3DContext.lut]
+
+ GATHER_LUT3D_INDICES 0, 1, 2, 12
+ movu m14, [pd_1f]
+ subps m14, m14, x0; 1 - x0
+
+ mulps m0, m0, m14
+ mulps m1, m1, m14
+ mulps m2, m2, m14
+
+ GATHER_LUT3D_INDICES 7, 8, 9, 10
+ subps m14, x0, x1; x0 - x1
+ MADD3 m0, m7, m14
+ MADD3 m1, m8, m14
+ MADD3 m2, m9, m14
+
+ GATHER_LUT3D_INDICES 7, 8, 9, 11
+ subps m14, x1, x2; x1 - x2
+ MADD3 m0, m7, m14
+ MADD3 m1, m8, m14
+ MADD3 m2, m9, m14
+
+ GATHER_LUT3D_INDICES 7, 8, 9, 13
+ MADD3 m0, m7, x2
+ MADD3 m1, m8, x2
+ MADD3 m2, m9, x2
+
+%endmacro
+
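+; %1 = pointer to plane %3 of frame %2, advanced to row slice_start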
+%macro INIT_DATA_PTR 3
+ mov ptrq, [%2 + AVFrame.data + %3 * 8]
+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
+ imul tmpd, slice_startd
+ add ptrq, tmpq
+ mov %1, ptrq
+%endmacro
+
+%macro INC_DATA_PTR 3
+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
+ mov ptrq, %1
+ add ptrq, tmpq
+ mov %1, ptrq
+%endmacro
+
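+; load 4 (xmm) or 8 (ymm) u16 samples from %2, zero-extend to dwords,
+; convert to float and normalize to [0,1] (m7 = 1/65535)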
+%macro LOAD16 2
+ mov ptrq, %2
+ %if mmsize > 16
+ movu xm%1, [ptrq + xq*2]
+ %else
+ movsd xm%1, [ptrq + xq*2]
+ %endif
+ %if cpuflag(avx2)
+ vpmovzxwd m%1, xm%1
+ %else
+ %if mmsize > 16
+ pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0)
+ pshufb xm%1, xm6 ; pb_shuffle16
+ pshufb xm4, xm6 ; pb_shuffle16
+ vinsertf128 m%1, m%1, xm4, 1
+ %else
+ pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0)
+ pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
+ pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
+ %endif
+ %endif
+ cvtdq2ps m%1, m%1
+ mulps m%1, m%1, m7 ; pd_65535_invf
+%endmacro
+
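+; scale floats by 65535 (m5), clamp to [0,65535], convert and pack back
+; to 4 (xmm) or 8 (ymm) u16 samples stored at %1 (m15 = zero)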
+%macro STORE16 2
+ mulps m%2, m%2, m5 ; [pd_65535f]
+ minps m%2, m%2, m5 ; [pd_65535f]
+ maxps m%2, m%2, m15 ; zero
+ cvttps2dq m%2, m%2
+ %if mmsize > 16
+ vextractf128 xm4, m%2, 1
+ pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
+ pshufb xm4, xm7 ; [pb_hi_pack_shuffle16]
+ por xm%2, xm4
+ %else
+ pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
+ pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
+ pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
+ %endif
+ mov ptrq, %1
+ %if mmsize > 16
+ movu [ptrq + xq*2], xm%2
+ %else
+ movsd [ptrq + xq*2], xm%2
+ %endif
+%endmacro
+
+; 1 - interp method
+; 2 - format_name
+; 3 - depth
+; 4 - is float format
+%macro DEFINE_INTERP_FUNC 4
+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3
+ ; store lut max and lutsize
+ mov tmpd, dword [ctxq + LUT3DContext.lutsize]
+ cvtsi2ss xm0, tmpd
+ mulss xm0, xm0, [pd_3f]
+ VBROADCASTSS m0, xm0
+ mova lut3dsizem, m0
+ sub tmpd, 1
+ cvtsi2ss xm0, tmpd
+ VBROADCASTSS m0, xm0
+ mova lut3dmaxm, m0
+
+ ; scale_r
+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4]
+ VBROADCASTSS m1, xm1
+ mova scalerm, m1
+
+ ; scale_g
+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
+ VBROADCASTSS m1, xm1
+ mova scalegm, m1
+
+ ; scale_b
+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
+ VBROADCASTSS m1, xm1
+ mova scalebm, m1
+
+ ; store lutsize2
+ cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
+ mulss xm0, xm0, [pd_3f]
+ VBROADCASTSS m0, xm0
+ mova lut3dsize2m, m0
+
+ ; init prelut values
+ cmp prelutq, 0
+ je %%skip_init_prelut
+ mov tmpd, dword [prelutq + Lut3DPreLut.size]
+ sub tmpd, 1
+ cvtsi2ss xm0, tmpd
+ VBROADCASTSS m0, xm0
+ mova prelutmaxm, m0
+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
+ mova prelutminrm, m0
+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
+ mova prelutmingm, m0
+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
+ mova prelutminbm, m0
+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
+ mova prelutscalerm, m0
+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
+ mova prelutscalegm, m0
+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
+ mova prelutscalebm, m0
+ %%skip_init_prelut:
+
+ mov widthd, [src_imageq + AVFrame.width]
+
+ ; gbra pixel order
+ INIT_DATA_PTR srcrm, src_imageq, 2
+ INIT_DATA_PTR srcgm, src_imageq, 0
+ INIT_DATA_PTR srcbm, src_imageq, 1
+ INIT_DATA_PTR srcam, src_imageq, 3
+
+ INIT_DATA_PTR dstrm, dst_imageq, 2
+ INIT_DATA_PTR dstgm, dst_imageq, 0
+ INIT_DATA_PTR dstbm, dst_imageq, 1
+ INIT_DATA_PTR dstam, dst_imageq, 3
+
+ %%loop_y:
+ xor xq, xq
+ %%loop_x:
+ movu m14, [pd_1f]
+ xorps m15, m15, m15
+ %if %4 ; float
+ mov ptrq, srcrm
+ movu m0, [ptrq + xq*4]
+ mov ptrq, srcgm
+ movu m1, [ptrq + xq*4]
+ mov ptrq, srcbm
+ movu m2, [ptrq + xq*4]
+ %else
+ ; constants for LOAD16
+ movu m7, [pd_65535_invf]
+ %if notcpuflag(avx2) && mmsize >= 32
+ movu xm6, [pb_shuffle16]
+ %endif
+ LOAD16 0, srcrm
+ LOAD16 1, srcgm
+ LOAD16 2, srcbm
+ %endif
+
+ cmp prelutq, 0
+ je %%skip_prelut
+ mova m13, prelutmaxm
+ APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
+ APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
+ APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
+ %%skip_prelut:
+
+ mova m13, lut3dmaxm
+ APPLY_SCALE 0, scalerm
+ APPLY_SCALE 1, scalegm
+ APPLY_SCALE 2, scalebm
+
+ interp_%1
+
+ %if %4 ; float
+ mov ptrq, dstrm
+ movu [ptrq + xq*4], m0
+ mov ptrq, dstgm
+ movu [ptrq + xq*4], m1
+ mov ptrq, dstbm
+ movu [ptrq + xq*4], m2
+ cmp has_alphad, 0
+ je %%skip_alphaf
+ mov ptrq, srcam
+ movu m0, [ptrq + xq*4]
+ mov ptrq, dstam
+ movu [ptrq + xq*4], m0
+ %%skip_alphaf:
+ %else
+ ; constants for STORE16
+ movu m5, [pd_65535f]
+ %if mmsize > 16
+ movu xm6, [pb_lo_pack_shuffle16]
+ movu xm7, [pb_hi_pack_shuffle16]
+ %endif
+
+ xorps m15, m15, m15
+ STORE16 dstrm, 0
+ STORE16 dstgm, 1
+ STORE16 dstbm, 2
+
+ cmp has_alphad, 0
+ je %%skip_alpha
+ %if mmsize > 16
+ mov ptrq, srcam
+ movu xm0, [ptrq + xq*2]
+ mov ptrq, dstam
+ movu [ptrq + xq*2], xm0
+ %else
+ mov ptrq, srcam
+ movsd xm0, [ptrq + xq*2]
+ mov ptrq, dstam
+ movsd [ptrq + xq*2], xm0
+ %endif
+
+ %%skip_alpha:
+ %endif
+
+ add xq, mmsize/4
+ cmp xd, widthd
+ jl %%loop_x
+
+ INC_DATA_PTR srcrm, src_imageq, 2
+ INC_DATA_PTR srcgm, src_imageq, 0
+ INC_DATA_PTR srcbm, src_imageq, 1
+ INC_DATA_PTR srcam, src_imageq, 3
+
+ INC_DATA_PTR dstrm, dst_imageq, 2
+ INC_DATA_PTR dstgm, dst_imageq, 0
+ INC_DATA_PTR dstbm, dst_imageq, 1
+ INC_DATA_PTR dstam, dst_imageq, 3
+
+ inc slice_startd
+ cmp slice_startd, slice_endd
+ jl %%loop_y
+
+ RET
+%endmacro
+%if ARCH_X86_64
+ %if HAVE_AVX2_EXTERNAL
+ INIT_YMM avx2
+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
+ %endif
+ %if HAVE_AVX_EXTERNAL
+ INIT_YMM avx
+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
+ %endif
+ INIT_XMM sse2
+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
+%endif