summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Niklas Haas <[email protected]> 2025-09-15 17:47:39 +0200
committer: Niklas Haas <[email protected]> 2025-09-21 11:02:41 +0000
commit: 843920d5d6bdcecbfd4eeac66cd175348bf99496 (patch)
tree: 9d8d8354ff072c3eb7459d49dc22fa80aef8c3e0
parent: 4c067d0778c932ab85f35cc4c07ace9d612e1905 (diff)
avfilter/x86/vf_idetdsp: add AVX2 and AVX512 implementations
The only thing that changes slightly is the horizontal sum at the end.
-rw-r--r-- libavfilter/x86/vf_idetdsp.asm 36
-rw-r--r-- libavfilter/x86/vf_idetdsp_init.c 12
2 files changed, 42 insertions, 6 deletions
diff --git a/libavfilter/x86/vf_idetdsp.asm b/libavfilter/x86/vf_idetdsp.asm
index 63d9f4533d..12d65000ab 100644
--- a/libavfilter/x86/vf_idetdsp.asm
+++ b/libavfilter/x86/vf_idetdsp.asm
@@ -39,7 +39,7 @@ SECTION .text
paddd %1, %2
%endmacro
-%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
+%macro IDET_FILTER_LINE_16BIT 0
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
xor indexq, indexq
%define m_zero m1
@@ -54,7 +54,7 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
psubusw m5, m2, m3 ; ba
movu m4, [cq + indexq * 2] ; C
- add indexq, %1
+ add indexq, mmsize >> 1
psubusw m3, m2 ; ab
CMP indexd, widthd
@@ -67,13 +67,23 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
paddd m_sum, m5
jl .loop_16bit
+%if mmsize > 32
+ vextracti64x4 ym1, m0, 1
+ paddq ym0, ym1
+%endif
HADDD m_sum, m2
movd eax, m_sum
RET
%endmacro
INIT_XMM sse2
-IDET_FILTER_LINE_16BIT 8
+IDET_FILTER_LINE_16BIT
+
+INIT_YMM avx2 ; 32-byte vectors: mmsize >> 1 steps 16 words per iteration
+IDET_FILTER_LINE_16BIT
+
+INIT_ZMM avx512icl ; 64-byte vectors: 32 words per iteration, enables the mmsize > 32 reduction
+IDET_FILTER_LINE_16BIT
;******************************************************************************
; SSE2 8-bit implementation that does 16-bytes at a time:
@@ -106,11 +116,25 @@ cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
jl .sse2_loop
paddq m0, m1
- movhlps m1, m0
- paddq m0, m1
- movd eax, m0
+%if mmsize > 32
+ vextracti64x4 ym1, m0, 1
+ paddq ym0, ym1
+%endif
+%if mmsize > 16
+ vextracti128 xm1, ym0, 1
+ paddq xm0, xm1
+%endif
+ movhlps xm1, xm0
+ paddq xm0, xm1
+ movd eax, xm0
RET
%endmacro
INIT_XMM sse2
IDET_FILTER_LINE
+
+INIT_YMM avx2
+IDET_FILTER_LINE
+
+INIT_ZMM avx512icl
+IDET_FILTER_LINE
diff --git a/libavfilter/x86/vf_idetdsp_init.c b/libavfilter/x86/vf_idetdsp_init.c
index e49b01a68c..1f4c2e2bc3 100644
--- a/libavfilter/x86/vf_idetdsp_init.c
+++ b/libavfilter/x86/vf_idetdsp_init.c
@@ -62,6 +62,12 @@ static int idet_filter_line_16bit_##KIND(const uint8_t *a, const uint8_t *b, \
FUNC_MAIN_DECL(sse2, 16)
FUNC_MAIN_DECL_16bit(sse2, 8)
+FUNC_MAIN_DECL(avx2, 32)
+FUNC_MAIN_DECL_16bit(avx2, 16)
+
+FUNC_MAIN_DECL(avx512icl, 64)
+FUNC_MAIN_DECL_16bit(avx512icl, 32)
+
#endif
av_cold void ff_idet_dsp_init_x86(IDETDSPContext *dsp, int depth)
{
@@ -71,5 +77,11 @@ av_cold void ff_idet_dsp_init_x86(IDETDSPContext *dsp, int depth)
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->filter_line = depth > 8 ? idet_filter_line_16bit_sse2 : idet_filter_line_sse2;
}
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ dsp->filter_line = depth > 8 ? idet_filter_line_16bit_avx2 : idet_filter_line_avx2;
+ }
+ if (EXTERNAL_AVX512ICL(cpu_flags)) {
+ dsp->filter_line = depth > 8 ? idet_filter_line_16bit_avx512icl : idet_filter_line_avx512icl;
+ }
#endif // HAVE_X86ASM
}