diff options
author | Niklas Haas <[email protected]> | 2025-09-15 17:47:39 +0200 |
---|---|---|
committer | Niklas Haas <[email protected]> | 2025-09-21 11:02:41 +0000 |
commit | 843920d5d6bdcecbfd4eeac66cd175348bf99496 (patch) | |
tree | 9d8d8354ff072c3eb7459d49dc22fa80aef8c3e0 | |
parent | 4c067d0778c932ab85f35cc4c07ace9d612e1905 (diff) |
avfilter/x86/vf_idetdsp: add AVX2 and AVX512 implementations
Compared with the existing SSE2 code, the only thing that changes slightly is the horizontal sum at the end.
-rw-r--r-- | libavfilter/x86/vf_idetdsp.asm | 36 | ||||
-rw-r--r-- | libavfilter/x86/vf_idetdsp_init.c | 12 |
2 files changed, 42 insertions, 6 deletions
```diff
diff --git a/libavfilter/x86/vf_idetdsp.asm b/libavfilter/x86/vf_idetdsp.asm
index 63d9f4533d..12d65000ab 100644
--- a/libavfilter/x86/vf_idetdsp.asm
+++ b/libavfilter/x86/vf_idetdsp.asm
@@ -39,7 +39,7 @@ SECTION .text
     paddd %1, %2
 %endmacro
 
-%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
+%macro IDET_FILTER_LINE_16BIT 0
 cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
     xor indexq, indexq
 %define m_zero m1
@@ -54,7 +54,7 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
     psubusw m5, m2, m3 ; ba
 
     movu m4, [cq + indexq * 2] ; C
-    add indexq, %1
+    add indexq, mmsize >> 1
     psubusw m3, m2 ; ab
     CMP indexd, widthd
 
@@ -67,13 +67,23 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
     paddd m_sum, m5
     jl .loop_16bit
 
+%if mmsize > 32
+    vextracti64x4 ym1, m0, 1
+    paddq ym0, ym1
+%endif
     HADDD m_sum, m2
     movd eax, m_sum
     RET
 %endmacro
 
 INIT_XMM sse2
-IDET_FILTER_LINE_16BIT 8
+IDET_FILTER_LINE_16BIT
+
+INIT_XMM avx2
+IDET_FILTER_LINE_16BIT
+
+INIT_XMM avx512icl
+IDET_FILTER_LINE_16BIT
 
 ;******************************************************************************
 ; SSE2 8-bit implementation that does 16-bytes at a time:
@@ -106,11 +116,25 @@ cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
     jl .sse2_loop
 
     paddq m0, m1
-    movhlps m1, m0
-    paddq m0, m1
-    movd eax, m0
+%if mmsize > 32
+    vextracti64x4 ym1, m0, 1
+    paddq ym0, ym1
+%endif
+%if mmsize > 16
+    vextracti128 xm1, ym0, 1
+    paddq xm0, xm1
+%endif
+    movhlps xm1, xm0
+    paddq xm0, xm1
+    movd eax, xm0
     RET
 %endmacro
 
 INIT_XMM sse2
 IDET_FILTER_LINE
+
+INIT_YMM avx2
+IDET_FILTER_LINE
+
+INIT_ZMM avx512icl
+IDET_FILTER_LINE
diff --git a/libavfilter/x86/vf_idetdsp_init.c b/libavfilter/x86/vf_idetdsp_init.c
index e49b01a68c..1f4c2e2bc3 100644
--- a/libavfilter/x86/vf_idetdsp_init.c
+++ b/libavfilter/x86/vf_idetdsp_init.c
@@ -62,6 +62,12 @@ static int idet_filter_line_16bit_##KIND(const uint8_t *a, const uint8_t *b, \
 FUNC_MAIN_DECL(sse2, 16)
 FUNC_MAIN_DECL_16bit(sse2, 8)
 
+FUNC_MAIN_DECL(avx2, 32)
+FUNC_MAIN_DECL_16bit(avx2, 16)
+
+FUNC_MAIN_DECL(avx512icl, 64)
+FUNC_MAIN_DECL_16bit(avx512icl, 32)
+
 #endif
 av_cold void ff_idet_dsp_init_x86(IDETDSPContext *dsp, int depth)
 {
@@ -71,5 +77,11 @@ av_cold void ff_idet_dsp_init_x86(IDETDSPContext *dsp, int depth)
     if (EXTERNAL_SSE2(cpu_flags)) {
         dsp->filter_line = depth > 8 ? idet_filter_line_16bit_sse2 : idet_filter_line_sse2;
     }
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        dsp->filter_line = depth > 8 ? idet_filter_line_16bit_avx2 : idet_filter_line_avx2;
+    }
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        dsp->filter_line = depth > 8 ? idet_filter_line_16bit_avx512icl : idet_filter_line_avx512icl;
+    }
 #endif // HAVE_X86ASM
 }
```