aboutsummaryrefslogtreecommitdiffstats
path: root/libavfilter/x86/vf_gblur.asm
diff options
context:
space:
mode:
authorWu Jianhua <jianhua.wu@intel.com>2021-08-04 10:06:13 +0800
committerPaul B Mahol <onemda@gmail.com>2021-08-29 19:58:33 +0200
commit68a2722aee2868084ad3ba1a7a5431735eab049e (patch)
tree647893d1ea8883e2b9a15e707b12b6aeb1be6618 /libavfilter/x86/vf_gblur.asm
parent4a5e24721c2bd1839aec57730061884fe2c5dd3b (diff)
downloadffmpeg-68a2722aee2868084ad3ba1a7a5431735eab049e.tar.gz
libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512()
The new vertical slice with AVX2/512 acceleration can significantly improve the performance of Gaussian Filter 2D. Performance data: ff_verti_slice_c: 32.57 ff_verti_slice_avx2: 476.19 ff_verti_slice_avx512: 833.33 Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com> Co-authored-by: Jin Jun <jun.i.jin@intel.com> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
Diffstat (limited to 'libavfilter/x86/vf_gblur.asm')
-rw-r--r--libavfilter/x86/vf_gblur.asm189
1 files changed, 189 insertions, 0 deletions
diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index 276fe347f5..ac4debba74 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -22,6 +22,43 @@
SECTION .text
+; Vector register sizes in bytes. AVX2_MMSIZE is compared against mmsize by
+; PUSH_MASK and VMASKMOVPS to choose between their AVX2 and AVX-512 code paths.
+%xdefine AVX2_MMSIZE 32
+%xdefine AVX512_MMSIZE 64
+
+; MOVSXDIFNIDN name [, name ...]
+; Emits "movsxdifnidn <name>q, <name>d" once for every argument name given.
+; movsxdifnidn itself is defined outside this file; presumably it sign-extends
+; the 32-bit register view into the full-width one when the two differ.
+%macro MOVSXDIFNIDN 1-*
+ %rep %0
+ movsxdifnidn %1q, %1d
+ ; rotate so that the next argument becomes %1
+ %rotate 1
+ %endrep
+%endmacro
+
+; PUSH_MASK vecmask, kmask, bits, tmp, scratch
+;   %1 vecmask  vector register receiving the lane mask (AVX2 path only)
+;   %2 kmask    opmask register receiving the lane mask (non-AVX2 path only)
+;   %3 bits     32-bit GPR, bit i set means lane i is active
+;   %4 tmp      32-bit scratch GPR (AVX2 path only)
+;   %5 scratch  address expression of an mmsize-byte memory buffer
+;
+; AVX2 path: every bit of %3, lowest first, is expanded to a dword of
+; 0x00000000 / 0xFFFFFFFF (neg of the isolated bit) stored to consecutive
+; dwords of the scratch buffer, which is then loaded into %1.
+; Clobbers %3 (shifted right once per lane) and %4.
+; Other path: %3 is copied into %2 with kmovd; nothing else is touched.
+%macro PUSH_MASK 5
+%if mmsize == AVX2_MMSIZE
+    %assign %%offset 0
+    %rep mmsize/4
+        mov  %4, %3                      ; isolate the current lowest bit
+        and  %4, 1
+        neg  %4                          ; 1 -> 0xFFFFFFFF, 0 -> 0
+        mov  dword [%5 + %%offset], %4
+        sar  %3, 1                       ; expose the next lane's bit
+        %assign %%offset %%offset+4
+    %endrep
+    movu %1, [%5]
+%else
+    kmovd %2, %3
+%endif
+%endmacro
+
+; VMASKMOVPS dst, src, vecmask, kmask
+; Masked move of packed 32-bit elements from %2 to %1 (either may be memory).
+;   AVX2 path:  vpmaskmovd under the vector mask %3; %4 is unused.
+;   other path: copies opmask %4 into k7 and does a k7-masked vmovups;
+;               %3 is unused and k7 is clobbered.
+%macro VMASKMOVPS 4
+%if mmsize == AVX2_MMSIZE
+ vpmaskmovd %1, %3, %2
+%else
+ kmovw k7, %4
+ vmovups %1{k7}, %2
+%endif
+%endmacro
+
; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
; float nu, float bscale)
@@ -232,3 +269,155 @@ POSTSCALE_SLICE
INIT_ZMM avx512
POSTSCALE_SLICE
%endif
+
+
+;*******************************************************************************
+; void ff_verti_slice(float *buffer, int width, int height, int column_begin,
+;                     int column_end, int steps, float nu, float bscale);
+;
+; Runs the vertical recursive filter in place over the columns
+; [column_begin, column_end) of buffer, whose rows are width floats apart.
+; Columns are processed cols (= mmsize/4) at a time; the remaining
+; (column_end - column_begin) % cols columns are handled by one final pass
+; using masked loads/stores.
+;
+; Per column group and per step:
+;   row 0        *= bscale
+;   rows 1..h-1   : row[y] += nu * row[y-1]      (downwards)
+;   row h-1      *= bscale
+;   rows h-2..0   : row[y] += nu * row[y+1]      (upwards)
+;
+; Register roles: m0 = nu (broadcast), m1 = bscale (broadcast),
+;                 m2 = running row value, m3 = row just loaded,
+;                 m4 / k1 = tail lane mask.
+; Stack scratch (mmsize*2 bytes): [rsp+0x10] saved x, [rsp+0x20] mask buffer.
+;
+; NOTE(review): the step and row loops test their condition at the bottom, so
+; they assume steps >= 1 and height >= 2 - confirm the callers guarantee this.
+;*******************************************************************************
+%macro VERTI_SLICE 0
+%if UNIX64
+cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
+                                         steps, x, y, cwidth, step, ptr, stride
+%else
+cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
+                                         steps, nu, bscale, x, y, cwidth, step, \
+                                         ptr, stride
+%endif
+%assign cols mmsize/4
+%if WIN64
+    VBROADCASTSS m0, num
+    VBROADCASTSS m1, bscalem
+    DEFINE_ARGS buffer, width, height, cbegin, cend, \
+                steps, x, y, cwidth, step, ptr, stride
+%else
+    VBROADCASTSS m0, xmm0 ; nu
+    VBROADCASTSS m1, xmm1 ; bscale
+%endif
+    ; The int arguments are used as 64-bit registers below. Neither 64-bit ABI
+    ; guarantees the upper halves of 32-bit arguments, so widen them on every
+    ; target, not only on WIN64.
+    MOVSXDIFNIDN width, height, cbegin, cend, steps
+
+    mov     cwidthq, cendq
+    sub     cwidthq, cbeginq            ; cwidth = column_end - column_begin
+    test    cwidthq, cwidthq
+    jle .end_scalar                     ; empty or inverted column range
+    lea     strideq, [widthq * 4]       ; bytes per row
+
+    xor     xq, xq                      ; x = 0
+    cmp     cwidthq, cols
+    jl .x_scalar                        ; fewer than cols columns: tail only
+
+    sub     cwidthq, cols               ; last x at which a full vector fits
+.loop_x:
+    xor     stepq, stepq
+    .loop_step:
+        ; ptr = buffer + x + column_begin;
+        lea     ptrq, [xq + cbeginq]
+        lea     ptrq, [bufferq + ptrq*4]
+
+        ; ptr[cols-1:0] *= bscale;
+        movu    m2, [ptrq]
+        mulps   m2, m1
+        movu    [ptrq], m2
+
+        ; Filter downwards
+        mov     yq, 1
+        .loop_y_down:
+            add     ptrq, strideq       ; advance one row
+            movu    m3, [ptrq]
+            ; presumably m2 = m2 * m0 + m3 (FMULADD_PS is defined elsewhere)
+            FMULADD_PS m2, m2, m0, m3, m2
+            movu    [ptrq], m2
+
+            inc     yq
+            cmp     yq, heightq
+            jl .loop_y_down
+
+        ; scale the last row before reversing direction
+        mulps   m2, m1
+        movu    [ptrq], m2
+
+        ; Filter upwards
+        dec     yq                      ; y = height - 1
+        .loop_y_up:
+            sub     ptrq, strideq       ; back one row
+            movu    m3, [ptrq]
+            FMULADD_PS m2, m2, m0, m3, m2
+            movu    [ptrq], m2
+
+            dec     yq
+            cmp     yq, 0
+            jg .loop_y_up
+
+        inc     stepq
+        cmp     stepq, stepsq
+        jl .loop_step
+
+    add     xq, cols
+    cmp     xq, cwidthq
+    jle .loop_x
+
+    add     cwidthq, cols               ; restore the real column count
+    cmp     xq, cwidthq
+    jge .end_scalar                     ; no leftover columns
+
+.x_scalar:
+    ; Build the lane mask for the (cwidth - x) leftover columns:
+    ; bits = (1 << remaining) - 1. x is parked on the stack because PUSH_MASK
+    ; needs a scratch GPR and xq doubles as the shift source.
+    xor     stepq, stepq
+    mov     qword [rsp + 0x10], xq
+    sub     cwidthq, xq
+    mov     xq, 1
+    shlx    cwidthq, xq, cwidthq
+    sub     cwidthq, 1
+    PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20
+    mov     xq, qword [rsp + 0x10]
+
+    .loop_step_scalar:
+        lea     ptrq, [xq + cbeginq]
+        lea     ptrq, [bufferq + ptrq*4]
+
+        VMASKMOVPS m2, [ptrq], m4, k1
+        mulps      m2, m1
+        VMASKMOVPS [ptrq], m2, m4, k1
+
+        ; Filter downwards
+        mov     yq, 1
+        .x_scalar_loop_y_down:
+            add     ptrq, strideq
+            VMASKMOVPS m3, [ptrq], m4, k1
+            FMULADD_PS m2, m2, m0, m3, m2
+            VMASKMOVPS [ptrq], m2, m4, k1
+
+            inc     yq
+            cmp     yq, heightq
+            jl .x_scalar_loop_y_down
+
+        mulps      m2, m1
+        VMASKMOVPS [ptrq], m2, m4, k1
+
+        ; Filter upwards
+        dec     yq
+        .x_scalar_loop_y_up:
+            sub     ptrq, strideq
+            VMASKMOVPS m3, [ptrq], m4, k1
+            FMULADD_PS m2, m2, m0, m3, m2
+            VMASKMOVPS [ptrq], m2, m4, k1
+
+            dec     yq
+            cmp     yq, 0
+            jg .x_scalar_loop_y_up
+
+        inc     stepq
+        cmp     stepq, stepsq
+        jl .loop_step_scalar
+
+.end_scalar:
+    RET
+%endmacro
+
+; Instantiate VERTI_SLICE once per supported vector width. Both variants are
+; built for x86-64 only, and only when the matching HAVE_*_EXTERNAL flag is set.
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+; 32-byte (YMM) registers
+INIT_YMM avx2
+VERTI_SLICE
+%endif
+
+%if HAVE_AVX512_EXTERNAL
+; 64-byte (ZMM) registers
+INIT_ZMM avx512
+VERTI_SLICE
+%endif
+%endif