diff options
author | Wu Jianhua <jianhua.wu@intel.com> | 2021-08-04 10:06:13 +0800 |
---|---|---|
committer | Paul B Mahol <onemda@gmail.com> | 2021-08-29 19:58:33 +0200 |
commit | 68a2722aee2868084ad3ba1a7a5431735eab049e (patch) | |
tree | 647893d1ea8883e2b9a15e707b12b6aeb1be6618 | |
parent | 4a5e24721c2bd1839aec57730061884fe2c5dd3b (diff) | |
download | ffmpeg-68a2722aee2868084ad3ba1a7a5431735eab049e.tar.gz |
libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512()
The new vertical slice functions with AVX2/AVX-512 acceleration significantly
improve the performance of the 2D Gaussian blur filter.
Performance data:
ff_verti_slice_c: 32.57
ff_verti_slice_avx2: 476.19
ff_verti_slice_avx512: 833.33
Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com>
Co-authored-by: Jin Jun <jun.i.jin@intel.com>
Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
-rw-r--r-- | libavfilter/gblur.h | 2 | ||||
-rw-r--r-- | libavfilter/vf_gblur.c | 24 | ||||
-rw-r--r-- | libavfilter/x86/vf_gblur.asm | 189 | ||||
-rw-r--r-- | libavfilter/x86/vf_gblur_init.c | 7 |
4 files changed, 214 insertions, 8 deletions
diff --git a/libavfilter/gblur.h b/libavfilter/gblur.h index dce50671f6..367575a6db 100644 --- a/libavfilter/gblur.h +++ b/libavfilter/gblur.h @@ -50,6 +50,8 @@ typedef struct GBlurContext { float nuV; int nb_planes; void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale); + void (*verti_slice)(float *buffer, int width, int height, int slice_start, int slice_end, int steps, + float nu, float bscale); void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max); } GBlurContext; diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c index 4780bb6204..a2c410c07b 100644 --- a/libavfilter/vf_gblur.c +++ b/libavfilter/vf_gblur.c @@ -138,6 +138,19 @@ static void do_vertical_columns(float *buffer, int width, int height, } } +static void verti_slice_c(float *buffer, int width, int height, + int slice_start, int slice_end, int steps, + float nu, float boundaryscale) +{ + int aligned_end = slice_start + (((slice_end - slice_start) >> 3) << 3); + /* Filter vertically along columns (process 8 columns in each step) */ + do_vertical_columns(buffer, width, height, slice_start, aligned_end, + steps, nu, boundaryscale, 8); + /* Filter un-aligned columns one by one */ + do_vertical_columns(buffer, width, height, aligned_end, slice_end, + steps, nu, boundaryscale, 1); +} + static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { GBlurContext *s = ctx->priv; @@ -150,16 +163,10 @@ static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_ const int steps = s->steps; const float nu = s->nuV; float *buffer = s->buffer; - int aligned_end; - aligned_end = slice_start + (((slice_end - slice_start) >> 3) << 3); - /* Filter vertically along columns (process 8 columns in each step) */ - do_vertical_columns(buffer, width, height, slice_start, aligned_end, - steps, nu, boundaryscale, 8); + s->verti_slice(buffer, width, height, slice_start, slice_end, + steps, nu, 
boundaryscale); - /* Filter un-aligned columns one by one */ - do_vertical_columns(buffer, width, height, aligned_end, slice_end, - steps, nu, boundaryscale, 1); return 0; } @@ -236,6 +243,7 @@ static int query_formats(AVFilterContext *ctx) void ff_gblur_init(GBlurContext *s) { s->horiz_slice = horiz_slice_c; + s->verti_slice = verti_slice_c; s->postscale_slice = postscale_c; if (ARCH_X86) ff_gblur_init_x86(s); diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm index 276fe347f5..ac4debba74 100644 --- a/libavfilter/x86/vf_gblur.asm +++ b/libavfilter/x86/vf_gblur.asm @@ -22,6 +22,43 @@ SECTION .text +%xdefine AVX2_MMSIZE 32 +%xdefine AVX512_MMSIZE 64 + +%macro MOVSXDIFNIDN 1-* + %rep %0 + movsxdifnidn %1q, %1d + %rotate 1 + %endrep +%endmacro + +%macro PUSH_MASK 5 +%if mmsize == AVX2_MMSIZE + %assign %%n mmsize/4 + %assign %%i 0 + %rep %%n + mov %4, %3 + and %4, 1 + neg %4 + mov dword [%5 + %%i*4], %4 + sar %3, 1 + %assign %%i %%i+1 + %endrep + movu %1, [%5] +%else + kmovd %2, %3 +%endif +%endmacro + +%macro VMASKMOVPS 4 +%if mmsize == AVX2_MMSIZE + vpmaskmovd %1, %3, %2 +%else + kmovw k7, %4 + vmovups %1{k7}, %2 +%endif +%endmacro + ; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, ; float nu, float bscale) @@ -232,3 +269,155 @@ POSTSCALE_SLICE INIT_ZMM avx512 POSTSCALE_SLICE %endif + + +;******************************************************************************* +; void ff_verti_slice(float *buffer, int width, int height, int column_begin, +; int column_end, int steps, float nu, float bscale); +;******************************************************************************* +%macro VERTI_SLICE 0 +%if UNIX64 +cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \ + steps, x, y, cwidth, step, ptr, stride +%else +cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \ + steps, nu, bscale, x, y, cwidth, step, \ + ptr, stride +%endif +%assign cols mmsize/4 +%if WIN64 + 
VBROADCASTSS m0, num + VBROADCASTSS m1, bscalem + DEFINE_ARGS buffer, width, height, cbegin, cend, \ + steps, x, y, cwidth, step, ptr, stride + MOVSXDIFNIDN width, height, cbegin, cend, steps +%else + VBROADCASTSS m0, xmm0 ; nu + VBROADCASTSS m1, xmm1 ; bscale +%endif + mov cwidthq, cendq + sub cwidthq, cbeginq + lea strideq, [widthq * 4] + + xor xq, xq ; x = 0 + cmp cwidthq, cols + jl .x_scalar + cmp cwidthq, 0x0 + je .end_scalar + + sub cwidthq, cols +.loop_x: + xor stepq, stepq + .loop_step: + ; ptr = buffer + x + column_begin; + lea ptrq, [xq + cbeginq] + lea ptrq, [bufferq + ptrq*4] + + ; ptr[cols-1:0] *= bscale; + movu m2, [ptrq] + mulps m2, m1 + movu [ptrq], m2 + + ; Filter downwards + mov yq, 1 + .loop_y_down: + add ptrq, strideq ; ptrq += width + movu m3, [ptrq] + FMULADD_PS m2, m2, m0, m3, m2 + movu [ptrq], m2 + + inc yq + cmp yq, heightq + jl .loop_y_down + + mulps m2, m1 + movu [ptrq], m2 + + ; Filter upwards + dec yq + .loop_y_up: + sub ptrq, strideq + movu m3, [ptrq] + FMULADD_PS m2, m2, m0, m3, m2 + movu [ptrq], m2 + + dec yq + cmp yq, 0 + jg .loop_y_up + + inc stepq + cmp stepq, stepsq + jl .loop_step + + add xq, cols + cmp xq, cwidthq + jle .loop_x + + add cwidthq, cols + cmp xq, cwidthq + jge .end_scalar + +.x_scalar: + xor stepq, stepq + mov qword [rsp + 0x10], xq + sub cwidthq, xq + mov xq, 1 + shlx cwidthq, xq, cwidthq + sub cwidthq, 1 + PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20 + mov xq, qword [rsp + 0x10] + + .loop_step_scalar: + lea ptrq, [xq + cbeginq] + lea ptrq, [bufferq + ptrq*4] + + VMASKMOVPS m2, [ptrq], m4, k1 + mulps m2, m1 + VMASKMOVPS [ptrq], m2, m4, k1 + + ; Filter downwards + mov yq, 1 + .x_scalar_loop_y_down: + add ptrq, strideq + VMASKMOVPS m3, [ptrq], m4, k1 + FMULADD_PS m2, m2, m0, m3, m2 + VMASKMOVPS [ptrq], m2, m4, k1 + + inc yq + cmp yq, heightq + jl .x_scalar_loop_y_down + + mulps m2, m1 + VMASKMOVPS [ptrq], m2, m4, k1 + + ; Filter upwards + dec yq + .x_scalar_loop_y_up: + sub ptrq, strideq + VMASKMOVPS m3, [ptrq], m4, k1 + 
FMULADD_PS m2, m2, m0, m3, m2 + VMASKMOVPS [ptrq], m2, m4, k1 + + dec yq + cmp yq, 0 + jg .x_scalar_loop_y_up + + inc stepq + cmp stepq, stepsq + jl .loop_step_scalar + +.end_scalar: + RET +%endmacro + +%if ARCH_X86_64 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +VERTI_SLICE +%endif + +%if HAVE_AVX512_EXTERNAL +INIT_ZMM avx512 +VERTI_SLICE +%endif +%endif diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c index 34aba4ca6e..3e173410c2 100644 --- a/libavfilter/x86/vf_gblur_init.c +++ b/libavfilter/x86/vf_gblur_init.c @@ -31,6 +31,11 @@ void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max); void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max); +void ff_verti_slice_avx2(float *buffer, int width, int height, int column_begin, int column_end, + int steps, float nu, float bscale); +void ff_verti_slice_avx512(float *buffer, int width, int height, int column_begin, int column_end, + int steps, float nu, float bscale); + av_cold void ff_gblur_init_x86(GBlurContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -47,9 +52,11 @@ av_cold void ff_gblur_init_x86(GBlurContext *s) } if (EXTERNAL_AVX2(cpu_flags)) { s->horiz_slice = ff_horiz_slice_avx2; + s->verti_slice = ff_verti_slice_avx2; } if (EXTERNAL_AVX512(cpu_flags)) { s->postscale_slice = ff_postscale_slice_avx512; + s->verti_slice = ff_verti_slice_avx512; } #endif } |