diff options
author | Rémi Denis-Courmont <remi@remlab.net> | 2024-09-01 15:47:26 +0300 |
---|---|---|
committer | Rémi Denis-Courmont <remi@remlab.net> | 2024-09-24 20:04:51 +0300 |
commit | 4936bb25083ebdd7b0c514ab8b81159bb4273265 (patch) | |
tree | 8dcbbbd6bb19a18837c23c5000124bac8f17b819 | |
parent | ba7d0d5fc39127ce243bb93e375f3a633492c273 (diff) | |
download | ffmpeg-4936bb25083ebdd7b0c514ab8b81159bb4273265.tar.gz |
lavc/h264dsp: optimise R-V V weight for shorter heights
The height is always a power of two, and at most 16 rows. The previous code
was optimised for large sample counts, which does not suit such short blocks.
T-Head C908:
h264_weight2_8_c: 211.7 ( 1.00x)
h264_weight2_8_rvv_i32: before 184.0 ( 1.15x)
h264_weight2_8_rvv_i32: after 54.2 ( 3.90x)
h264_weight4_8_c: 285.7 ( 1.00x)
h264_weight4_8_rvv_i32: before 341.2 ( 0.86x)
h264_weight4_8_rvv_i32: after 82.2 ( 3.47x)
h264_weight8_8_c: 498.7 ( 1.00x)
h264_weight8_8_rvv_i32: before 683.7 ( 0.73x)
h264_weight8_8_rvv_i64: after 128.5 ( 3.95x)
h264_weight16_8_c: 878.2 ( 1.00x)
h264_weight16_8_rvv_i32: unchanged 239.5 ( 3.67x)
SpacemiT X60:
h264_weight2_8_c: 207.2 ( 1.00x)
h264_weight2_8_rvv_i32: before 259.6 ( 0.80x)
h264_weight2_8_rvv_i32: after 82.2 ( 2.52x)
h264_weight4_8_c: 290.8 ( 1.00x)
h264_weight4_8_rvv_i32: before 509.6 ( 0.57x)
h264_weight4_8_rvv_i32: after 61.5 ( 4.73x)
h264_weight8_8_c: 498.8 ( 1.00x)
h264_weight8_8_rvv_i32: before 1019.8 ( 0.49x)
h264_weight8_8_rvv_i64: after 71.8 ( 6.95x)
h264_weight16_8_c: 874.0 ( 1.00x)
h264_weight16_8_rvv_i32: unchanged 249.0 ( 3.51x)
-rw-r--r-- | libavcodec/riscv/h264dsp_init.c | 18 | ||||
-rw-r--r-- | libavcodec/riscv/h264dsp_rvv.S | 62 |
2 files changed, 42 insertions, 38 deletions
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c index 9ffc9b0333..6391667a40 100644 --- a/libavcodec/riscv/h264dsp_init.c +++ b/libavcodec/riscv/h264dsp_init.c @@ -96,13 +96,23 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, if (flags & AV_CPU_FLAG_RVV_I32) { const bool zvl128b = ff_rv_vlen_least(128); + if (bit_depth == 8) { + if (zvl128b) + dsp->weight_h264_pixels_tab[0] = + ff_h264_weight_funcs_8_rvv[0].weight; + if (flags & AV_CPU_FLAG_RVV_I64) + dsp->weight_h264_pixels_tab[1] = + ff_h264_weight_funcs_8_rvv[1].weight; + dsp->weight_h264_pixels_tab[2] = + ff_h264_weight_funcs_8_rvv[2].weight; + dsp->weight_h264_pixels_tab[3] = + ff_h264_weight_funcs_8_rvv[3].weight; + } + if (bit_depth == 8 && zvl128b) { - for (int i = 0; i < 4; i++) { - dsp->weight_h264_pixels_tab[i] = - ff_h264_weight_funcs_8_rvv[i].weight; + for (int i = 0; i < 4; i++) dsp->biweight_h264_pixels_tab[i] = ff_h264_weight_funcs_8_rvv[i].biweight; - } dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv; dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv; diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S index 422ac02222..b081e156a0 100644 --- a/libavcodec/riscv/h264dsp_rvv.S +++ b/libavcodec/riscv/h264dsp_rvv.S @@ -28,11 +28,12 @@ #include "libavutil/riscv/asm.S" + .variant_cc ff_h264_weight_pixels_simple_8_rvv func ff_h264_weight_pixels_simple_8_rvv, zve32x csrwi vxrm, 0 sll a5, a5, a3 1: - vsetvli zero, a6, e16, m2, ta, ma + vsetvli zero, t6, e16, m2, ta, ma vle8.v v8, (a0) addi a2, a2, -1 vzext.vf2 v24, v8 @@ -76,38 +77,36 @@ func ff_h264_biweight_pixels_simple_8_rvv, zve32x ret endfunc -func ff_h264_weight_pixels_8_rvv, zve32x +.macro h264_weight depth, w, b= +func ff_h264_weight_pixels\w\()_\depth\()_rvv, zve64x + lpad 0 + .ifb \b + li t6, \w + j ff_h264_weight_pixels_simple_\depth\()_rvv + .else csrwi vxrm, 0 sll a5, a5, a3 1: - mv t0, a0 - mv t6, a6 -2: - vsetvli t2, a2, e16, 
m8, ta, ma - vlsseg2e8.v v0, (t0), a1 - addi t6, t6, -2 - vzext.vf2 v16, v0 - vzext.vf2 v24, v4 - vmul.vx v16, v16, a4 - vmul.vx v24, v24, a4 + vsetvli t1, a2, e\b, m2, ta, ma + vlse\b\().v v8, (a0), a1 + vsetvli t0, zero, e16, m4, ta, ma + vzext.vf2 v24, v8 + sub a2, a2, t1 + vmul.vx v16, v24, a4 + mul t2, t1, a1 vsadd.vx v16, v16, a5 - vsadd.vx v24, v24, a5 vmax.vx v16, v16, zero - vmax.vx v24, v24, zero - vsetvli zero, zero, e8, m4, ta, ma - vnclipu.wx v0, v16, a3 - vnclipu.wx v4, v24, a3 - vssseg2e8.v v0, (t0), a1 - addi t0, t0, 2 - bnez t6, 2b - - mul t3, a1, t2 - sub a2, a2, t2 - add a0, a0, t3 + vsetvli zero, zero, e8, m2, ta, ma + vnclipu.wx v8, v16, a3 + vsetvli zero, t1, e\b, m2, ta, ma + vsse\b\().v v8, (a0), a1 + add a0, a0, t2 bnez a2, 1b ret + .endif endfunc +.endm .variant_cc ff_h264_biweight_pixels_8_rvv func ff_h264_biweight_pixels_8_rvv, zve32x @@ -152,17 +151,12 @@ func ff_h264_biweight_pixels_8_rvv, zve32x ret endfunc -.irp w, 16, 8, 4, 2 -func ff_h264_weight_pixels\w\()_8_rvv, zve32x - lpad 0 - li a6, \w - .if \w == 16 - j ff_h264_weight_pixels_simple_8_rvv - .else - j ff_h264_weight_pixels_8_rvv - .endif -endfunc +h264_weight 8, 2, 16 +h264_weight 8, 4, 32 +h264_weight 8, 8, 64 +h264_weight 8, 16 +.irp w, 16, 8, 4, 2 func ff_h264_biweight_pixels\w\()_8_rvv, zve32x lpad 0 li t6, \w |