author     Rémi Denis-Courmont <remi@remlab.net>    2024-06-01 21:32:56 +0300
committer  Rémi Denis-Courmont <remi@remlab.net>    2024-06-04 17:40:41 +0300
commit     4e120fbbbd087c3acbad6ce2e8c7b1262a5c8632 (patch)
tree       b04252a83e826cf23cc0509fbe7118ccefd3f2c1 /libavcodec/riscv
parent     30797e4ff6c8c537471c386cd019a6a48a721f01 (diff)
lavc/vp8dsp: add R-V V vp7_idct_dc_add4y
As with idct_dc_add, most of the code is shared with, and replaces, the
previous VP8 function. To improve performance, we break down the 16x4
matrix into 4 rows, rather than 4 squares. Thus strided loads and
stores are avoided, and the 4 DC calculations are vectorised (see the
scalar sketch after the benchmark figures below).
Unfortunately this requires a vector gather to splat the DC values, but
overall this is still a win for performance:
T-Head C908:
  vp7_idct_dc_add4y_c:        7.2
  vp7_idct_dc_add4y_rvv_i32:  2.2
  vp8_idct_dc_add4y_c:        6.2
  vp8_idct_dc_add4y_rvv_i32:  2.2  (before)
  vp8_idct_dc_add4y_rvv_i32:  1.7  (after)
SpacemiT X60:
  vp7_idct_dc_add4y_c:        6.2
  vp7_idct_dc_add4y_rvv_i32:  2.0
  vp8_idct_dc_add4y_c:        5.5
  vp8_idct_dc_add4y_rvv_i32:  2.5  (before)
  vp8_idct_dc_add4y_rvv_i32:  1.7  (after)
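As a reading aid, here is a scalar C model of the row-wise scheme described
above (the names are hypothetical; the committed code is the RISC-V vector
assembly in the diff below). Each of the four 4x4 blocks contributes one DC,
and each row of the 16x4 luma strip is handled as one contiguous 16-byte
vector, so lane x simply takes the DC of block x/4:

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* Model only: the real routine also clears each block's DC
     * coefficient (sh zero, (a1)) as it iterates. */
    static void idct_dc_add4y_model(uint8_t *dst, const int16_t dc[4],
                                    ptrdiff_t stride)
    {
        for (int y = 0; y < 4; y++, dst += stride) /* 4 rows, not 4 squares */
            for (int x = 0; x < 16; x++)           /* one contiguous row */
                dst[x] = clip_uint8(dst[x] + dc[x / 4]);
    }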
I also tried to provision the DC values using indexed loads. It ends up
slower overall, especially for VP7, as we then have to compute 16 DCs
instead of just 4.
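The gather-based splat mentioned above can likewise be written out in scalar
form (a sketch; splat_dc is a hypothetical name). vid.v produces the lane
indices 0..15, vsrl.vi shifts them right by 2 to get 0,0,0,0,1,1,1,1,...,
and vrgather.vv then reads dc[i >> 2] into each lane:

    /* Scalar sketch of the vid.v / vsrl.vi / vrgather.vv sequence. */
    static void splat_dc(int16_t out[16], const int16_t dc[4])
    {
        for (int i = 0; i < 16; i++)
            out[i] = dc[i >> 2]; /* each DC repeated four times */
    }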
Diffstat (limited to 'libavcodec/riscv')
 libavcodec/riscv/vp7dsp_init.c |  2 +
 libavcodec/riscv/vp7dsp_rvv.S  | 16 +++
 libavcodec/riscv/vp8dsp_rvv.S  | 46 ++++---
 3 files changed, 54 insertions(+), 10 deletions(-)
diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index 491874483f..fa5fb9d2ae 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -28,6 +28,7 @@
 void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
 void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
+void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
 
 static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
                                    ptrdiff_t stride)
@@ -49,6 +50,7 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
         c->vp8_idct_add = ff_vp7_idct_add_rvv;
 #endif
         c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
+        c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index 2a4c404bbb..39b23c2e79 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -127,3 +127,19 @@ func ff_vp7_idct_add_rvv, zve32x
         ret
 endfunc
 #endif
+
+func ff_vp7_idct_dc_add4y_rvv, zve32x
+        li        t0, 32
+        vsetivli  zero, 4, e16, mf2, ta, ma
+        li        t1, 23170
+        vlse16.v  v8, (a1), t0      # block[0..3][0]
+        vwmul.vx  v0, v8, t1
+        li        t2, 0x20000
+        vsetvli   zero, zero, e32, m1, ta, ma
+        vsra.vi   v0, v0, 14
+        vmul.vx   v0, v0, t1
+        vadd.vx   v0, v0, t2
+        vsetvli   zero, zero, e16, mf2, ta, ma
+        vnsra.wi  v8, v0, 18        # 4x DC
+        tail      ff_vp78_idct_dc_add4y_rvv
+endfunc
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 7a3ab576e9..8ea0a0c9bd 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -105,6 +105,7 @@ func ff_vp8_idct_dc_add_rvv, zve32x
         # fall through
 endfunc
 
+# a3 = DC
 func ff_vp78_idct_dc_add_rvv, zve32x
         csrwi     vxrm, 0
         vsetivli  zero, 4, e8, mf4, ta, ma
@@ -121,6 +122,41 @@ func ff_vp78_idct_dc_add_rvv, zve32x
         ret
 endfunc
 
+func ff_vp8_idct_dc_add4y_rvv, zve32x
+        li        t0, 32
+        vsetivli  zero, 4, e16, mf2, ta, ma
+        vlse16.v  v8, (a1), t0
+        vadd.vi   v8, v8, 4
+        vsra.vi   v8, v8, 3
+        # fall through
+endfunc
+
+        .variant_cc ff_vp78_idct_dc_add4y_rvv
+# v8 = [dc0, dc1, dc2, dc3]
+func ff_vp78_idct_dc_add4y_rvv, zve32x
+        vsetivli  zero, 16, e16, m2, ta, ma
+        vid.v     v4
+        vsrl.vi   v4, v4, 2
+        vrgather.vv v0, v8, v4      # replicate each DC four times
+        vsetvli   zero, zero, e8, m1, ta, ma
+        li        a4, 4
+1:
+        vle8.v    v8, (a0)
+        addi      a4, a4, -1
+        vwaddu.wv v16, v0, v8
+        sh        zero, (a1)
+        vsetvli   zero, zero, e16, m2, ta, ma
+        vmax.vx   v16, v16, zero
+        addi      a1, a1, 32
+        vsetvli   zero, zero, e8, m1, ta, ma
+        vnclipu.wi v8, v16, 0
+        vse8.v    v8, (a0)
+        add       a0, a0, a2
+        bnez      a4, 1b
+
+        ret
+endfunc
+
 .macro vp8_idct_dc_add
         vlse32.v  v0, (a0), a2
         lh        a5, 0(a1)
@@ -143,16 +179,6 @@ endfunc
         addi      a1, a1, 32
 .endm
 
-func ff_vp8_idct_dc_add4y_rvv, zve32x
-        vsetivli  zero, 4, e8, mf4, ta, ma
-        .rept 3
-        vp8_idct_dc_addy
-        .endr
-        vp8_idct_dc_add
-
-        ret
-endfunc
-
 func ff_vp8_idct_dc_add4uv_rvv, zve32x
         vsetivli  zero, 4, e8, mf4, ta, ma
         vp8_idct_dc_addy
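For cross-checking the VP7 path: ff_vp7_idct_dc_add4y_rvv derives the four
DCs with the fixed-point sequence visible in the assembly above, where 23170
is roughly sqrt(2) * 2^14 applied twice with rounding. Restated in scalar C
(a sketch; vp7_dc is a hypothetical name):

    /* Matches the vwmul/vsra/vmul/vadd/vnsra sequence in
     * ff_vp7_idct_dc_add4y_rvv; dc_in is block[i][0]. */
    static int vp7_dc(int dc_in)
    {
        return (23170 * ((23170 * dc_in) >> 14) + 0x20000) >> 18;
    }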