commit 4e120fbbbd087c3acbad6ce2e8c7b1262a5c8632
author Rémi Denis-Courmont <remi@remlab.net> 2024-06-01 21:32:56 +0300
committer Rémi Denis-Courmont <remi@remlab.net> 2024-06-04 17:40:41 +0300
parent 30797e4ff6c8c537471c386cd019a6a48a721f01
lavc/vp8dsp: add R-V V vp7_idct_dc_add4y
As with idct_dc_add, most of the code is shared with, and replaces, the previous VP8 function. To improve performance, we break down the 16x4 matrix into 4 rows, rather than 4 squares. Thus strided loads and stores are avoided, and the 4 DC calculations are vectored. Unfortunately this requires a vector gather to splat the DC values, but overall this is still a win for performance:

T-Head C908:
vp7_idct_dc_add4y_c:       7.2
vp7_idct_dc_add4y_rvv_i32: 2.2
vp8_idct_dc_add4y_c:       6.2
vp8_idct_dc_add4y_rvv_i32: 2.2 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7

SpacemiT X60:
vp7_idct_dc_add4y_c:       6.2
vp7_idct_dc_add4y_rvv_i32: 2.0
vp8_idct_dc_add4y_c:       5.5
vp8_idct_dc_add4y_rvv_i32: 2.5 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7

I also tried to provision the DC values using indexed loads. It ends up slower overall, especially for VP7, as we then have to compute 16 DCs instead of just 4.
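For reference, the semantics being vectorised: the idct_dc_add4y hook adds one DC per 4x4 block across a 16x4 luma strip. A minimal scalar sketch of the row-wise strategy described above (illustrative helper name, not FFmpeg's actual C code):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch: a 16x4 strip covers four 4x4 blocks side by side. block[i][0]
     * is the dequantised DC of block i; pixel x of every row gets
     * dc[x >> 2] added, then is clamped to 0..255. */
    static void idct_dc_add4y_rowwise(uint8_t *dst, int16_t block[4][16],
                                      ptrdiff_t stride)
    {
        int dc[4];

        for (int i = 0; i < 4; i++) {
            dc[i] = (block[i][0] + 4) >> 3; /* VP8 rounding; VP7 derives its
                                             * DC differently (see below) */
            block[i][0] = 0;                /* consumed: cleared for reuse */
        }
        for (int y = 0; y < 4; y++) {       /* one full 16-pixel row per pass */
            for (int x = 0; x < 16; x++) {
                int v = dst[x] + dc[x >> 2];
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
            dst += stride;
        }
    }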
Diffstat (limited to 'libavcodec/riscv')
 libavcodec/riscv/vp7dsp_init.c |  2 ++
 libavcodec/riscv/vp7dsp_rvv.S  | 16 ++++++++++++++++
 libavcodec/riscv/vp8dsp_rvv.S  | 46 ++++++++++++++++++++++++++++++----------
 3 files changed, 54 insertions(+), 10 deletions(-)
diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index 491874483f..fa5fb9d2ae 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -28,6 +28,7 @@
void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
+void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);

static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
ptrdiff_t stride)
@@ -49,6 +50,7 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
c->vp8_idct_add = ff_vp7_idct_add_rvv;
#endif
c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
+ c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv;
}
#endif
}
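The diff only shows the assignments; the enclosing init function follows FFmpeg's usual runtime-dispatch pattern, roughly as below (a sketch: the exact CPU-flag and vector-length guards used in this file are assumptions, not part of the hunk):

    #include "config.h"
    #include "libavutil/attributes.h"
    #include "libavutil/cpu.h"
    #include "libavcodec/vp8dsp.h"

    av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
    {
    #if HAVE_RVV
        int flags = av_get_cpu_flags();

        /* assumed guard shape; the real file may check further conditions */
        if (flags & AV_CPU_FLAG_RVV_I32) {
            c->vp8_idct_dc_add   = ff_vp7_idct_dc_add_rvv;
            c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv;
        }
    #endif
    }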
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index 2a4c404bbb..39b23c2e79 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -127,3 +127,19 @@ func ff_vp7_idct_add_rvv, zve32x
ret
endfunc
#endif
+
+func ff_vp7_idct_dc_add4y_rvv, zve32x
+ li t0, 32
+ vsetivli zero, 4, e16, mf2, ta, ma
+ li t1, 23170
+ vlse16.v v8, (a1), t0 # block[0..3][0]
+ vwmul.vx v0, v8, t1
+ li t2, 0x20000
+ vsetvli zero, zero, e32, m1, ta, ma
+ vsra.vi v0, v0, 14
+ vmul.vx v0, v0, t1
+ vadd.vx v0, v0, t2
+ vsetvli zero, zero, e16, mf2, ta, ma
+ vnsra.wi v8, v0, 18 # 4x DC
+ tail ff_vp78_idct_dc_add4y_rvv
+endfunc
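Per element, the arithmetic above is VP7's DC reconstruction, mirroring the scalar formula in vp7dsp.c; as a C sketch of one lane:

    #include <stdint.h>

    /* 23170 is roughly (sqrt(2)/2) * 2^15. The intermediate >> 14 keeps the
     * second multiply within 32 bits, exactly as the vector code does. */
    static int vp7_dc(int16_t coeff)
    {
        int t = (23170 * coeff) >> 14;      /* vwmul.vx, vsra.vi 14 */
        return (23170 * t + 0x20000) >> 18; /* vmul.vx, vadd.vx, vnsra.wi 18 */
    }

The four results then fall through to the shared ff_vp78_idct_dc_add4y_rvv with the DCs in v8.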
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 7a3ab576e9..8ea0a0c9bd 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -105,6 +105,7 @@ func ff_vp8_idct_dc_add_rvv, zve32x
# fall through
endfunc

+# a3 = DC
func ff_vp78_idct_dc_add_rvv, zve32x
csrwi vxrm, 0
vsetivli zero, 4, e8, mf4, ta, ma
@@ -121,6 +122,41 @@ func ff_vp78_idct_dc_add_rvv, zve32x
ret
endfunc

+func ff_vp8_idct_dc_add4y_rvv, zve32x
+ li t0, 32
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vlse16.v v8, (a1), t0
+ vadd.vi v8, v8, 4
+ vsra.vi v8, v8, 3
+ # fall through
+endfunc
+
+ .variant_cc ff_vp78_idct_dc_add4y_rvv
+# v8 = [dc0, dc1, dc2, dc3]
+func ff_vp78_idct_dc_add4y_rvv, zve32x
+ vsetivli zero, 16, e16, m2, ta, ma
+ vid.v v4
+ vsrl.vi v4, v4, 2
+ vrgather.vv v0, v8, v4 # replicate each DC four times
+ vsetvli zero, zero, e8, m1, ta, ma
+ li a4, 4
+1:
+ vle8.v v8, (a0)
+ addi a4, a4, -1
+ vwaddu.wv v16, v0, v8
+ sh zero, (a1)
+ vsetvli zero, zero, e16, m2, ta, ma
+ vmax.vx v16, v16, zero
+ addi a1, a1, 32
+ vsetvli zero, zero, e8, m1, ta, ma
+ vnclipu.wi v8, v16, 0
+ vse8.v v8, (a0)
+ add a0, a0, a2
+ bnez a4, 1b
+
+ ret
+endfunc
+
.macro vp8_idct_dc_add
vlse32.v v0, (a0), a2
lh a5, 0(a1)
@@ -143,16 +179,6 @@ endfunc
addi a1, a1, 32
.endm

-func ff_vp8_idct_dc_add4y_rvv, zve32x
- vsetivli zero, 4, e8, mf4, ta, ma
- .rept 3
- vp8_idct_dc_addy
- .endr
- vp8_idct_dc_add
-
- ret
-endfunc
-
func ff_vp8_idct_dc_add4uv_rvv, zve32x
vsetivli zero, 4, e8, mf4, ta, ma
vp8_idct_dc_addy
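In the shared function added above, the vid.v / vsrl.vi / vrgather.vv sequence builds the splat the commit message mentions, and each loop iteration handles one 16-pixel row. In scalar terms (a sketch of the semantics, not FFmpeg code):

    #include <stdint.h>

    /* vid.v:       idx[i] = i              (i = 0..15)
     * vsrl.vi 2:   idx[i] = i >> 2         (0,0,0,0,1,1,1,1,...)
     * vrgather.vv: splat[i] = dc[idx[i]]   (each DC repeated 4 times) */
    static void splat_dc(int16_t splat[16], const int16_t dc[4])
    {
        for (int i = 0; i < 16; i++)
            splat[i] = dc[i >> 2];
    }

    /* One iteration of the 4-row loop: vwaddu.wv widens the 8-bit pixels
     * and adds the 16-bit splat; vmax.vx clamps below at 0; vnclipu.wi
     * narrows back to 8 bits, saturating above at 255. */
    static void add_dc_row(uint8_t *dst, const int16_t splat[16])
    {
        for (int x = 0; x < 16; x++) {
            int v = dst[x] + splat[x];
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }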