author    sunyuechi <sunyuechi@iscas.ac.cn>          2024-05-30 23:26:53 +0800
committer Rémi Denis-Courmont <remi@remlab.net>     2024-06-12 18:38:41 +0300
commit    8d9fb7b5cf0a3b32e2e59be5c32cbe0dab84567b (patch)
tree      d6fcb030ec056b8cd61dbeded03b7dab88fc5209
parent    d72a5fe719c01da07af30e4402a7c3cd994b4cfc (diff)
lavc/vp8dsp: R-V V put_bilin_h v unroll
Since len < 64, the registers are sufficient, so the loop can be unrolled
by two directly (a4, the row counter, is even). Another benefit of
unrolling is that the vertical case needs one load fewer than the
horizontal case: the middle source row feeds both output rows, so two
rows of output take three row loads instead of four.

                                old           new
                             C908   X60    C908   X60
vp8_put_bilin4_h_c        :   6.2   5.5 :   6.2   5.5
vp8_put_bilin4_h_rvv_i32  :   2.2   2.0 :   1.5   1.5
vp8_put_bilin4_v_c        :   6.5   5.7 :   6.2   5.7
vp8_put_bilin4_v_rvv_i32  :   2.2   2.0 :   1.2   1.5
vp8_put_bilin8_h_c        :  24.2  21.5 :  24.2  21.5
vp8_put_bilin8_h_rvv_i32  :   5.2   4.7 :   3.5   3.5
vp8_put_bilin8_v_c        :  24.5  21.7 :  24.5  21.7
vp8_put_bilin8_v_rvv_i32  :   5.2   4.7 :   3.5   3.2
vp8_put_bilin16_h_c       :  48.0  42.7 :  48.0  42.7
vp8_put_bilin16_h_rvv_i32 :   5.7   5.0 :   5.2   4.5
vp8_put_bilin16_v_c       :  48.2  43.0 :  48.2  42.7
vp8_put_bilin16_v_rvv_i32 :   5.7   5.2 :   4.5   4.2

Signed-off-by: Rémi Denis-Courmont <remi@remlab.net>
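The arithmetic being vectorized is VP8's bilinear tap,
dst = (a*(8-mn) + b*mn + 4) >> 3: in the patch, t1 holds 8 - mn, t4 the
rounding constant 4, and vnsra.wi performs the final >> 3. A minimal
scalar C sketch of the 4-wide filter, using a hypothetical helper name
rather than FFmpeg's actual reference code:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative scalar model of the 4-wide VP8 bilinear filter; mn is
     * the 3-bit fractional offset (1..7). t1 in the assembly holds
     * 8 - mn, t4 holds the rounding constant 4, and vnsra.wi performs
     * the >> 3. */
    static void put_bilin4(uint8_t *dst, ptrdiff_t dstride,
                           const uint8_t *src, ptrdiff_t sstride,
                           int h, int mn, int vertical)
    {
        ptrdiff_t step = vertical ? sstride : 1; /* offset of second tap */

        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 4; x++)
                dst[x] = (src[x] * (8 - mn) + src[x + step] * mn + 4) >> 3;
            dst += dstride;
            src += sstride;
        }
    }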
 libavcodec/riscv/vp8dsp_rvv.S | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 2766f7c41e..2e8259b24f 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -275,11 +275,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
li t4, 4
sub t1, t1, \mn
1:
- addi a4, a4, -1
- bilin_load v0, \type, \mn
- vse8.v v0, (a0)
- add a2, a2, a3
- add a0, a0, a1
+ add t0, a2, a3
+ add t2, a0, a1
+ addi a4, a4, -2
+.ifc \type,v
+ add t3, t0, a3
+.else
+ addi t5, a2, 1
+ addi t3, t0, 1
+ vle8.v v2, (t5)
+.endif
+ vle8.v v0, (a2)
+ vle8.v v4, (t0)
+ vle8.v v6, (t3)
+ vwmulu.vx v28, v0, t1
+ vwmulu.vx v26, v4, t1
+.ifc \type,v
+ vwmaccu.vx v28, \mn, v4
+.else
+ vwmaccu.vx v28, \mn, v2
+.endif
+ vwmaccu.vx v26, \mn, v6
+ vwaddu.wx v24, v28, t4
+ vwaddu.wx v22, v26, t4
+ vnsra.wi v30, v24, 3
+ vnsra.wi v0, v22, 3
+ vse8.v v30, (a0)
+ vse8.v v0, (t2)
+ add a2, t0, a3
+ add a0, t2, a1
bnez a4, 1b
ret
endfunc
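To make the register flow of the unrolled body explicit: rows 0/1/2 of
the source map to v0/v4/v6, and the shared middle row (v4) is what saves
the vertical case one load per two output rows. A scalar C sketch of the
same structure, again with a hypothetical helper name (h is even,
matching the "a4 is even" assumption above):

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch of the 2x-unrolled vertical bilinear filter. */
    static void put_bilin4_v_x2(uint8_t *dst, ptrdiff_t dstride,
                                const uint8_t *src, ptrdiff_t sstride,
                                int h, int mn)
    {
        for (int y = 0; y < h; y += 2) {
            const uint8_t *row0 = src;                 /* v0 <- (a2) */
            const uint8_t *row1 = src + sstride;       /* v4 <- (t0), used twice */
            const uint8_t *row2 = src + 2 * sstride;   /* v6 <- (t3) */

            for (int x = 0; x < 4; x++) {
                dst[x]           = (row0[x] * (8 - mn) + row1[x] * mn + 4) >> 3;
                dst[dstride + x] = (row1[x] * (8 - mn) + row2[x] * mn + 4) >> 3;
            }
            /* Three row loads per two output rows; the horizontal case
             * needs four (src and src+1 for each row), hence the saved
             * load. */
            src += 2 * sstride;
            dst += 2 * dstride;
        }
    }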