aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/riscv
diff options
context:
space:
mode:
author: Rémi Denis-Courmont <remi@remlab.net> 2023-11-07 21:56:02 +0200
committer: Rémi Denis-Courmont <remi@remlab.net> 2023-11-12 14:03:09 +0200
commit: f576a0835b45940aad08507257ecd8d61d42452c (patch)
tree: 11927b194531d80e7097f051ef6cc3ae3949f945 /libavcodec/riscv
parent: eb508702a899f4a41d3653d90b077b487083d2ab (diff)
download: ffmpeg-f576a0835b45940aad08507257ecd8d61d42452c.tar.gz
lavc/aacpsdsp: rework R-V V hybrid_synthesis_deint
Given the size of the data set, strided memory accesses cannot be avoided.
We can still do better than the current code.

ps_hybrid_synthesis_deint_c:       12065.5
ps_hybrid_synthesis_deint_rvv_i32: 13650.2 (before)
ps_hybrid_synthesis_deint_rvv_i64:  8181.0 (after)
Diffstat (limited to 'libavcodec/riscv')
-rw-r--r-- libavcodec/riscv/aacpsdsp_init.c 8
-rw-r--r-- libavcodec/riscv/aacpsdsp_rvv.S 61
2 files changed, 36 insertions, 33 deletions
diff --git a/libavcodec/riscv/aacpsdsp_init.c b/libavcodec/riscv/aacpsdsp_init.c
index f72d1bc330..e094660cf3 100644
--- a/libavcodec/riscv/aacpsdsp_init.c
+++ b/libavcodec/riscv/aacpsdsp_init.c
@@ -46,16 +46,16 @@ av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
if (flags & AV_CPU_FLAG_RVB_ADDR) {
- if (flags & AV_CPU_FLAG_RVV_I64)
+ if (flags & AV_CPU_FLAG_RVV_I64) {
c->add_squares = ff_ps_add_squares_rvv;
+ c->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_rvv;
+ }
c->mul_pair_single = ff_ps_mul_pair_single_rvv;
c->stereo_interpolate[0] = ff_ps_stereo_interpolate_rvv;
}
}
- if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR))
c->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_rvv;
- c->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_rvv;
- }
#endif
}
diff --git a/libavcodec/riscv/aacpsdsp_rvv.S b/libavcodec/riscv/aacpsdsp_rvv.S
index cf872599c8..1dc426e01c 100644
--- a/libavcodec/riscv/aacpsdsp_rvv.S
+++ b/libavcodec/riscv/aacpsdsp_rvv.S
@@ -190,38 +190,41 @@ func ff_ps_hybrid_analysis_ileave_rvv, zve32x /* no needs for zve32f here */
ret
endfunc
-func ff_ps_hybrid_synthesis_deint_rvv, zve32x
- slli t1, a2, 5 + 1 + 2
- sh2add a0, a2, a0
- add a1, a1, t1
- addi a2, a2, -64
- li t1, 38 * 64 * 4
- li t6, 64 * 4
- add a4, a0, t1
- beqz a2, 3f
+func ff_ps_hybrid_synthesis_deint_rvv, zve64x
+ slli t0, a2, 5 + 1 + 2
+ sh2add a0, a2, a0
+ add a1, a1, t0
+ addi t2, a2, -64
+ li t0, 38 * 64
+ li t1, 32 * 2 * 4
+ li t4, 8 - 16384 // offset from in[64][n][0] to in[0][n + 1][0]
+ slli t5, a2, 5 + 1 + 2 // and from in[0][n+1][0] to in[0][n+1][s]
+ neg t2, t2
+ li t3, 32
+ add a4, t4, t5
+ sh2add t0, t0, a0
1:
- mv t0, a0
- mv t1, a1
- mv t3, a3
- mv t4, a4
- addi a2, a2, 1
+ mv t4, t2
+ addi a3, a3, -1
2:
- vsetvli t5, t3, e32, m4, ta, ma
- vlseg2e32.v v16, (t1)
- sub t3, t3, t5
- vsse32.v v16, (t0), t6
- mul t2, t5, t6
- vsse32.v v20, (t4), t6
- sh3add t1, t5, t1
- add t0, t0, t2
- add t4, t4, t2
- bnez t3, 2b
+ vsetvli t5, t4, e32, m4, ta, ma
+ vlse64.v v16, (a1), t1 /* sizeof (float[32][2]) */
+ sub t4, t4, t5
+ vnsrl.wx v24, v16, zero
+ slli t6, t5, 5 + 1 + 2
+ vnsrl.wx v28, v16, t3 /* 32 */
+ add a1, a1, t6
+ vse32.v v24, (a0)
+ sh2add a0, t5, a0
+ vse32.v v28, (t0)
+ sh2add t0, t5, t0
+ bnez t4, 2b
+
+ add a1, a1, a4
+ sh2add a0, a2, a0
+ sh2add t0, a2, t0
+ bnez a3, 1b
- add a0, a0, 4
- add a1, a1, 32 * 2 * 4
- add a4, a4, 4
- bnez a2, 1b
-3:
ret
endfunc