aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec
diff options
context:
space:
mode:
author: Rémi Denis-Courmont <remi@remlab.net>, 2023-11-09 21:57:28 +0200
committer: Rémi Denis-Courmont <remi@remlab.net>, 2023-11-13 18:33:02 +0200
commit: 5b33104fca4057edb21598264ee17e087f10d816 (patch)
tree: 1c68e11c3423c109a46baf5ae40d4b25a0e85836 /libavcodec
parent: 67a2571a5547d39990e7f709f24d7a5b452ff8b9 (diff)
download: ffmpeg-5b33104fca4057edb21598264ee17e087f10d816.tar.gz
lavc/sbrdsp: R-V V hf_gen
hf_gen_c: 2922.7 hf_gen_rvv_f32: 731.5
Diffstat (limited to 'libavcodec')
-rw-r--r-- libavcodec/riscv/sbrdsp_init.c | 4
-rw-r--r-- libavcodec/riscv/sbrdsp_rvv.S | 50
2 files changed, 54 insertions, 0 deletions
diff --git a/libavcodec/riscv/sbrdsp_init.c b/libavcodec/riscv/sbrdsp_init.c
index c1ed5b639c..e5736452ec 100644
--- a/libavcodec/riscv/sbrdsp_init.c
+++ b/libavcodec/riscv/sbrdsp_init.c
@@ -27,6 +27,9 @@ void ff_sbr_sum64x5_rvv(float *z);
float ff_sbr_sum_square_rvv(float (*x)[2], int n);
void ff_sbr_neg_odd_64_rvv(float *x);
void ff_sbr_autocorrelate_rvv(const float x[40][2], float phi[3][2][2]);
+void ff_sbr_hf_gen_rvv(float (*X_high)[2], const float (*X_low)[2],
+ const float alpha0[2], const float alpha1[2],
+ float bw, int start, int end);
void ff_sbr_hf_g_filt_rvv(float (*Y)[2], const float (*X_high)[40][2],
const float *g_filt, int m_max, intptr_t ixh);
@@ -39,6 +42,7 @@ av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
c->sum64x5 = ff_sbr_sum64x5_rvv;
c->sum_square = ff_sbr_sum_square_rvv;
+ c->hf_gen = ff_sbr_hf_gen_rvv;
c->hf_g_filt = ff_sbr_hf_g_filt_rvv;
}
c->autocorrelate = ff_sbr_autocorrelate_rvv;
diff --git a/libavcodec/riscv/sbrdsp_rvv.S b/libavcodec/riscv/sbrdsp_rvv.S
index 2f3a0969d7..43fab1f65f 100644
--- a/libavcodec/riscv/sbrdsp_rvv.S
+++ b/libavcodec/riscv/sbrdsp_rvv.S
@@ -174,6 +174,56 @@ func ff_sbr_autocorrelate_rvv, zve32f
ret
endfunc
+func ff_sbr_hf_gen_rvv, zve32f // X_high[i] = X_low[i] + alpha0*bw * X_low[i-1] + alpha1*bw*bw * X_low[i-2] (complex), for i in [start, end), processed from end downwards
+NOHWF fmv.w.x fa0, a4 // bw arrives in a4 on this path (presumably soft-float ABI builds; confirm against the NOHWF definition)
+NOHWF mv a4, a5 // start
+NOHWF mv a5, a6 // end
+ flw ft2, 0(a2) // alpha0[0]
+ fmul.s fa1, fa0, fa0 // bw * bw
+ sh3add a1, a5, a1 // a1 = &X_low[end] (8 bytes per complex element)
+ flw ft3, 4(a2) // alpha0[1]
+ fmul.s fa2, ft2, fa0 // alpha[2] = alpha0[0] * bw
+ sh3add a0, a5, a0 // a0 = &X_high[end]
+ flw ft0, 0(a3) // alpha1[0]
+ fmul.s fa3, ft3, fa0 // alpha[3] = alpha0[1] * bw
+ sub a5, a5, a4 // a5 = end - start = elements left to produce
+ flw ft1, 4(a3) // alpha1[1]
+ fmul.s fa0, ft0, fa1 // alpha[0] = alpha1[0] * bw * bw (bw itself is not read after this point)
+ flw ft0, -16(a1) // X_low[end - 2][0]
+ fmul.s fa1, ft1, fa1 // alpha[1] = alpha1[1] * bw * bw
+ flw ft1, -12(a1) // X_low[end - 2][1]
+ flw ft2, -8(a1) // X_low[end - 1][0]
+ flw ft3, -4(a1) // X_low[end - 1][1]
+ addi a1, a1, -16 // a1 = &X_low[end - 2]
+1:
+ vsetvli t0, a5, e32, m4, ta, ma // t0 = number of elements handled in this pass
+ slli t1, t0, 3 // t1 = t0 * 8 bytes
+ sub a1, a1, t1 // step the X_low pointer back by t0 elements
+ vlseg2e32.v v0, (a1) // X_low[i - 2]
+ sub a0, a0, t1 // step the X_high pointer back by t0 elements
+ vfslide1down.vf v8, v0, ft0 // X_low[i - 1][0]
+ sub a5, a5, t0 // elements still to do after this pass
+ vfslide1down.vf v12, v4, ft1 // X_low[i - 1][1]
+ vfslide1down.vf v16, v8, ft2 // X_low[i ][0]
+ vfslide1down.vf v20, v12, ft3 // X_low[i ][1]
+ vfmacc.vf v16, fa0, v0 // re += alpha[0] * X_low[i - 2][0]
+ vfmacc.vf v20, fa0, v4 // im += alpha[0] * X_low[i - 2][1]
+ vfmv.f.s ft0, v0 // keep element 0 as the slide-in value for the next (lower) pass
+ vfnmsac.vf v16, fa1, v4 // re -= alpha[1] * X_low[i - 2][1]
+ vfmacc.vf v20, fa1, v0 // im += alpha[1] * X_low[i - 2][0]
+ vfmv.f.s ft1, v4 // same carry for the imaginary part
+ vfmacc.vf v16, fa2, v8 // re += alpha[2] * X_low[i - 1][0]
+ vfmacc.vf v20, fa2, v12 // im += alpha[2] * X_low[i - 1][1]
+ vfmv.f.s ft2, v8 // carry X_low[i - 1][0] of the lowest lane
+ vfnmsac.vf v16, fa3, v12 // re -= alpha[3] * X_low[i - 1][1]
+ vfmacc.vf v20, fa3, v8 // im += alpha[3] * X_low[i - 1][0]
+ vfmv.f.s ft3, v12 // carry X_low[i - 1][1] of the lowest lane
+ vsseg2e32.v v16, (a0) // store interleaved re/im pairs to X_high[i]
+ bnez a5, 1b // loop until all end - start elements are written
+
+ ret
+endfunc
+
func ff_sbr_hf_g_filt_rvv, zve32f
li t1, 40 * 2 * 4
sh3add a1, a4, a1