author | Rémi Denis-Courmont <remi@remlab.net> | 2023-09-29 19:04:38 +0300 |
---|---|---|
committer | Rémi Denis-Courmont <remi@remlab.net> | 2023-10-02 18:08:23 +0300 |
commit | c270928cc0d47363b932b64ecd28e2815fddcb01 (patch) | |
tree | 93518eb9bcd8035d03f864b5efc9a61ac0d2b5d7 /libavcodec/riscv/aacpsdsp_rvv.S | |
parent | 27d74fc1ef1b3169633e2dbef9c788f328d764f9 (diff) | |
download | ffmpeg-c270928cc0d47363b932b64ecd28e2815fddcb01.tar.gz | |
lavc/aacpsdsp: unroll R-V V stereo interpolate
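
The unroll doubles the vector group size from LMUL=1 (m1) to LMUL=2 (m2), so each loop iteration processes twice as many complex samples; the vector operands therefore move to even-numbered registers, since an m2 operand occupies a group of two registers. For orientation, the operation being vectorised is roughly the per-sample stereo interpolation below. This is only a C sketch with illustrative names (it is not the exact reference in libavcodec/aacpsdsp_template.c); h0..h3 follow the comments in the assembly.

    // Rough sketch of what ff_ps_stereo_interpolate computes per sample.
    // Hypothetical helper for illustration only.
    static void stereo_interpolate_sketch(float (*l)[2], float (*r)[2],
                                          const float h[4],
                                          const float h_step[4], int n)
    {
        for (int i = 0; i < n; i++) {
            /* coefficients are linearly interpolated every sample */
            float h0 = h[0] + (i + 1) * h_step[0];
            float h1 = h[1] + (i + 1) * h_step[1];
            float h2 = h[2] + (i + 1) * h_step[2];
            float h3 = h[3] + (i + 1) * h_step[3];
            float l_re = l[i][0], l_im = l[i][1];
            float r_re = r[i][0], r_im = r[i][1];

            l[i][0] = h0 * l_re + h2 * r_re; /* new left  */
            l[i][1] = h0 * l_im + h2 * r_im;
            r[i][0] = h1 * l_re + h3 * r_re; /* new right */
            r[i][1] = h1 * l_im + h3 * r_im;
        }
    }

In the vector version below, h0..h3 (pre-offset by (i + 1) * h_step per element) live in v16/v18/v20/v22, the left and right channels are loaded deinterleaved with vlseg2e32.v, and the coefficient vectors are advanced by (vlenb / sizeof(float)) * h_step once per iteration.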
Diffstat (limited to 'libavcodec/riscv/aacpsdsp_rvv.S')
-rw-r--r-- | libavcodec/riscv/aacpsdsp_rvv.S | 46 |
1 file changed, 23 insertions, 23 deletions
diff --git a/libavcodec/riscv/aacpsdsp_rvv.S b/libavcodec/riscv/aacpsdsp_rvv.S
index b85a5cc92c..1a92fed515 100644
--- a/libavcodec/riscv/aacpsdsp_rvv.S
+++ b/libavcodec/riscv/aacpsdsp_rvv.S
@@ -223,7 +223,7 @@ func ff_ps_hybrid_synthesis_deint_rvv, zve32x
 endfunc
 
 func ff_ps_stereo_interpolate_rvv, zve32f
-        vsetvli      t0, zero, e32, m1, ta, ma
+        vsetvli      t0, zero, e32, m2, ta, ma
         vid.v        v24
         flw          ft0, (a2)
         vadd.vi      v24, v24, 1 // v24[i] = i + 1
@@ -232,43 +232,43 @@ func ff_ps_stereo_interpolate_rvv, zve32f
         flw          ft2, 8(a2)
         vfmv.v.f     v16, ft0
         flw          ft3, 12(a2)
-        vfmv.v.f     v17, ft1
+        vfmv.v.f     v18, ft1
         flw          ft0, (a3)
-        vfmv.v.f     v18, ft2
+        vfmv.v.f     v20, ft2
         flw          ft1, 4(a3)
-        vfmv.v.f     v19, ft3
+        vfmv.v.f     v22, ft3
         flw          ft2, 8(a3)
         flw          ft3, 12(a3)
         fcvt.s.wu    ft4, t0 // (float)(vlenb / sizeof (float))
         vfmacc.vf    v16, ft0, v24 // h0 += (i + 1) * h0_step
         fmul.s       ft0, ft0, ft4
-        vfmacc.vf    v17, ft1, v24
+        vfmacc.vf    v18, ft1, v24
         fmul.s       ft1, ft1, ft4
-        vfmacc.vf    v18, ft2, v24
+        vfmacc.vf    v20, ft2, v24
         fmul.s       ft2, ft2, ft4
-        vfmacc.vf    v19, ft3, v24
+        vfmacc.vf    v22, ft3, v24
         fmul.s       ft3, ft3, ft4
 1:
-        vsetvli      t0, a4, e32, m1, ta, ma
-        vlseg2e32.v  v8, (a0) // v8:l_re, v9:l_im
+        vsetvli      t0, a4, e32, m2, ta, ma
+        vlseg2e32.v  v0, (a0) // v0:l_re, v2:l_im
         sub          a4, a4, t0
-        vlseg2e32.v  v10, (a1) // v10:r_re, v11:r_im
-        vfmul.vv     v12, v8, v16
-        vfmul.vv     v13, v9, v16
-        vfmul.vv     v14, v8, v17
-        vfmul.vv     v15, v9, v17
-        vfmacc.vv    v12, v10, v18
-        vfmacc.vv    v13, v11, v18
-        vfmacc.vv    v14, v10, v19
-        vfmacc.vv    v15, v11, v19
-        vsseg2e32.v  v12, (a0)
+        vlseg2e32.v  v4, (a1) // v4:r_re, v6:r_im
+        vfmul.vv     v8, v0, v16
+        vfmul.vv     v10, v2, v16
+        vfmul.vv     v12, v0, v18
+        vfmul.vv     v14, v2, v18
+        vfmacc.vv    v8, v4, v20
+        vfmacc.vv    v10, v6, v20
+        vfmacc.vv    v12, v4, v22
+        vfmacc.vv    v14, v6, v22
+        vsseg2e32.v  v8, (a0)
         sh3add       a0, t0, a0
-        vsseg2e32.v  v14, (a1)
+        vsseg2e32.v  v12, (a1)
         sh3add       a1, t0, a1
         vfadd.vf     v16, v16, ft0 // h0 += (vlenb / sizeof (float)) * h0_step
-        vfadd.vf     v17, v17, ft1
-        vfadd.vf     v18, v18, ft2
-        vfadd.vf     v19, v19, ft3
+        vfadd.vf     v18, v18, ft1
+        vfadd.vf     v20, v20, ft2
+        vfadd.vf     v22, v22, ft3
         bnez         a4, 1b
 
         ret