author     Rémi Denis-Courmont <remi@remlab.net>    2024-04-29 22:10:15 +0300
committer  Rémi Denis-Courmont <remi@remlab.net>    2024-05-03 17:48:46 +0300
commit     6459966bebc07b3c26338cbecf72f3607feb961f (patch)
tree       23a8aeb0878c163b1e4d5886a2d06909c2d29c28
parent     95568c4e316e8f5f3252596b1f01ce1de22216b6 (diff)
download   ffmpeg-6459966bebc07b3c26338cbecf72f3607feb961f.tar.gz
lavc/ac3dsp: R-V V sum_square_butterfly_int32
ac3_sum_square_butterfly_int32_c:       61.0
ac3_sum_square_butterfly_int32_rvv_i64: 14.7
-rw-r--r--  libavcodec/riscv/ac3dsp_init.c |  6
-rw-r--r--  libavcodec/riscv/ac3dsp_rvv.S  | 41
2 files changed, 47 insertions, 0 deletions
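
For reference, the four vector accumulators in the new routine correspond to
the four sums this DSP hook computes: squares of the left channel, the right
channel, their sum, and their difference. A minimal C sketch of the same
arithmetic follows; the function name is illustrative, not code from this
commit, and the no-overflow remark assumes 24-bit fixed-point inputs (as
suggested by the neighbouring float_to_fixed24 hook):

    #include <stdint.h>

    /* Sketch: sum[0..3] receive 64-bit sums of squares of the left
     * channel, the right channel, their sum, and their difference.
     * The 32-bit add/sub matches the e32 vadd.vv/vsub.vv in the
     * assembly; with 24-bit coefficients it cannot overflow. */
    static void sum_square_butterfly_int32_ref(int64_t sum[4],
                                               const int32_t *coef0,
                                               const int32_t *coef1,
                                               int len)
    {
        sum[0] = sum[1] = sum[2] = sum[3] = 0;
        for (int i = 0; i < len; i++) {
            int32_t lt = coef0[i];
            int32_t rt = coef1[i];
            int32_t md = lt + rt;
            int32_t sd = lt - rt;
            sum[0] += (int64_t)lt * lt;   /* accumulated in v0  */
            sum[1] += (int64_t)rt * rt;   /* accumulated in v4  */
            sum[2] += (int64_t)md * md;   /* accumulated in v8  */
            sum[3] += (int64_t)sd * sd;   /* accumulated in v12 */
        }
    }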
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index b9e14d56ca..be5e153fac 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -28,6 +28,8 @@
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, size_t len);
+void ff_sum_square_butterfly_int32_rvv(int64_t *, const int32_t *,
+                                       const int32_t *, int);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -39,6 +41,10 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
             c->extract_exponents = ff_extract_exponents_rvb;
         if (flags & AV_CPU_FLAG_RVV_F32)
             c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+# if __riscv_xlen >= 64
+        if (flags & AV_CPU_FLAG_RVV_I64)
+            c->sum_square_butterfly_int32 = ff_sum_square_butterfly_int32_rvv;
+# endif
     }
 #endif
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
index b8d32c4677..dd0b4cd797 100644
--- a/libavcodec/riscv/ac3dsp_rvv.S
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -37,3 +37,44 @@ func ff_float_to_fixed24_rvv, zve32f
         ret
 endfunc
+
+#if __riscv_xlen >= 64
+func ff_sum_square_butterfly_int32_rvv, zve64x
+        vsetvli     t0, zero, e64, m8, ta, ma
+        vmv.v.x     v0, zero
+        vmv.v.x     v8, zero
+1:
+        vsetvli     t0, a3, e32, m2, tu, ma
+        vle32.v     v16, (a1)
+        sub         a3, a3, t0
+        vle32.v     v20, (a2)
+        sh2add      a1, t0, a1
+        vadd.vv     v24, v16, v20
+        sh2add      a2, t0, a2
+        vsub.vv     v28, v16, v20
+        vwmacc.vv   v0, v16, v16
+        vwmacc.vv   v4, v20, v20
+        vwmacc.vv   v8, v24, v24
+        vwmacc.vv   v12, v28, v28
+        bnez        a3, 1b
+
+        vsetvli     t0, zero, e64, m4, ta, ma
+        vmv.s.x     v16, zero
+        vmv.s.x     v17, zero
+        vredsum.vs  v16, v0, v16
+        vmv.s.x     v18, zero
+        vredsum.vs  v17, v4, v17
+        vmv.s.x     v19, zero
+        vredsum.vs  v18, v8, v18
+        vmv.x.s     t0, v16
+        vredsum.vs  v19, v12, v19
+        vmv.x.s     t1, v17
+        sd          t0, (a0)
+        vmv.x.s     t2, v18
+        sd          t1, 8(a0)
+        vmv.x.s     t3, v19
+        sd          t2, 16(a0)
+        sd          t3, 24(a0)
+        ret
+endfunc
+#endif
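
A note on the vector layout: the loop keeps four independent widening
accumulators (v0, v4, v8, v12, each an m4 register group fed by m2-wide
32-bit sources through vwmacc.vv, with tail-undisturbed vsetvli so partial
final iterations leave accumulator tails intact), and defers all horizontal
work to the four vredsum.vs reductions after the loop. The figures in the
commit message appear to be checkasm benchmark numbers (lower is better);
assuming checkasm's usual command-line interface, something like the
following should reproduce them on RISC-V hardware:

    make tests/checkasm/checkasm
    ./tests/checkasm/checkasm --test=ac3dsp --bench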