lavc/lpc: R-V V compute_autocorr

The loop iterates over the length of the vector, not the order. This is to avoid reloading the same data for each lag value. However this means the loop only works if the maximum order is no larger than VLENB.

The loop is roughly equivalent to:

    for (size_t j = 0; j < lag; j++)
        autoc[j] = 1.;

    while (len > lag) {
        for (ptrdiff_t j = 0; j < lag; j++)
            autoc[j] += data[j] * *data;
        data++;
        len--;
    }

    while (len > 0) {
        for (ptrdiff_t j = 0; j < len; j++)
            autoc[j] += data[j] * *data;
        data++;
        len--;
    }

Since register pressure is only at 50%, it should be possible to implement the same loop for order up to 2xVLENB. But this is left for future work.

Performance numbers are all over the place from ~1.25x to ~4x speedups, but at least they are always noticeably better than nothing.
author: Rémi Denis-Courmont <remi@remlab.net> 2023-12-08 21:38:20 +0200
committer: Rémi Denis-Courmont <remi@remlab.net> 2023-12-16 11:18:01 +0200
commit: 918b3ed2d51c11a474b8a9ce7b784f7a20d9645b (patch)
tree: 7071fc81077765ce27a1ac96f392a8c7d85c5e1b
parent: 1a049595320094bb3a03a0893fe69990c2eb4964 (diff)
download: ffmpeg-918b3ed2d51c11a474b8a9ce7b784f7a20d9645b.tar.gz
2 files changed, 36 insertions, 1 deletions
diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c
index c16e5745f0..ab91956f2d 100644
--- a/libavcodec/riscv/lpc_init.c
+++ b/libavcodec/riscv/lpc_init.c
@@ -22,16 +22,22 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
 #include "libavcodec/lpc.h"
 
 void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *);
+void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *);
 
 av_cold void ff_lpc_init_riscv(LPCContext *c)
 {
 #if HAVE_RVV && (__riscv_xlen >= 64)
     int flags = av_get_cpu_flags();
 
-    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR))
+    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
         c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv;
+        /* The RVV autocorrelation loop only works for orders up to VLENB. */
+        if (ff_get_rv_vlenb() >= c->max_order)
+            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_rvv;
+    }
 #endif
 }
diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S
index f81a2392c1..d4ea515fee 100644
--- a/libavcodec/riscv/lpc_rvv.S
+++ b/libavcodec/riscv/lpc_rvv.S
@@ -85,4 +85,33 @@ func ff_lpc_apply_welch_window_rvv, zve64d
 
         ret
 endfunc
+
+func ff_lpc_compute_autocorr_rvv, zve64d
+        li        t0, 1        # integer 1, converted to the 1.0 seed below
+        vsetvli   zero, a2, e64, m8, ta, ma # vl = lag (a2), 64-bit elements, LMUL=8
+        fcvt.d.l  ft0, t0      # ft0 = 1.0
+        vle64.v   v0, (a0)     # v0 = first lag doubles at data (a0)
+        sh3add    a0, a2, a0   # data += lag
+        vfmv.v.f  v16, ft0     # v16[j] = 1.0: accumulator, one element per lag
+        bge       a2, a1, 2f   # if (lag >= len) go straight to the tail loop
+1:                             # main loop: window v0 holds lag samples
+        vfmv.f.s  ft0, v0      # ft0 = v0[0] = data[i]
+        fld       ft1, (a0)    # ft1 = data[lag + i]
+        vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j]
+        addi      a1, a1, -1   # len--
+        vfslide1down.vf v0, v0, ft1 # shift window down by one, append ft1 at the top
+        addi      a0, a0, 8    # advance by one double
+        bgt       a1, a2, 1b   # while (len > lag);
+2:                             # tail loop: fewer than lag + 1 samples remain
+        vfmv.f.s  ft0, v0      # ft0 = v0[0]
+        vsetvli   zero, a1, e64, m8, tu, ma # vl = len; tail-undisturbed keeps v16[j >= len]
+        vfmacc.vf v16, ft0, v0 # v16[j] += v0[0] * v0[j] for j < len
+        addi      a1, a1, -1   # len--
+        vslide1down.vx v0, v0, zero # shift window down by one, append zero bits
+        bnez      a1, 2b       # while (len > 0);
+
+        vsetvli   zero, a2, e64, m8, ta, ma # back to vl = lag for the store
+        vse64.v   v16, (a3)    # write lag results to a3; NOTE(review): confirm callers do not expect lag + 1
+        ret
+endfunc
 #endif
author	Rémi Denis-Courmont <remi@remlab.net>	2023-12-08 21:38:20 +0200
committer	Rémi Denis-Courmont <remi@remlab.net>	2023-12-16 11:18:01 +0200
commit	918b3ed2d51c11a474b8a9ce7b784f7a20d9645b (patch)
tree	7071fc81077765ce27a1ac96f392a8c7d85c5e1b
parent	1a049595320094bb3a03a0893fe69990c2eb4964 (diff)
download	ffmpeg-918b3ed2d51c11a474b8a9ce7b784f7a20d9645b.tar.gz