lavc/h264dsp: R-V V 8-bit MBAFF loop filter

Performance is (unfortunately) the same as with non-MBAFF, since the hardware under test does not short-circuit vector tail calculations. (IMO, a generic solution or work-around should be agreed on, rather than bespoke approaches all over the place.)
author: Rémi Denis-Courmont <remi@remlab.net> 2024-06-30 11:24:43 +0300
committer: Rémi Denis-Courmont <remi@remlab.net> 2024-07-04 19:57:42 +0300
commit: e2af5904f0fa86dbb2b7755ab579f54d14523e8e (patch)
tree: 98271578611e91a6676bf6939dfac642647581c7 /libavcodec/riscv/h264dsp_rvv.S
parent: 5a6e333fc7ac514255bcd6b424924d92ef558bf0 (diff)
download: ffmpeg-e2af5904f0fa86dbb2b7755ab579f54d14523e8e.tar.gz
1 files changed, 14 insertions, 0 deletions
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 77bf40db1f..96a8a0a8a3 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -138,3 +138,17 @@ func ff_h264_h_loop_filter_luma_8_rvv, zve32x
         vssseg6e8.v v8, (a0), a1
         ret
 endfunc
+
+func ff_h264_h_loop_filter_luma_mbaff_8_rvv, zve32x
+        vsetivli    zero, 4, e16, mf2, ta, ma    # vl = 4 elements, 16 bits each
+        vle8.v      v4, (a4)    # read 4 bytes at a4 (presumably the tc0 table; confirm against the C prototype)
+        li          t0, 0x0101    # multiplier that copies a byte into both halves of a 16-bit element
+        vzext.vf2   v6, v4    # zero-extend each of the 4 bytes to 16 bits
+        addi        a0, a0, -3    # back up 3 bytes so each 6-byte segment straddles the incoming pointer
+        vmul.vx     v6, v6, t0    # tc_orig: 0x00XX * 0x0101 = 0xXXXX, so every input byte now appears twice
+        vsetivli    zero, 8, e8, m1, ta, ma    # vl = 8 elements, 8 bits each
+        vlsseg6e8.v v8, (a0), a1    # 8 segments of 6 bytes, a1 bytes apart; field k of every segment goes to v(8+k)
+        jal         t0, ff_h264_loop_filter_luma_8_rvv    # link register is t0 (0x0101 is dead by now), so the call leaves ra alone; helper presumably filters v8..v13 in place
+        vssseg6e8.v v8, (a0), a1    # store v8..v13 back with the same base and stride operands as the load
+        ret
+endfunc
author	Rémi Denis-Courmont <remi@remlab.net>	2024-06-30 11:24:43 +0300
committer	Rémi Denis-Courmont <remi@remlab.net>	2024-07-04 19:57:42 +0300
commit	e2af5904f0fa86dbb2b7755ab579f54d14523e8e (patch)
tree	98271578611e91a6676bf6939dfac642647581c7 /libavcodec/riscv/h264dsp_rvv.S
parent	5a6e333fc7ac514255bcd6b424924d92ef558bf0 (diff)
download	ffmpeg-e2af5904f0fa86dbb2b7755ab579f54d14523e8e.tar.gz