aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRémi Denis-Courmont <remi@remlab.net>2024-07-02 22:03:07 +0300
committerRémi Denis-Courmont <remi@remlab.net>2024-07-14 11:39:35 +0300
commit4e0e872881ff57f1953517067cf82d7ea925b432 (patch)
treefc383921b54b45bd0c3328efb5da8255ba119d21
parentd059ea56638c093a34af2408e1a29825f6554ddd (diff)
downloadffmpeg-4e0e872881ff57f1953517067cf82d7ea925b432.tar.gz
lavc/h264dsp: R-V V high-depth h264_idct_add
T-Head C908 (cycles):
h264_idct4_add_9bpp_c:        248.2
h264_idct4_add_9bpp_rvv_i32:  128.7
h264_idct4_add_10bpp_c:       256.7
h264_idct4_add_10bpp_rvv_i32: 128.7
h264_idct4_add_12bpp_c:       252.5
h264_idct4_add_12bpp_rvv_i32: 129.7
h264_idct4_add_14bpp_c:       258.0
h264_idct4_add_14bpp_rvv_i32: 129.7
-rw-r--r--libavcodec/riscv/h264dsp_init.c19
-rw-r--r--libavcodec/riscv/h264idct_rvv.S63
2 files changed, 81 insertions, 1 deletions
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 88afec8df0..463ffe7202 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -52,6 +52,11 @@ void ff_h264_idct8_add4_8_rvv(uint8_t *dst, const int *blockoffset,
int16_t *block, int stride,
const uint8_t nnzc[5 * 8]);
+void ff_h264_idct_add_9_rvv(uint8_t *dst, int16_t *block, int stride); /* RVV 4x4 IDCT-and-add entry points for 9/10/12/14-bit content, implemented in h264idct_rvv.S */
+void ff_h264_idct_add_10_rvv(uint8_t *dst, int16_t *block, int stride); /* NOTE(review): the assembly reads block with 32-bit loads (vle32.v) and dst with 16-bit loads, */
+void ff_h264_idct_add_12_rvv(uint8_t *dst, int16_t *block, int stride); /* so the uint8_t / int16_t pointer types here presumably only mirror the shared */
+void ff_h264_idct_add_14_rvv(uint8_t *dst, int16_t *block, int stride); /* dsp->h264_idct_add signature - confirm against H264DSPContext */
+
extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
@@ -65,7 +70,9 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
# if HAVE_RVV
if (flags & AV_CPU_FLAG_RVV_I32) {
- if (bit_depth == 8 && ff_rv_vlen_least(128)) {
+ const bool zvl128b = ff_rv_vlen_least(128);
+
+ if (bit_depth == 8 && zvl128b) {
for (int i = 0; i < 4; i++) {
dsp->weight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].weight;
@@ -86,6 +93,16 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv;
# endif
}
+
+ if (bit_depth == 9 && zvl128b)
+ dsp->h264_idct_add = ff_h264_idct_add_9_rvv;
+ if (bit_depth == 10 && zvl128b)
+ dsp->h264_idct_add = ff_h264_idct_add_10_rvv;
+ if (bit_depth == 12 && zvl128b)
+ dsp->h264_idct_add = ff_h264_idct_add_12_rvv;
+ if (bit_depth == 14 && zvl128b)
+ dsp->h264_idct_add = ff_h264_idct_add_14_rvv;
+
dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
}
# endif
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 8d0edf1c0b..001ce0a0f4 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -105,6 +105,69 @@ func ff_h264_idct_add_8_rvv, zve32x
ret
endfunc
+func ff_h264_idct_add_16_rvv, zve32x /* 4x4 IDCT + add for 16-bit pixel storage: a0 = dst, a1 = block, a2 = stride (per the wrappers' C prototypes), a3 = upper clamp value set by the per-depth wrappers */
+ csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up), used by vssra.vi below */
+ vsetivli zero, 4, e32, m1, ta, ma /* VL = 4 elements of 32 bits */
+ addi t1, a1, 1 * 4 * 4 /* t1 = address of coefficient row 1 (4 x 4-byte elements per row) */
+ vle32.v v0, (a1) /* v0 = coefficient row 0 */
+ addi t2, a1, 2 * 4 * 4 /* t2 = address of coefficient row 2 */
+ vle32.v v1, (t1) /* v1 = coefficient row 1 */
+ addi t3, a1, 3 * 4 * 4 /* t3 = address of coefficient row 3 */
+ vle32.v v2, (t2) /* v2 = coefficient row 2 */
+ vle32.v v3, (t3) /* v3 = coefficient row 3 */
+ jal t0, ff_h264_idct4_rvv /* call helper with t0 as link register so ra stays intact; presumably one 1-D 4-point IDCT pass over v0-v3 (helper is outside this hunk) */
+ vse32.v v0, (a1) /* write the intermediate rows back to the block buffer ... */
+ vse32.v v1, (t1)
+ vse32.v v2, (t2)
+ vse32.v v3, (t3)
+ vlseg4e32.v v0, (a1) /* ... and reload them de-interleaved: v0-v3 now hold the columns, i.e. the 4x4 matrix transposed */
+ .equ offset, 0
+ .rept 512 / __riscv_xlen /* (512 / XLEN) stores of XLEN / 8 bytes each = 64 bytes = the whole 4x4 x 32-bit block */
+ sx zero, offset(a1) /* zero the coefficient block; sx is presumably an XLEN-wide store macro defined elsewhere */
+ .equ offset, offset + (__riscv_xlen / 8)
+ .endr
+ jal t0, ff_h264_idct4_rvv /* second helper call, now on the transposed data */
+ add t1, a0, a2 /* t1 = dst + stride (pixel row 1) */
+ vle16.v v4, (a0) /* v4 = 4 x 16-bit pixels of row 0 (EEW = 16 under the current e32 vtype) */
+ add t2, t1, a2 /* t2 = dst + 2 * stride (pixel row 2) */
+ vle16.v v5, (t1) /* v5 = pixel row 1 */
+ add t3, t2, a2 /* t3 = dst + 3 * stride (pixel row 3) */
+ vle16.v v6, (t2) /* v6 = pixel row 2 */
+ vle16.v v7, (t3) /* v7 = pixel row 3 */
+ .irp n,0,1,2,3
+ vssra.vi v\n, v\n, 6 /* arithmetic shift right by 6 with rounding per vxrm (set to round-to-nearest-up above) */
+ .endr
+ vsetvli zero, zero, e16, mf2, ta, ma /* SEW = 16 so the .wv form below takes 32-bit vd/vs2 and 16-bit vs1; VL unchanged */
+ vwaddu.wv v0, v0, v4 /* 32-bit residual += zero-extended 16-bit pixel, row 0 */
+ vwaddu.wv v1, v1, v5 /* row 1 */
+ vwaddu.wv v2, v2, v6 /* row 2 */
+ vwaddu.wv v3, v3, v7 /* row 3 */
+ vsetvli zero, zero, e32, m1, ta, ma /* back to 32-bit elements for clamping */
+ .irp n,0,1,2,3
+ vmax.vx v\n, v\n, zero /* clamp below at 0 (signed max with x0) */
+ .endr
+ .irp n,0,1,2,3
+ vmin.vx v\n, v\n, a3 /* clamp above at a3 (signed min) */
+ .endr
+ vsetvli zero, zero, e16, mf2, ta, ma /* SEW = 16 for narrowing and the 16-bit stores */
+ vncvt.x.x.w v4, v0 /* narrow 32 -> 16 bits; values are already within [0, a3], so lossless as long as a3 fits in 16 bits */
+ vncvt.x.x.w v5, v1
+ vncvt.x.x.w v6, v2
+ vncvt.x.x.w v7, v3
+ vse16.v v4, (a0) /* store the four reconstructed pixel rows to dst */
+ vse16.v v5, (t1)
+ vse16.v v6, (t2)
+ vse16.v v7, (t3)
+ ret
+endfunc
+
+.irp depth, 9, 10, 12, 14
+func ff_h264_idct_add_\depth\()_rvv, zve32x
+ li a3, (1 << \depth) - 1 /* per-depth entry point (9/10/12/14 bits): a3 = largest value representable in \depth bits, used as the upper clamp */
+ j ff_h264_idct_add_16_rvv /* tail-jump into the shared implementation; a0-a2 pass through untouched and its ret returns to our caller */
+endfunc
+.endr
+
.variant_cc ff_h264_idct8_rvv
func ff_h264_idct8_rvv, zve32x
vsra.vi v9, v7, 1