author     J. Dekker <jdek@itanimul.li>            2024-07-18 20:41:06 +0300
committer  Rémi Denis-Courmont <remi@remlab.net>   2024-07-21 22:39:45 +0300
commit     c9dc2ad09bd53ccb43e86216b13117c334efb6f0 (patch)
tree       9bff31ccec1ce32873f334e835b5e6594d556f67
parent     d15169c51fd5bc6ea768c83977315954bf583e57 (diff)
download   ffmpeg-c9dc2ad09bd53ccb43e86216b13117c334efb6f0.tar.gz
lavc/h264dsp: move R-V V idct_dc_add
No functional changes. This just moves the assembly code so that it can be referenced by other functions in h264idct_rvv.S with local jumps.

Edited-by: Rémi Denis-Courmont <remi@remlab.net>
-rw-r--r--  libavcodec/riscv/h264dsp_rvv.S   103
-rw-r--r--  libavcodec/riscv/h264idct_rvv.S  105
2 files changed, 105 insertions(+), 103 deletions(-)
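
For context on the commit message above: the "j" mnemonic used by the high-bit-depth entry points expands to a JAL instruction with a PC-relative immediate (roughly a ±1 MiB range), and the assembler can resolve it directly only when the target label is assembled in the same file. Moving the DC-add functions into h264idct_rvv.S therefore lets code in that file reach them without an external relocation. A minimal sketch of the tail-call pattern, mirroring the .irp block in this patch:

func ff_h264_idct4_dc_add_9_rvv, zve64x
        li      a5, (1 << 9) - 1             # a5 = clip ceiling for 9-bit pixels
        j       ff_h264_idct4_dc_add_16_rvv  # direct local jump into the shared 16-bit path
endfunc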
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 5c70709cf2..ed6a16a9c4 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -1,7 +1,6 @@
/*
* SPDX-License-Identifier: BSD-2-Clause
*
- * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
* Copyright © 2024 Rémi Denis-Courmont.
*
* Redistribution and use in source and binary forms, with or without
@@ -326,105 +325,3 @@ func ff_h264_h_loop_filter_luma_mbaff_8_rvv, zve32x
vssseg6e8.v v8, (a0), a1
ret
endfunc
-
-.macro idct_dc_add8 width
-func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
-.if \width == 8
- vsetivli zero, \width, e16, m1, ta, ma
-.else
- vsetivli zero, \width, e16, mf2, ta, ma
-.endif
- lh a3, 0(a1)
- addi a3, a3, 32
- srai a3, a3, 6
- sh zero, 0(a1)
-.if \width == 8
- vlse64.v v24, (a0), a2
- vsetvli t0, zero, e16, m8, ta, ma
-.else
- vlse32.v v24, (a0), a2
- vsetvli t0, zero, e16, m4, ta, ma
-.endif
- vzext.vf2 v0, v24
- vadd.vx v0, v0, a3
- vmax.vx v0, v0, zero
-.if \width == 8
- vsetvli zero, zero, e8, m4, ta, ma
-.else
- vsetvli zero, zero, e8, m2, ta, ma
-.endif
- vnclipu.wi v24, v0, 0
- vsetivli zero, \width, e8, m1, ta, ma
-.if \width == 8
- vsse64.v v24, (a0), a2
-.else
- vsse32.v v24, (a0), a2
-.endif
- ret
-endfunc
-.endm
-
-idct_dc_add8 4
-idct_dc_add8 8
-
-.macro idct_dc_add width
-func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
- vsetivli zero, \width, e16, m1, ta, ma
- lw a3, 0(a1)
- addi a3, a3, 32
- srai a3, a3, 6
- sw zero, 0(a1)
- add t4, a0, a2
- sh1add t5, a2, a0
- sh1add t6, a2, t4
-.if \width == 8
- sh2add t0, a2, a0
- sh2add t1, a2, t4
- sh2add t2, a2, t5
- sh2add t3, a2, t6
-.endif
- vle16.v v0, (a0)
- vle16.v v1, (t4)
- vle16.v v2, (t5)
- vle16.v v3, (t6)
-.if \width == 8
- vle16.v v4, (t0)
- vle16.v v5, (t1)
- vle16.v v6, (t2)
- vle16.v v7, (t3)
- vsetvli a6, zero, e16, m8, ta, ma
-.else
- vsetvli a6, zero, e16, m4, ta, ma
-.endif
- vadd.vx v0, v0, a3
- vmax.vx v0, v0, zero
- vmin.vx v0, v0, a5
- vsetivli zero, \width, e16, m1, ta, ma
- vse16.v v0, (a0)
- vse16.v v1, (t4)
- vse16.v v2, (t5)
- vse16.v v3, (t6)
-.if \width == 8
- vse16.v v4, (t0)
- vse16.v v5, (t1)
- vse16.v v6, (t2)
- vse16.v v7, (t3)
-.endif
- ret
-endfunc
-.endm
-
-idct_dc_add 4
-idct_dc_add 8
-
-.irp depth,9,10,12,14
-func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
- li a5, (1 << \depth) - 1
- j ff_h264_idct4_dc_add_16_rvv
-endfunc
-
-func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
- li a5, (1 << \depth) - 1
- j ff_h264_idct8_dc_add_16_rvv
-endfunc
-.endr
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 505f491308..37b27fc92a 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -1,4 +1,7 @@
/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
* Copyright © 2024 Rémi Denis-Courmont.
*
* Redistribution and use in source and binary forms, with or without
@@ -412,6 +415,108 @@ func ff_h264_idct8_add_\depth\()_rvv, zve32x
endfunc
.endr
+.macro idct_dc_add8 width
+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
+.if \width == 8
+ vsetivli zero, \width, e16, m1, ta, ma
+.else
+ vsetivli zero, \width, e16, mf2, ta, ma
+.endif
+ lh a3, 0(a1)
+ addi a3, a3, 32
+ srai a3, a3, 6
+ sh zero, 0(a1)
+.if \width == 8
+ vlse64.v v24, (a0), a2
+ vsetvli t0, zero, e16, m8, ta, ma
+.else
+ vlse32.v v24, (a0), a2
+ vsetvli t0, zero, e16, m4, ta, ma
+.endif
+ vzext.vf2 v0, v24
+ vadd.vx v0, v0, a3
+ vmax.vx v0, v0, zero
+.if \width == 8
+ vsetvli zero, zero, e8, m4, ta, ma
+.else
+ vsetvli zero, zero, e8, m2, ta, ma
+.endif
+ vnclipu.wi v24, v0, 0
+ vsetivli zero, \width, e8, m1, ta, ma
+.if \width == 8
+ vsse64.v v24, (a0), a2
+.else
+ vsse32.v v24, (a0), a2
+.endif
+ ret
+endfunc
+.endm
+
+idct_dc_add8 4
+idct_dc_add8 8
+
+.macro idct_dc_add width
+func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
+ vsetivli zero, \width, e16, m1, ta, ma
+ lw a3, 0(a1)
+ addi a3, a3, 32
+ srai a3, a3, 6
+ sw zero, 0(a1)
+ add t4, a0, a2
+ sh1add t5, a2, a0
+ sh1add t6, a2, t4
+.if \width == 8
+ sh2add t0, a2, a0
+ sh2add t1, a2, t4
+ sh2add t2, a2, t5
+ sh2add t3, a2, t6
+.endif
+ vle16.v v0, (a0)
+ vle16.v v1, (t4)
+ vle16.v v2, (t5)
+ vle16.v v3, (t6)
+.if \width == 8
+ vle16.v v4, (t0)
+ vle16.v v5, (t1)
+ vle16.v v6, (t2)
+ vle16.v v7, (t3)
+ vsetvli a6, zero, e16, m8, ta, ma
+.else
+ vsetvli a6, zero, e16, m4, ta, ma
+.endif
+ vadd.vx v0, v0, a3
+ vmax.vx v0, v0, zero
+ vmin.vx v0, v0, a5
+ vsetivli zero, \width, e16, m1, ta, ma
+ vse16.v v0, (a0)
+ vse16.v v1, (t4)
+ vse16.v v2, (t5)
+ vse16.v v3, (t6)
+.if \width == 8
+ vse16.v v4, (t0)
+ vse16.v v5, (t1)
+ vse16.v v6, (t2)
+ vse16.v v7, (t3)
+.endif
+ ret
+endfunc
+.endm
+
+idct_dc_add 4
+idct_dc_add 8
+
+.irp depth,9,10,12,14
+func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
+ li a5, (1 << \depth) - 1
+ j ff_h264_idct4_dc_add_16_rvv
+endfunc
+
+func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
+ li a5, (1 << \depth) - 1
+ j ff_h264_idct8_dc_add_16_rvv
+endfunc
+.endr
+
const ff_h264_scan8
.byte 014, 015, 024, 025, 016, 017, 026, 027
.byte 034, 035, 044, 045, 036, 037, 046, 047
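
As a hypothetical illustration of the follow-up this move enables (not part of this patch; the function name and the dispatch condition are invented), a routine in h264idct_rvv.S can now branch into the DC-add path with a jump the assembler resolves locally:

# Hypothetical sketch only: the entry point and the "DC-only" flag in a3
# are assumptions, not FFmpeg API. It shows the local-jump pattern the
# commit message refers to.
func ff_h264_idct4_add_any_8_rvv, zve64x
        bnez    a3, 1f                       # assumed flag: nonzero means full AC path
        j       ff_h264_idct4_dc_add_8_rvv   # local jump, resolved at assembly time
1:
        # full 4x4 inverse transform would go here
        ret
endfunc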