1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
|
/*
* Copyright © 2022-2023 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/riscv/asm.S"
/*
 * ff_ps_add_squares_rvv: dst[i] += lo(src[i])^2 + hi(src[i])^2
 * where lo/hi are the two 32-bit floats packed in each 64-bit source element.
 *
 * a0: dst, 32-bit floats, read then overwritten
 * a1: src, one 64-bit element per dst element
 * a2: element count
 */
func ff_ps_add_squares_rvv, zve64f
        li          t2, 32                  // shift that selects the high 32 bits
1:
        vsetvli     t1, a2, e32, m4, ta, ma // t1 = elements handled in this pass
        vle32.v     v16, (a0)               // current sums from dst
        vle64.v     v8, (a1)                // source pairs as 64-bit elements
        sh3add      a1, t1, a1              // src += 8 bytes per element
        vnsrl.wx    v24, v8, zero           // low 32 bits of every pair
        vnsrl.wx    v28, v8, t2             // high 32 bits of every pair
        sub         a2, a2, t1
        vfmacc.vv   v16, v24, v24           // sum += lo * lo
        vfmacc.vv   v16, v28, v28           // sum += hi * hi
        vse32.v     v16, (a0)
        sh2add      a0, t1, a0              // dst += 4 bytes per element
        bnez        a2, 1b

        ret
endfunc
/*
 * ff_ps_mul_pair_single_rvv: scale each pair of floats by one float.
 *   dst[i][0] = src0[i][0] * src1[i]
 *   dst[i][1] = src0[i][1] * src1[i]
 *
 * a0: dst (pairs of floats)
 * a1: src0 (pairs of floats)
 * a2: src1 (single floats)
 * a3: element count
 */
func ff_ps_mul_pair_single_rvv, zve32f
1:
        vsetvli     t1, a3, e32, m4, ta, ma // t1 = elements handled in this pass
        vle32.v     v16, (a2)               // one multiplier per pair
        sh2add      a2, t1, a2              // src1 += 4 bytes per element
        vlseg2e32.v v8, (a1)                // v8 = first floats, v12 = second floats
        sh3add      a1, t1, a1              // src0 += 8 bytes per element
        sub         a3, a3, t1
        vfmul.vv    v8, v8, v16
        vfmul.vv    v12, v12, v16
        vsseg2e32.v v8, (a0)                // store the pairs interleaved again
        sh3add      a0, t1, a0              // dst += 8 bytes per element
        bnez        a3, 1b

        ret
endfunc
/*
 * ff_ps_hybrid_analysis_rvv: accumulate 13 input pairs against per-output
 * filter coefficients, vectorised across the outputs.
 *
 * Register usage, as read from the code below:
 *   a0: output; one pair of floats is stored per filter, a3 pairs apart
 *   a1: input; the 13 pairs of floats in[0..12][0..1] are read
 *   a2: filters; each one occupies 8 * 2 * 4 bytes, of which the floats
 *       f[i][0..5][0..1] and f[i][6][0] are read
 *   a3: output stride, counted in pairs of floats
 *   a4: number of filters, i.e. number of outputs
 *
 * For every filter i:
 *   out[i * stride][0] = f[i][6][0] * in[6][0]
 *       + sum over j = 0..5 of f[i][j][0] * re0(j) - f[i][j][1] * re1(j)
 *   out[i * stride][1] = f[i][6][0] * in[6][1]
 *       + sum over j = 0..5 of f[i][j][0] * im0(j) + f[i][j][1] * im1(j)
 * with re0, re1, im0 and im1 as produced by the input macro.
 */
func ff_ps_hybrid_analysis_rvv, zve32f
/*
 * 26 FP registers are needed but only 20 of them are scratch registers
 * (ft0-ft11 and fa0-fa7), so fs0-fs5 are spilled to the stack.
 */
addi sp, sp, -48
.irp n, 0, 1, 2, 3, 4, 5
HWD fsd fs\n, (8 * \n)(sp)
NOHWD fsw fs\n, (4 * \n)(sp)
.endr
/*
 * Combine input pair j with input pair 12 - j, using fs4/fs5 as temporaries:
 *   fd0 = in[j][0] + in[12 - j][0]      fd1 = in[j][1] - in[12 - j][1]
 *   fd2 = in[j][1] + in[12 - j][1]      fd3 = in[j][0] - in[12 - j][0]
 */
.macro input, j, fd0, fd1, fd2, fd3
flw \fd0, (4 * ((\j * 2) + 0))(a1)
flw fs4, (4 * (((12 - \j) * 2) + 0))(a1)
flw \fd1, (4 * ((\j * 2) + 1))(a1)
fsub.s \fd3, \fd0, fs4
flw fs5, (4 * (((12 - \j) * 2) + 1))(a1)
fadd.s \fd2, \fd1, fs5
fadd.s \fd0, \fd0, fs4
fsub.s \fd1, \fd1, fs5
.endm
// Destination registers, in order: re0, re1, im0, im1
input 0, ft0, ft1, ft2, ft3
input 1, ft4, ft5, ft6, ft7
input 2, ft8, ft9, ft10, ft11
input 3, fa0, fa1, fa2, fa3
input 4, fa4, fa5, fa6, fa7
input 5, fs0, fs1, fs2, fs3
// The middle pair in[6] has no partner; from here on fs4/fs5 hold it.
flw fs4, (4 * ((6 * 2) + 0))(a1)
flw fs5, (4 * ((6 * 2) + 1))(a1)
add t2, a2, 6 * 2 * 4 // point to filter[i][6][0]
li t4, 8 * 2 * 4 // filter byte stride
slli a3, a3, 3 // output byte stride
1:
/*
 * Accumulate one pair of coefficients: vs0/vs1 hold f[i][j][0]/f[i][j][1]
 * for every output in the vector; v8 and v10 are the two running sums.
 *   v8  += fo0 * vs0 - fo1 * vs1
 *   v10 += fo2 * vs0 + fo3 * vs1
 * Defining the macro after the label emits no code.
 */
.macro filter, vs0, vs1, fo0, fo1, fo2, fo3
vfmacc.vf v8, \fo0, \vs0
vfmacc.vf v10, \fo2, \vs0
vfnmsac.vf v8, \fo1, \vs1
vfmacc.vf v10, \fo3, \vs1
.endm
vsetvli t0, a4, e32, m2, ta, ma
/*
 * The filter (a2) has 16 segments, of which 13 need to be extracted.
 * R-V V supports only up to 8 segments, so unrolling is unavoidable.
 */
// v28 = f[i][6][0]; it seeds both sums and is free for reuse afterwards.
vlse32.v v28, (t2), t4
addi t1, a2, 16
vfmul.vf v8, v28, fs4
// v16, v18, v20, v22 = f[i][0][0], f[i][0][1], f[i][1][0], f[i][1][1]
vlsseg4e32.v v16, (a2), t4
vfmul.vf v10, v28, fs5
filter v16, v18, ft0, ft1, ft2, ft3
// v24, v26, v28, v30 = f[i][2][0], f[i][2][1], f[i][3][0], f[i][3][1]
vlsseg4e32.v v24, (t1), t4
filter v20, v22, ft4, ft5, ft6, ft7
addi t1, a2, 32
filter v24, v26, ft8, ft9, ft10, ft11
// v16..v22 were consumed above; reload them with f[i][4][0..1], f[i][5][0..1]
vlsseg4e32.v v16, (t1), t4
sub a4, a4, t0
filter v28, v30, fa0, fa1, fa2, fa3
slli t1, t0, 3 + 1 + 2 // ctz(8 * 2 * 4)
filter v16, v18, fa4, fa5, fa6, fa7
// t0 = bytes of output covered by this pass
mul t0, t0, a3
filter v20, v22, fs0, fs1, fs2, fs3
// Advance both filter pointers past the filters handled in this pass.
add a2, a2, t1
add t2, t2, t1
// Store v8/v10 as interleaved pairs, one pair every a3 bytes.
vssseg2e32.v v8, (a0), a3
add a0, a0, t0
bnez a4, 1b
// Restore the spilled registers and release the stack frame.
.irp n, 5, 4, 3, 2, 1, 0
HWD fld fs\n, (8 * \n)(sp)
NOHWD flw fs\n, (4 * \n)(sp)
.endr
addi sp, sp, 48
ret
// Drop the helper macros so the names stay local to this function.
.purgem input
.purgem filter
endfunc
/*
 * ff_ps_hybrid_analysis_ileave_rvv: interleave two planes into float pairs.
 *   for i in [i0, 64), for j in [0, len):
 *       out[i][j][0] = L[0][j][i]
 *       out[i][j][1] = L[1][j][i]
 *
 * a0: out, rows of 32 * 2 floats
 * a1: L, two planes of 38 * 64 floats each
 * a2: i0, first column to process (expected to be at most 64)
 * a3: len
 * a4 is used as a scratch pointer into the second plane.
 */
func ff_ps_hybrid_analysis_ileave_rvv, zve32x /* no need for zve32f here */
        sh2add      a1, a2, a1              // a1 = &L[0][0][i]
        slli        t1, a2, 5 + 1 + 2       // i * (32 * 2 * 4), shift = ctz(32 * 2 * 4)
        li          t6, 64 * 4              // (uint8_t *)L[x][j+1][i] - L[x][j][i]
        li          t0, 38 * 64 * 4         // byte size of one plane of L
        add         a0, a0, t1              // a0 = &out[i][0][0]
        add         a4, a1, t0              // a4 = &L[1][0][i]
        addi        a2, a2, -64             // a2 = i - 64, counts up towards zero
        beqz        a2, 3f                  // the column loop is do-while: skip if empty
1:
        /* Per-column working copies of the length and the three pointers. */
        mv          t3, a3
        mv          t0, a0
        mv          t1, a1
        mv          t4, a4
        addi        a2, a2, 1
2:
        vsetvli     t5, t3, e32, m4, ta, ma // t5 = rows handled in this pass
        vlse32.v    v16, (t1), t6           // column i of plane 0
        vlse32.v    v20, (t4), t6           // column i of plane 1
        mul         t2, t5, t6              // input bytes consumed per plane
        sub         t3, t3, t5
        vsseg2e32.v v16, (t0)               // store as interleaved pairs
        add         t1, t1, t2
        add         t4, t4, t2
        sh3add      t0, t5, t0              // out += 8 bytes per pair
        bnez        t3, 2b

        /* Next column: one float further in L, one row further in out. */
        addi        a1, a1, 4
        addi        a4, a4, 4
        addi        a0, a0, 32 * 2 * 4
        bnez        a2, 1b
3:
        ret
endfunc
/*
 * ff_ps_hybrid_synthesis_deint_rvv: split interleaved float pairs into two
 * planes.
 *   for n in [0, len), for i in [i0, 64):
 *       out[0][n][i] = in[i][n][0]
 *       out[1][n][i] = in[i][n][1]
 *
 * a0: out, two planes of 38 * 64 floats each
 * a1: in, rows of 32 * 2 floats (one row per i)
 * a2: i0, first column to process
 * a3: len
 * a4 is used as scratch.
 *
 * The loop over n is a do-while, so len <= 0 is rejected up front; without
 * the guard a zero length would wrap the counter and keep storing.
 */
func ff_ps_hybrid_synthesis_deint_rvv, zve64x
        blez        a3, 3f                  // nothing to do for len <= 0
        slli        t0, a2, 5 + 1 + 2       // i * sizeof (float[32][2])
        sh2add      a0, a2, a0              // a0 = &out[0][0][i]
        add         a1, a1, t0              // a1 = &in[i][0][0]
        addi        t2, a2, -64
        li          t0, 38 * 64
        li          t1, 32 * 2 * 4
        li          t4, 8 - 16384 // offset from in[64][n][0] to in[0][n + 1][0]
        slli        t5, a2, 5 + 1 + 2 // and from in[0][n + 1][0] to in[i][n + 1][0]
        neg         t2, t2                  // t2 = 64 - i, columns per row
        li          t3, 32
        add         a4, t4, t5              // a4 = rewind applied to a1 after each row
        sh2add      t0, t0, a0              // t0 = &out[1][0][i]
1:
        mv          t4, t2                  // columns left in this row
        addi        a3, a3, -1
2:
        vsetvli     t5, t4, e32, m4, ta, ma
        vlse64.v    v16, (a1), t1 /* sizeof (float[32][2]) */
        sub         t4, t4, t5
        vnsrl.wx    v24, v16, zero          // low 32 bits: in[i][n][0]
        slli        t6, t5, 5 + 1 + 2       // input bytes consumed by this pass
        vnsrl.wx    v28, v16, t3 /* 32 */
        add         a1, a1, t6
        vse32.v     v24, (a0)
        sh2add      a0, t5, a0
        vse32.v     v28, (t0)
        sh2add      t0, t5, t0
        bnez        t4, 2b

        /*
         * a1 now points at in[64][n][0]; move it to in[i][n + 1][0]. Both
         * output pointers sit at the end of their row; skip the first i
         * floats of the next row.
         */
        add         a1, a1, a4
        sh2add      a0, a2, a0
        sh2add      t0, a2, t0
        bnez        a3, 1b
3:
        ret
endfunc
/*
 * ff_ps_stereo_interpolate_rvv: mix two streams of float pairs in place using
 * four coefficients that ramp linearly from one sample to the next.
 *
 * With hk(n) standing for h[k] + (n + 1) * h_step[k], each sample n becomes:
 *     l[n] = h0(n) * l[n] + h2(n) * r[n]
 *     r[n] = h1(n) * l[n] + h3(n) * r[n]      (using the previous l[n])
 * applied to both floats of the pair.
 *
 * a0: l, updated in place
 * a1: r, updated in place
 * a2: h, four floats are read
 * a3: h_step, four floats are read
 * a4: len
 */
func ff_ps_stereo_interpolate_rvv, zve32f, zbb
        vsetvli      t0, zero, e32, m2, ta, ma      // t0 = VLMAX

        /* Broadcast the four starting coefficients. */
        flw          ft0,   (a2)
        flw          ft1,  4(a2)
        flw          ft2,  8(a2)
        flw          ft3, 12(a2)
        vfmv.v.f     v16, ft0
        vfmv.v.f     v18, ft1
        vfmv.v.f     v20, ft2
        vfmv.v.f     v22, ft3

        /* v24[i] = (float)(i + 1) */
        vid.v        v24
        vadd.vi      v24, v24, 1
        vfcvt.f.xu.v v24, v24

        /* Per-sample increments, and VLMAX as a float for the block stride. */
        flw          ft4,   (a3)
        flw          ft5,  4(a3)
        flw          ft6,  8(a3)
        flw          ft7, 12(a3)
        fcvt.s.wu    ft8, t0

        /* Coefficient k for lane i starts at h[k] + (i + 1) * h_step[k]. */
        vfmacc.vf    v16, ft4, v24
        vfmacc.vf    v18, ft5, v24
        vfmacc.vf    v20, ft6, v24
        vfmacc.vf    v22, ft7, v24

        /* After a full block every coefficient moves on by VLMAX steps. */
        fmul.s       ft4, ft4, ft8
        fmul.s       ft5, ft5, ft8
        fmul.s       ft6, ft6, ft8
        fmul.s       ft7, ft7, ft8
1:
        min         t0, t0, a4              // full block, or whatever is left
        vsetvli     zero, t0, e32, m2, ta, ma
        vlseg2e32.v v0, (a0)                // v0 = first floats of l, v2 = second floats
        vlseg2e32.v v4, (a1)                // v4 = first floats of r, v6 = second floats
        sub         a4, a4, t0
        vfmul.vv    v8, v0, v16
        vfmul.vv    v12, v0, v18
        vfmul.vv    v10, v2, v16
        vfmul.vv    v14, v2, v18
        vfmacc.vv   v8, v4, v20
        vfmacc.vv   v12, v4, v22
        vfmacc.vv   v10, v6, v20
        vfmacc.vv   v14, v6, v22
        vsseg2e32.v v8, (a0)
        vsseg2e32.v v12, (a1)
        sh3add      a0, t0, a0
        sh3add      a1, t0, a1
        vfadd.vf    v16, v16, ft4           // coefficients += VLMAX * step
        vfadd.vf    v18, v18, ft5
        vfadd.vf    v20, v20, ft6
        vfadd.vf    v22, v22, ft7
        bnez        a4, 1b

        ret
endfunc
|