1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
|
/*
* Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
* Copyright (c) 2019-2021 Sebastian Pop <spop@amazon.com>
* Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
/*
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src,
; const int16_t *filter,
; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit width or 16-bit width
; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
;----------------------------------------------------------------------------- */
function ff_hscale8to15_X8_neon, export=1
sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
1: ldr w8, [x5], #4 // filterPos[idx]
ldr w0, [x5], #4 // filterPos[idx + 1]
ldr w11, [x5], #4 // filterPos[idx + 2]
ldr w9, [x5], #4 // filterPos[idx + 3]
mov x16, x4 // filter0 = filter
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
st1 {v0.4H}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line
ret
endfunc
function ff_hscale8to15_X4_neon, export=1
// x0 SwsContext *c (not used)
// x1 int16_t *dst
// w2 int dstW
// x3 const uint8_t *src
// x4 const int16_t *filter
// x5 const int32_t *filterPos
// w6 int filterSize
// This function for filter sizes that are 4 mod 8. In other words, anything that's 0 mod 4 but not
// 0 mod 8. It also assumes that dstW is 0 mod 4.
lsl w7, w6, #1 // w7 = filterSize * 2
1:
ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
movi v16.2d, #0 // initialize accumulator for idx + 0
movi v17.2d, #0 // initialize accumulator for idx + 1
movi v18.2d, #0 // initialize accumulator for idx + 2
movi v19.2d, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter pointer for idx + 0
add x13, x4, x7 // filter pointer for idx + 1
add x8, x3, w8, uxtw // srcp + filterPos[idx + 0]
add x9, x3, w9, uxtw // srcp + filterPos[idx + 1]
add x14, x13, x7 // filter pointer for idx + 2
add x10, x3, w10, uxtw // srcp + filterPos[idx + 2]
add x11, x3, w11, uxtw // srcp + filterPos[idx + 3]
mov w0, w6 // copy filterSize to a temp register, w0
add x5, x5, #16 // advance the filterPos pointer
add x15, x14, x7 // filter pointer for idx + 3
mov x16, xzr // temp register for offsetting filter pointers
2:
// This section loops over 8-wide chunks of filter size
ldr d4, [x8], #8 // load 8 bytes from srcp for idx + 0
ldr q0, [x12, x16] // load 8 values, 16 bytes from filter for idx + 0
ldr d5, [x9], #8 // load 8 bytes from srcp for idx + 1
ldr q1, [x13, x16] // load 8 values, 16 bytes from filter for idx + 1
uxtl v4.8h, v4.8b // unsigned extend long for idx + 0
uxtl v5.8h, v5.8b // unsigned extend long for idx + 1
ldr d6, [x10], #8 // load 8 bytes from srcp for idx + 2
ldr q2, [x14, x16] // load 8 values, 16 bytes from filter for idx + 2
smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
ldr d7, [x11], #8 // load 8 bytes from srcp for idx + 3
ldr q3, [x15, x16] // load 8 values, 16 bytes from filter for idx + 3
sub w0, w0, #8 // decrement the remaining filterSize counter
smlal2 v16.4s, v0.8h, v4.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0
smlal2 v17.4s, v1.8h, v5.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1
uxtl v6.8h, v6.8b // unsigned extend long for idx + 2
uxtl v7.8h, v7.8b // unsigned extend long for idx + 3
smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
cmp w0, #8 // are there at least 8 more elements in filter to consume?
add x16, x16, #16 // advance the offsetting register for filter values
smlal2 v18.4s, v2.8h, v6.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2
smlal2 v19.4s, v3.8h, v7.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3
b.ge 2b // branch back to inner loop
// complete the remaining 4 filter elements
sub x17, x7, #8 // calculate the offset of the filter pointer for the remaining 4 elements
ldr s4, [x8] // load 4 bytes from srcp for idx + 0
ldr d0, [x12, x17] // load 4 values, 8 bytes from filter for idx + 0
ldr s5, [x9] // load 4 bytes from srcp for idx + 1
ldr d1, [x13, x17] // load 4 values, 8 bytes from filter for idx + 1
uxtl v4.8h, v4.8b // unsigned extend long for idx + 0
uxtl v5.8h, v5.8b // unsigned extend long for idx + 1
ldr s6, [x10] // load 4 bytes from srcp for idx + 2
ldr d2, [x14, x17] // load 4 values, 8 bytes from filter for idx + 2
smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
ldr s7, [x11] // load 4 bytes from srcp for idx + 3
ldr d3, [x15, x17] // load 4 values, 8 bytes from filter for idx + 3
uxtl v6.8h, v6.8b // unsigned extend long for idx + 2
uxtl v7.8h, v7.8b // unsigned extend long for idx + 3
addp v16.4s, v16.4s, v17.4s // horizontal pair adding for idx 0,1
smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
addp v18.4s, v18.4s, v19.4s // horizontal pair adding for idx 2,3
addp v16.4s, v16.4s, v18.4s // final horizontal pair adding producing one vector with results for idx = 0..3
subs w2, w2, #4 // dstW -= 4
sqshrn v0.4h, v16.4s, #7 // shift and clip the 2x16-bit final values
st1 {v0.4h}, [x1], #8 // write to destination idx 0..3
add x4, x4, x7, lsl #2 // filter += (filterSize*2) * 4
b.gt 1b // loop until end of line
ret
endfunc
function ff_hscale8to15_4_neon, export=1
// x0 SwsContext *c (not used)
// x1 int16_t *dst
// x2 int dstW
// x3 const uint8_t *src
// x4 const int16_t *filter
// x5 const int32_t *filterPos
// x6 int filterSize
// x8-x15 registers for gathering src data
// v0 madd accumulator 4S
// v1-v4 filter values (16 bit) 8H
// v5 madd accumulator 4S
// v16-v19 src values (8 bit) 8B
// This implementation has 4 sections:
// 1. Prefetch src data
// 2. Interleaved prefetching src data and madd
// 3. Complete madd
// 4. Complete remaining iterations when dstW % 8 != 0
sub sp, sp, #32 // allocate 32 bytes on the stack
cmp w2, #16 // if dstW <16, skip to the last block used for wrapping up
b.lt 2f
// load 8 values from filterPos to be used as offsets into src
ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
ldp w12, w13, [x5, #16] // filterPos[idx + 4], [idx + 5]
ldp w14, w15, [x5, #24] // filterPos[idx + 6], [idx + 7]
add x5, x5, #32 // advance filterPos
// gather random access data from src into contiguous memory
ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]][0..3]
ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]][0..3]
ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]][0..3]
ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]][0..3]
ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]][0..3]
ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]][0..3]
ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]][0..3]
ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]][0..3]
stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
1:
ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] // transpose 8 bytes each from src into 4 registers
// load 8 values from filterPos to be used as offsets into src
ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration
ldp w10, w11, [x5, #8] // filterPos[idx + 2][0..3], [idx + 3][0..3], next iteration
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
movi v0.2D, #0 // Clear madd accumulator for idx 0..3
movi v5.2D, #0 // Clear madd accumulator for idx 4..7
ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7
add x5, x5, #32 // advance filterPos
// interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
uxtl v16.8H, v16.8B // unsigned extend long, covert src data to 16-bit
uxtl v17.8H, v17.8B // unsigned extend long, covert src data to 16-bit
ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]], next iteration
ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]], next iteration
uxtl v18.8H, v18.8B // unsigned extend long, covert src data to 16-bit
uxtl v19.8H, v19.8B // unsigned extend long, covert src data to 16-bit
ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]], next iteration
ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]], next iteration
smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3
smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3
ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]], next iteration
ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]], next iteration
smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3
smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3
ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]], next iteration
ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]], next iteration
smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7
smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7
stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7
smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7
stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
sub w2, w2, #8 // dstW -= 8
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
sqshrn v1.4H, v5.4S, #7 // shift and clip the 2x16-bit final values
st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7]
cmp w2, #16 // continue on main loop if there are at least 16 iterations left
b.ge 1b
// last full iteration
ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp]
ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7
movi v0.2D, #0 // Clear madd accumulator for idx 0..3
movi v5.2D, #0 // Clear madd accumulator for idx 4..7
uxtl v16.8H, v16.8B // unsigned extend long, covert src data to 16-bit
uxtl v17.8H, v17.8B // unsigned extend long, covert src data to 16-bit
uxtl v18.8H, v18.8B // unsigned extend long, covert src data to 16-bit
uxtl v19.8H, v19.8B // unsigned extend long, covert src data to 16-bit
smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3
smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3
smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3
smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3
smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7
smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7
smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7
smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7
subs w2, w2, #8 // dstW -= 8
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
sqshrn v1.4H, v5.4S, #7 // shift and clip the 2x16-bit final values
st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7]
cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section
add sp, sp, #32 // clean up stack
ret
// finish up when dstW % 8 != 0 or dstW < 16
2:
// load src
ldr w8, [x5], #4 // filterPos[i]
add x9, x3, w8, UXTW // calculate the address for src load
ld1 {v5.S}[0], [x9] // src[filterPos[i] + 0..3]
// load filter
ld1 {v6.4H}, [x4], #8 // filter[filterSize * i + 0..3]
uxtl v5.8H, v5.8B // unsigned exten long, convert src data to 16-bit
smull v0.4S, v5.4H, v6.4H // 4 iterations of src[...] * filter[...]
addv s0, v0.4S // add up products of src and filter values
sqshrn h0, s0, #7 // shift and clip the 2x16-bit final value
st1 {v0.H}[0], [x1], #2 // dst[i] = ...
sub w2, w2, #1 // dstW--
cbnz w2, 2b
add sp, sp, #32 // clean up stack
ret
endfunc
|