; /*
; * Provide AVX2 luma optical flow functions for VVC decoding
; * Copyright (c) 2024 Nuo Mi
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
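;
; Rough outline (informative): VVC bi-directional optical flow (BDOF) refines the
; bi-prediction of each 4x4 luma min-block. From the horizontal and vertical
; gradients of the two prediction signals it derives a motion refinement (vx, vy)
; per min-block and adds a per-sample correction before the final rounding,
; approximately
;     dst = clip((src0 + src1 + vx * (gh0 - gh1) + vy * (gv0 - gv1) + round) >> shift)
; This AVX2 path is written for block widths of 8 or 16 samples and heights of
; 8 or 16 lines (see the cmp wd/hd branches below).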
%include "libavutil/x86/x86util.asm"
%define MAX_PB_SIZE 128
%define SRC_STRIDE (MAX_PB_SIZE * 2)
%define SRC_PS 2 ; source pixel size, sizeof(int16_t)
%define BDOF_STACK_SIZE 10 ; (4 + 1) * 2: 4 lines per min-block row plus one extra line, times 2 for h and v
%define bdof_stack_offset(line) ((line) * 2 % BDOF_STACK_SIZE * mmsize)
%define SHIFT 6
%define SHIFT2 4
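; The stack area is used as a small ring buffer indexed by bdof_stack_offset():
; for each processed line, BDOF_PROF_GRAD stores gradient_h[0] - gradient_h[1]
; in the even ymm slot and gradient_v[0] - gradient_v[1] in the following odd
; slot, and APPLY_BDOF_MIN_BLOCK_LINE reads them back when applying the refinement.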
SECTION_RODATA 32
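; pd_15/pd_m15 clamp vx and vy to [-15, 15]; pb_shuffle_w8/pb_shuffle_w16 and
; pd_perm_w16 drive the per-min-block horizontal reductions in SUM_MIN_BLOCK_*.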
pd_15 times 8 dd 15
pd_m15 times 8 dd -15
pb_shuffle_w8 times 2 db 0, 1, 0xff, 0xff, 8, 9, 0xff, 0xff, 6, 7, 0xff, 0xff, 14, 15, 0xff, 0xff
pb_shuffle_w16 times 2 db 0, 1, 0xff, 0xff, 6, 7, 0xff, 0xff, 8, 9, 0xff, 0xff, 14, 15, 0xff, 0xff
pd_perm_w16 dd 0, 2, 1, 4, 3, 6, 5, 7
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
SECTION .text
INIT_YMM avx2
; dst = (src0 >> shift) - (src1 >> shift)
%macro DIFF 5 ; dst, src0, src1, shift, tmp
psraw %1, %2, %4
psraw %5, %3, %4
psubw %1, %5
%endmacro
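; horizontal gradient: (src[x + 1] >> SHIFT) - (src[x - 1] >> SHIFT); the caller
; pre-decrements src0q/src1q by SRC_PS, so [%2 + %3] and [%2 + %3 + 2 * SRC_PS]
; straddle the current sample.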
%macro LOAD_GRAD_H 4 ; dst, src, off, tmp
movu %1, [%2 + %3 + 2 * SRC_PS]
movu %4, [%2 + %3]
DIFF %1, %1, %4, SHIFT, %4
%endmacro
%macro SUM_GRAD 2 ;(dst/grad0, grad1)
paddw %1, %2
psraw %1, 1 ; shift3
%endmacro
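; per line: offset = vx * (gh0 - gh1) + vy * (gv0 - gv1); the sum
; src0[x] + src1[x] + offset is then scaled with pmulhrsw by m11 = pixel_max + 1
; (== 1 << bd), which amounts to a rounded right shift by (15 - bd), and clipped
; to [0, pixel_max] held in m9/m10.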
%macro APPLY_BDOF_MIN_BLOCK_LINE 5 ; dst, vx, vy, tmp, line_num
%define off bdof_stack_offset(%5)
pmullw %1, %2, [rsp + off + 0 * mmsize] ; vx * (gradient_h[0] - gradient_h[1])
pmullw %4, %3, [rsp + off + 1 * mmsize] ; vy * (gradient_v[0] - gradient_v[1])
paddw %1, [src0q + (%5 + 1) * SRC_STRIDE + SRC_PS]
paddw %4, [src1q + (%5 + 1) * SRC_STRIDE + SRC_PS]
paddsw %1, %4 ; src0[x] + src1[x] + bdof_offset
pmulhrsw %1, m11
CLIPW %1, m9, m10
%endmacro
%macro SAVE_8BPC 2 ; dst, src
packuswb m%2, m%2
vpermq m%2, m%2, q0020
cmp wd, 16
je %%w16
movq %1, xm%2
jmp %%wend
%%w16:
movu %1, xm%2
%%wend:
%endmacro
%macro SAVE_16BPC 2 ; dst, src
cmp wd, 16
je %%w16
movu %1, xm%2
jmp %%wend
%%w16:
movu %1, m%2
%%wend:
%endmacro
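; SAVE picks the store form at run time: packed bytes for 8 bpc output
; (pixel_max == 255), words otherwise, and 8 or 16 samples per row depending on wd.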
%macro SAVE 2 ; dst, src
cmp pixel_maxd, (1 << 8) - 1
jne %%save_16bpc
SAVE_8BPC %1, %2
jmp %%end
%%save_16bpc:
SAVE_16BPC %1, %2
%%end:
%endmacro
; [rsp + even * mmsize] are gradient_h[0] - gradient_h[1]
; [rsp + odd * mmsize] are gradient_v[0] - gradient_v[1]
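; applies the refinement to the 4 lines of one min-block row using the gradient
; differences saved on the stack, then stores them with SAVE; m9/m10 hold the
; clip bounds and m11 the pmulhrsw scale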
%macro APPLY_BDOF_MIN_BLOCK 4 ; block_num, vx, vy, bd
pxor m9, m9
movd xm10, pixel_maxd
vpbroadcastw m10, xm10
lea tmp0d, [pixel_maxd + 1]
movd xm11, tmp0d
vpbroadcastw m11, xm11 ;shift_4 for pmulhrsw
APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 0
SAVE [dstq + 0 * dsq], 6
APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 1
SAVE [dstq + 1 * dsq], 6
APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 2
SAVE [dstq + 2 * dsq], 6
APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 3
SAVE [dstq + ds3q], 6
%endmacro
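; SUM_MIN_BLOCK_* do the horizontal part of reducing the per-sample correlation
; terms to one value per 4x4 min-block; the remaining cross-register folding is
; the blend/shuffle sequence after the %%wend label in BDOF_PROF_GRAD.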
%macro SUM_MIN_BLOCK_W16 4 ; src/dst, shuffle, perm, tmp
pshufb %4, %1, %2
vpermd %4, %3, %4
paddw %1, %4
%endmacro
%macro SUM_MIN_BLOCK_W8 3 ; src/dst, shuffle, tmp
pshufb %3, %1, %2
paddw %1, %3
%endmacro
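; one line of the gradient pass: computes horizontal and vertical gradients of
; both predictions (three source lines per prediction stay cached in m0-m2 and
; m3-m5), saves their differences to the stack for the apply pass, and
; accumulates sgx2/sgy2/sgxdi/sgydi into m12 and sgxgy into m13; the top and
; bottom lines are added twice as vertical padding of the correlation window.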
%macro BDOF_PROF_GRAD 2 ; line_no, last_line
%assign i0 (%1 + 0) % 3
%assign j0 (%1 + 1) % 3
%assign k0 (%1 + 2) % 3
%assign i1 3 + (%1 + 0) % 3
%assign j1 3 + (%1 + 1) % 3
%assign k1 3 + (%1 + 2) % 3
; we cached src0 in m0 to m2
%define t0 m %+ i0
%define c0 m %+ j0
%define b0 m %+ k0
; we cached src1 in m3 to m5
%define t1 m %+ i1
%define c1 m %+ j1
%define b1 m %+ k1
%define ndiff t1
%define off bdof_stack_offset(%1)
movu b0, [src0q + (%1 + 2) * SRC_STRIDE + SRC_PS]
movu b1, [src1q + (%1 + 2) * SRC_STRIDE + SRC_PS]
; gradient_v[0], gradient_v[1]
DIFF m6, b0, t0, SHIFT, t0
DIFF m7, b1, t1, SHIFT, t1
; save gradient_v[0] - gradient_v[1]
psubw m10, m6, m7
mova [rsp + off + mmsize], m10
; gradient_h[0], gradient_h[1]
LOAD_GRAD_H m8, src0q, (%1 + 1) * SRC_STRIDE, t0
LOAD_GRAD_H m9, src1q, (%1 + 1) * SRC_STRIDE, t1
; save gradient_h[0] - gradient_h[1]
psubw m11, m8, m9
mova [rsp + off], m11
SUM_GRAD m8, m9 ; temph
SUM_GRAD m6, m7 ; tempv
DIFF ndiff, c1, c0, SHIFT2, t0 ; -diff
psignw m7, ndiff, m8 ; sgxdi
psignw m9, ndiff, m6 ; sgydi
psignw m10, m8, m6 ; sgxgy
pabsw m6, m6 ; sgy2
pabsw m8, m8 ; sgx2
; use t0, t1 as temporary buffers
cmp wd, 16
je %%w16
mova t0, [pb_shuffle_w8]
SUM_MIN_BLOCK_W8 m6, t0, m11
SUM_MIN_BLOCK_W8 m7, t0, m11
SUM_MIN_BLOCK_W8 m8, t0, m11
SUM_MIN_BLOCK_W8 m9, t0, m11
SUM_MIN_BLOCK_W8 m10, t0, m11
jmp %%wend
%%w16:
mova t0, [pb_shuffle_w16]
mova t1, [pd_perm_w16]
SUM_MIN_BLOCK_W16 m6, t0, t1, m11
SUM_MIN_BLOCK_W16 m7, t0, t1, m11
SUM_MIN_BLOCK_W16 m8, t0, t1, m11
SUM_MIN_BLOCK_W16 m9, t0, t1, m11
SUM_MIN_BLOCK_W16 m10, t0, t1, m11
%%wend:
vpblendd m11, m8, m7, 10101010b
vpblendd m7, m8, m7, 01010101b
pshufd m7, m7, q2301
paddw m8, m7, m11 ;4 x (2sgx2, 2sgxdi)
vpblendd m11, m6, m9, 10101010b
vpblendd m9, m6, m9, 01010101b
pshufd m9, m9, q2301
paddw m6, m9, m11 ;4 x (2sgy2, 2sgydi)
vpblendw m11, m8, m6, 10101010b
vpblendw m6, m8, m6, 01010101b
pshuflw m6, m6, q2301
pshufhw m6, m6, q2301
paddw m8, m6, m11 ; 4 x (4sgx2, 4sgy2, 4sgxdi, 4sgydi)
%if (%1) == 0 || (%2)
; pad for top and bottom
paddw m8, m8
paddw m10, m10
%endif
paddw m12, m8
paddw m13, m10
%endmacro
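; floor(log2()) helpers: the 5-argument variant below is not instantiated in
; this file; the 1- and 2-argument overloads extract each word, apply scalar bsr
; and insert the result back (zero-valued lanes are don't-care here, since their
; numerators are zeroed with psignd in BDOF_VX_VY before the shift).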
%macro LOG2 5 ; log_sum, src, cmp, shift, tmp
pcmpgtw %5, %2, %3
pand %5, %4
paddw %1, %5
psrlw %2, %5
psrlw %4, 1
psrlw %3, %4
%endmacro
%macro LOG2 2 ; dst/src, offset
pextrw tmp0d, xm%1, %2
bsr tmp0d, tmp0d
pinsrw xm%1, tmp0d, %2
%endmacro
%macro LOG2 1 ; dst/src
LOG2 %1, 0
LOG2 %1, 1
LOG2 %1, 2
LOG2 %1, 3
LOG2 %1, 4
LOG2 %1, 5
LOG2 %1, 6
LOG2 %1, 7
%endmacro
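; derive the per-min-block motion refinement: vx comes from sgxdi and
; floor(log2(sgx2)), vy from sgydi, sgxgy and floor(log2(sgy2)); both are clipped
; to [-15, 15] and broadcast to every word of the min-block so
; APPLY_BDOF_MIN_BLOCK can use plain pmullw.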
; %1: 4 (sgx2, sgy2, sgxdi, sgydi)
; %2: 4 (4sgxgy)
%macro BDOF_VX_VY 2 ;
pshufd m6, m%1, q0032
punpckldq m%1, m6
vextracti128 xm7, m%1, 1
punpcklqdq m8, m%1, m7 ; 4 (sgx2, sgy2)
punpckhqdq m9, m%1, m7 ; 4 (sgxdi, sgydi)
mova m10, m8
LOG2 10 ; 4 (log2(sgx2), log2(sgy2))
; Promote to dword since vpsrlvw is AVX-512 only
pmovsxwd m8, xm8
pmovsxwd m9, xm9
pmovsxwd m10, xm10
pslld m9, 2 ; 4 (log2(sgx2) << 2, log2(sgy2) << 2)
psignd m11, m9, m8
vpsravd m11, m11, m10
CLIPD m11, [pd_m15], [pd_15] ; 4 (vx, junk)
pshuflw m%1, m11, q0000
pshufhw m%1, m%1, q0000 ; 4 (2junk, 2vx)
psllq m6, m%2, 32
paddw m%2, m6
pmaddwd m%2, m%1 ; 4 (junk, vx * sgxgy)
psrad m%2, 1
psubd m9, m%2 ; 4 (junk, (sgydi << 2) - (vx * sgxgy >> 1))
psignd m9, m8
vpsravd m%2, m9, m10
CLIPD m%2, [pd_m15], [pd_15] ; 4 (junk, vy)
pshuflw m%2, m%2, q2222
pshufhw m%2, m%2, q2222 ; 4 (4vy)
%endmacro
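; one row of 4x4 min-blocks (4 output lines): gathers the gradient sums of the
; covering lines (the line sums shared with the previous row are carried over in
; m12/m13 between invocations; the extra bottom line is skipped when last_block
; is set), derives (vx, vy), applies the refinement and advances dstq by 4 strides.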
%macro BDOF_MINI_BLOCKS 2 ; (block_num, last_block)
%if (%1) == 0
movu m0, [src0q + 0 * SRC_STRIDE + SRC_PS]
movu m1, [src0q + 1 * SRC_STRIDE + SRC_PS]
movu m3, [src1q + 0 * SRC_STRIDE + SRC_PS]
movu m4, [src1q + 1 * SRC_STRIDE + SRC_PS]
pxor m12, m12
pxor m13, m13
BDOF_PROF_GRAD 0, 0
%endif
mova m14, m12
mova m15, m13
pxor m12, m12
pxor m13, m13
BDOF_PROF_GRAD %1 * 4 + 1, 0
BDOF_PROF_GRAD %1 * 4 + 2, 0
paddw m14, m12
paddw m15, m13
pxor m12, m12
pxor m13, m13
BDOF_PROF_GRAD %1 * 4 + 3, %2
%if (%2) == 0
BDOF_PROF_GRAD %1 * 4 + 4, 0
%endif
paddw m14, m12
paddw m15, m13
BDOF_VX_VY 14, 15
APPLY_BDOF_MIN_BLOCK %1, m14, m15, bd
lea dstq, [dstq + 4 * dsq]
%endmacro
;void ff_vvc_apply_bdof(uint8_t *dst, const ptrdiff_t dst_stride, int16_t *src0, int16_t *src1,
;                       const int w, const int h, const int pixel_max)
%macro BDOF_AVX2 0
cglobal vvc_apply_bdof, 7, 10, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3, tmp0, tmp1
lea ds3q, [dsq * 3]
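; step src0/src1 back by one line and one sample so that the
; (line + 1) * SRC_STRIDE + SRC_PS style offsets in the macros address the
; current sample and its +/-1 neighbours with plain displacements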
sub src0q, SRC_STRIDE + SRC_PS
sub src1q, SRC_STRIDE + SRC_PS
BDOF_MINI_BLOCKS 0, 0
cmp hd, 16
je .h16
BDOF_MINI_BLOCKS 1, 1
jmp .end
.h16:
BDOF_MINI_BLOCKS 1, 0
BDOF_MINI_BLOCKS 2, 0
BDOF_MINI_BLOCKS 3, 1
.end:
RET
%endmacro
%macro VVC_OF_AVX2 0
BDOF_AVX2
%endmacro
VVC_OF_AVX2
%endif ; HAVE_AVX2_EXTERNAL
%endif ; ARCH_X86_64