/*
 * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define VVC_MAX_PB_SIZE 128

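/* Bi-prediction averaging. The macro below emits both variants: plain
 * averaging (avg) and weighted averaging (w_avg). Roughly, per sample
 * (a sketch for orientation, bd = bit depth):
 *
 *     avg:   dst[x] = clip((src0[x] + src1[x] + (1 << (14 - bd))) >> (15 - bd))
 *     w_avg: dst[x] = clip((src0[x] * weight0 + src1[x] * weight1 + offset) >> shift)
 *
 * src0/src1 are int16_t intermediate predictions with a fixed stride of
 * VVC_MAX_PB_SIZE elements; dst holds the final 8/16-bit pixels. */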
.macro vvc_avg type, bit_depth

.macro vvc_\type\()_\bit_depth\()_2_4 tap
.if \tap == 2
        ldr             s0, [src0]
        ldr             s2, [src1]
.else
        ldr             d0, [src0]
        ldr             d2, [src1]
.endif

.ifc \type, avg
        saddl           v4.4s, v0.4h, v2.4h
        add             v4.4s, v4.4s, v16.4s
        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
.else
        mov             v4.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        sqshl           v4.4s, v4.4s, v22.4s
        sqxtn           v4.4h, v4.4s
.endif

.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
.if \tap == 2
        str             h4, [dst]
.else   // tap == 4
        str             s4, [dst]
.endif

.else   // bit_depth > 8
        smin            v4.4h, v4.4h, v17.4h
        smax            v4.4h, v4.4h, v18.4h
.if \tap == 2
        str             s4, [dst]
.else
        str             d4, [dst]
.endif
.endif
        add             src0, src0, x10
        add             src1, src1, x10
        add             dst, dst, dst_stride
.endm

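/* Register assignments, as inferred from the code below (a reading aid,
 * not an authoritative prototype):
 *   x0: pixel *dst              x1: ptrdiff_t dst_stride
 *   x2: const int16_t *src0     x3: const int16_t *src1
 *   w4: int width               w5: int height
 *   w_avg only: x6 packs weight0 (high half) and weight1 (low half),
 *               x7 packs offset (high half) and shift (low half) */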
function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
        src1            .req x3
        width           .req w4
        height          .req w5

        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        cmp             width, #8
.ifc \type, avg
        movi            v16.4s, #(1 << (14 - \bit_depth))
.else
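        // w_avg: dst = clip((src0 * weight0 + src1 * weight1 + offset) >> shift)
        // x6 and x7 each pack two of the 32-bit arguments, unpacked below.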
        lsr             x11, x6, #32        // weight0
        mov             w12, w6             // weight1
        lsr             x13, x7, #32        // offset
        mov             w14, w7             // shift

        dup             v19.8h, w11
        neg             w14, w14            // so we can use sqshl
        dup             v20.8h, w12
        dup             v16.4s, w13
        dup             v22.4s, w14
.endif // avg

.if \bit_depth >= 10
        // clip pixel
        mov             w6, #((1 << \bit_depth) - 1)
        movi            v18.8h, #0
        dup             v17.8h, w6
.endif

        b.eq            8f
        b.hi            16f
        cmp             width, #4
        b.eq            4f
2:      // width == 2
        subs            height, height, #1
        vvc_\type\()_\bit_depth\()_2_4 2
        b.ne            2b
        b               32f
4:      // width == 4
        subs            height, height, #1
        vvc_\type\()_\bit_depth\()_2_4 4
        b.ne            4b
        b               32f
8:      // width == 8
        ld1             {v0.8h}, [src0], x10
        ld1             {v2.8h}, [src1], x10
.ifc \type, avg
        saddl           v4.4s, v0.4h, v2.4h
        saddl2          v5.4s, v0.8h, v2.8h
        add             v4.4s, v4.4s, v16.4s
        add             v5.4s, v5.4s, v16.4s
        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
.else
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        smlal2          v5.4s, v0.8h, v19.8h
        smlal2          v5.4s, v2.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
        sqxtn           v4.4h, v4.4s
        sqxtn2          v4.8h, v5.4s
.endif
        subs            height, height, #1
.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        st1             {v4.8b}, [dst], dst_stride
.else
        smin            v4.8h, v4.8h, v17.8h
        smax            v4.8h, v4.8h, v18.8h
        st1             {v4.8h}, [dst], dst_stride
.endif
        b.ne            8b
        b               32f
16:     // width >= 16
        mov             w6, width
        mov             x7, src0
        mov             x8, src1
        mov             x9, dst
17:     // one row, 16 pixels per iteration
        ldp             q0, q1, [x7], #32
        ldp             q2, q3, [x8], #32
.ifc \type, avg
        saddl           v4.4s, v0.4h, v2.4h
        saddl2          v5.4s, v0.8h, v2.8h
        saddl           v6.4s, v1.4h, v3.4h
        saddl2          v7.4s, v1.8h, v3.8h
        add             v4.4s, v4.4s, v16.4s
        add             v5.4s, v5.4s, v16.4s
        add             v6.4s, v6.4s, v16.4s
        add             v7.4s, v7.4s, v16.4s
        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
        sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
        sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
.else   // w_avg
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        mov             v6.16b, v16.16b
        mov             v7.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        smlal2          v5.4s, v0.8h, v19.8h
        smlal2          v5.4s, v2.8h, v20.8h
        smlal           v6.4s, v1.4h, v19.4h
        smlal           v6.4s, v3.4h, v20.4h
        smlal2          v7.4s, v1.8h, v19.8h
        smlal2          v7.4s, v3.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
        sqshl           v6.4s, v6.4s, v22.4s
        sqshl           v7.4s, v7.4s, v22.4s
        sqxtn           v4.4h, v4.4s
        sqxtn           v6.4h, v6.4s
        sqxtn2          v4.8h, v5.4s
        sqxtn2          v6.8h, v7.4s
.endif  // avg
        subs            w6, w6, #16
.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        sqxtun2         v4.16b, v6.8h
        str             q4, [x9], #16
.else
        smin            v4.8h, v4.8h, v17.8h
        smin            v6.8h, v6.8h, v17.8h
        smax            v4.8h, v4.8h, v18.8h
        smax            v6.8h, v6.8h, v18.8h
        stp             q4, q6, [x9], #32
.endif
        b.ne            17b

        subs            height, height, #1
        add             src0, src0, x10
        add             src1, src1, x10
        add             dst, dst, dst_stride
        b.ne            16b
32:     // common exit
        ret

.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq width
.unreq height
endfunc
.endm

vvc_avg avg, 8
vvc_avg avg, 10
vvc_avg avg, 12
vvc_avg w_avg, 8
vvc_avg w_avg, 10
vvc_avg w_avg, 12

/* x0: int16_t *dst
 * x1: const uint8_t *_src
 * x2: ptrdiff_t _src_stride
 * w3: int height
 * x4: intptr_t mx
 * x5: intptr_t my
 * w6: int width
 */
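/* The 12-bit and _hv variants below share this argument layout. */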
function ff_vvc_dmvr_8_neon, export=1
        dst             .req x0
        src             .req x1
        src_stride      .req x2
        height          .req w3
        mx              .req x4
        my              .req x5
        width           .req w6

        sxtw            x6, w6
        mov             x7, #(VVC_MAX_PB_SIZE * 2 + 8)
        cmp             width, #16
        sub             src_stride, src_stride, x6
        cset            w15, gt                     // width > 16
        movi            v16.8h, #2                  // DMVR_SHIFT
        sub             x7, x7, x6, lsl #1
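        // x7 = VVC_MAX_PB_SIZE * 2 + 8 - 2 * width: the post-indexed st1 at
        // label 3 writes the last 8 bytes of the row and advances dst by x7,
        // so each row advances dst by exactly VVC_MAX_PB_SIZE int16_t entries.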
1:
        cbz             w15, 2f
        ldr             q0, [src], #16
        uxtl            v1.8h, v0.8b
        uxtl2           v2.8h, v0.16b
        ushl            v1.8h, v1.8h, v16.8h
        ushl            v2.8h, v2.8h, v16.8h
        stp             q1, q2, [dst], #32
        b               3f
2:
        ldr             d0, [src], #8
        uxtl            v1.8h, v0.8b
        ushl            v1.8h, v1.8h, v16.8h
        str             q1, [dst], #16
3:
        subs            height, height, #1
        ldr             s3, [src], #4
        uxtl            v4.8h, v3.8b
        ushl            v4.4h, v4.4h, v16.4h
        st1             {v4.4h}, [dst], x7

        add             src, src, src_stride
        b.ne            1b

        ret
endfunc

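/* 12-bit DMVR copy: dst = (src + 2) >> 2, i.e. a rounding shift down to the
 * 10-bit intermediate precision, computed in widened 32-bit lanes and
 * saturating-narrowed back to 16 bits. */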
function ff_vvc_dmvr_12_neon, export=1
        sxtw            x6, w6
        mov             x7, #(VVC_MAX_PB_SIZE * 2 + 8)
        cmp             width, #16
        sub             src_stride, src_stride, x6, lsl #1
        cset            w15, gt                     // width > 16
        movi            v16.8h, #2                  // offset4
        sub             x7, x7, x6, lsl #1
1:
        cbz             w15, 2f
        ldp             q0, q1, [src], #32
        uaddl           v2.4s, v0.4h, v16.4h
        uaddl2          v3.4s, v0.8h, v16.8h
        uaddl           v4.4s, v1.4h, v16.4h
        uaddl2          v5.4s, v1.8h, v16.8h
        ushr            v2.4s, v2.4s, #2
        ushr            v3.4s, v3.4s, #2
        ushr            v4.4s, v4.4s, #2
        ushr            v5.4s, v5.4s, #2
        uqxtn           v2.4h, v2.4s
        uqxtn2          v2.8h, v3.4s
        uqxtn           v4.4h, v4.4s
        uqxtn2          v4.8h, v5.4s

        stp             q2, q4, [dst], #32
        b               3f
2:
        ldr             q0, [src], #16
        uaddl           v2.4s, v0.4h, v16.4h
        uaddl2          v3.4s, v0.8h, v16.8h
        ushr            v2.4s, v2.4s, #2
        ushr            v3.4s, v3.4s, #2
        uqxtn           v2.4h, v2.4s
        uqxtn2          v2.8h, v3.4s
        str             q2, [dst], #16
3:
        subs            height, height, #1
        ldr             d0, [src], #8
        uaddl           v3.4s, v0.4h, v16.4h
        ushr            v3.4s, v3.4s, #2
        uqxtn           v3.4h, v3.4s
        st1             {v3.4h}, [dst], x7

        add             src, src, src_stride
        b.ne            1b

        ret
endfunc

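/* DMVR with fractional offsets: a separable 2-tap (bilinear) filter.
 * Each source row is filtered horizontally into one of two scratch rows on
 * the stack (tmp0/tmp1, swapped after every row); from the second row on,
 * the vertical filter combines the previous and current scratch rows into
 * dst, hence the loop runs height + 1 times. Roughly, per sample (a sketch;
 * names follow the comments below):
 *
 *     tmp[x] = (filter_x[0] * src[x] + filter_x[1] * src[x + 1] + offset1) >> shift1
 *     dst[x] = (filter_y[0] * prev[x] + filter_y[1] * tmp[x] + offset2) >> 4
 */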
function ff_vvc_dmvr_hv_8_neon, export=1
        tmp0            .req x7
        tmp1            .req x8

        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
        // We know the values are positive
        dup             v0.8h, w10                  // filter_x[0]
        dup             v1.8h, w11                  // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6
        movi            v30.8h, #(1 << (8 - 7))     // offset1
        movi            v31.8h, #8                  // offset2
        dup             v2.8h, w10                  // filter_y[0]
        dup             v3.8h, w11                  // filter_y[1]

        // The only valid widths are 8 + 4 and 16 + 4
        cmp             width, #16
        mov             w10, #0                     // filter_y disabled until a previous row exists
        add             height, height, #1
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        sub             src_stride, src_stride, x6
        cset            w15, gt                     // width > 16
1:
        mov             x12, tmp0
        mov             x13, tmp1
        mov             x14, dst
        cbz             w15, 2f

        // width > 16
        ldur            q5, [src, #1]
        ldr             q4, [src], #16
        uxtl            v7.8h, v5.8b
        uxtl2           v17.8h, v5.16b
        uxtl            v6.8h, v4.8b
        uxtl2           v16.8h, v4.16b
        mul             v6.8h, v6.8h, v0.8h
        mul             v16.8h, v16.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        mla             v16.8h, v17.8h, v1.8h
        add             v6.8h, v6.8h, v30.8h
        add             v16.8h, v16.8h, v30.8h
        ushr            v6.8h, v6.8h, #(8 - 6)
        ushr            v7.8h, v16.8h, #(8 - 6)
        stp             q6, q7, [x13], #32

        cbz             w10, 3f

        ldp             q16, q17, [x12], #32
        mul             v16.8h, v16.8h, v2.8h
        mul             v17.8h, v17.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        mla             v17.8h, v7.8h, v3.8h
        add             v16.8h, v16.8h, v31.8h
        add             v17.8h, v17.8h, v31.8h
        ushr            v16.8h, v16.8h, #4
        ushr            v17.8h, v17.8h, #4
        stp             q16, q17, [x14], #32
        b               3f
2:
        // width > 8
        ldur            d5, [src, #1]
        ldr             d4, [src], #8
        uxtl            v7.8h, v5.8b
        uxtl            v6.8h, v4.8b
        mul             v6.8h, v6.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        add             v6.8h, v6.8h, v30.8h
        ushr            v6.8h, v6.8h, #(8 - 6)
        str             q6, [x13], #16

        cbz             w10, 3f

        ldr             q16, [x12], #16
        mul             v16.8h, v16.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        add             v16.8h, v16.8h, v31.8h
        ushr            v16.8h, v16.8h, #4
        str             q16, [x14], #16
3:
        ldur            s5, [src, #1]
        ldr             s4, [src], #4
        uxtl            v7.8h, v5.8b
        uxtl            v6.8h, v4.8b
        mul             v6.4h, v6.4h, v0.4h
        mla             v6.4h, v7.4h, v1.4h
        add             v6.4h, v6.4h, v30.4h
        ushr            v6.4h, v6.4h, #(8 - 6)
        str             d6, [x13], #8

        cbz             w10, 4f

        ldr             d16, [x12], #8
        mul             v16.4h, v16.4h, v2.4h
        mla             v16.4h, v6.4h, v3.4h
        add             v16.4h, v16.4h, v31.4h
        ushr            v16.4h, v16.4h, #4
        str             d16, [x14], #8
4:
        subs            height, height, #1
        mov             w10, #1
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
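        // Three-XOR swap of tmp0 and tmp1: the row just produced becomes
        // the "previous" row for the next iteration.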
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret
endfunc

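/* The 10- and 12-bit variants share one body: each entry point sets the
 * depth-dependent shift (v29) and offset (v30), then joins the common
 * code at label 0. */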
function ff_vvc_dmvr_hv_12_neon, export=1
        movi            v29.4s, #(12 - 6)
        movi            v30.4s, #(1 << (12 - 7))    // offset1
        b               0f
endfunc

function ff_vvc_dmvr_hv_10_neon, export=1
        movi            v29.4s, #(10 - 6)
        movi            v30.4s, #(1 << (10 - 7))    // offset1
0:
        movi            v31.4s, #8                  // offset2
        neg             v29.4s, v29.4s

        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
        // We know the values are positive
        dup             v0.8h, w10                  // filter_x[0]
        dup             v1.8h, w11                  // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6
        dup             v2.8h, w10                  // filter_y[0]
        dup             v3.8h, w11                  // filter_y[1]

        // The only valid widths are 8 + 4 and 16 + 4
        cmp             width, #16
        mov             w10, #0                     // filter_y disabled until a previous row exists
        add             height, height, #1
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        sub             src_stride, src_stride, x6, lsl #1
        cset            w15, gt                     // width > 16
1:
        mov             x12, tmp0
        mov             x13, tmp1
        mov             x14, dst
        cbz             w15, 2f

        // width > 16
        add             x16, src, #2
        ldp             q6, q16, [src], #32
        ldp             q7, q17, [x16]
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umull           v18.4s, v16.4h, v0.4h
        umull2          v19.4s, v16.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h
        umlal           v18.4s, v17.4h, v1.4h
        umlal2          v19.4s, v17.8h, v1.8h

        add             v4.4s, v4.4s, v30.4s
        add             v5.4s, v5.4s, v30.4s
        add             v18.4s, v18.4s, v30.4s
        add             v19.4s, v19.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        ushl            v5.4s, v5.4s, v29.4s
        ushl            v18.4s, v18.4s, v29.4s
        ushl            v19.4s, v19.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        uqxtn           v7.4h, v18.4s
        uqxtn2          v7.8h, v19.4s
        stp             q6, q7, [x13], #32

        cbz             w10, 3f

        ldp             q4, q5, [x12], #32
        umull           v17.4s, v4.4h, v2.4h
        umull2          v18.4s, v4.8h, v2.8h
        umull           v19.4s, v5.4h, v2.4h
        umull2          v20.4s, v5.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        umlal           v19.4s, v7.4h, v3.4h
        umlal2          v20.4s, v7.8h, v3.8h
        add             v17.4s, v17.4s, v31.4s
        add             v18.4s, v18.4s, v31.4s
        add             v19.4s, v19.4s, v31.4s
        add             v20.4s, v20.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        ushr            v18.4s, v18.4s, #4
        ushr            v19.4s, v19.4s, #4
        ushr            v20.4s, v20.4s, #4
        uqxtn           v6.4h, v17.4s
        uqxtn2          v6.8h, v18.4s
        uqxtn           v7.4h, v19.4s
        uqxtn2          v7.8h, v20.4s
        stp             q6, q7, [x14], #32
        b               3f
2:
        // width > 8
        ldur            q7, [src, #2]
        ldr             q6, [src], #16
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h

        add             v4.4s, v4.4s, v30.4s
        add             v5.4s, v5.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        ushl            v5.4s, v5.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        str             q6, [x13], #16

        cbz             w10, 3f

        ldr             q16, [x12], #16
        umull           v17.4s, v16.4h, v2.4h
        umull2          v18.4s, v16.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        add             v17.4s, v17.4s, v31.4s
        add             v18.4s, v18.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        ushr            v18.4s, v18.4s, #4
        uqxtn           v16.4h, v17.4s
        uqxtn2          v16.8h, v18.4s
        str             q16, [x14], #16
3:
        ldur            d7, [src, #2]
        ldr             d6, [src], #8
        umull           v4.4s, v7.4h, v1.4h
        umlal           v4.4s, v6.4h, v0.4h
        add             v4.4s, v4.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        str             d6, [x13], #8

        cbz             w10, 4f

        ldr             d16, [x12], #8
        umull           v17.4s, v16.4h, v2.4h
        umlal           v17.4s, v6.4h, v3.4h
        add             v17.4s, v17.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        uqxtn           v16.4h, v17.4s
        str             d16, [x14], #8
4:
        subs            height, height, #1
        mov             w10, #1
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
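        // Three-XOR swap of tmp0/tmp1, as in the 8-bit version above.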
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret

.unreq dst
.unreq src
.unreq src_stride
.unreq height
.unreq mx
.unreq my
.unreq width
.unreq tmp0
.unreq tmp1
endfunc