; libavcodec/x86/vvc/vvc_mc.asm

; /*
; * Provide SIMD MC functions for VVC decoding
; *
; * Copyright © 2021, VideoLAN and dav1d authors
; * Copyright © 2021, Two Orioles, LLC
; * All rights reserved.
; *
; * Copyright (c) 2023-2024 Nuo Mi
; * Copyright (c) 2023-2024 Wu Jianhua
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */

%include "libavutil/x86/x86util.asm"

; MAX_PB_SIZE is only used, in the code visible here, to derive AVG_SRC_STRIDE.
; NOTE(review): presumably the largest prediction-block dimension in samples -
; confirm against the C side.
%define MAX_PB_SIZE 128

SECTION_RODATA 32

; Every constant below is one 16-bit value stored twice, i.e. a single dword,
; so it can be splatted with vpbroadcastd or read as a 32-bit scalar.
; pw_0   : zero dword, memory operand of the cmovl clamp in the w_avg setup
; pw_1   : not referenced by the code visible in this file
; pw_4   : upper bound of the left shift applied to pw_256 in the avg setup
; pw_12  : not referenced by the code visible in this file
; pw_256 : base pmulhrsw multiplier in the avg setup
pw_0    times 2 dw   0
pw_1    times 2 dw   1
pw_4    times 2 dw   4
pw_12   times 2 dw  12
pw_256  times 2 dw 256

; AVG_JMP_TABLE name, bpc, cpu, width0, width1, ...
;
; Emits one 32-bit entry per listed width. Each entry is the distance from the
; table's biased base symbol to the .w<width> label inside the function
; <private_prefix>_vvc_<name>_<bpc>bpc_<cpu>.
;
; The public symbol <name>_<bpc>_<cpu>_table is not the address of the first
; entry: it is biased backwards by 2 * <first width> bytes. The functions
; index it with tzcnt(width) * 4, so with a first width of 2 (tzcnt = 1,
; 1 * 4 = 2 * 2) that index lands exactly on the first entry.
; Because the entries are stored relative to the same biased symbol, the
; functions turn an entry back into an address by adding the symbol to it.
%macro AVG_JMP_TABLE 3-*
    ; biased table symbol used by the functions (%4 is the first width here)
    %xdefine %1_%2_%3_table (%%table - 2*%4)
    %xdefine %%base %1_%2_%3_table
    ; mangled name of the function whose local labels are referenced
    %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
    %%table:
    ; one entry per width argument; %rotate moves the next width into %4
    %rep %0 - 3
        dd %%prefix %+ .w%4 - %%base
        %rotate 1
    %endrep
%endmacro

; Jump tables for the four AVX2 entry points (avg and w_avg, 8 and 16 bpc),
; each covering the widths 2, 4, 8, 16, 32, 64 and 128.
; Emitted for x86-64 only, like the functions that reference them.
%if ARCH_X86_64
AVG_JMP_TABLE    avg,  8, avx2,                2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE    avg, 16, avx2,                2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE  w_avg,  8, avx2,                2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE  w_avg, 16, avx2,                2, 4, 8, 16, 32, 64, 128
%endif

SECTION .text

; AVG_W16_FN bpc, op, count
;
; Processes two rows of <count> 16-sample columns each. For every column the
; first row is loaded, combined with <op> and stored, then the second row.
;   %1  bpc    8 or 16, forwarded to AVG_SAVE_W16
;   %2  op     parameterless macro combining m0 and m1 into m0 (AVG or W_AVG)
;   %3  count  number of 16-sample columns (block width / 16)
;
; The row and column counters are macro-local (%%) so that no generically
; named single-line macro is left defined once this macro has expanded.
%macro AVG_W16_FN 3 ; bpc, op, count
    %assign %%col 0
    %rep %3
        %assign %%row 0
        %rep 2
            AVG_LOAD_W16        %%row, %%col
            %2
            AVG_SAVE_W16        %1, %%row, %%col
            %assign %%row %%row+1
        %endrep
        %assign %%col %%col+1
    %endrep
%endmacro

; AVG_FN bpc, op
;
; Common body of the avg and w_avg kernels. Jumps to the .w<width> loop whose
; address the caller placed in wq; every loop iteration loads two rows from
; src0q/src1q into m0/m1, runs <op> on them and stores two rows to dstq.
;   %1  bpc  8 or 16, forwarded to the AVG_SAVE_* macros
;   %2  op   parameterless macro combining m0 and m1 into m0 (AVG or W_AVG)
; Uses src0q, src1q, dstq, strideq and hd; the vector constants expected by
; <op> must already be set up by the caller.
; All loops leave through .ret once AVG_LOOP_END has counted hd down to zero.
%macro AVG_FN 2 ; bpc, op
    jmp                  wq

.w2:
    ; two rows of 2 samples: one dword per row
    movd                xm0, [src0q]
    pinsrd              xm0, [src0q + AVG_SRC_STRIDE], 1
    movd                xm1, [src1q]
    pinsrd              xm1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W2         %1
    AVG_LOOP_END        .w2

.w4:
    ; two rows of 4 samples: one qword per row
    movq                xm0, [src0q]
    pinsrq              xm0, [src0q + AVG_SRC_STRIDE], 1
    movq                xm1, [src1q]
    pinsrq              xm1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W4         %1
    AVG_LOOP_END        .w4

.w8:
    ; two rows of 8 samples: row 0 in the low lane, row 1 in the high lane.
    ; The low lane is written with a plain 128-bit load so the instruction
    ; does not depend on the previous contents of the destination register.
    movu                xm0, [src0q]
    vinserti128          m0, m0, [src0q + AVG_SRC_STRIDE], 1
    movu                xm1, [src1q]
    vinserti128          m1, m1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W8         %1
    AVG_LOOP_END        .w8

.w16:
    AVG_W16_FN          %1, %2, 1
    AVG_LOOP_END        .w16

.w32:
    AVG_W16_FN          %1, %2, 2
    AVG_LOOP_END        .w32

.w64:
    AVG_W16_FN          %1, %2, 4
    AVG_LOOP_END        .w64

.w128:
    AVG_W16_FN          %1, %2, 8
    AVG_LOOP_END        .w128

.ret:
    RET
%endmacro

; AVG
; Unweighted combination of the 16-bit samples in m0 and m1; result in m0.
; In:  m0, m1 = samples from src0 / src1
;      m2     = pmulhrsw multiplier (rounding right shift folded into a multiply)
;      m3, m4 = lower / upper bound handed to CLIPW
%macro AVG   0
    paddsw               m0, m1             ; signed saturating sum
    pmulhrsw             m0, m2             ; rounded high multiply by m2
    ; CLIPW is provided by the included x86util.asm; presumably clamps m0
    ; to [m3, m4] - confirm there
    CLIPW                m0, m3, m4
%endmacro

; W_AVG
; Weighted combination of the 16-bit samples in m0 (src0) and m1 (src1);
; result in m0, m5 is used as scratch.
; In:  m3     = weight pairs, multiplied against the interleaved samples
;      m4     = 32-bit offset added before the shift
;      xm2    = arithmetic right-shift count
;      m6, m7 = lower / upper bound handed to CLIPW
; The upper and lower halves of each lane are independent until the final
; pack, so the two chains are issued side by side.
%macro W_AVG 0
    ; interleave src0/src1 words; m5 must be formed before m0 is overwritten
    punpckhwd            m5, m0, m1
    punpcklwd            m0, m0, m1

    ; sample pair * weight pair, summed into one dword per pixel
    pmaddwd              m5, m3
    pmaddwd              m0, m3

    paddd                m5, m4
    paddd                m0, m4

    psrad                m5, xm2
    psrad                m0, xm2

    ; back to 16 bits with signed saturation, low half first
    packssdw             m0, m5
    CLIPW                m0, m6, m7
%endmacro

; AVG_LOAD_W16 line, offset
; Unaligned load of one full vector from each source: m0 <- src0, m1 <- src1.
;   %1  line    row index, scaled by AVG_SRC_STRIDE bytes
;   %2  offset  column index, scaled by 32 bytes
%macro AVG_LOAD_W16 2  ; line, offset
    movu               m0, [src0q + %1 * AVG_SRC_STRIDE + %2 * 32]
    movu               m1, [src1q + %1 * AVG_SRC_STRIDE + %2 * 32]
%endmacro

; AVG_SAVE_W2 bpc
; Writes the two 2-pixel rows held in xm0 to [dstq] and [dstq + strideq].
%macro AVG_SAVE_W2 1 ;bpc
    %if %1 != 16
        ; narrow the words to bytes with unsigned saturation: one word per row
        packuswb           m0, m0
        pextrw           [dstq], xm0, 0
        pextrw [dstq + strideq], xm0, 1
    %else
        ; pixels stay 16 bits wide: one dword per row
        pextrd           [dstq], xm0, 0
        pextrd [dstq + strideq], xm0, 1
    %endif
%endmacro

; AVG_SAVE_W4 bpc
; Writes the two 4-pixel rows held in xm0 to [dstq] and [dstq + strideq].
%macro AVG_SAVE_W4 1 ;bpc
    %if %1 != 16
        ; narrow the words to bytes with unsigned saturation: one dword per row
        packuswb           m0, m0
        pextrd           [dstq], xm0, 0
        pextrd [dstq + strideq], xm0, 1
    %else
        ; pixels stay 16 bits wide: one qword per row
        pextrq           [dstq], xm0, 0
        pextrq [dstq + strideq], xm0, 1
    %endif
%endmacro

; AVG_SAVE_W8 bpc
; Writes the two 8-pixel rows held in m0 (row 0 in the low 128-bit lane,
; row 1 in the high lane) to [dstq] and [dstq + strideq].
%macro AVG_SAVE_W8 1 ;bpc
    %if %1 != 16
        ; narrow the words to bytes within each lane, then bring qword 0
        ; (row 0) and qword 2 (row 1) together in the low lane
        packuswb                    m0, m0
        vpermq                      m0, m0, 1000b
        pextrq                  [dstq], xm0, 0
        pextrq        [dstq + strideq], xm0, 1
    %else
        ; pixels stay 16 bits wide: each lane is one complete row
        vextracti128            [dstq], m0, 0
        vextracti128  [dstq + strideq], m0, 1
    %endif
%endmacro

; AVG_SAVE_W16 bpc, line, offset
; Writes the 16 pixels held in m0 to row <line>, column <offset> of dst.
;   %1  bpc     8 or 16
;   %2  line    row index, scaled by strideq
;   %3  offset  column index: 16 bytes per column for byte output,
;               32 bytes per column for word output
%macro AVG_SAVE_W16 3 ; bpc, line, offset
    %if %1 != 16
        ; narrow the words to bytes within each lane, gather qwords 0 and 2
        ; into the low lane and store those 16 bytes
        packuswb                                        m0, m0
        vpermq                                          m0, m0, 1000b
        vextracti128       [dstq + %2 * strideq + %3 * 16], m0, 0
    %else
        ; pixels stay 16 bits wide: store the whole vector
        movu               [dstq + %2 * strideq + %3 * 32], m0
    %endif
%endmacro

; AVG_LOOP_END loop_label
;
; Tail of every width-specific loop: steps src0q, src1q and dstq forward by
; two rows, counts those two rows off hd and branches back to <loop_label>
; while rows remain; otherwise control goes to .ret.
;
; The loop is bottom-tested, so an iteration costs a single conditional
; branch. The pointers are advanced with lea, which leaves the flags alone,
; and they are not dereferenced again after the final advance.
%macro AVG_LOOP_END 1
    lea               src0q, [src0q + 2 * AVG_SRC_STRIDE]
    lea               src1q, [src1q + 2 * AVG_SRC_STRIDE]
    lea                dstq, [dstq + 2 * strideq]
    sub                  hd, 2
    jne                  %1
    jmp                .ret
%endmacro

; Byte distance between consecutive rows of src0/src1: MAX_PB_SIZE entries of
; 2 bytes each (the sources are int16_t buffers per the prototypes below).
; The expansion is not parenthesised, so only use it where a bare product is
; safe (added to an address or multiplied, as all current uses do).
%define AVG_SRC_STRIDE MAX_PB_SIZE*2

;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
;   const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
;
; VVC_AVG_AVX2 bpc
; Defines the unweighted-average entry point for <bpc> (8 or 16) bits per
; stored pixel. Vector state handed to AVG / AVG_FN:
;   m2 = pmulhrsw multiplier, pw_256 << min(bit_depth - 8, 4)
;   m3 = pixel min (zero)
;   m4 = pixel max
%macro VVC_AVG_AVX2 1
cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
    movifnidn            hd, hm
    movifnidn           bdd, bdm

    ; vector constants
    pxor                 m3, m3                 ; pixel min
    vpbroadcastw         m4, bdm                ; pixel max
    vpbroadcastd         m1, [pw_4]             ; clamp for the shift below
    vpbroadcastd         m2, [pw_256]

    ; bit depth = tzcnt(pixel_max + 1)
    inc                 bdd
    tzcnt               bdd, bdd
    sub                 bdd, 8
    movd                xm0, bdd
    pminuw               m0, m1                 ; min(bit depth - 8, 4)
    psllw                m2, xm0                ; shift

    ; dispatch on width: the table is indexed by tzcnt(width) and holds
    ; offsets relative to the table symbol itself (r6 reuses the bd register)
    tzcnt                wd, wm
    lea                  r6, [avg_%1 %+ SUFFIX %+ _table]
    movsxd               wq, dword [r6+wq*4]
    add                  wq, r6
    AVG_FN               %1, AVG
%endmacro

;void ff_vvc_w_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
;    const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
;    intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, intptr_t pixel_max);
;
; VVC_W_AVG_AVX2 bpc
; Defines the weighted-average entry point for <bpc> (8 or 16) bits per
; stored pixel. Vector state handed to W_AVG / AVG_FN:
;   xm2 = shift  = denom + max(12 - bit_depth, 0) + 3
;   m3  = (w0, w1) word pairs
;   m4  = offset = (((o0 + o1) << (bit_depth - 8)) + 1) << (shift - 1)
;   m6  = pixel min (zero)
;   m7  = pixel max
;
; Eight GPRs are declared because t1 is the eighth one (r7) and is written
; below. x86inc only saves and restores callee-saved registers that fall
; inside the declared count, and r7 maps to a callee-saved register on Win64.
%macro VVC_W_AVG_AVX2 1
cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1

    movifnidn            hd, hm

    ; pack the weights as (w1 << 16) | w0 and splat the dword
    movifnidn           t0d, r8m                ; w1
    shl                 t0d, 16
    mov                 t0w, r7m                ; w0
    movd                xm3, t0d
    vpbroadcastd         m3, xm3                ; w0, w1

    pxor                 m6, m6                 ; pixel min
    vpbroadcastw         m7, r11m               ; pixel max

    ; cl is needed as the variable shift count, but rcx holds one of the
    ; arguments, so it is parked in t1 for the duration
    mov                 t1q, rcx                ; save rcx
    mov                 ecx, r11m
    inc                 ecx                     ; pixel_max + 1
    tzcnt               ecx, ecx                ; bd
    sub                 ecx, 8                  ; bd - 8
    mov                 t0d, r9m                ; o0
    add                 t0d, r10m               ; o1
    shl                 t0d, cl
    inc                 t0d                     ; ((o0 + o1) << (bd - 8)) + 1

    neg                 ecx
    add                 ecx, 4                  ; 12 - bd
    cmovl               ecx, [pw_0]             ; max(12 - bd, 0)
    add                 ecx, 3
    add                 ecx, r6m                ; + denom
    movd                xm2, ecx                ; shift

    dec                 ecx                     ; shift - 1
    shl                 t0d, cl
    movd                xm4, t0d
    vpbroadcastd         m4, xm4                ; offset
    mov                 rcx, t1q                ; restore rcx

    ; dispatch on width: the table is indexed by tzcnt(width) and holds
    ; offsets relative to the table symbol itself (r6 reuses the t0 register)
    lea                  r6, [w_avg_%1 %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movsxd               wq, dword [r6+wq*4]
    add                  wq, r6
    AVG_FN               %1, W_AVG
%endmacro

; Instantiate the kernels: avg and w_avg, each for 16 and 8 bits per stored
; pixel. They are built only for x86-64 (the same guard as the jump tables
; they reference) and only when HAVE_AVX2_EXTERNAL is set.
%if ARCH_X86_64

%if HAVE_AVX2_EXTERNAL
; select the avx2 instruction set with ymm-sized mN registers for everything
; that follows
INIT_YMM avx2

VVC_AVG_AVX2 16

VVC_AVG_AVX2 8

VVC_W_AVG_AVX2 16

VVC_W_AVG_AVX2 8
%endif

%endif