;******************************************************************************
;* Copyright (c) 2025 Niklas Haas
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; High-level explanation of how the x86 backend works:
;
; sws_processN is the shared entry point for all operation chains. This
; function is responsible for the block loop, as well as initializing the
; plane pointers. It will jump directly into the first operation kernel,
; and each operation kernel will jump directly into the next one, with the
; final kernel jumping back into the sws_process return point. (See label
; `sws_process.return` in ops_int.asm)
;
; To handle the jump back to the return point, we append an extra address
; corresponding to the correct sws_process.return label into the SwsOpChain,
; and have the WRITE kernel jump into it as usual. (See the FINISH macro)
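;
; As an illustrative sketch (kernel names hypothetical), the control flow for
; a three-op chain looks like this:
;
;   sws_process:          enter the first kernel for each block
;   read_u8:              ... CONTINUE -> next kernel
;   convert_u8_f32:       ... CONTINUE -> next kernel
;   write_f32:            ... FINISH -> appended sws_process.return address
;   sws_process.return:   bump the loop counters; next block or next line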
;
; Inside an operation chain, we use a custom calling convention to preserve
; registers between kernels. The exact register allocation is found further
; below in this file, but we basically reserve (and share) the following
; registers:
;
; - const execq (read-only, shared execution data, see SwsOpExec); stores the
; static metadata for this call and describes the image layouts
;
; - implq (read-only, operation chain, see SwsOpChain); stores the private data
; for each operation as well as the pointer to the next kernel in the sequence.
; This register is automatically incremented by the CONTINUE macro, and will
; be reset back to the first operation kernel by sws_process.
;
; - bxd, yd: current block and line number, used as loop counters in sws_process.
; Also used by e.g. the dithering code to do position-dependent dithering.
;
; - tmp0, tmp1: two temporary registers which are NOT preserved between kernels
;
; - inNq, outNq: plane pointers. These are incremented automatically after the
; corresponding read/write operation, by the read/write kernels themselves.
; sws_process will take care of resetting these to the next line after the
; block loop is done.
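;
; To illustrate the convention, a hypothetical 8-bit read kernel (not an
; actual kernel from this file) would look roughly like:
;
;   movu mx, [in0q]               ; load one register of u8 pixels into X
;   IF V2, movu mx2, [in0q + mmsize]
;   add in0q, (1 + V2) * mmsize   ; bump the plane pointer past the block
;   CONTINUE                      ; advance implq, tail call the next kernel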
;
; Additionally, we pass data between kernels by keeping it directly inside
; vector registers. For this, we reserve the following registers:
;
; - mx, my, mz, mw: low half of the X, Y, Z and W components
; - mx2, my2, mz2, mw2: high half of the X, Y, Z and W components
; (As well as sized variants for xmx, ymx, etc.)
;
; The "high half" registers are only sometimes used; in order to enable
; processing more pixels at the same time. See `decl_v2` below, which allows
; assembling the same operation twice, once with only the lower half (V2=0),
; and once with both halves (V2=1). The remaining vectors are free for use
; inside operation kernels, starting from m8.
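;
; For example (sketch), a kernel operating on the X component touches mx
; unconditionally and mx2 only when assembled with V2=1, e.g. via the IF
; helper defined at the end of this file:
;
;   paddb mx, m8
;   IF V2, paddb mx2, m8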
;
; The basic rule is that we always use the full set of vector registers (both
; halves, i.e. V2=1) when processing the largest element size within an
; operation chain. For example, if we load 8-bit values and convert them to
; 32-bit floats internally, then we would have an operation chain which
; combines an SSE4 V2=0 u8 kernel (128 bits = 16 pixels) with an AVX2 V2=1
; f32 kernel (512 bits = 16 pixels). This keeps the number of pixels being
; processed (the block size) constant. The V2 setting is suffixed to the
; operation name (as _m1 or _m2) during name mangling.
;
; This design leaves us with the following set of possibilities:
;
; SSE4:
; - max element is 32-bit: currently unsupported
; - max element is 16-bit: currently unsupported
; - max element is 8-bit: block size 32, u8_m2_sse4
;
; AVX2:
; - max element is 32-bit: block size 16, u32_m2_avx2, u16_m1_avx2, u8_m1_sse4
; - max element is 16-bit: block size 32, u16_m2_avx2, u8_m1_avx2
; - max element is 8-bit: block size 64, u8_m2_avx2
;
; Meaning we need to cover the following code paths for each bit depth:
;
; - 8-bit kernels: m1_sse4, m2_sse4, m1_avx2, m2_avx2
; - 16-bit kernels: m1_avx2, m2_avx2
; - 32-bit kernels: m2_avx2
;
; This is achieved by macro'ing each operation kernel and declaring it once
; per SIMD version, and (if needed) once per V2 setting using decl_v2. (See
; the bottom of ops_int.asm for an example)
;
; Finally, we overload some operation kernels to different numbers of active
; components, using the `decl_pattern` and `decl_common_patterns` macros.
; Inside these kernels, the variables X, Y, Z and W will each be set to 0 or
; 1, depending on which components are active for this particular kernel
; instance. Such kernels receive a pXYZW_ prefix (e.g. p1110_) during name
; mangling.
struc SwsOpExec
; pointers to the start of each plane for the current line
.in0 resq 1
.in1 resq 1
.in2 resq 1
.in3 resq 1
.out0 resq 1
.out1 resq 1
.out2 resq 1
.out3 resq 1
; per-plane line strides, in bytes
.in_stride0 resq 1
.in_stride1 resq 1
.in_stride2 resq 1
.in_stride3 resq 1
.out_stride0 resq 1
.out_stride1 resq 1
.out_stride2 resq 1
.out_stride3 resq 1
; per-plane end-of-line adjustments, used when advancing to the next line
.in_bump0 resq 1
.in_bump1 resq 1
.in_bump2 resq 1
.in_bump3 resq 1
.out_bump0 resq 1
.out_bump1 resq 1
.out_bump2 resq 1
.out_bump3 resq 1
; image dimensions and current slice position
.width resd 1
.height resd 1
.slice_y resd 1
.slice_h resd 1
; pixels per block at the input and output of the op chain
.block_size_in resd 1
.block_size_out resd 1
endstruc
struc SwsOpImpl
.cont resb 16 ; address of the next kernel in the sequence
.priv resb 16 ; private data for this operation
.next resb 0  ; start of the next entry; stride added by CONTINUE
endstruc
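; Kernels address their per-op private data relative to implq; e.g. a kernel
; with a single 32-bit constant in .priv might load it as (sketch):
;
;   vbroadcastss m8, [implq + SwsOpImpl.priv]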
;---------------------------------------------------------
; Common macros for declaring operations
; Declare an operation kernel with the correct name mangling.
%macro op 1 ; name
%ifdef X
%define ADD_PAT(name) p %+ X %+ Y %+ Z %+ W %+ _ %+ name
%else
%define ADD_PAT(name) name
%endif
%ifdef V2
%if V2
%define ADD_MUL(name) name %+ _m2
%else
%define ADD_MUL(name) name %+ _m1
%endif
%else
%define ADD_MUL(name) name
%endif
cglobal ADD_PAT(ADD_MUL(%1)), 0, 0, 0 ; already allocated by entry point
%undef ADD_PAT
%undef ADD_MUL
%endmacro
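; For example, with X=1, Y=1, Z=1, W=0 and V2=1 in scope, `op foo` expands to
; `cglobal p1110_foo_m2, 0, 0, 0`, which x86inc then decorates further with
; the usual prefix and instruction set suffix (e.g. ff_p1110_foo_m2_avx2).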
; Declare an operation kernel with the given V2 setting; invoke once with V2=0
; and once with V2=1 to assemble both variants of a kernel
%macro decl_v2 2+ ; v2, func
%xdefine V2 %1
%2
%undef V2
%endmacro
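; Typical usage (sketch, with decl_read8 standing in for some kernel-declaring
; macro):
;
;   decl_v2 0, decl_read8   ; assembles the _m1 variant
;   decl_v2 1, decl_read8   ; assembles the _m2 variant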
; Declare an operation kernel specialized to a given subset of active components
%macro decl_pattern 5+ ; X, Y, Z, W, func
%xdefine X %1
%xdefine Y %2
%xdefine Z %3
%xdefine W %4
%5
%undef X
%undef Y
%undef Z
%undef W
%endmacro
; Declare an operation kernel specialized to each common component pattern
%macro decl_common_patterns 1+ ; func
decl_pattern 1, 0, 0, 0, %1 ; y
decl_pattern 1, 0, 0, 1, %1 ; ya
decl_pattern 1, 1, 1, 0, %1 ; yuv
decl_pattern 1, 1, 1, 1, %1 ; yuva
%endmacro
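; E.g. `decl_common_patterns decl_lut8`, with decl_lut8 a hypothetical
; kernel-declaring macro, assembles the p1000_, p1001_, p1110_ and p1111_
; variants of that kernel.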
;---------------------------------------------------------
; Common names for the internal calling convention
%define mx m0
%define my m1
%define mz m2
%define mw m3
%define xmx xm0
%define xmy xm1
%define xmz xm2
%define xmw xm3
%define ymx ym0
%define ymy ym1
%define ymz ym2
%define ymw ym3
%define mx2 m4
%define my2 m5
%define mz2 m6
%define mw2 m7
%define xmx2 xm4
%define xmy2 xm5
%define xmz2 xm6
%define xmw2 xm7
%define ymx2 ym4
%define ymy2 ym5
%define ymz2 ym6
%define ymw2 ym7
; Reserved in this order by the signature of SwsOpFunc
%define execq r0q
%define implq r1q
%define bxd r2d
%define yd r3d
; Extra registers for free use by kernels, not saved between ops
%define tmp0q r4q
%define tmp1q r5q
%define tmp0d r4d
%define tmp1d r5d
; Registers for plane pointers; put at the end (and in ascending plane order)
; so that we can avoid reserving them when not necessary
%define out0q r6q
%define in0q r7q
%define out1q r8q
%define in1q r9q
%define out2q r10q
%define in2q r11q
%define out3q r12q
%define in3q r13q
;---------------------------------------------------------
; Common macros for linking together different kernels
; Load the next operation kernel's address to a register
%macro LOAD_CONT 1 ; reg
mov %1, [implq + SwsOpImpl.cont]
%endmacro
; Tail call into the next operation kernel, given that kernel's address
%macro CONTINUE 1 ; reg
add implq, SwsOpImpl.next
jmp %1
annotate_function_size
%endmacro
; Convenience macro to load and continue to the next kernel in one step
%macro CONTINUE 0
LOAD_CONT tmp0q
CONTINUE tmp0q
%endmacro
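; Putting the pieces together, a complete (hypothetical) kernel inverting the
; X component would read as:
;
;   op invert8
;       LOAD_CONT tmp0q     ; load the next kernel's address early, so the
;                           ; indirect jump target is known ahead of the jmp
;       pcmpeqb m8, m8      ; m8 = all-ones mask
;       pxor mx, m8
;       IF V2, pxor mx2, m8
;       CONTINUE tmp0q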
; Final macro to end the operation chain, used by WRITE kernels to jump back
; to the process function return point. Very similar to CONTINUE, but skips
; incrementing the implq pointer, and also zeroes the upper halves of the YMM
; registers (vzeroupper) to avoid phantom dependencies between loop iterations.
%macro FINISH 1 ; reg
%if vzeroupper_required
; we may jump back into an SSE read, so always zero upper regs here
vzeroupper
%endif
jmp %1
annotate_function_size
%endmacro
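; A WRITE kernel ends the chain with FINISH instead of CONTINUE; the chain's
; final .cont entry holds the sws_process.return address appended by the
; caller. Sketch:
;
;   LOAD_CONT tmp0q
;   movu [out0q], mx
;   IF V2, movu [out0q + mmsize], mx2
;   add out0q, (1 + V2) * mmsize
;   FINISH tmp0q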
; Helper for inline conditionals; used to conditionally include single lines
%macro IF 2+ ; cond, body
%if %1
%2
%endif
%endmacro
; Alternate name, for nested usage (works around NASM's restriction on
; recursive macro expansion)
%macro IF1 2+
%if %1
%2
%endif
%endmacro
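; Example usage, including nesting via the IF1 alias:
;
;   IF X, paddw mx, m8
;   IF X, IF1 V2, paddw mx2, m8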