/* libswscale/arm/yuv2rgb_neon.S */

/*
 * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
 * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"


/*
 * compute_premult: turn the chroma samples of the current pixel group into
 * the three per-channel chroma terms that are later added to the luma.
 *
 * In:    q14 = U << 3, q15 = V << 3 (written by load_chroma_*)
 *        q11 = 128 << 3             (chroma bias)
 *        d1  = four 16-bit coefficients: v2r, u2g, v2g, u2b
 * Out:   q8  = red term, q9 = green term, q10 = blue term
 * Clobb: q5, q14, q15
 */
.macro compute_premult
    vsub.u16            q14, q14, q11                                  @ q14 = (U - 128) << 3
    vsub.u16            q15, q15, q11                                  @ q15 = (V - 128) << 3
    vqdmulh.s16         q8,  q15, d1[0]                                @ red term   = V * v2r
    vqdmulh.s16         q9,  q14, d1[1]                                @ green term = U * u2g ...
    vqdmulh.s16         q5,  q15, d1[2]                                @ ... q5 is a temporary holding V * v2g
    vadd.s16            q9,  q9,  q5                                   @ green term = U * u2g + V * v2g
    vqdmulh.s16         q10, q14, d1[3]                                @ blue term  = U * u2b
.endm

/*
 * compute_color: build one 8-bit output channel for the pixel group.
 *   dst_comp1  d register receiving the channel for the pixels held in q14
 *   dst_comp2  d register receiving the channel for the pixels held in q15
 *   pre        q register holding the chroma term of this channel
 * q14 and q15 must hold the scaled luma; q1 and q2 are clobbered.
 */
.macro compute_color dst_comp1 dst_comp2 pre
    vadd.s16            q1, q14, \pre                                  @ q1 = luma (q14) + chroma term
    vadd.s16            q2, q15, \pre                                  @ q2 = luma (q15) + chroma term
    vqrshrun.s16        \dst_comp1, q1, #1                             @ rounding shift right by 1, saturate to unsigned 8 bit
    vqrshrun.s16        \dst_comp2, q2, #1                             @ rounding shift right by 1, saturate to unsigned 8 bit
.endm

/*
 * compute_rgba: produce the four byte planes of the pixel group.
 * r1/g1/b1/a1 are the d registers for the pixels whose luma is in q14,
 * r2/g2/b2/a2 those for the pixels whose luma is in q15.
 * The chroma terms are taken from q8 (red), q9 (green) and q10 (blue);
 * alpha is the constant 255.
 */
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
    compute_color       \r1, \r2, q8                                   @ red   = luma + q8
    compute_color       \g1, \g2, q9                                   @ green = luma + q9
    compute_color       \b1, \b2, q10                                  @ blue  = luma + q10
    vmov.u8             \a1, #255                                      @ opaque alpha
    vmov.u8             \a2, #255                                      @ opaque alpha
.endm

/*
 * compute: convert 16 luma samples into 16 packed 4-byte pixels and store them.
 *   dst   core register pointing at the output, advanced by 64 bytes
 *   ofmt  byte order of an output pixel: argb, rgba, abgr or bgra
 *
 * In:    d14 / d15 = luma samples as de-interleaved by vld2.8
 *        q8-q10    = chroma terms from compute_premult
 *        q12       = y_offset, q13 = y_coeff
 * Clobb: q1-q6, q14, q15
 */
.macro compute dst ofmt
    vshll.u8            q14, d14, #3                                   @ q14 = Y << 3 (samples from d14)
    vshll.u8            q15, d15, #3                                   @ q15 = Y << 3 (samples from d15)
    vsub.s16            q14, q14, q12                                  @ q14 = (Y << 3) - y_offset
    vsub.s16            q15, q15, q12                                  @ q15 = (Y << 3) - y_offset
    vqdmulh.s16         q14, q14, q13                                  @ q14 = ((Y << 3) - y_offset) * y_coeff
    vqdmulh.s16         q15, q15, q13                                  @ q15 = ((Y << 3) - y_offset) * y_coeff

    @ The register order passed to compute_rgba selects which channel
    @ ends up in d6..d9 / d10..d13, i.e. in bytes 0..3 of each stored pixel.
.ifc \ofmt,argb
    compute_rgba        d7, d8, d9, d6, d11, d12, d13, d10
.endif

.ifc \ofmt,rgba
    compute_rgba        d6, d7, d8, d9, d10, d11, d12, d13
.endif

.ifc \ofmt,abgr
    compute_rgba        d9, d8, d7, d6, d13, d12, d11, d10
.endif

.ifc \ofmt,bgra
    compute_rgba        d8, d7, d6, d9, d12, d11, d10, d13
.endif

    @ Re-interleave the two halves so that pixels are back in source order.
    vzip.8              d6, d10                                        @ d6 = byte 0 of pixels 1-8, d10 = byte 0 of pixels 9-16
    vzip.8              d7, d11                                        @ d7 = byte 1 of pixels 1-8, d11 = byte 1 of pixels 9-16
    vzip.8              d8, d12                                        @ d8 = byte 2 of pixels 1-8, d12 = byte 2 of pixels 9-16
    vzip.8              d9, d13                                        @ d9 = byte 3 of pixels 1-8, d13 = byte 3 of pixels 9-16
    vst4.8              {d6, d7, d8, d9}, [\dst]!                      @ store pixels 1-8  (32 bytes)
    vst4.8              {d10, d11, d12, d13}, [\dst]!                  @ store pixels 9-16 (32 bytes)
.endm

/*
 * process_1l_internal: load 16 luma bytes from src (post-incremented) and
 * convert them with the chroma terms currently held in q8-q10.
 *   dst, src  core registers holding the output / luma pointers
 *   ofmt      output byte order, forwarded to compute
 */
.macro process_1l_internal dst src ofmt
    vld2.8              {d14, d15}, [\src]!                            @ d14 = even-indexed Y, d15 = odd-indexed Y
    compute             \dst, \ofmt
.endm

/*
 * process_1l: convert one group of 16 pixels on a single line
 * (luma read through r4, output written through r2).
 */
.macro process_1l ofmt
    compute_premult                                                    @ chroma terms for this group
    process_1l_internal r2, r4, \ofmt
.endm

/*
 * process_2l: convert one group of 16 pixels on two lines that share the
 * same chroma samples. The first line uses r4 (luma) and r2 (output), the
 * second line uses r12 (luma) and r11 (output).
 */
.macro process_2l ofmt
    compute_premult                                                    @ chroma terms, computed once for both lines
    process_1l_internal r2, r4, \ofmt
    process_1l_internal r11,r12,\ofmt
.endm

/*
 * load_args_nv12: prologue for the converters reading semi-planar chroma.
 *
 * Saves r4-r12 and lr (40 bytes) and q4-q7 (64 bytes), which is why the
 * first stack-passed argument is found at [sp, #104].
 *
 * On entry r0 = width, r1 = height, r2 = dst, r3 = linesize.
 * On exit:
 *   r3  = padding added to dst  after two lines
 *   r4  = srcY,   r5 = padding added to srcY after two lines
 *   r6  = srcC,   r7 = padding added to srcC after one line
 *   r8  = table,  r9 = y_offset, r10 = y_coeff
 *   r11 = dst  + linesize  (second output line)
 *   r12 = srcY + linesizeY (second luma line)
 *   d0  = y_coeff in every 16-bit lane
 *   d1  = four 16-bit values read from table
 */
.macro load_args_nv12
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4, [sp, #104]                                 @ r4  = srcY
    ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
    ldr                 r6, [sp, #112]                                 @ r6  = srcC
    ldr                 r7, [sp, #116]                                 @ r7  = linesizeC
    ldr                 r8, [sp, #120]                                 @ r8  = table
    ldr                 r9, [sp, #124]                                 @ r9  = y_offset
    ldr                 r10,[sp, #128]                                 @ r10 = y_coeff
    vdup.16             d0, r10                                        @ d0  = y_coeff
    vld1.16             {d1}, [r8]                                     @ d1  = *table
    add                 r11, r2, r3                                    @ r11 = dst + linesize (dst2)
    add                 r12, r4, r5                                    @ r12 = srcY + linesizeY (srcY2)
    lsl                 r3, r3, #1                                     @ r3 = linesize  * 2
    lsl                 r5, r5, #1                                     @ r5 = linesizeY * 2
    sub                 r3, r3, r0, lsl #2                             @ r3 = linesize  * 2 - width * 4 (padding)
    sub                 r5, r5, r0                                     @ r5 = linesizeY * 2 - width     (paddingY)
    sub                 r7, r7, r0                                     @ r7 = linesizeC     - width     (paddingC)
.endm

/* load_args_nv21: same prologue and register setup as load_args_nv12. */
.macro load_args_nv21
    load_args_nv12
.endm

/*
 * load_args_yuv420p: prologue for the planar converter handling two
 * luma lines per chroma line.
 *
 * Saves r4-r12 and lr (40 bytes) and q4-q7 (64 bytes), so the first
 * stack-passed argument is found at [sp, #104].
 *
 * On exit:
 *   r3  = padding added to dst  after two lines
 *   r4  = srcY,   r5 = padding added to srcY after two lines
 *   r6  = srcU,   r10 = srcV
 *   r8  = table,  r9 = y_offset
 *   r11 = dst  + linesize  (second output line)
 *   r12 = srcY + linesizeY (second luma line)
 *   d0  = y_coeff in every 16-bit lane
 *   d1  = four 16-bit values read from table
 * r7 is left unset here: the chroma line sizes at [sp, #116] and
 * [sp, #124] are read later by increment_and_test_yuv420p.
 */
.macro load_args_yuv420p
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4, [sp, #104]                                 @ r4  = srcY
    ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
    ldr                 r6, [sp, #112]                                 @ r6  = srcU
    ldr                 r8, [sp, #128]                                 @ r8  = table
    ldr                 r9, [sp, #132]                                 @ r9  = y_offset
    ldr                 r10,[sp, #136]                                 @ r10 = y_coeff (temporary, reused for srcV below)
    vdup.16             d0, r10                                        @ d0  = y_coeff
    vld1.16             {d1}, [r8]                                     @ d1  = *table
    add                 r11, r2, r3                                    @ r11 = dst + linesize (dst2)
    add                 r12, r4, r5                                    @ r12 = srcY + linesizeY (srcY2)
    lsl                 r3, r3, #1                                     @ r3 = linesize  * 2
    lsl                 r5, r5, #1                                     @ r5 = linesizeY * 2
    sub                 r3, r3, r0, lsl #2                             @ r3 = linesize  * 2 - width * 4 (padding)
    sub                 r5, r5, r0                                     @ r5 = linesizeY * 2 - width     (paddingY)
    ldr                 r10,[sp, #120]                                 @ r10 = srcV
.endm

/*
 * load_args_yuv422p: prologue for the planar converter handling one
 * luma line per chroma line.
 *
 * Saves r4-r12 and lr (40 bytes) and q4-q7 (64 bytes), so the first
 * stack-passed argument is found at [sp, #104].
 *
 * On exit:
 *   r3  = padding added to dst  after one line
 *   r4  = srcY,   r5  = padding added to srcY after one line
 *   r6  = srcU,   r7  = padding added to srcU after one line
 *   r10 = srcV,   r12 = padding added to srcV after one line
 *   r8  = table,  r9  = y_offset
 *   d0  = y_coeff in every 16-bit lane
 *   d1  = four 16-bit values read from table
 */
.macro load_args_yuv422p
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4, [sp, #104]                                 @ r4  = srcY
    ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
    ldr                 r6, [sp, #112]                                 @ r6  = srcU
    ldr                 r7, [sp, #116]                                 @ r7  = linesizeU
    ldr                 r12,[sp, #124]                                 @ r12 = linesizeV
    ldr                 r8, [sp, #128]                                 @ r8  = table
    ldr                 r9, [sp, #132]                                 @ r9  = y_offset
    ldr                 r10,[sp, #136]                                 @ r10 = y_coeff (temporary, reused for srcV below)
    vdup.16             d0, r10                                        @ d0  = y_coeff
    vld1.16             {d1}, [r8]                                     @ d1  = *table
    sub                 r3, r3, r0, lsl #2                             @ r3  = linesize  - width * 4 (padding)
    sub                 r5, r5, r0                                     @ r5  = linesizeY - width     (paddingY)
    sub                 r7, r7, r0, lsr #1                             @ r7  = linesizeU - width / 2 (paddingU)
    sub                 r12,r12,r0, lsr #1                             @ r12 = linesizeV - width / 2 (paddingV)
    ldr                 r10,[sp, #120]                                 @ r10 = srcV
.endm

/*
 * load_chroma_nv12: read 16 interleaved chroma bytes through r6
 * (post-incremented) and widen them: even bytes become U, odd bytes V.
 * Out: q14 = U << 3, q15 = V << 3. Clobbers d2, d3.
 */
.macro load_chroma_nv12
    pld [r12, #64*3]                                                   @ prefetch ahead on the second luma line

    vld2.8              {d2, d3}, [r6]!                                @ d2 = even bytes, d3 = odd bytes of the chroma line
    vshll.u8            q14, d2, #3                                    @ q14 = U * (1 << 3)
    vshll.u8            q15, d3, #3                                    @ q15 = V * (1 << 3)
.endm

/*
 * load_chroma_nv21: read 16 interleaved chroma bytes through r6
 * (post-incremented) and widen them: odd bytes become U, even bytes V.
 * Out: q14 = U << 3, q15 = V << 3. Clobbers d2, d3.
 */
.macro load_chroma_nv21
    pld [r12, #64*3]                                                   @ prefetch ahead on the second luma line

    vld2.8              {d2, d3}, [r6]!                                @ d2 = even bytes, d3 = odd bytes of the chroma line
    vshll.u8            q14, d3, #3                                    @ q14 = U * (1 << 3)
    vshll.u8            q15, d2, #3                                    @ q15 = V * (1 << 3)
.endm

/*
 * load_chroma_yuv420p: read 8 U bytes through r6 and 8 V bytes through r10
 * (both post-incremented) and widen them.
 * Out: q14 = U << 3, q15 = V << 3. Clobbers d2, d3.
 */
.macro load_chroma_yuv420p
    pld [r10, #64*3]                                                   @ prefetch ahead on the V plane
    pld [r12, #64*3]                                                   @ prefetch ahead on the second luma line

    vld1.8              d2, [r6]!                                      @ d2 = 8 U samples
    vld1.8              d3, [r10]!                                     @ d3 = 8 V samples
    vshll.u8            q14, d2, #3                                    @ q14 = U * (1 << 3)
    vshll.u8            q15, d3, #3                                    @ q15 = V * (1 << 3)
.endm

/*
 * load_chroma_yuv422p: read 8 U bytes through r6 and 8 V bytes through r10
 * (both post-incremented) and widen them.
 * Out: q14 = U << 3, q15 = V << 3. Clobbers d2, d3.
 */
.macro load_chroma_yuv422p
    pld [r10, #64*3]                                                   @ prefetch ahead on the V plane

    vld1.8              d2, [r6]!                                      @ d2 = 8 U samples
    vld1.8              d3, [r10]!                                     @ d3 = 8 V samples
    vshll.u8            q14, d2, #3                                    @ q14 = U * (1 << 3)
    vshll.u8            q15, d3, #3                                    @ q15 = V * (1 << 3)
.endm

/*
 * increment_and_test_nv12: step the second-line and chroma pointers to the
 * next pair of lines and count two lines off the height. The flags set
 * by the final subs are tested by the branch that follows the macro.
 */
.macro increment_and_test_nv12
    add                 r11, r11, r3                                   @ dst2  += padding
    add                 r12, r12, r5                                   @ srcY2 += paddingY
    add                 r6, r6, r7                                     @ srcC  += paddingC
    subs                r1, r1, #2                                     @ height -= 2
.endm

/* increment_and_test_nv21: same pointer stepping and height test as nv12. */
.macro increment_and_test_nv21
    increment_and_test_nv12
.endm

/*
 * increment_and_test_yuv420p: step the second-line and chroma pointers to
 * the next pair of lines and count two lines off the height.
 * The chroma paddings are recomputed on every call from the line sizes
 * still on the stack, using r7 as scratch. The flags set by the final
 * subs are tested by the branch that follows the macro.
 */
.macro increment_and_test_yuv420p
    add                 r11, r11, r3                                   @ dst2  += padding
    add                 r12, r12, r5                                   @ srcY2 += paddingY
    ldr                 r7, [sp, #116]                                 @ r7     = linesizeU
    sub                 r7, r7, r0, lsr #1                             @ r7     = linesizeU - width / 2 (paddingU)
    add                 r6, r6, r7                                     @ srcU  += paddingU
    ldr                 r7, [sp, #124]                                 @ r7     = linesizeV
    sub                 r7, r7, r0, lsr #1                             @ r7     = linesizeV - width / 2 (paddingV)
    add                 r10, r10, r7                                   @ srcV  += paddingV
    subs                r1, r1, #2                                     @ height -= 2
.endm

/*
 * increment_and_test_yuv422p: step both chroma pointers to the next line
 * and count one line off the height. The flags set by the final subs are
 * tested by the branch that follows the macro.
 */
.macro increment_and_test_yuv422p
    add                 r6, r6, r7                                     @ srcU  += paddingU
    add                 r10,r10,r12                                    @ srcV  += paddingV
    subs                r1, r1, #1                                     @ height -= 1
.endm

/*
 * process_<ifmt>: per-format entry used by declare_func to convert one
 * pixel group. nv12, nv21 and yuv420p convert two lines per chroma load,
 * yuv422p converts one.
 */
.macro process_nv12 ofmt
    process_2l \ofmt
.endm

.macro process_nv21 ofmt
    process_2l \ofmt
.endm

.macro process_yuv420p ofmt
    process_2l \ofmt
.endm

.macro process_yuv422p ofmt
    process_1l \ofmt
.endm

/*
 * declare_func: define the exported routine ff_<ifmt>_to_<ofmt>_neon.
 *   ifmt  input layout:  nv12, nv21, yuv420p or yuv422p
 *   ofmt  output layout: argb, rgba, abgr or bgra
 *
 * load_args_<ifmt> saves the callee-saved registers and sets up pointers
 * and paddings. The inner loop (2:) converts one group of 16 pixels per
 * iteration; the outer loop (1:) moves on to the next line(s) through
 * increment_and_test_<ifmt>, whose final subs provides the flags tested
 * by "bgt 1b".
 *
 * Both loops run while the remaining count is greater than zero, so a
 * width that is not a multiple of 16 still converts a whole last group.
 *
 * Constants kept for the whole function:
 *   q11 = 128 << 3 (chroma bias), q12 = y_offset, q13 = y_coeff
 */
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    load_args_\ifmt
    vmov.u16            q11, #1024                                     @ q11 = 128 * (1 << 3)
    vdup.16             q12, r9                                        @ q12 = y_offset
    vmov                d26, d0                                        @ q13 = y_coeff (low half)
    vmov                d27, d0                                        @ q13 = y_coeff (high half)
1:
    mov                 r8, r0                                         @ r8 = pixels left on this line
2:
    pld [r6, #64*3]                                                    @ prefetch ahead on the chroma plane
    pld [r4, #64*3]                                                    @ prefetch ahead on the luma plane
    load_chroma_\ifmt
    process_\ifmt \ofmt
    subs                r8, r8, #16                                    @ 16 pixels done
    bgt                 2b
    add                 r2, r2, r3                                     @ dst   += padding
    add                 r4, r4, r5                                     @ srcY  += paddingY
    increment_and_test_\ifmt
    bgt                 1b
    vpop                {q4-q7}
    pop                 {r4-r12, pc}                                   @ restore registers and return (saved lr goes to pc)
endfunc
.endm

/*
 * declare_rgb_funcs: emit the argb, rgba, abgr and bgra converters
 * (in that order) for the input layout given as ifmt.
 */
.macro declare_rgb_funcs ifmt
.irp ofmt, argb, rgba, abgr, bgra
    declare_func \ifmt, \ofmt
.endr
.endm

/* Instantiate the four output variants for every supported input layout. */
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p