libswscale/arm/yuv2rgb_neon.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162

/*
 * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
 * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * compute_premult half_u, half_v
 *
 * Computes the chroma contribution to the red, green and blue components
 * of 8 horizontally adjacent pixels from 4 U and 4 V samples; every chroma
 * sample is duplicated so that it applies to two neighbouring pixels.
 *
 * In:   half_u, half_v = D registers, each holding 4 signed 16-bit samples
 *       d1             = coefficients, lanes 0..3 = v2r, u2g, v2g, u2b
 * Out:  q8  / q9       = V * v2r             (red,   pixels 0-3 / 4-7)
 *       q10 / q11      = U * u2g + V * v2g   (green, pixels 0-3 / 4-7)
 *       q12 / q13      = U * u2b             (blue,  pixels 0-3 / 4-7)
 * Clobbers: q1 (d2, d3), q2 (d4, d5)
 */
.macro compute_premult half_u half_v
    vmov                d2, \half_u                                    @ d2 = 4 U samples
    vmov                d3, \half_u                                    @ d3 = the same 4 U samples
    vmov                d4, \half_v                                    @ d4 = 4 V samples
    vmov                d5, \half_v                                    @ d5 = the same 4 V samples

    @ Zipping a register with its own copy repeats each sample twice.
    vzip.16             d2, d3                                         @ q1 = U1U1U2U2U3U3U4U4
    vzip.16             d4, d5                                         @ q2 = V1V1V2V2V3V3V4V4

    @ Widening multiplies: 16-bit samples times 16-bit coefficients give 32-bit terms.
    vmull.s16           q8,  d4, d1[0]                                 @  V * v2r             (left,  red)
    vmull.s16           q9,  d5, d1[0]                                 @  V * v2r             (right, red)
    vmull.s16           q10, d2, d1[1]                                 @  U * u2g
    vmull.s16           q11, d3, d1[1]                                 @  U * u2g
    vmlal.s16           q10, d4, d1[2]                                 @  U * u2g + V * v2g   (left,  green)
    vmlal.s16           q11, d5, d1[2]                                 @  U * u2g + V * v2g   (right, green)
    vmull.s16           q12, d2, d1[3]                                 @  U * u2b             (left,  blue)
    vmull.s16           q13, d3, d1[3]                                 @  U * u2b             (right, blue)
.endm

/*
 * compute_color dst_comp, pre1, pre2
 *
 * Produces the 8 output bytes of a single colour component.
 *
 * In:   q1 / q2     = 32-bit luma terms of the left / right 4 pixels
 *       pre1 / pre2 = Q registers with the 32-bit chroma terms of the
 *                     left / right 4 pixels
 * Out:  dst_comp    = D register receiving 8 unsigned saturated bytes
 * Clobbers: q3, q4, q5
 */
.macro compute_color dst_comp pre1 pre2
    vadd.s32            q3, q1, \pre1                                  @ q3 = luma + chroma (left 4 pixels)
    vadd.s32            q4, q2, \pre2                                  @ q4 = luma + chroma (right 4 pixels)
    @ Rounding shift right by 13 with unsigned saturation: negative sums clamp to 0.
    vqrshrun.s32        d10, q3, #13
    vqrshrun.s32        d11, q4, #13                                   @ q5 = ({q3,q4} + (1<<12)) >> 13
    vqmovn.u16          \dst_comp, q5                                  @ saturate 16bit -> 8bit
.endm

/*
 * compute_rgba r, g, b, a
 *
 * Fills four D registers with the red, green, blue and alpha bytes of
 * 8 pixels. The colour components are built by compute_color from the
 * luma terms in q1 / q2 and the chroma terms in q8-q13; alpha is constant.
 *
 * In:   r, g, b, a = destination D registers, one per component
 * Clobbers: q3, q4, q5 (through compute_color)
 */
.macro compute_rgba r g b a
    compute_color       \r, q8,  q9                                    @ red   uses the V * v2r terms
    compute_color       \g, q10, q11                                   @ green uses the U * u2g + V * v2g terms
    compute_color       \b, q12, q13                                   @ blue  uses the U * u2b terms
    vmov.u8             \a, #255                                       @ alpha = 255 for all 8 pixels
.endm

/*
 * compute_half_line dst, half_y, ofmt
 *
 * Converts 8 pixels and stores them as 32 bytes of packed output.
 *
 * In:   dst     = core register holding the output address; advanced by
 *                 the store (post-increment). The :128 qualifier on the
 *                 store asserts that this address is 16-byte aligned.
 *       half_y  = D register with 8 unsigned luma bytes
 *       ofmt    = one of argb, rgba, abgr, bgra (byte order in memory)
 *       d0      = y_coeff in every 16-bit lane
 *       r9      = y_offset
 *       q8-q13  = chroma terms as produced by compute_premult
 * Clobbers: q1-q7
 */
.macro compute_half_line dst half_y ofmt
    vmovl.u8            q7, \half_y                                    @ 8px of Y
    vdup.16             q5, r9                                         @ q5 = y_offset in every lane
    vsub.s16            q7, q5                                         @ q7 = srcY - y_offset
    vmull.s16           q1, d14, d0                                    @ q1 = (srcY - y_offset) * y_coeff (left)
    vmull.s16           q2, d15, d0                                    @ q2 = (srcY - y_offset) * y_coeff (right)

    @ From here on q7 is dead, so d14 / d15 can be reused as outputs.
    @ The store below writes d12, d13, d14, d15 as bytes 0, 1, 2, 3 of each
    @ pixel, so each format only selects which register receives which
    @ component.
.ifc \ofmt,argb
    compute_rgba        d13, d14, d15, d12
.endif

.ifc \ofmt,rgba
    compute_rgba        d12, d13, d14, d15
.endif

.ifc \ofmt,abgr
    compute_rgba        d15, d14, d13, d12
.endif

.ifc \ofmt,bgra
    compute_rgba        d14, d13, d12, d15
.endif

    vst4.8              {q6, q7}, [\dst,:128]!                         @ interleaved store of 8 pixels (32 bytes)
.endm

/*
 * declare_func ifmt, ofmt
 *
 * Emits the exported function ff_<ifmt>_to_<ofmt>_neon, which converts an
 * image made of one luma plane and one interleaved chroma plane into
 * packed 32-bit <ofmt> pixels.
 *
 * Arguments, as read by the code below:
 *   r0 = width, r1 = height, r2 = dst, r3 = linesize
 *   stack (in order): srcY, linesizeY, srcC, linesizeC, table,
 *                     y_offset, y_coeff
 *
 * ifmt selects the chroma byte order: nv12 takes U from the even bytes
 * and V from the odd bytes; any other value swaps the two.
 *
 * Every inner iteration converts 16 pixels on two consecutive rows, which
 * share the same 16 bytes of chroma. The loops count down by 16 columns
 * and by 2 rows and stop once the counter is <= 0, so a full 16x2 block is
 * always processed: width is expected to be a multiple of 16 and height a
 * multiple of 2. The stores in compute_half_line use a :128 alignment
 * qualifier, so both output rows must be 16-byte aligned.
 *
 * Register roles inside the loops:
 *   r2  / r11 = output pointer of the first / second row
 *   r4  / r12 = luma pointer of the first / second row
 *   r6        = chroma pointer
 *   r3, r5, r7 = end-of-row adjustments for output, luma and chroma
 *   r8        = columns left in the current row pair
 *   r9        = y_offset, d0 = y_coeff, d1 = coefficient table
 */
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    push                {r4-r12, lr}
    vpush               {q4-q7}
    @ Stack arguments now start at sp + 104: 10 core registers * 4 bytes
    @ plus 4 Q registers * 16 bytes have been pushed.
    ldr                 r4, [sp, #104]                                 @ r4  = srcY
    ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
    ldr                 r6, [sp, #112]                                 @ r6  = srcC
    ldr                 r7, [sp, #116]                                 @ r7  = linesizeC
    ldr                 r8, [sp, #120]                                 @ r8  = table
    ldr                 r9, [sp, #124]                                 @ r9  = y_offset
    ldr                 r10,[sp, #128]                                 @ r10 = y_coeff
    vdup.16             d0, r10                                        @ d0  = y_coeff
    vld1.16             {d1}, [r8]                                     @ d1  = *table
    add                 r11, r2, r3                                    @ r11 = dst + linesize (dst2)
    add                 r12, r4, r5                                    @ r12 = srcY + linesizeY (srcY2)
    @ Two rows are consumed per outer iteration, hence the doubled strides
    @ for output and luma; chroma advances by a single row.
    lsl                 r3, r3, #1
    lsl                 r5, r5, #1
    lsl                 r8, r0, #2                                     @ r8 = width * 4 (output bytes per row)
    sub                 r3, r3, r8                                     @ r3 = linesize  * 2 - width * 4 (padding)
    sub                 r5, r5, r0                                     @ r5 = linesizeY * 2 - width     (paddingY)
    sub                 r7, r7, r0                                     @ r7 = linesizeC     - width     (paddingC)
1:
    mov                 r8, r0                                         @ r8 = width
2:
    pld                 [r6, #64*3]
    pld                 [r4, #64*3]
    pld                 [r12, #64*3]

    vld2.8              {d2, d3}, [r6]!                                @ q1: interleaved chroma line
    vmov.i8             d10, #128
.ifc \ifmt,nv12
    vsubl.u8            q14, d2, d10                                   @ q14 = U - 128
    vsubl.u8            q15, d3, d10                                   @ q15 = V - 128
.else
    vsubl.u8            q14, d3, d10                                   @ q14 = U - 128
    vsubl.u8            q15, d2, d10                                   @ q15 = V - 128
.endif

    compute_premult     d28, d30                                       @ chroma terms for pixels 0-7

    @ d28 and d30 (first halves of U and V) have been consumed above, so
    @ they are reused to park the second half of each luma row.
    vld1.8              {q7}, [r4]!                                    @ first line of luma
    vmov                d28, d15                                       @ save right of the first line of luma for later use
    compute_half_line   r2, d14, \ofmt

    vld1.8              {q7}, [r12]!                                   @ second line of luma
    vmov                d30, d15                                       @ save right of the second line of luma for later use
    compute_half_line   r11, d14, \ofmt

    compute_premult     d29, d31                                       @ chroma terms for pixels 8-15
    compute_half_line   r2,  d28, \ofmt
    compute_half_line   r11, d30, \ofmt

    subs                r8, r8, #16                                    @ width -= 16
    bgt                 2b

    add                 r2, r2, r3                                     @ dst   += padding
    add                 r4, r4, r5                                     @ srcY  += paddingY
    add                 r11, r11, r3                                   @ dst2  += padding
    add                 r12, r12, r5                                   @ srcY2 += paddingY
    add                 r6, r6, r7                                     @ srcC  += paddingC

    subs                r1, r1, #2                                     @ height -= 2
    bgt                 1b

    vpop                {q4-q7}
    @ Load the saved return address straight into pc: one instruction fewer
    @ than restoring lr and copying it, and a load into pc is an
    @ interworking return in both ARM and Thumb state.
    pop                 {r4-r12, pc}
endfunc
.endm

/*
 * declare_rgb_funcs ifmt
 *
 * Instantiates declare_func for the given input format once per output
 * byte order, in the order argb, rgba, abgr, bgra.
 */
.macro declare_rgb_funcs ifmt
.irp ofmt, argb, rgba, abgr, bgra
    declare_func \ifmt, \ofmt
.endr
.endm

@ Emit the four RGB converters for each input layout: nv12 first, then nv21.
.irp chroma_layout, nv12, nv21
declare_rgb_funcs \chroma_layout
.endr