1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright 2020 Google LLC
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32

; Dword index vector used as the vpermd control in SCALE_FUNC: it interleaves
; the dwords of the low and high 128-bit lanes (0,4,1,5,...) so that the packed
; 16-pixel result is stored in ascending pixel order.
swizzle:    dd 0, 4, 1, 5, 2, 6, 3, 7

; Eight dwords holding 4: added to the gather offsets in the X4 variant after
; every 4-tap step.
four:       dd 4, 4, 4, 4, 4, 4, 4, 4
SECTION .text
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale8to15_<filterSize>_<opt>
; (SwsInternal *c, int16_t *dst,
; int dstW, const uint8_t *src,
; const int16_t *filter,
; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is 8 bits wide. Filter is 14 bits. Output is
; 15 bits (in int16_t). Each output pixel is generated from $filterSize input
; pixels; the position of the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
%macro SCALE_FUNC 1
; %1 selects the variant that is generated:
;   4  - a single 4-tap step per output pixel (fltsize is not read)
;   X4 - fltsize/4 steps of 4 taps each per output pixel
;
; Register roles:
;   m0       all zero; second operand of the byte->word unpacks
;   m1, m2   source offsets (from fltpos) for 2 x 8 pixels (xm1: 4 pixels)
;   m3, m4   gathered dwords, i.e. 4 source bytes per pixel
;   m5-m8    products / results
;   m9-m12   (X4 only) dword accumulators across the 4-tap steps
;   m13      gather mask, rebuilt with all-ones before every gather
;   m14      (X4 only) the constant 4 in every dword
;   m15      dword permutation applied before the 16-pixel store
;
; The filter pointer is read strictly sequentially (128 bytes per 16-pixel
; step, 32 bytes per 4-pixel step); NOTE(review): this presumably relies on the
; caller having arranged the coefficients/positions to match - confirm there.
cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner
    movsxd          wq, wd                      ; widen the 32-bit width to 64 bits
    xor             countd, countd              ; count = index of the next output pixel
    mova            m15, [swizzle]
    pxor            m0, m0
%ifidn %1, X4
    shr             fltsized, 2                 ; fltsize = number of 4-tap steps
    mova            m14, [four]
%endif
    cmp             wq, 16
    jl              .loop4                      ; fewer than 16 pixels: 4-wide loop only
    sub             wq, 16                      ; bias w: "count <= w" <=> 16 more pixels fit

    ; ---- 16 output pixels per iteration --------------------------------
.loop16:
    movu            m1, [fltposq]
    movu            m2, [fltposq + 32]
%ifidn %1, X4
    pxor            m9, m9
    pxor            m10, m10
    pxor            m11, m11
    pxor            m12, m12
    xor             innerd, innerd              ; inner = 4-tap steps done so far
.taps16:
%endif
    ; Fetch 4 consecutive source bytes for each of the 16 pixels.
    vpcmpeqd        m13, m13, m13
    vpgatherdd      m3, [srcmemq + m1], m13
    vpcmpeqd        m13, m13, m13
    vpgatherdd      m4, [srcmemq + m2], m13
    ; Widen bytes to words and multiply-add against the coefficients.
    vpunpcklbw      m5, m3, m0
    vpmaddwd        m5, m5, [filterq]
    vpunpckhbw      m6, m3, m0
    vpmaddwd        m6, m6, [filterq + 32]
    vpunpcklbw      m7, m4, m0
    vpmaddwd        m7, m7, [filterq + 64]
    vpunpckhbw      m8, m4, m0
    vpmaddwd        m8, m8, [filterq + 96]
    add             filterq, 128
%ifidn %1, X4
    paddd           m9, m5
    paddd           m10, m6
    paddd           m11, m7
    paddd           m12, m8
    paddd           m1, m14                     ; advance every source offset by 4
    paddd           m2, m14
    inc             innerq
    cmp             innerq, fltsizeq
    jl              .taps16
    vphaddd         m5, m9, m10                 ; one dword sum per pixel
    vphaddd         m6, m11, m12
%else
    vphaddd         m5, m5, m6                  ; one dword sum per pixel
    vphaddd         m6, m7, m8
%endif
    vpsrad          m5, m5, 7
    vpsrad          m6, m6, 7
    vpackssdw       m5, m5, m6                  ; saturate to 16 x int16
    vpermd          m5, m15, m5                 ; reorder dwords with the swizzle table
    vmovdqu         [dstq + countq * 2], m5
    add             fltposq, 64                 ; 16 x int32 positions consumed
    add             countq, 16
    cmp             countq, wq
    jle             .loop16
    add             wq, 16                      ; undo the bias applied before the loop
    cmp             countq, wq
    jge             .done

    ; ---- 4 output pixels per iteration ---------------------------------
    ; Each pass handles a full group of four, so the last group may extend
    ; past w when w is not a multiple of 4.
.loop4:
    movu            xm1, [fltposq]
%ifidn %1, X4
    pxor            xm9, xm9
    pxor            xm10, xm10
    xor             innerd, innerd
.taps4:
%endif
    vpcmpeqd        xm13, xm13, xm13
    vpgatherdd      xm3, [srcmemq + xm1], xm13
    vpunpcklbw      xm5, xm3, xm0
    vpmaddwd        xm5, xm5, [filterq]
    vpunpckhbw      xm6, xm3, xm0
    vpmaddwd        xm6, xm6, [filterq + 16]
    add             filterq, 32
%ifidn %1, X4
    paddd           xm9, xm5
    paddd           xm10, xm6
    paddd           xm1, xm14                   ; advance every source offset by 4
    inc             innerq
    cmp             innerq, fltsizeq
    jl              .taps4
    vphaddd         xm5, xm9, xm10
%else
    vphaddd         xm5, xm5, xm6
%endif
    vpsrad          xm5, xm5, 7
    vpackssdw       xm5, xm5, xm5
    vmovq           [dstq + countq * 2], xm5    ; store 4 x int16
    add             fltposq, 16                 ; 4 x int32 positions consumed
    add             countq, 4
    cmp             countq, wq
    jl              .loop4
.done:
    RET
%endmacro
; Emit the AVX2 (ymm) scalers only for x86-64 builds with external AVX2
; assembly enabled; SCALE_FUNC uses vector registers up to m15.
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
SCALE_FUNC 4                                    ; hscale8to15_4:  four taps per pixel
SCALE_FUNC X4                                   ; hscale8to15_X4: fltsize taps, 4 at a time
%endif
|