1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
|
;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"
SECTION_RODATA
pw_3: times 8 dw 3
pw_7: times 8 dw 7
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pb_128: times 16 db 128
section .text
%macro UNPACK_ADD 6
mov%5 %1, %3
mov%6 m5, %4
mova m4, %1
mova %2, m5
punpcklbw %1, m7
punpcklbw m5, m7
punpckhbw m4, m7
punpckhbw %2, m7
paddw %1, m5
paddw %2, m4
%endmacro
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
mov src0q, srcq
lea stridex3q, [3*strideq]
sub src0q, stridex3q
pxor m7, m7
.loop:
; 7*(src[0] + src[1])
UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
pmullw m0, [pw_7]
pmullw m1, [pw_7]
; 3*( ... + src[-2] + src[3])
UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
paddw m0, m2
paddw m1, m3
pmullw m0, [pw_3]
pmullw m1, [pw_3]
; ... - 7*(src[-1] + src[2])
UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
pmullw m2, [pw_7]
pmullw m3, [pw_7]
psubw m0, m2
psubw m1, m3
; ... - (src[-3] + src[4])
UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
psubw m0, m2
psubw m1, m3
paddw m0, [pw_16]
paddw m1, [pw_16]
psraw m0, 5
psraw m1, 5
packuswb m0, m1
mova [dstq], m0
add dstq, mmsize
add srcq, mmsize
add src0q, mmsize
sub widthd, mmsize
jg .loop
RET
; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
dec widthd
pxor m7, m7
and widthd, ~(mmsize-1)
.loop:
; 7*(src[0] + src[1])
UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
pmullw m0, [pw_7]
pmullw m1, [pw_7]
; 3*( ... + src[-2] + src[3])
UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
paddw m0, m2
paddw m1, m3
pmullw m0, [pw_3]
pmullw m1, [pw_3]
; ... - 7*(src[-1] + src[2])
UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
pmullw m2, [pw_7]
pmullw m3, [pw_7]
psubw m0, m2
psubw m1, m3
; ... - (src[-3] + src[4])
UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
psubw m0, m2
psubw m1, m3
paddw m0, [pw_16]
paddw m1, [pw_16]
psraw m0, 5
psraw m1, 5
packuswb m0, m1
mova [dstq + widthq], m0
sub widthd, mmsize
jge .loop
RET
%endmacro
%macro PUT_RECT 1
; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,7,3, dst, dst_stride, src, src_stride, w, dst2, src2
mova m0, [pb_128]
add wd, (mmsize-1)
and wd, ~(mmsize-1)
%ifdef ARCH_X86_64
mov r10d, r5m
mov r11d, wd
%define wspill r11d
%define hd r10d
%else
mov r4m, wd
%define wspill r4m
%define hd r5mp
%endif
.loopy
lea src2q, [srcq+src_strideq*2]
lea dst2q, [dstq+dst_strideq]
.loopx:
sub wd, mmsize
mova m1, [srcq +2*wq]
mova m2, [src2q+2*wq]
packsswb m1, [srcq +2*wq+mmsize]
packsswb m2, [src2q+2*wq+mmsize]
paddb m1, m0
paddb m2, m0
mova [dstq +wq], m1
mova [dst2q+wq], m2
jg .loopx
lea srcq, [srcq+src_strideq*4]
lea dstq, [dstq+dst_strideq*2]
sub hd, 2
mov wd, wspill
jg .loopy
RET
%endm
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,7,3, dst, src, stride, idwt, idwt_stride, w, h
mova m0, [pw_32]
add wd, (mmsize-1)
and wd, ~(mmsize-1)
%ifdef ARCH_X86_64
mov r11d, wd
%define wspill r11d
%else
mov r5m, wd
%define wspill r5m
%endif
.loop:
sub wd, mmsize
movu m1, [srcq +2*wq] ; FIXME: ensure alignment
paddw m1, m0
psraw m1, 6
movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
paddw m2, m0
psraw m2, 6
paddw m1, [idwtq+2*wq]
paddw m2, [idwtq+2*wq+mmsize]
packuswb m1, m2
mova [dstq +wq], m1
jg .loop
lea srcq, [srcq + 2*strideq]
add dstq, strideq
lea idwtq, [idwtq+ 2*idwt_strideq]
sub hd, 1
mov wd, wspill
jg .loop
RET
%endm
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
pxor m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
mova m0, [srcq+i]
mova m1, m0
punpcklbw m0, m4
punpckhbw m1, m4
mova m2, [obmcq+i]
mova m3, m2
punpcklbw m2, m4
punpckhbw m3, m4
pmullw m0, m2
pmullw m1, m3
mova m2, [dstq+2*i]
mova m3, [dstq+2*i+mmsize]
paddw m0, m2
paddw m1, m3
mova [dstq+2*i], m0
mova [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
lea srcq, [srcq+strideq]
lea dstq, [dstq+2*strideq]
add obmcq, 32
sub yblend, 1
jg .loop
RET
%endm
INIT_MMX
%ifndef ARCH_X86_64
PUT_RECT mmx
ADD_RECT mmx
HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx
INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2
|