; Source path: libavfilter/x86/vf_bwdif.asm
; (blob c93b41ec4886d9de71536fbf21a9d575c4b74aa4)
;*****************************************************************************
;* x86-optimized functions for bwdif filter
;*
;* Copyright (C) 2016 Thomas Mundt <loudmax@yahoo.de>
;*
;* Based on yadif simd code
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;*               2013 Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Word constants consumed by the FILTER macro. Each table holds 16 words
; (32 bytes), enough to fill the widest register size assembled from this file.

; Pair multiplied (pmaddwd) with interleaved words: sum at the 4x offsets,
; then the sum of the two centre samples.
pw_coefhf:  times 8 dw  1016,  5570
; Multiplier for the sum at the 2x offsets (used with pmullw/pmulhw).
pw_coefhf1: times 8 dw -3801, -3801
; Pair multiplied (pmaddwd) with interleaved words: cur[t0]+cur[t1], then
; the sum of the two samples at the prefs3/mrefs3 offsets.
pw_coefsp:  times 8 dw  5077,  -981
; Masked adjustment added to pw_coefsp: 5077-768 = 4309, -981+768 = -213.
pw_splfdif: times 8 dw  -768,   768

SECTION .text

;-----------------------------------------------------------------------------
; LOAD8 dst, src
; Reads unsigned bytes from src and widens them to words in dst.
; The non-YMM path widens by interleaving with m7, so m7 must contain zero
; when this macro is used (FILTER clears it before the loop and again at the
; end of every iteration).
; movh comes from the included x86inc/x86util framework (half-register move).
;-----------------------------------------------------------------------------
%macro LOAD8 2
%if mmsize != 32
    movh         %1, %2
    punpcklbw    %1, m7
%else
    pmovzxbw     %1, %2
%endif
%endmacro

; LOAD12 dst, src
; Loads one full register from src with no widening step; the data is used
; exactly as it is stored in memory.
; movu comes from the included x86inc/x86util framework (unaligned
; register-width move).
%macro LOAD12 2
    movu         %1, %2
%endmacro

;-----------------------------------------------------------------------------
; DISP8
; Packs the signed words in m2 down to unsigned bytes with saturation
; (packuswb) and stores the packed bytes at dstq.
;   non-YMM: packs m2 with itself and stores the low half of the register.
;   YMM:     moves the upper 128-bit lane into xm1 (clobbering it), packs both
;            lanes into xm2 and stores 16 bytes.
;-----------------------------------------------------------------------------
%macro DISP8 0
%if mmsize != 32
    packuswb     m2, m2
    movh     [dstq], m2
%else
    vextracti128 xm1, m2, 1
    packuswb     xm2, xm1
    movu         [dstq], xm2
%endif
%endmacro

; DISP12
; Clamps the words in m2 using m7 and m12 as the two bounds, then stores the
; whole register at dstq. FILTER zeroes m7 immediately before invoking this
; macro, and the bwdif_filter_line_12bit prologue fills m12 with clip_max.
; NOTE(review): CLIPW is defined outside this file; its operand order is
; assumed to be (value, min, max) - confirm in x86util.asm.
%macro DISP12 0
    CLIPW        m2, m7, m12
    movu     [dstq], m2
%endmacro

;-----------------------------------------------------------------------------
; FILTER label_id, base_a, base_b, depth, scale
;
; Emits the processing loop for one output line. Arguments:
;   1  suffix that keeps the loop label unique (.loop0 / .loop1)
;   2  first base pointer  ("A" in the comments below)
;   3  second base pointer ("B" in the comments below)
;      PROC passes (prevq, curq) or (curq, nextq) depending on parity
;   4  8 or 12: selects LOAD8/DISP8 or LOAD12/DISP12
;   5  multiplier applied to the offsets t0..t3 when forming addresses
;
; Offsets: t0/t1 are loaded by PROC from prefs/mrefs; on x86-64 t2/t3 hold
; prefs3/mrefs3, on x86-32 r4/r5 are reloaded temporarily instead.
;
; Register roles inside the loop:
;   m0  = cur[t0], m1 = cur[t1] (until overwritten near the end)
;   m7  = zero at loop entry and again before DISP (clobbered in between)
;   m8  = A[0] + B[0]
;   m9  = |A[0] - B[0]|
;   m10 = sum of the four A/B samples at the 2x offsets
;   m11 = final clamp range
;   m8..m11 are stack slots on x86-32 (see PROC).
;
; Every iteration advances dst/prev/cur/next by STEP bytes and subtracts
; mmsize/2 from the width counter kept in wm; the loop runs while it is > 0.
;
; NOTE(review): ABS1, ABS2 and CLIPW are defined outside this file; the
; comments assume absolute-value and clamp(value, min, max) semantics with the
; trailing operands of ABS1/ABS2 being scratch registers - confirm in
; x86util.asm.
;-----------------------------------------------------------------------------
%macro FILTER 5
    ; m7 = 0 (needed by LOAD8 and by the zero comparisons below)
    pxor         m7, m7
.loop%1:
    ; m0 = cur[t0], m1 = cur[t1], m2 = A[0], m3 = B[0]
    LOAD%4       m0, [curq+t0*%5]
    LOAD%4       m1, [curq+t1*%5]
    LOAD%4       m2, [%2]
    LOAD%4       m3, [%3]
    ; m3 = A[0] + B[0], m2 = |A[0] - B[0]|; keep both in m8 / m9
    mova         m4, m3
    paddw        m3, m2
    psubw        m2, m4
    ABS1         m2, m4
    mova         m8, m3
    mova         m9, m2
    ; m3 = |prev[t0] - cur[t0]| + |prev[t1] - cur[t1]|
    LOAD%4       m3, [prevq+t0*%5]
    LOAD%4       m4, [prevq+t1*%5]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4
    ; m2 = max(m2 >> 1, m3 >> 1)
    psrlw        m2, 1
    psrlw        m3, 1
    pmaxsw       m2, m3
    ; same measure against the next pointer, folded into m2 with max
    LOAD%4       m3, [nextq+t0*%5]
    LOAD%4       m4, [nextq+t1*%5]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4
    psrlw        m3, 1
    pmaxsw       m2, m3

    ; m3 = A[2*t0] + B[2*t0], m5 = A[2*t1] + B[2*t1]
    LOAD%4       m3, [%2+t0*2*%5]
    LOAD%4       m4, [%3+t0*2*%5]
    LOAD%4       m5, [%2+t1*2*%5]
    LOAD%4       m6, [%3+t1*2*%5]
    paddw        m3, m4
    paddw        m5, m6
    ; m10 = sum of all four samples, saved for the -3801 term further down
    mova         m6, m3
    paddw        m6, m5
    mova        m10, m6
    ; m3 = (m3 >> 1) - cur[t0], m5 = (m5 >> 1) - cur[t1]
    psrlw        m3, 1
    psrlw        m5, 1
    psubw        m3, m0
    psubw        m5, m1
    ; order the pair: m3 = min of the two, m5 = max of the two
    mova         m6, m3
    pminsw       m3, m5
    pmaxsw       m5, m6
    ; m6 = (m8 >> 1) - cur[t0], m4 = (m8 >> 1) - cur[t1]
    mova         m4, m8
    psraw        m4, 1
    mova         m6, m4
    psubw        m6, m0
    psubw        m4, m1
    ; m3 = max(m3, m6, m4), m5 = min(m5, m6, m4)
    pmaxsw       m3, m6
    pminsw       m5, m6
    pmaxsw       m3, m4
    pminsw       m5, m4
    ; m6 = max(-m3, m5)   (m7 is still zero here)
    mova         m6, m7
    psubw        m6, m3
    pmaxsw       m6, m5
    ; only where m2 > 0: m2 = max(m2, m6); lanes with m2 <= 0 see m6 = 0
    mova         m3, m2
    pcmpgtw      m3, m7
    pand         m6, m3
    pmaxsw       m2, m6
    ; m11 = clamp range used at the end of the iteration
    mova        m11, m2

    ; m2 = A[4*t0] + B[4*t0] + A[4*t1] + B[4*t1]
    LOAD%4       m2, [%2+t0*4*%5]
    LOAD%4       m3, [%3+t0*4*%5]
    LOAD%4       m4, [%2+t1*4*%5]
    LOAD%4       m5, [%3+t1*4*%5]
    paddw        m2, m3
    paddw        m4, m5
    paddw        m2, m4
    ; interleave with m8 and multiply-add: dword = 1016 * m2 + 5570 * m8
    mova         m3, m2
    punpcklwd    m2, m8
    punpckhwd    m3, m8
    pmaddwd      m2, [pw_coefhf]
    pmaddwd      m3, [pw_coefhf]
    ; 32-bit products m10 * -3801: pmullw yields the low words, pmulhw the
    ; high words, and interleaving the two forms the dwords
    mova         m4, m10
    mova         m6, m4
    pmullw       m4, [pw_coefhf1]
    pmulhw       m6, [pw_coefhf1]
    mova         m5, m4
    punpcklwd    m4, m6
    punpckhwd    m5, m6
    ; m2 / m3 = (sum of the three terms) >> 2, low and high halves
    paddd        m2, m4
    paddd        m3, m5
    psrad        m2, 2
    psrad        m3, 2

    ; m4 = cur[t0] (copy), m0 = cur[t0] + cur[t1]
    mova         m4, m0
    paddw        m0, m1
%if ARCH_X86_64
    ; m5 = cur[t2], m6 = cur[t3]
    LOAD%4       m5, [curq+t2*%5]
    LOAD%4       m6, [curq+t3*%5]
%else
    ; only t0/t1 (r4/r5) exist on x86-32: load prefs3/mrefs3 into them for
    ; these two reads, then put prefs/mrefs back for the next iteration
    mov          r4, prefs3mp
    mov          r5, mrefs3mp
    LOAD%4       m5, [curq+t0*%5]
    LOAD%4       m6, [curq+t1*%5]
    mov          r4, prefsmp
    mov          r5, mrefsmp
%endif
    ; m6 = sum of the two samples just loaded
    paddw        m6, m5
    ; m1 = word mask: |cur[t1] - cur[t0]| > m9
    psubw        m1, m4
    ABS1         m1, m4
    pcmpgtw      m1, m9
    ; duplicate each mask word so it covers a whole dword (m1 low, m4 high)
    mova         m4, m1
    punpcklwd    m1, m4
    punpckhwd    m4, m4
    ; keep the term computed above only where the mask is set
    pand         m2, m1
    pand         m3, m4
    ; coefficient pair per dword: pw_coefsp, plus pw_splfdif where the mask
    ; is set, i.e. (5077, -981) without mask and (4309, -213) with mask.
    ; m7 is used as a temporary here and is re-zeroed before DISP.
    mova         m5, [pw_splfdif]
    mova         m7, m5
    pand         m5, m1
    pand         m7, m4
    paddw        m5, [pw_coefsp]
    paddw        m7, [pw_coefsp]
    ; interleave (m0, m6) words and multiply-add with the chosen pair
    mova         m4, m0
    punpcklwd    m0, m6
    punpckhwd    m4, m6
    pmaddwd      m0, m5
    pmaddwd      m4, m7
    ; add the masked term, shift right by 13 and pack back to signed words
    paddd        m2, m0
    paddd        m3, m4
    psrad        m2, 13
    psrad        m3, 13
    packssdw     m2, m3

    ; clamp m2 to [(m8 >> 1) - m11, (m8 >> 1) + m11]
    mova         m4, m8
    psraw        m4, 1
    mova         m0, m11
    mova         m3, m4
    psubw        m4, m0
    paddw        m3, m0
    CLIPW        m2, m4, m3
    ; restore m7 = 0 for DISP12 and for the next iteration
    pxor         m7, m7
    DISP%4

    ; advance all pointers and loop while the remaining width is positive
    add        dstq, STEP
    add       prevq, STEP
    add        curq, STEP
    add       nextq, STEP
    sub    DWORD wm, mmsize/2
    jg .loop%1
%endmacro

;-----------------------------------------------------------------------------
; PROC depth, scale
; Function body shared by both entry points. Loads the line offsets into the
; temporary registers, then runs one of two FILTER instances selected by the
; parity argument:
;   parity != 0 : FILTER on (prevq, curq)
;   parity == 0 : FILTER on (curq, nextq)
; Both arguments are forwarded unchanged to FILTER (arguments 4 and 5).
;-----------------------------------------------------------------------------
%macro PROC 2
%if ARCH_X86_64
    ; t0..t3 = prefs, mrefs, prefs3, mrefs3, sign-extended from 32 bits
    DECLARE_REG_TMP 5, 6, 7, 8
    movsxd       t0, DWORD prefsm
    movsxd       t1, DWORD mrefsm
    movsxd       t2, DWORD prefs3m
    movsxd       t3, DWORD mrefs3m
%else
    ; m8..m11 live in the stack area reserved by cglobal, 16 bytes apiece
    %define m8  [rsp+ 0]
    %define m9  [rsp+16]
    %define m10 [rsp+32]
    %define m11 [rsp+48]
    ; only two temporaries: t0 = prefs, t1 = mrefs (FILTER reloads them for
    ; the prefs3/mrefs3 accesses)
    DECLARE_REG_TMP 4, 5
    mov          t0, prefsmp
    mov          t1, mrefsmp
%endif
    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq, %1, %2
    jmp .ret
.parity0:
    FILTER 0, curq, nextq, %1, %2
.ret:
    RET
%endmacro

;-----------------------------------------------------------------------------
; BWDIF
; Emits both entry points for the instruction set selected by the preceding
; INIT_XMM / INIT_YMM line:
;   bwdif_filter_line        PROC 8, 1   with STEP = mmsize/2 bytes
;   bwdif_filter_line_12bit  PROC 12, 2  with STEP = mmsize bytes
; Both declare the same argument names (dst, prev, cur, next, w, prefs, mrefs,
; prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max).
; The numbers after the function name follow the x86inc cglobal convention
; (arguments loaded into registers, general registers, vector registers,
; stack bytes) - see x86inc.asm for the exact meaning.
; On x86-32 the stack reservation backs the m8..m11 slots defined in PROC
; (4 x 16 = 64 bytes); the 12-bit variant adds a fifth slot for m12 (80 bytes).
;-----------------------------------------------------------------------------
%macro BWDIF 0
%if ARCH_X86_64
cglobal bwdif_filter_line, 4, 9, 12, 0, dst, prev, cur, next, w, prefs, \
                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
                                        prefs4, mrefs4, parity, clip_max
%else
cglobal bwdif_filter_line, 4, 6, 8, 64, dst, prev, cur, next, w, prefs, \
                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
                                        prefs4, mrefs4, parity, clip_max
%endif
    ; byte-wide samples: each iteration consumes half a register of input
    %define STEP mmsize/2
    PROC 8, 1

%if ARCH_X86_64
cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
                                              prefs, mrefs, prefs2, mrefs2, \
                                              prefs3, mrefs3, prefs4, \
                                              mrefs4, parity, clip_max
    ; m12 = clip_max replicated into every word (upper clamp bound in DISP12)
    %if mmsize == 32
        vpbroadcastw m12, WORD clip_maxm
    %else
    movd        m12, DWORD clip_maxm
    SPLATW      m12, m12, 0
    %endif
%else
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, prefs2, mrefs2, \
                                              prefs3, mrefs3, prefs4, \
                                              mrefs4, parity, clip_max
    ; no m12 register on x86-32: build the splat in m0 and park it on the stack
    %define m12 [rsp+64]
    movd         m0, DWORD clip_maxm
    SPLATW       m0, m0, 0
    mova        m12, m0
%endif
    ; word-wide samples: each iteration consumes a full register of input
    %define STEP mmsize
    PROC 12, 2
%endmacro

; Instantiate the function pair once per instruction set. INIT_XMM/INIT_YMM
; (from x86inc) select the register width seen by the macros above as mmsize.
INIT_XMM ssse3
BWDIF
INIT_XMM sse2
BWDIF

; The YMM build is assembled only when HAVE_AVX2_EXTERNAL is set and the
; target is x86-64 (the macros above rely on m8..m12 being real registers in
; that configuration).
%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
INIT_YMM avx2
BWDIF
%endif