1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
|
;*****************************************************************************
;* x86-optimized functions for idet filter
;*
;* Copyright (C) 2014 Pascal Massimino ([email protected])
;* Copyright (c) 2014 Neil Birkbeck ([email protected])
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;******************************************************************************
; 16-bit implementation that processes mmsize/2 pixels (one full vector) per iteration
; PABS_DIFF_WD a, b, tmp
;   In:   a, b = vectors of unsigned 16-bit words
;   Out:  a    = |a - b| per word, zero-extended to dwords, with the low-half
;                and high-half dwords added together (dword lanes)
;   Clobbers b and tmp. Reads m_zero, which must hold all-zero bits.
%macro PABS_DIFF_WD 3
    ; Unsigned saturating subtract in both directions: one result is the
    ; absolute difference, the other is zero, so OR-ing them yields |a - b|.
    psubusw   %3, %2, %1            ; tmp = sat(b - a)
    psubusw   %1, %2                ; a   = sat(a - b)
    por       %1, %3                ; a   = |a - b|

    ; Widen the words to dwords by interleaving with zero, then fold the two
    ; halves into one vector of dword partial sums.
    punpckhwd %2, %1, m_zero        ; b = upper words of |a - b| as dwords
    punpcklwd %1, m_zero            ; a = lower words of |a - b| as dwords
    paddd     %1, %2
%endmacro
;------------------------------------------------------------------------------
; idet_filter_line_16bit(a, b, c, width)
;
; a, b, c : pointers to three rows of unsigned 16-bit samples (unaligned loads)
; width   : number of 16-bit samples; compared as a signed 32-bit value
; returns : eax = low 32 bits of
;             sum_i |sat(a[i]-b[i]) - sat(b[i]-c[i])|
;                 + |sat(b[i]-a[i]) - sat(c[i]-b[i])|
;           where sat() is an unsigned saturating subtract.
;
; The loop is do-while shaped: it always handles at least one full vector and
; continues while index < width, so it consumes width rounded up to a multiple
; of mmsize/2 samples.
;
; Registers: m0 = m_sum (dword accumulators), m1 = m_zero, m2-m7 scratch.
;------------------------------------------------------------------------------
%macro IDET_FILTER_LINE_16BIT 0
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
    xor       indexq, indexq
%define m_zero m1
%define m_sum  m0
    pxor      m_sum, m_sum
    pxor      m_zero, m_zero

.loop_16bit:
    movu      m2, [bq + indexq * 2]  ; B
    movu      m3, [aq + indexq * 2]  ; A
    mova      m6, m2
    psubusw   m5, m2, m3             ; ba

    movu      m4, [cq + indexq * 2]  ; C
    add       indexq, mmsize >> 1    ; advance by one vector worth of words
    psubusw   m3, m2                 ; ab
    CMP       indexd, widthd         ; flags stay live until the jl below;
                                     ; only SIMD instructions follow

    psubusw   m6, m4                 ; bc
    psubusw   m4, m2                 ; cb

    PABS_DIFF_WD   m3, m6, m7        ; |ab - bc|
    PABS_DIFF_WD   m5, m4, m7        ; |ba - cb|
    paddd     m_sum, m3
    paddd     m_sum, m5
    jl        .loop_16bit

    ; Horizontal reduction. m_zero (m1) is dead after the loop and serves as
    ; scratch. The accumulator lanes are dwords, so every fold uses paddd.
    ; The vector is narrowed to 128 bits explicitly, the same way the 8-bit
    ; kernel does, so HADDD only ever sees an xmm register.
%if mmsize > 32
    vextracti64x4 ym1, m_sum, 1
    paddd     ym0, ym1
%endif
%if mmsize > 16
    vextracti128  xm1, ym0, 1
    paddd     xm0, xm1
%endif
    HADDD     xm0, xm2
    movd      eax, xm0
    RET
%endmacro

; One instance per instruction set, using the same vector width per CPU flag
; as the 8-bit kernel in this file.
; NOTE(review): the avx2 / avx512icl variants consume 16 / 32 samples per
; iteration; confirm that their callers round width accordingly.
INIT_XMM sse2
IDET_FILTER_LINE_16BIT

INIT_YMM avx2
IDET_FILTER_LINE_16BIT

INIT_ZMM avx512icl
IDET_FILTER_LINE_16BIT
;******************************************************************************
; 8-bit implementation (SSE2/AVX2/AVX-512) that processes mmsize bytes at a time:
;------------------------------------------------------------------------------
; idet_filter_line(a, b, c, width)
;
; a, b, c : pointers to three rows of unsigned bytes (unaligned loads)
; width   : number of bytes; compared as a signed 32-bit value
; returns : eax = low 32 bits of
;             sum_i |sat(a[i]-b[i]) - sat(b[i]-c[i])|
;                 + |sat(b[i]-a[i]) - sat(c[i]-b[i])|
;           where sat() is an unsigned saturating subtract.
;
; The loop is do-while shaped: it always handles at least one full vector and
; continues while index < width, so it consumes width rounded up to a multiple
; of mmsize bytes.
;
; Registers: m0, m1 = two independent qword accumulators (psadbw results),
;            m2-m6 scratch. Only five GPRs are needed (4 args + index).
;------------------------------------------------------------------------------
%macro IDET_FILTER_LINE 0
cglobal idet_filter_line, 4, 5, 7, a, b, c, width, index
    xor       indexq, indexq
    pxor      m0, m0
    pxor      m1, m1

.loop_8bit:
    movu      m2, [bq + indexq*1]  ; B
    movu      m3, [aq + indexq*1]  ; A
    mova      m6, m2
    mova      m4, m3
    psubusb   m5, m2, m3           ; ba

    movu      m3, [cq + indexq*1]  ; C
    add       indexq, mmsize       ; advance by one vector worth of bytes
    psubusb   m4, m2               ; ab
    CMP       indexd, widthd       ; flags stay live until the jl below;
                                   ; only SIMD instructions follow

    psubusb   m6, m3               ; bc
    psubusb   m3, m2               ; cb

    ; psadbw sums the byte-wise absolute differences of each 8-byte group
    ; into a qword lane, so the accumulators are added with paddq.
    psadbw    m4, m6               ; |ab - bc|
    paddq     m0, m4
    psadbw    m5, m3               ; |ba - cb|
    paddq     m1, m5
    jl        .loop_8bit

    ; Merge the two accumulators, then fold the vector down to one qword.
    paddq     m0, m1
%if mmsize > 32
    vextracti64x4 ym1, m0, 1       ; upper 256 bits
    paddq     ym0, ym1
%endif
%if mmsize > 16
    vextracti128  xm1, ym0, 1      ; upper 128 bits
    paddq     xm0, xm1
%endif
    movhlps   xm1, xm0             ; upper qword
    paddq     xm0, xm1
    movd      eax, xm0             ; return the low 32 bits of the total
    RET
%endmacro
; Expand the 8-bit kernel three times. Each INIT_* line names the vector
; register class (XMM/YMM/ZMM) and CPU flag for the expansion that follows;
; the macro body adapts through mmsize (its step size and the
; "%if mmsize > 16/32" reduction stages).
INIT_XMM sse2
IDET_FILTER_LINE
INIT_YMM avx2
IDET_FILTER_LINE
INIT_ZMM avx512icl
IDET_FILTER_LINE
|