1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
|
;*****************************************************************************
;* x86-optimized functions for idet filter
;*
;* Copyright (C) 2014 Pascal Massimino (pascal.massimino@gmail.com)
;* Copyright (c) 2014 Neil Birkbeck (birkbeck@google.com)
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_TEXT
; Implementation that does 8-bytes at a time using single-word operations.
; IDET_FILTER_LINE cpu
;   cpu is forwarded to INIT_MMX and selects the instruction set of the
;   generated function.
;
; Generates idet_filter_line(a, b, c, width), result in eax.
; Three byte rows are walked 8 bytes per iteration. Each byte is widened to a
; 16-bit word, a + c - 2*b is formed per pixel, ABS2 is applied to those words
; and the results are accumulated as dwords in m_sum.
; The loop body executes before the first bound check, so 8 bytes of every row
; are read even when width is smaller than that.
%macro IDET_FILTER_LINE 1
INIT_MMX %1
cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
%define   m_zero m2
%define   m_sum  m5
    pxor      m_zero, m_zero
    pxor      m_sum, m_sum
    xor       indexq, indexq

.loop:
    ; row a: m0 = low four bytes as words, m1 = high four bytes as words
    movu      m0, [aq + indexq*1]
    mova      m1, m0
    punpcklbw m0, m_zero
    punpckhbw m1, m_zero

    ; row c: m3 = low half, m4 = high half
    movu      m3, [cq + indexq*1]
    mova      m4, m3
    punpcklbw m3, m_zero
    punpckhbw m4, m_zero

    ; a + c
    paddsw    m0, m3
    paddsw    m1, m4

    ; row b, widened and doubled: m3 = 2 * low half, m4 = 2 * high half
    movu      m3, [bq + indexq*1]
    mova      m4, m3
    punpcklbw m3, m_zero
    punpckhbw m4, m_zero
    paddw     m3, m3
    paddw     m4, m4

    ; (a + c) - 2*b
    psubsw    m0, m3
    psubsw    m1, m4

    ; ABS2 comes from x86util.asm; m4 and m3 are handed over as its scratch
    ; registers, their previous contents are not needed any more.
    ABS2      m1, m0, m4, m3

    ; add the two halves as words, widen the four sums to dwords and
    ; accumulate them
    paddw     m0, m1
    mova      m1, m0
    punpcklwd m0, m_zero
    punpckhwd m1, m_zero
    paddd     m0, m1
    paddd     m_sum, m0

    add       indexq, 0x8
    CMP       indexd, widthd         ; signed test: continue while index < width
    jl        .loop

    HADDD     m_sum, m0              ; horizontal dword sum, m0 is scratch
    movd      eax, m_sum
    RET
%endmacro
; The MMX-register versions of idet_filter_line are assembled only when
; ARCH_X86_32 is set. The macro is expanded once per instruction set name;
; each expansion passes that name to INIT_MMX.
%if ARCH_X86_32
IDET_FILTER_LINE mmxext
IDET_FILTER_LINE mmx
%endif
;******************************************************************************
; 16bit implementation that does 4/8-pixels at a time
; PABS_DIFF_WD a, b, junk
;   a    : in  = unsigned words, out = |a - b| widened to dwords, with the
;          low-half and high-half dwords added together
;   b    : in  = unsigned words, overwritten
;   junk : scratch register, overwritten
; Relies on m_zero being defined as an all-zero register at the point where
; the macro is expanded.
%macro PABS_DIFF_WD 3 ; a, b, junk , output=a
    mova      %3, %2
    psubusw   %3, %1                 ; saturating b - a
    psubusw   %1, %2                 ; saturating a - b
    por       %1, %3                 ; one of the two is zero, so OR gives |a - b|
    punpckhwd %2, %1, m_zero         ; high words as dwords (b is free by now)
    punpcklwd %1, m_zero             ; low words as dwords
    paddd     %1, %2
%endmacro
; IDET_FILTER_LINE_16BIT step
;   step is the number of 16-bit samples consumed per iteration; it has to
;   match the register width chosen by the INIT_* line that precedes the
;   expansion.
;
; Generates idet_filter_line_16bit(a, b, c, width), result in eax.
; index counts samples, hence the *2 scale in every address. For each sample
; the loop accumulates |ab - bc| + |ba - cb|, where ab, ba, bc and cb are the
; unsigned saturating differences a-b, b-a, b-c and c-b.
; The loop body executes before the first bound check, so one full register of
; every row is always read.
%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
%define   m_zero m1
%define   m_sum  m0
    pxor      m_zero, m_zero
    pxor      m_sum, m_sum
    xor       indexq, indexq

.loop_16bit:
    movu      m3, [aq + indexq * 2]  ; A
    movu      m2, [bq + indexq * 2]  ; B
    movu      m4, [cq + indexq * 2]  ; C

    mova      m5, m2
    psubusw   m5, m3                 ; ba
    psubusw   m3, m2                 ; ab
    psubusw   m6, m2, m4             ; bc
    psubusw   m4, m2                 ; cb

    PABS_DIFF_WD m3, m6, m7          ; |ab - bc|
    PABS_DIFF_WD m5, m4, m7          ; |ba - cb|
    paddd     m_sum, m3
    paddd     m_sum, m5

    add       indexq, %1
    CMP       indexd, widthd         ; signed test: continue while index < width
    jl        .loop_16bit

    HADDD     m_sum, m2              ; horizontal dword sum, m2 is scratch
    movd      eax, m_sum
    RET
%endmacro
; SSE2 version of idet_filter_line_16bit: a 128-bit XMM register holds eight
; 16-bit samples, so the index advances by 8 per iteration.
INIT_XMM sse2
IDET_FILTER_LINE_16BIT 8
; MMX version, assembled only when ARCH_X86_32 is set: a 64-bit MMX register
; holds four 16-bit samples, so the index advances by 4 per iteration.
%if ARCH_X86_32
INIT_MMX mmx
IDET_FILTER_LINE_16BIT 4
%endif
;******************************************************************************
; SSE2 8-bit implementation that does 16-bytes at a time:
; idet_filter_line(a, b, c, width), result in eax.
;
; Walks three byte rows 16 bytes per iteration. With the unsigned saturating
; differences ab = a-b, ba = b-a, bc = b-c and cb = c-b, every byte adds
; |ab - bc| + |ba - cb| to the total. psadbw performs the absolute difference
; and the horizontal byte sum in one step, leaving one sum per 64-bit lane;
; m0 and m1 are two independent 64-bit-lane accumulators that are merged after
; the loop.
;
; Only five general-purpose registers are requested: the four arguments plus
; the index. No further scratch register is used by the code below.
;
; NOTE(review): the loop body executes before the first bound check and there
; is no tail handling, so callers presumably pass a positive width that is a
; multiple of 16 - confirm against the C wrapper.
INIT_XMM sse2
cglobal idet_filter_line, 4, 5, 7, a, b, c, width, index
    xor       indexq, indexq
    pxor      m0, m0                 ; accumulator for |ab - bc|
    pxor      m1, m1                 ; accumulator for |ba - cb|

.sse2_loop:
    movu      m2, [bq + indexq*1]    ; B
    movu      m3, [aq + indexq*1]    ; A
    mova      m6, m2
    mova      m4, m3
    psubusb   m5, m2, m3             ; ba
    movu      m3, [cq + indexq*1]    ; C
    add       indexq, 0x10
    psubusb   m4, m2                 ; ab
    CMP       indexd, widthd         ; flags stay valid until jl: the SIMD
                                     ; instructions in between do not write them
    psubusb   m6, m3                 ; bc
    psubusb   m3, m2                 ; cb
    psadbw    m4, m6                 ; |ab - bc|
    paddq     m0, m4
    psadbw    m5, m3                 ; |ba - cb|
    paddq     m1, m5
    jl        .sse2_loop

    paddq     m0, m1                 ; merge the two accumulators
    movhlps   m1, m0                 ; bring the upper 64-bit lane down
    paddq     m0, m1
    movd      eax, m0                ; low 32 bits of the total
    RET
|