libavfilter/x86/vf_idet.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

;*****************************************************************************
;* x86-optimized functions for idet filter
;*
;* Copyright (C) 2014 Pascal Massimino (pascal.massimino@gmail.com)
;* Copyright (c) 2014 Neil Birkbeck (birkbeck@google.com)
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

; Implementation that does 8-bytes at a time using single-word operations.
%macro IDET_FILTER_LINE 1
INIT_MMX %1
cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
    xor       indexq, indexq
%define   m_zero m2
%define   m_sum  m5
    pxor      m_sum, m_sum
    pxor      m_zero, m_zero

.loop:
    movu      m0, [aq + indexq*1]
    punpckhbw m1, m0, m_zero
    punpcklbw m0, m_zero

    movu      m3, [cq + indexq*1]
    punpckhbw m4, m3, m_zero
    punpcklbw m3, m_zero

    paddsw    m1, m4
    paddsw    m0, m3

    movu      m3, [bq + indexq*1]
    punpckhbw m4, m3, m_zero
    punpcklbw m3, m_zero

    paddw     m4, m4
    paddw     m3, m3
    psubsw    m1, m4
    psubsw    m0, m3

    ABS2      m1, m0, m4, m3

    paddw     m0, m1
    punpckhwd m1, m0, m_zero
    punpcklwd m0, m_zero

    paddd     m0, m1
    paddd     m_sum, m0

    add       indexq, 0x8
    CMP       widthd, indexd
    jg        .loop

    HADDD     m_sum, m0
    movd      eax, m_sum
    RET
%endmacro

%if ARCH_X86_32
IDET_FILTER_LINE mmxext
IDET_FILTER_LINE mmx
%endif

;******************************************************************************
; 16bit implementation that does 4/8-pixels at a time

%macro PABS_DIFF_WD 3    ; a, b, junk   , output=a
  psubusw   %3, %2, %1
  psubusw   %1, %2
  por       %1, %3

  mova      %2, %1
  punpcklwd %1, m_zero
  punpckhwd %2, m_zero
  paddd     %1, %2
%endmacro

%macro IDET_FILTER_LINE_16BIT 1   ; %1=increment (4 or 8 words)
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
    xor       indexq, indexq
%define m_zero m1
%define m_sum  m0
    pxor      m_sum, m_sum
    pxor      m_zero, m_zero

.loop_16bit:
    movu      m2, [bq + indexq * 2]  ; B
    movu      m3, [aq + indexq * 2]  ; A
    mova      m6, m2
    psubusw   m5, m2, m3             ; ba

    movu      m4, [cq + indexq * 2]  ; C
    add       indexq, %1
    psubusw   m3, m2                 ; ab
    CMP       indexd, widthd

    psubusw   m6, m4                 ; bc
    psubusw   m4, m2                 ; cb

    PABS_DIFF_WD   m3, m6, m7        ; |ab - bc|
    PABS_DIFF_WD   m5, m4, m7        ; |ba - cb|
    paddd          m_sum, m3
    paddd          m_sum, m5
    jl        .loop_16bit

    HADDD     m_sum, m2
    movd      eax, m_sum
    RET
%endmacro

INIT_XMM sse2
IDET_FILTER_LINE_16BIT 8
%if ARCH_X86_32
INIT_MMX mmx
IDET_FILTER_LINE_16BIT 4
%endif

;******************************************************************************
; SSE2 8-bit implementation that does 16-bytes at a time:

INIT_XMM sse2
cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
    xor       indexq, indexq
    pxor      m0, m0
    pxor      m1, m1

.sse2_loop:
    movu      m2, [bq + indexq*1]  ; B
    movu      m3, [aq + indexq*1]  ; A
    mova      m6, m2
    mova      m4, m3
    psubusb   m5, m2, m3           ; ba

    movu      m3, [cq + indexq*1]  ; C
    add       indexq, 0x10
    psubusb   m4, m2               ; ab
    CMP       indexd, widthd

    psubusb   m6, m3               ; bc
    psubusb   m3, m2               ; cb

    psadbw    m4, m6               ; |ab - bc|
    paddq     m0, m4
    psadbw    m5, m3               ; |ba - cb|
    paddq     m1, m5
    jl       .sse2_loop

    paddq     m0, m1
    movhlps   m1, m0
    paddq     m0, m1
    movd      eax, m0
    RET