libavfilter/x86/vf_idetdsp.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

;*****************************************************************************
;* x86-optimized functions for idet filter
;*
;* Copyright (C) 2014 Pascal Massimino ([email protected])
;* Copyright (c) 2014 Neil Birkbeck ([email protected])
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;******************************************************************************
; 16-bit implementation that processes mmsize/2 pixels (8 per XMM register) per loop iteration

%macro PABS_DIFF_WD 3    ; a, b, junk   , output=a
  psubusw   %3, %2, %1
  psubusw   %1, %2
  por       %1, %3

  mova      %2, %1
  punpcklwd %1, m_zero
  punpckhwd %2, m_zero
  paddd     %1, %2
%endmacro

%macro IDET_FILTER_LINE_16BIT 0
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
    xor       indexq, indexq
%define m_zero m1
%define m_sum  m0
    pxor      m_sum, m_sum
    pxor      m_zero, m_zero

.loop_16bit:
    movu      m2, [bq + indexq * 2]  ; B
    movu      m3, [aq + indexq * 2]  ; A
    mova      m6, m2
    psubusw   m5, m2, m3             ; ba

    movu      m4, [cq + indexq * 2]  ; C
    add       indexq, mmsize >> 1
    psubusw   m3, m2                 ; ab
    CMP       indexd, widthd

    psubusw   m6, m4                 ; bc
    psubusw   m4, m2                 ; cb

    PABS_DIFF_WD   m3, m6, m7        ; |ab - bc|
    PABS_DIFF_WD   m5, m4, m7        ; |ba - cb|
    paddd          m_sum, m3
    paddd          m_sum, m5
    jl        .loop_16bit

%if mmsize > 32
    vextracti64x4 ym1, m0, 1
    paddq     ym0, ym1
%endif
    HADDD     m_sum, m2
    movd      eax, m_sum
    RET
%endmacro

; Instantiate the 16-bit kernel once per instruction set.
; NOTE(review): all three instantiations use INIT_XMM, so mmsize is 16 in every
; variant: the avx2 and avx512icl builds still step by mmsize >> 1 = 8 samples
; per iteration, and the "%if mmsize > 32" reduction inside the macro is never
; assembled. The 8-bit kernel further down pairs the same cpu flags with
; INIT_YMM / INIT_ZMM. Confirm whether this difference is intentional before
; widening - the caller's width granularity and the final HADDD/movd sequence
; would both have to agree with the wider registers.
INIT_XMM sse2
IDET_FILTER_LINE_16BIT

INIT_XMM avx2
IDET_FILTER_LINE_16BIT

INIT_XMM avx512icl
IDET_FILTER_LINE_16BIT

;******************************************************************************
; 8-bit implementation that processes mmsize bytes per loop iteration (16 for SSE2, 32 for AVX2, 64 for AVX-512):

;------------------------------------------------------------------------------
; idet_filter_line(a, b, c, width)
;
; Emits one function per INIT_* / cpu-flag combination.
;   a, b, c : pointers to three lines of unsigned bytes (loaded unaligned)
;   width   : number of bytes to process
;   return  : eax = low 32 bits of the summed absolute differences
;             SAD(ab, bc) + SAD(ba, cb), where xy = max(x - y, 0) per byte.
;
; The loop is bottom-tested and consumes mmsize bytes per iteration, so one
; full vector is always processed. The `total` register is declared in the
; cglobal line but is not referenced by this function.
;------------------------------------------------------------------------------
%macro IDET_FILTER_LINE 0
cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
    pxor      m0, m0                 ; accumulator for SAD(ab, bc)
    pxor      m1, m1                 ; accumulator for SAD(ba, cb)
    xor       indexq, indexq

.loop:
    ; Fetch one vector from each of the three lines.
    movu      m2, [bq + indexq*1]  ; B
    movu      m3, [aq + indexq*1]  ; A
    movu      m4, [cq + indexq*1]  ; C

    ; One-sided (unsigned saturating) differences against B.
    psubusb   m5, m2, m3           ; ba = max(B - A, 0)
    psubusb   m3, m2               ; ab = max(A - B, 0)
    psubusb   m6, m2, m4           ; bc = max(B - C, 0)
    psubusb   m4, m2               ; cb = max(C - B, 0)

    ; psadbw produces one partial sum per 64-bit lane; accumulate as qwords.
    psadbw    m3, m6               ; |ab - bc|
    psadbw    m5, m4               ; |ba - cb|
    paddq     m0, m3
    paddq     m1, m5

    add       indexq, mmsize
    CMP       indexd, widthd
    jl       .loop                 ; signed compare: loop while index < width

    ; Merge both accumulators, then halve the vector repeatedly until a single
    ; qword remains in the low lane of xm0.
    paddq     m0, m1
%if mmsize >= 64
    vextracti64x4 ym1, m0, 1       ; 512 -> 256 bits
    paddq     ym0, ym1
%endif
%if mmsize >= 32
    vextracti128 xm1, ym0, 1       ; 256 -> 128 bits
    paddq     xm0, xm1
%endif
    movhlps   xm1, xm0             ; 128 -> 64 bits
    paddq     xm0, xm1
    movd      eax, xm0             ; return the low dword of the total
    RET
%endmacro

; Instantiate the 8-bit kernel once per instruction set; each variant steps by
; mmsize bytes per iteration, using the register width selected by its INIT_*
; line (XMM for SSE2, YMM for AVX2, ZMM for AVX-512).
INIT_XMM sse2
IDET_FILTER_LINE

INIT_YMM avx2
IDET_FILTER_LINE

INIT_ZMM avx512icl
IDET_FILTER_LINE