aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/takdsp.asm
blob: c9aec5711457014586f662a32ffae5aab4658894 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
;******************************************************************************
;* TAK DSP SIMD optimizations
;*
;* Copyright (C) 2015 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Four dwords of 128 (= 1 << 7). tak_decorrelate_sf adds this to every product
; right before its arithmetic right shift by 8, i.e. it is the rounding bias
; for that shift.
pd_128: times 4 dd 128

SECTION .text

;------------------------------------------------------------------------------
; TAK_DECORRELATE
;
; Emits three two-buffer decorrelation kernels (ls, sr, sm) for whichever
; vector width the preceding INIT_XMM/INIT_YMM selected (mmsize = vector size
; in bytes).
;
; Conventions shared by all three kernels, as visible in the code:
;   - p1 and p2 point to buffers of dword (32-bit) elements; length is the
;     element count and is converted to a byte count by "shl lengthd, 2".
;   - Both pointers are advanced to the end of their buffers and lengthq is
;     negated, so [pN+lengthq] walks forward while lengthq climbs towards
;     zero. The flags left by the final "add" are the loop condition.
;   - Each iteration handles two vectors (mmsize*2 bytes) and the loop is
;     bottom-tested, so it always executes at least once.
;     NOTE(review): this presumes length > 0 and buffers that are valid for
;     a whole number of mmsize*2-byte steps -- confirm against the callers.
;   - All loads/stores use mova, x86inc's aligned move, so both buffers are
;     presumably required to be mmsize-aligned -- TODO confirm.
;------------------------------------------------------------------------------
%macro TAK_DECORRELATE 0
; tak_decorrelate_ls: p2[i] = p1[i] + p2[i]; p1 is only read.
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
    ; Elements -> bytes. Writing the 32-bit register also zero-extends the
    ; value into lengthq on x86-64.
    shl                     lengthd, 2
    ; Point both buffers at their ends, then index them with a negative offset.
    add                         p1q, lengthq
    add                         p2q, lengthq
    neg                     lengthq
.loop:
    ; m0/m1 = two consecutive vectors of p1
    mova                         m0, [p1q+lengthq+mmsize*0]
    mova                         m1, [p1q+lengthq+mmsize*1]
    ; += the matching vectors of p2 (per-dword wrapping add)
    paddd                        m0, [p2q+lengthq+mmsize*0]
    paddd                        m1, [p2q+lengthq+mmsize*1]
    ; The sum replaces p2.
    mova     [p2q+lengthq+mmsize*0], m0
    mova     [p2q+lengthq+mmsize*1], m1
    ; Step by two vectors; keep looping while the offset is still negative.
    add                     lengthq, mmsize*2
    jl .loop
    RET

; tak_decorrelate_sr: p1[i] = p2[i] - p1[i]; p2 is only read.
cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
    ; Same end-pointer / negative-offset setup as tak_decorrelate_ls.
    shl                     lengthd, 2
    add                         p1q, lengthq
    add                         p2q, lengthq
    neg                     lengthq

.loop:
    ; Load p2 first because psubd computes dst - src and the result must be
    ; p2 - p1.
    mova                         m0, [p2q+lengthq+mmsize*0]
    mova                         m1, [p2q+lengthq+mmsize*1]
    psubd                        m0, [p1q+lengthq+mmsize*0]
    psubd                        m1, [p1q+lengthq+mmsize*1]
    ; The difference replaces p1.
    mova     [p1q+lengthq+mmsize*0], m0
    mova     [p1q+lengthq+mmsize*1], m1
    add                     lengthq, mmsize*2
    jl .loop
    RET

; tak_decorrelate_sm: both buffers are rewritten:
;     p1[i] = p1[i] - (p2[i] >> 1)      (arithmetic shift)
;     p2[i] = p2[i] + new p1[i]
; Register use: m0/m1/m2 serve the first vector, m3/m4/m5 the second.
cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
    ; Same end-pointer / negative-offset setup as the kernels above.
    shl                     lengthd, 2
    add                         p1q, lengthq
    add                         p2q, lengthq
    neg                     lengthq

.loop:
    ; m0 = p1, m1 = p2 (first vector); m3 = p1, m4 = p2 (second vector)
    mova                         m0, [p1q+lengthq]
    mova                         m1, [p2q+lengthq]
    mova                         m3, [p1q+lengthq+mmsize]
    mova                         m4, [p2q+lengthq+mmsize]
    ; m2/m5 = p2 >> 1. The three-operand form keeps the unshifted p2 in m1/m4,
    ; which is still needed for the sum below.
    psrad                        m2, m1, 1
    psrad                        m5, m4, 1
    ; m0/m3 = p1 - (p2 >> 1): the new p1
    psubd                        m0, m2
    psubd                        m3, m5
    ; m1/m4 = p2 + new p1: the new p2
    paddd                        m1, m0
    paddd                        m4, m3
    mova              [p1q+lengthq], m0
    mova              [p2q+lengthq], m1
    mova       [p1q+lengthq+mmsize], m3
    mova       [p2q+lengthq+mmsize], m4
    add                     lengthq, mmsize*2
    jl .loop
    RET
%endmacro

; Instantiate the ls/sr/sm kernels once with XMM registers for SSE2 ...
INIT_XMM sse2
TAK_DECORRELATE
; ... and once more with YMM registers for AVX2, but only when
; HAVE_AVX2_EXTERNAL is non-zero (presumably a build-configuration flag saying
; the assembler can emit AVX2 -- confirm in the build system).
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
TAK_DECORRELATE
%endif

;------------------------------------------------------------------------------
; TAK_DECORRELATE_SF
;
; Emits tak_decorrelate_sf for the vector width selected by the preceding
; INIT_XMM/INIT_YMM. Per dword element it computes
;
;     p1[i] = ((((p2[i] >> dshift) * dfactor + 128) >> 8) << dshift) - p1[i]
;
; where both right shifts are arithmetic (psrad), the multiply keeps the low
; 32 bits of each product (pmulld) and the left shift is logical (pslld).
; p2 is only read; p1 is overwritten.
;
; The pointer/offset setup matches the TAK_DECORRELATE kernels: length is
; scaled to bytes, both pointers are moved to the end of their buffers and a
; negative offset counts up to zero. This loop handles one vector (mmsize
; bytes) per iteration and is bottom-tested, so it always runs at least once.
; NOTE(review): presumes length > 0, a byte count that is a multiple of
; mmsize, and mmsize-aligned buffers (mova is the aligned move) -- confirm
; against the callers.
;
; Register use: xm2 = dshift (shift count), m3 = dfactor in every lane,
; m4 = 128 in every lane, m1 = working vector.
; NOTE(review): m0 is never referenced here; five vector registers are
; presumably declared because m4 is the highest one used.
;------------------------------------------------------------------------------
%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
    ; Elements -> bytes, then end pointers plus a negative running offset.
    shl             lengthd, 2
    add                 p1q, lengthq
    add                 p2q, lengthq
    neg             lengthq

    ; dshift/dfactor are not loaded into named GPRs (only 3 arguments are);
    ; the "m" suffix reads each one from wherever the ABI placed it.
    ; xm2 = dshift, used as the variable shift count for psrad/pslld below.
    movd                xm2, dshiftm
%if UNIX64
    ; Presumably the fifth argument arrives in a general-purpose register on
    ; UNIX64, so it is moved into a vector register before being broadcast.
    movd                xm3, dfactorm
    VPBROADCASTD         m3, xm3
%else
    ; Elsewhere dfactor is presumably in memory and can be broadcast directly.
    VPBROADCASTD         m3, dfactorm
%endif
    ; pd_128 holds only four dwords, so it is replicated to fill the whole
    ; vector when this is assembled with YMM registers.
    VBROADCASTI128       m4, [pd_128]

.loop:
    ; m1 = p2
    mova                 m1, [p2q+lengthq]
    ; m1 >>= dshift (arithmetic)
    psrad                m1, xm2
    ; m1 *= dfactor (low 32 bits of each product)
    pmulld               m1, m3
    ; Add the rounding bias, then drop the low 8 bits.
    paddd                m1, m4
    psrad                m1, 8
    ; Restore the scale that was removed by the first shift.
    pslld                m1, xm2
    ; m1 -= p1, and the result replaces p1.
    psubd                m1, [p1q+lengthq]
    mova      [p1q+lengthq], m1
    ; Step by one vector; keep looping while the offset is still negative.
    add             lengthq, mmsize
    jl .loop
    RET
%endmacro

; The XMM version is built for SSE4 rather than SSE2 because the kernel uses
; pmulld, an SSE4.1 instruction.
INIT_XMM sse4
TAK_DECORRELATE_SF
; YMM/AVX2 version, emitted only when HAVE_AVX2_EXTERNAL is non-zero
; (presumably a build-configuration flag saying the assembler can emit AVX2
; -- confirm in the build system).
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
TAK_DECORRELATE_SF
%endif