1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2013 Clément Bœsch <u pkh me>
;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%macro VP9_IWHT4_1D 0
SWAP 1, 2, 3
paddw m0, m2
psubw m3, m1
psubw m4, m0, m3
psraw m4, 1
psubw m5, m4, m1
SWAP 5, 1
psubw m4, m2
SWAP 4, 2
psubw m0, m1
paddw m3, m2
SWAP 3, 2, 1
%endmacro
; (a*x + b*y + round) >> shift
%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
pmaddwd m%1, m%2, %4
pmaddwd m%2, %5
paddd m%1, %3
paddd m%2, %3
psrad m%1, 14
psrad m%2, 14
%endmacro
%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
packssdw m%1, m%7
packssdw m%2, m%6
%endmacro
%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
%if %0 == 7
punpckhwd m%6, m%2, m%1
punpcklwd m%2, m%1
VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
%else
punpckhwd m%8, m%4, m%3
punpcklwd m%2, m%4, m%3
VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
%endif
%endmacro
%macro VP9_IDCT4_1D_FINALIZE 0
SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
SWAP 0, 3, 2 ; 3102 -> 0123
%endmacro
%macro VP9_IDCT4_1D 0
%if cpuflag(ssse3)
SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
pmulhrsw m2, m6 ; m2=t0
pmulhrsw m0, m6 ; m0=t1
%else ; <= sse2
VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m1=t0
%endif
VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
VP9_IDCT4_1D_FINALIZE
%endmacro
%macro VP9_IADST4_1D 0
movq2dq xmm0, m0
movq2dq xmm1, m1
movq2dq xmm2, m2
movq2dq xmm3, m3
%if cpuflag(ssse3)
paddw m3, m0
%endif
punpcklwd xmm0, xmm1
punpcklwd xmm2, xmm3
pmaddwd xmm1, xmm0, [pw_5283_13377]
pmaddwd xmm4, xmm0, [pw_9929_13377]
%if notcpuflag(ssse3)
pmaddwd xmm6, xmm0, [pw_13377_0]
%endif
pmaddwd xmm0, [pw_15212_m13377]
pmaddwd xmm3, xmm2, [pw_15212_9929]
%if notcpuflag(ssse3)
pmaddwd xmm7, xmm2, [pw_m13377_13377]
%endif
pmaddwd xmm2, [pw_m5283_m15212]
%if cpuflag(ssse3)
psubw m3, m2
%else
paddd xmm6, xmm7
%endif
paddd xmm0, xmm2
paddd xmm3, xmm5
paddd xmm2, xmm5
%if notcpuflag(ssse3)
paddd xmm6, xmm5
%endif
paddd xmm1, xmm3
paddd xmm0, xmm3
paddd xmm4, xmm2
psrad xmm1, 14
psrad xmm0, 14
psrad xmm4, 14
%if cpuflag(ssse3)
pmulhrsw m3, [pw_13377x2] ; out2
%else
psrad xmm6, 14
%endif
packssdw xmm0, xmm0
packssdw xmm1, xmm1
packssdw xmm4, xmm4
%if notcpuflag(ssse3)
packssdw xmm6, xmm6
%endif
movdq2q m0, xmm0 ; out3
movdq2q m1, xmm1 ; out0
movdq2q m2, xmm4 ; out1
%if notcpuflag(ssse3)
movdq2q m3, xmm6 ; out2
%endif
SWAP 0, 1, 2, 3
%endmacro
|