#include "kyber512r3_consts_avx2.h"
// The small helper macros from the .inc files are inlined directly into the .S files
/*****.include "shuffle.inc"*****/
/********************************/
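/*
 * shuffle8/4/2/1: interleave the 16-bit lanes of %ymm\r0 and %ymm\r1 at
 * 128-, 64-, 32- and 16-bit granularity respectively, writing the results to
 * %ymm\r2 and %ymm\r3 (shuffle2/shuffle1 clobber %ymm\r0). They reorder
 * coefficients between butterfly levels.
 */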
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
/********************************/
/*****.include "fq.inc"*****/
/***************************/
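/*
 * red16: Barrett-style reduction of the 16 signed 16-bit lanes of %ymm\r
 * modulo q. Expects q in %ymm0 and the Barrett multiplier (here _16XV) in
 * %ymm1; %ymm\x is scratch.
 */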
.macro red16 r,rs=0,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw $10,%ymm\x,%ymm\x
.endif
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm
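/*
 * csubq: conditionally subtract q from each 16-bit lane of %ymm\r, mapping
 * values in [0,2q) to [0,q). Expects q in %ymm0; %ymm\x is scratch.
 */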
.macro csubq r,x=12
vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
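/*
 * caddq: conditionally add q to each negative 16-bit lane of %ymm\r, mapping
 * values in (-q,q) to [0,q). Expects q in %ymm0; %ymm\x is scratch.
 */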
.macro caddq r,x=12
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
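/*
 * fqmulprecomp: multiply each 16-bit lane of %ymm\b by a constant supplied as
 * a precomputed pair (\al = constant*qinv mod 2^16, \ah = constant) and
 * Montgomery-reduce, leaving the result in %ymm\b. Expects q in %ymm0;
 * %ymm\x is scratch.
 */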
.macro fqmulprecomp al,ah,b,x=12
vpmullw %ymm\al,%ymm\b,%ymm\x
vpmulhw %ymm\ah,%ymm\b,%ymm\b
vpmulhw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\b,%ymm\b
.endm
/***************************/
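/*
 * butterfly: Gentleman-Sande (inverse NTT) butterfly on four register pairs
 * (rl_i, rh_i). Each rl_i is replaced by rl_i + rh_i and each rh_i by the
 * Montgomery product of (rh_i - rl_i) with a twiddle factor whose low part
 * (zeta*qinv) is in \zl0/\zl1 and high part (zeta) in \zh0/\zh1. Uses
 * %ymm12-%ymm15 as scratch; expects q in %ymm0.
 */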
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
vpsubw %ymm\rl0,%ymm\rh0,%ymm12
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0
vpsubw %ymm\rl1,%ymm\rh1,%ymm13
vpmullw %ymm\zl0,%ymm12,%ymm\rh0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw %ymm\rl2,%ymm\rh2,%ymm14
vpmullw %ymm\zl0,%ymm13,%ymm\rh1
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
vpsubw %ymm\rl3,%ymm\rh3,%ymm15
vpmullw %ymm\zl1,%ymm14,%ymm\rh2
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw %ymm\zl1,%ymm15,%ymm\rh3
vpmulhw %ymm\zh0,%ymm12,%ymm12
vpmulhw %ymm\zh0,%ymm13,%ymm13
vpmulhw %ymm\zh1,%ymm14,%ymm14
vpmulhw %ymm\zh1,%ymm15,%ymm15
vpmulhw %ymm0,%ymm\rh0,%ymm\rh0
vpmulhw %ymm0,%ymm\rh1,%ymm\rh1
vpmulhw %ymm0,%ymm\rh2,%ymm\rh2
vpmulhw %ymm0,%ymm\rh3,%ymm\rh3
#
#
vpsubw %ymm\rh0,%ymm12,%ymm\rh0
vpsubw %ymm\rh1,%ymm13,%ymm\rh1
vpsubw %ymm\rh2,%ymm14,%ymm\rh2
vpsubw %ymm\rh3,%ymm15,%ymm\rh3
.endm
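/*
 * intt_levels0t5: inverse NTT levels 0-5 on one 128-coefficient half of the
 * polynomial (\off = 0 or 1 selects the half). The coefficients are first
 * multiplied by the factor stored at _16XFLO/_16XFHI, which presumably folds
 * the 1/128 scaling of the inverse NTT (and Montgomery factors) into the
 * first level; Barrett reductions (red16) keep intermediate values bounded.
 */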
.macro intt_levels0t5 off
/* level 0 */
vmovdqa _16XFLO*2(%rsi),%ymm2
vmovdqa _16XFHI*2(%rsi),%ymm3
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7
fqmulprecomp 2,3,4
fqmulprecomp 2,3,6
fqmulprecomp 2,3,5
fqmulprecomp 2,3,7
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa (128*\off+112)*2(%rdi),%ymm11
fqmulprecomp 2,3,8
fqmulprecomp 2,3,10
fqmulprecomp 2,3,9
fqmulprecomp 2,3,11
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
vmovdqa _REVIDXB*2(%rsi),%ymm12
vpshufb %ymm12,%ymm15,%ymm15
vpshufb %ymm12,%ymm1,%ymm1
vpshufb %ymm12,%ymm2,%ymm2
vpshufb %ymm12,%ymm3,%ymm3
butterfly 4,5,8,9,6,7,10,11,15,1,2,3
/* level 1 */
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
vmovdqa _REVIDXB*2(%rsi),%ymm1
vpshufb %ymm1,%ymm2,%ymm2
vpshufb %ymm1,%ymm3,%ymm3
butterfly 4,5,6,7,8,9,10,11,2,2,3,3
shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9
shuffle1 10,11,8,11
/* level 2 */
vmovdqa _REVIDXD*2(%rsi),%ymm12
vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10
butterfly 3,4,6,8,5,7,9,11,2,2,10,10
vmovdqa _16XV*2(%rsi),%ymm1
red16 3
shuffle2 3,4,10,4
shuffle2 6,8,3,8
shuffle2 5,7,6,7
shuffle2 9,11,5,11
/* level 3 */
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9
butterfly 10,3,6,5,4,8,7,11,2,2,9,9
shuffle4 10,3,9,3
shuffle4 6,5,10,5
shuffle4 4,8,6,8
shuffle4 7,11,4,11
/* level 4 */
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7
butterfly 9,10,6,4,3,5,8,11,2,2,7,7
red16 9
shuffle8 9,10,7,10
shuffle8 6,4,9,4
shuffle8 3,5,6,5
shuffle8 8,11,3,11
/* level 5 */
vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8
butterfly 7,9,6,3,10,4,5,11,2,2,8,8
vmovdqa %ymm7,(128*\off+ 0)*2(%rdi)
vmovdqa %ymm9,(128*\off+ 16)*2(%rdi)
vmovdqa %ymm6,(128*\off+ 32)*2(%rdi)
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa %ymm10,(128*\off+ 64)*2(%rdi)
vmovdqa %ymm4,(128*\off+ 80)*2(%rdi)
vmovdqa %ymm5,(128*\off+ 96)*2(%rdi)
vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.endm
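/*
 * intt_level6: final inverse NTT level, pairing each coefficient i with
 * coefficient i+128 across the two halves of the polynomial; \off = 0 or 1
 * selects which 64-coefficient block of each half is processed. An extra
 * Barrett reduction of %ymm4 is applied only on the \off == 0 pass.
 */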
.macro intt_level6 off
/* level 6 */
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
vmovdqa (64*\off+128)*2(%rdi),%ymm8
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa (64*\off+144)*2(%rdi),%ymm9
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa (64*\off+160)*2(%rdi),%ymm10
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
vmovdqa (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3
butterfly 4,5,6,7,8,9,10,11
.if \off == 0
red16 4
.endif
vmovdqa %ymm4,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm5,(64*\off+ 16)*2(%rdi)
vmovdqa %ymm6,(64*\off+ 32)*2(%rdi)
vmovdqa %ymm7,(64*\off+ 48)*2(%rdi)
vmovdqa %ymm8,(64*\off+128)*2(%rdi)
vmovdqa %ymm9,(64*\off+144)*2(%rdi)
vmovdqa %ymm10,(64*\off+160)*2(%rdi)
vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.endm
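/*
 * invntt_avx2_asm: in-place inverse NTT of the 256 16-bit coefficients at
 * %rdi; %rsi points to the table of precomputed constants (_16XQ, _16XV,
 * _ZETAS_EXP, ...). %ymm0 holds 16 copies of q for the whole function.
 */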
.text
.global cdecl(invntt_avx2_asm)
cdecl(invntt_avx2_asm):
vmovdqa _16XQ*2(%rsi),%ymm0
intt_levels0t5 0
intt_levels0t5 1
intt_level6 0
intt_level6 1
ret