#include "kyber512r3_consts_avx2.h"

// The small macros (.inc files) are inlined directly into the .S files
/*****.include "shuffle.inc"*****/
/********************************/
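// The shuffleN macros interleave registers r0 and r1 into r2 and r3 in blocks
// of N consecutive 16-bit coefficients (128-, 64-, 32- and 16-bit granularity),
// reordering coefficients between inverse-NTT levels.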
.macro shuffle8 r0,r1,r2,r3
vperm2i128  $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128  $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle2 r0,r1,r2,r3
#vpsllq     $32,%ymm\r1,%ymm\r2
vmovsldup   %ymm\r1,%ymm\r2
vpblendd    $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq      $32,%ymm\r0,%ymm\r0
#vmovshdup  %ymm\r0,%ymm\r0
vpblendd    $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle1 r0,r1,r2,r3
vpslld      $16,%ymm\r1,%ymm\r2
vpblendw    $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld      $16,%ymm\r0,%ymm\r0
vpblendw    $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
/********************************/

/*****.include "fq.inc"*****/
/***************************/
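// red16: Barrett reduction of every 16-bit lane of r modulo q
// (ymm0 = 16x q; ymm1 = 16x the Barrett constant _16XV); x is scratch.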
.macro red16 r,rs=0,x=12
vpmulhw     %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw   %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw      $10,%ymm\x,%ymm\x
.endif
vpmullw     %ymm0,%ymm\x,%ymm\x
vpsubw      %ymm\x,%ymm\r,%ymm\r
.endm

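// csubq: conditional subtraction of q -- subtract q from every lane of r and
// add it back where the result went negative, mapping [0,2q) to [0,q).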
.macro csubq r,x=12
vpsubw      %ymm0,%ymm\r,%ymm\r
vpsraw      $15,%ymm\r,%ymm\x
vpand       %ymm0,%ymm\x,%ymm\x
vpaddw      %ymm\x,%ymm\r,%ymm\r
.endm

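// caddq: add q to every lane of r that is negative, mapping (-q,q) to [0,q).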
.macro caddq r,x=12
vpsraw      $15,%ymm\r,%ymm\x
vpand       %ymm0,%ymm\x,%ymm\x
vpaddw      %ymm\x,%ymm\r,%ymm\r
.endm

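// fqmulprecomp: Montgomery multiplication of b by a precomputed constant,
// with al = constant * q^-1 (low 16 bits) and ah = the constant itself;
// the product overwrites b, x is scratch.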
.macro fqmulprecomp al,ah,b,x=12
vpmullw     %ymm\al,%ymm\b,%ymm\x
vpmulhw     %ymm\ah,%ymm\b,%ymm\b
vpmulhw     %ymm0,%ymm\x,%ymm\x
vpsubw      %ymm\x,%ymm\b,%ymm\b
.endm
/***************************/

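// butterfly: Gentleman-Sande (inverse NTT) butterfly on four register pairs:
// rl <- rl + rh, rh <- montgomery_reduce((rh - rl) * zeta), where zl0/zl1
// hold zeta * q^-1 (low halves) and zh0/zh1 hold zeta; clobbers ymm12-ymm15.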
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
vpsubw      %ymm\rl0,%ymm\rh0,%ymm12
vpaddw      %ymm\rh0,%ymm\rl0,%ymm\rl0
vpsubw      %ymm\rl1,%ymm\rh1,%ymm13

vpmullw     %ymm\zl0,%ymm12,%ymm\rh0
vpaddw      %ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw      %ymm\rl2,%ymm\rh2,%ymm14

vpmullw     %ymm\zl0,%ymm13,%ymm\rh1
vpaddw      %ymm\rh2,%ymm\rl2,%ymm\rl2
vpsubw      %ymm\rl3,%ymm\rh3,%ymm15

vpmullw     %ymm\zl1,%ymm14,%ymm\rh2
vpaddw      %ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw     %ymm\zl1,%ymm15,%ymm\rh3

vpmulhw     %ymm\zh0,%ymm12,%ymm12
vpmulhw     %ymm\zh0,%ymm13,%ymm13

vpmulhw     %ymm\zh1,%ymm14,%ymm14
vpmulhw     %ymm\zh1,%ymm15,%ymm15

vpmulhw     %ymm0,%ymm\rh0,%ymm\rh0

vpmulhw     %ymm0,%ymm\rh1,%ymm\rh1

vpmulhw     %ymm0,%ymm\rh2,%ymm\rh2
vpmulhw     %ymm0,%ymm\rh3,%ymm\rh3

#

#

vpsubw      %ymm\rh0,%ymm12,%ymm\rh0

vpsubw      %ymm\rh1,%ymm13,%ymm\rh1

vpsubw      %ymm\rh2,%ymm14,%ymm\rh2
vpsubw      %ymm\rh3,%ymm15,%ymm\rh3
.endm

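// intt_levels0t5: levels 0-5 of the inverse NTT on one 128-coefficient half
// of the polynomial (selected by off). Level 0 first scales the inputs by the
// precomputed constants _16XFLO/_16XFHI, folding the 1/128 normalization of
// the inverse transform (in Montgomery form) into the first level; red16 is
// applied after levels 2 and 4 to keep intermediate coefficients small.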
.macro intt_levels0t5 off
/* level 0 */
vmovdqa     _16XFLO*2(%rsi),%ymm2
vmovdqa     _16XFHI*2(%rsi),%ymm3

vmovdqa     (128*\off+  0)*2(%rdi),%ymm4
vmovdqa     (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa     (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa     (128*\off+ 48)*2(%rdi),%ymm7

fqmulprecomp    2,3,4
fqmulprecomp    2,3,6
fqmulprecomp    2,3,5
fqmulprecomp    2,3,7

vmovdqa     (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa     (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa     (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa     (128*\off+112)*2(%rdi),%ymm11

fqmulprecomp    2,3,8
fqmulprecomp    2,3,10
fqmulprecomp    2,3,9
fqmulprecomp    2,3,11

vpermq      $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
vpermq      $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
vpermq      $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
vpermq      $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
vmovdqa     _REVIDXB*2(%rsi),%ymm12
vpshufb     %ymm12,%ymm15,%ymm15
vpshufb     %ymm12,%ymm1,%ymm1
vpshufb     %ymm12,%ymm2,%ymm2
vpshufb     %ymm12,%ymm3,%ymm3

butterfly   4,5,8,9,6,7,10,11,15,1,2,3

/* level 1 */
vpermq      $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
vpermq      $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
vmovdqa     _REVIDXB*2(%rsi),%ymm1
vpshufb     %ymm1,%ymm2,%ymm2
vpshufb     %ymm1,%ymm3,%ymm3

butterfly   4,5,6,7,8,9,10,11,2,2,3,3

shuffle1    4,5,3,5
shuffle1    6,7,4,7
shuffle1    8,9,6,9
shuffle1    10,11,8,11

/* level 2 */
vmovdqa     _REVIDXD*2(%rsi),%ymm12
vpermd      (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
vpermd      (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10

butterfly   3,4,6,8,5,7,9,11,2,2,10,10

vmovdqa     _16XV*2(%rsi),%ymm1
red16       3

shuffle2    3,4,10,4
shuffle2    6,8,3,8
shuffle2    5,7,6,7
shuffle2    9,11,5,11

/* level 3 */
vpermq      $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
vpermq      $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9

butterfly   10,3,6,5,4,8,7,11,2,2,9,9

shuffle4    10,3,9,3
shuffle4    6,5,10,5
shuffle4    4,8,6,8
shuffle4    7,11,4,11

/* level 4 */
vpermq      $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
vpermq      $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7

butterfly   9,10,6,4,3,5,8,11,2,2,7,7

red16       9

shuffle8    9,10,7,10
shuffle8    6,4,9,4
shuffle8    3,5,6,5
shuffle8    8,11,3,11

/* level 5 */
vmovdqa     (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
vmovdqa     (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8

butterfly   7,9,6,3,10,4,5,11,2,2,8,8

vmovdqa     %ymm7,(128*\off+  0)*2(%rdi)
vmovdqa     %ymm9,(128*\off+ 16)*2(%rdi)
vmovdqa     %ymm6,(128*\off+ 32)*2(%rdi)
vmovdqa     %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa     %ymm10,(128*\off+ 64)*2(%rdi)
vmovdqa     %ymm4,(128*\off+ 80)*2(%rdi)
vmovdqa     %ymm5,(128*\off+ 96)*2(%rdi)
vmovdqa     %ymm11,(128*\off+112)*2(%rdi)
.endm

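// intt_level6: the final inverse-NTT level, combining the lower and upper
// 128-coefficient halves; the level-6 twiddle factor is broadcast from the
// start of _ZETAS_EXP, and an extra red16 reduces the first block of outputs
// (off == 0 only).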
.macro intt_level6 off
/* level 6 */
vmovdqa         (64*\off+  0)*2(%rdi),%ymm4
vmovdqa         (64*\off+128)*2(%rdi),%ymm8
vmovdqa         (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa         (64*\off+144)*2(%rdi),%ymm9
vpbroadcastq    (_ZETAS_EXP+0)*2(%rsi),%ymm2

vmovdqa         (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa         (64*\off+160)*2(%rdi),%ymm10
vmovdqa         (64*\off+ 48)*2(%rdi),%ymm7
vmovdqa         (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq    (_ZETAS_EXP+4)*2(%rsi),%ymm3

butterfly   4,5,6,7,8,9,10,11

.if \off == 0
red16       4
.endif

vmovdqa     %ymm4,(64*\off+  0)*2(%rdi)
vmovdqa     %ymm5,(64*\off+ 16)*2(%rdi)
vmovdqa     %ymm6,(64*\off+ 32)*2(%rdi)
vmovdqa     %ymm7,(64*\off+ 48)*2(%rdi)
vmovdqa     %ymm8,(64*\off+128)*2(%rdi)
vmovdqa     %ymm9,(64*\off+144)*2(%rdi)
vmovdqa     %ymm10,(64*\off+160)*2(%rdi)
vmovdqa     %ymm11,(64*\off+176)*2(%rdi)
.endm

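// invntt_avx2_asm -- System V AMD64 ABI: %rdi points to the 256 16-bit
// coefficients, which are transformed in place; %rsi points to the constants
// table addressed through the _16XQ/_16XV/_ZETAS_EXP/... offsets from
// kyber512r3_consts_avx2.h. ymm0 is kept loaded with 16 copies of q.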
.text
.global cdecl(invntt_avx2_asm)
cdecl(invntt_avx2_asm):
vmovdqa     _16XQ*2(%rsi),%ymm0

intt_levels0t5  0
intt_levels0t5  1

intt_level6     0
intt_level6     1
ret