#include "kyber512r3_consts_avx2.h"
// The small helper macros (.inc files) are inlined directly into the .S files
/*****.include "fq.inc"*****/
/***************************/
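// red16: Barrett reduction of the 16 signed coefficients in ymm\r.
// Expects 16x q (q = 3329) in ymm0 and 16x the Barrett constant
// v = round(2^26/q) = 20159 in ymm1. Computes t = (v*r) >> 26 via
// vpmulhw plus an arithmetic shift (the optional \rs path rounds with
// vpmulhrsw instead), then r -= t*q, leaving each lane a small
// representative of r mod q. ymm\x is scratch.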
.macro red16 r,rs=0,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw $10,%ymm\x,%ymm\x
.endif
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm
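// csubq: conditional subtract q. Subtracts q from every lane, then adds
// q back to the lanes that went negative (sign mask via vpsraw $15),
// mapping values from [0, 2q) into [0, q) without branches.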
.macro csubq r,x=12
vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
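// caddq: conditional add q. Adds q to the negative lanes only, mapping
// values from (-q, q) into [0, q) without branches.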
.macro caddq r,x=12
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
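// fqmulprecomp: Montgomery multiplication of ymm\b by a precomputed
// constant a, supplied as al = a*qinv mod 2^16 and ah = a. Computes
// m = (al*b) mod 2^16, then b = ((ah*b) >> 16) - ((m*q) >> 16), which
// equals a*b*2^-16 mod q. ymm0 must hold 16x q; ymm\x is scratch.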
.macro fqmulprecomp al,ah,b,x=12
vpmullw %ymm\al,%ymm\b,%ymm\x
vpmulhw %ymm\ah,%ymm\b,%ymm\b
vpmulhw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\b,%ymm\b
.endm
/***************************/
.text
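// reduce128_avx: Barrett-reduce the 128 coefficients (8 vectors of 16)
// at (%rdi) in place. Expects q in ymm0 and v in ymm1 (see red16).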
reduce128_avx:
# load 128 coefficients into ymm2-ymm9
vmovdqa (%rdi),%ymm2
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm4
vmovdqa 96(%rdi),%ymm5
vmovdqa 128(%rdi),%ymm6
vmovdqa 160(%rdi),%ymm7
vmovdqa 192(%rdi),%ymm8
vmovdqa 224(%rdi),%ymm9
red16 2
red16 3
red16 4
red16 5
red16 6
red16 7
red16 8
red16 9
# store the reduced coefficients back
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm4,64(%rdi)
vmovdqa %ymm5,96(%rdi)
vmovdqa %ymm6,128(%rdi)
vmovdqa %ymm7,160(%rdi)
vmovdqa %ymm8,192(%rdi)
vmovdqa %ymm9,224(%rdi)
ret
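// reduce_avx2_asm: C-callable entry point. Per the SysV ABI, %rdi holds
// the pointer to the 256 coefficients of a polynomial and %rsi the
// qdata constant table indexed by the _16X* offsets (the *2 scales the
// int16 indices to byte offsets). Reduces both 128-coefficient halves.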
.global cdecl(reduce_avx2_asm)
cdecl(reduce_avx2_asm):
# constants: 16x q and 16x the Barrett constant v
vmovdqa _16XQ*2(%rsi),%ymm0
vmovdqa _16XV*2(%rsi),%ymm1
call reduce128_avx
add $256,%rdi # advance to the second 128 coefficients (128 * 2 bytes)
call reduce128_avx
ret
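// tomont128_avx: convert the 128 coefficients at (%rdi) to the
// Montgomery domain, i.e. multiply each by 2^16 mod q. This is done by
// Montgomery-multiplying with the factor (2^16)^2 mod q, whose
// precomputed low product (times qinv) sits in ymm1 and whose high
// part sits in ymm2 (see fqmulprecomp).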
tomont128_avx:
# load 128 coefficients into ymm3-ymm10
vmovdqa (%rdi),%ymm3
vmovdqa 32(%rdi),%ymm4
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm6
vmovdqa 128(%rdi),%ymm7
vmovdqa 160(%rdi),%ymm8
vmovdqa 192(%rdi),%ymm9
vmovdqa 224(%rdi),%ymm10
fqmulprecomp 1,2,3,11
fqmulprecomp 1,2,4,12
fqmulprecomp 1,2,5,13
fqmulprecomp 1,2,6,14
fqmulprecomp 1,2,7,15
fqmulprecomp 1,2,8,11
fqmulprecomp 1,2,9,12
fqmulprecomp 1,2,10,13
# store the converted coefficients back
vmovdqa %ymm3,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm6,96(%rdi)
vmovdqa %ymm7,128(%rdi)
vmovdqa %ymm8,160(%rdi)
vmovdqa %ymm9,192(%rdi)
vmovdqa %ymm10,224(%rdi)
ret
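// tomont_avx2_asm: C-callable entry point. %rdi points at the 256
// coefficients, %rsi at the qdata constant table; both halves of the
// polynomial are converted to the Montgomery domain.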
.global cdecl(tomont_avx2_asm)
cdecl(tomont_avx2_asm):
# constants: 16x q plus the low/high halves of the Montgomery factor
vmovdqa _16XQ*2(%rsi),%ymm0
vmovdqa _16XMONTSQLO*2(%rsi),%ymm1
vmovdqa _16XMONTSQHI*2(%rsi),%ymm2
call tomont128_avx
add $256,%rdi # advance to the second 128 coefficients (128 * 2 bytes)
call tomont128_avx
ret