1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
|
#include "arm_arch.h"
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
#ifdef __thumb2__
.thumb_func mul_1x1_ialu
#endif
.align 5
mul_1x1_ialu:
mov r4,#0
bic r5,r1,#3<<30 @ a1=a&0x3fffffff
str r4,[sp,#0] @ tab[0]=0
add r6,r5,r5 @ a2=a1<<1
str r5,[sp,#4] @ tab[1]=a1
eor r7,r5,r6 @ a1^a2
str r6,[sp,#8] @ tab[2]=a2
mov r8,r5,lsl#2 @ a4=a1<<2
str r7,[sp,#12] @ tab[3]=a1^a2
eor r9,r5,r8 @ a1^a4
str r8,[sp,#16] @ tab[4]=a4
eor r4,r6,r8 @ a2^a4
str r9,[sp,#20] @ tab[5]=a1^a4
eor r7,r7,r8 @ a1^a2^a4
str r4,[sp,#24] @ tab[6]=a2^a4
and r8,r12,r0,lsl#2
str r7,[sp,#28] @ tab[7]=a1^a2^a4
and r9,r12,r0,lsr#1
ldr r5,[sp,r8] @ tab[b & 0x7]
and r8,r12,r0,lsr#4
ldr r7,[sp,r9] @ tab[b >> 3 & 0x7]
and r9,r12,r0,lsr#7
ldr r6,[sp,r8] @ tab[b >> 6 & 0x7]
eor r5,r5,r7,lsl#3 @ stall
mov r4,r7,lsr#29
ldr r7,[sp,r9] @ tab[b >> 9 & 0x7]
and r8,r12,r0,lsr#10
eor r5,r5,r6,lsl#6
eor r4,r4,r6,lsr#26
ldr r6,[sp,r8] @ tab[b >> 12 & 0x7]
and r9,r12,r0,lsr#13
eor r5,r5,r7,lsl#9
eor r4,r4,r7,lsr#23
ldr r7,[sp,r9] @ tab[b >> 15 & 0x7]
and r8,r12,r0,lsr#16
eor r5,r5,r6,lsl#12
eor r4,r4,r6,lsr#20
ldr r6,[sp,r8] @ tab[b >> 18 & 0x7]
and r9,r12,r0,lsr#19
eor r5,r5,r7,lsl#15
eor r4,r4,r7,lsr#17
ldr r7,[sp,r9] @ tab[b >> 21 & 0x7]
and r8,r12,r0,lsr#22
eor r5,r5,r6,lsl#18
eor r4,r4,r6,lsr#14
ldr r6,[sp,r8] @ tab[b >> 24 & 0x7]
and r9,r12,r0,lsr#25
eor r5,r5,r7,lsl#21
eor r4,r4,r7,lsr#11
ldr r7,[sp,r9] @ tab[b >> 27 & 0x7]
tst r1,#1<<30
and r8,r12,r0,lsr#28
eor r5,r5,r6,lsl#24
eor r4,r4,r6,lsr#8
ldr r6,[sp,r8] @ tab[b >> 30 ]
#ifdef __thumb2__
itt ne
#endif
eorne r5,r5,r0,lsl#30
eorne r4,r4,r0,lsr#2
tst r1,#1<<31
eor r5,r5,r7,lsl#27
eor r4,r4,r7,lsr#5
#ifdef __thumb2__
itt ne
#endif
eorne r5,r5,r0,lsl#31
eorne r4,r4,r0,lsr#1
eor r5,r5,r6,lsl#30
eor r4,r4,r6,lsr#2
mov pc,lr
.globl _bn_GF2m_mul_2x2
#ifdef __thumb2__
.thumb_func _bn_GF2m_mul_2x2
#endif
.align 5
_bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
stmdb sp!,{r10,lr}
ldr r12,LOPENSSL_armcap
adr r10,LOPENSSL_armcap
ldr r12,[r12,r10]
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV7_NEON
itt ne
ldrne r10,[sp],#8
bne LNEON
stmdb sp!,{r4,r5,r6,r7,r8,r9}
#else
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
#endif
mov r10,r0 @ reassign 1st argument
mov r0,r3 @ r0=b1
sub r7,sp,#36
mov r8,sp
and r7,r7,#-32
ldr r3,[sp,#32] @ load b0
mov r12,#7<<2
mov sp,r7 @ allocate tab[8]
str r8,[r7,#32]
bl mul_1x1_ialu @ a1·b1
str r5,[r10,#8]
str r4,[r10,#12]
eor r0,r0,r3 @ flip b0 and b1
eor r1,r1,r2 @ flip a0 and a1
eor r3,r3,r0
eor r2,r2,r1
eor r0,r0,r3
eor r1,r1,r2
bl mul_1x1_ialu @ a0·b0
str r5,[r10]
str r4,[r10,#4]
eor r1,r1,r2
eor r0,r0,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
ldmia r10,{r6,r7,r8,r9}
eor r5,r5,r4
ldr sp,[sp,#32] @ destroy tab[8]
eor r4,r4,r7
eor r5,r5,r6
eor r4,r4,r8
eor r5,r5,r9
eor r4,r4,r9
str r4,[r10,#8]
eor r5,r5,r4
str r5,[r10,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
#else
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
#if __ARM_MAX_ARCH__>=7
.align 5
LNEON:
ldr r12, [sp] @ 5th argument
vmov d26, r2, r1
vmov d27, r12, r3
vmov.i64 d28, #0x0000ffffffffffff
vmov.i64 d29, #0x00000000ffffffff
vmov.i64 d30, #0x000000000000ffff
vext.8 d2, d26, d26, #1 @ A1
vmull.p8 q1, d2, d27 @ F = A1*B
vext.8 d0, d27, d27, #1 @ B1
vmull.p8 q0, d26, d0 @ E = A*B1
vext.8 d4, d26, d26, #2 @ A2
vmull.p8 q2, d4, d27 @ H = A2*B
vext.8 d16, d27, d27, #2 @ B2
vmull.p8 q8, d26, d16 @ G = A*B2
vext.8 d6, d26, d26, #3 @ A3
veor q1, q1, q0 @ L = E + F
vmull.p8 q3, d6, d27 @ J = A3*B
vext.8 d0, d27, d27, #3 @ B3
veor q2, q2, q8 @ M = G + H
vmull.p8 q0, d26, d0 @ I = A*B3
veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8
vand d3, d3, d28
vext.8 d16, d27, d27, #4 @ B4
veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16
vand d5, d5, d29
vmull.p8 q8, d26, d16 @ K = A*B4
veor q3, q3, q0 @ N = I + J
veor d2, d2, d3
veor d4, d4, d5
veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24
vand d7, d7, d30
vext.8 q1, q1, q1, #15
veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d17, #0
vext.8 q2, q2, q2, #14
veor d6, d6, d7
vmull.p8 q0, d26, d27 @ D = A*B
vext.8 q8, q8, q8, #12
vext.8 q3, q3, q3, #13
veor q1, q1, q2
veor q3, q3, q8
veor q0, q0, q1
veor q0, q0, q3
vst1.32 {q0}, [r0]
bx lr @ bx lr
#endif
#if __ARM_MAX_ARCH__>=7
.align 5
LOPENSSL_armcap:
.word OPENSSL_armcap_P-.
#endif
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 5
#if __ARM_MAX_ARCH__>=7
.comm _OPENSSL_armcap_P,4
.non_lazy_symbol_pointer
OPENSSL_armcap_P:
.indirect_symbol _OPENSSL_armcap_P
.long 0
#endif
|