// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================
// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl
// Some function names were changed to be consistent with Go function
// names. For instance, the functions aes_p8_set_{en,de}crypt_key became
// set{En,De}cryptKeyAsm. setEncryptKeyAsm was also split in two parts and a
// new function was created (doEncryptKeyAsm). This was necessary to avoid
// overwriting the arguments when setDecryptKeyAsm calls setEncryptKeyAsm.
// There were other modifications as well, but the functionality was kept the same.
#include "textflag.h"
// For expandKeyAsm
#define INP R3
#define BITS R4
#define OUTENC R5 // Pointer to next expanded encrypt key
#define PTR R6
#define CNT R7
#define ROUNDS R8
#define OUTDEC R9 // Pointer to next expanded decrypt key
#define TEMP R19
#define ZERO V0
#define IN0 V1
#define IN1 V2
#define KEY V3
#define RCON V4
#define MASK V5
#define TMP V6
#define STAGE V7
#define OUTPERM V8
#define OUTMASK V9
#define OUTHEAD V10
#define OUTTAIL V11
// For P9 instruction emulation
#define ESPERM V21 // Endian swapping permute into BE
#define TMP2 V22 // Temporary for P8_STXVB16X
// For {en,de}cryptBlockAsm
#define BLK_INP R3
#define BLK_OUT R4
#define BLK_KEY R5
#define BLK_ROUNDS R6
#define BLK_IDX R7
DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
GLOBL ·rcon(SB), RODATA, $80
#ifdef GOARCH_ppc64le
# ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT) LXVB16X (RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA+RB)
#define XXBRD_ON_LE(VA,VT) XXBRD VA, VT
# else
// On POWER8/ppc64le, emulate the POWER9 byte-reversed load/store
// instructions by loading/storing the doublewords with LXVD2X/STXVD2X and
// permuting them into big-endian order with ESPERM.
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
LXVD2X (RA+RB), VT \
VPERM VT, VT, ESPERM, VT
#define P8_STXVB16X(VS,RA,RB) \
VPERM VS, VS, ESPERM, TMP2 \
STXVD2X TMP2, (RA+RB)
#define XXBRD_ON_LE(VA,VT) \
VPERM VA, VA, ESPERM, VT
# endif // defined(GOPPC64_power9)
#else
#define P8_LXVB16X(RA,RB,VT) LXVD2X (RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVD2X VS, (RA+RB)
#define XXBRD_ON_LE(VA, VT)
#endif // defined(GOARCH_ppc64le)
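// Whichever variant is selected above, P8_LXVB16X(RA, RB, VT) is intended to
// leave VT holding the 16 bytes at RA+RB in big-endian element order, and
// P8_STXVB16X(VS, RA, RB) stores VS back in the same order, so the code below
// can treat every block load/store as big-endian regardless of host endianness.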
// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
// Load the arguments inside the registers
MOVD nr+0(FP), ROUNDS
MOVD key+8(FP), INP
MOVD enc+16(FP), OUTENC
MOVD dec+24(FP), OUTDEC
#ifdef NEEDS_ESPERM
MOVD $·rcon(SB), PTR // PTR points to rcon addr
LVX (PTR), ESPERM
ADD $0x10, PTR
#else
MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif
// Get key from memory and write aligned into VR
P8_LXVB16X(INP, R0, IN0)
ADD $0x10, INP, INP
MOVD $0x20, TEMP
CMPW ROUNDS, $12
LVX (PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON
LVX (PTR)(TEMP), MASK
ADD $0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON
MOVD $8, CNT // li 7,8 CNT = 8
VXOR ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :)
MOVD CNT, CTR // mtctr 7 Set the counter to 8 (rounds)
// The expanded decrypt key is the expanded encrypt key stored in reverse order.
// Move OUTDEC to the last key location, and store in descending order.
ADD $160, OUTDEC, OUTDEC
BLT loop128
ADD $32, OUTDEC, OUTDEC
BEQ l192
ADD $32, OUTDEC, OUTDEC
JMP l256
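// (The additions above place OUTDEC at the last 16-byte round key for each
// key size: the expanded key is 11*16=176 bytes for AES-128, 13*16=208 for
// AES-192 and 15*16=240 for AES-256, so the final round key sits at offset
// 160, 192 or 224 respectively.)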
loop128:
// Key schedule (Round 1 to 8)
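// Each pass computes four new schedule words following the FIPS-197
// recurrence, roughly (illustrative sketch, not assembled):
//
//	t := subWord(rotWord(w[i-1])) ^ rcon
//	w[i+0] = w[i-4] ^ t
//	w[i+1] = w[i-3] ^ w[i+0]
//	w[i+2] = w[i-2] ^ w[i+1]
//	w[i+3] = w[i-1] ^ w[i+2]
//
// VPERM with MASK does the rotate-and-splat, VCIPHERLAST applies SubBytes
// and adds RCON (ShiftRows is a no-op when all four words are equal), and
// the VSLDOI/VXOR chain forms the running xor of the previous four words.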
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN0, KEY, IN0 // vxor 1,1,3
BC 0x10, 0, loop128 // bdnz .Loop128
LVX (PTR)(R0), RCON // lvx 4,0,6 Last two round keys
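// The round constant is doubled with VADDUWM each pass, covering rcon values
// 0x01..0x80 for the first eight rounds. The next value in the sequence is
// 0x1b (0x80 doubled in GF(2^8)), which is read from the table here and then
// doubled once more below to 0x36 for the final key schedule round.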
// Key schedule (Round 9)
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
// Key schedule (Round 10)
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN0, KEY, IN0 // vxor 1,1,3
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
// Key schedule (Round 11)
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
RET
l192:
LXSDX (INP+R0), IN1 // Load next 8 bytes into upper half of VSR.
XXBRD_ON_LE(IN1, IN1) // and convert to BE ordering on LE hosts.
MOVD $4, CNT // li 7,4
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VSPLTISB $8, KEY // vspltisb 3,8
MOVD CNT, CTR // mtctr 7
VSUBUBM MASK, KEY, MASK // vsububm 5,5,3
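// AES-192 setup: the remaining 8 key bytes are in IN1 and CTR is 4. Each
// pass below applies the 6-word schedule recurrence twice and stores three
// 16-byte round keys, which together with the first key stored above yields
// the 13 round keys of a 12-round expansion.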
loop192:
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $8, ZERO, IN1, STAGE // vsldoi 7,0,2,8
VSPLTW $3, IN0, TMP // vspltw 6,1,3
VXOR TMP, IN1, TMP // vxor 6,6,2
VSLDOI $12, ZERO, IN1, IN1 // vsldoi 2,0,2,12
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN1, TMP, IN1 // vxor 2,2,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
VXOR IN1, KEY, IN1 // vxor 2,2,3
VSLDOI $8, STAGE, IN0, STAGE // vsldoi 7,7,1,8
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X STAGE, (R0+OUTENC)
STXVD2X STAGE, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VSLDOI $8, IN0, IN1, STAGE // vsldoi 7,1,2,8
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
STXVD2X STAGE, (R0+OUTENC)
STXVD2X STAGE, (R0+OUTDEC)
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VSPLTW $3, IN0, TMP // vspltw 6,1,3
VXOR TMP, IN1, TMP // vxor 6,6,2
VSLDOI $12, ZERO, IN1, IN1 // vsldoi 2,0,2,12
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN1, TMP, IN1 // vxor 2,2,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
VXOR IN1, KEY, IN1 // vxor 2,2,3
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
BC 0x10, 0, loop192 // bdnz .Loop192
RET
l256:
P8_LXVB16X(INP, R0, IN1)
MOVD $7, CNT // li 7,7
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
MOVD CNT, CTR // mtctr 7
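// AES-256 setup: IN0/IN1 hold the two 16-byte key halves and CTR is 7. Each
// pass stores IN1 and then an updated IN0, so after the first key stored
// above the loop produces the remaining 14 of the 15 round keys needed for
// a 14-round expansion.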
loop256:
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X IN1, (R0+OUTENC)
STXVD2X IN1, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN0, KEY, IN0 // vxor 1,1,3
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
BC 0x12, 0, done // bdz .Ldone
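// For the second half of each 8-word AES-256 schedule block the recurrence
// uses SubWord without RotWord and without a round constant, hence the
// plain VSBOX on the splatted word below.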
VSPLTW $3, IN0, KEY // vspltw 3,1,3
VSLDOI $12, ZERO, IN1, TMP // vsldoi 6,0,2,12
VSBOX KEY, KEY // vsbox 3,3
VXOR IN1, TMP, IN1 // vxor 2,2,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN1, TMP, IN1 // vxor 2,2,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN1, TMP, IN1 // vxor 2,2,6
VXOR IN1, KEY, IN1 // vxor 2,2,3
JMP loop256 // b .Loop256
done:
RET
// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
MOVD nr+0(FP), R6 // Round count/Key size
MOVD xk+8(FP), R5 // Key pointer
MOVD dst+16(FP), R3 // Dest pointer
MOVD src+24(FP), R4 // Src pointer
#ifdef NEEDS_ESPERM
MOVD $·rcon(SB), R7
LVX (R7), ESPERM // Permute value for P8_ macros.
#endif
// Set CR{1,2,3}EQ to hold the key size information.
CMPU R6, $10, CR1
CMPU R6, $12, CR2
CMPU R6, $14, CR3
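// The block is xored with xk[0:3], then run through nr-1 VCIPHER rounds and
// a final VCIPHERLAST. Rounds shared by all key sizes are unrolled below;
// the CR1/CR2 branches skip the extra rounds for 128/192-bit keys, and CR3
// catches an unexpected round count.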
MOVD $16, R6
MOVD $32, R7
MOVD $48, R8
MOVD $64, R9
MOVD $80, R10
MOVD $96, R11
MOVD $112, R12
// Load text in BE order
P8_LXVB16X(R4, R0, V0)
// V1, V2 will hold keys, V0 is a temp.
// At completion, V2 will hold the ciphertext.
// Load xk[0:3] and xor with text
LXVD2X (R0+R5), V1
VXOR V0, V1, V0
// Load xk[4:11] and cipher
LXVD2X (R6+R5), V1
LXVD2X (R7+R5), V2
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Load xk[12:19] and cipher
LXVD2X (R8+R5), V1
LXVD2X (R9+R5), V2
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Load xk[20:27] and cipher
LXVD2X (R10+R5), V1
LXVD2X (R11+R5), V2
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Increment xk pointer to reuse constant offsets in R6-R12.
ADD $112, R5
// Load xk[28:35] and cipher
LXVD2X (R0+R5), V1
LXVD2X (R6+R5), V2
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Load xk[36:43] and cipher
LXVD2X (R7+R5), V1
LXVD2X (R8+R5), V2
BEQ CR1, Lenc_tail // Key size 10?
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Load xk[44:51] and cipher
LXVD2X (R9+R5), V1
LXVD2X (R10+R5), V2
BEQ CR2, Lenc_tail // Key size 12?
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Load xk[52:59] and cipher
LXVD2X (R11+R5), V1
LXVD2X (R12+R5), V2
BNE CR3, Linvalid_key_len // Not key size 14?
// Fallthrough to final cipher
Lenc_tail:
// Cipher last two keys such that key information is
// cleared from V1 and V2.
VCIPHER V0, V1, V1
VCIPHERLAST V1, V2, V2
// Store the result in BE order.
P8_STXVB16X(V2, R3, R0)
RET
Linvalid_key_len:
// Segfault, this should never happen. Only 3 key sizes are created/used.
MOVD R0, 0(R0)
RET
// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
MOVD nr+0(FP), R6 // Round count/Key size
MOVD xk+8(FP), R5 // Key pointer
MOVD dst+16(FP), R3 // Dest pointer
MOVD src+24(FP), R4 // Src pointer
#ifdef NEEDS_ESPERM
MOVD $·rcon(SB), R7
LVX (R7), ESPERM // Permute value for P8_ macros.
#endif
// Set CR{1,2,3}EQ to hold the key size information.
CMPU R6, $10, CR1
CMPU R6, $12, CR2
CMPU R6, $14, CR3
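// Same structure as encryptBlockAsm, but using VNCIPHER/VNCIPHERLAST with
// the decrypt key schedule, which expandKeyAsm stores as the encrypt
// schedule in reverse order.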
MOVD $16, R6
MOVD $32, R7
MOVD $48, R8
MOVD $64, R9
MOVD $80, R10
MOVD $96, R11
MOVD $112, R12
// Load text in BE order
P8_LXVB16X(R4, R0, V0)
// V1, V2 will hold keys, V0 is a temp.
// At completion, V2 will hold the plaintext.
// Load xk[0:3] and xor with ciphertext
LXVD2X (R0+R5), V1
VXOR V0, V1, V0
// Load xk[4:11] and cipher
LXVD2X (R6+R5), V1
LXVD2X (R7+R5), V2
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[12:19] and cipher
LXVD2X (R8+R5), V1
LXVD2X (R9+R5), V2
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[20:27] and cipher
LXVD2X (R10+R5), V1
LXVD2X (R11+R5), V2
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Increment xk pointer to reuse constant offsets in R6-R12.
ADD $112, R5
// Load xk[28:35] and cipher
LXVD2X (R0+R5), V1
LXVD2X (R6+R5), V2
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[36:43] and cipher
LXVD2X (R7+R5), V1
LXVD2X (R8+R5), V2
BEQ CR1, Ldec_tail // Key size 10?
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[44:51] and cipher
LXVD2X (R9+R5), V1
LXVD2X (R10+R5), V2
BEQ CR2, Ldec_tail // Key size 12?
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[52:59] and cipher
LXVD2X (R11+R5), V1
LXVD2X (R12+R5), V2
BNE CR3, Linvalid_key_len // Not key size 14?
// Fallthrough to final cipher
Ldec_tail:
// Cipher last two keys such that key information is
// cleared from V1 and V2.
VNCIPHER V0, V1, V1
VNCIPHERLAST V1, V2, V2
// Store the result in BE order.
P8_STXVB16X(V2, R3, R0)
RET
Linvalid_key_len:
// Segfault, this should never happen. Only 3 key sizes are created/used.
MOVD R0, 0(R0)
RET
// Remove defines from above so the names can be redefined below.
#undef INP
#undef OUTENC
#undef ROUNDS
#undef KEY
#undef TMP
#define INP R3
#define OUTP R4
#define LEN R5
#define KEYP R6
#define ROUNDS R7
#define IVP R8
#define ENC R9
#define INOUT V2
#define TMP V3
#define IVEC V4
// Load the crypt key into VSRs.
//
// The expanded key is stored and loaded using
// STXVD2X/LXVD2X. The in-memory byte ordering
// depends on the endianness of the machine. The
// expanded keys are generated by expandKeyAsm above.
//
// Rkeyp holds the key pointer. It is clobbered. Once
// the expanded keys are loaded, it is not needed.
//
// R12, R14-R21 are scratch registers.
// For nr = 10 (AES-128), V6 and V11-V20 hold the expanded key.
// For nr = 12 (AES-192), V6 and V9-V20 hold the expanded key.
// For nr = 14 (AES-256), V6 and V7-V20 hold the expanded key.
#define LOAD_KEY(Rkeyp) \
MOVD $16, R12 \
MOVD $32, R14 \
MOVD $48, R15 \
MOVD $64, R16 \
MOVD $80, R17 \
MOVD $96, R18 \
MOVD $112, R19 \
MOVD $128, R20 \
MOVD $144, R21 \
LXVD2X (R0+Rkeyp), V6 \
ADD $16, Rkeyp \
BEQ CR1, L_start10 \
BEQ CR2, L_start12 \
LXVD2X (R0+Rkeyp), V7 \
LXVD2X (R12+Rkeyp), V8 \
ADD $32, Rkeyp \
L_start12: \
LXVD2X (R0+Rkeyp), V9 \
LXVD2X (R12+Rkeyp), V10 \
ADD $32, Rkeyp \
L_start10: \
LXVD2X (R0+Rkeyp), V11 \
LXVD2X (R12+Rkeyp), V12 \
LXVD2X (R14+Rkeyp), V13 \
LXVD2X (R15+Rkeyp), V14 \
LXVD2X (R16+Rkeyp), V15 \
LXVD2X (R17+Rkeyp), V16 \
LXVD2X (R18+Rkeyp), V17 \
LXVD2X (R19+Rkeyp), V18 \
LXVD2X (R20+Rkeyp), V19 \
LXVD2X (R21+Rkeyp), V20
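// In every case V20 ends up holding the last round key, so CIPHER_BLOCK can
// always finish with the vciphel step on V20 regardless of key size.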
// Perform aes cipher operation for keysize 10/12/14 using the keys
// loaded by LOAD_KEY, and key size information held in CR1EQ/CR2EQ.
//
// Vxor is ideally V6 (Key[0-3]), but for slightly better encryption
// performance V6 and IVEC can be swapped (xor is both associative and
// commutative) during encryption:
//
// VXOR INOUT, IVEC, INOUT
// VXOR INOUT, V6, INOUT
//
// into
//
// VXOR INOUT, V6, INOUT
// VXOR INOUT, IVEC, INOUT
//
#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
VXOR Vin, Vxor, Vout \
BEQ CR1, label10 \
BEQ CR2, label12 \
vcipher Vout, V7, Vout \
vcipher Vout, V8, Vout \
label12: \
vcipher Vout, V9, Vout \
vcipher Vout, V10, Vout \
label10: \
vcipher Vout, V11, Vout \
vcipher Vout, V12, Vout \
vcipher Vout, V13, Vout \
vcipher Vout, V14, Vout \
vcipher Vout, V15, Vout \
vcipher Vout, V16, Vout \
vcipher Vout, V17, Vout \
vcipher Vout, V18, Vout \
vcipher Vout, V19, Vout \
vciphel Vout, V20, Vout
#define CLEAR_KEYS() \
VXOR V6, V6, V6 \
VXOR V7, V7, V7 \
VXOR V8, V8, V8 \
VXOR V9, V9, V9 \
VXOR V10, V10, V10 \
VXOR V11, V11, V11 \
VXOR V12, V12, V12 \
VXOR V13, V13, V13 \
VXOR V14, V14, V14 \
VXOR V15, V15, V15 \
VXOR V16, V16, V16 \
VXOR V17, V17, V17 \
VXOR V18, V18, V18 \
VXOR V19, V19, V19 \
VXOR V20, V20, V20
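// CLEAR_KEYS zeroes V6-V20 so that no expanded key material is left behind
// in vector registers when cryptBlocksChain returns.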
// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
MOVD src+0(FP), INP
MOVD dst+8(FP), OUTP
MOVD length+16(FP), LEN
MOVD key+24(FP), KEYP
MOVD iv+32(FP), IVP
MOVD enc+40(FP), ENC
MOVD nr+48(FP), ROUNDS
#ifdef NEEDS_ESPERM
MOVD $·rcon(SB), R11
LVX (R11), ESPERM // Permute value for P8_ macros.
#endif
// Assume len > 0 && len % blockSize == 0.
CMPW ENC, $0
P8_LXVB16X(IVP, R0, IVEC)
CMPU ROUNDS, $10, CR1
CMPU ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.
// Setup key in VSRs, and set loop count in CTR.
LOAD_KEY(KEYP)
SRD $4, LEN
MOVD LEN, CTR
BEQ Lcbc_dec
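// CBC chains blocks through IVEC: encryption computes
// C[i] = E(P[i] ^ C[i-1]) and decryption computes P[i] = D(C[i]) ^ C[i-1],
// with the caller's IV standing in for C[-1].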
PCALIGN $16
Lcbc_enc:
P8_LXVB16X(INP, R0, INOUT)
ADD $16, INP
VXOR INOUT, V6, INOUT
CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
VOR INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
P8_STXVB16X(INOUT, OUTP, R0)
ADD $16, OUTP
BDNZ Lcbc_enc
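// Save the final chaining value (the last ciphertext block) back through
// the iv pointer.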
P8_STXVB16X(INOUT, IVP, R0)
CLEAR_KEYS()
RET
PCALIGN $16
Lcbc_dec:
P8_LXVB16X(INP, R0, TMP)
ADD $16, INP
CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
VXOR INOUT, IVEC, INOUT
VOR TMP, TMP, IVEC // TMP is IVEC for next block.
P8_STXVB16X(INOUT, OUTP, R0)
ADD $16, OUTP
BDNZ Lcbc_dec
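// Save the final chaining value (the last ciphertext block processed) back
// through the iv pointer.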
P8_STXVB16X(IVEC, IVP, R0)
CLEAR_KEYS()
RET