diff options
author | deshevoy <deshevoy@yandex-team.ru> | 2022-02-10 16:46:57 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:57 +0300 |
commit | 28148f76dbfcc644d96427d41c92f36cbf2fdc6e (patch) | |
tree | b83306b6e37edeea782e9eed673d89286c4fef35 /contrib/libs/openssl/asm/aarch64/crypto/chacha | |
parent | e988f30484abe5fdeedcc7a5d3c226c01a21800c (diff) | |
download | ydb-28148f76dbfcc644d96427d41c92f36cbf2fdc6e.tar.gz |
Restoring authorship annotation for <deshevoy@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/openssl/asm/aarch64/crypto/chacha')
-rw-r--r-- | contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S | 3952 |
1 files changed, 1976 insertions, 1976 deletions
diff --git a/contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S b/contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S index aa3b872cfc..f4676cbf68 100644 --- a/contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S +++ b/contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S @@ -1,1977 +1,1977 @@ -#include "arm_arch.h" - -.text - - +#include "arm_arch.h" + +.text + + .hidden OPENSSL_armcap_P - -.align 5 -.Lsigma: -.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral -.Lone: -.long 1,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 - -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,%function -.align 5 -ChaCha20_ctr32: - cbz x2,.Labort - adr x5,.LOPENSSL_armcap_P - cmp x2,#192 - b.lo .Lshort -#ifdef __ILP32__ - ldrsw x6,[x5] -#else - ldr x6,[x5] -#endif - ldr w17,[x6,x5] - tst w17,#ARMV7_NEON - b.ne ChaCha20_neon - -.Lshort: -.inst 0xd503233f // paciasp - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ldp x28,x30,[x4] // load counter -#ifdef __ARMEB__ - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - -.Loop_outer: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov w7,w23 - lsr x8,x23,#32 - mov w9,w24 - lsr x10,x24,#32 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#64 -.Loop: - sub x4,x4,#1 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - ror w21,w21,#16 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#20 - ror w10,w10,#20 - ror w11,w11,#20 - ror w12,w12,#20 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - ror w21,w21,#24 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#25 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#16 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#20 - ror w11,w11,#20 - ror w12,w12,#20 - ror w9,w9,#20 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#24 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - ror w9,w9,#25 - cbnz x4,.Loop - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - b.lo .Ltail - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __ARMEB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - - b.hi .Loop_outer - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.inst 0xd50323bf // autiasp -.Labort: - ret - -.align 4 -.Ltail: - add x2,x2,#64 -.Less_than_64: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 -#ifdef __ARMEB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - stp x5,x7,[sp,#0] - stp x9,x11,[sp,#16] - stp x13,x15,[sp,#32] - stp x17,x20,[sp,#48] - -.Loop_tail: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,.Loop_tail - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.inst 0xd50323bf // autiasp - ret -.size ChaCha20_ctr32,.-ChaCha20_ctr32 - -.type ChaCha20_neon,%function -.align 5 -ChaCha20_neon: -.inst 0xd503233f // paciasp - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - cmp x2,#512 - b.hs .L512_or_more_neon - - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __ARMEB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - -.Loop_outer_neon: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov v0.16b,v24.16b - mov w7,w23 - lsr x8,x23,#32 - mov v4.16b,v24.16b - mov w9,w24 - lsr x10,x24,#32 - mov v16.16b,v24.16b - mov w11,w25 - mov v1.16b,v25.16b - lsr x12,x25,#32 - mov v5.16b,v25.16b - mov w13,w26 - mov v17.16b,v25.16b - lsr x14,x26,#32 - mov v3.16b,v27.16b - mov w15,w27 - mov v7.16b,v28.16b - lsr x16,x27,#32 - mov v19.16b,v29.16b - mov w17,w28 - mov v2.16b,v26.16b - lsr x19,x28,#32 - mov v6.16b,v26.16b - mov w20,w30 - mov v18.16b,v26.16b - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#256 -.Loop_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v16.4s,v16.4s,v17.4s - add w7,w7,w11 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w12 - eor v7.16b,v7.16b,v4.16b - eor w17,w17,w5 - eor v19.16b,v19.16b,v16.16b - eor w19,w19,w6 - rev32 v3.8h,v3.8h - eor w20,w20,w7 - rev32 v7.8h,v7.8h - eor w21,w21,w8 - rev32 v19.8h,v19.8h - ror w17,w17,#16 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#16 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#16 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#16 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#20 - add w16,w16,w21 - ushr v5.4s,v21.4s,#20 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#20 - eor w10,w10,w14 - sli v1.4s,v20.4s,#12 - eor w11,w11,w15 - sli v5.4s,v21.4s,#12 - eor w12,w12,w16 - sli v17.4s,v22.4s,#12 - ror w9,w9,#20 - add v0.4s,v0.4s,v1.4s - ror w10,w10,#20 - add v4.4s,v4.4s,v5.4s - ror w11,w11,#20 - add v16.4s,v16.4s,v17.4s - ror w12,w12,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w9 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w10 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w11 - ushr v3.4s,v20.4s,#24 - add w8,w8,w12 - ushr v7.4s,v21.4s,#24 - eor w17,w17,w5 - ushr v19.4s,v22.4s,#24 - eor w19,w19,w6 - sli v3.4s,v20.4s,#8 - eor w20,w20,w7 - sli v7.4s,v21.4s,#8 - eor w21,w21,w8 - sli v19.4s,v22.4s,#8 - ror w17,w17,#24 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#24 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#24 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#24 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#25 - add w16,w16,w21 - ushr v5.4s,v21.4s,#25 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#25 - eor w10,w10,w14 - sli v1.4s,v20.4s,#7 - eor w11,w11,w15 - sli v5.4s,v21.4s,#7 - eor w12,w12,w16 - sli v17.4s,v22.4s,#7 - ror w9,w9,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w10,w10,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w10 - add v4.4s,v4.4s,v5.4s - add w6,w6,w11 - add v16.4s,v16.4s,v17.4s - add w7,w7,w12 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w9 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w5 - eor v19.16b,v19.16b,v16.16b - eor w17,w17,w6 - rev32 v3.8h,v3.8h - eor w19,w19,w7 - rev32 v7.8h,v7.8h - eor w20,w20,w8 - rev32 v19.8h,v19.8h - ror w21,w21,#16 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#16 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#16 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#16 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#20 - add w14,w14,w20 - ushr v5.4s,v21.4s,#20 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#20 - eor w11,w11,w16 - sli v1.4s,v20.4s,#12 - eor w12,w12,w13 - sli v5.4s,v21.4s,#12 - eor w9,w9,w14 - sli v17.4s,v22.4s,#12 - ror w10,w10,#20 - add v0.4s,v0.4s,v1.4s - ror w11,w11,#20 - add v4.4s,v4.4s,v5.4s - ror w12,w12,#20 - add v16.4s,v16.4s,v17.4s - ror w9,w9,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w12 - ushr v3.4s,v20.4s,#24 - add w8,w8,w9 - ushr v7.4s,v21.4s,#24 - eor w21,w21,w5 - ushr v19.4s,v22.4s,#24 - eor w17,w17,w6 - sli v3.4s,v20.4s,#8 - eor w19,w19,w7 - sli v7.4s,v21.4s,#8 - eor w20,w20,w8 - sli v19.4s,v22.4s,#8 - ror w21,w21,#24 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#24 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#24 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#24 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#25 - add w14,w14,w20 - ushr v5.4s,v21.4s,#25 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#25 - eor w11,w11,w16 - sli v1.4s,v20.4s,#7 - eor w12,w12,w13 - sli v5.4s,v21.4s,#7 - eor w9,w9,w14 - sli v17.4s,v22.4s,#7 - ror w10,w10,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w11,w11,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w12,w12,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - cbnz x4,.Loop_neon - - add w5,w5,w22 // accumulate key block - add v0.4s,v0.4s,v24.4s - add x6,x6,x22,lsr#32 - add v4.4s,v4.4s,v24.4s - add w7,w7,w23 - add v16.4s,v16.4s,v24.4s - add x8,x8,x23,lsr#32 - add v2.4s,v2.4s,v26.4s - add w9,w9,w24 - add v6.4s,v6.4s,v26.4s - add x10,x10,x24,lsr#32 - add v18.4s,v18.4s,v26.4s - add w11,w11,w25 - add v3.4s,v3.4s,v27.4s - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add v7.4s,v7.4s,v28.4s - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add v19.4s,v19.4s,v29.4s - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add v1.4s,v1.4s,v25.4s - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add v5.4s,v5.4s,v25.4s - add x21,x21,x30,lsr#32 - add v17.4s,v17.4s,v25.4s - - b.lo .Ltail_neon - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __ARMEB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v20.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v21.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v22.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v23.16b - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - add v27.4s,v27.4s,v31.4s // += 4 - stp x13,x15,[x0,#32] - add v28.4s,v28.4s,v31.4s - stp x17,x20,[x0,#48] - add v29.4s,v29.4s,v31.4s - add x0,x0,#64 - - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - eor v16.16b,v16.16b,v0.16b - eor v17.16b,v17.16b,v1.16b - eor v18.16b,v18.16b,v2.16b - eor v19.16b,v19.16b,v3.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - b.hi .Loop_outer_neon - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.inst 0xd50323bf // autiasp - ret - -.Ltail_neon: - add x2,x2,#256 - cmp x2,#64 - b.lo .Less_than_64 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __ARMEB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - b.eq .Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo .Less_than_128 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v0.16b,v0.16b,v20.16b - eor v1.16b,v1.16b,v21.16b - eor v2.16b,v2.16b,v22.16b - eor v3.16b,v3.16b,v23.16b - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - b.eq .Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo .Less_than_192 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - b.eq .Ldone_neon - sub x2,x2,#64 - - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] - b .Last_neon - -.Less_than_128: - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] - b .Last_neon -.Less_than_192: - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] - b .Last_neon - -.align 4 -.Last_neon: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - -.Loop_tail_neon: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,.Loop_tail_neon - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - -.Ldone_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.inst 0xd50323bf // autiasp - ret -.size ChaCha20_neon,.-ChaCha20_neon -.type ChaCha20_512_neon,%function -.align 5 -ChaCha20_512_neon: -.inst 0xd503233f // paciasp - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - -.L512_or_more_neon: - sub sp,sp,#128+64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __ARMEB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - stp q24,q25,[sp,#0] // off-load key block, invariant part - add v27.4s,v27.4s,v31.4s // not typo - str q26,[sp,#32] - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - add v30.4s,v29.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - - stp d8,d9,[sp,#128+0] // meet ABI requirements - stp d10,d11,[sp,#128+16] - stp d12,d13,[sp,#128+32] - stp d14,d15,[sp,#128+48] - - sub x2,x2,#512 // not typo - -.Loop_outer_512_neon: - mov v0.16b,v24.16b - mov v4.16b,v24.16b - mov v8.16b,v24.16b - mov v12.16b,v24.16b - mov v16.16b,v24.16b - mov v20.16b,v24.16b - mov v1.16b,v25.16b - mov w5,w22 // unpack key block - mov v5.16b,v25.16b - lsr x6,x22,#32 - mov v9.16b,v25.16b - mov w7,w23 - mov v13.16b,v25.16b - lsr x8,x23,#32 - mov v17.16b,v25.16b - mov w9,w24 - mov v21.16b,v25.16b - lsr x10,x24,#32 - mov v3.16b,v27.16b - mov w11,w25 - mov v7.16b,v28.16b - lsr x12,x25,#32 - mov v11.16b,v29.16b - mov w13,w26 - mov v15.16b,v30.16b - lsr x14,x26,#32 - mov v2.16b,v26.16b - mov w15,w27 - mov v6.16b,v26.16b - lsr x16,x27,#32 - add v19.4s,v3.4s,v31.4s // +4 - mov w17,w28 - add v23.4s,v7.4s,v31.4s // +4 - lsr x19,x28,#32 - mov v10.16b,v26.16b - mov w20,w30 - mov v14.16b,v26.16b - lsr x21,x30,#32 - mov v18.16b,v26.16b - stp q27,q28,[sp,#48] // off-load key block, variable part - mov v22.16b,v26.16b - str q29,[sp,#80] - - mov x4,#5 - subs x2,x2,#512 -.Loop_upper_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_upper_neon - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __ARMEB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - mov w5,w22 // unpack key block - lsr x6,x22,#32 - stp x9,x11,[x0,#16] - mov w7,w23 - lsr x8,x23,#32 - stp x13,x15,[x0,#32] - mov w9,w24 - lsr x10,x24,#32 - stp x17,x20,[x0,#48] - add x0,x0,#64 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#5 -.Loop_lower_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_lower_neon - - add w5,w5,w22 // accumulate key block - ldp q24,q25,[sp,#0] - add x6,x6,x22,lsr#32 - ldp q26,q27,[sp,#32] - add w7,w7,w23 - ldp q28,q29,[sp,#64] - add x8,x8,x23,lsr#32 - add v0.4s,v0.4s,v24.4s - add w9,w9,w24 - add v4.4s,v4.4s,v24.4s - add x10,x10,x24,lsr#32 - add v8.4s,v8.4s,v24.4s - add w11,w11,w25 - add v12.4s,v12.4s,v24.4s - add x12,x12,x25,lsr#32 - add v16.4s,v16.4s,v24.4s - add w13,w13,w26 - add v20.4s,v20.4s,v24.4s - add x14,x14,x26,lsr#32 - add v2.4s,v2.4s,v26.4s - add w15,w15,w27 - add v6.4s,v6.4s,v26.4s - add x16,x16,x27,lsr#32 - add v10.4s,v10.4s,v26.4s - add w17,w17,w28 - add v14.4s,v14.4s,v26.4s - add x19,x19,x28,lsr#32 - add v18.4s,v18.4s,v26.4s - add w20,w20,w30 - add v22.4s,v22.4s,v26.4s - add x21,x21,x30,lsr#32 - add v19.4s,v19.4s,v31.4s // +4 - add x5,x5,x6,lsl#32 // pack - add v23.4s,v23.4s,v31.4s // +4 - add x7,x7,x8,lsl#32 - add v3.4s,v3.4s,v27.4s - ldp x6,x8,[x1,#0] // load input - add v7.4s,v7.4s,v28.4s - add x9,x9,x10,lsl#32 - add v11.4s,v11.4s,v29.4s - add x11,x11,x12,lsl#32 - add v15.4s,v15.4s,v30.4s - ldp x10,x12,[x1,#16] - add v19.4s,v19.4s,v27.4s - add x13,x13,x14,lsl#32 - add v23.4s,v23.4s,v28.4s - add x15,x15,x16,lsl#32 - add v1.4s,v1.4s,v25.4s - ldp x14,x16,[x1,#32] - add v5.4s,v5.4s,v25.4s - add x17,x17,x19,lsl#32 - add v9.4s,v9.4s,v25.4s - add x20,x20,x21,lsl#32 - add v13.4s,v13.4s,v25.4s - ldp x19,x21,[x1,#48] - add v17.4s,v17.4s,v25.4s - add x1,x1,#64 - add v21.4s,v21.4s,v25.4s - -#ifdef __ARMEB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v24.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v25.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v26.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v27.16b - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#7 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - eor v4.16b,v4.16b,v24.16b - eor v5.16b,v5.16b,v25.16b - eor v6.16b,v6.16b,v26.16b - eor v7.16b,v7.16b,v27.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - eor v8.16b,v8.16b,v0.16b - ldp q24,q25,[sp,#0] - eor v9.16b,v9.16b,v1.16b - ldp q26,q27,[sp,#32] - eor v10.16b,v10.16b,v2.16b - eor v11.16b,v11.16b,v3.16b - st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 - - ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 - eor v12.16b,v12.16b,v4.16b - eor v13.16b,v13.16b,v5.16b - eor v14.16b,v14.16b,v6.16b - eor v15.16b,v15.16b,v7.16b - st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 - - ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 - eor v16.16b,v16.16b,v8.16b - eor v17.16b,v17.16b,v9.16b - eor v18.16b,v18.16b,v10.16b - eor v19.16b,v19.16b,v11.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - shl v0.4s,v31.4s,#1 // 4 -> 8 - eor v20.16b,v20.16b,v12.16b - eor v21.16b,v21.16b,v13.16b - eor v22.16b,v22.16b,v14.16b - eor v23.16b,v23.16b,v15.16b - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 - - add v27.4s,v27.4s,v0.4s // += 8 - add v28.4s,v28.4s,v0.4s - add v29.4s,v29.4s,v0.4s - add v30.4s,v30.4s,v0.4s - - b.hs .Loop_outer_512_neon - - adds x2,x2,#512 - ushr v0.4s,v31.4s,#2 // 4 -> 1 - - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] - - stp q24,q31,[sp,#0] // wipe off-load area - stp q24,q31,[sp,#32] - stp q24,q31,[sp,#64] - - b.eq .Ldone_512_neon - - cmp x2,#192 - sub v27.4s,v27.4s,v0.4s // -= 1 - sub v28.4s,v28.4s,v0.4s - sub v29.4s,v29.4s,v0.4s - add sp,sp,#128 - b.hs .Loop_outer_neon - - eor v25.16b,v25.16b,v25.16b - eor v26.16b,v26.16b,v26.16b - eor v27.16b,v27.16b,v27.16b - eor v28.16b,v28.16b,v28.16b - eor v29.16b,v29.16b,v29.16b - eor v30.16b,v30.16b,v30.16b - b .Loop_outer - -.Ldone_512_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.inst 0xd50323bf // autiasp - ret -.size ChaCha20_512_neon,.-ChaCha20_512_neon + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else +.quad OPENSSL_armcap_P-. +#endif +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: + cbz x2,.Labort + adr x5,.LOPENSSL_armcap_P + cmp x2,#192 + b.lo .Lshort +#ifdef __ILP32__ + ldrsw x6,[x5] +#else + ldr x6,[x5] +#endif + ldr w17,[x6,x5] + tst w17,#ARMV7_NEON + b.ne ChaCha20_neon + +.Lshort: +.inst 0xd503233f // paciasp + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr x5,.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __ARMEB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +.Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +.Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,.Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo .Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.inst 0xd50323bf // autiasp +.Labort: + ret + +.align 4 +.Ltail: + add x2,x2,#64 +.Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +.Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.inst 0xd50323bf // autiasp + ret +.size ChaCha20_ctr32,.-ChaCha20_ctr32 + +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: +.inst 0xd503233f // paciasp + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr x5,.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +.Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +.Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,.Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo .Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.inst 0xd50323bf // autiasp + ret + +.Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo .Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b .Last_neon + +.Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b .Last_neon +.Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +.Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.inst 0xd50323bf // autiasp + ret +.size ChaCha20_neon,.-ChaCha20_neon +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: +.inst 0xd503233f // paciasp + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr x5,.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +.Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +.Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +.Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs .Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq .Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.inst 0xd50323bf // autiasp + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon |