aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/openssl/asm/aarch64/crypto/chacha
diff options
context:
space:
mode:
authordeshevoy <deshevoy@yandex-team.ru>2022-02-10 16:46:57 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:46:57 +0300
commit28148f76dbfcc644d96427d41c92f36cbf2fdc6e (patch)
treeb83306b6e37edeea782e9eed673d89286c4fef35 /contrib/libs/openssl/asm/aarch64/crypto/chacha
parente988f30484abe5fdeedcc7a5d3c226c01a21800c (diff)
downloadydb-28148f76dbfcc644d96427d41c92f36cbf2fdc6e.tar.gz
Restoring authorship annotation for <deshevoy@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/openssl/asm/aarch64/crypto/chacha')
-rw-r--r--contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S3952
1 files changed, 1976 insertions, 1976 deletions
diff --git a/contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S b/contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S
index aa3b872cfc..f4676cbf68 100644
--- a/contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S
+++ b/contrib/libs/openssl/asm/aarch64/crypto/chacha/chacha-armv8.S
@@ -1,1977 +1,1977 @@
-#include "arm_arch.h"
-
-.text
-
-
+#include "arm_arch.h"
+
+.text
+
+
.hidden OPENSSL_armcap_P
-
-.align 5
-.Lsigma:
-.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
-.Lone:
-.long 1,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
-#endif
-.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
-.align 5
-ChaCha20_ctr32:
- cbz x2,.Labort
- adr x5,.LOPENSSL_armcap_P
- cmp x2,#192
- b.lo .Lshort
-#ifdef __ILP32__
- ldrsw x6,[x5]
-#else
- ldr x6,[x5]
-#endif
- ldr w17,[x6,x5]
- tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon
-
-.Lshort:
-.inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr x5,.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64
-
- ldp x22,x23,[x5] // load sigma
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ldp x28,x30,[x4] // load counter
-#ifdef __ARMEB__
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
-
-.Loop_outer:
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- mov w7,w23
- lsr x8,x23,#32
- mov w9,w24
- lsr x10,x24,#32
- mov w11,w25
- lsr x12,x25,#32
- mov w13,w26
- lsr x14,x26,#32
- mov w15,w27
- lsr x16,x27,#32
- mov w17,w28
- lsr x19,x28,#32
- mov w20,w30
- lsr x21,x30,#32
-
- mov x4,#10
- subs x2,x2,#64
-.Loop:
- sub x4,x4,#1
- add w5,w5,w9
- add w6,w6,w10
- add w7,w7,w11
- add w8,w8,w12
- eor w17,w17,w5
- eor w19,w19,w6
- eor w20,w20,w7
- eor w21,w21,w8
- ror w17,w17,#16
- ror w19,w19,#16
- ror w20,w20,#16
- ror w21,w21,#16
- add w13,w13,w17
- add w14,w14,w19
- add w15,w15,w20
- add w16,w16,w21
- eor w9,w9,w13
- eor w10,w10,w14
- eor w11,w11,w15
- eor w12,w12,w16
- ror w9,w9,#20
- ror w10,w10,#20
- ror w11,w11,#20
- ror w12,w12,#20
- add w5,w5,w9
- add w6,w6,w10
- add w7,w7,w11
- add w8,w8,w12
- eor w17,w17,w5
- eor w19,w19,w6
- eor w20,w20,w7
- eor w21,w21,w8
- ror w17,w17,#24
- ror w19,w19,#24
- ror w20,w20,#24
- ror w21,w21,#24
- add w13,w13,w17
- add w14,w14,w19
- add w15,w15,w20
- add w16,w16,w21
- eor w9,w9,w13
- eor w10,w10,w14
- eor w11,w11,w15
- eor w12,w12,w16
- ror w9,w9,#25
- ror w10,w10,#25
- ror w11,w11,#25
- ror w12,w12,#25
- add w5,w5,w10
- add w6,w6,w11
- add w7,w7,w12
- add w8,w8,w9
- eor w21,w21,w5
- eor w17,w17,w6
- eor w19,w19,w7
- eor w20,w20,w8
- ror w21,w21,#16
- ror w17,w17,#16
- ror w19,w19,#16
- ror w20,w20,#16
- add w15,w15,w21
- add w16,w16,w17
- add w13,w13,w19
- add w14,w14,w20
- eor w10,w10,w15
- eor w11,w11,w16
- eor w12,w12,w13
- eor w9,w9,w14
- ror w10,w10,#20
- ror w11,w11,#20
- ror w12,w12,#20
- ror w9,w9,#20
- add w5,w5,w10
- add w6,w6,w11
- add w7,w7,w12
- add w8,w8,w9
- eor w21,w21,w5
- eor w17,w17,w6
- eor w19,w19,w7
- eor w20,w20,w8
- ror w21,w21,#24
- ror w17,w17,#24
- ror w19,w19,#24
- ror w20,w20,#24
- add w15,w15,w21
- add w16,w16,w17
- add w13,w13,w19
- add w14,w14,w20
- eor w10,w10,w15
- eor w11,w11,w16
- eor w12,w12,w13
- eor w9,w9,w14
- ror w10,w10,#25
- ror w11,w11,#25
- ror w12,w12,#25
- ror w9,w9,#25
- cbnz x4,.Loop
-
- add w5,w5,w22 // accumulate key block
- add x6,x6,x22,lsr#32
- add w7,w7,w23
- add x8,x8,x23,lsr#32
- add w9,w9,w24
- add x10,x10,x24,lsr#32
- add w11,w11,w25
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add x21,x21,x30,lsr#32
-
- b.lo .Ltail
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#1 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
-
- b.hi .Loop_outer
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.inst 0xd50323bf // autiasp
-.Labort:
- ret
-
-.align 4
-.Ltail:
- add x2,x2,#64
-.Less_than_64:
- sub x0,x0,#1
- add x1,x1,x2
- add x0,x0,x2
- add x4,sp,x2
- neg x2,x2
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
-#ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- stp x5,x7,[sp,#0]
- stp x9,x11,[sp,#16]
- stp x13,x15,[sp,#32]
- stp x17,x20,[sp,#48]
-
-.Loop_tail:
- ldrb w10,[x1,x2]
- ldrb w11,[x4,x2]
- add x2,x2,#1
- eor w10,w10,w11
- strb w10,[x0,x2]
- cbnz x2,.Loop_tail
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.inst 0xd50323bf // autiasp
- ret
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-
-.type ChaCha20_neon,%function
-.align 5
-ChaCha20_neon:
-.inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr x5,.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- cmp x2,#512
- b.hs .L512_or_more_neon
-
- sub sp,sp,#64
-
- ldp x22,x23,[x5] // load sigma
- ld1 {v24.4s},[x5],#16
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ld1 {v25.4s,v26.4s},[x3]
- ldp x28,x30,[x4] // load counter
- ld1 {v27.4s},[x4]
- ld1 {v31.4s},[x5]
-#ifdef __ARMEB__
- rev64 v24.4s,v24.4s
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
- add v27.4s,v27.4s,v31.4s // += 1
- add v28.4s,v27.4s,v31.4s
- add v29.4s,v28.4s,v31.4s
- shl v31.4s,v31.4s,#2 // 1 -> 4
-
-.Loop_outer_neon:
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- mov v0.16b,v24.16b
- mov w7,w23
- lsr x8,x23,#32
- mov v4.16b,v24.16b
- mov w9,w24
- lsr x10,x24,#32
- mov v16.16b,v24.16b
- mov w11,w25
- mov v1.16b,v25.16b
- lsr x12,x25,#32
- mov v5.16b,v25.16b
- mov w13,w26
- mov v17.16b,v25.16b
- lsr x14,x26,#32
- mov v3.16b,v27.16b
- mov w15,w27
- mov v7.16b,v28.16b
- lsr x16,x27,#32
- mov v19.16b,v29.16b
- mov w17,w28
- mov v2.16b,v26.16b
- lsr x19,x28,#32
- mov v6.16b,v26.16b
- mov w20,w30
- mov v18.16b,v26.16b
- lsr x21,x30,#32
-
- mov x4,#10
- subs x2,x2,#256
-.Loop_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v16.4s,v16.4s,v17.4s
- add w7,w7,w11
- eor v3.16b,v3.16b,v0.16b
- add w8,w8,w12
- eor v7.16b,v7.16b,v4.16b
- eor w17,w17,w5
- eor v19.16b,v19.16b,v16.16b
- eor w19,w19,w6
- rev32 v3.8h,v3.8h
- eor w20,w20,w7
- rev32 v7.8h,v7.8h
- eor w21,w21,w8
- rev32 v19.8h,v19.8h
- ror w17,w17,#16
- add v2.4s,v2.4s,v3.4s
- ror w19,w19,#16
- add v6.4s,v6.4s,v7.4s
- ror w20,w20,#16
- add v18.4s,v18.4s,v19.4s
- ror w21,w21,#16
- eor v20.16b,v1.16b,v2.16b
- add w13,w13,w17
- eor v21.16b,v5.16b,v6.16b
- add w14,w14,w19
- eor v22.16b,v17.16b,v18.16b
- add w15,w15,w20
- ushr v1.4s,v20.4s,#20
- add w16,w16,w21
- ushr v5.4s,v21.4s,#20
- eor w9,w9,w13
- ushr v17.4s,v22.4s,#20
- eor w10,w10,w14
- sli v1.4s,v20.4s,#12
- eor w11,w11,w15
- sli v5.4s,v21.4s,#12
- eor w12,w12,w16
- sli v17.4s,v22.4s,#12
- ror w9,w9,#20
- add v0.4s,v0.4s,v1.4s
- ror w10,w10,#20
- add v4.4s,v4.4s,v5.4s
- ror w11,w11,#20
- add v16.4s,v16.4s,v17.4s
- ror w12,w12,#20
- eor v20.16b,v3.16b,v0.16b
- add w5,w5,w9
- eor v21.16b,v7.16b,v4.16b
- add w6,w6,w10
- eor v22.16b,v19.16b,v16.16b
- add w7,w7,w11
- ushr v3.4s,v20.4s,#24
- add w8,w8,w12
- ushr v7.4s,v21.4s,#24
- eor w17,w17,w5
- ushr v19.4s,v22.4s,#24
- eor w19,w19,w6
- sli v3.4s,v20.4s,#8
- eor w20,w20,w7
- sli v7.4s,v21.4s,#8
- eor w21,w21,w8
- sli v19.4s,v22.4s,#8
- ror w17,w17,#24
- add v2.4s,v2.4s,v3.4s
- ror w19,w19,#24
- add v6.4s,v6.4s,v7.4s
- ror w20,w20,#24
- add v18.4s,v18.4s,v19.4s
- ror w21,w21,#24
- eor v20.16b,v1.16b,v2.16b
- add w13,w13,w17
- eor v21.16b,v5.16b,v6.16b
- add w14,w14,w19
- eor v22.16b,v17.16b,v18.16b
- add w15,w15,w20
- ushr v1.4s,v20.4s,#25
- add w16,w16,w21
- ushr v5.4s,v21.4s,#25
- eor w9,w9,w13
- ushr v17.4s,v22.4s,#25
- eor w10,w10,w14
- sli v1.4s,v20.4s,#7
- eor w11,w11,w15
- sli v5.4s,v21.4s,#7
- eor w12,w12,w16
- sli v17.4s,v22.4s,#7
- ror w9,w9,#25
- ext v2.16b,v2.16b,v2.16b,#8
- ror w10,w10,#25
- ext v6.16b,v6.16b,v6.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w10
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w11
- add v16.4s,v16.4s,v17.4s
- add w7,w7,w12
- eor v3.16b,v3.16b,v0.16b
- add w8,w8,w9
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w5
- eor v19.16b,v19.16b,v16.16b
- eor w17,w17,w6
- rev32 v3.8h,v3.8h
- eor w19,w19,w7
- rev32 v7.8h,v7.8h
- eor w20,w20,w8
- rev32 v19.8h,v19.8h
- ror w21,w21,#16
- add v2.4s,v2.4s,v3.4s
- ror w17,w17,#16
- add v6.4s,v6.4s,v7.4s
- ror w19,w19,#16
- add v18.4s,v18.4s,v19.4s
- ror w20,w20,#16
- eor v20.16b,v1.16b,v2.16b
- add w15,w15,w21
- eor v21.16b,v5.16b,v6.16b
- add w16,w16,w17
- eor v22.16b,v17.16b,v18.16b
- add w13,w13,w19
- ushr v1.4s,v20.4s,#20
- add w14,w14,w20
- ushr v5.4s,v21.4s,#20
- eor w10,w10,w15
- ushr v17.4s,v22.4s,#20
- eor w11,w11,w16
- sli v1.4s,v20.4s,#12
- eor w12,w12,w13
- sli v5.4s,v21.4s,#12
- eor w9,w9,w14
- sli v17.4s,v22.4s,#12
- ror w10,w10,#20
- add v0.4s,v0.4s,v1.4s
- ror w11,w11,#20
- add v4.4s,v4.4s,v5.4s
- ror w12,w12,#20
- add v16.4s,v16.4s,v17.4s
- ror w9,w9,#20
- eor v20.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v21.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v22.16b,v19.16b,v16.16b
- add w7,w7,w12
- ushr v3.4s,v20.4s,#24
- add w8,w8,w9
- ushr v7.4s,v21.4s,#24
- eor w21,w21,w5
- ushr v19.4s,v22.4s,#24
- eor w17,w17,w6
- sli v3.4s,v20.4s,#8
- eor w19,w19,w7
- sli v7.4s,v21.4s,#8
- eor w20,w20,w8
- sli v19.4s,v22.4s,#8
- ror w21,w21,#24
- add v2.4s,v2.4s,v3.4s
- ror w17,w17,#24
- add v6.4s,v6.4s,v7.4s
- ror w19,w19,#24
- add v18.4s,v18.4s,v19.4s
- ror w20,w20,#24
- eor v20.16b,v1.16b,v2.16b
- add w15,w15,w21
- eor v21.16b,v5.16b,v6.16b
- add w16,w16,w17
- eor v22.16b,v17.16b,v18.16b
- add w13,w13,w19
- ushr v1.4s,v20.4s,#25
- add w14,w14,w20
- ushr v5.4s,v21.4s,#25
- eor w10,w10,w15
- ushr v17.4s,v22.4s,#25
- eor w11,w11,w16
- sli v1.4s,v20.4s,#7
- eor w12,w12,w13
- sli v5.4s,v21.4s,#7
- eor w9,w9,w14
- sli v17.4s,v22.4s,#7
- ror w10,w10,#25
- ext v2.16b,v2.16b,v2.16b,#8
- ror w11,w11,#25
- ext v6.16b,v6.16b,v6.16b,#8
- ror w12,w12,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- cbnz x4,.Loop_neon
-
- add w5,w5,w22 // accumulate key block
- add v0.4s,v0.4s,v24.4s
- add x6,x6,x22,lsr#32
- add v4.4s,v4.4s,v24.4s
- add w7,w7,w23
- add v16.4s,v16.4s,v24.4s
- add x8,x8,x23,lsr#32
- add v2.4s,v2.4s,v26.4s
- add w9,w9,w24
- add v6.4s,v6.4s,v26.4s
- add x10,x10,x24,lsr#32
- add v18.4s,v18.4s,v26.4s
- add w11,w11,w25
- add v3.4s,v3.4s,v27.4s
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add v7.4s,v7.4s,v28.4s
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add v19.4s,v19.4s,v29.4s
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add v1.4s,v1.4s,v25.4s
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add v5.4s,v5.4s,v25.4s
- add x21,x21,x30,lsr#32
- add v17.4s,v17.4s,v25.4s
-
- b.lo .Ltail_neon
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor v0.16b,v0.16b,v20.16b
- eor x15,x15,x16
- eor v1.16b,v1.16b,v21.16b
- eor x17,x17,x19
- eor v2.16b,v2.16b,v22.16b
- eor x20,x20,x21
- eor v3.16b,v3.16b,v23.16b
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#4 // increment counter
- stp x9,x11,[x0,#16]
- add v27.4s,v27.4s,v31.4s // += 4
- stp x13,x15,[x0,#32]
- add v28.4s,v28.4s,v31.4s
- stp x17,x20,[x0,#48]
- add v29.4s,v29.4s,v31.4s
- add x0,x0,#64
-
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
-
- eor v4.16b,v4.16b,v20.16b
- eor v5.16b,v5.16b,v21.16b
- eor v6.16b,v6.16b,v22.16b
- eor v7.16b,v7.16b,v23.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
- eor v16.16b,v16.16b,v0.16b
- eor v17.16b,v17.16b,v1.16b
- eor v18.16b,v18.16b,v2.16b
- eor v19.16b,v19.16b,v3.16b
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
- b.hi .Loop_outer_neon
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.inst 0xd50323bf // autiasp
- ret
-
-.Ltail_neon:
- add x2,x2,#256
- cmp x2,#64
- b.lo .Less_than_64
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#4 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- b.eq .Ldone_neon
- sub x2,x2,#64
- cmp x2,#64
- b.lo .Less_than_128
-
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor v0.16b,v0.16b,v20.16b
- eor v1.16b,v1.16b,v21.16b
- eor v2.16b,v2.16b,v22.16b
- eor v3.16b,v3.16b,v23.16b
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
- b.eq .Ldone_neon
- sub x2,x2,#64
- cmp x2,#64
- b.lo .Less_than_192
-
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor v4.16b,v4.16b,v20.16b
- eor v5.16b,v5.16b,v21.16b
- eor v6.16b,v6.16b,v22.16b
- eor v7.16b,v7.16b,v23.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
- b.eq .Ldone_neon
- sub x2,x2,#64
-
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
- b .Last_neon
-
-.Less_than_128:
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
- b .Last_neon
-.Less_than_192:
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
- b .Last_neon
-
-.align 4
-.Last_neon:
- sub x0,x0,#1
- add x1,x1,x2
- add x0,x0,x2
- add x4,sp,x2
- neg x2,x2
-
-.Loop_tail_neon:
- ldrb w10,[x1,x2]
- ldrb w11,[x4,x2]
- add x2,x2,#1
- eor w10,w10,w11
- strb w10,[x0,x2]
- cbnz x2,.Loop_tail_neon
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
-.Ldone_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.inst 0xd50323bf // autiasp
- ret
-.size ChaCha20_neon,.-ChaCha20_neon
-.type ChaCha20_512_neon,%function
-.align 5
-ChaCha20_512_neon:
-.inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr x5,.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-
-.L512_or_more_neon:
- sub sp,sp,#128+64
-
- ldp x22,x23,[x5] // load sigma
- ld1 {v24.4s},[x5],#16
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ld1 {v25.4s,v26.4s},[x3]
- ldp x28,x30,[x4] // load counter
- ld1 {v27.4s},[x4]
- ld1 {v31.4s},[x5]
-#ifdef __ARMEB__
- rev64 v24.4s,v24.4s
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
- add v27.4s,v27.4s,v31.4s // += 1
- stp q24,q25,[sp,#0] // off-load key block, invariant part
- add v27.4s,v27.4s,v31.4s // not typo
- str q26,[sp,#32]
- add v28.4s,v27.4s,v31.4s
- add v29.4s,v28.4s,v31.4s
- add v30.4s,v29.4s,v31.4s
- shl v31.4s,v31.4s,#2 // 1 -> 4
-
- stp d8,d9,[sp,#128+0] // meet ABI requirements
- stp d10,d11,[sp,#128+16]
- stp d12,d13,[sp,#128+32]
- stp d14,d15,[sp,#128+48]
-
- sub x2,x2,#512 // not typo
-
-.Loop_outer_512_neon:
- mov v0.16b,v24.16b
- mov v4.16b,v24.16b
- mov v8.16b,v24.16b
- mov v12.16b,v24.16b
- mov v16.16b,v24.16b
- mov v20.16b,v24.16b
- mov v1.16b,v25.16b
- mov w5,w22 // unpack key block
- mov v5.16b,v25.16b
- lsr x6,x22,#32
- mov v9.16b,v25.16b
- mov w7,w23
- mov v13.16b,v25.16b
- lsr x8,x23,#32
- mov v17.16b,v25.16b
- mov w9,w24
- mov v21.16b,v25.16b
- lsr x10,x24,#32
- mov v3.16b,v27.16b
- mov w11,w25
- mov v7.16b,v28.16b
- lsr x12,x25,#32
- mov v11.16b,v29.16b
- mov w13,w26
- mov v15.16b,v30.16b
- lsr x14,x26,#32
- mov v2.16b,v26.16b
- mov w15,w27
- mov v6.16b,v26.16b
- lsr x16,x27,#32
- add v19.4s,v3.4s,v31.4s // +4
- mov w17,w28
- add v23.4s,v7.4s,v31.4s // +4
- lsr x19,x28,#32
- mov v10.16b,v26.16b
- mov w20,w30
- mov v14.16b,v26.16b
- lsr x21,x30,#32
- mov v18.16b,v26.16b
- stp q27,q28,[sp,#48] // off-load key block, variable part
- mov v22.16b,v26.16b
- str q29,[sp,#80]
-
- mov x4,#5
- subs x2,x2,#512
-.Loop_upper_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v11.16b,v11.16b,v11.16b,#12
- ext v15.16b,v15.16b,v15.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v23.16b,v23.16b,v23.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v9.16b,v9.16b,v9.16b,#4
- ext v13.16b,v13.16b,v13.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- ext v21.16b,v21.16b,v21.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v11.16b,v11.16b,v11.16b,#4
- ext v15.16b,v15.16b,v15.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v23.16b,v23.16b,v23.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v9.16b,v9.16b,v9.16b,#12
- ext v13.16b,v13.16b,v13.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- ext v21.16b,v21.16b,v21.16b,#12
- cbnz x4,.Loop_upper_neon
-
- add w5,w5,w22 // accumulate key block
- add x6,x6,x22,lsr#32
- add w7,w7,w23
- add x8,x8,x23,lsr#32
- add w9,w9,w24
- add x10,x10,x24,lsr#32
- add w11,w11,w25
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add x21,x21,x30,lsr#32
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#1 // increment counter
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- stp x9,x11,[x0,#16]
- mov w7,w23
- lsr x8,x23,#32
- stp x13,x15,[x0,#32]
- mov w9,w24
- lsr x10,x24,#32
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- mov w11,w25
- lsr x12,x25,#32
- mov w13,w26
- lsr x14,x26,#32
- mov w15,w27
- lsr x16,x27,#32
- mov w17,w28
- lsr x19,x28,#32
- mov w20,w30
- lsr x21,x30,#32
-
- mov x4,#5
-.Loop_lower_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v11.16b,v11.16b,v11.16b,#12
- ext v15.16b,v15.16b,v15.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v23.16b,v23.16b,v23.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v9.16b,v9.16b,v9.16b,#4
- ext v13.16b,v13.16b,v13.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- ext v21.16b,v21.16b,v21.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v11.16b,v11.16b,v11.16b,#4
- ext v15.16b,v15.16b,v15.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v23.16b,v23.16b,v23.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v9.16b,v9.16b,v9.16b,#12
- ext v13.16b,v13.16b,v13.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- ext v21.16b,v21.16b,v21.16b,#12
- cbnz x4,.Loop_lower_neon
-
- add w5,w5,w22 // accumulate key block
- ldp q24,q25,[sp,#0]
- add x6,x6,x22,lsr#32
- ldp q26,q27,[sp,#32]
- add w7,w7,w23
- ldp q28,q29,[sp,#64]
- add x8,x8,x23,lsr#32
- add v0.4s,v0.4s,v24.4s
- add w9,w9,w24
- add v4.4s,v4.4s,v24.4s
- add x10,x10,x24,lsr#32
- add v8.4s,v8.4s,v24.4s
- add w11,w11,w25
- add v12.4s,v12.4s,v24.4s
- add x12,x12,x25,lsr#32
- add v16.4s,v16.4s,v24.4s
- add w13,w13,w26
- add v20.4s,v20.4s,v24.4s
- add x14,x14,x26,lsr#32
- add v2.4s,v2.4s,v26.4s
- add w15,w15,w27
- add v6.4s,v6.4s,v26.4s
- add x16,x16,x27,lsr#32
- add v10.4s,v10.4s,v26.4s
- add w17,w17,w28
- add v14.4s,v14.4s,v26.4s
- add x19,x19,x28,lsr#32
- add v18.4s,v18.4s,v26.4s
- add w20,w20,w30
- add v22.4s,v22.4s,v26.4s
- add x21,x21,x30,lsr#32
- add v19.4s,v19.4s,v31.4s // +4
- add x5,x5,x6,lsl#32 // pack
- add v23.4s,v23.4s,v31.4s // +4
- add x7,x7,x8,lsl#32
- add v3.4s,v3.4s,v27.4s
- ldp x6,x8,[x1,#0] // load input
- add v7.4s,v7.4s,v28.4s
- add x9,x9,x10,lsl#32
- add v11.4s,v11.4s,v29.4s
- add x11,x11,x12,lsl#32
- add v15.4s,v15.4s,v30.4s
- ldp x10,x12,[x1,#16]
- add v19.4s,v19.4s,v27.4s
- add x13,x13,x14,lsl#32
- add v23.4s,v23.4s,v28.4s
- add x15,x15,x16,lsl#32
- add v1.4s,v1.4s,v25.4s
- ldp x14,x16,[x1,#32]
- add v5.4s,v5.4s,v25.4s
- add x17,x17,x19,lsl#32
- add v9.4s,v9.4s,v25.4s
- add x20,x20,x21,lsl#32
- add v13.4s,v13.4s,v25.4s
- ldp x19,x21,[x1,#48]
- add v17.4s,v17.4s,v25.4s
- add x1,x1,#64
- add v21.4s,v21.4s,v25.4s
-
-#ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor v0.16b,v0.16b,v24.16b
- eor x15,x15,x16
- eor v1.16b,v1.16b,v25.16b
- eor x17,x17,x19
- eor v2.16b,v2.16b,v26.16b
- eor x20,x20,x21
- eor v3.16b,v3.16b,v27.16b
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#7 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
- eor v4.16b,v4.16b,v24.16b
- eor v5.16b,v5.16b,v25.16b
- eor v6.16b,v6.16b,v26.16b
- eor v7.16b,v7.16b,v27.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
- ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
- eor v8.16b,v8.16b,v0.16b
- ldp q24,q25,[sp,#0]
- eor v9.16b,v9.16b,v1.16b
- ldp q26,q27,[sp,#32]
- eor v10.16b,v10.16b,v2.16b
- eor v11.16b,v11.16b,v3.16b
- st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
-
- ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
- eor v12.16b,v12.16b,v4.16b
- eor v13.16b,v13.16b,v5.16b
- eor v14.16b,v14.16b,v6.16b
- eor v15.16b,v15.16b,v7.16b
- st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
-
- ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
- eor v16.16b,v16.16b,v8.16b
- eor v17.16b,v17.16b,v9.16b
- eor v18.16b,v18.16b,v10.16b
- eor v19.16b,v19.16b,v11.16b
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
- shl v0.4s,v31.4s,#1 // 4 -> 8
- eor v20.16b,v20.16b,v12.16b
- eor v21.16b,v21.16b,v13.16b
- eor v22.16b,v22.16b,v14.16b
- eor v23.16b,v23.16b,v15.16b
- st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
-
- add v27.4s,v27.4s,v0.4s // += 8
- add v28.4s,v28.4s,v0.4s
- add v29.4s,v29.4s,v0.4s
- add v30.4s,v30.4s,v0.4s
-
- b.hs .Loop_outer_512_neon
-
- adds x2,x2,#512
- ushr v0.4s,v31.4s,#2 // 4 -> 1
-
- ldp d8,d9,[sp,#128+0] // meet ABI requirements
- ldp d10,d11,[sp,#128+16]
- ldp d12,d13,[sp,#128+32]
- ldp d14,d15,[sp,#128+48]
-
- stp q24,q31,[sp,#0] // wipe off-load area
- stp q24,q31,[sp,#32]
- stp q24,q31,[sp,#64]
-
- b.eq .Ldone_512_neon
-
- cmp x2,#192
- sub v27.4s,v27.4s,v0.4s // -= 1
- sub v28.4s,v28.4s,v0.4s
- sub v29.4s,v29.4s,v0.4s
- add sp,sp,#128
- b.hs .Loop_outer_neon
-
- eor v25.16b,v25.16b,v25.16b
- eor v26.16b,v26.16b,v26.16b
- eor v27.16b,v27.16b,v27.16b
- eor v28.16b,v28.16b,v28.16b
- eor v29.16b,v29.16b,v29.16b
- eor v30.16b,v30.16b,v30.16b
- b .Loop_outer
-
-.Ldone_512_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#128+64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.inst 0xd50323bf // autiasp
- ret
-.size ChaCha20_512_neon,.-ChaCha20_512_neon
+
+.align 5
+.Lsigma: // constant row of the ChaCha state (unpacked into w5-w8 / copied into v24 by the code below)
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral; as little-endian bytes this is ASCII "expand 32-byte k"
+.Lone: // vector {1,0,0,0}: the NEON paths load it into v31 and add it to the counter row
+.long 1,0,0,0
+.LOPENSSL_armcap_P: // offset from this location to OPENSSL_armcap_P, so the flag word can be reached PC-relatively
+#ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+#else
+.quad OPENSSL_armcap_P-.
+#endif
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // NUL-terminated ASCII banner starting "ChaCha20 for ARMv8, CRYPTOGAMS by"
+.align 2
+
+.globl ChaCha20_ctr32 // exported entry point
+.type ChaCha20_ctr32,%function // register use seen below: x0 = output, x1 = input, x2 = length in bytes, x3 = 32-byte key, x4 = 16-byte counter block
+.align 5
+ChaCha20_ctr32:
+ cbz x2,.Labort // zero length: return at once (before any frame is built)
+ adr x5,.LOPENSSL_armcap_P
+ cmp x2,#192
+ b.lo .Lshort // fewer than 192 bytes: use the scalar code without testing for NEON
+#ifdef __ILP32__
+ ldrsw x6,[x5]
+#else
+ ldr x6,[x5]
+#endif
+ ldr w17,[x6,x5] // w17 = OPENSSL_armcap_P (address = x5 + stored offset)
+ tst w17,#ARMV7_NEON
+ b.ne ChaCha20_neon // NEON flag set: branch (not call) to the NEON implementation
+
+.Lshort: // scalar implementation: one 64-byte block per outer iteration
+.inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-96]! // 96-byte frame: x29/x30 here, x19-x28 stored below
+ add x29,sp,#0 // x29 = frame base; the epilogues reload saved registers relative to it
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64 // 64-byte scratch area; written only by the partial-block tail (.Less_than_64)
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter; x30 is used as a data register, its saved copy is reloaded at exit
+#ifdef __ARMEB__
+ ror x24,x24,#32 // big-endian build: swap the two 32-bit halves of each key/counter register
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+.Loop_outer: // per-block loop: x22-x28,x30 hold the packed input state, w5-w17,w19-w21 the 16 working words (x18 is not used)
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32 // odd-numbered words come from the high halves
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10 // 10 iterations of .Loop, each a column round plus a diagonal round
+ subs x2,x2,#64 // flags set here survive the rounds (no flag-setting instruction follows) and feed b.lo/b.hi below
+.Loop:
+ sub x4,x4,#1
+ add w5,w5,w9 // column round, a += b, on (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21)
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5 // d ^= a
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16 // d = rotl(d,16)
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17 // c += d
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13 // b ^= c
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20 // b = rotl(b,12)
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9 // a += b
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5 // d ^= a
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24 // d = rotl(d,8)
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17 // c += d
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13 // b ^= c
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25 // b = rotl(b,7)
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10 // diagonal round, same steps on (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20)
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5 // d ^= a
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16 // d = rotl(d,16)
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21 // c += d
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15 // b ^= c
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20 // b = rotl(b,12)
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10 // a += b
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5 // d ^= a
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24 // d = rotl(d,8)
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21 // c += d
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15 // b ^= c
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25 // b = rotl(b,7)
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,.Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32 // 64-bit add: any carry above bit 31 is shifted out by the lsl#32 when packing
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo .Ltail // subs above borrowed: fewer than 64 bytes were left, finish byte by byte
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5 // big-endian build: byte-reverse each packed keystream doubleword before the xor
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6 // output = keystream ^ input
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi .Loop_outer // flags still from the subs: input remains after this block
+
+ ldp x19,x20,[x29,#16] // restore callee-saved registers from the frame
+ add sp,sp,#64 // release the scratch area
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.inst 0xd50323bf // autiasp
+.Labort: // also the zero-length exit, reached before paciasp ran, hence placed after autiasp
+ ret
+
+.align 4
+.Ltail: // scalar path: final partial block
+ add x2,x2,#64 // undo the subs: x2 = bytes remaining (less than 64, non-zero)
+.Less_than_64: // also entered from .Ltail_neon with x2 < 64 and the scalar keystream still unpacked in x5-x21
+ sub x0,x0,#1 // bias output by -1: the loop below increments x2 before its strb
+ add x1,x1,x2 // x1 = end of input
+ add x0,x0,x2
+ add x4,sp,x2 // x4 = end of the used part of the keystream copy on the stack
+ neg x2,x2 // x2 = negative index that counts up to zero
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0] // spill the 64-byte keystream block to the scratch area
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+.Loop_tail: // byte-wise output = input ^ keystream
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail
+
+ stp xzr,xzr,[sp,#0] // zero the keystream copy before releasing the stack
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16] // restore callee-saved registers from the frame
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.inst 0xd50323bf // autiasp
+ ret
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+
+.type ChaCha20_neon,%function // reached by branch from ChaCha20_ctr32 with the same x0-x4 arguments
+.align 5
+ChaCha20_neon:
+.inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-96]! // same 96-byte frame layout as the scalar path
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs .L512_or_more_neon // 512 bytes or more: continue in ChaCha20_512_neon past its prologue, reusing this frame
+
+ sub sp,sp,#64 // 64-byte scratch area for the partial-block tail
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16 // v24 = sigma as a vector; x5 now points at .Lone
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3] // v25,v26 = key as vectors
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4] // v27 = counter row as a vector
+ ld1 {v31.4s},[x5] // v31 = {1,0,0,0}
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s // v28 = counter + 2
+ add v29.4s,v28.4s,v31.4s // v29 = counter + 3; the scalar registers keep the unincremented counter
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+.Loop_outer_neon: // four blocks per iteration: one in scalar registers, three in NEON (v0-v3, v4-v7, v16-v19)
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b // rows of the three vector blocks: sigma, key, key, per-block counter row
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10 // 10 double rounds
+ subs x2,x2,#256 // flags set here survive the rounds and feed b.lo/b.hi after the loop
+.Loop_neon: // vector and scalar instructions are interleaved; each stream performs the same round steps
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s // vector column round: a += b (a=v0/v4/v16, b=v1/v5/v17, c=v2/v6/v18, d=v3/v7/v19)
+ add w5,w5,w9 // scalar column round: a += b
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b // d ^= a
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h // d = rotl(d,16): swap the halfwords of every 32-bit lane
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s // c += d
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b // t = b ^ c (v20-v22 are temporaries)
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20 // b = rotl(t,12), built as ushr #20 then sli #12
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s // a += b
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b // t = d ^ a
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24 // d = rotl(t,8)
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s // c += d
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b // t = b ^ c
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25 // b = rotl(t,7)
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8 // rotate rows c, d, b by 8/12/4 bytes so the diagonals line up as columns
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s // vector diagonal round (same steps on the rotated rows)
+ add w5,w5,w10 // scalar diagonal round on (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20)
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8 // rotate the rows back (8/4/12 bytes) to column order
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,.Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s // vector blocks: add the input rows back (sigma, key, per-block counter row)
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo .Ltail_neon // subs above borrowed: fewer than 256 bytes were left
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // input for the second 64 bytes (v20-v23 now serve as input buffers)
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // input for the third 64 bytes
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // input for the fourth 64 bytes, loaded into the now-free v0-v3
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi .Loop_outer_neon // flags still from the subs: input remains
+
+ ldp x19,x20,[x29,#16] // restore callee-saved registers from the frame
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.inst 0xd50323bf // autiasp
+ ret
+
+.Ltail_neon: // fewer than 256 bytes left; keystream for up to four blocks is already computed
+ add x2,x2,#256 // undo the subs: x2 = bytes remaining (less than 256, non-zero)
+ cmp x2,#64
+ b.lo .Less_than_64 // under one block: reuse the scalar tail in ChaCha20_ctr32 (same frame and scratch layout)
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq .Ldone_neon // flags from cmp x2,#64 above: exactly one block was left
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_128 // partial second block: its keystream is v0-v3
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq .Ldone_neon // flags from the second cmp: exactly two blocks were left
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_192 // partial third block: its keystream is v4-v7
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq .Ldone_neon // flags from the third cmp: exactly three blocks were left
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] // partial fourth block: spill its keystream to the scratch area
+ b .Last_neon
+
+.Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] // spill second-block keystream for the byte-wise tail
+ b .Last_neon
+.Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] // spill third-block keystream for the byte-wise tail
+ b .Last_neon
+
+.align 4
+.Last_neon: // byte-wise tail: x2 = bytes left (under 64), keystream block stored at [sp]
+ sub x0,x0,#1 // bias output by -1: the loop increments x2 before its strb
+ add x1,x1,x2 // x1 = end of input
+ add x0,x0,x2
+ add x4,sp,x2 // x4 = end of the used part of the keystream copy
+ neg x2,x2 // x2 = negative index that counts up to zero
+
+.Loop_tail_neon: // output byte = input byte ^ keystream byte
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0] // zero the keystream copy before releasing the stack
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon: // common exit for the tail paths
+ ldp x19,x20,[x29,#16] // restore callee-saved registers from the frame
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.inst 0xd50323bf // autiasp
+ ret
+.size ChaCha20_neon,.-ChaCha20_neon
+.type ChaCha20_512_neon,%function
+.align 5
+ChaCha20_512_neon:
+.inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+.Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+.Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+.Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs .Loop_outer_512_neon
+
+ adds x2,x2,#512 // x2 += 512 and set flags; Z is consumed by b.eq below (presumably undoes an earlier subtraction of 512 from the remaining length -- confirm against the loop head)
+ ushr v0.4s,v31.4s,#2 // 4 -> 1 (v0 = v31 >> 2; used by the "-= 1" adjustments below)
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements: reload d8-d15 (low halves of v8-v15, callee-saved under AAPCS64) from the 64-byte area at sp+128
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area: overwrite the 96 bytes at sp+0..sp+95 that the loop reloads q24-q29 from
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq .Ldone_512_neon // Z from adds above (ushr/ldp/stp leave flags alone): x2 == 0, take the full epilogue
+
+ cmp x2,#192 // sets flags for b.hs below; the NEON sub and the non-flag-setting add in between preserve them
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128 // drop the 128 bytes below the d8-d15 slots; the 64-byte d8-d15 area stays allocated (presumably released by the path branched to -- confirm)
+ b.hs .Loop_outer_neon // x2 >= 192 (unsigned): continue at .Loop_outer_neon (label defined elsewhere in the file)
+
+ eor v25.16b,v25.16b,v25.16b // x2 < 192: zero v25-v30 before leaving the NEON code (presumably to scrub key/counter material -- confirm)
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b .Loop_outer // continue at .Loop_outer (label defined elsewhere in the file)
+
+.Ldone_512_neon: // exit taken by the b.eq above when x2 reached 0: nothing left to process
+ ldp x19,x20,[x29,#16] // reload x19-x28 (callee-saved under AAPCS64) from the frame addressed by x29; presumably stored there by the function prologue
+ add sp,sp,#128+64 // release the 128-byte area (off-load area at sp+0..95 lives here) plus the 64-byte d8-d15 save area
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96 // restore frame pointer and link register, then pop 96 bytes (post-indexed)
+.inst 0xd50323bf // autiasp (given as a raw encoding; authenticates x30 before the return)
+ ret
+.size ChaCha20_512_neon,.-ChaCha20_512_neon // symbol size = distance from ChaCha20_512_neon to this point