summaryrefslogtreecommitdiffstats
path: root/contrib/go/_std_1.25/src/runtime/memmove_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/go/_std_1.25/src/runtime/memmove_arm64.s')
-rw-r--r--contrib/go/_std_1.25/src/runtime/memmove_arm64.s238
1 files changed, 0 insertions, 238 deletions
diff --git a/contrib/go/_std_1.25/src/runtime/memmove_arm64.s b/contrib/go/_std_1.25/src/runtime/memmove_arm64.s
deleted file mode 100644
index 8ec3ed86b9b..00000000000
--- a/contrib/go/_std_1.25/src/runtime/memmove_arm64.s
+++ /dev/null
@@ -1,238 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// See memmove Go doc for important implementation constraints.
-
-// Register map
-//
-// dstin R0
-// src R1
-// count R2
-// dst R3 (same as R0, but gets modified in unaligned cases)
-// srcend R4
-// dstend R5
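-// data R6-R17 (R1-R4 are also reused as data once their pointer/count values are dead)
-// tmp1 R14 (shared with the data registers)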
-
-// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
-// copies of up to 128 bytes, and large copies. The overhead of the overlap
-// check is negligible since it is only required for large copies.
-//
-// Large copies use a software pipelined loop processing 64 bytes per iteration. For copies
-// of 1024 bytes or more, the source or destination pointer (chosen by arm64UseAlignedLoads)
-// is 16-byte aligned to minimize unaligned accesses. The loop tail is handled by always copying 64 bytes from the end.
-
-// func memmove(to, from unsafe.Pointer, n uintptr)
-TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24 // copy R2 bytes from (R1) to (R0); overlapping ranges are handled
- CBZ R2, copy0 // count == 0: nothing to copy
-
- // Small copies: 1..16 bytes
- CMP $16, R2
- BLE copy16 // count <= 16
-
- // Large copies
- CMP $128, R2
- BHI copy_long // count > 128
- CMP $32, R2
- BHI copy32_128 // 32 < count <= 128
-
- // Small copies: 17..32 bytes. The first and last 16 bytes are both loaded before any store; the two halves may overlap.
- LDP (R1), (R6, R7) // first 16 source bytes
- ADD R1, R2, R4 // R4 points just past the last source byte
- LDP -16(R4), (R12, R13) // last 16 source bytes
- STP (R6, R7), (R0)
- ADD R0, R2, R5 // R5 points just past the last destination byte
- STP (R12, R13), -16(R5)
- RET
-
-// Small copies: 1..16 bytes. Each size class loads a head and a tail chunk, then stores both.
-copy16:
- ADD R1, R2, R4 // R4 points just past the last source byte
- ADD R0, R2, R5 // R5 points just past the last destination byte
- CMP $8, R2
- BLT copy7 // count < 8
- MOVD (R1), R6 // 8..16 bytes: first 8 source bytes
- MOVD -8(R4), R7 // last 8 source bytes (overlaps the first 8 when count < 16)
- MOVD R6, (R0)
- MOVD R7, -8(R5)
- RET
-
-copy7:
- TBZ $2, R2, copy3 // bit 2 of count clear: count < 4
- MOVWU (R1), R6 // 4..7 bytes: first 4 source bytes
- MOVWU -4(R4), R7 // last 4 source bytes
- MOVW R6, (R0)
- MOVW R7, -4(R5)
- RET
-
-copy3:
- TBZ $1, R2, copy1 // bit 1 of count clear: exactly 1 byte
- MOVHU (R1), R6 // 2..3 bytes: first 2 source bytes
- MOVHU -2(R4), R7 // last 2 source bytes
- MOVH R6, (R0)
- MOVH R7, -2(R5)
- RET
-
-copy1:
- MOVBU (R1), R6 // single byte
- MOVB R6, (R0)
-
-copy0:
- RET // also the exit for count == 0 and for dst == src in the large-copy path
-
- // Medium copies: 33..128 bytes.
-copy32_128:
- ADD R1, R2, R4 // R4 points just past the last source byte
- ADD R0, R2, R5 // R5 points just past the last destination byte
- LDP (R1), (R6, R7) // first 32 source bytes -> R6..R9
- LDP 16(R1), (R8, R9)
- LDP -32(R4), (R10, R11) // last 32 source bytes -> R10..R13
- LDP -16(R4), (R12, R13)
- CMP $64, R2
- BHI copy128 // count > 64
- STP (R6, R7), (R0) // 33..64 bytes: head and tail stores may overlap
- STP (R8, R9), 16(R0)
- STP (R10, R11), -32(R5)
- STP (R12, R13), -16(R5)
- RET
-
- // Copy 65..128 bytes.
-copy128:
- LDP 32(R1), (R14, R15) // source bytes 32..63 -> R14..R17
- LDP 48(R1), (R16, R17)
- CMP $96, R2
- BLS copy96 // count <= 96: 64 leading + 32 trailing bytes already cover it
- LDP -64(R4), (R2, R3) // 97..128 bytes: load 32 more trailing bytes; count and R3 become data
- LDP -48(R4), (R1, R4) // src and srcend are overwritten here; only R0 and R5 are used afterwards
- STP (R2, R3), -64(R5)
- STP (R1, R4), -48(R5)
-
-copy96:
- STP (R6, R7), (R0) // all loads are complete before the remaining stores
- STP (R8, R9), 16(R0)
- STP (R14, R15), 32(R0)
- STP (R16, R17), 48(R0)
- STP (R10, R11), -32(R5)
- STP (R12, R13), -16(R5)
- RET
-
- // Copy more than 128 bytes.
-copy_long:
- ADD R1, R2, R4 // R4 points just past the last source byte
- ADD R0, R2, R5 // R5 points just past the last destination byte
- MOVD ZR, R7 // R7/R8 default to 0, which yields a zero realignment offset below
- MOVD ZR, R8
-
- CMP $1024, R2
- BLT backward_check // count < 1024: skip realignment
- // feature detect to decide how to align
- MOVBU runtime·arm64UseAlignedLoads(SB), R6
- CBNZ R6, use_aligned_loads
- MOVD R0, R7 // flag clear: realign relative to dst / dstend
- MOVD R5, R8
- B backward_check
-use_aligned_loads:
- MOVD R1, R7 // flag set: realign relative to src / srcend
- MOVD R4, R8
- // R7 and R8 are used here for the realignment calculation. In
- // the use_aligned_loads case, R7 is the src pointer and R8 is
- // srcend pointer, which is used in the backward copy case.
- // When doing aligned stores, R7 is the dst pointer and R8 is
- // the dstend pointer.
-
-backward_check:
- // Use backward copy if there is an overlap.
- SUB R1, R0, R14 // R14 = dst - src
- CBZ R14, copy0 // dst == src: nothing to do
- CMP R2, R14
- BCC copy_long_backward // (dst - src) < count, unsigned: dst starts inside the source range
-
- // Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
- LDP (R1), (R12, R13) // Load A
- AND $15, R7, R14 // Calculate the realignment offset
- SUB R14, R1, R1 // move src back by the offset
- SUB R14, R0, R3 // move dst back same amount as src
- ADD R14, R2, R2 // count grows by the same offset
- LDP 16(R1), (R6, R7) // Load B
- STP (R12, R13), (R0) // Store A
- LDP 32(R1), (R8, R9) // Load C
- LDP 48(R1), (R10, R11) // Load D
- LDP.W 64(R1), (R12, R13) // Load E; pre-indexed, R1 += 64
- // 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
- SUBS $144, R2, R2
- BLS copy64_from_end
-
-loop64:
- STP (R6, R7), 16(R3) // Store B
- LDP 16(R1), (R6, R7) // Load B (next iteration)
- STP (R8, R9), 32(R3) // Store C
- LDP 32(R1), (R8, R9) // Load C
- STP (R10, R11), 48(R3) // Store D
- LDP 48(R1), (R10, R11) // Load D
- STP.W (R12, R13), 64(R3) // Store E; pre-indexed, R3 += 64
- LDP.W 64(R1), (R12, R13) // Load E; pre-indexed, R1 += 64
- SUBS $64, R2, R2
- BHI loop64 // loop while R2 was above 64 before the subtract (unsigned)
-
- // Write the last iteration and copy 64 bytes from the end.
-copy64_from_end:
- LDP -64(R4), (R14, R15) // Load F
- STP (R6, R7), 16(R3) // Store B
- LDP -48(R4), (R6, R7) // Load G
- STP (R8, R9), 32(R3) // Store C
- LDP -32(R4), (R8, R9) // Load H
- STP (R10, R11), 48(R3) // Store D
- LDP -16(R4), (R10, R11) // Load I
- STP (R12, R13), 64(R3) // Store E
- STP (R14, R15), -64(R5) // Store F
- STP (R6, R7), -48(R5) // Store G
- STP (R8, R9), -32(R5) // Store H
- STP (R10, R11), -16(R5) // Store I
- RET
-
- // Large backward copy for overlapping copies.
- // Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
-copy_long_backward:
- LDP -16(R4), (R12, R13) // last 16 source bytes
- AND $15, R8, R14 // realignment offset taken from R8 (0 when R8 is 0)
- SUB R14, R4, R4 // move srcend back by the offset
- SUB R14, R2, R2 // count shrinks by the same offset
- LDP -16(R4), (R6, R7)
- STP (R12, R13), -16(R5) // store the last 16 bytes at the original dstend
- LDP -32(R4), (R8, R9)
- LDP -48(R4), (R10, R11)
- LDP.W -64(R4), (R12, R13) // pre-indexed, R4 -= 64
- SUB R14, R5, R5 // move dstend back by the same offset
- SUBS $128, R2, R2
- BLS copy64_from_start // adjusted count <= 128: finish with the first 64 bytes
-
-loop64_backward:
- STP (R6, R7), -16(R5)
- LDP -16(R4), (R6, R7)
- STP (R8, R9), -32(R5)
- LDP -32(R4), (R8, R9)
- STP (R10, R11), -48(R5)
- LDP -48(R4), (R10, R11)
- STP.W (R12, R13), -64(R5) // pre-indexed, R5 -= 64
- LDP.W -64(R4), (R12, R13) // pre-indexed, R4 -= 64
- SUBS $64, R2, R2
- BHI loop64_backward // loop while R2 was above 64 before the subtract (unsigned)
-
- // Write the last iteration and copy 64 bytes from the start.
-copy64_from_start:
- LDP 48(R1), (R2, R3) // R1 is still the original src on this path; R2/R3 are reused as data
- STP (R6, R7), -16(R5)
- LDP 32(R1), (R6, R7)
- STP (R8, R9), -32(R5)
- LDP 16(R1), (R8, R9)
- STP (R10, R11), -48(R5)
- LDP (R1), (R10, R11)
- STP (R12, R13), -64(R5)
- STP (R2, R3), 48(R0) // first 64 destination bytes, written last
- STP (R6, R7), 32(R0)
- STP (R8, R9), 16(R0)
- STP (R10, R11), (R0)
- RET