diff options
Diffstat (limited to 'contrib/go/_std_1.25/src/runtime/memmove_arm64.s')
| -rw-r--r-- | contrib/go/_std_1.25/src/runtime/memmove_arm64.s | 238 |
1 files changed, 0 insertions, 238 deletions
diff --git a/contrib/go/_std_1.25/src/runtime/memmove_arm64.s b/contrib/go/_std_1.25/src/runtime/memmove_arm64.s deleted file mode 100644 index 8ec3ed86b9b..00000000000 --- a/contrib/go/_std_1.25/src/runtime/memmove_arm64.s +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// See memmove Go doc for important implementation constraints. - -// Register map -// -// dstin R0 -// src R1 -// count R2 -// dst R3 (same as R0, but gets modified in unaligned cases) -// srcend R4 -// dstend R5 -// data R6-R17 -// tmp1 R14 - -// Copies are split into 3 main cases: small copies of up to 32 bytes, medium -// copies of up to 128 bytes, and large copies. The overhead of the overlap -// check is negligible since it is only required for large copies. -// -// Large copies use a software pipelined loop processing 64 bytes per iteration. -// The destination pointer is 16-byte aligned to minimize unaligned accesses. -// The loop tail is handled by always copying 64 bytes from the end. - -// func memmove(to, from unsafe.Pointer, n uintptr) -TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24 - CBZ R2, copy0 - - // Small copies: 1..16 bytes - CMP $16, R2 - BLE copy16 - - // Large copies - CMP $128, R2 - BHI copy_long - CMP $32, R2 - BHI copy32_128 - - // Small copies: 17..32 bytes. - LDP (R1), (R6, R7) - ADD R1, R2, R4 // R4 points just past the last source byte - LDP -16(R4), (R12, R13) - STP (R6, R7), (R0) - ADD R0, R2, R5 // R5 points just past the last destination byte - STP (R12, R13), -16(R5) - RET - -// Small copies: 1..16 bytes. -copy16: - ADD R1, R2, R4 // R4 points just past the last source byte - ADD R0, R2, R5 // R5 points just past the last destination byte - CMP $8, R2 - BLT copy7 - MOVD (R1), R6 - MOVD -8(R4), R7 - MOVD R6, (R0) - MOVD R7, -8(R5) - RET - -copy7: - TBZ $2, R2, copy3 - MOVWU (R1), R6 - MOVWU -4(R4), R7 - MOVW R6, (R0) - MOVW R7, -4(R5) - RET - -copy3: - TBZ $1, R2, copy1 - MOVHU (R1), R6 - MOVHU -2(R4), R7 - MOVH R6, (R0) - MOVH R7, -2(R5) - RET - -copy1: - MOVBU (R1), R6 - MOVB R6, (R0) - -copy0: - RET - - // Medium copies: 33..128 bytes. -copy32_128: - ADD R1, R2, R4 // R4 points just past the last source byte - ADD R0, R2, R5 // R5 points just past the last destination byte - LDP (R1), (R6, R7) - LDP 16(R1), (R8, R9) - LDP -32(R4), (R10, R11) - LDP -16(R4), (R12, R13) - CMP $64, R2 - BHI copy128 - STP (R6, R7), (R0) - STP (R8, R9), 16(R0) - STP (R10, R11), -32(R5) - STP (R12, R13), -16(R5) - RET - - // Copy 65..128 bytes. -copy128: - LDP 32(R1), (R14, R15) - LDP 48(R1), (R16, R17) - CMP $96, R2 - BLS copy96 - LDP -64(R4), (R2, R3) - LDP -48(R4), (R1, R4) - STP (R2, R3), -64(R5) - STP (R1, R4), -48(R5) - -copy96: - STP (R6, R7), (R0) - STP (R8, R9), 16(R0) - STP (R14, R15), 32(R0) - STP (R16, R17), 48(R0) - STP (R10, R11), -32(R5) - STP (R12, R13), -16(R5) - RET - - // Copy more than 128 bytes. -copy_long: - ADD R1, R2, R4 // R4 points just past the last source byte - ADD R0, R2, R5 // R5 points just past the last destination byte - MOVD ZR, R7 - MOVD ZR, R8 - - CMP $1024, R2 - BLT backward_check - // feature detect to decide how to align - MOVBU runtime·arm64UseAlignedLoads(SB), R6 - CBNZ R6, use_aligned_loads - MOVD R0, R7 - MOVD R5, R8 - B backward_check -use_aligned_loads: - MOVD R1, R7 - MOVD R4, R8 - // R7 and R8 are used here for the realignment calculation. In - // the use_aligned_loads case, R7 is the src pointer and R8 is - // srcend pointer, which is used in the backward copy case. - // When doing aligned stores, R7 is the dst pointer and R8 is - // the dstend pointer. - -backward_check: - // Use backward copy if there is an overlap. - SUB R1, R0, R14 - CBZ R14, copy0 - CMP R2, R14 - BCC copy_long_backward - - // Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment. - LDP (R1), (R12, R13) // Load A - AND $15, R7, R14 // Calculate the realignment offset - SUB R14, R1, R1 - SUB R14, R0, R3 // move dst back same amount as src - ADD R14, R2, R2 - LDP 16(R1), (R6, R7) // Load B - STP (R12, R13), (R0) // Store A - LDP 32(R1), (R8, R9) // Load C - LDP 48(R1), (R10, R11) // Load D - LDP.W 64(R1), (R12, R13) // Load E - // 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end - SUBS $144, R2, R2 - BLS copy64_from_end - -loop64: - STP (R6, R7), 16(R3) // Store B - LDP 16(R1), (R6, R7) // Load B (next iteration) - STP (R8, R9), 32(R3) // Store C - LDP 32(R1), (R8, R9) // Load C - STP (R10, R11), 48(R3) // Store D - LDP 48(R1), (R10, R11) // Load D - STP.W (R12, R13), 64(R3) // Store E - LDP.W 64(R1), (R12, R13) // Load E - SUBS $64, R2, R2 - BHI loop64 - - // Write the last iteration and copy 64 bytes from the end. -copy64_from_end: - LDP -64(R4), (R14, R15) // Load F - STP (R6, R7), 16(R3) // Store B - LDP -48(R4), (R6, R7) // Load G - STP (R8, R9), 32(R3) // Store C - LDP -32(R4), (R8, R9) // Load H - STP (R10, R11), 48(R3) // Store D - LDP -16(R4), (R10, R11) // Load I - STP (R12, R13), 64(R3) // Store E - STP (R14, R15), -64(R5) // Store F - STP (R6, R7), -48(R5) // Store G - STP (R8, R9), -32(R5) // Store H - STP (R10, R11), -16(R5) // Store I - RET - - // Large backward copy for overlapping copies. - // Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment. -copy_long_backward: - LDP -16(R4), (R12, R13) - AND $15, R8, R14 - SUB R14, R4, R4 - SUB R14, R2, R2 - LDP -16(R4), (R6, R7) - STP (R12, R13), -16(R5) - LDP -32(R4), (R8, R9) - LDP -48(R4), (R10, R11) - LDP.W -64(R4), (R12, R13) - SUB R14, R5, R5 - SUBS $128, R2, R2 - BLS copy64_from_start - -loop64_backward: - STP (R6, R7), -16(R5) - LDP -16(R4), (R6, R7) - STP (R8, R9), -32(R5) - LDP -32(R4), (R8, R9) - STP (R10, R11), -48(R5) - LDP -48(R4), (R10, R11) - STP.W (R12, R13), -64(R5) - LDP.W -64(R4), (R12, R13) - SUBS $64, R2, R2 - BHI loop64_backward - - // Write the last iteration and copy 64 bytes from the start. -copy64_from_start: - LDP 48(R1), (R2, R3) - STP (R6, R7), -16(R5) - LDP 32(R1), (R6, R7) - STP (R8, R9), -32(R5) - LDP 16(R1), (R8, R9) - STP (R10, R11), -48(R5) - LDP (R1), (R10, R11) - STP (R12, R13), -64(R5) - STP (R2, R3), 48(R0) - STP (R6, R7), 32(R0) - STP (R8, R9), 16(R0) - STP (R10, R11), (R0) - RET |
