1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
|
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095
// The BC opcode is hard to read, and no extended mnemonics are
// offered for these forms, so define readable aliases for them here.
#define BGELR_CR6 BC 4, CR6LT, (LR)
#define BEQLR BC 12, CR0EQ, (LR)
// memequal(a, b unsafe.Pointer, size uintptr) bool
//
// ABIInternal entry point: arguments arrive in registers in exactly
// the layout memeqbody expects, so this is a bare tail-branch into
// the shared comparison body, which returns its bool result in R3.
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
// R3 = a
// R4 = b
// R5 = size
BR memeqbody<>(SB)
// memequal_varlen(a, b unsafe.Pointer) bool
//
// Closure-call variant: the size is not an argument; the compiler
// stores it in the closure context (R11 is the context register under
// ABIInternal — see the load at offset 8 below).
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
// R3 = a
// R4 = b
CMP R3, R4
BEQ eq // identical pointers are trivially equal; skip loading size
MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure
BR memeqbody<>(SB)
eq:
MOVD $1, R3 // return true
RET
// Do an efficient memequal for ppc64
// R3 = s1
// R4 = s2
// R5 = len
// On exit:
// R3 = return value (1 if the len bytes at s1 and s2 are equal, else 0)
//
// Strategy: dispatch on len. len <= 16 uses GPR (or Power10 lxvl)
// compares, 17..64 uses 16-byte VSX compares with deliberate overlap
// to avoid byte loops, and longer lengths use a 64-byte-per-iteration
// vector loop followed by one overlapping 64-byte compare of the tail.
// Overlapping compares re-check some bytes, which is harmless.
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3, R8 // Move s1 into R8
ADD R5, R3, R9 // &s1[len(s1)]
ADD R5, R4, R10 // &s2[len(s2)]
MOVD $1, R11 // R11 = 1, the "equal" result selected by the ISELs below
CMP R5, $16 // Use GPR checks for len <= 16
BLE check0_16
MOVD $0, R3 // Assume no-match in case BGELR CR6 returns
CMP R5, $32 // Use overlapping VSX loads for len <= 32
BLE check17_32 // Do a pair of overlapping VSR compares
CMP R5, $64
BLE check33_64 // Hybrid check + overlap compare.
setup64:
SRD $6, R5, R6 // number of 64 byte chunks to compare
MOVD R6, CTR
MOVD $16, R14 // index for VSX loads and stores
MOVD $32, R15
MOVD $48, R16
ANDCC $0x3F, R5, R5 // len%64==0? (CR0 result is consumed after the loop)
PCALIGN $16
// Four 16-byte VSX compares per iteration. VCMPEQUBCC sets CR6;
// BGELR_CR6 returns immediately (R3 is already 0, i.e. "not equal")
// as soon as any 16-byte chunk differs.
loop64:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2 // compare, setting CR6
BGELR_CR6
LXVD2X (R8+R14), V0
LXVD2X (R4+R14), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R15), V0
LXVD2X (R4+R15), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R16), V0
LXVD2X (R4+R16), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
ADD $64,R8 // bump up to next 64
ADD $64,R4
BDNZ loop64
// CR0 still holds the ANDCC len%64 result: EQ means no tail remains.
ISEL CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
BEQLR // return if no tail.
// Compare the last 64 bytes by pointing at &sX[len-64]; this overlaps
// bytes already verified by the loop, which is safe.
ADD $-64, R9, R8
ADD $-64, R10, R4
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R14), V0
LXVD2X (R4+R14), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R15), V0
LXVD2X (R4+R15), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R16), V0
LXVD2X (R4+R16), V1
VCMPEQUBCC V0, V1, V2
ISEL CR6LT, R11, R0, R3 // all equal (CR6 LT) ? 1 : 0 (R0 reads as zero)
RET
check33_64:
// Bytes 0-15
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
ADD $16, R8
ADD $16, R4
// Bytes 16-31
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
// A little tricky, but point R4,R8 to &sx[len-32],
// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
ADD $-32, R9, R8
ADD $-32, R10, R4
// Fallthrough
check17_32:
// Compare bytes [0,16) and [len-16,len); the two ranges overlap when
// len < 32, and overlapping bytes are simply compared twice.
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL CR6LT, R11, R0, R5 // R5 = (first 16 bytes equal) ? 1 : 0
// Load sX[len(sX)-16:len(sX)] and compare.
ADD $-16, R9
ADD $-16, R10
LXVD2X (R9+R0), V0
LXVD2X (R10+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL CR6LT, R5, R0, R3 // result = both compares equal
RET
check0_16:
#ifdef GOPPC64_power10
// Power10 variable-length loads: lxvl takes the byte count in the
// top byte of the length register (hence the SLD $56), and loads
// exactly len bytes, zero-filling the rest of the vector — so one
// pair of loads handles any 0..16 length without over-reading.
SLD $56, R5, R7
LXVL R8, R7, V0
LXVL R4, R7, V1
VCMPEQUDCC V0, V1, V2
ISEL CR6LT, R11, R0, R3
RET
#else
CMP R5, $8
BLT check0_7
// len is 8..16: two overlapping 8-byte GPR compares cover it.
// Load sX[0:7] and compare.
MOVD (R8), R6
MOVD (R4), R7
CMP R6, R7
ISEL CR0EQ, R11, R0, R5 // R5 = (first 8 bytes equal) ? 1 : 0
// Load sX[len(sX)-8:len(sX)] and compare.
MOVD -8(R9), R6
MOVD -8(R10), R7
CMP R6, R7
ISEL CR0EQ, R5, R0, R3 // result = both compares equal
RET
check0_7:
CMP R5,$0
MOVD $1, R3
BEQLR // return if len == 0
// Check < 8B loads with a single compare, but select the load address
// such that it cannot cross a page boundary. Load a few bytes from the
// lower address if that does not cross the lower page. Or, load a few
// extra bytes from the higher addresses. And align those values
// consistently in register as either address may have differing
// alignment requirements.
ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET
ANDCC $PAGE_OFFSET, R4, R9
SUBC R5, $8, R12 // 8-len
SLD $3, R12, R14 // (8-len)*8
CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower?
CMPU R9, R12, CR0
SUB R12, R8, R6 // compute lower load address
SUB R12, R4, R9
ISEL CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
ISEL CR0LT, R4, R9, R4 // Similar for s2
MOVD (R8), R15 // load 8 bytes; len of them are the bytes to compare
MOVD (R4), R16
SLD R14, R15, R7
SLD R14, R16, R17
SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts)
SRD R14, R17, R17
SRD R14, R15, R6 // Clear the lower (8-len) bytes
SRD R14, R16, R9
#ifdef GOARCH_ppc64le
// Little endian: the wanted bytes sit at the low end when loaded from
// the exact address, high end when loaded from the lower address.
ISEL CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
ISEL CR0LT, R17, R9, R4
#else
ISEL CR1LT, R6, R7, R8
ISEL CR0LT, R9, R17, R4
#endif
CMP R4, R8
ISEL CR0EQ, R11, R0, R3
RET
#endif // tail processing if !defined(GOPPC64_power10)
|