1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
|
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
// IndexByte is the ABIInternal entry point for searching a byte slice.
// It only moves the byte to search for into R5, where indexbytebody expects
// it, and then branches (not calls) to indexbytebody, so the RET executed
// there returns straight to IndexByte's caller with the result in R3.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// R3 = byte array pointer
// R4 = length
// R6 = byte to find
// NOTE(review): R5 presumably arrives holding the slice capacity, which is
// not needed here and is overwritten below; confirm against the Go internal ABI.
MOVD R6, R5 // R5 = byte
BR indexbytebody<>(SB)
// IndexByteString is the ABIInternal entry point for searching a string.
// The arguments already sit in the registers indexbytebody expects, so this
// is a bare branch (not a call): the RET in indexbytebody returns directly to
// IndexByteString's caller with the result in R3.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
// R3 = string
// R4 = length
// R5 = byte
BR indexbytebody<>(SB)
// indexbytevbperm is a 16-byte read-only constant that indexbytebody loads
// into V0 and passes to VBPERMQ. It is only assembled when GOPPC64_power9 is
// not defined; power9 and newer targets use _VCZBEBB instead.
//
// The 16 bytes are the values 0x00, 0x08, 0x10, ..., 0x78 (one multiple of 8
// per byte). The two GOARCH variants hold the same values with the bytes of
// each doubleword written in opposite order.
// NOTE(review): per the Power ISA, VBPERMQ treats each of these bytes as a bit
// index into its source vector, so this table would gather one bit from each
// byte of the VCMPEQUB result into a 16-bit mask; confirm against the ISA.
#ifndef GOPPC64_power9
#ifdef GOARCH_ppc64le
DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
#else
DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
#endif
GLOBL indexbytevbperm<>+0(SB), RODATA, $16
#endif
// Some operations are endian specific; choose the correct opcode based on GOARCH.
//
// _LDBEX, _LWBEX and _LHBEX are the doubleword, word and halfword loads used
// by the scalar (shorter than 16 bytes) paths of indexbytebody. Those paths
// locate the first match by counting leading zeros, so they use the
// byte-reversed load forms on ppc64le and the plain loads on ppc64.
//
// _VCZBEBB is the byte-count instruction used at vfound to turn a vector
// compare result into the index of the first matching byte.
// Note, _VCZBEBB is only available on power9 and newer.
#ifdef GOARCH_ppc64le
#define _LDBEX MOVDBR
#define _LWBEX MOVWBR
#define _LHBEX MOVHBR
#define _VCZBEBB VCTZLSBB
#else
#define _LDBEX MOVD
#define _LWBEX MOVW
#define _LHBEX MOVH
#define _VCZBEBB VCLZLSBB
#endif
// indexbytebody is the shared implementation that IndexByte and
// IndexByteString branch to. It returns the index of the first byte equal to
// R5 in the R4 bytes starting at R3, or -1 if no byte matches.
//
// On entry:
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// On exit:
// R3 = return value
//
// Dispatch by length:
//   len >= 64       loop64 scans 64 bytes per iteration, then the len&63 tail
//   32 <= len < 64  cmp32
//   16 <= len < 32  cmp16
//   len < 16        cmp8 (one LXVLL on power10, otherwise scalar cmp8/cmp4/cmp2/cmp1)
//
// A trailing partial chunk is handled by one more load that ends exactly at
// &s[len] and overlaps bytes already checked. The overlap is harmless: a match
// in the earlier bytes has already caused a jump out.
//
// Registers written here: R3-R6, R8-R12, R31, V1, V2, V6, CR0, CR1, CR6,
// plus R16 and V0 when GOPPC64_power9 is not defined.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
CMPU R4,$32 // CR0 = len cmp 32; consumed by BLT cmp16 below and by the BEQ in cmp32.
#ifndef GOPPC64_power9
// Load VBPERMQ constant to reduce compare into an ordered bit mask.
MOVD $indexbytevbperm<>+00(SB),R16
LXVD2X (R16),V0 // Set up swap string
#endif
MTVRD R5,V1 // Move the search byte into a vector register.
VSPLTB $7,V1,V1 // Replicate byte across V1
BLT cmp16 // Jump to the small string case if it's <32 bytes.
CMP R4,$64,CR1
MOVD $16,R11 // R11 = 16, index register for the second 16B chunk (also used by cmp32).
MOVD R3,R8 // R8 = scan pointer; R3 keeps &s[0] for the final index computation.
BLT CR1,cmp32 // Special case for length 32 - 63
MOVD $32,R12 // R12 = 32, index register for the third 16B chunk.
MOVD $48,R6 // R6 = 48, index register for the fourth 16B chunk.
RLDICR $0,R4,$63-6,R9 // R9 = len &^ 63
ADD R3,R9,R9 // R9 = &s[len &^ 63]
ANDCC $63,R4 // (len &= 63) cmp 0.
PCALIGN $16
loop64:
// Invariant: R8 points at the next unscanned 64-byte group and R8 != R9.
// Nothing in the loop writes CR0, so the ANDCC result above survives it.
LXVD2X (R0)(R8),V2 // Scan 64 bytes at a time, starting at &s[0]
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // Match found at R8, jump out
LXVD2X (R11)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
LXVD2X (R12)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat2 // Match found at R8+32 bytes, jump out
LXVD2X (R6)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat3 // Match found at R8+48 bytes, jump out
ADD $64,R8
CMPU R8,R9,CR1
BNE CR1,loop64 // R8 != &s[len &^ 63]?
PCALIGN $32
// All whole 64-byte groups are scanned. R8 = start of the tail, R4 = tail length (0 - 63).
BEQ notfound // Is tail length 0? CR0 is set before entering loop64.
CMP R4,$32 // Tail length >= 32, use cmp32 path.
CMP R4,$16,CR1
BGE cmp32 // cmp32 reuses CR0 from the compare against 32 above.
ADD R8,R4,R9
ADD $-16,R9 // R9 = &s[len(s)-16], the final 16 bytes of the input.
BLE CR1,cmp64_tail_gt0
cmp64_tail_gt16: // Tail length 17 - 31 (a tail of 32 or more took the cmp32 path)
LXVD2X (R0)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0
cmp64_tail_gt0: // Tail length 1 - 16, or the remainder of a 17 - 31 byte tail
MOVD R9,R8 // foundat0 computes the index from R8.
LXVD2X (R0)(R9),V2 // Final 16 bytes; overlaps bytes that were already checked.
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0
BR notfound
cmp32: // Length 32 - 63
// Entered either directly (R8 = &s[0], R4 = len) or for a 32 - 63 byte tail
// after loop64 (R8 = start of tail, R4 = tail length). In both cases CR0
// holds the comparison of R4 against 32 and R11 = 16.
// Bytes 0 - 15
LXVD2X (R0)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0
// Bytes 16 - 31
LXVD2X (R8)(R11),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
BEQ notfound // Is length exactly 32? (CR0 holds this comparison on entry to cmp32)
CMP R4,$48 // CR0 = length cmp 48; used by the ISEL and the BLE below.
ADD R4,R8,R9 // Compute &s[len(s)-16]
ADD $32,R8,R8
ADD $-16,R9,R9
ISEL CR0GT,R8,R9,R8 // R8 = len(s) <= 48 ? R9 : R8
// Bytes 32 - 47, or the final 16 bytes when length <= 48
LXVD2X (R0)(R8),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // Match found at R8 (already advanced above), jump out
BLE notfound // Length <= 48: the load above already ended at the last byte.
// Final 16 bytes (length 49 - 63); overlaps bytes 32 - 47.
MOVD R9,R8 // R9 holds the final check.
LXVD2X (R0)(R9),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // Match found at R8 (= R9), jump out
BR notfound
// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
#ifndef GOPPC64_power9
#define ADJUST_FOR_CNTLZW -16
#else
#define ADJUST_FOR_CNTLZW 0
#endif
// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
// On entry to each foundatN: R8 = base address of the group being scanned,
// R3 = &s[0], V6 = VCMPEQUB result for the 16B chunk at R8 + 16*N.
foundat3:
SUB R3,R8,R3 // R3 = R8 - &s[0]
ADD $48+ADJUST_FOR_CNTLZW,R3
BR vfound
foundat2:
SUB R3,R8,R3 // R3 = R8 - &s[0]
ADD $32+ADJUST_FOR_CNTLZW,R3
BR vfound
foundat1:
SUB R3,R8,R3 // R3 = R8 - &s[0]
ADD $16+ADJUST_FOR_CNTLZW,R3
BR vfound
foundat0:
SUB R3,R8,R3 // R3 = R8 - &s[0]
ADD $0+ADJUST_FOR_CNTLZW,R3
vfound:
// R3 = index of the first byte of the matching 16B chunk (plus the CNTLZW
// adjustment); the code below adds the position of the match inside the chunk.
// Map equal values into a 16 bit value with earlier matches setting higher bits.
#ifndef GOPPC64_power9
VBPERMQ V6,V0,V6
MFVRD V6,R4
CNTLZW R4,R4 // 16 + position of the first match; the 16 is cancelled by ADJUST_FOR_CNTLZW.
#else
#ifdef GOARCH_ppc64le
// Put the value back into LE ordering by swapping doublewords.
XXPERMDI V6,V6,$2,V6
#endif
_VCZBEBB V6,R4
#endif
ADD R3,R4,R3
RET
cmp16: // Length < 32; lengths below 16 continue at cmp8
CMPU R4,$16 // CR0 = len cmp 16; used by BLT cmp8 and by the BEQ below.
ADD R4,R3,R9 // R9 = &s[len(s)]; the cmp8 and shorter paths rely on this value.
BLT cmp8
// Length 16 - 31
ADD $-16,R9,R9 // &s[len(s)-16]
// Bytes 0 - 15
LXVD2X (R0)(R3),V2
VCMPEQUBCC V2,V1,V6
MOVD R3,R8 // foundat0 computes the index from R8.
BNE CR6,foundat0 // Match found in bytes 0 - 15, jump out
BEQ notfound // Length exactly 16: nothing left to check.
// Final 16 bytes (length 17 - 31); overlaps bytes 0 - 15.
MOVD R9,R8 // R9 holds the final check.
LXVD2X (R0)(R9),V2
VCMPEQUBCC V2,V1,V6
BNE CR6,foundat0 // Match found at R8 (= R9), jump out
BR notfound
cmp8: // Length 0 - 15 (R9 = &s[len(s)])
#ifdef GOPPC64_power10
// Load all the bytes into a single VSR in BE order.
SLD $56,R4,R5 // R5 = len << 56, the length operand given to LXVLL.
LXVLL R3,R5,V2
// Compare and count the number which don't match.
VCMPEQUB V2,V1,V6
VCLZLSBB V6,R3
// If count is the number of bytes, or more. No matches are found.
CMPU R3,R4
MOVD $-1,R5
// Otherwise, the count is the index of the first match.
ISEL CR0LT,R3,R5,R3
RET
#else
RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
RLDIMI $16,R5,$32,R5
RLDIMI $32,R5,$0,R5
CMPU R4,$8
BLT cmp4
// Length 8 - 15: check the first 8 bytes and the (overlapping) last 8 bytes.
MOVD $-8,R11
ADD $-8,R4,R4 // R4 = len-8, the index of the first byte of the last doubleword.
_LDBEX (R0)(R3),R10 // R10 = s[0:8]
_LDBEX (R11)(R9),R11 // R11 = s[len-8:len]
CMPB R10,R5,R10 // Each matching byte becomes 0xFF, every other byte 0x00.
CMPB R11,R5,R11
CMPU R10,$0 // CR0: any match in the first doubleword?
CMPU R11,$0,CR1 // CR1: any match in the last doubleword?
CNTLZD R10,R10
CNTLZD R11,R11
SRD $3,R10,R3 // Leading zero bits / 8 = byte position of the first match.
SRD $3,R11,R11
BNE found // Match in the first 8 bytes; R3 already holds its index.
ADD R4,R11,R4 // R4 = (len-8) + position within the last doubleword.
MOVD $-1,R3
ISEL CR1EQ,R3,R4,R3 // R3 = (no match in last doubleword) ? -1 : R4
RET
cmp4: // Length 0 - 7; lengths 4 - 7 are handled here
CMPU R4,$4
BLT cmp2
// Same scheme as the 8 - 15 case above, using two overlapping words.
MOVD $-4,R11
ADD $-4,R4,R4 // R4 = len-4, the index of the first byte of the last word.
_LWBEX (R0)(R3),R10 // R10 = s[0:4]
_LWBEX (R11)(R9),R11 // R11 = s[len-4:len]
CMPB R10,R5,R10
CMPB R11,R5,R11
CNTLZW R10,R10 // A count of 32 means no byte of the word matched.
CNTLZW R11,R11
CMPU R10,$32
CMPU R11,$32,CR1
SRD $3,R10,R3
SRD $3,R11,R11
BNE found // Match in the first 4 bytes; R3 already holds its index.
ADD R4,R11,R4 // R4 = (len-4) + position within the last word.
MOVD $-1,R3
ISEL CR1EQ,R3,R4,R3 // R3 = (no match in last word) ? -1 : R4
RET
cmp2: // Length 0 - 3; lengths 2 - 3 are handled here
CMPU R4,$2
BLT cmp1
_LHBEX (R0)(R3),R10 // R10 = s[0:2]
CMPB R10,R5,R10
SLDCC $48,R10,R10 // Keep only the two loaded bytes, moved to the top; CR0 = result cmp 0.
CNTLZD R10,R10
SRD $3,R10,R3
BNE found // Match in the first 2 bytes; R3 already holds its index.
cmp1: // Length 0 or 1, or falling through from cmp2 (length 2 or 3) with no match so far
MOVD $-1,R3
ANDCC $1,R4,R31 // Is the length odd? R31 only receives the discarded result.
BEQ found // Even length (0 or 2): no unchecked byte remains, return -1.
MOVBZ -1(R9),R10 // R10 = s[len-1], the only byte not yet checked.
CMPB R10,R5,R10
ANDCC $1,R10 // CR0 EQ means the last byte did not match.
ADD $-1,R4 // R4 = len-1, the index of the last byte.
ISEL CR0EQ,R3,R4,R3 // R3 = (no match) ? -1 : len-1
found:
// R3 already holds the result.
RET
#endif
notfound:
MOVD $-1,R3
RET
|