aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/go/_std_1.22/src/internal/bytealg/indexbyte_ppc64x.s
blob: b6714f45aae3cab49b0caecb1723a0d72377ce6e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// IndexByte entry point: moves the byte to search for into R5, where
// indexbytebody expects it, and tail-branches to the shared implementation.
// The result is whatever indexbytebody leaves in R3.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	// R6 = byte to find on entry.
	// NOTE(review): R5 presumably carries the slice capacity, which is
	// unused here — confirm against the Go declaration of IndexByte.
	MOVD	R6, R5		// R5 = byte
	BR	indexbytebody<>(SB)

// IndexByteString entry point: the arguments already sit in the registers
// indexbytebody expects, so this is a plain tail-branch to the shared
// implementation. The result is whatever indexbytebody leaves in R3.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = address of the string data
	// R4 = length
	// R5 = byte
	BR	indexbytebody<>(SB)

// indexbytevbperm is a 16-byte read-only constant that indexbytebody loads
// into V0 and passes to VBPERMQ to turn a byte-wise compare result into a
// 16 bit mask. It is only emitted when not building for power9, because the
// power9 path in indexbytebody does not use VBPERMQ.
//
// Each byte of the constant is a multiple of 8 (0x00, 0x08, ... 0x78), i.e.
// one bit index per byte lane of a 16-byte vector. The little-endian and
// big-endian variants hold the same sixteen values, written in opposite byte
// order within each 8-byte literal.
#ifndef GOPPC64_power9
#ifdef GOARCH_ppc64le
DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
#else
DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
#endif
GLOBL indexbytevbperm<>+0(SB), RODATA, $16
#endif

// Some operations are endian specific; choose the correct opcode based on GOARCH.
//
// _LDBEX, _LWBEX and _LHBEX load 8, 4 and 2 bytes for the scalar short-string
// paths of indexbytebody. Those paths locate the first match by counting
// leading zeros, so the first byte in memory must land in the most
// significant position: little-endian builds use the byte-reversed loads,
// big-endian builds use the plain loads.
//
// _VCZBEBB yields the byte offset of the first match inside a vector compare
// result in the vfound path of indexbytebody.
// Note, _VCZBEBB is only available on power9 and newer.
#ifdef GOARCH_ppc64le
#define _LDBEX	MOVDBR
#define _LWBEX	MOVWBR
#define _LHBEX	MOVHBR
#define _VCZBEBB VCTZLSBB
#else
#define _LDBEX	MOVD
#define _LWBEX	MOVW
#define _LHBEX	MOVH
#define _VCZBEBB VCLZLSBB
#endif

// indexbytebody is the shared implementation behind IndexByte and
// IndexByteString. It returns the index of the first byte equal to R5, or -1
// if there is none.
//
// On entry:
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// On exit:
// R3 = return value
//
// Strategy by length:
//   >= 64   loop64 scans 64 bytes per iteration, then the tail is handled by
//           cmp32 (tail >= 32) or by one or two 16-byte vector checks.
//   32 - 63 cmp32: two to four 16-byte vector checks.
//   16 - 31 cmp16: one or two 16-byte vector checks.
//   0 - 15  cmp8: a single length-limited vector load on power10, otherwise
//           scalar loads of 8, 4, 2 and 1 bytes.
// Wherever the remaining length is not a multiple of the access size, the last
// access is anchored to the end of the string and may overlap bytes that were
// already checked; it never reads past the end.
//
// Registers written: R3, R4, R5, R6, R8, R9, R10, R11, R12, R16, R31,
// V0, V1, V2, V6, CR0, CR1, CR6 (not all on every path).
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	CMPU	R4,$32		// CR0 = len cmp 32; consumed by BLT cmp16 below and by the BEQ in cmp32.

#ifndef GOPPC64_power9
	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
	MOVD	$indexbytevbperm<>+00(SB),R16
	LXVD2X	(R16),V0	// Set up swap string
#endif

	MTVRD	R5,V1
	VSPLTB	$7,V1,V1	// Replicate byte across V1

	BLT	cmp16		// Jump to the small string case if it's <32 bytes.

	CMP	R4,$64,CR1
	MOVD	$16,R11		// R11 = offset of the second 16-byte vector
	MOVD	R3,R8		// R8 = scan cursor; R3 keeps the start of the string
	BLT	CR1,cmp32	// Special case for length 32 - 63
	MOVD	$32,R12		// R12 = offset of the third 16-byte vector
	MOVD	$48,R6		// R6 = offset of the fourth 16-byte vector

	RLDICR  $0,R4,$63-6,R9	// R9 = len &^ 63
	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
	ANDCC	$63,R4		// (len &= 63) cmp 0.

	PCALIGN	$16
loop64:
	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8, jump out

	LXVD2X	(R11)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out

	LXVD2X	(R12)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out

	LXVD2X	(R6)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out

	ADD	$64,R8
	CMPU	R8,R9,CR1
	BNE	CR1,loop64	// R8 != &s[len &^ 63]?

	PCALIGN	$32
	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.

	CMP	R4,$32		// Tail length >= 32, use cmp32 path.
	CMP	R4,$16,CR1
	BGE	cmp32

	ADD	R8,R4,R9	// R9 = address one past the end of the string
	ADD	$-16,R9		// R9 = address of the final 16 bytes of the string
	BLE	CR1,cmp64_tail_gt0

cmp64_tail_gt16:	// Tail length 17 - 31 (a tail of 32 or more took the cmp32 branch above)
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

cmp64_tail_gt0:	// Tail length 1 - 16, or the remainder of a 17 - 31 byte tail
	// Check the final 16 bytes of the string. This may overlap bytes that
	// were already scanned.
	MOVD	R9,R8
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

	BR	notfound

cmp32:	// Length 32 - 63
	// Reached either directly (R8 = R3, R4 = len) or from the loop64 tail
	// (R8 = start of the tail, R4 = tail length). In both cases CR0 holds
	// R4 compared with 32 and R11 = 16.

	// Bytes 0 - 15
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

	// Bytes 16 - 31
	LXVD2X	(R8)(R11),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1		// Match found at R8+16 bytes, jump out

	BEQ	notfound		// Is length == 32? (CR0 holds this comparison on entry to cmp32)
	CMP	R4,$48			// CR0 = length cmp 48; used by the ISEL and the BLE below.

	ADD	R4,R8,R9		// Compute &s[len(s)-16]
	ADD	$32,R8,R8
	ADD	$-16,R9,R9
	ISEL	CR0GT,R8,R9,R8		// R8 = len(s) <= 48 ? R9 : R8

	// Bytes 32 - 47, or the final 16 bytes when length <= 48
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0		// Match found in the vector at R8, jump out

	BLE	notfound		// Length <= 48: the vector just checked was the final one.

	// Final 16 bytes (length 49 - 63)
	MOVD	R9,R8			// R9 holds the final check.
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0		// Match found in the vector at R8, jump out

	BR	notfound

// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
#ifndef GOPPC64_power9
#define ADJUST_FOR_CNTLZW -16
#else
#define ADJUST_FOR_CNTLZW 0
#endif

// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
// foundatN is entered with V6 = compare result of the vector at R8+16*N.
foundat3:
	SUB	R3,R8,R3		// R3 = R8 - R3, the offset of R8 from the start of the string
	ADD	$48+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat2:
	SUB	R3,R8,R3		// R3 = R8 - R3
	ADD	$32+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat1:
	SUB	R3,R8,R3		// R3 = R8 - R3
	ADD	$16+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat0:
	SUB	R3,R8,R3		// R3 = R8 - R3
	ADD	$0+ADJUST_FOR_CNTLZW,R3
vfound:
	// Map equal values into a 16 bit value with earlier matches setting higher bits.
#ifndef GOPPC64_power9
	VBPERMQ	V6,V0,V6
	MFVRD	V6,R4
	CNTLZW	R4,R4			// R4 = 16 + byte offset of the first match (see ADJUST_FOR_CNTLZW)
#else
#ifdef GOARCH_ppc64le
	// Put the value back into LE ordering by swapping doublewords.
	XXPERMDI	V6,V6,$2,V6
#endif
	_VCZBEBB	V6,R4		// R4 = byte offset of the first match within the vector
#endif
	ADD	R3,R4,R3		// Vector offset + offset within the vector = index of the match
	RET

cmp16:	// Length 0 - 31; lengths below 16 continue at cmp8
	CMPU	R4,$16			// CR0 = len cmp 16; used by BLT below and by BEQ after the first vector.
	ADD	R4,R3,R9		// R9 = address one past the end of the string (also used by cmp8 and below)
	BLT	cmp8

	ADD	$-16,R9,R9		// &s[len(s)-16]

	// Bytes 0 - 15
	LXVD2X	(R0)(R3),V2
	VCMPEQUBCC	V2,V1,V6
	MOVD	R3,R8
	BNE	CR6,foundat0		// Match found in bytes 0 - 15, jump out

	BEQ	notfound		// Length is exactly 16: nothing left to check.

	// Final 16 bytes (length 17 - 31); may overlap bytes 0 - 15
	MOVD	R9,R8			// R9 holds the final check.
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0		// Match found in the final 16 bytes, jump out

	BR	notfound


cmp8:	// Length 0 - 15
#ifdef GOPPC64_power10
	// Load all the bytes into a single VSR in BE order.
	SLD	$56,R4,R5		// Length in the top byte of R5. The byte to find is no longer needed in R5; V1 holds it.
	LXVLL	R3,R5,V2
	// Compare and count the number which don't match.
	VCMPEQUB	V2,V1,V6
	VCLZLSBB	V6,R3
	// If count is the number of bytes, or more. No matches are found.
	CMPU	R3,R4
	MOVD	$-1,R5
	// Otherwise, the count is the index of the first match.
	ISEL	CR0LT,R3,R5,R3
	RET
#else
	// Scalar path. Lengths 8 - 15 are handled here with two 8-byte loads;
	// shorter lengths continue at cmp4.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	RLDIMI	$16,R5,$32,R5
	RLDIMI	$32,R5,$0,R5
	CMPU	R4,$8
	BLT	cmp4
	MOVD	$-8,R11
	ADD	$-8,R4,R4	// R4 = len-8, the index of the first byte of the second load

	_LDBEX	(R0)(R3),R10	// R10 = first 8 bytes
	_LDBEX	(R11)(R9),R11	// R11 = last 8 bytes (R9 = end of string); may overlap R10
	CMPB	R10,R5,R10
	CMPB	R11,R5,R11
	CMPU	R10,$0		// CR0: any match in the first 8 bytes?
	CMPU	R11,$0,CR1	// CR1: any match in the last 8 bytes?
	CNTLZD	R10,R10
	CNTLZD	R11,R11
	SRD	$3,R10,R3	// Leading zero bits / 8 = index of the first matching byte
	SRD	$3,R11,R11
	BNE	found		// Match in the first 8 bytes: R3 already holds the index.

	ADD	R4,R11,R4	// Index of the match in the last 8 bytes, if there is one
	MOVD	$-1,R3
	ISEL	CR1EQ,R3,R4,R3	// No match in the last 8 bytes either: return -1.
	RET

cmp4:	// Length 0 - 7; lengths 4 - 7 are handled here, shorter lengths continue at cmp2
	CMPU	R4,$4
	BLT	cmp2
	MOVD	$-4,R11
	ADD	$-4,R4,R4	// R4 = len-4, the index of the first byte of the second load

	_LWBEX	(R0)(R3),R10	// R10 = first 4 bytes
	_LWBEX	(R11)(R9),R11	// R11 = last 4 bytes (R9 = end of string); may overlap R10
	CMPB	R10,R5,R10
	CMPB	R11,R5,R11
	CNTLZW	R10,R10
	CNTLZW	R11,R11
	CMPU	R10,$32		// A count of 32 means no match in the first 4 bytes.
	CMPU	R11,$32,CR1	// A count of 32 means no match in the last 4 bytes.
	SRD	$3,R10,R3
	SRD	$3,R11,R11
	BNE	found		// Match in the first 4 bytes: R3 already holds the index.

	ADD	R4,R11,R4	// Index of the match in the last 4 bytes, if there is one
	MOVD	$-1,R3
	ISEL	CR1EQ,R3,R4,R3	// No match in the last 4 bytes either: return -1.
	RET

cmp2:	// Length 0 - 3; the first two bytes of lengths 2 - 3 are checked here
	CMPU	R4,$2
	BLT	cmp1

	_LHBEX	(R0)(R3),R10
	CMPB	R10,R5,R10
	SLDCC	$48,R10,R10	// Keep only the two loaded bytes' results; CR0 = any match?
	CNTLZD	R10,R10
	SRD	$3,R10,R3
	BNE	found		// Match in the first two bytes: R3 holds the index (0 or 1).

cmp1:	// Length 0 or 1, or length 2 - 3 with no match in the first two bytes
	MOVD	$-1,R3
	ANDCC	$1,R4,R31	// Is the length odd? Only then is there a final byte left to check.
	BEQ	found		// Even length: nothing left, return -1.

	MOVBZ	-1(R9),R10	// Last byte of the string (R9 = end of string)
	CMPB	R10,R5,R10
	ANDCC	$1,R10		// Test the low byte's compare result.
	ADD	$-1,R4		// R4 = len-1, the index of the last byte
	ISEL	CR0EQ,R3,R4,R3	// No match: -1. Match: len-1.

found:
	RET
#endif

notfound:
	MOVD $-1,R3
	RET