1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
|
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095
// The BC opcode is hard to read, and no extended mnemonics are
// offered for these forms, so define readable aliases for them here.
#define BGELR_CR6 BC 4, CR6LT, (LR)
#define BEQLR BC 12, CR0EQ, (LR)
// memequal(a, b unsafe.Pointer, size uintptr) bool
//
// ABIInternal entry point: arguments arrive in registers in exactly
// the layout memeqbody expects, so this is a bare tail-branch into
// the shared comparison body, which returns its bool result in R3.
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
// R3 = a
// R4 = b
// R5 = size
BR memeqbody<>(SB)
// memequal_varlen(a, b unsafe.Pointer) bool
//
// Closure-call variant: the size is not an argument; the compiler
// stores it in the closure context (R11 is the context register under
// ABIInternal — see the load at offset 8 below).
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
// R3 = a
// R4 = b
CMP R3, R4
BEQ eq // identical pointers are trivially equal; skip loading size
MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure
BR memeqbody<>(SB)
eq:
MOVD $1, R3 // return true
RET
// Do an efficient memequal for ppc64
// R3 = s1
// R4 = s2
// R5 = len
// On exit:
// R3 = return value (1 if the len bytes at s1 and s2 are equal, else 0)
//
// Strategy: dispatch on len. len <= 16 uses GPR (or Power10 lxvl)
// compares, 17..64 uses 16-byte VSX compares with deliberate overlap
// to avoid byte loops, and longer lengths use a 64-byte-per-iteration
// vector loop followed by one overlapping 64-byte compare of the tail.
// Overlapping compares re-check some bytes, which is harmless.
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3, R8 // Move s1 into R8
ADD R5, R3, R9 // &s1[len(s1)]
ADD R5, R4, R10 // &s2[len(s2)]
MOVD $1, R11 // R11 = 1, the "equal" result selected by the ISELs below
CMP R5, $16 // Use GPR checks for len <= 16
BLE check0_16
MOVD $0, R3 // Assume no-match in case BGELR CR6 returns
CMP R5, $32 // Use overlapping VSX loads for len <= 32
BLE check17_32 // Do a pair of overlapping VSR compares
CMP R5, $64
BLE check33_64 // Hybrid check + overlap compare.
setup64:
SRD $6, R5, R6 // number of 64 byte chunks to compare
MOVD R6, CTR
MOVD $16, R14 // index for VSX loads and stores
MOVD $32, R15
MOVD $48, R16
ANDCC $0x3F, R5, R5 // len%64==0? (CR0 result is consumed after the loop)
PCALIGN $16
// Four 16-byte VSX compares per iteration. VCMPEQUBCC sets CR6;
// BGELR_CR6 returns immediately (R3 is already 0, i.e. "not equal")
// as soon as any 16-byte chunk differs.
loop64:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2 // compare, setting CR6
BGELR_CR6
LXVD2X (R8+R14), V0
LXVD2X (R4+R14), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R15), V0
LXVD2X (R4+R15), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R16), V0
LXVD2X (R4+R16), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
ADD $64,R8 // bump up to next 64
ADD $64,R4
BDNZ loop64
// CR0 still holds the ANDCC len%64 result: EQ means no tail remains.
ISEL CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
BEQLR // return if no tail.
// Compare the last 64 bytes by pointing at &sX[len-64]; this overlaps
// bytes already verified by the loop, which is safe.
ADD $-64, R9, R8
ADD $-64, R10, R4
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R14), V0
LXVD2X (R4+R14), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R15), V0
LXVD2X (R4+R15), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
LXVD2X (R8+R16), V0
LXVD2X (R4+R16), V1
VCMPEQUBCC V0, V1, V2
ISEL CR6LT, R11, R0, R3 // all equal (CR6 LT) ? 1 : 0 (R0 reads as zero)
RET
check33_64:
// Bytes 0-15
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
ADD $16, R8
ADD $16, R4
// Bytes 16-31
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
BGELR_CR6
// A little tricky, but point R4,R8 to &sx[len-32],
// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
ADD $-32, R9, R8
ADD $-32, R10, R4
// Fallthrough
check17_32:
// Compare bytes [0,16) and [len-16,len); the two ranges overlap when
// len < 32, and overlapping bytes are simply compared twice.
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL CR6LT, R11, R0, R5 // R5 = (first 16 bytes equal) ? 1 : 0
// Load sX[len(sX)-16:len(sX)] and compare.
ADD $-16, R9
ADD $-16, R10
LXVD2X (R9+R0), V0
LXVD2X (R10+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL CR6LT, R5, R0, R3 // result = both compares equal
RET
check0_16:
#ifdef GOPPC64_power10
// Power10 variable-length loads: lxvl takes the byte count in the
// top byte of the length register (hence the SLD $56), and loads
// exactly len bytes, zero-filling the rest of the vector — so one
// pair of loads handles any 0..16 length without over-reading.
SLD $56, R5, R7
LXVL R8, R7, V0
LXVL R4, R7, V1
VCMPEQUDCC V0, V1, V2
ISEL CR6LT, R11, R0, R3
RET
#else
CMP R5, $8
BLT check0_7
// len is 8..16: two overlapping 8-byte GPR compares cover it.
// Load sX[0:7] and compare.
MOVD (R8), R6
MOVD (R4), R7
CMP R6, R7
ISEL CR0EQ, R11, R0, R5 // R5 = (first 8 bytes equal) ? 1 : 0
// Load sX[len(sX)-8:len(sX)] and compare.
MOVD -8(R9), R6
MOVD -8(R10), R7
CMP R6, R7
ISEL CR0EQ, R5, R0, R3 // result = both compares equal
RET
check0_7:
CMP R5,$0
MOVD $1, R3
BEQLR // return if len == 0
// Check < 8B loads with a single compare, but select the load address
// such that it cannot cross a page boundary. Load a few bytes from the
// lower address if that does not cross the lower page. Or, load a few
// extra bytes from the higher addresses. And align those values
// consistently in register as either address may have differing
// alignment requirements.
ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET
ANDCC $PAGE_OFFSET, R4, R9
SUBC R5, $8, R12 // 8-len
SLD $3, R12, R14 // (8-len)*8
CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower?
CMPU R9, R12, CR0
SUB R12, R8, R6 // compute lower load address
SUB R12, R4, R9
ISEL CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
ISEL CR0LT, R4, R9, R4 // Similar for s2
MOVD (R8), R15 // load 8 bytes; len of them are the bytes to compare
MOVD (R4), R16
SLD R14, R15, R7
SLD R14, R16, R17
SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts)
SRD R14, R17, R17
SRD R14, R15, R6 // Clear the lower (8-len) bytes
SRD R14, R16, R9
#ifdef GOARCH_ppc64le
// Little endian: the wanted bytes sit at the low end when loaded from
// the exact address, high end when loaded from the lower address.
ISEL CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
ISEL CR0LT, R17, R9, R4
#else
ISEL CR1LT, R6, R7, R8
ISEL CR0LT, R9, R17, R4
#endif
CMP R4, R8
ISEL CR0EQ, R11, R0, R3
RET
#endif // tail processing if !defined(GOPPC64_power10)
|