aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/go/_std_1.22/src/internal/bytealg/count_ppc64x.s
blob: 55e02ce8a187ec99364e9a08abd0b9ae70caa15b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64le || ppc64

#include "go_asm.h"
#include "textflag.h"

TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// Register arguments on entry:
	//   R3 = pointer to the first byte to scan
	//   R4 = number of bytes to scan
	//   R6 = byte value to count
	// The shared body wants the byte in R5 and replicated across V1,
	// so set both up here before jumping to it.
	MOVD	R6, R5			// body reads the target byte from R5
	MTVRD	R6, V1			// move the target byte into the vector register
	VSPLTB	$7, V1, V1		// replicate it into every byte of V1
	BR	countbytebody<>(SB)	// plain branch: the body's RET returns to our caller

TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
	// Register arguments on entry:
	// R3 = pointer to the string bytes
	// R4 = length
	// R5 = byte to count
	// R5 is already where countbytebody expects the byte, so only the
	// splatted vector copy in V1 has to be prepared.
	MTVRD	R5, V1		// move compare byte
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)	// plain branch: the body's RET returns to our caller

// countbytebody is the shared implementation that Count and CountString
// branch to. It counts how many of the R4 bytes starting at R3 are equal
// to the target byte.
//
// On entry:
// R3: addr of string
// R4: len of string
// R5: byte to count
// V1: byte to count, splatted.
// On exit:
// R3: return value
//
// Depending on the build and the path taken, the routine also writes
// R4, R5, R6, R12, R14, R18, R20, R21, V0, V2, V4, V5, CTR and CR0.
//
// Structure:
//   1. A vector loop consumes 32 bytes per iteration while at least 32
//      bytes remain, accumulating 8x the match count.
//   2. A tail handles the remaining 0 - 31 bytes. Power10 builds use
//      length-controlled vector loads; other builds use scalar loads of
//      16/8/4/2/1 bytes.
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD	$0, R18 // byte count

#ifndef GOPPC64_power10
	// The scalar tail compares whole registers against R5, so copy the
	// low byte of R5 into every byte of the register: first into a
	// halfword, then a word, then the full doubleword. The Power10 tail
	// only uses V1 and does not need this.
	RLDIMI	$8, R5, $48, R5
	RLDIMI	$16, R5, $32, R5
	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
#endif

	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
	BLT	tail		// Jump to the small string case
	SRD	$5, R4, R20	// R20 = len / 32 = number of 32-byte iterations
	MOVD	R20, CTR	// loop counter consumed by BDNZ below
	MOVD	$16, R21	// offset of the second 16-byte vector in each iteration
	XXLXOR	V4, V4, V4	// clear both accumulators
	XXLXOR	V5, V5, V5

	PCALIGN	$16
cmploop:
	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
	LXVD2X	(R21)(R3), V2
	VCMPEQUB V2, V1, V2
	VCMPEQUB V0, V1, V0
	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
	VPOPCNTD V0, V0
	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
	ADD	$32, R3
	BDNZ	cmploop

	// Reduce the accumulators to one scalar: add V4 into V5, then add
	// the two doublewords of V5 together. R18 ends up holding 8x the
	// number of matches seen by the vector loop.
	VADDUDM	V4, V5, V5
	MFVSRD	V5, R18
	VSLDOI	$8, V5, V5, V5	// rotate by 8 bytes so MFVSRD reads the other doubleword
	MFVSRD	V5, R21
	ADD	R21, R18, R18
	ANDCC	$31, R4, R4	// R4 = bytes left over after the 32-byte loop; sets CR0 for BEQ
	// Skip the tail processing if no bytes remaining.
	BEQ	tail_0

#ifdef GOPPC64_power10
	// From here on the Power10 path keeps R18 as a true count (1x), not
	// 8x, and returns directly instead of falling through to tail_0.
	// When entered at "tail" for a short input, R18 is still 0.
	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.

tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLE	small_tail_p10	// 16 or fewer bytes: a single length-controlled load suffices
	// More than 16 bytes remain: count one full 16-byte vector first.
	LXV	0(R3), V0
	VCMPEQUB V0, V1, V0
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ANDCC	$15, R4, R4	// bytes left after the full vector

small_tail_p10:
	// NOTE(review): LXVLL presumably takes its byte count from the
	// most-significant byte of R6, hence the shift by 56 - confirm
	// against the Power ISA.
	SLD	$56, R4, R6
	LXVLL	R3, R6, V0
	VCMPEQUB V0, V1, V0
	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R3	// final count = vector loop count + tail count
	RET

#else
	// Scalar tail. Each step adds the popcount of a byte-compare mask to
	// R18, so every match contributes 8, the same scaling as the vector
	// loop. The single fix-up at tail_0 divides the total by 8.
tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLT	tail_8
	// At least 16 bytes remain: compare two doublewords.
	MOVD	(R3), R12
	MOVD	8(R3), R14
	CMPB	R12, R5, R12
	CMPB	R14, R5, R14
	POPCNTD	R12, R12
	POPCNTD	R14, R14
	ADD	R12, R18, R18
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ADD	$-16, R4, R4

tail_8:	// Count the remaining 0 - 15 bytes.
	CMP	R4, $8
	BLT	tail_4
	MOVD	(R3), R12
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$8, R3, R3
	ADD	$-8, R4, R4

tail_4:	// Count the remaining 0 - 7 bytes.
	CMP	R4, $4
	BLT	tail_2
	// The 4-byte load zero-extends, so the upper 4 bytes of R12 would
	// compare equal when the byte being counted is 0. Shift them out
	// before counting.
	MOVWZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$32, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$4, R3, R3
	ADD	$-4, R4, R4

tail_2:	// Count the remaining 0 - 3 bytes.
	CMP	R4, $2
	BLT	tail_1
	// Same as above for a 2-byte load: discard the upper 6 bytes.
	MOVHZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$48, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$2, R3, R3
	ADD	$-2, R4, R4

tail_1:	// Count the remaining 0 - 1 bytes.
	CMP	R4, $1
	BLT	tail_0
	MOVBZ	(R3), R12
	CMPB	R12, R5, R12
	ANDCC	$0x8, R12, R12	// keep a single bit of the low byte's mask: 8 on a match, else 0
	ADD	R12, R18, R18
#endif

	// Reached on every non-Power10 path, and on Power10 only when the
	// vector loop consumed the whole input. R18 is 8x the count here.
tail_0:	// No remaining tail to count.
	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
	RET