1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64le || ppc64
#include "go_asm.h"
#include "textflag.h"
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// func Count(b []byte, c byte) int
// Counts occurrences of c in b. Frame size $0-40 matches
// a 24-byte slice header + byte arg (padded) + int result.
// ABIInternal register arguments on entry:
// R3 = byte array pointer
// R4 = length
// R6 = byte to count
// (R5 holds the slice cap, which is unused and overwritten below.)
MTVRD R6, V1 // move compare byte
MOVD R6, R5 // countbytebody also wants the byte in R5 (scalar CMPB path)
VSPLTB $7, V1, V1 // replicate byte across V1
BR countbytebody<>(SB)
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
// func CountString(s string, c byte) int
// Counts occurrences of c in s. Frame size $0-32 matches
// a 16-byte string header + byte arg (padded) + int result.
// ABIInternal register arguments on entry (already matching
// countbytebody's contract, so no register shuffling is needed):
// R3 = byte array pointer
// R4 = length
// R5 = byte to count
MTVRD R5, V1 // move compare byte
VSPLTB $7, V1, V1 // replicate byte across V1
BR countbytebody<>(SB)
// countbytebody counts occurrences of the byte in V1 (and R5) within
// the buffer at R3 of length R4. It is the shared tail of Count and
// CountString (reached via BR, so it returns directly to their caller).
//
// On entry:
// R3: addr of string
// R4: len of string
// R5: byte to count
// V1: byte to count, splatted.
// On exit:
// R3: return value
//
// Counting invariant: the vector loop accumulates VPOPCNTD of
// VCMPEQUB results, so each match contributes 8 set bits. R18 is
// therefore 8x the true count until it is fixed up with SRD $3
// (at tail_0, or early on the P10 path). The pre-P10 scalar tail
// keeps the same 8x convention via CMPB+POPCNTD.
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
MOVD $0, R18 // byte count
#ifndef GOPPC64_power10
// Pre-P10 scalar tail uses CMPB, which needs the byte replicated
// across all 8 bytes of R5.
RLDIMI $8, R5, $48, R5
RLDIMI $16, R5, $32, R5
RLDIMI $32, R5, $0, R5 // fill reg with the byte to count
#endif
CMPU R4, $32 // Check if it's a small string (<32 bytes)
BLT tail // Jump to the small string case
SRD $5, R4, R20 // R20 = number of full 32-byte chunks
MOVD R20, CTR
MOVD $16, R21 // offset for the second 16B load of each chunk
XXLXOR V4, V4, V4 // zero both popcount accumulators
XXLXOR V5, V5, V5
PCALIGN $16
cmploop:
LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators.
LXVD2X (R21)(R3), V2
VCMPEQUB V2, V1, V2
VCMPEQUB V0, V1, V0
VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets.
VPOPCNTD V0, V0
VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count.
VADDUDM V2, V5, V5 // The count will be fixed up afterwards.
ADD $32, R3
BDNZ cmploop
// Horizontally sum both accumulators' doublewords into R18
// (still 8x the true count at this point).
VADDUDM V4, V5, V5
MFVSRD V5, R18
VSLDOI $8, V5, V5, V5 // rotate V5 to expose the other doubleword
MFVSRD V5, R21
ADD R21, R18, R18
ANDCC $31, R4, R4 // R4 = 0-31 remaining bytes; sets CR0 for the BEQ
// Skip the tail processing if no bytes remaining.
BEQ tail_0
#ifdef GOPPC64_power10
SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10.
tail: // Count the last 0 - 31 bytes.
// P10 tail counts matches directly (VCNTMBB), not 8x, so R18 is
// already unscaled here (or 0 when entered via the small-string BLT).
CMP R4, $16
BLE small_tail_p10 // <=16 bytes are handled entirely by LXVLL below
LXV 0(R3), V0
VCMPEQUB V0, V1, V0
VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
ADD R14, R18, R18
ADD $16, R3, R3
ANDCC $15, R4, R4
small_tail_p10:
SLD $56, R4, R6 // LXVLL takes the byte count in the top byte of R6
LXVLL R3, R6, V0 // load exactly R4 bytes without reading past the buffer
VCMPEQUB V0, V1, V0
VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes.
VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
ADD R14, R18, R3 // final count; return directly (skip tail_0's SRD)
RET
#else
tail: // Count the last 0 - 31 bytes.
CMP R4, $16
BLT tail_8
MOVD (R3), R12
MOVD 8(R3), R14
CMPB R12, R5, R12 // 0xFF in each byte equal to the target
CMPB R14, R5, R14
POPCNTD R12, R12 // 8 bits per match; fixed up by tail_0's SRD
POPCNTD R14, R14
ADD R12, R18, R18
ADD R14, R18, R18
ADD $16, R3, R3
ADD $-16, R4, R4
tail_8: // Count the remaining 0 - 15 bytes.
CMP R4, $8
BLT tail_4
MOVD (R3), R12
CMPB R12, R5, R12
POPCNTD R12, R12
ADD R12, R18, R18
ADD $8, R3, R3
ADD $-8, R4, R4
tail_4: // Count the remaining 0 - 7 bytes.
CMP R4, $4
BLT tail_2
MOVWZ (R3), R12
CMPB R12, R5, R12
SLD $32, R12, R12 // Remove non-participating matches.
// (MOVWZ zero-extends; the upper 4 bytes would spuriously
// match when the counted byte is 0x00, so shift them out.)
POPCNTD R12, R12
ADD R12, R18, R18
ADD $4, R3, R3
ADD $-4, R4, R4
tail_2: // Count the remaining 0 - 3 bytes.
CMP R4, $2
BLT tail_1
MOVHZ (R3), R12
CMPB R12, R5, R12
SLD $48, R12, R12 // Remove non-participating matches.
POPCNTD R12, R12
ADD R12, R18, R18
ADD $2, R3, R3
ADD $-2, R4, R4
tail_1: // Count the remaining 0 - 1 bytes.
CMP R4, $1
BLT tail_0
MOVBZ (R3), R12
CMPB R12, R5, R12
ANDCC $0x8, R12, R12 // keep one bit of the 0xFF match: adds 8 (= 8x one match)
ADD R12, R18, R18
#endif
tail_0: // No remaining tail to count.
SRD $3, R18, R3 // Fixup count, it is off by 8x.
RET
|