1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !plan9
#include "go_asm.h"
#include "textflag.h"
// IndexByte entry point: arguments are a byte slice b (base at 0, len at 8),
// the byte c at offset 24, and an 8-byte result slot at offset 32.
// It only marshals those into the registers indexbytebody<> expects and
// tail-jumps there; indexbytebody<> writes the result through R8.
TEXT ·IndexByte(SB), NOSPLIT, $0-40
	LEAQ	ret+32(FP), R8	// R8 = address of the result slot
	MOVB	c+24(FP), AL	// AL = byte sought
	MOVQ	b_len+8(FP), BX	// BX = number of bytes to search
	MOVQ	b_base+0(FP), SI	// SI = start of the data
	JMP	indexbytebody<>(SB)
// IndexByteString entry point: arguments are a string s (base at 0, len at 8),
// the byte c at offset 16, and an 8-byte result slot at offset 24.
// It only marshals those into the registers indexbytebody<> expects and
// tail-jumps there; indexbytebody<> writes the result through R8.
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
	LEAQ	ret+24(FP), R8	// R8 = address of the result slot
	MOVB	c+16(FP), AL	// AL = byte sought
	MOVQ	s_len+8(FP), BX	// BX = number of bytes to search
	MOVQ	s_base+0(FP), SI	// SI = start of the data
	JMP	indexbytebody<>(SB)
// indexbytebody is the shared search routine reached by tail-jump from
// IndexByte and IndexByteString. It stores into (R8) the index of the first
// byte in the data equal to AL, or -1 if there is none, and then returns.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Registers written: AX, CX, DX, DI, R11, X0, X1, Y1, Y2, Y3 and the flags.
// SI, BX and R8 are only read.
TEXT indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	// Inputs shorter than one 16-byte chunk take the masked path at small.
	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI	// DI = address of the next chunk to examine

	// Only inputs longer than 32 bytes are candidates for the AVX2 loop.
	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN	$16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure	// Empty input never matches.

	// Check if we'll load across a page boundary.
	// Bits 4-11 of SI+16 are all zero exactly when SI lies in the last
	// 16 bytes of a 4096-byte-aligned region; a 16-byte load starting at SI
	// could then run past the end of that region.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that END at the end of the data instead, so the load
	// never reads beyond SI+BX. The data then occupies the top BX bytes of X1
	// and the low 16-BX bytes are whatever precedes SI in memory.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX	// Move the BX data bits up to bit 16 and above.
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	// hasAVX2 is not defined for this build, so test the CPU feature flag
	// at run time and use the SSE loop when it is not 1.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
#endif
	MOVD	AX, X0	// AL still holds the byte sought on this path.
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1	// Y1 = byte sought replicated into all 32 bytes

	PCALIGN	$32
avx2_loop:
	VMOVDQU	(DI), Y2	// Load the next 32-byte chunk.
	VPCMPEQB	Y1, Y2, Y3	// Y3 = 0xFF in every byte position that matches.
	VPTEST	Y3, Y3	// ZF is clear iff at least one byte matched.
	JNZ	avx2success
	ADDQ	$32, DI
	// DI and R11 are addresses, so the comparison must be unsigned,
	// matching the JB used by the SSE loop at sseloopentry.
	CMPQ	DI, R11
	JB	avx2_loop

	// Search the last 32-byte chunk. It may overlap chunks already searched.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER	// Zero the upper halves of the Y registers before returning.
	MOVQ	$-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the match mask.
avx2success:
	VPMOVMSKB	Y3, DX	// One mask bit per byte of the chunk.
	BSFL	DX, DX	// DX = index of the first match within the chunk.
	SUBQ	SI, DI	// DI = offset of the chunk within the data.
	ADDQ	DI, DX	// DX = offset of the match within the data.
	MOVQ	DX, (R8)
	VZEROUPPER	// Zero the upper halves of the Y registers before returning.
	RET
|