1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
|
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !plan9
#include "go_asm.h"
#include "textflag.h"
#include "asm_amd64.h"
// memclrNoHeapPointers writes zero to the n bytes of memory starting at ptr.
// See memclrNoHeapPointers Go doc for important implementation constraints.
//
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
//
// Register roles inside the routine:
//   AX    = 0; source for the scalar stores and for REP STOSQ
//   BX    = number of bytes still to be cleared
//   CX    = quadword count for REP STOSQ (ERMS path only)
//   DI    = address of the next byte to clear
//   SI    = scratch for the alignment fix-up (huge AVX2 path only)
//   X0/Y0 = zero vector for the AVX2 paths (set by VPXOR)
//   X15   = source for the SSE stores; it is never written here, so it must
//           already be zero on entry (presumably the ABIInternal zero
//           register; confirm against the Go internal ABI document)
//
// Strategy: sizes up to 256 bytes are handled by the straight-line cases
// _0 through _129through256, which mostly store from both ends of the buffer
// and let the two halves overlap. Larger sizes use one of the bulk loops
// below. The SSE loop and the ERMS path jump back to tail for the remainder;
// the AVX2 loops finish with their own overlapping tail stores.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
// AX = ptr
// BX = n
MOVQ AX, DI // DI = ptr
// AX becomes the zero value used by the MOVB/MOVW/MOVL/MOVQ stores and by STOSQ.
XORQ AX, AX
// MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
tail:
// Dispatch on the remaining byte count in BX. All comparisons are unsigned.
// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
TESTQ BX, BX
JEQ _0
CMPQ BX, $2
JBE _1or2
CMPQ BX, $4
JBE _3or4
// A single compare against 8 serves both the 5..7 case (below) and the exact-8 case (equal).
CMPQ BX, $8
JB _5through7
JE _8
CMPQ BX, $16
JBE _9through16
CMPQ BX, $32
JBE _17through32
CMPQ BX, $64
JBE _33through64
CMPQ BX, $128
JBE _65through128
CMPQ BX, $256
JBE _129through256
// From here on BX > 256: pick a bulk-clearing strategy.
CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
JNE skip_erms
// If the size is less than 2 KB, do not use ERMS as it has a big start-up cost.
// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
// in the Intel Optimization Guide shows better performance for ERMSB starting
// from 2 KB. Benchmarks show a similar threshold for REP STOS vs AVX.
CMPQ BX, $2048
JAE loop_preheader_erms
skip_erms:
// When hasAVX2 is defined (presumably by asm_amd64.h; confirm), the run-time
// AVX2 check and the SSE loop are compiled out and control falls straight
// through to loop_preheader_avx2.
#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JE loop_preheader_avx2
// TODO: for really big clears, use MOVNTDQ, even without AVX2.
loop:
// SSE loop: clear 256 bytes per iteration with sixteen unaligned 16-byte stores.
MOVOU X15, 0(DI)
MOVOU X15, 16(DI)
MOVOU X15, 32(DI)
MOVOU X15, 48(DI)
MOVOU X15, 64(DI)
MOVOU X15, 80(DI)
MOVOU X15, 96(DI)
MOVOU X15, 112(DI)
MOVOU X15, 128(DI)
MOVOU X15, 144(DI)
MOVOU X15, 160(DI)
MOVOU X15, 176(DI)
MOVOU X15, 192(DI)
MOVOU X15, 208(DI)
MOVOU X15, 224(DI)
MOVOU X15, 240(DI)
SUBQ $256, BX
ADDQ $256, DI
CMPQ BX, $256
JAE loop
// Fewer than 256 bytes (possibly zero) remain; tail dispatches on the remainder.
JMP tail
#endif
loop_preheader_avx2:
// Y0 = 0 (the VEX-encoded write to X0 also clears the upper half of Y0).
VPXOR X0, X0, X0
// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
// For larger sizes it is always faster, even on dual Xeons with 30M cache.
// TODO take into account actual LLC size. E.g. glibc uses LLC size/2.
// 0x2000000 bytes = 32 MiB.
CMPQ BX, $0x2000000
JAE loop_preheader_avx2_huge
loop_avx2:
// AVX2 loop: clear 128 bytes per iteration with four unaligned 32-byte stores.
VMOVDQU Y0, 0(DI)
VMOVDQU Y0, 32(DI)
VMOVDQU Y0, 64(DI)
VMOVDQU Y0, 96(DI)
SUBQ $128, BX
ADDQ $128, DI
CMPQ BX, $128
JAE loop_avx2
// Here BX < 128. Clear the 128 bytes that end at DI+BX; this overlaps bytes
// the loop already cleared, which is harmless, and it cannot reach below the
// start of the buffer because the loop ran at least twice (BX > 256 on entry).
VMOVDQU Y0, -32(DI)(BX*1)
VMOVDQU Y0, -64(DI)(BX*1)
VMOVDQU Y0, -96(DI)(BX*1)
VMOVDQU Y0, -128(DI)(BX*1)
// Zero the upper halves of the YMM registers before returning.
VZEROUPPER
RET
loop_preheader_erms:
// Reached only when ERMS is available and BX >= 2048.
// Without AVX2 go straight to REP STOSQ; the check is compiled out when
// hasAVX2 is defined.
#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JNE loop_erms
#endif
// Y0 = 0, needed only if the huge AVX2 path is taken below.
VPXOR X0, X0, X0
// At this point both ERMS and AVX2 are supported. While REP STOS can use a no-RFO
// write protocol, ERMS could show the same or slower performance compared to
// Non-Temporal Stores when the size is bigger than LLC depending on hardware.
CMPQ BX, $0x2000000
JAE loop_preheader_avx2_huge
loop_erms:
// STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
// to the memory subsystem, as the GC requires this.
// CX = number of whole quadwords, BX = leftover bytes (0..7).
MOVQ BX, CX
SHRQ $3, CX
ANDQ $7, BX
// Store AX (zero) to (DI) CX times, advancing DI by 8 per store.
// NOTE(review): this assumes the direction flag is clear on entry; confirm.
REP; STOSQ
// tail clears the 0..7 bytes that are left.
JMP tail
loop_preheader_avx2_huge:
// Align to 32 byte boundary.
// The unaligned store below covers every byte up to the next boundary, so DI
// can then be rounded up to the first 32-byte boundary strictly above its old
// value. SI = old DI - new DI (a value in -32..-1); adding it to BX removes
// the skipped bytes from the remaining count.
VMOVDQU Y0, 0(DI)
MOVQ DI, SI
ADDQ $32, DI
ANDQ $~31, DI
SUBQ DI, SI
ADDQ SI, BX
loop_avx2_huge:
// Non-temporal loop: clear 128 bytes per iteration with four 32-byte
// streaming stores to the now 32-byte-aligned DI.
VMOVNTDQ Y0, 0(DI)
VMOVNTDQ Y0, 32(DI)
VMOVNTDQ Y0, 64(DI)
VMOVNTDQ Y0, 96(DI)
SUBQ $128, BX
ADDQ $128, DI
CMPQ BX, $128
JAE loop_avx2_huge
// In the description of MOVNTDQ in [1]
// "... fencing operation implemented with the SFENCE or MFENCE instruction
// should be used in conjunction with MOVNTDQ instructions..."
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
SFENCE
// Here BX < 128. Clear the 128 bytes that end at DI+BX with ordinary
// unaligned stores; they may overlap bytes the loop already cleared.
VMOVDQU Y0, -32(DI)(BX*1)
VMOVDQU Y0, -64(DI)(BX*1)
VMOVDQU Y0, -96(DI)(BX*1)
VMOVDQU Y0, -128(DI)(BX*1)
// Zero the upper halves of the YMM registers before returning.
VZEROUPPER
RET
// Small-size cases. Each one stores from the start of the buffer and again so
// that the last store ends exactly at DI+BX; the two groups overlap whenever
// BX is below the top of the case's range.
_1or2:
// One byte at each end; for BX == 1 both stores hit the same byte.
MOVB AX, (DI)
MOVB AX, -1(DI)(BX*1)
RET
_0:
// Nothing to clear.
RET
_3or4:
// Two 2-byte stores: the first two and the last two bytes.
MOVW AX, (DI)
MOVW AX, -2(DI)(BX*1)
RET
_5through7:
// Two 4-byte stores: the first four and the last four bytes.
MOVL AX, (DI)
MOVL AX, -4(DI)(BX*1)
RET
_8:
// We need a separate case for 8 to make sure we clear pointers atomically.
MOVQ AX, (DI)
RET
_9through16:
// Two 8-byte stores: the first eight and the last eight bytes.
MOVQ AX, (DI)
MOVQ AX, -8(DI)(BX*1)
RET
_17through32:
// Two 16-byte stores: the first sixteen and the last sixteen bytes.
MOVOU X15, (DI)
MOVOU X15, -16(DI)(BX*1)
RET
_33through64:
// First 32 bytes and last 32 bytes.
MOVOU X15, (DI)
MOVOU X15, 16(DI)
MOVOU X15, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
RET
_65through128:
// First 64 bytes and last 64 bytes.
MOVOU X15, (DI)
MOVOU X15, 16(DI)
MOVOU X15, 32(DI)
MOVOU X15, 48(DI)
MOVOU X15, -64(DI)(BX*1)
MOVOU X15, -48(DI)(BX*1)
MOVOU X15, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
RET
_129through256:
// First 128 bytes and last 128 bytes.
MOVOU X15, (DI)
MOVOU X15, 16(DI)
MOVOU X15, 32(DI)
MOVOU X15, 48(DI)
MOVOU X15, 64(DI)
MOVOU X15, 80(DI)
MOVOU X15, 96(DI)
MOVOU X15, 112(DI)
MOVOU X15, -128(DI)(BX*1)
MOVOU X15, -112(DI)(BX*1)
MOVOU X15, -96(DI)(BX*1)
MOVOU X15, -80(DI)(BX*1)
MOVOU X15, -64(DI)(BX*1)
MOVOU X15, -48(DI)(BX*1)
MOVOU X15, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
RET
|