1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
|
// Copyright 2023 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// File: prefetch.h
// -----------------------------------------------------------------------------
//
// This header file defines prefetch functions to prefetch memory contents
// into the first level cache (L1) for the current CPU. The prefetch logic
// offered in this header is limited to prefetching first level cachelines
// only, and is aimed at relatively 'simple' prefetching logic.
//
#ifndef Y_ABSL_BASE_PREFETCH_H_
#define Y_ABSL_BASE_PREFETCH_H_
#include "y_absl/base/attributes.h"
#include "y_absl/base/config.h"
#if defined(Y_ABSL_INTERNAL_HAVE_SSE)
#include <xmmintrin.h>
#endif
#if defined(_MSC_VER)
#include <intrin.h>
#if defined(Y_ABSL_INTERNAL_HAVE_SSE)
#pragma intrinsic(_mm_prefetch)
#endif
#endif
namespace y_absl {
Y_ABSL_NAMESPACE_BEGIN
// Moves data into the L1 cache before it is read, or "prefetches" it.
//
// The value of `addr` is the address of the memory to prefetch. If
// the target and compiler support it, data prefetch instructions are
// generated. If the prefetch is done some time before the memory is
// read, it may be in the cache by the time the read occurs.
//
// This method prefetches data with the highest degree of temporal locality;
// data is prefetched where possible into all levels of the cache.
//
// Incorrect or gratuitous use of this function can degrade performance.
// Use this function only when representative benchmarks show an improvement.
//
// Example:
//
// // Computes incremental checksum for `data`.
// int ComputeChecksum(int sum, y_absl::string_view data);
//
// // Computes cumulative checksum for all values in `data`
// int ComputeChecksum(y_absl::Span<const TString> data) {
// int sum = 0;
// auto it = data.begin();
// auto pit = data.begin();
// auto end = data.end();
// for (int dist = 8; dist > 0 && pit != data.end(); --dist, ++pit) {
// y_absl::PrefetchToLocalCache(pit->data());
// }
// for (; pit != end; ++pit, ++it) {
// sum = ComputeChecksum(sum, *it);
// y_absl::PrefetchToLocalCache(pit->data());
// }
// for (; it != end; ++it) {
// sum = ComputeChecksum(sum, *it);
// }
// return sum;
// }
//
void PrefetchToLocalCache(const void* addr);
// Moves data into the L1 cache before it is read, or "prefetches" it.
//
// This function is identical to `PrefetchToLocalCache()` except that it has
// non-temporal locality: the fetched data should not be left in any of the
// cache tiers. This is useful for cases where the data is used only once /
// short term, for example, invoking a destructor on an object.
//
// Incorrect or gratuitous use of this function can degrade performance.
// Use this function only when representative benchmarks show an improvement.
//
// Example:
//
// template <typename Iterator>
// void DestroyPointers(Iterator begin, Iterator end) {
// size_t distance = std::min(8U, bars.size());
//
// int dist = 8;
// auto prefetch_it = begin;
// while (prefetch_it != end && --dist;) {
// y_absl::PrefetchToLocalCacheNta(*prefetch_it++);
// }
// while (prefetch_it != end) {
// delete *begin++;
// y_absl::PrefetchToLocalCacheNta(*prefetch_it++);
// }
// while (begin != end) {
// delete *begin++;
// }
// }
//
void PrefetchToLocalCacheNta(const void* addr);
// Moves data into the L1 cache with the intent to modify it.
//
// This function is similar to `PrefetchToLocalCache()` except that it
// prefetches cachelines with an 'intent to modify' This typically includes
// invalidating cache entries for this address in all other cache tiers, and an
// exclusive access intent.
//
// Incorrect or gratuitous use of this function can degrade performance. As this
// function can invalidate cached cachelines on other caches and computer cores,
// incorrect usage of this function can have an even greater negative impact
// than incorrect regular prefetches.
// Use this function only when representative benchmarks show an improvement.
//
// Example:
//
// void* Arena::Allocate(size_t size) {
// void* ptr = AllocateBlock(size);
// y_absl::PrefetchToLocalCacheForWrite(p);
// return ptr;
// }
//
void PrefetchToLocalCacheForWrite(const void* addr);
#if Y_ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
#define Y_ABSL_HAVE_PREFETCH 1
// See __builtin_prefetch:
// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html.
//
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache(
const void* addr) {
__builtin_prefetch(addr, 0, 3);
}
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta(
const void* addr) {
__builtin_prefetch(addr, 0, 0);
}
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite(
const void* addr) {
// [x86] gcc/clang don't generate PREFETCHW for __builtin_prefetch(.., 1)
// unless -march=broadwell or newer; this is not generally the default, so we
// manually emit prefetchw. PREFETCHW is recognized as a no-op on older Intel
// processors and has been present on AMD processors since the K6-2.
#if defined(__x86_64__) && !defined(__PRFCHW__)
asm("prefetchw %0" : : "m"(*reinterpret_cast<const char*>(addr)));
#else
__builtin_prefetch(addr, 1, 3);
#endif
}
#elif defined(Y_ABSL_INTERNAL_HAVE_SSE)
#define Y_ABSL_HAVE_PREFETCH 1
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache(
const void* addr) {
_mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
}
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta(
const void* addr) {
_mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_NTA);
}
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite(
const void* addr) {
#if defined(_MM_HINT_ET0)
_mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_ET0);
#elif !defined(_MSC_VER) && defined(__x86_64__)
// _MM_HINT_ET0 is not universally supported. As we commented further
// up, PREFETCHW is recognized as a no-op on older Intel processors
// and has been present on AMD processors since the K6-2. We have this
// disabled for MSVC compilers as this miscompiles on older MSVC compilers.
asm("prefetchw %0" : : "m"(*reinterpret_cast<const char*>(addr)));
#endif
}
#else
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache(
const void* addr) {}
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta(
const void* addr) {}
Y_ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite(
const void* addr) {}
#endif
Y_ABSL_NAMESPACE_END
} // namespace y_absl
#endif // Y_ABSL_BASE_PREFETCH_H_
|