1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
|
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/hpdata.h"
static int
hpdata_age_comp(const hpdata_t *a, const hpdata_t *b) {
uint64_t a_age = hpdata_age_get(a);
uint64_t b_age = hpdata_age_get(b);
/*
* hpdata ages are operation counts in the psset; no two should be the
* same.
*/
assert(a_age != b_age);
return (a_age > b_age) - (a_age < b_age);
}
ph_gen(, hpdata_age_heap, hpdata_t, age_link, hpdata_age_comp)
void
hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) {
hpdata_addr_set(hpdata, addr);
hpdata_age_set(hpdata, age);
hpdata->h_huge = false;
hpdata->h_alloc_allowed = true;
hpdata->h_in_psset_alloc_container = false;
hpdata->h_purge_allowed = false;
hpdata->h_hugify_allowed = false;
hpdata->h_in_psset_hugify_container = false;
hpdata->h_mid_purge = false;
hpdata->h_mid_hugify = false;
hpdata->h_updating = false;
hpdata->h_in_psset = false;
hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES);
hpdata->h_nactive = 0;
fb_init(hpdata->active_pages, HUGEPAGE_PAGES);
hpdata->h_ntouched = 0;
fb_init(hpdata->touched_pages, HUGEPAGE_PAGES);
hpdata_assert_consistent(hpdata);
}
void *
hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) {
hpdata_assert_consistent(hpdata);
/*
* This is a metadata change; the hpdata should therefore either not be
* in the psset, or should have explicitly marked itself as being
* mid-update.
*/
assert(!hpdata->h_in_psset || hpdata->h_updating);
assert(hpdata->h_alloc_allowed);
assert((sz & PAGE_MASK) == 0);
size_t npages = sz >> LG_PAGE;
assert(npages <= hpdata_longest_free_range_get(hpdata));
size_t result;
size_t start = 0;
/*
* These are dead stores, but the compiler will issue warnings on them
* since it can't tell statically that found is always true below.
*/
size_t begin = 0;
size_t len = 0;
size_t largest_unchosen_range = 0;
while (true) {
bool found = fb_urange_iter(hpdata->active_pages,
HUGEPAGE_PAGES, start, &begin, &len);
/*
* A precondition to this function is that hpdata must be able
* to serve the allocation.
*/
assert(found);
assert(len <= hpdata_longest_free_range_get(hpdata));
if (len >= npages) {
/*
* We use first-fit within the page slabs; this gives
* bounded worst-case fragmentation within a slab. It's
* not necessarily right; we could experiment with
* various other options.
*/
break;
}
if (len > largest_unchosen_range) {
largest_unchosen_range = len;
}
start = begin + len;
}
/* We found a range; remember it. */
result = begin;
fb_set_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages);
hpdata->h_nactive += npages;
/*
* We might be about to dirty some memory for the first time; update our
* count if so.
*/
size_t new_dirty = fb_ucount(hpdata->touched_pages, HUGEPAGE_PAGES,
result, npages);
fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, result, npages);
hpdata->h_ntouched += new_dirty;
/*
* If we allocated out of a range that was the longest in the hpdata, it
* might be the only one of that size and we'll have to adjust the
* metadata.
*/
if (len == hpdata_longest_free_range_get(hpdata)) {
start = begin + npages;
while (start < HUGEPAGE_PAGES) {
bool found = fb_urange_iter(hpdata->active_pages,
HUGEPAGE_PAGES, start, &begin, &len);
if (!found) {
break;
}
assert(len <= hpdata_longest_free_range_get(hpdata));
if (len == hpdata_longest_free_range_get(hpdata)) {
largest_unchosen_range = len;
break;
}
if (len > largest_unchosen_range) {
largest_unchosen_range = len;
}
start = begin + len;
}
hpdata_longest_free_range_set(hpdata, largest_unchosen_range);
}
hpdata_assert_consistent(hpdata);
return (void *)(
(uintptr_t)hpdata_addr_get(hpdata) + (result << LG_PAGE));
}
void
hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) {
hpdata_assert_consistent(hpdata);
/* See the comment in reserve. */
assert(!hpdata->h_in_psset || hpdata->h_updating);
assert(((uintptr_t)addr & PAGE_MASK) == 0);
assert((sz & PAGE_MASK) == 0);
size_t begin = ((uintptr_t)addr - (uintptr_t)hpdata_addr_get(hpdata))
>> LG_PAGE;
assert(begin < HUGEPAGE_PAGES);
size_t npages = sz >> LG_PAGE;
size_t old_longest_range = hpdata_longest_free_range_get(hpdata);
fb_unset_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages);
/* We might have just created a new, larger range. */
size_t new_begin = (fb_fls(hpdata->active_pages, HUGEPAGE_PAGES,
begin) + 1);
size_t new_end = fb_ffs(hpdata->active_pages, HUGEPAGE_PAGES,
begin + npages - 1);
size_t new_range_len = new_end - new_begin;
if (new_range_len > old_longest_range) {
hpdata_longest_free_range_set(hpdata, new_range_len);
}
hpdata->h_nactive -= npages;
hpdata_assert_consistent(hpdata);
}
size_t
hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
hpdata_assert_consistent(hpdata);
/*
* See the comment below; we might purge any inactive extent, so it's
* unsafe for any other thread to turn any inactive extent active while
* we're operating on it.
*/
assert(!hpdata_alloc_allowed_get(hpdata));
purge_state->npurged = 0;
purge_state->next_purge_search_begin = 0;
/*
* Initialize to_purge.
*
* It's possible to end up in situations where two dirty extents are
* separated by a retained extent:
* - 1 page allocated.
* - 1 page allocated.
* - 1 pages allocated.
*
* If the middle page is freed and purged, and then the first and third
* pages are freed, and then another purge pass happens, the hpdata
* looks like this:
* - 1 page dirty.
* - 1 page retained.
* - 1 page dirty.
*
* But it's safe to do a single 3-page purge.
*
* We do this by first computing the dirty pages, and then filling in
* any gaps by extending each range in the dirty bitmap to extend until
* the next active page. This purges more pages, but the expensive part
* of purging is the TLB shootdowns, rather than the kernel state
* tracking; doing a little bit more of the latter is fine if it saves
* us from doing some of the former.
*/
/*
* The dirty pages are those that are touched but not active. Note that
* in a normal-ish case, HUGEPAGE_PAGES is something like 512 and the
* fb_group_t is 64 bits, so this is 64 bytes, spread across 8
* fb_group_ts.
*/
fb_group_t dirty_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
fb_init(dirty_pages, HUGEPAGE_PAGES);
fb_bit_not(dirty_pages, hpdata->active_pages, HUGEPAGE_PAGES);
fb_bit_and(dirty_pages, dirty_pages, hpdata->touched_pages,
HUGEPAGE_PAGES);
fb_init(purge_state->to_purge, HUGEPAGE_PAGES);
size_t next_bit = 0;
while (next_bit < HUGEPAGE_PAGES) {
size_t next_dirty = fb_ffs(dirty_pages, HUGEPAGE_PAGES,
next_bit);
/* Recall that fb_ffs returns nbits if no set bit is found. */
if (next_dirty == HUGEPAGE_PAGES) {
break;
}
size_t next_active = fb_ffs(hpdata->active_pages,
HUGEPAGE_PAGES, next_dirty);
/*
* Don't purge past the end of the dirty extent, into retained
* pages. This helps the kernel a tiny bit, but honestly it's
* mostly helpful for testing (where we tend to write test cases
* that think in terms of the dirty ranges).
*/
ssize_t last_dirty = fb_fls(dirty_pages, HUGEPAGE_PAGES,
next_active - 1);
assert(last_dirty >= 0);
assert((size_t)last_dirty >= next_dirty);
assert((size_t)last_dirty - next_dirty + 1 <= HUGEPAGE_PAGES);
fb_set_range(purge_state->to_purge, HUGEPAGE_PAGES, next_dirty,
last_dirty - next_dirty + 1);
next_bit = next_active + 1;
}
/* We should purge, at least, everything dirty. */
size_t ndirty = hpdata->h_ntouched - hpdata->h_nactive;
purge_state->ndirty_to_purge = ndirty;
assert(ndirty <= fb_scount(
purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
assert(ndirty == fb_scount(dirty_pages, HUGEPAGE_PAGES, 0,
HUGEPAGE_PAGES));
hpdata_assert_consistent(hpdata);
return ndirty;
}
bool
hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
void **r_purge_addr, size_t *r_purge_size) {
/*
* Note that we don't have a consistency check here; we're accessing
* hpdata without synchronization, and therefore have no right to expect
* a consistent state.
*/
assert(!hpdata_alloc_allowed_get(hpdata));
if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) {
return false;
}
size_t purge_begin;
size_t purge_len;
bool found_range = fb_srange_iter(purge_state->to_purge, HUGEPAGE_PAGES,
purge_state->next_purge_search_begin, &purge_begin, &purge_len);
if (!found_range) {
return false;
}
*r_purge_addr = (void *)(
(uintptr_t)hpdata_addr_get(hpdata) + purge_begin * PAGE);
*r_purge_size = purge_len * PAGE;
purge_state->next_purge_search_begin = purge_begin + purge_len;
purge_state->npurged += purge_len;
assert(purge_state->npurged <= HUGEPAGE_PAGES);
return true;
}
void
hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
assert(!hpdata_alloc_allowed_get(hpdata));
hpdata_assert_consistent(hpdata);
/* See the comment in reserve. */
assert(!hpdata->h_in_psset || hpdata->h_updating);
assert(purge_state->npurged == fb_scount(purge_state->to_purge,
HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
assert(purge_state->npurged >= purge_state->ndirty_to_purge);
fb_bit_not(purge_state->to_purge, purge_state->to_purge,
HUGEPAGE_PAGES);
fb_bit_and(hpdata->touched_pages, hpdata->touched_pages,
purge_state->to_purge, HUGEPAGE_PAGES);
assert(hpdata->h_ntouched >= purge_state->ndirty_to_purge);
hpdata->h_ntouched -= purge_state->ndirty_to_purge;
hpdata_assert_consistent(hpdata);
}
void
hpdata_hugify(hpdata_t *hpdata) {
hpdata_assert_consistent(hpdata);
hpdata->h_huge = true;
fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
hpdata->h_ntouched = HUGEPAGE_PAGES;
hpdata_assert_consistent(hpdata);
}
void
hpdata_dehugify(hpdata_t *hpdata) {
hpdata_assert_consistent(hpdata);
hpdata->h_huge = false;
hpdata_assert_consistent(hpdata);
}
|