GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/jemalloc/src/hpa.c
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/hpa.h"

#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/witness.h"

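/*
 * The central allocator grows its "eden" address-space reservation in chunks
 * of this many bytes, and then carves eden into hugepage-sized pageslabs on
 * demand (see hpa_central_extract below).
 */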
#define HPA_EDEN_SIZE (128 * HUGEPAGE)

static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated);
static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);
static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
    edata_list_active_t *list, bool *deferred_work_generated);
static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);

bool
hpa_supported() {
#ifdef _WIN32
    /*
     * At least until the API and implementation is somewhat settled, we
     * don't want to try to debug the VM subsystem on the hardest-to-test
     * platform.
     */
    return false;
#endif
    if (!pages_can_hugify) {
        return false;
    }
    /*
     * We fundamentally rely on an address-space-hungry growth strategy for
     * hugepages.
     */
    if (LG_SIZEOF_PTR != 3) {
        return false;
    }
    /*
     * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
     * this sentinel value -- see the comment in pages.h.
     */
    if (HUGEPAGE_PAGES == 1) {
        return false;
    }
    return true;
}

static void
hpa_do_consistency_checks(hpa_shard_t *shard) {
    assert(shard->base != NULL);
}

bool
hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) {
    /* malloc_conf processing should have filtered out these cases. */
    assert(hpa_supported());
    bool err;
    err = malloc_mutex_init(&central->grow_mtx, "hpa_central_grow",
        WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive);
    if (err) {
        return true;
    }
    err = malloc_mutex_init(&central->mtx, "hpa_central",
        WITNESS_RANK_HPA_CENTRAL, malloc_mutex_rank_exclusive);
    if (err) {
        return true;
    }
    central->base = base;
    central->eden = NULL;
    central->eden_len = 0;
    central->age_counter = 0;
    central->hooks = *hooks;
    return false;
}

static hpdata_t *
hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) {
    return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t),
        CACHELINE);
}

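/*
 * Hands out a single hugepage-sized pageslab, carving it off of eden and
 * mapping a fresh HPA_EDEN_SIZE region first if eden is empty. *oom is set
 * only when metadata or address-space allocation fails; the caller is
 * expected to hold its shard's grow_mtx (checked via the witness below).
 */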
hpdata_t *
hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
    bool *oom) {
    /* Don't yet support big allocations; these should get filtered out. */
    assert(size <= HUGEPAGE);
    /*
     * Should only try to extract from the central allocator if the local
     * shard is exhausted. We should hold the grow_mtx on that shard.
     */
    witness_assert_positive_depth_to_rank(
        tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW);

    malloc_mutex_lock(tsdn, &central->grow_mtx);
    *oom = false;

    hpdata_t *ps = NULL;

    /* Is eden a perfect fit? */
    if (central->eden != NULL && central->eden_len == HUGEPAGE) {
        ps = hpa_alloc_ps(tsdn, central);
        if (ps == NULL) {
            *oom = true;
            malloc_mutex_unlock(tsdn, &central->grow_mtx);
            return NULL;
        }
        hpdata_init(ps, central->eden, central->age_counter++);
        central->eden = NULL;
        central->eden_len = 0;
        malloc_mutex_unlock(tsdn, &central->grow_mtx);
        return ps;
    }

    /*
     * We're about to try to allocate from eden by splitting. If eden is
     * NULL, we have to allocate it too. Otherwise, we just have to
     * allocate an edata_t for the new psset.
     */
    if (central->eden == NULL) {
        /*
         * During development, we're primarily concerned with systems
         * with overcommit. Eventually, we should be more careful here.
         */
        bool commit = true;
        /* Allocate address space, bailing if we fail. */
        void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE,
            &commit);
        if (new_eden == NULL) {
            *oom = true;
            malloc_mutex_unlock(tsdn, &central->grow_mtx);
            return NULL;
        }
        ps = hpa_alloc_ps(tsdn, central);
        if (ps == NULL) {
            pages_unmap(new_eden, HPA_EDEN_SIZE);
            *oom = true;
            malloc_mutex_unlock(tsdn, &central->grow_mtx);
            return NULL;
        }
        central->eden = new_eden;
        central->eden_len = HPA_EDEN_SIZE;
    } else {
        /* Eden is already nonempty; only need an edata for ps. */
        ps = hpa_alloc_ps(tsdn, central);
        if (ps == NULL) {
            *oom = true;
            malloc_mutex_unlock(tsdn, &central->grow_mtx);
            return NULL;
        }
    }
    assert(ps != NULL);
    assert(central->eden != NULL);
    assert(central->eden_len > HUGEPAGE);
    assert(central->eden_len % HUGEPAGE == 0);
    assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden);

    hpdata_init(ps, central->eden, central->age_counter++);

    char *eden_char = (char *)central->eden;
    eden_char += HUGEPAGE;
    central->eden = (void *)eden_char;
    central->eden_len -= HUGEPAGE;

    malloc_mutex_unlock(tsdn, &central->grow_mtx);

    return ps;
}

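/*
 * Initializes a shard that draws pageslabs from central and extent metadata
 * from edata_cache. Returns true on error, per the usual jemalloc init
 * convention.
 */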
bool
hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
    base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts) {
    /* malloc_conf processing should have filtered out these cases. */
    assert(hpa_supported());
    bool err;
    err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
        WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
    if (err) {
        return true;
    }
    err = malloc_mutex_init(&shard->mtx, "hpa_shard",
        WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
    if (err) {
        return true;
    }

    assert(edata_cache != NULL);
    shard->central = central;
    shard->base = base;
    edata_cache_fast_init(&shard->ecf, edata_cache);
    psset_init(&shard->psset);
    shard->age_counter = 0;
    shard->ind = ind;
    shard->emap = emap;

    shard->opts = *opts;

    shard->npending_purge = 0;
    nstime_init_zero(&shard->last_purge);

    shard->stats.npurge_passes = 0;
    shard->stats.npurges = 0;
    shard->stats.nhugifies = 0;
    shard->stats.ndehugifies = 0;

    /*
     * Fill these in last, so that if an hpa_shard gets used despite
     * initialization failing, we'll at least crash instead of just
     * operating on corrupted data.
     */
    shard->pai.alloc = &hpa_alloc;
    shard->pai.alloc_batch = &hpa_alloc_batch;
    shard->pai.expand = &hpa_expand;
    shard->pai.shrink = &hpa_shrink;
    shard->pai.dalloc = &hpa_dalloc;
    shard->pai.dalloc_batch = &hpa_dalloc_batch;
    shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;

    hpa_do_consistency_checks(shard);

    return false;
}

/*
 * Note that the stats functions here follow the usual stats naming conventions;
 * "merge" obtains the stats from some live instance of an object, while "accum"
 * only combines the stats from one stats object into another. Hence the lack of
 * locking here.
 */
static void
hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
    hpa_shard_nonderived_stats_t *src) {
    dst->npurge_passes += src->npurge_passes;
    dst->npurges += src->npurges;
    dst->nhugifies += src->nhugifies;
    dst->ndehugifies += src->ndehugifies;
}

void
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
    psset_stats_accum(&dst->psset_stats, &src->psset_stats);
    hpa_shard_nonderived_stats_accum(&dst->nonderived_stats,
        &src->nonderived_stats);
}

void
hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_shard_stats_t *dst) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_lock(tsdn, &shard->grow_mtx);
    malloc_mutex_lock(tsdn, &shard->mtx);
    psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
    hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
    malloc_mutex_unlock(tsdn, &shard->mtx);
    malloc_mutex_unlock(tsdn, &shard->grow_mtx);
}

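/*
 * A pageslab becomes a hugification candidate once the bytes it has active
 * reach the configured hugification threshold.
 */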
static bool
hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
    /*
     * Note that this needs to be >= rather than just >, because of the
     * important special case in which the hugification threshold is exactly
     * HUGEPAGE.
     */
    return hpdata_nactive_get(ps) * PAGE
        >= shard->opts.hugification_threshold;
}

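/*
 * Dirty-page accounting: pages already handed to an in-flight purge are
 * subtracted out, and the allowed maximum is dirty_mult (a fixed-point
 * fraction) applied to the number of active pages, with (fxp_t)-1 meaning
 * "no limit".
 */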
static size_t
hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    return psset_ndirty(&shard->psset) - shard->npending_purge;
}

static size_t
hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    if (shard->opts.dirty_mult == (fxp_t)-1) {
        return (size_t)-1;
    }
    return fxp_mul_frac(psset_nactive(&shard->psset),
        shard->opts.dirty_mult);
}

static bool
hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
    if (to_hugify == NULL) {
        return false;
    }
    return hpa_adjusted_ndirty(tsdn, shard)
        + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard);
}

static bool
hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
        return true;
    }
    if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
        return true;
    }
    return false;
}

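/*
 * Recomputes whether ps may be purged or hugified after its contents have
 * changed; called with the shard mutex held whenever pages are reserved or
 * unreserved, or a purge/hugify pass completes.
 */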
static void
hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard,
    hpdata_t *ps) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    if (hpdata_changing_state_get(ps)) {
        hpdata_purge_allowed_set(ps, false);
        hpdata_disallow_hugify(ps);
        return;
    }
    /*
     * Hugepages are distinctly costly to purge, so try to avoid it unless
     * they're *particularly* full of dirty pages. Eventually, we should
     * use a smarter / more dynamic heuristic for situations where we have
     * to manually hugify.
     *
     * In situations where we don't manually hugify, this problem is
     * reduced. The "bad" situation we're trying to avoid is one that's
     * common in some Linux configurations (where both enabled and defrag
     * are set to madvise) that can lead to long latency spikes on the first
     * access after a hugification. The ideal policy in such configurations
     * is probably time-based for both purging and hugifying; only hugify a
     * hugepage if it's met the criteria for some extended period of time,
     * and only dehugify it if it's failed to meet the criteria for an
     * extended period of time. When background threads are on, we should
     * try to take this hit on one of them, as well.
     *
     * I think the ideal setting is THP always enabled, and defrag set to
     * deferred; in that case we don't need any explicit calls on the
     * allocator's end at all; we just try to pack allocations in a
     * hugepage-friendly manner and let the OS hugify in the background.
     */
    hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0);
    if (hpa_good_hugification_candidate(shard, ps)
        && !hpdata_huge_get(ps)) {
        nstime_t now;
        shard->central->hooks.curtime(&now, /* first_reading */ true);
        hpdata_allow_hugify(ps, now);
    }
    /*
     * Once a hugepage has become eligible for hugification, we don't mark
     * it as ineligible just because it stops meeting the criteria (this
     * could lead to situations where a hugepage that spends most of its
     * time meeting the criteria never quite gets hugified if there are
     * intervening deallocations). The idea is that the hugification delay
     * will allow them to get purged, resetting their "hugify-allowed" bit.
     * If they don't get purged, then the hugification isn't hurting and
     * might help. As an exception, we don't hugify hugepages that are now
     * empty; it definitely doesn't help there until the hugepage gets
     * reused, which is likely not for a while.
     */
    if (hpdata_nactive_get(ps) == 0) {
        hpdata_disallow_hugify(ps);
    }
}

static bool
hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
    return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}

/* Returns whether or not we purged anything. */
static bool
hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);

    hpdata_t *to_purge = psset_pick_purge(&shard->psset);
    if (to_purge == NULL) {
        return false;
    }
    assert(hpdata_purge_allowed_get(to_purge));
    assert(!hpdata_changing_state_get(to_purge));

    /*
     * Don't let anyone else purge or hugify this page while
     * we're purging it (allocations and deallocations are
     * OK).
     */
    psset_update_begin(&shard->psset, to_purge);
    assert(hpdata_alloc_allowed_get(to_purge));
    hpdata_mid_purge_set(to_purge, true);
    hpdata_purge_allowed_set(to_purge, false);
    hpdata_disallow_hugify(to_purge);
    /*
     * Unlike with hugification (where concurrent
     * allocations are allowed), concurrent allocation out
     * of a hugepage being purged is unsafe; we might hand
     * out an extent for an allocation and then purge it
     * (clearing out user data).
     */
    hpdata_alloc_allowed_set(to_purge, false);
    psset_update_end(&shard->psset, to_purge);

    /* Gather all the metadata we'll need during the purge. */
    bool dehugify = hpdata_huge_get(to_purge);
    hpdata_purge_state_t purge_state;
    size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state);

    shard->npending_purge += num_to_purge;

    malloc_mutex_unlock(tsdn, &shard->mtx);

    /* Actually do the purging, now that the lock is dropped. */
    if (dehugify) {
        shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
            HUGEPAGE);
    }
    size_t total_purged = 0;
    uint64_t purges_this_pass = 0;
    void *purge_addr;
    size_t purge_size;
    while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
        &purge_size)) {
        total_purged += purge_size;
        assert(total_purged <= HUGEPAGE);
        purges_this_pass++;
        shard->central->hooks.purge(purge_addr, purge_size);
    }

    malloc_mutex_lock(tsdn, &shard->mtx);
    /* The shard updates */
    shard->npending_purge -= num_to_purge;
    shard->stats.npurge_passes++;
    shard->stats.npurges += purges_this_pass;
    shard->central->hooks.curtime(&shard->last_purge,
        /* first_reading */ false);
    if (dehugify) {
        shard->stats.ndehugifies++;
    }

    /* The hpdata updates. */
    psset_update_begin(&shard->psset, to_purge);
    if (dehugify) {
        hpdata_dehugify(to_purge);
    }
    hpdata_purge_end(to_purge, &purge_state);
    hpdata_mid_purge_set(to_purge, false);

    hpdata_alloc_allowed_set(to_purge, true);
    hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge);

    psset_update_end(&shard->psset, to_purge);

    return true;
}

/* Returns whether or not we hugified anything. */
static bool
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);

    if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
        return false;
    }

    hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
    if (to_hugify == NULL) {
        return false;
    }
    assert(hpdata_hugify_allowed_get(to_hugify));
    assert(!hpdata_changing_state_get(to_hugify));

    /* Make sure that it's been hugifiable for long enough. */
    nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
    uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
    if (millis < shard->opts.hugify_delay_ms) {
        return false;
    }

    /*
     * Don't let anyone else purge or hugify this page while
     * we're hugifying it (allocations and deallocations are
     * OK).
     */
    psset_update_begin(&shard->psset, to_hugify);
    hpdata_mid_hugify_set(to_hugify, true);
    hpdata_purge_allowed_set(to_hugify, false);
    hpdata_disallow_hugify(to_hugify);
    assert(hpdata_alloc_allowed_get(to_hugify));
    psset_update_end(&shard->psset, to_hugify);

    malloc_mutex_unlock(tsdn, &shard->mtx);

    shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);

    malloc_mutex_lock(tsdn, &shard->mtx);
    shard->stats.nhugifies++;

    psset_update_begin(&shard->psset, to_hugify);
    hpdata_hugify(to_hugify);
    hpdata_mid_hugify_set(to_hugify, false);
    hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
    psset_update_end(&shard->psset, to_hugify);

    return true;
}

/*
 * Execution of deferred work is forced if it's triggered by an explicit
 * hpa_shard_do_deferred_work() call.
 */
static void
hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
    bool forced) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    if (!forced && shard->opts.deferral_allowed) {
        return;
    }
    /*
     * If we're on a background thread, do work so long as there's work to
     * be done. Otherwise, bound latency to not be *too* bad by doing at
     * most a small fixed number of operations.
     */
    bool hugified = false;
    bool purged = false;
    size_t max_ops = (forced ? (size_t)-1 : 16);
    size_t nops = 0;
    do {
        /*
         * Always purge before hugifying, to make sure we get some
         * ability to hit our quiescence targets.
         */
        purged = false;
        while (hpa_should_purge(tsdn, shard) && nops < max_ops) {
            purged = hpa_try_purge(tsdn, shard);
            if (purged) {
                nops++;
            }
        }
        hugified = hpa_try_hugify(tsdn, shard);
        if (hugified) {
            nops++;
        }
        malloc_mutex_assert_owner(tsdn, &shard->mtx);
    } while ((hugified || purged) && nops < max_ops);
}

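/*
 * Tries to serve one size-byte extent from an existing pageslab. *oom is set
 * only on metadata failures (edata cache exhaustion or emap registration);
 * simply having no suitable pageslab is reported as NULL without *oom.
 */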
static edata_t *
hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom) {
    bool err;
    edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
    if (edata == NULL) {
        *oom = true;
        return NULL;
    }

    hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
    if (ps == NULL) {
        edata_cache_fast_put(tsdn, &shard->ecf, edata);
        return NULL;
    }

    psset_update_begin(&shard->psset, ps);

    if (hpdata_empty(ps)) {
        /*
         * If the pageslab used to be empty, treat it as though it's
         * brand new for fragmentation-avoidance purposes; what we're
         * trying to approximate is the age of the allocations *in* that
         * pageslab, and the allocations in the new pageslab are
         * definitionally the youngest in this hpa shard.
         */
        hpdata_age_set(ps, shard->age_counter++);
    }

    void *addr = hpdata_reserve_alloc(ps, size);
    edata_init(edata, shard->ind, addr, size, /* slab */ false,
        SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active,
        /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
        EXTENT_NOT_HEAD);
    edata_ps_set(edata, ps);

    /*
     * This could theoretically be moved outside of the critical section,
     * but that introduces the potential for a race. Without the lock, the
     * (initially nonempty, since this is the reuse pathway) pageslab we
     * allocated out of could become otherwise empty while the lock is
     * dropped. This would force us to deal with a pageslab eviction down
     * the error pathway, which is a pain.
     */
    err = emap_register_boundary(tsdn, shard->emap, edata,
        SC_NSIZES, /* slab */ false);
    if (err) {
        hpdata_unreserve(ps, edata_addr_get(edata),
            edata_size_get(edata));
        /*
         * We should arguably reset dirty state here, but this would
         * require some sort of prepare + commit functionality that's a
         * little much to deal with for now.
         *
         * We don't have a do_deferred_work down this pathway, on the
         * principle that we didn't *really* affect shard state (we
         * tweaked the stats, but our tweaks weren't really accurate).
         */
        psset_update_end(&shard->psset, ps);
        edata_cache_fast_put(tsdn, &shard->ecf, edata);
        *oom = true;
        return NULL;
    }

    hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
    psset_update_end(&shard->psset, ps);
    return edata;
}

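/*
 * Serves as many of the nallocs requests as possible from pageslabs the shard
 * already owns, without growing; stops at the first failure and reports
 * whether deferred work remains.
 */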
static size_t
hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom, size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
    malloc_mutex_lock(tsdn, &shard->mtx);
    size_t nsuccess = 0;
    for (; nsuccess < nallocs; nsuccess++) {
        edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
            oom);
        if (edata == NULL) {
            break;
        }
        edata_list_active_append(results, edata);
    }

    hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
    *deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
    malloc_mutex_unlock(tsdn, &shard->mtx);
    return nsuccess;
}

static size_t
hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
    assert(size <= shard->opts.slab_max_alloc);
    bool oom = false;

    size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
        nallocs, results, deferred_work_generated);

    if (nsuccess == nallocs || oom) {
        return nsuccess;
    }

    /*
     * We didn't OOM, but weren't able to fill everything requested of us;
     * try to grow.
     */
    malloc_mutex_lock(tsdn, &shard->grow_mtx);
    /*
     * Check for grow races; maybe some earlier thread expanded the psset
     * in between when we dropped the main mutex and grabbed the grow mutex.
     */
    nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
        nallocs - nsuccess, results, deferred_work_generated);
    if (nsuccess == nallocs || oom) {
        malloc_mutex_unlock(tsdn, &shard->grow_mtx);
        return nsuccess;
    }

    /*
     * Note that we don't hold shard->mtx here (while growing);
     * deallocations (and allocations of smaller sizes) may still succeed
     * while we're doing this potentially expensive system call.
     */
    hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom);
    if (ps == NULL) {
        malloc_mutex_unlock(tsdn, &shard->grow_mtx);
        return nsuccess;
    }

    /*
     * We got the pageslab; allocate from it. This does an unlock followed
     * by a lock on the same mutex, and holds the grow mutex while doing
     * deferred work, but this is an uncommon path; the simplicity is worth
     * it.
     */
    malloc_mutex_lock(tsdn, &shard->mtx);
    psset_insert(&shard->psset, ps);
    malloc_mutex_unlock(tsdn, &shard->mtx);

    nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
        nallocs - nsuccess, results, deferred_work_generated);
    /*
     * Drop grow_mtx before doing deferred work; other threads blocked on it
     * should be allowed to proceed while we're working.
     */
    malloc_mutex_unlock(tsdn, &shard->grow_mtx);

    return nsuccess;
}

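/*
 * The shard embeds its pai_t, and the callbacks below are installed in
 * hpa_shard_init, so the pai pointer handed to them can be converted back to
 * the owning hpa_shard_t; the cast relies on the pai_t living at the start of
 * hpa_shard_t.
 */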
static hpa_shard_t *
hpa_from_pai(pai_t *self) {
    assert(self->alloc == &hpa_alloc);
    assert(self->expand == &hpa_expand);
    assert(self->shrink == &hpa_shrink);
    assert(self->dalloc == &hpa_dalloc);
    return (hpa_shard_t *)self;
}

static size_t
hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
    edata_list_active_t *results, bool *deferred_work_generated) {
    assert(nallocs > 0);
    assert((size & PAGE_MASK) == 0);
    witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
        WITNESS_RANK_CORE, 0);
    hpa_shard_t *shard = hpa_from_pai(self);

    if (size > shard->opts.slab_max_alloc) {
        return 0;
    }

    size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
        results, deferred_work_generated);

    witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
        WITNESS_RANK_CORE, 0);

    /*
     * Guard the sanity checks with config_debug because the loop cannot be
     * proven non-circular by the compiler, even if everything within the
     * loop is optimized away.
     */
    if (config_debug) {
        edata_t *edata;
        ql_foreach(edata, &results->head, ql_link_active) {
            emap_assert_mapped(tsdn, shard->emap, edata);
            assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
            assert(edata_state_get(edata) == extent_state_active);
            assert(edata_arena_ind_get(edata) == shard->ind);
            assert(edata_szind_get_maybe_invalid(edata) ==
                SC_NSIZES);
            assert(!edata_slab_get(edata));
            assert(edata_committed_get(edata));
            assert(edata_base_get(edata) == edata_addr_get(edata));
            assert(edata_base_get(edata) != NULL);
        }
    }
    return nsuccess;
}

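/*
 * Entry point for single allocations through the pai vtable installed in
 * hpa_shard_init. A minimal sketch of a call (caller-side names here are
 * hypothetical):
 *
 *     bool deferred = false;
 *     edata_t *e = shard->pai.alloc(tsdn, &shard->pai, 2 * PAGE, PAGE,
 *         false, false, false, &deferred);
 *
 * Requests with alignment > PAGE or zero == true are declined (NULL) rather
 * than handled here.
 */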
static edata_t *
hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
    assert((size & PAGE_MASK) == 0);
    assert(!guarded);
    witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
        WITNESS_RANK_CORE, 0);

    /* We don't handle alignment or zeroing for now. */
    if (alignment > PAGE || zero) {
        return NULL;
    }
    /*
     * An alloc with alignment == PAGE and zero == false is equivalent to a
     * batch alloc of 1. Just do that, so we can share code.
     */
    edata_list_active_t results;
    edata_list_active_init(&results);
    size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1,
        &results, deferred_work_generated);
    assert(nallocs == 0 || nallocs == 1);
    edata_t *edata = edata_list_active_first(&results);
    return edata;
}

static bool
hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
    /* Expand not yet supported. */
    return true;
}

static bool
hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated) {
    /* Shrink not yet supported. */
    return true;
}

static void
hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
    malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

    assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
    assert(edata_state_get(edata) == extent_state_active);
    assert(edata_arena_ind_get(edata) == shard->ind);
    assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
    assert(edata_committed_get(edata));
    assert(edata_base_get(edata) != NULL);

    /*
     * Another thread shouldn't be trying to touch the metadata of an
     * allocation being freed. The one exception is a merge attempt from a
     * lower-addressed PAC extent; in this case we have a nominal race on
     * the edata metadata bits, but in practice the fact that the PAI bits
     * are different will prevent any further access. The race is bad, but
     * benign in practice, and the long term plan is to track enough state
     * in the rtree to prevent these merge attempts in the first place.
     */
    edata_addr_set(edata, edata_base_get(edata));
    edata_zeroed_set(edata, false);
    emap_deregister_boundary(tsdn, shard->emap, edata);
}

static void
hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);

    /*
     * Release the metadata early, to avoid having to remember to do it
     * while we're also doing tricky purging logic. First, we need to grab
     * a few bits of metadata from it.
     *
     * Note that the shard mutex protects ps's metadata too; it wouldn't be
     * correct to try to read most information out of it without the lock.
     */
    hpdata_t *ps = edata_ps_get(edata);
    /* Currently, all edatas come from pageslabs. */
    assert(ps != NULL);
    void *unreserve_addr = edata_addr_get(edata);
    size_t unreserve_size = edata_size_get(edata);
    edata_cache_fast_put(tsdn, &shard->ecf, edata);

    psset_update_begin(&shard->psset, ps);
    hpdata_unreserve(ps, unreserve_addr, unreserve_size);
    hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
    psset_update_end(&shard->psset, ps);
}

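/*
 * Deallocation is split into two phases: emap boundary deregistration happens
 * for every extent before the shard mutex is taken, and the actual
 * unreserving (plus any resulting deferred work) happens under the mutex.
 */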
static void
hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
    bool *deferred_work_generated) {
    hpa_shard_t *shard = hpa_from_pai(self);

    edata_t *edata;
    ql_foreach(edata, &list->head, ql_link_active) {
        hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
    }

    malloc_mutex_lock(tsdn, &shard->mtx);
    /* Now, remove from the list. */
    while ((edata = edata_list_active_first(list)) != NULL) {
        edata_list_active_remove(list, edata);
        hpa_dalloc_locked(tsdn, shard, edata);
    }
    hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
    *deferred_work_generated =
        hpa_shard_has_deferred_work(tsdn, shard);

    malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
    assert(!edata_guarded_get(edata));
    /* Just a dalloc_batch of size 1; this lets us share logic. */
    edata_list_active_t dalloc_list;
    edata_list_active_init(&dalloc_list);
    edata_list_active_append(&dalloc_list, edata);
    hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated);
}

/*
 * Calculate time until either purging or hugification ought to happen.
 * Called by background threads.
 */
static uint64_t
hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
    hpa_shard_t *shard = hpa_from_pai(self);
    uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;

    malloc_mutex_lock(tsdn, &shard->mtx);

    hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
    if (to_hugify != NULL) {
        nstime_t time_hugify_allowed =
            hpdata_time_hugify_allowed(to_hugify);
        uint64_t since_hugify_allowed_ms =
            shard->central->hooks.ms_since(&time_hugify_allowed);
        /*
         * If not enough time has passed since hugification was allowed,
         * sleep for the rest.
         */
        if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
            time_ns = shard->opts.hugify_delay_ms -
                since_hugify_allowed_ms;
            time_ns *= 1000 * 1000;
        } else {
            malloc_mutex_unlock(tsdn, &shard->mtx);
            return BACKGROUND_THREAD_DEFERRED_MIN;
        }
    }

    if (hpa_should_purge(tsdn, shard)) {
        /*
         * If we haven't purged before, no need to check interval
         * between purges. Simply purge as soon as possible.
         */
        if (shard->stats.npurge_passes == 0) {
            malloc_mutex_unlock(tsdn, &shard->mtx);
            return BACKGROUND_THREAD_DEFERRED_MIN;
        }
        uint64_t since_last_purge_ms = shard->central->hooks.ms_since(
            &shard->last_purge);

        if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
            uint64_t until_purge_ns;
            until_purge_ns = shard->opts.min_purge_interval_ms -
                since_last_purge_ms;
            until_purge_ns *= 1000 * 1000;

            if (until_purge_ns < time_ns) {
                time_ns = until_purge_ns;
            }
        } else {
            time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
        }
    }
    malloc_mutex_unlock(tsdn, &shard->mtx);
    return time_ns;
}

void
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_lock(tsdn, &shard->mtx);
    edata_cache_fast_disable(tsdn, &shard->ecf);
    malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
    assert(bin_stats->npageslabs == 0);
    assert(bin_stats->nactive == 0);
}

static void
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    for (int huge = 0; huge <= 1; huge++) {
        hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
        for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
            hpa_shard_assert_stats_empty(
                &psset->stats.nonfull_slabs[i][huge]);
        }
    }
}

void
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);
    /*
     * By the time we're here, the arena code should have dalloc'd all the
     * active extents, which means we should have eventually evicted
     * everything from the psset, so it shouldn't be able to serve even a
     * 1-page allocation.
     */
    if (config_debug) {
        malloc_mutex_lock(tsdn, &shard->mtx);
        hpa_assert_empty(tsdn, shard, &shard->psset);
        malloc_mutex_unlock(tsdn, &shard->mtx);
    }
    hpdata_t *ps;
    while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
        /* There should be no allocations anywhere. */
        assert(hpdata_empty(ps));
        psset_remove(&shard->psset, ps);
        shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
    }
}

void
hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
    bool deferral_allowed) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_lock(tsdn, &shard->mtx);
    bool deferral_previously_allowed = shard->opts.deferral_allowed;
    shard->opts.deferral_allowed = deferral_allowed;
    if (deferral_previously_allowed && !deferral_allowed) {
        hpa_shard_maybe_do_deferred_work(tsdn, shard,
            /* forced */ true);
    }
    malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_lock(tsdn, &shard->mtx);
    hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
    malloc_mutex_unlock(tsdn, &shard->mtx);
}

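/*
 * Fork protection: grow_mtx is picked up in prefork3 and mtx in prefork4,
 * matching the grow_mtx-before-mtx lock order used elsewhere in this file;
 * both are released in the postfork hooks.
 */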
void
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_prefork(tsdn, &shard->grow_mtx);
}

void
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_prefork(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
    malloc_mutex_postfork_parent(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
    malloc_mutex_postfork_child(tsdn, &shard->mtx);
}