GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/jemalloc/src/hpa.c
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/hpa.h"

#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/witness.h"

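/*
 * The central allocator grows its "eden" address-space reservation in chunks
 * of this many bytes, and then carves eden into hugepage-sized pageslabs on
 * demand (see hpa_central_extract below).
 */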
#define HPA_EDEN_SIZE (128 * HUGEPAGE)

static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated);
static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);
static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
    edata_list_active_t *list, bool *deferred_work_generated);
static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);

bool
hpa_supported() {
#ifdef _WIN32
    /*
     * At least until the API and implementation is somewhat settled, we
     * don't want to try to debug the VM subsystem on the hardest-to-test
     * platform.
     */
    return false;
#endif
    if (!pages_can_hugify) {
        return false;
    }
    /*
     * We fundamentally rely on an address-space-hungry growth strategy for
     * hugepages.
     */
    if (LG_SIZEOF_PTR != 3) {
        return false;
    }
    /*
     * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
     * this sentinel value -- see the comment in pages.h.
     */
    if (HUGEPAGE_PAGES == 1) {
        return false;
    }
    return true;
}

static void
hpa_do_consistency_checks(hpa_shard_t *shard) {
    assert(shard->base != NULL);
}

bool
hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) {
    /* malloc_conf processing should have filtered out these cases. */
    assert(hpa_supported());
    bool err;
    err = malloc_mutex_init(&central->grow_mtx, "hpa_central_grow",
        WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive);
    if (err) {
        return true;
    }
    err = malloc_mutex_init(&central->mtx, "hpa_central",
        WITNESS_RANK_HPA_CENTRAL, malloc_mutex_rank_exclusive);
    if (err) {
        return true;
    }
    central->base = base;
    central->eden = NULL;
    central->eden_len = 0;
    central->age_counter = 0;
    central->hooks = *hooks;
    return false;
}

static hpdata_t *
hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) {
    return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t),
        CACHELINE);
}

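/*
 * Hands out a single hugepage-sized pageslab, carving it off of eden and
 * mapping a fresh HPA_EDEN_SIZE region first if eden is empty. *oom is set
 * only when metadata or address-space allocation fails; the caller is
 * expected to hold its shard's grow_mtx (checked via the witness below).
 */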
hpdata_t *
hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
    bool *oom) {
    /* Don't yet support big allocations; these should get filtered out. */
    assert(size <= HUGEPAGE);
    /*
     * Should only try to extract from the central allocator if the local
     * shard is exhausted. We should hold the grow_mtx on that shard.
     */
    witness_assert_positive_depth_to_rank(
        tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW);

    malloc_mutex_lock(tsdn, &central->grow_mtx);
    *oom = false;

    hpdata_t *ps = NULL;

    /* Is eden a perfect fit? */
    if (central->eden != NULL && central->eden_len == HUGEPAGE) {
        ps = hpa_alloc_ps(tsdn, central);
        if (ps == NULL) {
            *oom = true;
            malloc_mutex_unlock(tsdn, &central->grow_mtx);
            return NULL;
        }
        hpdata_init(ps, central->eden, central->age_counter++);
        central->eden = NULL;
        central->eden_len = 0;
        malloc_mutex_unlock(tsdn, &central->grow_mtx);
        return ps;
    }

    /*
     * We're about to try to allocate from eden by splitting. If eden is
     * NULL, we have to allocate it too. Otherwise, we just have to
     * allocate an edata_t for the new psset.
     */
    if (central->eden == NULL) {
        /*
         * During development, we're primarily concerned with systems
         * with overcommit. Eventually, we should be more careful here.
         */
        bool commit = true;
        /* Allocate address space, bailing if we fail. */
        void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE,
            &commit);
        if (new_eden == NULL) {
            *oom = true;
            malloc_mutex_unlock(tsdn, &central->grow_mtx);
            return NULL;
        }
        ps = hpa_alloc_ps(tsdn, central);
        if (ps == NULL) {
            pages_unmap(new_eden, HPA_EDEN_SIZE);
            *oom = true;
            malloc_mutex_unlock(tsdn, &central->grow_mtx);
            return NULL;
        }
        central->eden = new_eden;
        central->eden_len = HPA_EDEN_SIZE;
    } else {
        /* Eden is already nonempty; only need an edata for ps. */
        ps = hpa_alloc_ps(tsdn, central);
        if (ps == NULL) {
            *oom = true;
            malloc_mutex_unlock(tsdn, &central->grow_mtx);
            return NULL;
        }
    }
    assert(ps != NULL);
    assert(central->eden != NULL);
    assert(central->eden_len > HUGEPAGE);
    assert(central->eden_len % HUGEPAGE == 0);
    assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden);

    hpdata_init(ps, central->eden, central->age_counter++);

    char *eden_char = (char *)central->eden;
    eden_char += HUGEPAGE;
    central->eden = (void *)eden_char;
    central->eden_len -= HUGEPAGE;

    malloc_mutex_unlock(tsdn, &central->grow_mtx);

    return ps;
}

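/*
 * Initializes a shard that draws pageslabs from central and extent metadata
 * from edata_cache. Returns true on error, per the usual jemalloc init
 * convention.
 */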
bool
hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
    base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts) {
    /* malloc_conf processing should have filtered out these cases. */
    assert(hpa_supported());
    bool err;
    err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
        WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
    if (err) {
        return true;
    }
    err = malloc_mutex_init(&shard->mtx, "hpa_shard",
        WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
    if (err) {
        return true;
    }

    assert(edata_cache != NULL);
    shard->central = central;
    shard->base = base;
    edata_cache_fast_init(&shard->ecf, edata_cache);
    psset_init(&shard->psset);
    shard->age_counter = 0;
    shard->ind = ind;
    shard->emap = emap;

    shard->opts = *opts;

    shard->npending_purge = 0;
    nstime_init_zero(&shard->last_purge);

    shard->stats.npurge_passes = 0;
    shard->stats.npurges = 0;
    shard->stats.nhugifies = 0;
    shard->stats.ndehugifies = 0;

    /*
     * Fill these in last, so that if an hpa_shard gets used despite
     * initialization failing, we'll at least crash instead of just
     * operating on corrupted data.
     */
    shard->pai.alloc = &hpa_alloc;
    shard->pai.alloc_batch = &hpa_alloc_batch;
    shard->pai.expand = &hpa_expand;
    shard->pai.shrink = &hpa_shrink;
    shard->pai.dalloc = &hpa_dalloc;
    shard->pai.dalloc_batch = &hpa_dalloc_batch;
    shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;

    hpa_do_consistency_checks(shard);

    return false;
}

/*
 * Note that the stats functions here follow the usual stats naming conventions;
 * "merge" obtains the stats from some live instance of an object, while "accum"
 * only combines the stats from one stats object into another. Hence the lack of
 * locking here.
 */
static void
hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
    hpa_shard_nonderived_stats_t *src) {
    dst->npurge_passes += src->npurge_passes;
    dst->npurges += src->npurges;
    dst->nhugifies += src->nhugifies;
    dst->ndehugifies += src->ndehugifies;
}

void
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
    psset_stats_accum(&dst->psset_stats, &src->psset_stats);
    hpa_shard_nonderived_stats_accum(&dst->nonderived_stats,
        &src->nonderived_stats);
}

void
hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_shard_stats_t *dst) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_lock(tsdn, &shard->grow_mtx);
    malloc_mutex_lock(tsdn, &shard->mtx);
    psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
    hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
    malloc_mutex_unlock(tsdn, &shard->mtx);
    malloc_mutex_unlock(tsdn, &shard->grow_mtx);
}

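/*
 * A pageslab becomes a hugification candidate once the bytes it has active
 * reach the configured hugification threshold.
 */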
static bool
hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
    /*
     * Note that this needs to be >= rather than just >, because of the
     * important special case in which the hugification threshold is exactly
     * HUGEPAGE.
     */
    return hpdata_nactive_get(ps) * PAGE
        >= shard->opts.hugification_threshold;
}

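/*
 * Dirty-page accounting: pages already handed to an in-flight purge are
 * subtracted out, and the allowed maximum is dirty_mult (a fixed-point
 * fraction) applied to the number of active pages, with (fxp_t)-1 meaning
 * "no limit".
 */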
static size_t
hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    return psset_ndirty(&shard->psset) - shard->npending_purge;
}

static size_t
hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    if (shard->opts.dirty_mult == (fxp_t)-1) {
        return (size_t)-1;
    }
    return fxp_mul_frac(psset_nactive(&shard->psset),
        shard->opts.dirty_mult);
}

static bool
hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
    if (to_hugify == NULL) {
        return false;
    }
    return hpa_adjusted_ndirty(tsdn, shard)
        + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard);
}

static bool
hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
        return true;
    }
    if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
        return true;
    }
    return false;
}

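/*
 * Recomputes whether ps may be purged or hugified after its contents have
 * changed; called with the shard mutex held whenever pages are reserved or
 * unreserved, or a purge/hugify pass completes.
 */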
static void
hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard,
    hpdata_t *ps) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    if (hpdata_changing_state_get(ps)) {
        hpdata_purge_allowed_set(ps, false);
        hpdata_disallow_hugify(ps);
        return;
    }
    /*
     * Hugepages are distinctly costly to purge, so try to avoid it unless
     * they're *particularly* full of dirty pages. Eventually, we should
     * use a smarter / more dynamic heuristic for situations where we have
     * to manually hugify.
     *
     * In situations where we don't manually hugify, this problem is
     * reduced. The "bad" situation we're trying to avoid is one that's
     * common in some Linux configurations (where both enabled and defrag
     * are set to madvise) that can lead to long latency spikes on the first
     * access after a hugification. The ideal policy in such configurations
     * is probably time-based for both purging and hugifying; only hugify a
     * hugepage if it's met the criteria for some extended period of time,
     * and only dehugify it if it's failed to meet the criteria for an
     * extended period of time. When background threads are on, we should
     * try to take this hit on one of them, as well.
     *
     * I think the ideal setting is THP always enabled, and defrag set to
     * deferred; in that case we don't need any explicit calls on the
     * allocator's end at all; we just try to pack allocations in a
     * hugepage-friendly manner and let the OS hugify in the background.
     */
    hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0);
    if (hpa_good_hugification_candidate(shard, ps)
        && !hpdata_huge_get(ps)) {
        nstime_t now;
        shard->central->hooks.curtime(&now, /* first_reading */ true);
        hpdata_allow_hugify(ps, now);
    }
    /*
     * Once a hugepage has become eligible for hugification, we don't mark
     * it as ineligible just because it stops meeting the criteria (this
     * could lead to situations where a hugepage that spends most of its
     * time meeting the criteria never quite gets hugified if there are
     * intervening deallocations). The idea is that the hugification delay
     * will allow them to get purged, resetting their "hugify-allowed" bit.
     * If they don't get purged, then the hugification isn't hurting and
     * might help. As an exception, we don't hugify hugepages that are now
     * empty; it definitely doesn't help there until the hugepage gets
     * reused, which is likely not for a while.
     */
    if (hpdata_nactive_get(ps) == 0) {
        hpdata_disallow_hugify(ps);
    }
}

static bool
hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
    return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}

/* Returns whether or not we purged anything. */
static bool
hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);

    hpdata_t *to_purge = psset_pick_purge(&shard->psset);
    if (to_purge == NULL) {
        return false;
    }
    assert(hpdata_purge_allowed_get(to_purge));
    assert(!hpdata_changing_state_get(to_purge));

    /*
     * Don't let anyone else purge or hugify this page while
     * we're purging it (allocations and deallocations are
     * OK).
     */
    psset_update_begin(&shard->psset, to_purge);
    assert(hpdata_alloc_allowed_get(to_purge));
    hpdata_mid_purge_set(to_purge, true);
    hpdata_purge_allowed_set(to_purge, false);
    hpdata_disallow_hugify(to_purge);
    /*
     * Unlike with hugification (where concurrent
     * allocations are allowed), concurrent allocation out
     * of a hugepage being purged is unsafe; we might hand
     * out an extent for an allocation and then purge it
     * (clearing out user data).
     */
    hpdata_alloc_allowed_set(to_purge, false);
    psset_update_end(&shard->psset, to_purge);

    /* Gather all the metadata we'll need during the purge. */
    bool dehugify = hpdata_huge_get(to_purge);
    hpdata_purge_state_t purge_state;
    size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state);

    shard->npending_purge += num_to_purge;

    malloc_mutex_unlock(tsdn, &shard->mtx);

    /* Actually do the purging, now that the lock is dropped. */
    if (dehugify) {
        shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
            HUGEPAGE);
    }
    size_t total_purged = 0;
    uint64_t purges_this_pass = 0;
    void *purge_addr;
    size_t purge_size;
    while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
        &purge_size)) {
        total_purged += purge_size;
        assert(total_purged <= HUGEPAGE);
        purges_this_pass++;
        shard->central->hooks.purge(purge_addr, purge_size);
    }

    malloc_mutex_lock(tsdn, &shard->mtx);
    /* The shard updates */
    shard->npending_purge -= num_to_purge;
    shard->stats.npurge_passes++;
    shard->stats.npurges += purges_this_pass;
    shard->central->hooks.curtime(&shard->last_purge,
        /* first_reading */ false);
    if (dehugify) {
        shard->stats.ndehugifies++;
    }

    /* The hpdata updates. */
    psset_update_begin(&shard->psset, to_purge);
    if (dehugify) {
        hpdata_dehugify(to_purge);
    }
    hpdata_purge_end(to_purge, &purge_state);
    hpdata_mid_purge_set(to_purge, false);

    hpdata_alloc_allowed_set(to_purge, true);
    hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge);

    psset_update_end(&shard->psset, to_purge);

    return true;
}

/* Returns whether or not we hugified anything. */
static bool
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);

    if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
        return false;
    }

    hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
    if (to_hugify == NULL) {
        return false;
    }
    assert(hpdata_hugify_allowed_get(to_hugify));
    assert(!hpdata_changing_state_get(to_hugify));

    /* Make sure that it's been hugifiable for long enough. */
    nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
    uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
    if (millis < shard->opts.hugify_delay_ms) {
        return false;
    }

    /*
     * Don't let anyone else purge or hugify this page while
     * we're hugifying it (allocations and deallocations are
     * OK).
     */
    psset_update_begin(&shard->psset, to_hugify);
    hpdata_mid_hugify_set(to_hugify, true);
    hpdata_purge_allowed_set(to_hugify, false);
    hpdata_disallow_hugify(to_hugify);
    assert(hpdata_alloc_allowed_get(to_hugify));
    psset_update_end(&shard->psset, to_hugify);

    malloc_mutex_unlock(tsdn, &shard->mtx);

    shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);

    malloc_mutex_lock(tsdn, &shard->mtx);
    shard->stats.nhugifies++;

    psset_update_begin(&shard->psset, to_hugify);
    hpdata_hugify(to_hugify);
    hpdata_mid_hugify_set(to_hugify, false);
    hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
    psset_update_end(&shard->psset, to_hugify);

    return true;
}

/*
 * Execution of deferred work is forced if it's triggered by an explicit
 * hpa_shard_do_deferred_work() call.
 */
static void
hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
    bool forced) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    if (!forced && shard->opts.deferral_allowed) {
        return;
    }
    /*
     * If we're on a background thread, do work so long as there's work to
     * be done. Otherwise, bound latency to not be *too* bad by doing at
     * most a small fixed number of operations.
     */
    bool hugified = false;
    bool purged = false;
    size_t max_ops = (forced ? (size_t)-1 : 16);
    size_t nops = 0;
    do {
        /*
         * Always purge before hugifying, to make sure we get some
         * ability to hit our quiescence targets.
         */
        purged = false;
        while (hpa_should_purge(tsdn, shard) && nops < max_ops) {
            purged = hpa_try_purge(tsdn, shard);
            if (purged) {
                nops++;
            }
        }
        hugified = hpa_try_hugify(tsdn, shard);
        if (hugified) {
            nops++;
        }
        malloc_mutex_assert_owner(tsdn, &shard->mtx);
    } while ((hugified || purged) && nops < max_ops);
}

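/*
 * Tries to serve one size-byte extent from an existing pageslab. *oom is set
 * only on metadata failures (edata cache exhaustion or emap registration);
 * simply having no suitable pageslab is reported as NULL without *oom.
 */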
static edata_t *
hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom) {
    bool err;
    edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
    if (edata == NULL) {
        *oom = true;
        return NULL;
    }

    hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
    if (ps == NULL) {
        edata_cache_fast_put(tsdn, &shard->ecf, edata);
        return NULL;
    }

    psset_update_begin(&shard->psset, ps);

    if (hpdata_empty(ps)) {
        /*
         * If the pageslab used to be empty, treat it as though it's
         * brand new for fragmentation-avoidance purposes; what we're
         * trying to approximate is the age of the allocations *in* that
         * pageslab, and the allocations in the new pageslab are
         * definitionally the youngest in this hpa shard.
         */
        hpdata_age_set(ps, shard->age_counter++);
    }

    void *addr = hpdata_reserve_alloc(ps, size);
    edata_init(edata, shard->ind, addr, size, /* slab */ false,
        SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active,
        /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
        EXTENT_NOT_HEAD);
    edata_ps_set(edata, ps);

    /*
     * This could theoretically be moved outside of the critical section,
     * but that introduces the potential for a race. Without the lock, the
     * (initially nonempty, since this is the reuse pathway) pageslab we
     * allocated out of could become otherwise empty while the lock is
     * dropped. This would force us to deal with a pageslab eviction down
     * the error pathway, which is a pain.
     */
    err = emap_register_boundary(tsdn, shard->emap, edata,
        SC_NSIZES, /* slab */ false);
    if (err) {
        hpdata_unreserve(ps, edata_addr_get(edata),
            edata_size_get(edata));
        /*
         * We should arguably reset dirty state here, but this would
         * require some sort of prepare + commit functionality that's a
         * little much to deal with for now.
         *
         * We don't have a do_deferred_work down this pathway, on the
         * principle that we didn't *really* affect shard state (we
         * tweaked the stats, but our tweaks weren't really accurate).
         */
        psset_update_end(&shard->psset, ps);
        edata_cache_fast_put(tsdn, &shard->ecf, edata);
        *oom = true;
        return NULL;
    }

    hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
    psset_update_end(&shard->psset, ps);
    return edata;
}

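/*
 * Serves as many of the nallocs requests as possible from pageslabs the shard
 * already owns, without growing; stops at the first failure and reports
 * whether deferred work remains.
 */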
static size_t
hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom, size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
    malloc_mutex_lock(tsdn, &shard->mtx);
    size_t nsuccess = 0;
    for (; nsuccess < nallocs; nsuccess++) {
        edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
            oom);
        if (edata == NULL) {
            break;
        }
        edata_list_active_append(results, edata);
    }

    hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
    *deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
    malloc_mutex_unlock(tsdn, &shard->mtx);
    return nsuccess;
}

static size_t
hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
    assert(size <= shard->opts.slab_max_alloc);
    bool oom = false;

    size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
        nallocs, results, deferred_work_generated);

    if (nsuccess == nallocs || oom) {
        return nsuccess;
    }

    /*
     * We didn't OOM, but weren't able to fill everything requested of us;
     * try to grow.
     */
    malloc_mutex_lock(tsdn, &shard->grow_mtx);
    /*
     * Check for grow races; maybe some earlier thread expanded the psset
     * in between when we dropped the main mutex and grabbed the grow mutex.
     */
    nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
        nallocs - nsuccess, results, deferred_work_generated);
    if (nsuccess == nallocs || oom) {
        malloc_mutex_unlock(tsdn, &shard->grow_mtx);
        return nsuccess;
    }

    /*
     * Note that we don't hold shard->mtx here (while growing);
     * deallocations (and allocations of smaller sizes) may still succeed
     * while we're doing this potentially expensive system call.
     */
    hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom);
    if (ps == NULL) {
        malloc_mutex_unlock(tsdn, &shard->grow_mtx);
        return nsuccess;
    }

    /*
     * We got the pageslab; allocate from it. This does an unlock followed
     * by a lock on the same mutex, and holds the grow mutex while doing
     * deferred work, but this is an uncommon path; the simplicity is worth
     * it.
     */
    malloc_mutex_lock(tsdn, &shard->mtx);
    psset_insert(&shard->psset, ps);
    malloc_mutex_unlock(tsdn, &shard->mtx);

    nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
        nallocs - nsuccess, results, deferred_work_generated);
    /*
     * Drop grow_mtx before doing deferred work; other threads blocked on it
     * should be allowed to proceed while we're working.
     */
    malloc_mutex_unlock(tsdn, &shard->grow_mtx);

    return nsuccess;
}

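/*
 * The shard embeds its pai_t, and the callbacks below are installed in
 * hpa_shard_init, so the pai pointer handed to them can be converted back to
 * the owning hpa_shard_t; the cast relies on the pai_t living at the start of
 * hpa_shard_t.
 */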
static hpa_shard_t *
hpa_from_pai(pai_t *self) {
    assert(self->alloc == &hpa_alloc);
    assert(self->expand == &hpa_expand);
    assert(self->shrink == &hpa_shrink);
    assert(self->dalloc == &hpa_dalloc);
    return (hpa_shard_t *)self;
}

static size_t
hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
    edata_list_active_t *results, bool *deferred_work_generated) {
    assert(nallocs > 0);
    assert((size & PAGE_MASK) == 0);
    witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
        WITNESS_RANK_CORE, 0);
    hpa_shard_t *shard = hpa_from_pai(self);

    if (size > shard->opts.slab_max_alloc) {
        return 0;
    }

    size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
        results, deferred_work_generated);

    witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
        WITNESS_RANK_CORE, 0);

    /*
     * Guard the sanity checks with config_debug because the loop cannot be
     * proven non-circular by the compiler, even if everything within the
     * loop is optimized away.
     */
    if (config_debug) {
        edata_t *edata;
        ql_foreach(edata, &results->head, ql_link_active) {
            emap_assert_mapped(tsdn, shard->emap, edata);
            assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
            assert(edata_state_get(edata) == extent_state_active);
            assert(edata_arena_ind_get(edata) == shard->ind);
            assert(edata_szind_get_maybe_invalid(edata) ==
                SC_NSIZES);
            assert(!edata_slab_get(edata));
            assert(edata_committed_get(edata));
            assert(edata_base_get(edata) == edata_addr_get(edata));
            assert(edata_base_get(edata) != NULL);
        }
    }
    return nsuccess;
}

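/*
 * Entry point for single allocations through the pai vtable installed in
 * hpa_shard_init. A minimal sketch of a call (caller-side names here are
 * hypothetical):
 *
 *     bool deferred = false;
 *     edata_t *e = shard->pai.alloc(tsdn, &shard->pai, 2 * PAGE, PAGE,
 *         false, false, false, &deferred);
 *
 * Requests with alignment > PAGE or zero == true are declined (NULL) rather
 * than handled here.
 */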
static edata_t *
hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
    assert((size & PAGE_MASK) == 0);
    assert(!guarded);
    witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
        WITNESS_RANK_CORE, 0);

    /* We don't handle alignment or zeroing for now. */
    if (alignment > PAGE || zero) {
        return NULL;
    }
    /*
     * An alloc with alignment == PAGE and zero == false is equivalent to a
     * batch alloc of 1. Just do that, so we can share code.
     */
    edata_list_active_t results;
    edata_list_active_init(&results);
    size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1,
        &results, deferred_work_generated);
    assert(nallocs == 0 || nallocs == 1);
    edata_t *edata = edata_list_active_first(&results);
    return edata;
}

static bool
hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
    /* Expand not yet supported. */
    return true;
}

static bool
hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated) {
    /* Shrink not yet supported. */
    return true;
}

static void
hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
    malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

    assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
    assert(edata_state_get(edata) == extent_state_active);
    assert(edata_arena_ind_get(edata) == shard->ind);
    assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
    assert(edata_committed_get(edata));
    assert(edata_base_get(edata) != NULL);

    /*
     * Another thread shouldn't be trying to touch the metadata of an
     * allocation being freed. The one exception is a merge attempt from a
     * lower-addressed PAC extent; in this case we have a nominal race on
     * the edata metadata bits, but in practice the fact that the PAI bits
     * are different will prevent any further access. The race is bad, but
     * benign in practice, and the long term plan is to track enough state
     * in the rtree to prevent these merge attempts in the first place.
     */
    edata_addr_set(edata, edata_base_get(edata));
    edata_zeroed_set(edata, false);
    emap_deregister_boundary(tsdn, shard->emap, edata);
}

static void
hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);

    /*
     * Release the metadata early, to avoid having to remember to do it
     * while we're also doing tricky purging logic. First, we need to grab
     * a few bits of metadata from it.
     *
     * Note that the shard mutex protects ps's metadata too; it wouldn't be
     * correct to try to read most information out of it without the lock.
     */
    hpdata_t *ps = edata_ps_get(edata);
    /* Currently, all edatas come from pageslabs. */
    assert(ps != NULL);
    void *unreserve_addr = edata_addr_get(edata);
    size_t unreserve_size = edata_size_get(edata);
    edata_cache_fast_put(tsdn, &shard->ecf, edata);

    psset_update_begin(&shard->psset, ps);
    hpdata_unreserve(ps, unreserve_addr, unreserve_size);
    hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
    psset_update_end(&shard->psset, ps);
}

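/*
 * Deallocation is split into two phases: emap boundary deregistration happens
 * for every extent before the shard mutex is taken, and the actual
 * unreserving (plus any resulting deferred work) happens under the mutex.
 */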
static void
hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
    bool *deferred_work_generated) {
    hpa_shard_t *shard = hpa_from_pai(self);

    edata_t *edata;
    ql_foreach(edata, &list->head, ql_link_active) {
        hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
    }

    malloc_mutex_lock(tsdn, &shard->mtx);
    /* Now, remove from the list. */
    while ((edata = edata_list_active_first(list)) != NULL) {
        edata_list_active_remove(list, edata);
        hpa_dalloc_locked(tsdn, shard, edata);
    }
    hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
    *deferred_work_generated =
        hpa_shard_has_deferred_work(tsdn, shard);

    malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
    assert(!edata_guarded_get(edata));
    /* Just a dalloc_batch of size 1; this lets us share logic. */
    edata_list_active_t dalloc_list;
    edata_list_active_init(&dalloc_list);
    edata_list_active_append(&dalloc_list, edata);
    hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated);
}

/*
 * Calculate time until either purging or hugification ought to happen.
 * Called by background threads.
 */
static uint64_t
hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
    hpa_shard_t *shard = hpa_from_pai(self);
    uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;

    malloc_mutex_lock(tsdn, &shard->mtx);

    hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
    if (to_hugify != NULL) {
        nstime_t time_hugify_allowed =
            hpdata_time_hugify_allowed(to_hugify);
        uint64_t since_hugify_allowed_ms =
            shard->central->hooks.ms_since(&time_hugify_allowed);
        /*
         * If not enough time has passed since hugification was allowed,
         * sleep for the rest.
         */
        if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
            time_ns = shard->opts.hugify_delay_ms -
                since_hugify_allowed_ms;
            time_ns *= 1000 * 1000;
        } else {
            malloc_mutex_unlock(tsdn, &shard->mtx);
            return BACKGROUND_THREAD_DEFERRED_MIN;
        }
    }

    if (hpa_should_purge(tsdn, shard)) {
        /*
         * If we haven't purged before, no need to check interval
         * between purges. Simply purge as soon as possible.
         */
        if (shard->stats.npurge_passes == 0) {
            malloc_mutex_unlock(tsdn, &shard->mtx);
            return BACKGROUND_THREAD_DEFERRED_MIN;
        }
        uint64_t since_last_purge_ms = shard->central->hooks.ms_since(
            &shard->last_purge);

        if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
            uint64_t until_purge_ns;
            until_purge_ns = shard->opts.min_purge_interval_ms -
                since_last_purge_ms;
            until_purge_ns *= 1000 * 1000;

            if (until_purge_ns < time_ns) {
                time_ns = until_purge_ns;
            }
        } else {
            time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
        }
    }
    malloc_mutex_unlock(tsdn, &shard->mtx);
    return time_ns;
}

void
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_lock(tsdn, &shard->mtx);
    edata_cache_fast_disable(tsdn, &shard->ecf);
    malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
    assert(bin_stats->npageslabs == 0);
    assert(bin_stats->nactive == 0);
}

static void
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
    malloc_mutex_assert_owner(tsdn, &shard->mtx);
    for (int huge = 0; huge <= 1; huge++) {
        hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
        for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
            hpa_shard_assert_stats_empty(
                &psset->stats.nonfull_slabs[i][huge]);
        }
    }
}

void
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);
    /*
     * By the time we're here, the arena code should have dalloc'd all the
     * active extents, which means we should have eventually evicted
     * everything from the psset, so it shouldn't be able to serve even a
     * 1-page allocation.
     */
    if (config_debug) {
        malloc_mutex_lock(tsdn, &shard->mtx);
        hpa_assert_empty(tsdn, shard, &shard->psset);
        malloc_mutex_unlock(tsdn, &shard->mtx);
    }
    hpdata_t *ps;
    while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
        /* There should be no allocations anywhere. */
        assert(hpdata_empty(ps));
        psset_remove(&shard->psset, ps);
        shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
    }
}

void
hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
    bool deferral_allowed) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_lock(tsdn, &shard->mtx);
    bool deferral_previously_allowed = shard->opts.deferral_allowed;
    shard->opts.deferral_allowed = deferral_allowed;
    if (deferral_previously_allowed && !deferral_allowed) {
        hpa_shard_maybe_do_deferred_work(tsdn, shard,
            /* forced */ true);
    }
    malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_lock(tsdn, &shard->mtx);
    hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
    malloc_mutex_unlock(tsdn, &shard->mtx);
}

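/*
 * Fork protection: grow_mtx is picked up in prefork3 and mtx in prefork4,
 * matching the grow_mtx-before-mtx lock order used elsewhere in this file;
 * both are released in the postfork hooks.
 */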
void
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_prefork(tsdn, &shard->grow_mtx);
}

void
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_prefork(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
    malloc_mutex_postfork_parent(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
    hpa_do_consistency_checks(shard);

    malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
    malloc_mutex_postfork_child(tsdn, &shard->mtx);
}