CoCalc -- vm

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/vm/vm_phys.c
³⁹⁴⁷⁶ views
1
/*-
2
 * SPDX-License-Identifier: BSD-2-Clause
3
 *
4
 * Copyright (c) 2002-2006 Rice University
5
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
6
 * All rights reserved.
7
 *
8
 * This software was developed for the FreeBSD Project by Alan L. Cox,
9
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
10
 *
11
 * Redistribution and use in source and binary forms, with or without
12
 * modification, are permitted provided that the following conditions
13
 * are met:
14
 * 1. Redistributions of source code must retain the above copyright
15
 *    notice, this list of conditions and the following disclaimer.
16
 * 2. Redistributions in binary form must reproduce the above copyright
17
 *    notice, this list of conditions and the following disclaimer in the
18
 *    documentation and/or other materials provided with the distribution.
19
 *
20
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
24
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
27
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
30
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31
 * POSSIBILITY OF SUCH DAMAGE.
32
 */
33

34
/*
35
 *	Physical memory system implementation
36
 *
37
 * Any external functions defined by this module are only to be used by the
38
 * virtual memory system.
39
 */
40

41
#include <sys/cdefs.h>
42
#include "opt_ddb.h"
43
#include "opt_vm.h"
44

45
#include <sys/param.h>
46
#include <sys/systm.h>
47
#include <sys/domainset.h>
48
#include <sys/lock.h>
49
#include <sys/kernel.h>
50
#include <sys/kthread.h>
51
#include <sys/malloc.h>
52
#include <sys/mutex.h>
53
#include <sys/proc.h>
54
#include <sys/queue.h>
55
#include <sys/rwlock.h>
56
#include <sys/sbuf.h>
57
#include <sys/sched.h>
58
#include <sys/sysctl.h>
59
#include <sys/tree.h>
60
#include <sys/tslog.h>
61
#include <sys/unistd.h>
62
#include <sys/vmmeter.h>
63

64
#include <ddb/ddb.h>
65

66
#include <vm/vm.h>
67
#include <vm/vm_extern.h>
68
#include <vm/vm_param.h>
69
#include <vm/vm_kern.h>
70
#include <vm/vm_page.h>
71
#include <vm/vm_phys.h>
72
#include <vm/vm_pagequeue.h>
73

74
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
75
    "Too many physsegs.");
76
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
77
    "vm_paddr_t too big for ffsll, flsll.");
78

79
#ifdef NUMA
80
struct mem_affinity __read_mostly *mem_affinity;
81
int __read_mostly *mem_locality;
82

83
static int numa_disabled;
84
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
85
    "NUMA options");
86
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
87
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
88
#endif
89

90
int __read_mostly vm_ndomains = 1;
91
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);
92

93
struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
94
int __read_mostly vm_phys_nsegs;
95
static struct vm_phys_seg vm_phys_early_segs[8];
96
static int vm_phys_early_nsegs;
97

98
struct vm_phys_fictitious_seg;
99
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
100
    struct vm_phys_fictitious_seg *);
101

102
RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
103
    RB_INITIALIZER(&vm_phys_fictitious_tree);
104

105
struct vm_phys_fictitious_seg {
106
	RB_ENTRY(vm_phys_fictitious_seg) node;
107
	/* Memory region data */
108
	vm_paddr_t	start;
109
	vm_paddr_t	end;
110
	vm_page_t	first_page;
111
};
112

113
RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
114
    vm_phys_fictitious_cmp);
115

116
static struct rwlock_padalign vm_phys_fictitious_reg_lock;
117
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
118

119
static struct vm_freelist __aligned(CACHE_LINE_SIZE)
120
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
121
    [VM_NFREEORDER_MAX];
122

123
static int __read_mostly vm_nfreelists;
124

125
/*
126
 * These "avail lists" are globals used to communicate boot-time physical
127
 * memory layout to other parts of the kernel.  Each physically contiguous
128
 * region of memory is defined by a start address at an even index and an
129
 * end address at the following odd index.  Each list is terminated by a
130
 * pair of zero entries.
131
 *
132
 * dump_avail tells the dump code what regions to include in a crash dump, and
133
 * phys_avail is all of the remaining physical memory that is available for
134
 * the vm system.
135
 *
136
 * Initially dump_avail and phys_avail are identical.  Boot time memory
137
 * allocations remove extents from phys_avail that may still be included
138
 * in dumps.
139
 */
140
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
141
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
142

143
/*
144
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
145
 */
146
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
147
static int __read_mostly vm_default_freepool;
148

149
CTASSERT(VM_FREELIST_DEFAULT == 0);
150

151
#ifdef VM_FREELIST_DMA32
152
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
153
#endif
154

155
/*
156
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
157
 * the ordering of the free list boundaries.
158
 */
159
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
160
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
161
#endif
162

163
static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
164
SYSCTL_OID(_vm, OID_AUTO, phys_free,
165
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
166
    sysctl_vm_phys_free, "A",
167
    "Phys Free Info");
168

169
static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
170
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
171
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
172
    sysctl_vm_phys_segs, "A",
173
    "Phys Seg Info");
174

175
#ifdef NUMA
176
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
177
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
178
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
179
    sysctl_vm_phys_locality, "A",
180
    "Phys Locality Info");
181
#endif
182

183
SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
184
    &vm_ndomains, 0, "Number of physical memory domains available.");
185

186
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
187
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
188
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
189
    int order, int pool, int tail);
190

191
static bool __diagused
192
vm_phys_pool_valid(int pool)
193
{
194
#ifdef VM_FREEPOOL_LAZYINIT
195
	if (pool == VM_FREEPOOL_LAZYINIT)
196
		return (false);
197
#endif
198
	return (pool >= 0 && pool < VM_NFREEPOOL);
199
}
200

201
/*
202
 * Red-black tree helpers for vm fictitious range management.
203
 */
204
static inline int
205
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
206
    struct vm_phys_fictitious_seg *range)
207
{
208

209
	KASSERT(range->start != 0 && range->end != 0,
210
	    ("Invalid range passed on search for vm_fictitious page"));
211
	if (p->start >= range->end)
212
		return (1);
213
	if (p->start < range->start)
214
		return (-1);
215

216
	return (0);
217
}
218

219
static int
220
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
221
    struct vm_phys_fictitious_seg *p2)
222
{
223

224
	/* Check if this is a search for a page */
225
	if (p1->end == 0)
226
		return (vm_phys_fictitious_in_range(p1, p2));
227

228
	KASSERT(p2->end != 0,
229
    ("Invalid range passed as second parameter to vm fictitious comparison"));
230

231
	/* Searching to add a new range */
232
	if (p1->end <= p2->start)
233
		return (-1);
234
	if (p1->start >= p2->end)
235
		return (1);
236

237
	panic("Trying to add overlapping vm fictitious ranges:\n"
238
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
239
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
240
}
241

242
int
243
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
244
    vm_paddr_t high __numa_used)
245
{
246
#ifdef NUMA
247
	domainset_t mask;
248
	int i;
249

250
	if (vm_ndomains == 1 || mem_affinity == NULL)
251
		return (0);
252

253
	DOMAINSET_ZERO(&mask);
254
	/*
255
	 * Check for any memory that overlaps low, high.
256
	 */
257
	for (i = 0; mem_affinity[i].end != 0; i++)
258
		if (mem_affinity[i].start <= high &&
259
		    mem_affinity[i].end >= low)
260
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
261
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
262
		return (prefer);
263
	if (DOMAINSET_EMPTY(&mask))
264
		panic("vm_phys_domain_match:  Impossible constraint");
265
	return (DOMAINSET_FFS(&mask) - 1);
266
#else
267
	return (0);
268
#endif
269
}
270

271
/*
272
 * Outputs the state of the physical memory allocator, specifically,
273
 * the amount of physical memory in each free list.
274
 */
275
static int
276
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
277
{
278
	struct sbuf sbuf;
279
	struct vm_freelist *fl;
280
	int dom, error, flind, oind, pind;
281

282
	error = sysctl_wire_old_buffer(req, 0);
283
	if (error != 0)
284
		return (error);
285
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
286
	for (dom = 0; dom < vm_ndomains; dom++) {
287
		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
288
		for (flind = 0; flind < vm_nfreelists; flind++) {
289
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
290
			    "\n  ORDER (SIZE)  |  NUMBER"
291
			    "\n              ", flind);
292
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
293
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
294
			sbuf_printf(&sbuf, "\n--            ");
295
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
296
				sbuf_printf(&sbuf, "-- --      ");
297
			sbuf_printf(&sbuf, "--\n");
298
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
299
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
300
				    1 << (PAGE_SHIFT - 10 + oind));
301
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
302
				fl = vm_phys_free_queues[dom][flind][pind];
303
					sbuf_printf(&sbuf, "  |  %6d",
304
					    fl[oind].lcnt);
305
				}
306
				sbuf_printf(&sbuf, "\n");
307
			}
308
		}
309
	}
310
	error = sbuf_finish(&sbuf);
311
	sbuf_delete(&sbuf);
312
	return (error);
313
}
314

315
/*
316
 * Outputs the set of physical memory segments.
317
 */
318
static int
319
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
320
{
321
	struct sbuf sbuf;
322
	struct vm_phys_seg *seg;
323
	int error, segind;
324

325
	error = sysctl_wire_old_buffer(req, 0);
326
	if (error != 0)
327
		return (error);
328
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
329
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
330
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
331
		seg = &vm_phys_segs[segind];
332
		sbuf_printf(&sbuf, "start:     %#jx\n",
333
		    (uintmax_t)seg->start);
334
		sbuf_printf(&sbuf, "end:       %#jx\n",
335
		    (uintmax_t)seg->end);
336
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
337
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
338
	}
339
	error = sbuf_finish(&sbuf);
340
	sbuf_delete(&sbuf);
341
	return (error);
342
}
343

344
/*
345
 * Return affinity, or -1 if there's no affinity information.
346
 */
347
int
348
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
349
{
350

351
#ifdef NUMA
352
	if (mem_locality == NULL)
353
		return (-1);
354
	if (f >= vm_ndomains || t >= vm_ndomains)
355
		return (-1);
356
	return (mem_locality[f * vm_ndomains + t]);
357
#else
358
	return (-1);
359
#endif
360
}
361

362
#ifdef NUMA
/*
 * Sysctl handler that prints the VM locality table: one line per domain,
 * holding the vm_phys_mem_affinity() value towards every domain.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	int error, from, to;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sb, NULL, 128, req);

	sbuf_printf(&sb, "\n");

	for (from = 0; from < vm_ndomains; from++) {
		sbuf_printf(&sb, "%d: ", from);
		for (to = 0; to < vm_ndomains; to++)
			sbuf_printf(&sb, "%d ",
			    vm_phys_mem_affinity(from, to));
		sbuf_printf(&sb, "\n");
	}
	error = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error);
}
#endif
391

392
static void
393
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool,
394
    int tail)
395
{
396
	/*
397
	 * The paging queues and the free page lists utilize the same field,
398
	 * plinks.q, within the vm_page structure.  When a physical page is
399
	 * freed, it is lazily removed from the paging queues to reduce the
400
	 * cost of removal through batching.  Here, we must ensure that any
401
	 * deferred dequeue on the physical page has completed before using
402
	 * its plinks.q field.
403
	 */
404
	if (__predict_false(vm_page_astate_load(m).queue != PQ_NONE))
405
		vm_page_dequeue(m);
406

407
	m->order = order;
408
	m->pool = pool;
409
	if (tail)
410
		TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
411
	else
412
		TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
413
	fl[order].lcnt++;
414
}
415

416
static void
417
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
418
{
419

420
	TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
421
	fl[order].lcnt--;
422
	m->order = VM_NFREEORDER;
423
}
424

425
/*
426
 * Create a physical memory segment.
427
 */
428
static void
429
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
430
{
431
	struct vm_phys_seg *seg;
432

433
	if (!(0 <= domain && domain < vm_ndomains))
434
		panic("%s: Invalid domain %d ('vm_ndomains' is %d)",
435
		    __func__, domain, vm_ndomains);
436
	if (vm_phys_nsegs >= VM_PHYSSEG_MAX)
437
		panic("Not enough storage for physical segments, "
438
		    "increase VM_PHYSSEG_MAX");
439

440
	seg = &vm_phys_segs[vm_phys_nsegs++];
441
	while (seg > vm_phys_segs && seg[-1].start >= end) {
442
		*seg = *(seg - 1);
443
		seg--;
444
	}
445
	seg->start = start;
446
	seg->end = end;
447
	seg->domain = domain;
448
	if (seg != vm_phys_segs && seg[-1].end > start)
449
		panic("Overlapping physical segments: Current [%#jx,%#jx) "
450
		    "at index %zu, previous [%#jx,%#jx)",
451
		    (uintmax_t)start, (uintmax_t)end, seg - vm_phys_segs,
452
		    (uintmax_t)seg[-1].start, (uintmax_t)seg[-1].end);
453
}
454

455
static void
456
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
457
{
458
#ifdef NUMA
459
	int i;
460

461
	if (mem_affinity == NULL) {
462
		_vm_phys_create_seg(start, end, 0);
463
		return;
464
	}
465

466
	for (i = 0;; i++) {
467
		if (mem_affinity[i].end == 0)
468
			panic("Reached end of affinity info");
469
		if (mem_affinity[i].end <= start)
470
			continue;
471
		if (mem_affinity[i].start > start)
472
			panic("No affinity info for start %jx",
473
			    (uintmax_t)start);
474
		if (mem_affinity[i].end >= end) {
475
			_vm_phys_create_seg(start, end,
476
			    mem_affinity[i].domain);
477
			break;
478
		}
479
		_vm_phys_create_seg(start, mem_affinity[i].end,
480
		    mem_affinity[i].domain);
481
		start = mem_affinity[i].end;
482
	}
483
#else
484
	_vm_phys_create_seg(start, end, 0);
485
#endif
486
}
487

488
/*
489
 * Add a physical memory segment.
490
 */
491
void
492
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
493
{
494
	vm_paddr_t paddr;
495

496
	if ((start & PAGE_MASK) != 0)
497
		panic("%s: start (%jx) is not page aligned", __func__,
498
		    (uintmax_t)start);
499
	if ((end & PAGE_MASK) != 0)
500
		panic("%s: end (%jx) is not page aligned", __func__,
501
		    (uintmax_t)end);
502
	if (start > end)
503
		panic("%s: start (%jx) > end (%jx)!", __func__,
504
		    (uintmax_t)start, (uintmax_t)end);
505

506
	if (start == end)
507
		return;
508

509
	/*
510
	 * Split the physical memory segment if it spans two or more free
511
	 * list boundaries.
512
	 */
513
	paddr = start;
514
#ifdef	VM_FREELIST_LOWMEM
515
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
516
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
517
		paddr = VM_LOWMEM_BOUNDARY;
518
	}
519
#endif
520
#ifdef	VM_FREELIST_DMA32
521
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
522
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
523
		paddr = VM_DMA32_BOUNDARY;
524
	}
525
#endif
526
	vm_phys_create_seg(paddr, end);
527
}
528

529
/*
530
 * Initialize the physical memory allocator.
531
 *
532
 * Requires that vm_page_array is initialized!
533
 */
534
void
535
vm_phys_init(void)
536
{
537
	struct vm_freelist *fl;
538
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
539
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
540
	u_long npages;
541
#endif
542
	int dom, flind, freelist, oind, pind, segind;
543

544
	/*
545
	 * Compute the number of free lists, and generate the mapping from the
546
	 * manifest constants VM_FREELIST_* to the free list indices.
547
	 *
548
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
549
	 * 0 or 1 to indicate which free lists should be created.
550
	 */
551
#ifdef	VM_DMA32_NPAGES_THRESHOLD
552
	npages = 0;
553
#endif
554
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
555
		seg = &vm_phys_segs[segind];
556
#ifdef	VM_FREELIST_LOWMEM
557
		if (seg->end <= VM_LOWMEM_BOUNDARY)
558
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
559
		else
560
#endif
561
#ifdef	VM_FREELIST_DMA32
562
		if (
563
#ifdef	VM_DMA32_NPAGES_THRESHOLD
564
		    /*
565
		     * Create the DMA32 free list only if the amount of
566
		     * physical memory above physical address 4G exceeds the
567
		     * given threshold.
568
		     */
569
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
570
#endif
571
		    seg->end <= VM_DMA32_BOUNDARY)
572
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
573
		else
574
#endif
575
		{
576
#ifdef	VM_DMA32_NPAGES_THRESHOLD
577
			npages += atop(seg->end - seg->start);
578
#endif
579
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
580
		}
581
	}
582
	/* Change each entry into a running total of the free lists. */
583
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
584
		vm_freelist_to_flind[freelist] +=
585
		    vm_freelist_to_flind[freelist - 1];
586
	}
587
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
588
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
589
	/* Change each entry into a free list index. */
590
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
591
		vm_freelist_to_flind[freelist]--;
592

593
	/*
594
	 * Initialize the first_page and free_queues fields of each physical
595
	 * memory segment.
596
	 */
597
#ifdef VM_PHYSSEG_SPARSE
598
	npages = 0;
599
#endif
600
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
601
		seg = &vm_phys_segs[segind];
602
#ifdef VM_PHYSSEG_SPARSE
603
		seg->first_page = &vm_page_array[npages];
604
		npages += atop(seg->end - seg->start);
605
#else
606
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
607
#endif
608
#ifdef	VM_FREELIST_LOWMEM
609
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
610
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
611
			KASSERT(flind >= 0,
612
			    ("vm_phys_init: LOWMEM flind < 0"));
613
		} else
614
#endif
615
#ifdef	VM_FREELIST_DMA32
616
		if (seg->end <= VM_DMA32_BOUNDARY) {
617
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
618
			KASSERT(flind >= 0,
619
			    ("vm_phys_init: DMA32 flind < 0"));
620
		} else
621
#endif
622
		{
623
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
624
			KASSERT(flind >= 0,
625
			    ("vm_phys_init: DEFAULT flind < 0"));
626
		}
627
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
628
	}
629

630
	/*
631
	 * Coalesce physical memory segments that are contiguous and share the
632
	 * same per-domain free queues.
633
	 */
634
	prev_seg = vm_phys_segs;
635
	seg = &vm_phys_segs[1];
636
	end_seg = &vm_phys_segs[vm_phys_nsegs];
637
	while (seg < end_seg) {
638
		if (prev_seg->end == seg->start &&
639
		    prev_seg->free_queues == seg->free_queues) {
640
			prev_seg->end = seg->end;
641
			KASSERT(prev_seg->domain == seg->domain,
642
			    ("vm_phys_init: free queues cannot span domains"));
643
			vm_phys_nsegs--;
644
			end_seg--;
645
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
646
				*tmp_seg = *(tmp_seg + 1);
647
		} else {
648
			prev_seg = seg;
649
			seg++;
650
		}
651
	}
652

653
	/*
654
	 * Initialize the free queues.
655
	 */
656
	for (dom = 0; dom < vm_ndomains; dom++) {
657
		for (flind = 0; flind < vm_nfreelists; flind++) {
658
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
659
				fl = vm_phys_free_queues[dom][flind][pind];
660
				for (oind = 0; oind < VM_NFREEORDER; oind++)
661
					TAILQ_INIT(&fl[oind].pl);
662
			}
663
		}
664
	}
665

666
#ifdef VM_FREEPOOL_LAZYINIT
667
	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
668
#else
669
	vm_default_freepool = VM_FREEPOOL_DEFAULT;
670
#endif
671

672
	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
673
}
674

675
/*
676
 * Register info about the NUMA topology of the system.
677
 *
678
 * Invoked by platform-dependent code prior to vm_phys_init().
679
 */
680
void
681
vm_phys_register_domains(int ndomains __numa_used,
682
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
683
{
684
#ifdef NUMA
685
	int i;
686

687
	/*
688
	 * For now the only override value that we support is 1, which
689
	 * effectively disables NUMA-awareness in the allocators.
690
	 */
691
	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
692
	if (numa_disabled)
693
		ndomains = 1;
694

695
	if (ndomains > 1) {
696
		vm_ndomains = ndomains;
697
		mem_affinity = affinity;
698
		mem_locality = locality;
699
	}
700

701
	for (i = 0; i < vm_ndomains; i++)
702
		DOMAINSET_SET(i, &all_domains);
703
#endif
704
}
705

706
/*
707
 * Split a contiguous, power of two-sized set of physical pages.
708
 *
709
 * When this function is called by a page allocation function, the caller
710
 * should request insertion at the head unless the order [order, oind) queues
711
 * are known to be empty.  The objective being to reduce the likelihood of
712
 * long-term fragmentation by promoting contemporaneous allocation and
713
 * (hopefully) deallocation.
714
 */
715
static __inline void
716
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
717
    int pool, int tail)
718
{
719
	vm_page_t m_buddy;
720

721
	while (oind > order) {
722
		oind--;
723
		m_buddy = &m[1 << oind];
724
		KASSERT(m_buddy->order == VM_NFREEORDER,
725
		    ("vm_phys_split_pages: page %p has unexpected order %d",
726
		    m_buddy, m_buddy->order));
727
		vm_freelist_add(fl, m_buddy, oind, pool, tail);
728
        }
729
}
730

731
static void
732
vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool,
733
    int tail)
734
{
735
	KASSERT(order >= 0 && order < VM_NFREEORDER,
736
	    ("%s: invalid order %d", __func__, order));
737

738
	vm_freelist_add(fl, m, order, pool, tail);
739
#ifdef VM_FREEPOOL_LAZYINIT
740
	if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) {
741
		vm_page_t m_next;
742
		vm_paddr_t pa;
743
		int npages;
744

745
		npages = 1 << order;
746
		m_next = m + npages;
747
		pa = m->phys_addr + ptoa(npages);
748
		if (pa < vm_phys_segs[m->segind].end) {
749
			vm_page_init_page(m_next, pa, m->segind,
750
			    VM_FREEPOOL_LAZYINIT);
751
		}
752
	}
753
#endif
754
}
755

756
/*
757
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
758
 * aligned and sized set to the specified free list.
759
 *
760
 * When this function is called by a page allocation function, the caller
761
 * should request insertion at the head unless the lower-order queues are
762
 * known to be empty.  The objective being to reduce the likelihood of long-
763
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
764
 * deallocation.
765
 *
766
 * The physical page m's buddy must not be free.
767
 */
768
static void
769
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
770
    int tail)
771
{
772
        int order;
773

774
	KASSERT(npages == 0 ||
775
	    (VM_PAGE_TO_PHYS(m) &
776
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
777
	    ("%s: page %p and npages %u are misaligned",
778
	    __func__, m, npages));
779
        while (npages > 0) {
780
		KASSERT(m->order == VM_NFREEORDER,
781
		    ("%s: page %p has unexpected order %d",
782
		    __func__, m, m->order));
783
		order = ilog2(npages);
784
		KASSERT(order < VM_NFREEORDER,
785
		    ("%s: order %d is out of range", __func__, order));
786
		vm_phys_enq_chunk(fl, m, order, pool, tail);
787
		m += 1 << order;
788
		npages -= 1 << order;
789
	}
790
}
791

792
/*
793
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
794
 * and sized set to the specified free list.
795
 *
796
 * When this function is called by a page allocation function, the caller
797
 * should request insertion at the head unless the lower-order queues are
798
 * known to be empty.  The objective being to reduce the likelihood of long-
799
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
800
 * deallocation.
801
 *
802
 * If npages is zero, this function does nothing and ignores the physical page
803
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
804
 */
805
static vm_page_t
806
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
807
    int tail)
808
{
809
	int order;
810

811
	KASSERT(npages == 0 ||
812
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
813
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
814
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
815
	    m, npages));
816
	while (npages > 0) {
817
		KASSERT(m->order == VM_NFREEORDER,
818
		    ("vm_phys_enq_range: page %p has unexpected order %d",
819
		    m, m->order));
820
		order = ffs(npages) - 1;
821
		vm_phys_enq_chunk(fl, m, order, pool, tail);
822
		m += 1 << order;
823
		npages -= 1 << order;
824
	}
825
	return (m);
826
}
827

828
/*
829
 * Complete initialization a contiguous, power of two-sized set of physical
830
 * pages.
831
 *
832
 * If the pages currently belong to the lazy init pool, then the corresponding
833
 * page structures must be initialized.  In this case it is assumed that the
834
 * first page in the run has already been initialized.
835
 */
836
static void
837
vm_phys_finish_init(vm_page_t m, int order)
838
{
839
#ifdef VM_FREEPOOL_LAZYINIT
840
	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
841
		vm_paddr_t pa;
842
		int segind;
843

844
		TSENTER();
845
		pa = m->phys_addr + PAGE_SIZE;
846
		segind = m->segind;
847
		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
848
		    m_tmp++, pa += PAGE_SIZE)
849
			vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL);
850
		TSEXIT();
851
	}
852
#endif
853
}
854

855
/*
856
 * Tries to allocate the specified number of pages from the specified pool
857
 * within the specified domain.  Returns the actual number of allocated pages
858
 * and a pointer to each page through the array ma[].
859
 *
860
 * The returned pages may not be physically contiguous.  However, in contrast
861
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
862
 * calling this function once to allocate the desired number of pages will
863
 * avoid wasted time in vm_phys_split_pages().  The allocated pages have no
864
 * valid pool field set.
865
 *
866
 * The free page queues for the specified domain must be locked.
867
 */
868
int
869
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
870
{
871
	struct vm_freelist *alt, *fl;
872
	vm_page_t m;
873
	int avail, end, flind, freelist, i, oind, pind;
874

875
	KASSERT(domain >= 0 && domain < vm_ndomains,
876
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
877
	KASSERT(vm_phys_pool_valid(pool),
878
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
879
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
880
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
881
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
882
	i = 0;
883
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
884
		flind = vm_freelist_to_flind[freelist];
885
		if (flind < 0)
886
			continue;
887
		fl = vm_phys_free_queues[domain][flind][pool];
888
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
889
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
890
				vm_freelist_rem(fl, m, oind);
891
				avail = i + (1 << oind);
892
				end = imin(npages, avail);
893
				while (i < end)
894
					ma[i++] = m++;
895
				if (i == npages) {
896
					/*
897
					 * Return excess pages to fl.  Its order
898
					 * [0, oind) queues are empty.
899
					 */
900
					vm_phys_enq_range(m, avail - i, fl,
901
					    pool, 1);
902
					return (npages);
903
				}
904
			}
905
		}
906
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
907
			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
908
			    pind++) {
909
				alt = vm_phys_free_queues[domain][flind][pind];
910
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
911
				    NULL) {
912
					vm_freelist_rem(alt, m, oind);
913
					vm_phys_finish_init(m, oind);
914
					avail = i + (1 << oind);
915
					end = imin(npages, avail);
916
					while (i < end)
917
						ma[i++] = m++;
918
					if (i == npages) {
919
						/*
920
						 * Return excess pages to fl.
921
						 * Its order [0, oind) queues
922
						 * are empty.
923
						 */
924
						vm_phys_enq_range(m, avail - i,
925
						    fl, pool, 1);
926
						return (npages);
927
					}
928
				}
929
			}
930
		}
931
	}
932
	return (i);
933
}
934

935
/*
936
 * Allocate a contiguous, power of two-sized set of physical pages from the
937
 * specified free list.  The free list must be specified using one of the
938
 * manifest constants VM_FREELIST_*.
939
 *
940
 * The free page queues must be locked.
941
 */
942
static vm_page_t
943
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
944
{
945
	struct vm_freelist *alt, *fl;
946
	vm_page_t m;
947
	int oind, pind, flind;
948

949
	KASSERT(domain >= 0 && domain < vm_ndomains,
950
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
951
	    domain));
952
	KASSERT(freelist < VM_NFREELIST,
953
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
954
	    freelist));
955
	KASSERT(vm_phys_pool_valid(pool),
956
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
957
	KASSERT(order < VM_NFREEORDER,
958
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
959

960
	flind = vm_freelist_to_flind[freelist];
961
	/* Check if freelist is present */
962
	if (flind < 0)
963
		return (NULL);
964

965
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
966
	fl = &vm_phys_free_queues[domain][flind][pool][0];
967
	for (oind = order; oind < VM_NFREEORDER; oind++) {
968
		m = TAILQ_FIRST(&fl[oind].pl);
969
		if (m != NULL) {
970
			vm_freelist_rem(fl, m, oind);
971
			/* The order [order, oind) queues are empty. */
972
			vm_phys_split_pages(m, oind, fl, order, pool, 1);
973
			return (m);
974
		}
975
	}
976

977
	/*
978
	 * The given pool was empty.  Find the largest
979
	 * contiguous, power-of-two-sized set of pages in any
980
	 * pool.  Transfer these pages to the given pool, and
981
	 * use them to satisfy the allocation.
982
	 */
983
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
984
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
985
			alt = &vm_phys_free_queues[domain][flind][pind][0];
986
			m = TAILQ_FIRST(&alt[oind].pl);
987
			if (m != NULL) {
988
				vm_freelist_rem(alt, m, oind);
989
				vm_phys_finish_init(m, oind);
990
				/* The order [order, oind) queues are empty. */
991
				vm_phys_split_pages(m, oind, fl, order, pool, 1);
992
				return (m);
993
			}
994
		}
995
	}
996
	return (NULL);
997
}
998

999
/*
1000
 * Allocate a contiguous, power of two-sized set of physical pages
1001
 * from the free lists.
1002
 *
1003
 * The free page queues must be locked.
1004
 */
1005
vm_page_t
1006
vm_phys_alloc_pages(int domain, int pool, int order)
1007
{
1008
	vm_page_t m;
1009
	int freelist;
1010

1011
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
1012
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
1013
		if (m != NULL)
1014
			return (m);
1015
	}
1016
	return (NULL);
1017
}
1018

1019
/*
1020
 * Find the vm_page corresponding to the given physical address, which must lie
1021
 * within the given physical memory segment.
1022
 */
1023
vm_page_t
1024
vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
1025
{
1026
	KASSERT(pa >= seg->start && pa < seg->end,
1027
	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));
1028

1029
	return (&seg->first_page[atop(pa - seg->start)]);
1030
}
1031

1032
/*
1033
 * Find the vm_page corresponding to the given physical address.
1034
 */
1035
vm_page_t
1036
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
1037
{
1038
	struct vm_phys_seg *seg;
1039

1040
	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
1041
		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
1042
	return (NULL);
1043
}
1044

1045
vm_page_t
1046
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
1047
{
1048
	struct vm_phys_fictitious_seg tmp, *seg;
1049
	vm_page_t m;
1050

1051
	m = NULL;
1052
	tmp.start = pa;
1053
	tmp.end = 0;
1054

1055
	rw_rlock(&vm_phys_fictitious_reg_lock);
1056
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1057
	rw_runlock(&vm_phys_fictitious_reg_lock);
1058
	if (seg == NULL)
1059
		return (NULL);
1060

1061
	m = &seg->first_page[atop(pa - seg->start)];
1062
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
1063

1064
	return (m);
1065
}
1066

1067
static inline void
1068
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
1069
    long page_count, vm_memattr_t memattr)
1070
{
1071
	long i;
1072

1073
	bzero(range, page_count * sizeof(*range));
1074
	for (i = 0; i < page_count; i++) {
1075
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
1076
		range[i].oflags &= ~VPO_UNMANAGED;
1077
		range[i].busy_lock = VPB_UNBUSIED;
1078
	}
1079
}
1080

1081
int
1082
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
1083
    vm_memattr_t memattr)
1084
{
1085
	struct vm_phys_fictitious_seg *seg;
1086
	vm_page_t fp;
1087
	long page_count;
1088
#ifdef VM_PHYSSEG_DENSE
1089
	long pi, pe;
1090
	long dpage_count;
1091
#endif
1092

1093
	KASSERT(start < end,
1094
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
1095
	    (uintmax_t)start, (uintmax_t)end));
1096

1097
	page_count = (end - start) / PAGE_SIZE;
1098

1099
#ifdef VM_PHYSSEG_DENSE
1100
	pi = atop(start);
1101
	pe = atop(end);
1102
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1103
		fp = &vm_page_array[pi - first_page];
1104
		if ((pe - first_page) > vm_page_array_size) {
1105
			/*
1106
			 * We have a segment that starts inside
1107
			 * of vm_page_array, but ends outside of it.
1108
			 *
1109
			 * Use vm_page_array pages for those that are
1110
			 * inside of the vm_page_array range, and
1111
			 * allocate the remaining ones.
1112
			 */
1113
			dpage_count = vm_page_array_size - (pi - first_page);
1114
			vm_phys_fictitious_init_range(fp, start, dpage_count,
1115
			    memattr);
1116
			page_count -= dpage_count;
1117
			start += ptoa(dpage_count);
1118
			goto alloc;
1119
		}
1120
		/*
1121
		 * We can allocate the full range from vm_page_array,
1122
		 * so there's no need to register the range in the tree.
1123
		 */
1124
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
1125
		return (0);
1126
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1127
		/*
1128
		 * We have a segment that ends inside of vm_page_array,
1129
		 * but starts outside of it.
1130
		 */
1131
		fp = &vm_page_array[0];
1132
		dpage_count = pe - first_page;
1133
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
1134
		    memattr);
1135
		end -= ptoa(dpage_count);
1136
		page_count -= dpage_count;
1137
		goto alloc;
1138
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1139
		/*
1140
		 * Trying to register a fictitious range that expands before
1141
		 * and after vm_page_array.
1142
		 */
1143
		return (EINVAL);
1144
	} else {
1145
alloc:
1146
#endif
1147
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
1148
		    M_WAITOK);
1149
#ifdef VM_PHYSSEG_DENSE
1150
	}
1151
#endif
1152
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
1153

1154
	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
1155
	seg->start = start;
1156
	seg->end = end;
1157
	seg->first_page = fp;
1158

1159
	rw_wlock(&vm_phys_fictitious_reg_lock);
1160
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
1161
	rw_wunlock(&vm_phys_fictitious_reg_lock);
1162

1163
	return (0);
1164
}
1165

1166
void
1167
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
1168
{
1169
	struct vm_phys_fictitious_seg *seg, tmp;
1170
#ifdef VM_PHYSSEG_DENSE
1171
	long pi, pe;
1172
#endif
1173

1174
	KASSERT(start < end,
1175
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
1176
	    (uintmax_t)start, (uintmax_t)end));
1177

1178
#ifdef VM_PHYSSEG_DENSE
1179
	pi = atop(start);
1180
	pe = atop(end);
1181
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1182
		if ((pe - first_page) <= vm_page_array_size) {
1183
			/*
1184
			 * This segment was allocated using vm_page_array
1185
			 * only, there's nothing to do since those pages
1186
			 * were never added to the tree.
1187
			 */
1188
			return;
1189
		}
1190
		/*
1191
		 * We have a segment that starts inside
1192
		 * of vm_page_array, but ends outside of it.
1193
		 *
1194
		 * Calculate how many pages were added to the
1195
		 * tree and free them.
1196
		 */
1197
		start = ptoa(first_page + vm_page_array_size);
1198
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1199
		/*
1200
		 * We have a segment that ends inside of vm_page_array,
1201
		 * but starts outside of it.
1202
		 */
1203
		end = ptoa(first_page);
1204
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1205
		/* Since it's not possible to register such a range, panic. */
1206
		panic(
1207
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
1208
		    (uintmax_t)start, (uintmax_t)end);
1209
	}
1210
#endif
1211
	tmp.start = start;
1212
	tmp.end = 0;
1213

1214
	rw_wlock(&vm_phys_fictitious_reg_lock);
1215
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1216
	if (seg->start != start || seg->end != end) {
1217
		rw_wunlock(&vm_phys_fictitious_reg_lock);
1218
		panic(
1219
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
1220
		    (uintmax_t)start, (uintmax_t)end);
1221
	}
1222
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
1223
	rw_wunlock(&vm_phys_fictitious_reg_lock);
1224
	free(seg->first_page, M_FICT_PAGES);
1225
	free(seg, M_FICT_PAGES);
1226
}
1227

1228
/*
1229
 * Free a contiguous, power of two-sized set of physical pages.
1230
 * The pool field in the first page determines the destination pool.
1231
 *
1232
 * The free page queues must be locked.
1233
 */
1234
void
1235
vm_phys_free_pages(vm_page_t m, int pool, int order)
1236
{
1237
	struct vm_freelist *fl;
1238
	struct vm_phys_seg *seg;
1239
	vm_paddr_t pa;
1240
	vm_page_t m_buddy;
1241

1242
	KASSERT(m->order == VM_NFREEORDER,
1243
	    ("%s: page %p has unexpected order %d",
1244
	    __func__, m, m->order));
1245
	KASSERT(vm_phys_pool_valid(pool),
1246
	    ("%s: unexpected pool param %d", __func__, pool));
1247
	KASSERT(order < VM_NFREEORDER,
1248
	    ("%s: order %d is out of range", __func__, order));
1249
	seg = &vm_phys_segs[m->segind];
1250
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1251
	if (order < VM_NFREEORDER - 1) {
1252
		pa = VM_PAGE_TO_PHYS(m);
1253
		do {
1254
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
1255
			if (pa < seg->start || pa >= seg->end)
1256
				break;
1257
			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
1258
			if (m_buddy->order != order)
1259
				break;
1260
			fl = (*seg->free_queues)[m_buddy->pool];
1261
			vm_freelist_rem(fl, m_buddy, order);
1262
			vm_phys_finish_init(m_buddy, order);
1263
			order++;
1264
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
1265
			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
1266
		} while (order < VM_NFREEORDER - 1);
1267
	}
1268
	fl = (*seg->free_queues)[pool];
1269
	vm_freelist_add(fl, m, order, pool, 1);
1270
}
1271

1272
#ifdef VM_FREEPOOL_LAZYINIT
1273
/*
1274
 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
1275
 * them to the default pool.  This is a prerequisite for some rare operations
1276
 * which need to scan the page array and thus depend on all pages being
1277
 * initialized.
1278
 */
1279
static void
1280
vm_phys_lazy_init_domain(int domain, bool locked)
1281
{
1282
	static bool initdone[MAXMEMDOM];
1283
	struct vm_domain *vmd;
1284
	struct vm_freelist *fl;
1285
	vm_page_t m;
1286
	int pind;
1287
	bool unlocked;
1288

1289
	if (__predict_true(atomic_load_bool(&initdone[domain])))
1290
		return;
1291

1292
	vmd = VM_DOMAIN(domain);
1293
	if (locked)
1294
		vm_domain_free_assert_locked(vmd);
1295
	else
1296
		vm_domain_free_lock(vmd);
1297
	if (atomic_load_bool(&initdone[domain]))
1298
		goto out;
1299
	pind = VM_FREEPOOL_LAZYINIT;
1300
	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
1301
		int flind;
1302

1303
		flind = vm_freelist_to_flind[freelist];
1304
		if (flind < 0)
1305
			continue;
1306
		fl = vm_phys_free_queues[domain][flind][pind];
1307
		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
1308
			if (atomic_load_int(&fl[oind].lcnt) == 0)
1309
				continue;
1310
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
1311
				/*
1312
				 * Avoid holding the lock across the
1313
				 * initialization unless there's a free page
1314
				 * shortage.
1315
				 */
1316
				vm_freelist_rem(fl, m, oind);
1317
				unlocked = vm_domain_allocate(vmd,
1318
				    VM_ALLOC_NORMAL, 1 << oind);
1319
				if (unlocked)
1320
					vm_domain_free_unlock(vmd);
1321
				vm_phys_finish_init(m, oind);
1322
				if (unlocked) {
1323
					vm_domain_freecnt_inc(vmd, 1 << oind);
1324
					vm_domain_free_lock(vmd);
1325
				}
1326
				vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT,
1327
				    oind);
1328
			}
1329
		}
1330
	}
1331
	atomic_store_bool(&initdone[domain], true);
1332
out:
1333
	if (!locked)
1334
		vm_domain_free_unlock(vmd);
1335
}
1336

1337
static void
1338
vm_phys_lazy_init(void)
1339
{
1340
	for (int domain = 0; domain < vm_ndomains; domain++)
1341
		vm_phys_lazy_init_domain(domain, false);
1342
	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
1343
}
1344

1345
static void
1346
vm_phys_lazy_init_kthr(void *arg __unused)
1347
{
1348
	vm_phys_lazy_init();
1349
	kthread_exit();
1350
}
1351

1352
static void
1353
vm_phys_lazy_sysinit(void *arg __unused)
1354
{
1355
	struct thread *td;
1356
	int error;
1357

1358
	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
1359
	    RFSTOPPED, 0, "vmlazyinit");
1360
	if (error == 0) {
1361
		thread_lock(td);
1362
		sched_prio(td, PRI_MIN_IDLE);
1363
		sched_add(td, SRQ_BORING);
1364
	} else {
1365
		printf("%s: could not create lazy init thread: %d\n",
1366
		    __func__, error);
1367
		vm_phys_lazy_init();
1368
	}
1369
}
1370
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
1371
    NULL);
1372
#endif /* VM_FREEPOOL_LAZYINIT */
1373

1374
/*
1375
 * Free a contiguous, arbitrarily sized set of physical pages, without
1376
 * merging across set boundaries.  Assumes no pages have a valid pool field.
1377
 *
1378
 * The free page queues must be locked.
1379
 */
1380
void
1381
vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
1382
{
1383
	struct vm_freelist *fl;
1384
	struct vm_phys_seg *seg;
1385
	vm_page_t m_end;
1386
	vm_paddr_t diff, lo;
1387
	int order;
1388

1389
	/*
1390
	 * Avoid unnecessary coalescing by freeing the pages in the largest
1391
	 * possible power-of-two-sized subsets.
1392
	 */
1393
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1394
	seg = &vm_phys_segs[m->segind];
1395
	fl = (*seg->free_queues)[pool];
1396
	m_end = m + npages;
1397
	/* Free blocks of increasing size. */
1398
	lo = atop(VM_PAGE_TO_PHYS(m));
1399
	if (m < m_end &&
1400
	    (diff = lo ^ (lo + npages - 1)) != 0) {
1401
		order = min(ilog2(diff), VM_NFREEORDER - 1);
1402
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
1403
		    pool, 1);
1404
	}
1405

1406
	/* Free blocks of maximum size. */
1407
	order = VM_NFREEORDER - 1;
1408
	while (m + (1 << order) <= m_end) {
1409
		KASSERT(seg == &vm_phys_segs[m->segind],
1410
		    ("%s: page range [%p,%p) spans multiple segments",
1411
		    __func__, m_end - npages, m));
1412
		vm_phys_enq_chunk(fl, m, order, pool, 1);
1413
		m += 1 << order;
1414
	}
1415
	/* Free blocks of diminishing size. */
1416
	vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
1417
}
1418

1419
/*
1420
 * Free a contiguous, arbitrarily sized set of physical pages.
1421
 * Assumes that every page but the first has no valid pool field.
1422
 * Uses the pool value in the first page if valid, otherwise default.
1423
 *
1424
 * The free page queues must be locked.
1425
 */
1426
void
1427
vm_phys_free_contig(vm_page_t m, int pool, u_long npages)
1428
{
1429
	vm_paddr_t lo;
1430
	vm_page_t m_start, m_end;
1431
	unsigned max_order, order_start, order_end;
1432

1433
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1434

1435
	lo = atop(VM_PAGE_TO_PHYS(m));
1436
	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);
1437

1438
	m_start = m;
1439
	order_start = ffsll(lo) - 1;
1440
	if (order_start < max_order)
1441
		m_start += 1 << order_start;
1442
	m_end = m + npages;
1443
	order_end = ffsll(lo + npages) - 1;
1444
	if (order_end < max_order)
1445
		m_end -= 1 << order_end;
1446
	/*
1447
	 * Avoid unnecessary coalescing by freeing the pages at the start and
1448
	 * end of the range last.
1449
	 */
1450
	if (m_start < m_end)
1451
		vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
1452
	if (order_start < max_order)
1453
		vm_phys_free_pages(m, pool, order_start);
1454
	if (order_end < max_order)
1455
		vm_phys_free_pages(m_end, pool, order_end);
1456
}
1457

1458
/*
1459
 * Identify the first address range within segment segind or greater
1460
 * that matches the domain, lies within the low/high range, and has
1461
 * enough pages.  Return -1 if there is none.
1462
 */
1463
int
1464
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
1465
    u_long npages, vm_paddr_t low, vm_paddr_t high)
1466
{
1467
	vm_paddr_t pa_end, pa_start;
1468
	struct vm_phys_seg *end_seg, *seg;
1469

1470
	KASSERT(npages > 0, ("npages is zero"));
1471
	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
1472
	end_seg = &vm_phys_segs[vm_phys_nsegs];
1473
	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
1474
		if (seg->domain != domain)
1475
			continue;
1476
		if (seg->start >= high)
1477
			return (-1);
1478
		pa_start = MAX(low, seg->start);
1479
		pa_end = MIN(high, seg->end);
1480
		if (pa_end - pa_start < ptoa(npages))
1481
			continue;
1482
#ifdef VM_FREEPOOL_LAZYINIT
1483
		/*
1484
		 * The pages on the free lists must be initialized.
1485
		 */
1486
		vm_phys_lazy_init_domain(domain, false);
1487
#endif
1488
		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
1489
		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
1490
		return (seg - vm_phys_segs);
1491
	}
1492
	return (-1);
1493
}
1494

1495
/*
1496
 * Search for the given physical page "m" in the free lists.  If the search
1497
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
1498
 * false, indicating that "m" is not in the free lists.
1499
 *
1500
 * The free page queues must be locked.
1501
 */
1502
bool
1503
vm_phys_unfree_page(vm_paddr_t pa)
1504
{
1505
	struct vm_freelist *fl;
1506
	struct vm_phys_seg *seg;
1507
	vm_paddr_t pa_half;
1508
	vm_page_t m, m_set, m_tmp;
1509
	int order, pool;
1510

1511
	seg = vm_phys_paddr_to_seg(pa);
1512
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1513

1514
#ifdef VM_FREEPOOL_LAZYINIT
1515
	/*
1516
	 * The pages on the free lists must be initialized.
1517
	 */
1518
	vm_phys_lazy_init_domain(seg->domain, true);
1519
#endif
1520

1521
	/*
1522
	 * First, find the contiguous, power of two-sized set of free
1523
	 * physical pages containing the given physical page "m" and
1524
	 * assign it to "m_set".
1525
	 */
1526
	m = vm_phys_paddr_to_vm_page(pa);
1527
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1528
	    order < VM_NFREEORDER - 1; ) {
1529
		order++;
1530
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1531
		if (pa >= seg->start)
1532
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
1533
		else
1534
			return (false);
1535
	}
1536
	if (m_set->order < order)
1537
		return (false);
1538
	if (m_set->order == VM_NFREEORDER)
1539
		return (false);
1540
	KASSERT(m_set->order < VM_NFREEORDER,
1541
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
1542
	    m_set, m_set->order));
1543

1544
	/*
1545
	 * Next, remove "m_set" from the free lists.  Finally, extract
1546
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
1547
	 * is larger than a page, shrink "m_set" by returning the half
1548
	 * of "m_set" that does not contain "m" to the free lists.
1549
	 */
1550
	pool = m_set->pool;
1551
	fl = (*seg->free_queues)[pool];
1552
	order = m_set->order;
1553
	vm_freelist_rem(fl, m_set, order);
1554
	while (order > 0) {
1555
		order--;
1556
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1557
		if (m->phys_addr < pa_half)
1558
			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
1559
		else {
1560
			m_tmp = m_set;
1561
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
1562
		}
1563
		vm_freelist_add(fl, m_tmp, order, pool, 0);
1564
	}
1565
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1566
	return (true);
1567
}
1568

1569
/*
1570
 * Find a run of contiguous physical pages, meeting alignment requirements, from
1571
 * a list of max-sized page blocks, where we need at least two consecutive
1572
 * blocks to satisfy the (large) page request.
1573
 */
1574
static vm_page_t
1575
vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
1576
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1577
{
1578
	struct vm_phys_seg *seg;
1579
	vm_page_t m, m_iter, m_ret;
1580
	vm_paddr_t max_size, size;
1581
	int max_order;
1582

1583
	max_order = VM_NFREEORDER - 1;
1584
	size = npages << PAGE_SHIFT;
1585
	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
1586
	KASSERT(size > max_size, ("size is too small"));
1587

1588
	/*
1589
	 * In order to avoid examining any free max-sized page block more than
1590
	 * twice, identify the ones that are first in a physically-contiguous
1591
	 * sequence of such blocks, and only for those walk the sequence to
1592
	 * check if there are enough free blocks starting at a properly aligned
1593
	 * block.  Thus, no block is checked for free-ness more than twice.
1594
	 */
1595
	TAILQ_FOREACH(m, &fl[max_order].pl, plinks.q) {
1596
		/*
1597
		 * Skip m unless it is first in a sequence of free max page
1598
		 * blocks >= low in its segment.
1599
		 */
1600
		seg = &vm_phys_segs[m->segind];
1601
		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
1602
			continue;
1603
		if (VM_PAGE_TO_PHYS(m) >= max_size &&
1604
		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
1605
		    max_order == m[-1 << max_order].order)
1606
			continue;
1607

1608
		/*
1609
		 * Advance m_ret from m to the first of the sequence, if any,
1610
		 * that satisfies alignment conditions and might leave enough
1611
		 * space.
1612
		 */
1613
		m_ret = m;
1614
		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
1615
		    size, alignment, boundary) &&
1616
		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
1617
		    max_order == m_ret[1 << max_order].order)
1618
			m_ret += 1 << max_order;
1619

1620
		/*
1621
		 * Skip m unless some block m_ret in the sequence is properly
1622
		 * aligned, and begins a sequence of enough pages less than
1623
		 * high, and in the same segment.
1624
		 */
1625
		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
1626
			continue;
1627

1628
		/*
1629
		 * Skip m unless the blocks to allocate starting at m_ret are
1630
		 * all free.
1631
		 */
1632
		for (m_iter = m_ret;
1633
		    m_iter < m_ret + npages && max_order == m_iter->order;
1634
		    m_iter += 1 << max_order) {
1635
		}
1636
		if (m_iter < m_ret + npages)
1637
			continue;
1638
		return (m_ret);
1639
	}
1640
	return (NULL);
1641
}
1642

1643
/*
1644
 * Find a run of contiguous physical pages from the specified free list
1645
 * table.
1646
 */
1647
static vm_page_t
1648
vm_phys_find_queues_contig(
1649
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
1650
    u_long npages, vm_paddr_t low, vm_paddr_t high,
1651
    u_long alignment, vm_paddr_t boundary)
1652
{
1653
	struct vm_freelist *fl;
1654
	vm_page_t m_ret;
1655
	vm_paddr_t pa, pa_end, size;
1656
	int oind, order, pind;
1657

1658
	KASSERT(npages > 0, ("npages is 0"));
1659
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1660
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1661
	/* Compute the queue that is the best fit for npages. */
1662
	order = flsl(npages - 1);
1663
	/* Search for a large enough free block. */
1664
	size = npages << PAGE_SHIFT;
1665
	for (oind = order; oind < VM_NFREEORDER; oind++) {
1666
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1667
			fl = (*queues)[pind];
1668
			TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
1669
				/*
1670
				 * Determine if the address range starting at pa
1671
				 * is within the given range, satisfies the
1672
				 * given alignment, and does not cross the given
1673
				 * boundary.
1674
				 */
1675
				pa = VM_PAGE_TO_PHYS(m_ret);
1676
				pa_end = pa + size;
1677
				if (low <= pa && pa_end <= high &&
1678
				    vm_addr_ok(pa, size, alignment, boundary))
1679
					return (m_ret);
1680
			}
1681
		}
1682
	}
1683
	if (order < VM_NFREEORDER)
1684
		return (NULL);
1685
	/* Search for a long-enough sequence of max-order blocks. */
1686
	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1687
		fl = (*queues)[pind];
1688
		m_ret = vm_phys_find_freelist_contig(fl, npages,
1689
		    low, high, alignment, boundary);
1690
		if (m_ret != NULL)
1691
			return (m_ret);
1692
	}
1693
	return (NULL);
1694
}
1695

1696
/*
1697
 * Allocate a contiguous set of physical pages of the given size
1698
 * "npages" from the free lists.  All of the physical pages must be at
1699
 * or above the given physical address "low" and below the given
1700
 * physical address "high".  The given value "alignment" determines the
1701
 * alignment of the first physical page in the set.  If the given value
1702
 * "boundary" is non-zero, then the set of physical pages cannot cross
1703
 * any physical address boundary that is a multiple of that value.  Both
1704
 * "alignment" and "boundary" must be a power of two.  Sets the pool
1705
 * field to DEFAULT in the first allocated page.
1706
 */
1707
vm_page_t
1708
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
1709
    u_long alignment, vm_paddr_t boundary)
1710
{
1711
	vm_paddr_t pa_end, pa_start;
1712
	struct vm_freelist *fl;
1713
	vm_page_t m, m_run;
1714
	struct vm_phys_seg *seg;
1715
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
1716
	int oind, segind;
1717

1718
	KASSERT(npages > 0, ("npages is 0"));
1719
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1720
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1721
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
1722
	if (low >= high)
1723
		return (NULL);
1724
	queues = NULL;
1725
	m_run = NULL;
1726
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1727
		seg = &vm_phys_segs[segind];
1728
		if (seg->start >= high || seg->domain != domain)
1729
			continue;
1730
		if (low >= seg->end)
1731
			break;
1732
		if (low <= seg->start)
1733
			pa_start = seg->start;
1734
		else
1735
			pa_start = low;
1736
		if (high < seg->end)
1737
			pa_end = high;
1738
		else
1739
			pa_end = seg->end;
1740
		if (pa_end - pa_start < ptoa(npages))
1741
			continue;
1742
		/*
1743
		 * If a previous segment led to a search using
1744
		 * the same free lists as would this segment, then
1745
		 * we've actually already searched within this
1746
		 * too.  So skip it.
1747
		 */
1748
		if (seg->free_queues == queues)
1749
			continue;
1750
		queues = seg->free_queues;
1751
		m_run = vm_phys_find_queues_contig(queues, npages,
1752
		    low, high, alignment, boundary);
1753
		if (m_run != NULL)
1754
			break;
1755
	}
1756
	if (m_run == NULL)
1757
		return (NULL);
1758

1759
	/* Allocate pages from the page-range found. */
1760
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
1761
		fl = (*queues)[m->pool];
1762
		oind = m->order;
1763
		vm_freelist_rem(fl, m, oind);
1764
		vm_phys_finish_init(m, oind);
1765
	}
1766
	/* Return excess pages to the free lists. */
1767
	fl = (*queues)[VM_FREEPOOL_DEFAULT];
1768
	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
1769
	    VM_FREEPOOL_DEFAULT, 0);
1770

1771
	/* Return page verified to satisfy conditions of request. */
1772
	pa_start = VM_PAGE_TO_PHYS(m_run);
1773
	KASSERT(low <= pa_start,
1774
	    ("memory allocated below minimum requested range"));
1775
	KASSERT(pa_start + ptoa(npages) <= high,
1776
	    ("memory allocated above maximum requested range"));
1777
	seg = &vm_phys_segs[m_run->segind];
1778
	KASSERT(seg->domain == domain,
1779
	    ("memory not allocated from specified domain"));
1780
	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
1781
	    ("memory alignment/boundary constraints not satisfied"));
1782
	return (m_run);
1783
}
1784

1785
/*
1786
 * Return the index of the first unused slot which may be the terminating
1787
 * entry.
1788
 */
1789
static int
1790
vm_phys_avail_count(void)
1791
{
1792
	int i;
1793

1794
	for (i = 0; i < PHYS_AVAIL_COUNT; i += 2)
1795
		if (phys_avail[i] == 0 && phys_avail[i + 1] == 0)
1796
			return (i);
1797
	panic("Improperly terminated phys_avail[]");
1798
}
1799

1800
/*
1801
 * Assert that a phys_avail entry is valid.
1802
 */
1803
static void
1804
vm_phys_avail_check(int i)
1805
{
1806
	if (i % 2 != 0)
1807
		panic("Chunk start index %d is not even.", i);
1808
	if (phys_avail[i] & PAGE_MASK)
1809
		panic("Unaligned phys_avail[%d]: %#jx", i,
1810
		    (intmax_t)phys_avail[i]);
1811
	if (phys_avail[i + 1] & PAGE_MASK)
1812
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
1813
		    (intmax_t)phys_avail[i + 1]);
1814
	if (phys_avail[i + 1] < phys_avail[i])
1815
		panic("phys_avail[%d]: start %#jx > end %#jx", i,
1816
		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
1817
}
1818

1819
/*
 * Return the index of the phys_avail entry containing the address "pa", or
 * -1 if there is none.  The scan stops at the first entry whose end is zero.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int slot;

	for (slot = 0; phys_avail[slot + 1] != 0; slot += 2) {
		if (pa >= phys_avail[slot] && pa < phys_avail[slot + 1])
			return (slot);
	}
	return (-1);
}
#endif
1834

1835
/*
1836
 * Return the index of the largest entry.
1837
 */
1838
int
1839
vm_phys_avail_largest(void)
1840
{
1841
	vm_paddr_t sz, largesz;
1842
	int largest;
1843
	int i;
1844

1845
	largest = 0;
1846
	largesz = 0;
1847
	for (i = 0; phys_avail[i + 1]; i += 2) {
1848
		sz = vm_phys_avail_size(i);
1849
		if (sz > largesz) {
1850
			largesz = sz;
1851
			largest = i;
1852
		}
1853
	}
1854

1855
	return (largest);
1856
}
1857

1858
vm_paddr_t
1859
vm_phys_avail_size(int i)
1860
{
1861

1862
	return (phys_avail[i + 1] - phys_avail[i]);
1863
}
1864

1865
/*
1866
 * Split a chunk in phys_avail[] at the address 'pa'.
1867
 *
1868
 * 'pa' must be within a chunk (slots i and i + 1) or one of its boundaries.
1869
 * Returns zero on actual split, in which case the two new chunks occupy slots
1870
 * i to i + 3, else EJUSTRETURN if 'pa' was one of the boundaries (and no split
1871
 * actually occurred) else ENOSPC if there are not enough slots in phys_avail[]
1872
 * to represent the additional chunk caused by the split.
1873
 */
1874
static int
1875
vm_phys_avail_split(vm_paddr_t pa, int i)
1876
{
1877
	int cnt;
1878

1879
	vm_phys_avail_check(i);
1880
	if (pa < phys_avail[i] || pa > phys_avail[i + 1])
1881
		panic("%s: Address %#jx not in range at slot %d [%#jx;%#jx].",
1882
		    __func__, (uintmax_t)pa, i,
1883
		    (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]);
1884
	if (pa == phys_avail[i] || pa == phys_avail[i + 1])
1885
		return (EJUSTRETURN);
1886
	cnt = vm_phys_avail_count();
1887
	if (cnt >= PHYS_AVAIL_ENTRIES)
1888
		return (ENOSPC);
1889
	memmove(&phys_avail[i + 2], &phys_avail[i],
1890
	    (cnt - i) * sizeof(phys_avail[0]));
1891
	phys_avail[i + 1] = pa;
1892
	phys_avail[i + 2] = pa;
1893
	vm_phys_avail_check(i);
1894
	vm_phys_avail_check(i+2);
1895

1896
	return (0);
1897
}
1898

1899
/*
1900
 * Check if a given physical address can be included as part of a crash dump.
1901
 */
1902
bool
1903
vm_phys_is_dumpable(vm_paddr_t pa)
1904
{
1905
	vm_page_t m;
1906
	int i;
1907

1908
	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
1909
		return ((m->flags & PG_NODUMP) == 0);
1910

1911
	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
1912
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
1913
			return (true);
1914
	}
1915
	return (false);
1916
}
1917

1918
void
1919
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
1920
{
1921
	struct vm_phys_seg *seg;
1922

1923
	if (vm_phys_early_nsegs == -1)
1924
		panic("%s: called after initialization", __func__);
1925
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
1926
		panic("%s: ran out of early segments", __func__);
1927

1928
	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
1929
	seg->start = start;
1930
	seg->end = end;
1931
}
1932

1933
/*
1934
 * This routine allocates NUMA node specific memory before the page
1935
 * allocator is bootstrapped.
1936
 */
1937
vm_paddr_t
1938
vm_phys_early_alloc(int domain, size_t alloc_size)
1939
{
1940
#ifdef NUMA
1941
	int mem_index;
1942
#endif
1943
	int i, biggestone;
1944
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;
1945

1946
	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
1947
	    ("%s: invalid domain index %d", __func__, domain));
1948

1949
	/*
1950
	 * Search the mem_affinity array for the biggest address
1951
	 * range in the desired domain.  This is used to constrain
1952
	 * the phys_avail selection below.
1953
	 */
1954
	biggestsize = 0;
1955
	mem_start = 0;
1956
	mem_end = -1;
1957
#ifdef NUMA
1958
	mem_index = 0;
1959
	if (mem_affinity != NULL) {
1960
		for (i = 0;; i++) {
1961
			size = mem_affinity[i].end - mem_affinity[i].start;
1962
			if (size == 0)
1963
				break;
1964
			if (domain != -1 && mem_affinity[i].domain != domain)
1965
				continue;
1966
			if (size > biggestsize) {
1967
				mem_index = i;
1968
				biggestsize = size;
1969
			}
1970
		}
1971
		mem_start = mem_affinity[mem_index].start;
1972
		mem_end = mem_affinity[mem_index].end;
1973
	}
1974
#endif
1975

1976
	/*
1977
	 * Now find biggest physical segment in within the desired
1978
	 * numa domain.
1979
	 */
1980
	biggestsize = 0;
1981
	biggestone = 0;
1982
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1983
		/* skip regions that are out of range */
1984
		if (phys_avail[i+1] - alloc_size < mem_start ||
1985
		    phys_avail[i+1] > mem_end)
1986
			continue;
1987
		size = vm_phys_avail_size(i);
1988
		if (size > biggestsize) {
1989
			biggestone = i;
1990
			biggestsize = size;
1991
		}
1992
	}
1993
	alloc_size = round_page(alloc_size);
1994

1995
	/*
1996
	 * Grab single pages from the front to reduce fragmentation.
1997
	 */
1998
	if (alloc_size == PAGE_SIZE) {
1999
		pa = phys_avail[biggestone];
2000
		phys_avail[biggestone] += PAGE_SIZE;
2001
		vm_phys_avail_check(biggestone);
2002
		return (pa);
2003
	}
2004

2005
	/*
2006
	 * Naturally align large allocations.
2007
	 */
2008
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
2009
	if (alloc_size + align > biggestsize)
2010
		panic("cannot find a large enough size\n");
2011
	if (align != 0 &&
2012
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
2013
	    biggestone) != 0)
2014
		/* Wasting memory. */
2015
		phys_avail[biggestone + 1] -= align;
2016

2017
	phys_avail[biggestone + 1] -= alloc_size;
2018
	vm_phys_avail_check(biggestone);
2019
	pa = phys_avail[biggestone + 1];
2020
	return (pa);
2021
}
2022

2023
void
2024
vm_phys_early_startup(void)
2025
{
2026
	struct vm_phys_seg *seg;
2027
	int i;
2028

2029
	if (phys_avail[1] == 0)
2030
		panic("phys_avail[] is empty");
2031

2032
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2033
		phys_avail[i] = round_page(phys_avail[i]);
2034
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
2035
	}
2036

2037
	for (i = 0; i < vm_phys_early_nsegs; i++) {
2038
		seg = &vm_phys_early_segs[i];
2039
		vm_phys_add_seg(seg->start, seg->end);
2040
	}
2041
	vm_phys_early_nsegs = -1;
2042

2043
#ifdef NUMA
2044
	/* Force phys_avail to be split by domain. */
2045
	if (mem_affinity != NULL) {
2046
		int idx;
2047

2048
		for (i = 0; mem_affinity[i].end != 0; i++) {
2049
			idx = vm_phys_avail_find(mem_affinity[i].start);
2050
			if (idx != -1)
2051
				vm_phys_avail_split(mem_affinity[i].start, idx);
2052
			idx = vm_phys_avail_find(mem_affinity[i].end);
2053
			if (idx != -1)
2054
				vm_phys_avail_split(mem_affinity[i].end, idx);
2055
		}
2056
	}
2057
#endif
2058
}
2059

2060
#ifdef DDB
2061
/*
2062
 * Show the number of physical pages in each of the free lists.
2063
 */
2064
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
2065
{
2066
	struct vm_freelist *fl;
2067
	int flind, oind, pind, dom;
2068

2069
	for (dom = 0; dom < vm_ndomains; dom++) {
2070
		db_printf("DOMAIN: %d\n", dom);
2071
		for (flind = 0; flind < vm_nfreelists; flind++) {
2072
			db_printf("FREE LIST %d:\n"
2073
			    "\n  ORDER (SIZE)  |  NUMBER"
2074
			    "\n              ", flind);
2075
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
2076
				db_printf("  |  POOL %d", pind);
2077
			db_printf("\n--            ");
2078
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
2079
				db_printf("-- --      ");
2080
			db_printf("--\n");
2081
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
2082
				db_printf("  %2.2d (%6.6dK)", oind,
2083
				    1 << (PAGE_SHIFT - 10 + oind));
2084
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
2085
				fl = vm_phys_free_queues[dom][flind][pind];
2086
					db_printf("  |  %6.6d", fl[oind].lcnt);
2087
				}
2088
				db_printf("\n");
2089
			}
2090
			db_printf("\n");
2091
		}
2092
		db_printf("\n");
2093
	}
2094
}
2095
#endif
2096

2097
Product

Resources

Company