GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/vm/vm_page.c
1
/*-
2
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3
*
4
* Copyright (c) 1991 Regents of the University of California.
5
* All rights reserved.
6
* Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
7
*
8
* This code is derived from software contributed to Berkeley by
9
* The Mach Operating System project at Carnegie-Mellon University.
10
*
11
* Redistribution and use in source and binary forms, with or without
12
* modification, are permitted provided that the following conditions
13
* are met:
14
* 1. Redistributions of source code must retain the above copyright
15
* notice, this list of conditions and the following disclaimer.
16
* 2. Redistributions in binary form must reproduce the above copyright
17
* notice, this list of conditions and the following disclaimer in the
18
* documentation and/or other materials provided with the distribution.
19
* 3. Neither the name of the University nor the names of its contributors
20
* may be used to endorse or promote products derived from this software
21
* without specific prior written permission.
22
*
23
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33
* SUCH DAMAGE.
34
*/
35
36
/*-
37
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
38
* All rights reserved.
39
*
40
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
41
*
42
* Permission to use, copy, modify and distribute this software and
43
* its documentation is hereby granted, provided that both the copyright
44
* notice and this permission notice appear in all copies of the
45
* software, derivative works or modified versions, and any portions
46
* thereof, and that both notices appear in supporting documentation.
47
*
48
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51
*
52
* Carnegie Mellon requests users of this software to return to
53
*
54
* Software Distribution Coordinator or [email protected]
55
* School of Computer Science
56
* Carnegie Mellon University
57
* Pittsburgh PA 15213-3890
58
*
59
* any improvements or extensions that they make and grant Carnegie the
60
* rights to redistribute these changes.
61
*/
62
63
/*
64
* Resident memory management module.
65
*/
66
67
#include <sys/cdefs.h>
68
#include "opt_vm.h"
69
70
#include <sys/param.h>
71
#include <sys/systm.h>
72
#include <sys/counter.h>
73
#include <sys/domainset.h>
74
#include <sys/kernel.h>
75
#include <sys/limits.h>
76
#include <sys/linker.h>
77
#include <sys/lock.h>
78
#include <sys/malloc.h>
79
#include <sys/mman.h>
80
#include <sys/msgbuf.h>
81
#include <sys/mutex.h>
82
#include <sys/proc.h>
83
#include <sys/rwlock.h>
84
#include <sys/sleepqueue.h>
85
#include <sys/sbuf.h>
86
#include <sys/sched.h>
87
#include <sys/smp.h>
88
#include <sys/sysctl.h>
89
#include <sys/vmmeter.h>
90
#include <sys/vnode.h>
91
92
#include <vm/vm.h>
93
#include <vm/pmap.h>
94
#include <vm/vm_param.h>
95
#include <vm/vm_domainset.h>
96
#include <vm/vm_kern.h>
97
#include <vm/vm_map.h>
98
#include <vm/vm_object.h>
99
#include <vm/vm_page.h>
100
#include <vm/vm_pageout.h>
101
#include <vm/vm_phys.h>
102
#include <vm/vm_pagequeue.h>
103
#include <vm/vm_pager.h>
104
#include <vm/vm_radix.h>
105
#include <vm/vm_reserv.h>
106
#include <vm/vm_extern.h>
107
#include <vm/vm_dumpset.h>
108
#include <vm/uma.h>
109
#include <vm/uma_int.h>
110
111
#include <machine/md_var.h>
112
113
struct vm_domain vm_dom[MAXMEMDOM];
114
115
DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);
116
117
struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
118
/* The following fields are protected by the domainset lock. */
119
domainset_t __exclusive_cache_line vm_min_domains;
120
domainset_t __exclusive_cache_line vm_severe_domains;
121
static int vm_min_waiters;
122
static int vm_severe_waiters;
123
static int vm_pageproc_waiters;
124
125
static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
126
"VM page statistics");
127
128
static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries);
129
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries,
130
CTLFLAG_RD, &pqstate_commit_retries,
131
"Number of failed per-page atomic queue state updates");
132
133
static COUNTER_U64_DEFINE_EARLY(queue_ops);
134
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops,
135
CTLFLAG_RD, &queue_ops,
136
"Number of batched queue operations");
137
138
static COUNTER_U64_DEFINE_EARLY(queue_nops);
139
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops,
140
CTLFLAG_RD, &queue_nops,
141
"Number of batched queue operations with no effects");
142
143
static unsigned long nofreeq_size;
144
SYSCTL_ULONG(_vm_stats_page, OID_AUTO, nofreeq_size, CTLFLAG_RD,
145
&nofreeq_size, 0,
146
"Size of the nofree queue");
147
148
/*
149
* bogus page -- for I/O to/from partially complete buffers,
150
* or for paging into sparsely invalid regions.
151
*/
152
vm_page_t bogus_page;
153
154
vm_page_t vm_page_array;
155
long vm_page_array_size;
156
long first_page;
157
158
struct bitset *vm_page_dump;
159
long vm_page_dump_pages;
160
161
static TAILQ_HEAD(, vm_page) blacklist_head;
162
static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
163
SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
164
CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
165
166
static uma_zone_t fakepg_zone;
167
168
static void vm_page_alloc_check(vm_page_t m);
169
static vm_page_t vm_page_alloc_nofree_domain(int domain, int req);
170
static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m,
171
vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked);
172
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
173
static void vm_page_enqueue(vm_page_t m, uint8_t queue);
174
static bool vm_page_free_prep(vm_page_t m);
175
static void vm_page_free_toq(vm_page_t m);
176
static void vm_page_init(void *dummy);
177
static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object);
178
static void vm_page_mvqueue(vm_page_t m, const uint8_t queue,
179
const uint16_t nflag);
180
static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
181
vm_page_t m_run, vm_paddr_t high);
182
static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse);
183
static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
184
int req);
185
static int vm_page_zone_import(void *arg, void **store, int cnt, int domain,
186
int flags);
187
static void vm_page_zone_release(void *arg, void **store, int cnt);
188
189
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);
190
191
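/*
 * vm_page_init:
 *
 * Late-boot initialization: create the UMA zone backing fictitious
 * ("fake") pages and allocate the permanently wired bogus page used
 * for I/O to partially valid buffers.
 */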
static void
192
vm_page_init(void *dummy)
193
{
194
195
fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
196
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
197
bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_NOFREE);
198
}
199
200
static int pgcache_zone_max_pcpu;
201
SYSCTL_INT(_vm, OID_AUTO, pgcache_zone_max_pcpu,
202
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pgcache_zone_max_pcpu, 0,
203
"Per-CPU page cache size");
204
205
/*
206
* The cache page zone is initialized later since we need to be able to allocate
207
* pages before UMA is fully initialized.
208
*/
209
static void
210
vm_page_init_cache_zones(void *dummy __unused)
211
{
212
struct vm_domain *vmd;
213
struct vm_pgcache *pgcache;
214
int cache, domain, maxcache, pool;
215
216
TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &pgcache_zone_max_pcpu);
217
maxcache = pgcache_zone_max_pcpu * mp_ncpus;
218
for (domain = 0; domain < vm_ndomains; domain++) {
219
vmd = VM_DOMAIN(domain);
220
for (pool = 0; pool < VM_NFREEPOOL; pool++) {
221
#ifdef VM_FREEPOOL_LAZYINIT
222
if (pool == VM_FREEPOOL_LAZYINIT)
223
continue;
224
#endif
225
pgcache = &vmd->vmd_pgcache[pool];
226
pgcache->domain = domain;
227
pgcache->pool = pool;
228
pgcache->zone = uma_zcache_create("vm pgcache",
229
PAGE_SIZE, NULL, NULL, NULL, NULL,
230
vm_page_zone_import, vm_page_zone_release, pgcache,
231
UMA_ZONE_VM);
232
233
/*
234
* Limit each pool's zone to 0.1% of the pages in the
235
* domain.
236
*/
237
cache = maxcache != 0 ? maxcache :
238
vmd->vmd_page_count / 1000;
239
uma_zone_set_maxcache(pgcache->zone, cache);
240
}
241
}
242
}
243
SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
244
245
/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
246
#if PAGE_SIZE == 32768
247
#ifdef CTASSERT
248
CTASSERT(sizeof(u_long) >= 8);
249
#endif
250
#endif
251
252
/*
253
* vm_set_page_size:
254
*
255
* Sets the page size, perhaps based upon the memory
256
* size. Must be called before any use of page-size
257
* dependent functions.
258
*/
259
void
260
vm_set_page_size(void)
261
{
262
if (vm_cnt.v_page_size == 0)
263
vm_cnt.v_page_size = PAGE_SIZE;
264
if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
265
panic("vm_set_page_size: page size not a power of two");
266
}
267
268
/*
269
* vm_page_blacklist_next:
270
*
271
* Find the next entry in the provided string of blacklist
272
* addresses. Entries are separated by space, comma, or newline.
273
* If an invalid integer is encountered then the rest of the
274
* string is skipped. Updates the list pointer to the next
275
* character, or NULL if the string is exhausted or invalid.
276
*/
277
static vm_paddr_t
278
vm_page_blacklist_next(char **list, char *end)
279
{
280
vm_paddr_t bad;
281
char *cp, *pos;
282
283
if (list == NULL || *list == NULL)
284
return (0);
285
if (**list == '\0') {
286
*list = NULL;
287
return (0);
288
}
289
290
/*
291
* If there's no end pointer then the buffer is coming from
292
* the kenv and we know it's null-terminated.
293
*/
294
if (end == NULL)
295
end = *list + strlen(*list);
296
297
/* Ensure that strtoq() won't walk off the end */
298
if (*end != '\0') {
299
if (*end == '\n' || *end == ' ' || *end == ',')
300
*end = '\0';
301
else {
302
printf("Blacklist not terminated, skipping\n");
303
*list = NULL;
304
return (0);
305
}
306
}
307
308
for (pos = *list; *pos != '\0'; pos = cp) {
309
bad = strtoq(pos, &cp, 0);
310
if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
311
if (bad == 0) {
312
if (++cp < end)
313
continue;
314
else
315
break;
316
}
317
} else
318
break;
319
if (*cp == '\0' || ++cp >= end)
320
*list = NULL;
321
else
322
*list = cp;
323
return (trunc_page(bad));
324
}
325
printf("Garbage in RAM blacklist, skipping\n");
326
*list = NULL;
327
return (0);
328
}
329
330
bool
331
vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
332
{
333
struct vm_domain *vmd;
334
vm_page_t m;
335
bool found;
336
337
m = vm_phys_paddr_to_vm_page(pa);
338
if (m == NULL)
339
return (true); /* page does not exist, no failure */
340
341
vmd = VM_DOMAIN(vm_phys_domain(pa));
342
vm_domain_free_lock(vmd);
343
found = vm_phys_unfree_page(pa);
344
vm_domain_free_unlock(vmd);
345
if (found) {
346
vm_domain_freecnt_inc(vmd, -1);
347
TAILQ_INSERT_TAIL(&blacklist_head, m, plinks.q);
348
if (verbose)
349
printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
350
}
351
return (found);
352
}
353
354
/*
355
* vm_page_blacklist_check:
356
*
357
* Iterate through the provided string of blacklist addresses, pulling
358
* each entry out of the physical allocator free list and putting it
359
* onto a list for reporting via the vm.page_blacklist sysctl.
360
*/
361
static void
362
vm_page_blacklist_check(char *list, char *end)
363
{
364
vm_paddr_t pa;
365
char *next;
366
367
next = list;
368
while (next != NULL) {
369
if ((pa = vm_page_blacklist_next(&next, end)) == 0)
370
continue;
371
vm_page_blacklist_add(pa, bootverbose);
372
}
373
}
374
375
/*
376
* vm_page_blacklist_load:
377
*
378
* Search for a special module named "ram_blacklist". It'll be a
379
* plain text file provided by the user via the loader directive
380
* of the same name.
381
*/
382
static void
383
vm_page_blacklist_load(char **list, char **end)
384
{
385
void *mod;
386
u_char *ptr;
387
u_int len;
388
389
mod = NULL;
390
ptr = NULL;
391
392
mod = preload_search_by_type("ram_blacklist");
393
if (mod != NULL) {
394
ptr = preload_fetch_addr(mod);
395
len = preload_fetch_size(mod);
396
}
397
398
if (ptr != NULL && len > 0) {
399
*list = ptr;
400
*end = ptr + len - 1;
401
} else {
402
*list = NULL;
403
*end = NULL;
404
}
405
406
return;
407
}
408
409
static int
410
sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
411
{
412
vm_page_t m;
413
struct sbuf sbuf;
414
int error, first;
415
416
first = 1;
417
error = sysctl_wire_old_buffer(req, 0);
418
if (error != 0)
419
return (error);
420
sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
421
TAILQ_FOREACH(m, &blacklist_head, plinks.q) {
422
sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
423
(uintmax_t)m->phys_addr);
424
first = 0;
425
}
426
error = sbuf_finish(&sbuf);
427
sbuf_delete(&sbuf);
428
return (error);
429
}
430
431
/*
432
* Initialize a dummy page for use in scans of the specified paging queue.
433
* In principle, this function only needs to set the flag PG_MARKER.
434
* Nonetheless, it write busies the page as a safety precaution.
435
*/
436
void
437
vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags)
438
{
439
440
bzero(marker, sizeof(*marker));
441
marker->flags = PG_MARKER;
442
marker->a.flags = aflags;
443
marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
444
marker->a.queue = queue;
445
}
446
447
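/*
 * vm_page_domain_init:
 *
 * Initialize the per-domain state: page queue names and locks, the
 * queue markers, the inactive-queue FIFO marker, and the clock hands
 * used for active queue scanning.
 */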
static void
448
vm_page_domain_init(int domain)
449
{
450
struct vm_domain *vmd;
451
struct vm_pagequeue *pq;
452
int i;
453
454
vmd = VM_DOMAIN(domain);
455
bzero(vmd, sizeof(*vmd));
456
*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
457
"vm inactive pagequeue";
458
*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
459
"vm active pagequeue";
460
*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
461
"vm laundry pagequeue";
462
*__DECONST(const char **,
463
&vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) =
464
"vm unswappable pagequeue";
465
vmd->vmd_domain = domain;
466
vmd->vmd_page_count = 0;
467
vmd->vmd_free_count = 0;
468
vmd->vmd_segs = 0;
469
vmd->vmd_oom = false;
470
vmd->vmd_helper_threads_enabled = true;
471
for (i = 0; i < PQ_COUNT; i++) {
472
pq = &vmd->vmd_pagequeues[i];
473
TAILQ_INIT(&pq->pq_pl);
474
mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
475
MTX_DEF | MTX_DUPOK);
476
pq->pq_pdpages = 0;
477
vm_page_init_marker(&vmd->vmd_markers[i], i, 0);
478
}
479
mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
480
mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
481
snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
482
483
/*
484
* inacthead is used to provide FIFO ordering for LRU-bypassing
485
* insertions.
486
*/
487
vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
488
TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
489
&vmd->vmd_inacthead, plinks.q);
490
491
/*
492
* The clock pages are used to implement active queue scanning without
493
* requeues. Scans start at clock[0], which is advanced after the scan
494
* ends. When the two clock hands meet, they are reset and scanning
495
* resumes from the head of the queue.
496
*/
497
vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
498
vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
499
TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
500
&vmd->vmd_clock[0], plinks.q);
501
TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
502
&vmd->vmd_clock[1], plinks.q);
503
}
504
505
/*
506
* Initialize a physical page in preparation for adding it to the free
507
* lists.
508
*/
509
void
510
vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind, int pool)
511
{
512
m->object = NULL;
513
m->ref_count = 0;
514
m->busy_lock = VPB_FREED;
515
m->flags = m->a.flags = 0;
516
m->phys_addr = pa;
517
m->a.queue = PQ_NONE;
518
m->psind = 0;
519
m->segind = segind;
520
m->order = VM_NFREEORDER;
521
m->pool = pool;
522
m->valid = m->dirty = 0;
523
pmap_page_init(m);
524
}
525
526
#ifndef PMAP_HAS_PAGE_ARRAY
527
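/*
 * Allocate the vm_page array from the end of the physical memory chunk
 * ending at "end", map it into KVA at *vaddr, and return the new end
 * of the chunk.
 */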
static vm_paddr_t
528
vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range)
529
{
530
vm_paddr_t new_end;
531
532
/*
533
* Reserve an unmapped guard page to trap access to vm_page_array[-1].
534
* However, because this page is allocated from KVM, out-of-bounds
535
* accesses using the direct map will not be trapped.
536
*/
537
*vaddr += PAGE_SIZE;
538
539
/*
540
* Allocate physical memory for the page structures, and map it.
541
*/
542
new_end = trunc_page(end - page_range * sizeof(struct vm_page));
543
vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end,
544
VM_PROT_READ | VM_PROT_WRITE);
545
vm_page_array_size = page_range;
546
547
return (new_end);
548
}
549
#endif
550
551
/*
552
* vm_page_startup:
553
*
554
* Initializes the resident memory module. Allocates physical memory for
555
* bootstrapping UMA and some data structures that are used to manage
556
* physical pages. Initializes these structures, and populates the free
557
* page queues.
558
*/
559
vm_offset_t
560
vm_page_startup(vm_offset_t vaddr)
561
{
562
struct vm_phys_seg *seg;
563
struct vm_domain *vmd;
564
vm_page_t m;
565
char *list, *listend;
566
vm_paddr_t end, high_avail, low_avail, new_end, size;
567
vm_paddr_t page_range __unused;
568
vm_paddr_t last_pa, pa, startp, endp;
569
u_long pagecount;
570
#if MINIDUMP_PAGE_TRACKING
571
u_long vm_page_dump_size;
572
#endif
573
int biggestone, i, segind;
574
#ifdef WITNESS
575
vm_offset_t mapped;
576
int witness_size;
577
#endif
578
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
579
long ii;
580
#endif
581
int pool;
582
#ifdef VM_FREEPOOL_LAZYINIT
583
int lazyinit;
584
#endif
585
586
vaddr = round_page(vaddr);
587
588
vm_phys_early_startup();
589
biggestone = vm_phys_avail_largest();
590
end = phys_avail[biggestone+1];
591
592
/*
593
* Initialize the page and queue locks.
594
*/
595
mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF);
596
for (i = 0; i < vm_ndomains; i++)
597
vm_page_domain_init(i);
598
599
new_end = end;
600
#ifdef WITNESS
601
witness_size = round_page(witness_startup_count());
602
new_end -= witness_size;
603
mapped = pmap_map(&vaddr, new_end, new_end + witness_size,
604
VM_PROT_READ | VM_PROT_WRITE);
605
bzero((void *)mapped, witness_size);
606
witness_startup((void *)mapped);
607
#endif
608
609
#if MINIDUMP_PAGE_TRACKING
610
/*
611
* Allocate a bitmap to indicate that an arbitrary physical page
612
* needs to be included in a minidump.
613
*
614
* The amd64 port needs this to indicate which direct map pages
615
* need to be dumped, via calls to dump_add_page()/dump_drop_page().
616
*
617
* However, i386 still needs this workspace internally within the
618
* minidump code. In theory, they are not needed on i386, but are
619
* included should the sf_buf code decide to use them.
620
*/
621
last_pa = 0;
622
vm_page_dump_pages = 0;
623
for (i = 0; dump_avail[i + 1] != 0; i += 2) {
624
vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) -
625
dump_avail[i] / PAGE_SIZE;
626
if (dump_avail[i + 1] > last_pa)
627
last_pa = dump_avail[i + 1];
628
}
629
vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages));
630
new_end -= vm_page_dump_size;
631
vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
632
new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
633
bzero((void *)vm_page_dump, vm_page_dump_size);
634
#if MINIDUMP_STARTUP_PAGE_TRACKING
635
/*
636
* Include the UMA bootstrap pages, witness pages and vm_page_dump
637
* in a crash dump. When pmap_map() uses the direct map, they are
638
* not automatically included.
639
*/
640
for (pa = new_end; pa < end; pa += PAGE_SIZE)
641
dump_add_page(pa);
642
#endif
643
#else
644
(void)last_pa;
645
#endif
646
phys_avail[biggestone + 1] = new_end;
647
#ifdef __amd64__
648
/*
649
* Request that the physical pages underlying the message buffer be
650
* included in a crash dump. Since the message buffer is accessed
651
* through the direct map, they are not automatically included.
652
*/
653
pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
654
last_pa = pa + round_page(msgbufsize);
655
while (pa < last_pa) {
656
dump_add_page(pa);
657
pa += PAGE_SIZE;
658
}
659
#else
660
(void)pa;
661
#endif
662
663
/*
664
* Determine the lowest and highest physical addresses and, in the case
665
* of VM_PHYSSEG_SPARSE, the exact size of the available physical
666
* memory. vm_phys_early_startup() already checked that phys_avail[]
667
* has at least one element.
668
*/
669
#ifdef VM_PHYSSEG_SPARSE
670
size = phys_avail[1] - phys_avail[0];
671
#endif
672
low_avail = phys_avail[0];
673
high_avail = phys_avail[1];
674
for (i = 2; phys_avail[i + 1] != 0; i += 2) {
675
#ifdef VM_PHYSSEG_SPARSE
676
size += phys_avail[i + 1] - phys_avail[i];
677
#endif
678
if (phys_avail[i] < low_avail)
679
low_avail = phys_avail[i];
680
if (phys_avail[i + 1] > high_avail)
681
high_avail = phys_avail[i + 1];
682
}
683
for (i = 0; i < vm_phys_nsegs; i++) {
684
#ifdef VM_PHYSSEG_SPARSE
685
size += vm_phys_segs[i].end - vm_phys_segs[i].start;
686
#endif
687
if (vm_phys_segs[i].start < low_avail)
688
low_avail = vm_phys_segs[i].start;
689
if (vm_phys_segs[i].end > high_avail)
690
high_avail = vm_phys_segs[i].end;
691
}
692
first_page = low_avail / PAGE_SIZE;
693
#ifdef VM_PHYSSEG_DENSE
694
size = high_avail - low_avail;
695
#endif
696
697
#ifdef PMAP_HAS_PAGE_ARRAY
698
pmap_page_array_startup(size / PAGE_SIZE);
699
biggestone = vm_phys_avail_largest();
700
end = new_end = phys_avail[biggestone + 1];
701
#else
702
#ifdef VM_PHYSSEG_DENSE
703
/*
704
* In the VM_PHYSSEG_DENSE case, the number of pages can account for
705
* the overhead of a page structure per page only if vm_page_array is
706
* allocated from the last physical memory chunk. Otherwise, we must
707
* allocate page structures representing the physical memory
708
* underlying vm_page_array, even though they will not be used.
709
*/
710
if (new_end != high_avail)
711
page_range = size / PAGE_SIZE;
712
else
713
#endif
714
{
715
page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
716
717
/*
718
* If the partial bytes remaining are large enough for
719
* a page (PAGE_SIZE) without a corresponding
720
* 'struct vm_page', then new_end will contain an
721
* extra page after subtracting the length of the VM
722
* page array. Compensate by subtracting an extra
723
* page from new_end.
724
*/
725
if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
726
if (new_end == high_avail)
727
high_avail -= PAGE_SIZE;
728
new_end -= PAGE_SIZE;
729
}
730
}
731
end = new_end;
732
new_end = vm_page_array_alloc(&vaddr, end, page_range);
733
#endif
734
735
#if VM_NRESERVLEVEL > 0
736
/*
737
* Allocate physical memory for the reservation management system's
738
* data structures, and map it.
739
*/
740
new_end = vm_reserv_startup(&vaddr, new_end);
741
#endif
742
#if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING
743
/*
744
* Include vm_page_array and vm_reserv_array in a crash dump.
745
*/
746
for (pa = new_end; pa < end; pa += PAGE_SIZE)
747
dump_add_page(pa);
748
#endif
749
phys_avail[biggestone + 1] = new_end;
750
751
/*
752
* Add physical memory segments corresponding to the available
753
* physical pages.
754
*/
755
for (i = 0; phys_avail[i + 1] != 0; i += 2)
756
vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
757
758
/*
759
* Initialize the physical memory allocator.
760
*/
761
vm_phys_init();
762
763
pool = VM_FREEPOOL_DEFAULT;
764
#ifdef VM_FREEPOOL_LAZYINIT
765
lazyinit = 1;
766
TUNABLE_INT_FETCH("debug.vm.lazy_page_init", &lazyinit);
767
if (lazyinit)
768
pool = VM_FREEPOOL_LAZYINIT;
769
#endif
770
771
/*
772
* Initialize the page structures and add every available page to the
773
* physical memory allocator's free lists.
774
*/
775
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
776
for (ii = 0; ii < vm_page_array_size; ii++) {
777
m = &vm_page_array[ii];
778
vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0,
779
VM_FREEPOOL_DEFAULT);
780
m->flags = PG_FICTITIOUS;
781
}
782
#endif
783
vm_cnt.v_page_count = 0;
784
for (segind = 0; segind < vm_phys_nsegs; segind++) {
785
seg = &vm_phys_segs[segind];
786
787
/*
788
* Initialize pages not covered by phys_avail[], since they
789
* might be freed to the allocator at some future point, e.g.,
790
* by kmem_bootstrap_free().
791
*/
792
startp = seg->start;
793
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
794
if (startp >= seg->end)
795
break;
796
if (phys_avail[i + 1] < startp)
797
continue;
798
if (phys_avail[i] <= startp) {
799
startp = phys_avail[i + 1];
800
continue;
801
}
802
m = vm_phys_seg_paddr_to_vm_page(seg, startp);
803
for (endp = MIN(phys_avail[i], seg->end);
804
startp < endp; startp += PAGE_SIZE, m++) {
805
vm_page_init_page(m, startp, segind,
806
VM_FREEPOOL_DEFAULT);
807
}
808
}
809
810
/*
811
* Add the segment's pages that are covered by one of
812
* phys_avail's ranges to the free lists.
813
*/
814
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
815
if (seg->end <= phys_avail[i] ||
816
seg->start >= phys_avail[i + 1])
817
continue;
818
819
startp = MAX(seg->start, phys_avail[i]);
820
endp = MIN(seg->end, phys_avail[i + 1]);
821
pagecount = (u_long)atop(endp - startp);
822
if (pagecount == 0)
823
continue;
824
825
/*
826
* If lazy vm_page initialization is not enabled, simply
827
* initialize all of the pages in the segment covered by
828
* phys_avail. Otherwise, initialize only the first
829
* page of each run of free pages handed to the vm_phys
830
* allocator, which in turn defers initialization of
831
* pages until they are needed.
832
*
833
* This avoids blocking the boot process for long
834
* periods, which may be relevant for VMs (which ought
835
* to boot as quickly as possible) and/or systems with
836
* large amounts of physical memory.
837
*/
838
m = vm_phys_seg_paddr_to_vm_page(seg, startp);
839
vm_page_init_page(m, startp, segind, pool);
840
if (pool == VM_FREEPOOL_DEFAULT) {
841
for (u_long j = 1; j < pagecount; j++) {
842
vm_page_init_page(&m[j],
843
startp + ptoa((vm_paddr_t)j),
844
segind, pool);
845
}
846
}
847
vmd = VM_DOMAIN(seg->domain);
848
vm_domain_free_lock(vmd);
849
vm_phys_enqueue_contig(m, pool, pagecount);
850
vm_domain_free_unlock(vmd);
851
vm_domain_freecnt_inc(vmd, pagecount);
852
vm_cnt.v_page_count += (u_int)pagecount;
853
vmd->vmd_page_count += (u_int)pagecount;
854
vmd->vmd_segs |= 1UL << segind;
855
}
856
}
857
858
/*
859
* Remove blacklisted pages from the physical memory allocator.
860
*/
861
TAILQ_INIT(&blacklist_head);
862
vm_page_blacklist_load(&list, &listend);
863
vm_page_blacklist_check(list, listend);
864
865
list = kern_getenv("vm.blacklist");
866
vm_page_blacklist_check(list, NULL);
867
868
freeenv(list);
869
#if VM_NRESERVLEVEL > 0
870
/*
871
* Initialize the reservation management system.
872
*/
873
vm_reserv_init();
874
#endif
875
876
return (vaddr);
877
}
878
879
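/*
 * vm_page_reference:
 *
 * Record that the page has been referenced: set PGA_REFERENCED so that
 * the pageout daemon treats the page as recently used.
 */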
void
880
vm_page_reference(vm_page_t m)
881
{
882
883
vm_page_aflag_set(m, PGA_REFERENCED);
884
}
885
886
/*
887
* vm_page_trybusy
888
*
889
* Helper routine for grab functions to trylock busy.
890
*
891
* Returns true on success and false on failure.
892
*/
893
static bool
894
vm_page_trybusy(vm_page_t m, int allocflags)
895
{
896
897
if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0)
898
return (vm_page_trysbusy(m));
899
else
900
return (vm_page_tryxbusy(m));
901
}
902
903
/*
904
* vm_page_tryacquire
905
*
906
* Helper routine for grab functions to trylock busy and wire.
907
*
908
* Returns true on success and false on failure.
909
*/
910
static inline bool
911
vm_page_tryacquire(vm_page_t m, int allocflags)
912
{
913
bool locked;
914
915
locked = vm_page_trybusy(m, allocflags);
916
if (locked && (allocflags & VM_ALLOC_WIRED) != 0)
917
vm_page_wire(m);
918
return (locked);
919
}
920
921
/*
922
* vm_page_busy_acquire:
923
*
924
* Acquire the busy lock as described by VM_ALLOC_* flags. Will loop
925
* and drop the object lock if necessary.
926
*/
927
bool
928
vm_page_busy_acquire(vm_page_t m, int allocflags)
929
{
930
vm_object_t obj;
931
bool locked;
932
933
/*
934
* The page's object must be cached because the page's
935
* identity can change during the sleep, which could otherwise
936
* cause a different object to be re-locked.
937
* It is assumed that a reference to the object is already
938
* held by the callers.
939
*/
940
obj = atomic_load_ptr(&m->object);
941
for (;;) {
942
if (vm_page_tryacquire(m, allocflags))
943
return (true);
944
if ((allocflags & VM_ALLOC_NOWAIT) != 0)
945
return (false);
946
if (obj != NULL)
947
locked = VM_OBJECT_WOWNED(obj);
948
else
949
locked = false;
950
MPASS(locked || vm_page_wired(m));
951
if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags,
952
locked) && locked)
953
VM_OBJECT_WLOCK(obj);
954
if ((allocflags & VM_ALLOC_WAITFAIL) != 0)
955
return (false);
956
KASSERT(m->object == obj || m->object == NULL,
957
("vm_page_busy_acquire: page %p does not belong to %p",
958
m, obj));
959
}
960
}
961
962
/*
963
* vm_page_busy_downgrade:
964
*
965
* Downgrade an exclusive busy page into a single shared busy page.
966
*/
967
void
968
vm_page_busy_downgrade(vm_page_t m)
969
{
970
u_int x;
971
972
vm_page_assert_xbusied(m);
973
974
x = vm_page_busy_fetch(m);
975
for (;;) {
976
if (atomic_fcmpset_rel_int(&m->busy_lock,
977
&x, VPB_SHARERS_WORD(1)))
978
break;
979
}
980
if ((x & VPB_BIT_WAITERS) != 0)
981
wakeup(m);
982
}
983
984
/*
985
*
986
* vm_page_busy_tryupgrade:
987
*
988
* Attempt to upgrade a single shared busy into an exclusive busy.
989
*/
990
int
991
vm_page_busy_tryupgrade(vm_page_t m)
992
{
993
u_int ce, x;
994
995
vm_page_assert_sbusied(m);
996
997
x = vm_page_busy_fetch(m);
998
ce = VPB_CURTHREAD_EXCLUSIVE;
999
for (;;) {
1000
if (VPB_SHARERS(x) > 1)
1001
return (0);
1002
KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
1003
("vm_page_busy_tryupgrade: invalid lock state"));
1004
if (!atomic_fcmpset_acq_int(&m->busy_lock, &x,
1005
ce | (x & VPB_BIT_WAITERS)))
1006
continue;
1007
return (1);
1008
}
1009
}
1010
1011
/*
1012
* vm_page_sbusied:
1013
*
1014
* Return a positive value if the page is shared busied, 0 otherwise.
1015
*/
1016
int
1017
vm_page_sbusied(vm_page_t m)
1018
{
1019
u_int x;
1020
1021
x = vm_page_busy_fetch(m);
1022
return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
1023
}
1024
1025
/*
1026
* vm_page_sunbusy:
1027
*
1028
* Shared unbusy a page.
1029
*/
1030
void
1031
vm_page_sunbusy(vm_page_t m)
1032
{
1033
u_int x;
1034
1035
vm_page_assert_sbusied(m);
1036
1037
x = vm_page_busy_fetch(m);
1038
for (;;) {
1039
KASSERT(x != VPB_FREED,
1040
("vm_page_sunbusy: Unlocking freed page."));
1041
if (VPB_SHARERS(x) > 1) {
1042
if (atomic_fcmpset_int(&m->busy_lock, &x,
1043
x - VPB_ONE_SHARER))
1044
break;
1045
continue;
1046
}
1047
KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
1048
("vm_page_sunbusy: invalid lock state"));
1049
if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
1050
continue;
1051
if ((x & VPB_BIT_WAITERS) == 0)
1052
break;
1053
wakeup(m);
1054
break;
1055
}
1056
}
1057
1058
/*
1059
* vm_page_busy_sleep:
1060
*
1061
* Sleep if the page is busy, using the page pointer as wchan.
1062
* This is used to implement the hard-path of the busying mechanism.
1063
*
1064
* If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function
1065
* will not sleep if the page is shared-busy.
1066
*
1067
* The object lock must be held on entry.
1068
*
1069
* Returns true if it slept and dropped the object lock, or false
1070
* if there was no sleep and the lock is still held.
1071
*/
1072
bool
1073
vm_page_busy_sleep(vm_page_t m, const char *wmesg, int allocflags)
1074
{
1075
vm_object_t obj;
1076
1077
obj = m->object;
1078
VM_OBJECT_ASSERT_LOCKED(obj);
1079
1080
return (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, allocflags,
1081
true));
1082
}
1083
1084
/*
1085
* vm_page_busy_sleep_unlocked:
1086
*
1087
* Sleep if the page is busy, using the page pointer as wchan.
1088
* This is used to implement the hard-path of the busying mechanism.
1089
*
1090
* If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function
1091
* will not sleep if the page is shared-busy.
1092
*
1093
* The object lock must not be held on entry. The operation will
1094
* return if the page changes identity.
1095
*/
1096
void
1097
vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
1098
const char *wmesg, int allocflags)
1099
{
1100
VM_OBJECT_ASSERT_UNLOCKED(obj);
1101
1102
(void)_vm_page_busy_sleep(obj, m, pindex, wmesg, allocflags, false);
1103
}
1104
1105
/*
1106
* _vm_page_busy_sleep:
1107
*
1108
* Internal busy sleep function. Verifies the page identity and
1109
* lockstate against parameters. Returns true if it sleeps and
1110
* false otherwise.
1111
*
1112
* allocflags uses VM_ALLOC_* flags to specify the lock required.
1113
*
1114
* If locked is true the lock will be dropped for any true returns
1115
* and held for any false returns.
1116
*/
1117
static bool
1118
_vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
1119
const char *wmesg, int allocflags, bool locked)
1120
{
1121
bool xsleep;
1122
u_int x;
1123
1124
/*
1125
* If the object is busy we must wait for that to drain to zero
1126
* before trying the page again.
1127
*/
1128
if (obj != NULL && vm_object_busied(obj)) {
1129
if (locked)
1130
VM_OBJECT_DROP(obj);
1131
vm_object_busy_wait(obj, wmesg);
1132
return (true);
1133
}
1134
1135
if (!vm_page_busied(m))
1136
return (false);
1137
1138
xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0;
1139
sleepq_lock(m);
1140
x = vm_page_busy_fetch(m);
1141
do {
1142
/*
1143
* If the page changes objects or becomes unlocked we can
1144
* simply return.
1145
*/
1146
if (x == VPB_UNBUSIED ||
1147
(xsleep && (x & VPB_BIT_SHARED) != 0) ||
1148
m->object != obj || m->pindex != pindex) {
1149
sleepq_release(m);
1150
return (false);
1151
}
1152
if ((x & VPB_BIT_WAITERS) != 0)
1153
break;
1154
} while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS));
1155
if (locked)
1156
VM_OBJECT_DROP(obj);
1157
DROP_GIANT();
1158
sleepq_add(m, NULL, wmesg, 0, 0);
1159
sleepq_wait(m, PVM);
1160
PICKUP_GIANT();
1161
return (true);
1162
}
1163
1164
/*
1165
* vm_page_trysbusy:
1166
*
1167
* Try to shared busy a page.
1168
* If the operation succeeds 1 is returned otherwise 0.
1169
* The operation never sleeps.
1170
*/
1171
int
1172
vm_page_trysbusy(vm_page_t m)
1173
{
1174
vm_object_t obj;
1175
u_int x;
1176
1177
obj = m->object;
1178
x = vm_page_busy_fetch(m);
1179
for (;;) {
1180
if ((x & VPB_BIT_SHARED) == 0)
1181
return (0);
1182
/*
1183
* Reduce the window for transient busies that will trigger
1184
* false negatives in vm_page_ps_test().
1185
*/
1186
if (obj != NULL && vm_object_busied(obj))
1187
return (0);
1188
if (atomic_fcmpset_acq_int(&m->busy_lock, &x,
1189
x + VPB_ONE_SHARER))
1190
break;
1191
}
1192
1193
/* Refetch the object now that we're guaranteed that it is stable. */
1194
obj = m->object;
1195
if (obj != NULL && vm_object_busied(obj)) {
1196
vm_page_sunbusy(m);
1197
return (0);
1198
}
1199
return (1);
1200
}
1201
1202
/*
1203
* vm_page_tryxbusy:
1204
*
1205
* Try to exclusive busy a page.
1206
* If the operation succeeds 1 is returned otherwise 0.
1207
* The operation never sleeps.
1208
*/
1209
int
1210
vm_page_tryxbusy(vm_page_t m)
1211
{
1212
vm_object_t obj;
1213
1214
if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,
1215
VPB_CURTHREAD_EXCLUSIVE) == 0)
1216
return (0);
1217
1218
obj = m->object;
1219
if (obj != NULL && vm_object_busied(obj)) {
1220
vm_page_xunbusy(m);
1221
return (0);
1222
}
1223
return (1);
1224
}
1225
1226
static void
1227
vm_page_xunbusy_hard_tail(vm_page_t m)
1228
{
1229
atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
1230
/* Wake the waiter. */
1231
wakeup(m);
1232
}
1233
1234
/*
1235
* vm_page_xunbusy_hard:
1236
*
1237
* Called when unbusy has failed because there is a waiter.
1238
*/
1239
void
1240
vm_page_xunbusy_hard(vm_page_t m)
1241
{
1242
vm_page_assert_xbusied(m);
1243
vm_page_xunbusy_hard_tail(m);
1244
}
1245
1246
void
1247
vm_page_xunbusy_hard_unchecked(vm_page_t m)
1248
{
1249
vm_page_assert_xbusied_unchecked(m);
1250
vm_page_xunbusy_hard_tail(m);
1251
}
1252
1253
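/*
 * Release the busy lock of a page that is being freed, marking the lock
 * word VPB_FREED and waking any threads sleeping on the busy lock.
 */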
static void
1254
vm_page_busy_free(vm_page_t m)
1255
{
1256
u_int x;
1257
1258
atomic_thread_fence_rel();
1259
x = atomic_swap_int(&m->busy_lock, VPB_FREED);
1260
if ((x & VPB_BIT_WAITERS) != 0)
1261
wakeup(m);
1262
}
1263
1264
/*
1265
* vm_page_unhold_pages:
1266
*
1267
* Unhold each of the pages that is referenced by the given array.
1268
*/
1269
void
1270
vm_page_unhold_pages(vm_page_t *ma, int count)
1271
{
1272
1273
for (; count != 0; count--) {
1274
vm_page_unwire(*ma, PQ_ACTIVE);
1275
ma++;
1276
}
1277
}
1278
1279
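/*
 * PHYS_TO_VM_PAGE:
 *
 * Return the vm_page corresponding to the given physical address,
 * falling back to a fictitious page lookup when the address is not
 * covered by the page array or physical segments.
 */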
vm_page_t
1280
PHYS_TO_VM_PAGE(vm_paddr_t pa)
1281
{
1282
vm_page_t m;
1283
1284
#ifdef VM_PHYSSEG_SPARSE
1285
m = vm_phys_paddr_to_vm_page(pa);
1286
if (m == NULL)
1287
m = vm_phys_fictitious_to_vm_page(pa);
1288
return (m);
1289
#elif defined(VM_PHYSSEG_DENSE)
1290
long pi;
1291
1292
pi = atop(pa);
1293
if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1294
m = &vm_page_array[pi - first_page];
1295
return (m);
1296
}
1297
return (vm_phys_fictitious_to_vm_page(pa));
1298
#else
1299
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
1300
#endif
1301
}
1302
1303
/*
1304
* vm_page_getfake:
1305
*
1306
* Create a fictitious page with the specified physical address and
1307
* memory attribute. The memory attribute is the only machine-
1308
* dependent aspect of a fictitious page that must be initialized.
1309
*/
1310
vm_page_t
1311
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
1312
{
1313
vm_page_t m;
1314
1315
m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
1316
vm_page_initfake(m, paddr, memattr);
1317
return (m);
1318
}
1319
1320
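/*
 * vm_page_initfake:
 *
 * Initialize a caller-supplied page as a fictitious, unmanaged,
 * exclusively busied page with the given physical address and memory
 * attribute. Reinitializing an already fictitious page only updates
 * its memory attribute.
 */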
void
1321
vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1322
{
1323
1324
if ((m->flags & PG_FICTITIOUS) != 0) {
1325
/*
1326
* The page's memattr might have changed since the
1327
* previous initialization. Update the pmap to the
1328
* new memattr.
1329
*/
1330
goto memattr;
1331
}
1332
m->phys_addr = paddr;
1333
m->a.queue = PQ_NONE;
1334
/* Fictitious pages don't use "segind". */
1335
m->flags = PG_FICTITIOUS;
1336
/* Fictitious pages don't use "order" or "pool". */
1337
m->oflags = VPO_UNMANAGED;
1338
m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
1339
/* Fictitious pages are unevictable. */
1340
m->ref_count = 1;
1341
pmap_page_init(m);
1342
memattr:
1343
pmap_page_set_memattr(m, memattr);
1344
}
1345
1346
/*
1347
* vm_page_putfake:
1348
*
1349
* Release a fictitious page.
1350
*/
1351
void
1352
vm_page_putfake(vm_page_t m)
1353
{
1354
1355
KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
1356
KASSERT((m->flags & PG_FICTITIOUS) != 0,
1357
("vm_page_putfake: bad page %p", m));
1358
vm_page_assert_xbusied(m);
1359
vm_page_busy_free(m);
1360
uma_zfree(fakepg_zone, m);
1361
}
1362
1363
/*
1364
* vm_page_updatefake:
1365
*
1366
* Update the given fictitious page to the specified physical address and
1367
* memory attribute.
1368
*/
1369
void
1370
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1371
{
1372
1373
KASSERT((m->flags & PG_FICTITIOUS) != 0,
1374
("vm_page_updatefake: bad page %p", m));
1375
m->phys_addr = paddr;
1376
pmap_page_set_memattr(m, memattr);
1377
}
1378
1379
/*
1380
* vm_page_free:
1381
*
1382
* Free a page.
1383
*/
1384
void
1385
vm_page_free(vm_page_t m)
1386
{
1387
1388
m->flags &= ~PG_ZERO;
1389
vm_page_free_toq(m);
1390
}
1391
1392
/*
1393
* vm_page_free_zero:
1394
*
1395
* Free a page to the zeroed-pages queue.
1396
*/
1397
void
1398
vm_page_free_zero(vm_page_t m)
1399
{
1400
1401
m->flags |= PG_ZERO;
1402
vm_page_free_toq(m);
1403
}
1404
1405
/*
1406
* Unbusy and handle the page queueing for a page from a getpages request that
1407
* was optionally read ahead or behind.
1408
*/
1409
void
1410
vm_page_readahead_finish(vm_page_t m)
1411
{
1412
1413
/* We shouldn't put invalid pages on queues. */
1414
KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m));
1415
1416
/*
1417
* Since the page is not the one actually needed, whether it should
1418
* be activated or deactivated is not obvious. Empirical results
1419
* have shown that deactivating the page is usually the best choice,
1420
* unless the page is wanted by another thread.
1421
*/
1422
if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0)
1423
vm_page_activate(m);
1424
else
1425
vm_page_deactivate(m);
1426
vm_page_xunbusy_unchecked(m);
1427
}
1428
1429
/*
1430
* Destroy the identity of an invalid page and free it if possible.
1431
* This is intended to be used when reading a page from backing store fails.
1432
*/
1433
void
1434
vm_page_free_invalid(vm_page_t m)
1435
{
1436
1437
KASSERT(vm_page_none_valid(m), ("page %p is valid", m));
1438
KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m));
1439
KASSERT(m->object != NULL, ("page %p has no object", m));
1440
VM_OBJECT_ASSERT_WLOCKED(m->object);
1441
1442
/*
1443
* We may be attempting to free the page as part of the handling for an
1444
* I/O error, in which case the page was xbusied by a different thread.
1445
*/
1446
vm_page_xbusy_claim(m);
1447
1448
/*
1449
* If someone has wired this page while the object lock
1450
* was not held, then the thread that unwires is responsible
1451
* for freeing the page. Otherwise just free the page now.
1452
* The wire count of this unmapped page cannot change while
1453
* we have the page xbusy and the page's object wlocked.
1454
*/
1455
if (vm_page_remove(m))
1456
vm_page_free(m);
1457
}
1458
1459
/*
1460
* vm_page_dirty_KBI: [ internal use only ]
1461
*
1462
* Set all bits in the page's dirty field.
1463
*
1464
* The object containing the specified page must be locked if the
1465
* call is made from the machine-independent layer.
1466
*
1467
* See vm_page_clear_dirty_mask().
1468
*
1469
* This function should only be called by vm_page_dirty().
1470
*/
1471
void
1472
vm_page_dirty_KBI(vm_page_t m)
1473
{
1474
1475
/* Refer to this operation by its public name. */
1476
KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!"));
1477
m->dirty = VM_PAGE_BITS_ALL;
1478
}
1479
1480
/*
1481
* Insert the given page into the given object at the given pindex.
1482
*
1483
* The procedure is marked __always_inline to suggest that the compiler
1484
* eliminate the iter parameter and the associated alternate branch.
1485
*/
1486
static __always_inline int
1487
vm_page_insert_lookup(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
1488
bool iter, struct pctrie_iter *pages)
1489
{
1490
int error;
1491
1492
VM_OBJECT_ASSERT_WLOCKED(object);
1493
KASSERT(m->object == NULL,
1494
("vm_page_insert: page %p already inserted", m));
1495
1496
/*
1497
* Record the object/offset pair in this page.
1498
*/
1499
m->object = object;
1500
m->pindex = pindex;
1501
m->ref_count |= VPRC_OBJREF;
1502
1503
/*
1504
* Add this page to the object's radix tree.
1505
*/
1506
if (iter)
1507
error = vm_radix_iter_insert(pages, m);
1508
else
1509
error = vm_radix_insert(&object->rtree, m);
1510
if (__predict_false(error != 0)) {
1511
m->object = NULL;
1512
m->pindex = 0;
1513
m->ref_count &= ~VPRC_OBJREF;
1514
return (1);
1515
}
1516
1517
vm_page_insert_radixdone(m, object);
1518
vm_pager_page_inserted(object, m);
1519
return (0);
1520
}
1521
1522
/*
1523
* vm_page_insert: [ internal use only ]
1524
*
1525
* Inserts the given mem entry into the object and object list.
1526
*
1527
* The object must be locked.
1528
*/
1529
int
1530
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1531
{
1532
return (vm_page_insert_lookup(m, object, pindex, false, NULL));
1533
}
1534
1535
/*
1536
* vm_page_iter_insert:
1537
*
1538
* Tries to insert the page "m" into the specified object at offset
1539
* "pindex" using the iterator "pages". Returns 0 if the insertion was
1540
* successful.
1541
*
1542
* The object must be locked.
1543
*/
1544
int
1545
vm_page_iter_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
1546
struct pctrie_iter *pages)
1547
{
1548
return (vm_page_insert_lookup(m, object, pindex, true, pages));
1549
}
1550
1551
/*
1552
* vm_page_insert_radixdone:
1553
*
1554
* Complete page "m" insertion into the specified object after the
1555
* radix trie hooking.
1556
*
1557
* The object must be locked.
1558
*/
1559
static void
1560
vm_page_insert_radixdone(vm_page_t m, vm_object_t object)
1561
{
1562
1563
VM_OBJECT_ASSERT_WLOCKED(object);
1564
KASSERT(object != NULL && m->object == object,
1565
("vm_page_insert_radixdone: page %p has inconsistent object", m));
1566
KASSERT((m->ref_count & VPRC_OBJREF) != 0,
1567
("vm_page_insert_radixdone: page %p is missing object ref", m));
1568
1569
/*
1570
* Show that the object has one more resident page.
1571
*/
1572
object->resident_page_count++;
1573
1574
/*
1575
* Hold the vnode until the last page is released.
1576
*/
1577
if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
1578
vhold(object->handle);
1579
1580
/*
1581
* Since we are inserting a new and possibly dirty page,
1582
* update the object's generation count.
1583
*/
1584
if (pmap_page_is_write_mapped(m))
1585
vm_object_set_writeable_dirty(object);
1586
}
1587
1588
/*
1589
* vm_page_remove_radixdone
1590
*
1591
* Complete page "m" removal from the specified object after the radix trie
1592
* unhooking.
1593
*
1594
* The caller is responsible for updating the page's fields to reflect this
1595
* removal.
1596
*/
1597
static void
1598
vm_page_remove_radixdone(vm_page_t m)
1599
{
1600
vm_object_t object;
1601
1602
vm_page_assert_xbusied(m);
1603
object = m->object;
1604
VM_OBJECT_ASSERT_WLOCKED(object);
1605
KASSERT((m->ref_count & VPRC_OBJREF) != 0,
1606
("page %p is missing its object ref", m));
1607
1608
/* Deferred free of swap space. */
1609
if ((m->a.flags & PGA_SWAP_FREE) != 0)
1610
vm_pager_page_unswapped(m);
1611
1612
vm_pager_page_removed(object, m);
1613
m->object = NULL;
1614
1615
/*
1616
* And show that the object has one fewer resident page.
1617
*/
1618
object->resident_page_count--;
1619
1620
/*
1621
* The vnode may now be recycled.
1622
*/
1623
if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
1624
vdrop(object->handle);
1625
}
1626
1627
/*
1628
* vm_page_free_object_prep:
1629
*
1630
* Disassociates the given page from its VM object.
1631
*
1632
* The object must be locked, and the page must be xbusy.
1633
*/
1634
static void
1635
vm_page_free_object_prep(vm_page_t m)
1636
{
1637
KASSERT(((m->oflags & VPO_UNMANAGED) != 0) ==
1638
((m->object->flags & OBJ_UNMANAGED) != 0),
1639
("%s: managed flag mismatch for page %p",
1640
__func__, m));
1641
vm_page_assert_xbusied(m);
1642
1643
/*
1644
* The object reference can be released without an atomic
1645
* operation.
1646
*/
1647
KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
1648
m->ref_count == VPRC_OBJREF,
1649
("%s: page %p has unexpected ref_count %u",
1650
__func__, m, m->ref_count));
1651
vm_page_remove_radixdone(m);
1652
m->ref_count -= VPRC_OBJREF;
1653
}
1654
1655
/*
1656
* vm_page_iter_free:
1657
*
1658
* Free the given page, and use the iterator to remove it from the radix
1659
* tree.
1660
*/
1661
void
1662
vm_page_iter_free(struct pctrie_iter *pages, vm_page_t m)
1663
{
1664
vm_radix_iter_remove(pages);
1665
vm_page_free_object_prep(m);
1666
vm_page_xunbusy(m);
1667
m->flags &= ~PG_ZERO;
1668
vm_page_free_toq(m);
1669
}
1670
1671
/*
1672
* vm_page_remove:
1673
*
1674
* Removes the specified page from its containing object, but does not
1675
* invalidate any backing storage. Returns true if the object's reference
1676
* was the last reference to the page, and false otherwise.
1677
*
1678
* The object must be locked and the page must be exclusively busied.
1679
* The exclusive busy will be released on return. If this is not the
1680
* final ref and the caller does not hold a wire reference it may not
1681
* continue to access the page.
1682
*/
1683
bool
1684
vm_page_remove(vm_page_t m)
1685
{
1686
bool dropped;
1687
1688
dropped = vm_page_remove_xbusy(m);
1689
vm_page_xunbusy(m);
1690
1691
return (dropped);
1692
}
1693
1694
/*
1695
* vm_page_iter_remove:
1696
*
1697
* Remove the current page, and use the iterator to remove it from the
1698
* radix tree.
1699
*/
1700
bool
1701
vm_page_iter_remove(struct pctrie_iter *pages, vm_page_t m)
1702
{
1703
bool dropped;
1704
1705
vm_radix_iter_remove(pages);
1706
vm_page_remove_radixdone(m);
1707
dropped = (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
1708
vm_page_xunbusy(m);
1709
1710
return (dropped);
1711
}
1712
1713
/*
1714
* vm_page_radix_remove
1715
*
1716
* Removes the specified page from the radix tree.
1717
*/
1718
static void
1719
vm_page_radix_remove(vm_page_t m)
1720
{
1721
vm_page_t mrem __diagused;
1722
1723
mrem = vm_radix_remove(&m->object->rtree, m->pindex);
1724
KASSERT(mrem == m,
1725
("removed page %p, expected page %p", mrem, m));
1726
}
1727
1728
/*
1729
* vm_page_remove_xbusy
1730
*
1731
* Removes the page but leaves the xbusy held. Returns true if this
1732
* removed the final ref and false otherwise.
1733
*/
1734
bool
1735
vm_page_remove_xbusy(vm_page_t m)
1736
{
1737
1738
vm_page_radix_remove(m);
1739
vm_page_remove_radixdone(m);
1740
return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
1741
}
1742
1743
/*
1744
* vm_page_lookup:
1745
*
1746
* Returns the page associated with the object/offset
1747
* pair specified; if none is found, NULL is returned.
1748
*
1749
* The object must be locked.
1750
*/
1751
vm_page_t
1752
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1753
{
1754
1755
VM_OBJECT_ASSERT_LOCKED(object);
1756
return (vm_radix_lookup(&object->rtree, pindex));
1757
}
1758
1759
/*
1760
* vm_page_iter_init:
1761
*
1762
* Initialize iterator for vm pages.
1763
*/
1764
void
1765
vm_page_iter_init(struct pctrie_iter *pages, vm_object_t object)
1766
{
1767
1768
vm_radix_iter_init(pages, &object->rtree);
1769
}
1770
1771
/*
1772
* vm_page_iter_limit_init:
1773
*
1774
* Initialize iterator for vm pages, bounded by the given pindex limit.
1775
*/
1776
void
1777
vm_page_iter_limit_init(struct pctrie_iter *pages, vm_object_t object,
1778
vm_pindex_t limit)
1779
{
1780
1781
vm_radix_iter_limit_init(pages, &object->rtree, limit);
1782
}
1783
1784
/*
1785
* vm_page_lookup_unlocked:
1786
*
1787
* Returns the page associated with the object/offset pair specified;
1788
* if none is found, NULL is returned. The page may no longer be
1789
* present in the object at the time that this function returns. Only
1790
* useful for opportunistic checks such as inmem().
1791
*/
1792
vm_page_t
1793
vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex)
1794
{
1795
1796
return (vm_radix_lookup_unlocked(&object->rtree, pindex));
1797
}
1798
1799
/*
1800
* vm_page_relookup:
1801
*
1802
* Returns a page that must already have been busied by
1803
* the caller. Used for bogus page replacement.
1804
*/
1805
vm_page_t
1806
vm_page_relookup(vm_object_t object, vm_pindex_t pindex)
1807
{
1808
vm_page_t m;
1809
1810
m = vm_page_lookup_unlocked(object, pindex);
1811
KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) &&
1812
m->object == object && m->pindex == pindex,
1813
("vm_page_relookup: Invalid page %p", m));
1814
return (m);
1815
}
1816
1817
/*
1818
* This should only be used by lockless functions for releasing transient
1819
* incorrect acquires. The page may have been freed after we acquired a
1820
* busy lock. In this case busy_lock == VPB_FREED and we have nothing
1821
* further to do.
1822
*/
1823
static void
1824
vm_page_busy_release(vm_page_t m)
1825
{
1826
u_int x;
1827
1828
x = vm_page_busy_fetch(m);
1829
for (;;) {
1830
if (x == VPB_FREED)
1831
break;
1832
if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) {
1833
if (atomic_fcmpset_int(&m->busy_lock, &x,
1834
x - VPB_ONE_SHARER))
1835
break;
1836
continue;
1837
}
1838
KASSERT((x & VPB_BIT_SHARED) != 0 ||
1839
(x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE,
1840
("vm_page_busy_release: %p xbusy not owned.", m));
1841
if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
1842
continue;
1843
if ((x & VPB_BIT_WAITERS) != 0)
1844
wakeup(m);
1845
break;
1846
}
1847
}
1848
1849
/*
1850
* Uses the page mnew as a replacement for an existing page at index
1851
* pindex, which must already be present in the object.
1852
*
1853
* Both pages must be exclusively busied on enter. The old page is
1854
* unbusied on exit.
1855
*
1856
* A return value of true means mold is now free. If this is not the
1857
* final ref and the caller does not hold a wire reference it may not
1858
* continue to access the page.
1859
*/
1860
static bool
1861
vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
1862
vm_page_t mold)
1863
{
1864
vm_page_t mret __diagused;
1865
bool dropped;
1866
1867
VM_OBJECT_ASSERT_WLOCKED(object);
1868
vm_page_assert_xbusied(mold);
1869
KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0,
1870
("vm_page_replace: page %p already in object", mnew));
1871
1872
/*
1873
* This function mostly follows vm_page_insert() and
1874
* vm_page_remove() without the radix, object count and vnode
1875
* dance. Double check such functions for more comments.
1876
*/
1877
1878
mnew->object = object;
1879
mnew->pindex = pindex;
1880
atomic_set_int(&mnew->ref_count, VPRC_OBJREF);
1881
mret = vm_radix_replace(&object->rtree, mnew);
1882
KASSERT(mret == mold,
1883
("invalid page replacement, mold=%p, mret=%p", mold, mret));
1884
KASSERT((mold->oflags & VPO_UNMANAGED) ==
1885
(mnew->oflags & VPO_UNMANAGED),
1886
("vm_page_replace: mismatched VPO_UNMANAGED"));
1887
1888
mold->object = NULL;
1889
1890
/*
1891
* The object's resident_page_count does not change because we have
1892
* swapped one page for another, but the generation count should
1893
* change if the page is dirty.
1894
*/
1895
if (pmap_page_is_write_mapped(mnew))
1896
vm_object_set_writeable_dirty(object);
1897
dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF;
1898
vm_page_xunbusy(mold);
1899
1900
return (dropped);
1901
}
1902
1903
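/*
 * vm_page_replace:
 *
 * Replace the page "mold" with "mnew" at the given object/pindex and
 * free the old page if the object's reference was the last reference
 * to it.
 */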
void
1904
vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
1905
vm_page_t mold)
1906
{
1907
1908
vm_page_assert_xbusied(mnew);
1909
1910
if (vm_page_replace_hold(mnew, object, pindex, mold))
1911
vm_page_free(mold);
1912
}
1913
1914
/*
1915
* vm_page_iter_rename:
1916
*
1917
* Tries to move the specified page from its current object to a new object
1918
* and pindex, using the given iterator to remove the page from its current
1919
* object. Returns true if the move was successful, and false if the move
1920
* was aborted due to a failed memory allocation.
1921
*
1922
* Panics if a page already resides in the new object at the new pindex.
1923
*
1924
* This routine dirties the page if it is valid, as callers are expected to
1925
* transfer backing storage only after moving the page. Dirtying the page
1926
* ensures that the destination object retains the most recent copy of the
1927
* page.
1928
*
1929
* The objects must be locked.
1930
*/
1931
bool
1932
vm_page_iter_rename(struct pctrie_iter *old_pages, vm_page_t m,
1933
vm_object_t new_object, vm_pindex_t new_pindex)
1934
{
1935
vm_pindex_t opidx;
1936
1937
KASSERT((m->ref_count & VPRC_OBJREF) != 0,
1938
("%s: page %p is missing object ref", __func__, m));
1939
VM_OBJECT_ASSERT_WLOCKED(m->object);
1940
VM_OBJECT_ASSERT_WLOCKED(new_object);
1941
1942
/*
1943
* Create a custom version of vm_page_insert() which does not depend
1944
* on m_prev and can cheat on the implementation aspects of the
1945
* function.
1946
*/
1947
opidx = m->pindex;
1948
m->pindex = new_pindex;
1949
if (vm_radix_insert(&new_object->rtree, m) != 0) {
1950
m->pindex = opidx;
1951
return (false);
1952
}
1953
1954
/*
1955
* The operation cannot fail anymore.
1956
*/
1957
m->pindex = opidx;
1958
vm_radix_iter_remove(old_pages);
1959
vm_page_remove_radixdone(m);
1960
1961
/* Return back to the new pindex to complete vm_page_insert(). */
1962
m->pindex = new_pindex;
1963
m->object = new_object;
1964
1965
vm_page_insert_radixdone(m, new_object);
1966
if (vm_page_any_valid(m))
1967
vm_page_dirty(m);
1968
vm_pager_page_inserted(new_object, m);
1969
return (true);
1970
}
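/*
 * Illustrative sketch, not part of this file: a standalone model of the
 * ordering used by vm_page_iter_rename() above, where the only step that can
 * fail (inserting into the destination index) is performed first, and the
 * irreversible removal from the source happens only after that succeeds.
 * The table type and move_entry() are hypothetical, and "destination slot
 * already taken" merely stands in for the radix allocation failure.
 */
#if 0
#include <stdbool.h>
#include <stddef.h>

struct table {
        void    *slot[128];
};

static bool
move_entry(struct table *src, size_t si, struct table *dst, size_t di)
{
        if (dst->slot[di] != NULL)
                return (false);         /* insertion failed; src untouched */
        dst->slot[di] = src->slot[si];  /* commit point */
        src->slot[si] = NULL;           /* cannot fail from here on */
        return (true);
}
#endif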
1971
1972
/*
* vm_page_alloc:
*
* Allocate and return a page that is associated with the specified
* object and offset pair. By default, this page is exclusive busied.
*
* The caller must always specify an allocation class.
*
* allocation classes:
* VM_ALLOC_NORMAL normal process request
* VM_ALLOC_SYSTEM system *really* needs a page
* VM_ALLOC_INTERRUPT interrupt time request
*
* optional allocation flags:
* VM_ALLOC_COUNT(number) the number of additional pages that the caller
* intends to allocate
* VM_ALLOC_NOBUSY do not exclusive busy the page
* VM_ALLOC_NODUMP do not include the page in a kernel core dump
* VM_ALLOC_NOFREE page will never be freed
* VM_ALLOC_NOWAIT ignored (default behavior)
* VM_ALLOC_SBUSY shared busy the allocated page
* VM_ALLOC_WAITFAIL in case of failure, sleep before returning
* VM_ALLOC_WIRED wire the allocated page
* VM_ALLOC_ZERO prefer a zeroed page
*/
1997
vm_page_t
1998
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1999
{
2000
struct pctrie_iter pages;
2001
2002
vm_page_iter_init(&pages, object);
2003
return (vm_page_alloc_iter(object, pindex, req, &pages));
2004
}
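/*
 * Illustrative sketch, not part of this file: one common calling pattern for
 * vm_page_alloc().  The helper name example_alloc_zeroed() is hypothetical.
 * With VM_ALLOC_WAITFAIL the allocator sleeps once after a failure and
 * returns NULL, so the caller simply retries; the object lock may have been
 * dropped and re-acquired while sleeping.  VM_ALLOC_ZERO only expresses a
 * preference, so the caller still zeroes the page when PG_ZERO is not set.
 */
#if 0
static vm_page_t
example_alloc_zeroed(vm_object_t object, vm_pindex_t pindex)
{
        vm_page_t m;

        VM_OBJECT_ASSERT_WLOCKED(object);
        for (;;) {
                m = vm_page_alloc(object, pindex,
                    VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_WAITFAIL);
                if (m != NULL)
                        break;
                /* The object lock may have been dropped and re-taken. */
        }
        if ((m->flags & PG_ZERO) == 0)
                pmap_zero_page(m);
        return (m);
}
#endif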
2005
2006
/*
2007
* Allocate a page in the specified object with the given page index. If the
2008
* object lock is dropped and regained, the pages iter is reset.
2009
*/
2010
vm_page_t
2011
vm_page_alloc_iter(vm_object_t object, vm_pindex_t pindex, int req,
2012
struct pctrie_iter *pages)
2013
{
2014
struct vm_domainset_iter di;
2015
vm_page_t m;
2016
int domain;
2017
2018
if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
2019
return (NULL);
2020
2021
do {
2022
m = vm_page_alloc_domain_iter(object, pindex, domain, req,
2023
pages);
2024
if (m != NULL)
2025
break;
2026
} while (vm_domainset_iter_page(&di, object, &domain, pages) == 0);
2027
2028
return (m);
2029
}
2030
2031
/*
* Attempt to reserve "npages" free pages. Returns true if the number of
* free pages exceeds the minimum for the request class, so that the
* pages were reserved, and false otherwise.
*/
2035
static int
2036
_vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages)
2037
{
2038
u_int limit, old, new;
2039
2040
if (req_class == VM_ALLOC_INTERRUPT)
2041
limit = 0;
2042
else if (req_class == VM_ALLOC_SYSTEM)
2043
limit = vmd->vmd_interrupt_free_min;
2044
else
2045
limit = vmd->vmd_free_reserved;
2046
2047
/*
2048
* Attempt to reserve the pages. Fail if we're below the limit.
2049
*/
2050
limit += npages;
2051
old = atomic_load_int(&vmd->vmd_free_count);
2052
do {
2053
if (old < limit)
2054
return (0);
2055
new = old - npages;
2056
} while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0);
2057
2058
/* Wake the page daemon if we've crossed the threshold. */
2059
if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
2060
pagedaemon_wakeup(vmd->vmd_domain);
2061
2062
/* Only update bitsets on transitions. */
2063
if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
2064
(old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
2065
vm_domain_set(vmd);
2066
2067
return (1);
2068
}
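/*
 * Illustrative sketch, not part of this file: a standalone C model of the
 * lock-free reservation loop in _vm_domain_allocate() above.  A shared free
 * counter is decremented only if doing so would not drop it below the
 * class-specific limit; compare-exchange retries absorb concurrent updates.
 * The names free_count and try_reserve() are hypothetical.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned int free_count;

static bool
try_reserve(unsigned int limit, unsigned int npages)
{
        unsigned int old, new;

        limit += npages;
        old = atomic_load_explicit(&free_count, memory_order_relaxed);
        do {
                if (old < limit)
                        return (false); /* would dip below the limit */
                new = old - npages;
        } while (!atomic_compare_exchange_weak_explicit(&free_count, &old,
            new, memory_order_acq_rel, memory_order_relaxed));
        return (true);
}
#endif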
2069
2070
int
2071
vm_domain_allocate(struct vm_domain *vmd, int req, int npages)
2072
{
2073
int req_class;
2074
2075
/*
2076
* The page daemon is allowed to dig deeper into the free page list.
2077
*/
2078
req_class = req & VM_ALLOC_CLASS_MASK;
2079
if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
2080
req_class = VM_ALLOC_SYSTEM;
2081
return (_vm_domain_allocate(vmd, req_class, npages));
2082
}
2083
2084
vm_page_t
2085
vm_page_alloc_domain_iter(vm_object_t object, vm_pindex_t pindex, int domain,
2086
int req, struct pctrie_iter *pages)
2087
{
2088
struct vm_domain *vmd;
2089
vm_page_t m;
2090
int flags;
2091
2092
#define VM_ALLOC_COMMON (VM_ALLOC_CLASS_MASK | VM_ALLOC_NODUMP | \
2093
VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | \
2094
VM_ALLOC_WIRED | VM_ALLOC_ZERO)
2095
#define VPA_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \
2096
VM_ALLOC_NOBUSY | VM_ALLOC_NOFREE | \
2097
VM_ALLOC_SBUSY)
2098
KASSERT((req & ~VPA_FLAGS) == 0,
2099
("invalid request %#x", req));
2100
KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
2101
(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
2102
("invalid request %#x", req));
2103
VM_OBJECT_ASSERT_WLOCKED(object);
2104
2105
flags = 0;
2106
m = NULL;
2107
if (!vm_pager_can_alloc_page(object, pindex))
2108
return (NULL);
2109
#if VM_NRESERVLEVEL > 0
2110
again:
2111
#endif
2112
if (__predict_false((req & VM_ALLOC_NOFREE) != 0)) {
2113
m = vm_page_alloc_nofree_domain(domain, req);
2114
if (m != NULL)
2115
goto found;
2116
}
2117
#if VM_NRESERVLEVEL > 0
2118
/*
2119
* Can we allocate the page from a reservation?
2120
*/
2121
if (vm_object_reserv(object) &&
2122
(m = vm_reserv_alloc_page(object, pindex, domain, req, pages)) !=
2123
NULL) {
2124
goto found;
2125
}
2126
#endif
2127
vmd = VM_DOMAIN(domain);
2128
if (vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone != NULL) {
2129
m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone,
2130
M_NOWAIT | M_NOVM);
2131
if (m != NULL) {
2132
flags |= PG_PCPU_CACHE;
2133
goto found;
2134
}
2135
}
2136
if (vm_domain_allocate(vmd, req, 1)) {
/*
* The per-CPU cache could not provide a page, so allocate one
* from the free page queues.
*/
2140
vm_domain_free_lock(vmd);
2141
m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0);
2142
vm_domain_free_unlock(vmd);
2143
if (m == NULL) {
2144
vm_domain_freecnt_inc(vmd, 1);
2145
#if VM_NRESERVLEVEL > 0
2146
if (vm_reserv_reclaim_inactive(domain))
2147
goto again;
2148
#endif
2149
}
2150
}
2151
if (m == NULL) {
2152
/*
2153
* Not allocatable, give up.
2154
*/
2155
(void)vm_domain_alloc_fail(vmd, object, req);
2156
if ((req & VM_ALLOC_WAITFAIL) != 0)
2157
pctrie_iter_reset(pages);
2158
return (NULL);
2159
}
2160
2161
/*
2162
* At this point we had better have found a good page.
2163
*/
2164
found:
2165
vm_page_dequeue(m);
2166
vm_page_alloc_check(m);
2167
2168
/*
2169
* Initialize the page. Only the PG_ZERO flag is inherited.
2170
*/
2171
flags |= m->flags & PG_ZERO;
2172
if ((req & VM_ALLOC_NODUMP) != 0)
2173
flags |= PG_NODUMP;
2174
if ((req & VM_ALLOC_NOFREE) != 0)
2175
flags |= PG_NOFREE;
2176
m->flags = flags;
2177
m->a.flags = 0;
2178
m->oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0;
2179
m->pool = VM_FREEPOOL_DEFAULT;
2180
if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
2181
m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
2182
else if ((req & VM_ALLOC_SBUSY) != 0)
2183
m->busy_lock = VPB_SHARERS_WORD(1);
2184
else
2185
m->busy_lock = VPB_UNBUSIED;
2186
if (req & VM_ALLOC_WIRED) {
2187
vm_wire_add(1);
2188
m->ref_count = 1;
2189
}
2190
m->a.act_count = 0;
2191
2192
if (vm_page_iter_insert(m, object, pindex, pages)) {
2193
if (req & VM_ALLOC_WIRED) {
2194
vm_wire_sub(1);
2195
m->ref_count = 0;
2196
}
2197
KASSERT(m->object == NULL, ("page %p has object", m));
2198
m->oflags = VPO_UNMANAGED;
2199
m->busy_lock = VPB_UNBUSIED;
2200
/* Don't change PG_ZERO. */
2201
vm_page_free_toq(m);
2202
if (req & VM_ALLOC_WAITFAIL) {
2203
VM_OBJECT_WUNLOCK(object);
2204
vm_radix_wait();
2205
pctrie_iter_reset(pages);
2206
VM_OBJECT_WLOCK(object);
2207
}
2208
return (NULL);
2209
}
2210
2211
/* Ignore device objects; the pager sets "memattr" for them. */
2212
if (object->memattr != VM_MEMATTR_DEFAULT &&
2213
(object->flags & OBJ_FICTITIOUS) == 0)
2214
pmap_page_set_memattr(m, object->memattr);
2215
2216
return (m);
2217
}
2218
2219
/*
2220
* vm_page_alloc_contig:
2221
*
2222
* Allocate a contiguous set of physical pages of the given size "npages"
2223
* from the free lists. All of the physical pages must be at or above
2224
* the given physical address "low" and below the given physical address
2225
* "high". The given value "alignment" determines the alignment of the
2226
* first physical page in the set. If the given value "boundary" is
2227
* non-zero, then the set of physical pages cannot cross any physical
2228
* address boundary that is a multiple of that value. Both "alignment"
2229
* and "boundary" must be a power of two.
2230
*
2231
* If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
2232
* then the memory attribute setting for the physical pages is configured
2233
* to the object's memory attribute setting. Otherwise, the memory
2234
* attribute setting for the physical pages is configured to "memattr",
2235
* overriding the object's memory attribute setting. However, if the
2236
* object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
2237
* memory attribute setting for the physical pages cannot be configured
2238
* to VM_MEMATTR_DEFAULT.
2239
*
2240
* The specified object may not contain fictitious pages.
2241
*
2242
* The caller must always specify an allocation class.
2243
*
2244
* allocation classes:
2245
* VM_ALLOC_NORMAL normal process request
2246
* VM_ALLOC_SYSTEM system *really* needs the pages
2247
* VM_ALLOC_INTERRUPT interrupt time request
2248
*
2249
* optional allocation flags:
2250
* VM_ALLOC_NOBUSY do not exclusive busy the pages
2251
* VM_ALLOC_NODUMP do not include the pages in a kernel core dump
2252
* VM_ALLOC_NORECLAIM do not reclaim after initial failure
2253
* VM_ALLOC_NOWAIT ignored (default behavior)
2254
* VM_ALLOC_SBUSY shared busy the allocated pages
2255
* VM_ALLOC_WAITFAIL in case of failure, sleep before returning
2256
* VM_ALLOC_WIRED wire the allocated pages
2257
* VM_ALLOC_ZERO prefer zeroed pages
2258
*/
2259
vm_page_t
2260
vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
2261
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
2262
vm_paddr_t boundary, vm_memattr_t memattr)
2263
{
2264
struct vm_domainset_iter di;
2265
vm_page_t bounds[2];
2266
vm_page_t m;
2267
int domain;
2268
int start_segind;
2269
2270
start_segind = -1;
2271
2272
if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
2273
return (NULL);
2274
2275
do {
2276
m = vm_page_alloc_contig_domain(object, pindex, domain, req,
2277
npages, low, high, alignment, boundary, memattr);
2278
if (m != NULL)
2279
break;
2280
if (start_segind == -1)
2281
start_segind = vm_phys_lookup_segind(low);
2282
if (vm_phys_find_range(bounds, start_segind, domain,
2283
npages, low, high) == -1) {
2284
vm_domainset_iter_ignore(&di, domain);
2285
}
2286
} while (vm_domainset_iter_page(&di, object, &domain, NULL) == 0);
2287
2288
return (m);
2289
}
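/*
 * Illustrative sketch, not part of this file: a typical vm_page_alloc_contig()
 * call requesting "npages" wired pages that lie below 4GB (assuming a 64-bit
 * vm_paddr_t), aligned to PAGE_SIZE, with no boundary restriction.  The
 * helper name is hypothetical.  On a NULL return the caller might invoke
 * vm_page_reclaim_contig() and/or vm_wait() and retry, as described for
 * vm_page_reclaim_contig() further below.
 */
#if 0
static vm_page_t
example_alloc_contig_below_4g(vm_object_t object, vm_pindex_t pindex,
    u_long npages)
{
        VM_OBJECT_ASSERT_WLOCKED(object);
        return (vm_page_alloc_contig(object, pindex,
            VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO,
            npages, 0, (vm_paddr_t)1 << 32, PAGE_SIZE, 0,
            VM_MEMATTR_DEFAULT));
}
#endif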
2290
2291
static vm_page_t
2292
vm_page_find_contig_domain(int domain, int req, u_long npages, vm_paddr_t low,
2293
vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
2294
{
2295
struct vm_domain *vmd;
2296
vm_page_t m_ret;
2297
2298
/*
2299
* Can we allocate the pages without the number of free pages falling
2300
* below the lower bound for the allocation class?
2301
*/
2302
vmd = VM_DOMAIN(domain);
2303
if (!vm_domain_allocate(vmd, req, npages))
2304
return (NULL);
2305
/*
2306
* Try to allocate the pages from the free page queues.
2307
*/
2308
vm_domain_free_lock(vmd);
2309
m_ret = vm_phys_alloc_contig(domain, npages, low, high,
2310
alignment, boundary);
2311
vm_domain_free_unlock(vmd);
2312
if (m_ret != NULL)
2313
return (m_ret);
2314
#if VM_NRESERVLEVEL > 0
2315
/*
2316
* Try to break a reservation to allocate the pages.
2317
*/
2318
if ((req & VM_ALLOC_NORECLAIM) == 0) {
2319
m_ret = vm_reserv_reclaim_contig(domain, npages, low,
2320
high, alignment, boundary);
2321
if (m_ret != NULL)
2322
return (m_ret);
2323
}
2324
#endif
2325
vm_domain_freecnt_inc(vmd, npages);
2326
return (NULL);
2327
}
2328
2329
vm_page_t
2330
vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
2331
int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
2332
vm_paddr_t boundary, vm_memattr_t memattr)
2333
{
2334
struct pctrie_iter pages;
2335
vm_page_t m, m_ret, mpred;
2336
u_int busy_lock, flags, oflags;
2337
2338
#define VPAC_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \
2339
VM_ALLOC_NOBUSY | VM_ALLOC_NORECLAIM | \
2340
VM_ALLOC_SBUSY)
2341
KASSERT((req & ~VPAC_FLAGS) == 0,
2342
("invalid request %#x", req));
2343
KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
2344
(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
2345
("invalid request %#x", req));
2346
VM_OBJECT_ASSERT_WLOCKED(object);
2347
KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
2348
("vm_page_alloc_contig: object %p has fictitious pages",
2349
object));
2350
KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
2351
2352
vm_page_iter_init(&pages, object);
2353
m_ret = NULL;
2354
#if VM_NRESERVLEVEL > 0
2355
/*
2356
* Can we allocate the pages from a reservation?
2357
*/
2358
if (vm_object_reserv(object)) {
2359
m_ret = vm_reserv_alloc_contig(object, pindex, domain,
2360
req, npages, low, high, alignment, boundary, &pages);
2361
}
2362
#endif
2363
if (m_ret == NULL) {
2364
m_ret = vm_page_find_contig_domain(domain, req, npages,
2365
low, high, alignment, boundary);
2366
}
2367
if (m_ret == NULL) {
2368
(void)vm_domain_alloc_fail(VM_DOMAIN(domain), object, req);
2369
return (NULL);
2370
}
2371
2372
/*
2373
* Initialize the pages. Only the PG_ZERO flag is inherited.
2374
*/
2375
flags = PG_ZERO;
2376
if ((req & VM_ALLOC_NODUMP) != 0)
2377
flags |= PG_NODUMP;
2378
oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0;
2379
if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
2380
busy_lock = VPB_CURTHREAD_EXCLUSIVE;
2381
else if ((req & VM_ALLOC_SBUSY) != 0)
2382
busy_lock = VPB_SHARERS_WORD(1);
2383
else
2384
busy_lock = VPB_UNBUSIED;
2385
if ((req & VM_ALLOC_WIRED) != 0)
2386
vm_wire_add(npages);
2387
if (object->memattr != VM_MEMATTR_DEFAULT &&
2388
memattr == VM_MEMATTR_DEFAULT)
2389
memattr = object->memattr;
2390
for (m = m_ret; m < &m_ret[npages]; m++) {
2391
vm_page_dequeue(m);
2392
vm_page_alloc_check(m);
2393
m->a.flags = 0;
2394
m->flags = (m->flags | PG_NODUMP) & flags;
2395
m->busy_lock = busy_lock;
2396
if ((req & VM_ALLOC_WIRED) != 0)
2397
m->ref_count = 1;
2398
m->a.act_count = 0;
2399
m->oflags = oflags;
2400
m->pool = VM_FREEPOOL_DEFAULT;
2401
if (vm_page_iter_insert(m, object, pindex, &pages)) {
2402
if ((req & VM_ALLOC_WIRED) != 0)
2403
vm_wire_sub(npages);
2404
KASSERT(m->object == NULL,
2405
("page %p has object", m));
2406
mpred = m;
2407
for (m = m_ret; m < &m_ret[npages]; m++) {
2408
if (m <= mpred &&
2409
(req & VM_ALLOC_WIRED) != 0)
2410
m->ref_count = 0;
2411
m->oflags = VPO_UNMANAGED;
2412
m->busy_lock = VPB_UNBUSIED;
2413
/* Don't change PG_ZERO. */
2414
vm_page_free_toq(m);
2415
}
2416
if (req & VM_ALLOC_WAITFAIL) {
2417
VM_OBJECT_WUNLOCK(object);
2418
vm_radix_wait();
2419
VM_OBJECT_WLOCK(object);
2420
}
2421
return (NULL);
2422
}
2423
if (memattr != VM_MEMATTR_DEFAULT)
2424
pmap_page_set_memattr(m, memattr);
2425
pindex++;
2426
}
2427
return (m_ret);
2428
}
2429
2430
/*
2431
* Allocate a physical page that is not intended to be inserted into a VM
2432
* object.
2433
*/
2434
vm_page_t
2435
vm_page_alloc_noobj_domain(int domain, int req)
2436
{
2437
struct vm_domain *vmd;
2438
vm_page_t m;
2439
int flags;
2440
2441
#define VPAN_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \
2442
VM_ALLOC_NOFREE | VM_ALLOC_WAITOK)
2443
KASSERT((req & ~VPAN_FLAGS) == 0,
2444
("invalid request %#x", req));
2445
2446
flags = ((req & VM_ALLOC_NODUMP) != 0 ? PG_NODUMP : 0) |
2447
((req & VM_ALLOC_NOFREE) != 0 ? PG_NOFREE : 0);
2448
vmd = VM_DOMAIN(domain);
2449
again:
2450
if (__predict_false((req & VM_ALLOC_NOFREE) != 0)) {
2451
m = vm_page_alloc_nofree_domain(domain, req);
2452
if (m != NULL)
2453
goto found;
2454
}
2455
2456
if (vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) {
2457
m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone,
2458
M_NOWAIT | M_NOVM);
2459
if (m != NULL) {
2460
flags |= PG_PCPU_CACHE;
2461
goto found;
2462
}
2463
}
2464
2465
if (vm_domain_allocate(vmd, req, 1)) {
2466
vm_domain_free_lock(vmd);
2467
m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0);
2468
vm_domain_free_unlock(vmd);
2469
if (m == NULL) {
2470
vm_domain_freecnt_inc(vmd, 1);
2471
#if VM_NRESERVLEVEL > 0
2472
if (vm_reserv_reclaim_inactive(domain))
2473
goto again;
2474
#endif
2475
}
2476
}
2477
if (m == NULL) {
2478
if (!vm_domain_alloc_fail(vmd, NULL, req))
2479
return (NULL);
2480
goto again;
2481
}
2482
2483
found:
2484
/*
2485
* If the page comes from the free page cache, then it might still
2486
* have a pending deferred dequeue. Specifically, when the page is
2487
* imported from a different pool by vm_phys_alloc_npages(), the
2488
* second, third, etc. pages in a non-zero order set could have
2489
* pending deferred dequeues.
2490
*/
2491
vm_page_dequeue(m);
2492
vm_page_alloc_check(m);
2493
2494
/*
2495
* Consumers should not rely on a useful default pindex value.
2496
*/
2497
m->pindex = 0xdeadc0dedeadc0de;
2498
m->flags = (m->flags & PG_ZERO) | flags;
2499
m->a.flags = 0;
2500
m->oflags = VPO_UNMANAGED;
2501
m->pool = VM_FREEPOOL_DIRECT;
2502
m->busy_lock = VPB_UNBUSIED;
2503
if ((req & VM_ALLOC_WIRED) != 0) {
2504
vm_wire_add(1);
2505
m->ref_count = 1;
2506
}
2507
2508
if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
2509
pmap_zero_page(m);
2510
2511
return (m);
2512
}
2513
2514
#if VM_NRESERVLEVEL > 1
2515
#define VM_NOFREE_IMPORT_ORDER (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER)
2516
#elif VM_NRESERVLEVEL > 0
2517
#define VM_NOFREE_IMPORT_ORDER VM_LEVEL_0_ORDER
2518
#else
2519
#define VM_NOFREE_IMPORT_ORDER 8
2520
#endif
2521
2522
/*
* Allocate a single NOFREE page.
*
* This routine hands out NOFREE pages from higher-order
* physical memory blocks in order to reduce memory fragmentation.
* When the NOFREE chunk for a given domain is used up,
* the routine will try to fetch a new one from the free lists
* and discard the old one.
*/
2531
static vm_page_t __noinline
2532
vm_page_alloc_nofree_domain(int domain, int req)
2533
{
2534
vm_page_t m;
2535
struct vm_domain *vmd;
2536
2537
KASSERT((req & VM_ALLOC_NOFREE) != 0, ("invalid request %#x", req));
2538
2539
vmd = VM_DOMAIN(domain);
2540
vm_domain_free_lock(vmd);
2541
if (TAILQ_EMPTY(&vmd->vmd_nofreeq)) {
2542
int count;
2543
2544
count = 1 << VM_NOFREE_IMPORT_ORDER;
2545
if (!vm_domain_allocate(vmd, req, count)) {
2546
vm_domain_free_unlock(vmd);
2547
return (NULL);
2548
}
2549
m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
2550
VM_NOFREE_IMPORT_ORDER);
2551
if (m == NULL) {
2552
vm_domain_freecnt_inc(vmd, count);
2553
vm_domain_free_unlock(vmd);
2554
return (NULL);
2555
}
2556
m->ref_count = count - 1;
2557
TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m, plinks.q);
2558
atomic_add_long(&nofreeq_size, count);
2559
}
2560
m = TAILQ_FIRST(&vmd->vmd_nofreeq);
2561
TAILQ_REMOVE(&vmd->vmd_nofreeq, m, plinks.q);
2562
if (m->ref_count > 0) {
2563
vm_page_t m_next;
2564
2565
m_next = &m[1];
2566
vm_page_dequeue(m_next);
2567
m_next->ref_count = m->ref_count - 1;
2568
TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m_next, plinks.q);
2569
m->ref_count = 0;
2570
}
2571
vm_domain_free_unlock(vmd);
2572
atomic_add_long(&nofreeq_size, -1);
2573
VM_CNT_INC(v_nofree_count);
2574
2575
return (m);
2576
}
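/*
 * Illustrative sketch, not part of this file: a standalone model of the
 * chunk-carving scheme used by vm_page_alloc_nofree_domain() above.  A large
 * chunk is obtained once and single pages are peeled off its front until it
 * is exhausted, which keeps never-freed allocations physically clustered.
 * The names nofree_cursor, nofree_remaining and nofree_page_alloc() are
 * hypothetical.
 */
#if 0
#include <stddef.h>
#include <stdlib.h>

#define CHUNK_PAGES     256             /* models 1 << VM_NOFREE_IMPORT_ORDER */
#define PAGE_BYTES      4096

static char     *nofree_cursor;
static size_t    nofree_remaining;      /* pages left in the current chunk */

static void *
nofree_page_alloc(void)
{
        void *p;

        if (nofree_remaining == 0) {
                /* Refill: grab a whole new chunk from the system. */
                nofree_cursor = aligned_alloc(PAGE_BYTES,
                    (size_t)CHUNK_PAGES * PAGE_BYTES);
                if (nofree_cursor == NULL)
                        return (NULL);
                nofree_remaining = CHUNK_PAGES;
        }
        p = nofree_cursor;
        nofree_cursor += PAGE_BYTES;
        nofree_remaining--;
        return (p);
}
#endif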
2577
2578
/*
* Though a NOFREE page by definition should not be freed, we support putting
* them aside for future NOFREE allocations. This enables code that allocates
* NOFREE pages for some purpose, but then encounters an error, to release
* those resources.
*/
2584
static void __noinline
2585
vm_page_free_nofree(struct vm_domain *vmd, vm_page_t m)
2586
{
2587
VM_CNT_ADD(v_nofree_count, -1);
2588
atomic_add_long(&nofreeq_size, 1);
2589
vm_domain_free_lock(vmd);
2590
MPASS(m->ref_count == 0);
2591
TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m, plinks.q);
2592
vm_domain_free_unlock(vmd);
2593
}
2594
2595
vm_page_t
2596
vm_page_alloc_noobj(int req)
2597
{
2598
struct vm_domainset_iter di;
2599
vm_page_t m;
2600
int domain;
2601
2602
if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
2603
return (NULL);
2604
2605
do {
2606
m = vm_page_alloc_noobj_domain(domain, req);
2607
if (m != NULL)
2608
break;
2609
} while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0);
2610
2611
return (m);
2612
}
2613
2614
vm_page_t
2615
vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low,
2616
vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
2617
vm_memattr_t memattr)
2618
{
2619
struct vm_domainset_iter di;
2620
vm_page_t m;
2621
int domain;
2622
2623
if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
2624
return (NULL);
2625
2626
do {
2627
m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low,
2628
high, alignment, boundary, memattr);
2629
if (m != NULL)
2630
break;
2631
} while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0);
2632
2633
return (m);
2634
}
2635
2636
vm_page_t
2637
vm_page_alloc_noobj_contig_domain(int domain, int req, u_long npages,
2638
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
2639
vm_memattr_t memattr)
2640
{
2641
vm_page_t m, m_ret;
2642
u_int flags;
2643
2644
#define VPANC_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \
2645
VM_ALLOC_NORECLAIM | VM_ALLOC_WAITOK)
2646
KASSERT((req & ~VPANC_FLAGS) == 0,
2647
("invalid request %#x", req));
2648
KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) !=
2649
(VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM),
2650
("invalid request %#x", req));
2651
KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
2652
2653
while ((m_ret = vm_page_find_contig_domain(domain, req, npages,
2654
low, high, alignment, boundary)) == NULL) {
2655
if (!vm_domain_alloc_fail(VM_DOMAIN(domain), NULL, req))
2656
return (NULL);
2657
}
2658
2659
/*
2660
* Initialize the pages. Only the PG_ZERO flag is inherited.
2661
*/
2662
flags = PG_ZERO;
2663
if ((req & VM_ALLOC_NODUMP) != 0)
2664
flags |= PG_NODUMP;
2665
if ((req & VM_ALLOC_WIRED) != 0)
2666
vm_wire_add(npages);
2667
for (m = m_ret; m < &m_ret[npages]; m++) {
2668
vm_page_dequeue(m);
2669
vm_page_alloc_check(m);
2670
2671
/*
2672
* Consumers should not rely on a useful default pindex value.
2673
*/
2674
m->pindex = 0xdeadc0dedeadc0de;
2675
m->a.flags = 0;
2676
m->flags = (m->flags | PG_NODUMP) & flags;
2677
m->busy_lock = VPB_UNBUSIED;
2678
if ((req & VM_ALLOC_WIRED) != 0)
2679
m->ref_count = 1;
2680
m->a.act_count = 0;
2681
m->oflags = VPO_UNMANAGED;
2682
m->pool = VM_FREEPOOL_DIRECT;
2683
2684
/*
2685
* Zero the page before updating any mappings since the page is
2686
* not yet shared with any devices which might require the
2687
* non-default memory attribute. pmap_page_set_memattr()
2688
* flushes data caches before returning.
2689
*/
2690
if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
2691
pmap_zero_page(m);
2692
if (memattr != VM_MEMATTR_DEFAULT)
2693
pmap_page_set_memattr(m, memattr);
2694
}
2695
return (m_ret);
2696
}
2697
2698
/*
2699
* Check a page that has been freshly dequeued from a freelist.
2700
*/
2701
static void
2702
vm_page_alloc_check(vm_page_t m)
2703
{
2704
2705
KASSERT(m->object == NULL, ("page %p has object", m));
2706
KASSERT(m->a.queue == PQ_NONE &&
2707
(m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
2708
("page %p has unexpected queue %d, flags %#x",
2709
m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK)));
2710
KASSERT(m->ref_count == 0, ("page %p has references", m));
2711
KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m));
2712
KASSERT(m->dirty == 0, ("page %p is dirty", m));
2713
KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
2714
("page %p has unexpected memattr %d",
2715
m, pmap_page_get_memattr(m)));
2716
KASSERT(vm_page_none_valid(m), ("free page %p is valid", m));
2717
pmap_vm_page_alloc_check(m);
2718
}
2719
2720
static int
2721
vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags)
2722
{
2723
struct vm_domain *vmd;
2724
struct vm_pgcache *pgcache;
2725
int i;
2726
2727
pgcache = arg;
2728
vmd = VM_DOMAIN(pgcache->domain);
2729
2730
/*
2731
* The page daemon should avoid creating extra memory pressure since its
2732
* main purpose is to replenish the store of free pages.
2733
*/
2734
if (vmd->vmd_severeset || curproc == pageproc ||
2735
!_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt))
2736
return (0);
2737
domain = vmd->vmd_domain;
2738
vm_domain_free_lock(vmd);
2739
i = vm_phys_alloc_npages(domain, pgcache->pool, cnt,
2740
(vm_page_t *)store);
2741
vm_domain_free_unlock(vmd);
2742
if (cnt != i)
2743
vm_domain_freecnt_inc(vmd, cnt - i);
2744
2745
return (i);
2746
}
2747
2748
static void
2749
vm_page_zone_release(void *arg, void **store, int cnt)
2750
{
2751
struct vm_domain *vmd;
2752
struct vm_pgcache *pgcache;
2753
vm_page_t m;
2754
int i;
2755
2756
pgcache = arg;
2757
vmd = VM_DOMAIN(pgcache->domain);
2758
vm_domain_free_lock(vmd);
2759
for (i = 0; i < cnt; i++) {
2760
m = (vm_page_t)store[i];
2761
vm_phys_free_pages(m, pgcache->pool, 0);
2762
}
2763
vm_domain_free_unlock(vmd);
2764
vm_domain_freecnt_inc(vmd, cnt);
2765
}
2766
2767
#define VPSC_ANY 0 /* No restrictions. */
2768
#define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */
2769
#define VPSC_NOSUPER 2 /* Skip superpages. */
2770
2771
/*
2772
* vm_page_scan_contig:
2773
*
2774
* Scan vm_page_array[] between the specified entries "m_start" and
2775
* "m_end" for a run of contiguous physical pages that satisfy the
2776
* specified conditions, and return the lowest page in the run. The
2777
* specified "alignment" determines the alignment of the lowest physical
2778
* page in the run. If the specified "boundary" is non-zero, then the
2779
* run of physical pages cannot span a physical address that is a
2780
* multiple of "boundary".
2781
*
2782
* "m_end" is never dereferenced, so it need not point to a vm_page
2783
* structure within vm_page_array[].
2784
*
2785
* "npages" must be greater than zero. "m_start" and "m_end" must not
2786
* span a hole (or discontiguity) in the physical address space. Both
2787
* "alignment" and "boundary" must be a power of two.
2788
*/
2789
static vm_page_t
2790
vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
2791
u_long alignment, vm_paddr_t boundary, int options)
2792
{
2793
vm_object_t object;
2794
vm_paddr_t pa;
2795
vm_page_t m, m_run;
2796
#if VM_NRESERVLEVEL > 0
2797
int level;
2798
#endif
2799
int m_inc, order, run_ext, run_len;
2800
2801
KASSERT(npages > 0, ("npages is 0"));
2802
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
2803
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
2804
m_run = NULL;
2805
run_len = 0;
2806
for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
2807
KASSERT((m->flags & PG_MARKER) == 0,
2808
("page %p is PG_MARKER", m));
2809
KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1,
2810
("fictitious page %p has invalid ref count", m));
2811
2812
/*
2813
* If the current page would be the start of a run, check its
2814
* physical address against the end, alignment, and boundary
2815
* conditions. If it doesn't satisfy these conditions, either
2816
* terminate the scan or advance to the next page that
2817
* satisfies the failed condition.
2818
*/
2819
if (run_len == 0) {
2820
KASSERT(m_run == NULL, ("m_run != NULL"));
2821
if (m + npages > m_end)
2822
break;
2823
pa = VM_PAGE_TO_PHYS(m);
2824
if (!vm_addr_align_ok(pa, alignment)) {
2825
m_inc = atop(roundup2(pa, alignment) - pa);
2826
continue;
2827
}
2828
if (!vm_addr_bound_ok(pa, ptoa(npages), boundary)) {
2829
m_inc = atop(roundup2(pa, boundary) - pa);
2830
continue;
2831
}
2832
} else
2833
KASSERT(m_run != NULL, ("m_run == NULL"));
2834
2835
retry:
2836
m_inc = 1;
2837
if (vm_page_wired(m))
2838
run_ext = 0;
2839
#if VM_NRESERVLEVEL > 0
2840
else if ((level = vm_reserv_level(m)) >= 0 &&
2841
(options & VPSC_NORESERV) != 0) {
2842
run_ext = 0;
2843
/* Advance to the end of the reservation. */
2844
pa = VM_PAGE_TO_PHYS(m);
2845
m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
2846
pa);
2847
}
2848
#endif
2849
else if ((object = atomic_load_ptr(&m->object)) != NULL) {
2850
/*
2851
* The page is considered eligible for relocation if
2852
* and only if it could be laundered or reclaimed by
2853
* the page daemon.
2854
*/
2855
VM_OBJECT_RLOCK(object);
2856
if (object != m->object) {
2857
VM_OBJECT_RUNLOCK(object);
2858
goto retry;
2859
}
2860
/* Don't care: PG_NODUMP, PG_ZERO. */
2861
if ((object->flags & OBJ_SWAP) == 0 &&
2862
object->type != OBJT_VNODE) {
2863
run_ext = 0;
2864
#if VM_NRESERVLEVEL > 0
2865
} else if ((options & VPSC_NOSUPER) != 0 &&
2866
(level = vm_reserv_level_iffullpop(m)) >= 0) {
2867
run_ext = 0;
2868
/* Advance to the end of the superpage. */
2869
pa = VM_PAGE_TO_PHYS(m);
2870
m_inc = atop(roundup2(pa + 1,
2871
vm_reserv_size(level)) - pa);
2872
#endif
2873
} else if (object->memattr == VM_MEMATTR_DEFAULT &&
2874
vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) {
2875
/*
2876
* The page is allocated but eligible for
2877
* relocation. Extend the current run by one
2878
* page.
2879
*/
2880
KASSERT(pmap_page_get_memattr(m) ==
2881
VM_MEMATTR_DEFAULT,
2882
("page %p has an unexpected memattr", m));
2883
KASSERT((m->oflags & (VPO_SWAPINPROG |
2884
VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
2885
("page %p has unexpected oflags", m));
2886
/* Don't care: PGA_NOSYNC. */
2887
run_ext = 1;
2888
} else
2889
run_ext = 0;
2890
VM_OBJECT_RUNLOCK(object);
2891
#if VM_NRESERVLEVEL > 0
2892
} else if (level >= 0) {
2893
/*
2894
* The page is reserved but not yet allocated. In
2895
* other words, it is still free. Extend the current
2896
* run by one page.
2897
*/
2898
run_ext = 1;
2899
#endif
2900
} else if ((order = m->order) < VM_NFREEORDER) {
2901
/*
2902
* The page is enqueued in the physical memory
2903
* allocator's free page queues. Moreover, it is the
2904
* first page in a power-of-two-sized run of
2905
* contiguous free pages. Add these pages to the end
2906
* of the current run, and jump ahead.
2907
*/
2908
run_ext = 1 << order;
2909
m_inc = 1 << order;
2910
} else {
2911
/*
2912
* Skip the page for one of the following reasons: (1)
2913
* It is enqueued in the physical memory allocator's
2914
* free page queues. However, it is not the first
2915
* page in a run of contiguous free pages. (This case
2916
* rarely occurs because the scan is performed in
2917
* ascending order.) (2) It is not reserved, and it is
2918
* transitioning from free to allocated. (Conversely,
2919
* the transition from allocated to free for managed
2920
* pages is blocked by the page busy lock.) (3) It is
2921
* allocated but not contained by an object and not
2922
* wired, e.g., allocated by Xen's balloon driver.
2923
*/
2924
run_ext = 0;
2925
}
2926
2927
/*
2928
* Extend or reset the current run of pages.
2929
*/
2930
if (run_ext > 0) {
2931
if (run_len == 0)
2932
m_run = m;
2933
run_len += run_ext;
2934
} else {
2935
if (run_len > 0) {
2936
m_run = NULL;
2937
run_len = 0;
2938
}
2939
}
2940
}
2941
if (run_len >= npages)
2942
return (m_run);
2943
return (NULL);
2944
}
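/*
 * Illustrative sketch, not part of this file: a standalone model of how
 * vm_page_scan_contig() advances past candidate start addresses that violate
 * the alignment or boundary constraints, instead of stepping one page at a
 * time.  roundup2() is redefined locally and pages_to_skip() is hypothetical;
 * alignment and boundary are assumed to be powers of two, with boundary
 * either zero or at least as large as the run.
 */
#if 0
#include <stdint.h>

#define PAGE_SHIFT      12
#define roundup2(x, y)  (((x) + ((y) - 1)) & ~((uint64_t)(y) - 1))

/*
 * Return 0 if a run of "npages" starting at physical address "pa" satisfies
 * "alignment" and does not cross a "boundary"; otherwise return how many
 * pages the scan should skip to reach the next plausible start address.
 */
static uint64_t
pages_to_skip(uint64_t pa, uint64_t npages, uint64_t alignment,
    uint64_t boundary)
{
        if ((pa & (alignment - 1)) != 0)
                return ((roundup2(pa, alignment) - pa) >> PAGE_SHIFT);
        if (boundary != 0 &&
            roundup2(pa + 1, boundary) < pa + (npages << PAGE_SHIFT))
                return ((roundup2(pa, boundary) - pa) >> PAGE_SHIFT);
        return (0);
}
#endif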
2945
2946
/*
2947
* vm_page_reclaim_run:
2948
*
2949
* Try to relocate each of the allocated virtual pages within the
2950
* specified run of physical pages to a new physical address. Free the
2951
* physical pages underlying the relocated virtual pages. A virtual page
2952
* is relocatable if and only if it could be laundered or reclaimed by
2953
* the page daemon. Whenever possible, a virtual page is relocated to a
2954
* physical address above "high".
2955
*
2956
* Returns 0 if every physical page within the run was already free or
2957
* just freed by a successful relocation. Otherwise, returns a non-zero
2958
* value indicating why the last attempt to relocate a virtual page was
2959
* unsuccessful.
2960
*
2961
* "req_class" must be an allocation class.
2962
*/
2963
static int
2964
vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
2965
vm_paddr_t high)
2966
{
2967
struct vm_domain *vmd;
2968
struct spglist free;
2969
vm_object_t object;
2970
vm_paddr_t pa;
2971
vm_page_t m, m_end, m_new;
2972
int error, order, req;
2973
2974
KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
2975
("req_class is not an allocation class"));
2976
SLIST_INIT(&free);
2977
error = 0;
2978
m = m_run;
2979
m_end = m_run + npages;
2980
for (; error == 0 && m < m_end; m++) {
2981
KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
2982
("page %p is PG_FICTITIOUS or PG_MARKER", m));
2983
2984
/*
2985
* Racily check for wirings. Races are handled once the object
2986
* lock is held and the page is unmapped.
2987
*/
2988
if (vm_page_wired(m))
2989
error = EBUSY;
2990
else if ((object = atomic_load_ptr(&m->object)) != NULL) {
2991
/*
2992
* The page is relocated if and only if it could be
2993
* laundered or reclaimed by the page daemon.
2994
*/
2995
VM_OBJECT_WLOCK(object);
2996
/* Don't care: PG_NODUMP, PG_ZERO. */
2997
if (m->object != object ||
2998
((object->flags & OBJ_SWAP) == 0 &&
2999
object->type != OBJT_VNODE))
3000
error = EINVAL;
3001
else if (object->memattr != VM_MEMATTR_DEFAULT)
3002
error = EINVAL;
3003
else if (vm_page_queue(m) != PQ_NONE &&
3004
vm_page_tryxbusy(m) != 0) {
3005
if (vm_page_wired(m)) {
3006
vm_page_xunbusy(m);
3007
error = EBUSY;
3008
goto unlock;
3009
}
3010
KASSERT(pmap_page_get_memattr(m) ==
3011
VM_MEMATTR_DEFAULT,
3012
("page %p has an unexpected memattr", m));
3013
KASSERT(m->oflags == 0,
3014
("page %p has unexpected oflags", m));
3015
/* Don't care: PGA_NOSYNC. */
3016
if (!vm_page_none_valid(m)) {
3017
/*
3018
* First, try to allocate a new page
3019
* that is above "high". Failing
3020
* that, try to allocate a new page
3021
* that is below "m_run". Allocate
3022
* the new page between the end of
3023
* "m_run" and "high" only as a last
3024
* resort.
3025
*/
3026
req = req_class;
3027
if ((m->flags & PG_NODUMP) != 0)
3028
req |= VM_ALLOC_NODUMP;
3029
if (trunc_page(high) !=
3030
~(vm_paddr_t)PAGE_MASK) {
3031
m_new =
3032
vm_page_alloc_noobj_contig(
3033
req, 1, round_page(high),
3034
~(vm_paddr_t)0, PAGE_SIZE,
3035
0, VM_MEMATTR_DEFAULT);
3036
} else
3037
m_new = NULL;
3038
if (m_new == NULL) {
3039
pa = VM_PAGE_TO_PHYS(m_run);
3040
m_new =
3041
vm_page_alloc_noobj_contig(
3042
req, 1, 0, pa - 1,
3043
PAGE_SIZE, 0,
3044
VM_MEMATTR_DEFAULT);
3045
}
3046
if (m_new == NULL) {
3047
pa += ptoa(npages);
3048
m_new =
3049
vm_page_alloc_noobj_contig(
3050
req, 1, pa, high, PAGE_SIZE,
3051
0, VM_MEMATTR_DEFAULT);
3052
}
3053
if (m_new == NULL) {
3054
vm_page_xunbusy(m);
3055
error = ENOMEM;
3056
goto unlock;
3057
}
3058
3059
/*
3060
* Unmap the page and check for new
3061
* wirings that may have been acquired
3062
* through a pmap lookup.
3063
*/
3064
if (object->ref_count != 0 &&
3065
!vm_page_try_remove_all(m)) {
3066
vm_page_xunbusy(m);
3067
vm_page_free(m_new);
3068
error = EBUSY;
3069
goto unlock;
3070
}
3071
3072
/*
3073
* Replace "m" with the new page. For
3074
* vm_page_replace(), "m" must be busy
3075
* and dequeued. Finally, change "m"
3076
* as if vm_page_free() was called.
3077
*/
3078
m_new->a.flags = m->a.flags &
3079
~PGA_QUEUE_STATE_MASK;
3080
KASSERT(m_new->oflags == VPO_UNMANAGED,
3081
("page %p is managed", m_new));
3082
m_new->oflags = 0;
3083
pmap_copy_page(m, m_new);
3084
m_new->valid = m->valid;
3085
m_new->dirty = m->dirty;
3086
m->flags &= ~PG_ZERO;
3087
vm_page_dequeue(m);
3088
if (vm_page_replace_hold(m_new, object,
3089
m->pindex, m) &&
3090
vm_page_free_prep(m))
3091
SLIST_INSERT_HEAD(&free, m,
3092
plinks.s.ss);
3093
3094
/*
3095
* The new page must be deactivated
3096
* before the object is unlocked.
3097
*/
3098
vm_page_deactivate(m_new);
3099
} else {
3100
m->flags &= ~PG_ZERO;
3101
vm_page_dequeue(m);
3102
if (vm_page_free_prep(m))
3103
SLIST_INSERT_HEAD(&free, m,
3104
plinks.s.ss);
3105
KASSERT(m->dirty == 0,
3106
("page %p is dirty", m));
3107
}
3108
} else
3109
error = EBUSY;
3110
unlock:
3111
VM_OBJECT_WUNLOCK(object);
3112
} else {
3113
MPASS(vm_page_domain(m) == domain);
3114
vmd = VM_DOMAIN(domain);
3115
vm_domain_free_lock(vmd);
3116
order = m->order;
3117
if (order < VM_NFREEORDER) {
3118
/*
3119
* The page is enqueued in the physical memory
3120
* allocator's free page queues. Moreover, it
3121
* is the first page in a power-of-two-sized
3122
* run of contiguous free pages. Jump ahead
3123
* to the last page within that run, and
3124
* continue from there.
3125
*/
3126
m += (1 << order) - 1;
3127
}
3128
#if VM_NRESERVLEVEL > 0
3129
else if (vm_reserv_is_page_free(m))
3130
order = 0;
3131
#endif
3132
vm_domain_free_unlock(vmd);
3133
if (order == VM_NFREEORDER)
3134
error = EINVAL;
3135
}
3136
}
3137
if ((m = SLIST_FIRST(&free)) != NULL) {
3138
int cnt;
3139
3140
vmd = VM_DOMAIN(domain);
3141
cnt = 0;
3142
vm_domain_free_lock(vmd);
3143
do {
3144
MPASS(vm_page_domain(m) == domain);
3145
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3146
vm_phys_free_pages(m, m->pool, 0);
3147
cnt++;
3148
} while ((m = SLIST_FIRST(&free)) != NULL);
3149
vm_domain_free_unlock(vmd);
3150
vm_domain_freecnt_inc(vmd, cnt);
3151
}
3152
return (error);
3153
}
3154
3155
#define NRUNS 16
3156
3157
#define RUN_INDEX(count, nruns) ((count) % (nruns))
3158
3159
#define MIN_RECLAIM 8
3160
3161
/*
3162
* vm_page_reclaim_contig:
3163
*
3164
* Reclaim allocated, contiguous physical memory satisfying the specified
3165
* conditions by relocating the virtual pages using that physical memory.
3166
* Returns 0 if reclamation is successful, ERANGE if the specified domain
3167
* can't possibly satisfy the reclamation request, or ENOMEM if not
3168
* currently able to reclaim the requested number of pages. Since
3169
* relocation requires the allocation of physical pages, reclamation may
3170
* fail with ENOMEM due to a shortage of free pages. When reclamation
3171
* fails in this manner, callers are expected to perform vm_wait() before
3172
* retrying a failed allocation operation, e.g., vm_page_alloc_contig().
3173
*
3174
* The caller must always specify an allocation class through "req".
3175
*
3176
* allocation classes:
3177
* VM_ALLOC_NORMAL normal process request
3178
* VM_ALLOC_SYSTEM system *really* needs a page
3179
* VM_ALLOC_INTERRUPT interrupt time request
3180
*
3181
* The optional allocation flags are ignored.
3182
*
3183
* "npages" must be greater than zero. Both "alignment" and "boundary"
3184
* must be a power of two.
3185
*/
3186
int
3187
vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
3188
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
3189
int desired_runs)
3190
{
3191
struct vm_domain *vmd;
3192
vm_page_t bounds[2], m_run, _m_runs[NRUNS], *m_runs;
3193
u_long count, minalign, reclaimed;
3194
int error, i, min_reclaim, nruns, options, req_class;
3195
int segind, start_segind;
3196
int ret;
3197
3198
KASSERT(npages > 0, ("npages is 0"));
3199
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
3200
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
3201
3202
ret = ENOMEM;
3203
3204
/*
3205
* If the caller wants to reclaim multiple runs, try to allocate
3206
* space to store the runs. If that fails, fall back to the old
3207
* behavior of just reclaiming MIN_RECLAIM pages.
3208
*/
3209
if (desired_runs > 1)
3210
m_runs = malloc((NRUNS + desired_runs) * sizeof(*m_runs),
3211
M_TEMP, M_NOWAIT);
3212
else
3213
m_runs = NULL;
3214
3215
if (m_runs == NULL) {
3216
m_runs = _m_runs;
3217
nruns = NRUNS;
3218
} else {
3219
nruns = NRUNS + desired_runs - 1;
3220
}
3221
min_reclaim = MAX(desired_runs * npages, MIN_RECLAIM);
3222
3223
/*
3224
* The caller will attempt an allocation after some runs have been
3225
* reclaimed and added to the vm_phys buddy lists. Due to limitations
3226
* of vm_phys_alloc_contig(), round up the requested length to the next
3227
* power of two or maximum chunk size, and ensure that each run is
3228
* suitably aligned.
3229
*/
3230
minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1);
3231
npages = roundup2(npages, minalign);
3232
if (alignment < ptoa(minalign))
3233
alignment = ptoa(minalign);
3234
3235
/*
3236
* The page daemon is allowed to dig deeper into the free page list.
3237
*/
3238
req_class = req & VM_ALLOC_CLASS_MASK;
3239
if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
3240
req_class = VM_ALLOC_SYSTEM;
3241
3242
start_segind = vm_phys_lookup_segind(low);
3243
3244
/*
3245
* Return if the number of free pages cannot satisfy the requested
3246
* allocation.
3247
*/
3248
vmd = VM_DOMAIN(domain);
3249
count = vmd->vmd_free_count;
3250
if (count < npages + vmd->vmd_free_reserved || (count < npages +
3251
vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
3252
(count < npages && req_class == VM_ALLOC_INTERRUPT))
3253
goto done;
3254
3255
/*
3256
* Scan up to three times, relaxing the restrictions ("options") on
3257
* the reclamation of reservations and superpages each time.
3258
*/
3259
for (options = VPSC_NORESERV;;) {
3260
bool phys_range_exists = false;
3261
3262
/*
3263
* Find the highest runs that satisfy the given constraints
3264
* and restrictions, and record them in "m_runs".
3265
*/
3266
count = 0;
3267
segind = start_segind;
3268
while ((segind = vm_phys_find_range(bounds, segind, domain,
3269
npages, low, high)) != -1) {
3270
phys_range_exists = true;
3271
while ((m_run = vm_page_scan_contig(npages, bounds[0],
3272
bounds[1], alignment, boundary, options))) {
3273
bounds[0] = m_run + npages;
3274
m_runs[RUN_INDEX(count, nruns)] = m_run;
3275
count++;
3276
}
3277
segind++;
3278
}
3279
3280
if (!phys_range_exists) {
3281
ret = ERANGE;
3282
goto done;
3283
}
3284
3285
/*
3286
* Reclaim the highest runs in LIFO (descending) order until
3287
* the number of reclaimed pages, "reclaimed", is at least
3288
* "min_reclaim". Reset "reclaimed" each time because each
3289
* reclamation is idempotent, and runs will (likely) recur
3290
* from one scan to the next as restrictions are relaxed.
3291
*/
3292
reclaimed = 0;
3293
for (i = 0; count > 0 && i < nruns; i++) {
3294
count--;
3295
m_run = m_runs[RUN_INDEX(count, nruns)];
3296
error = vm_page_reclaim_run(req_class, domain, npages,
3297
m_run, high);
3298
if (error == 0) {
3299
reclaimed += npages;
3300
if (reclaimed >= min_reclaim) {
3301
ret = 0;
3302
goto done;
3303
}
3304
}
3305
}
3306
3307
/*
3308
* Either relax the restrictions on the next scan or return if
3309
* the last scan had no restrictions.
3310
*/
3311
if (options == VPSC_NORESERV)
3312
options = VPSC_NOSUPER;
3313
else if (options == VPSC_NOSUPER)
3314
options = VPSC_ANY;
3315
else if (options == VPSC_ANY) {
3316
if (reclaimed != 0)
3317
ret = 0;
3318
goto done;
3319
}
3320
}
3321
done:
3322
if (m_runs != _m_runs)
3323
free(m_runs, M_TEMP);
3324
return (ret);
3325
}
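/*
 * Illustrative sketch, not part of this file: a standalone model of the
 * bounded run buffer used by vm_page_reclaim_contig_domain_ext() above.  The
 * scan records every run it finds, but the modulo index wraps the store so
 * that only the last (highest) NRUNS_MODEL runs are kept, and they are then
 * processed in LIFO order.  record_run() and process_runs() are hypothetical.
 */
#if 0
#include <stddef.h>

#define NRUNS_MODEL     16

static void     *runs[NRUNS_MODEL];
static size_t    nrecorded;

static void
record_run(void *run)
{
        /* Overwrites the oldest entry once the buffer has wrapped. */
        runs[nrecorded % NRUNS_MODEL] = run;
        nrecorded++;
}

static void
process_runs(void (*reclaim)(void *))
{
        size_t count, i;

        /* Walk back from the most recently recorded (highest) run. */
        count = nrecorded;
        for (i = 0; count > 0 && i < NRUNS_MODEL; i++) {
                count--;
                reclaim(runs[count % NRUNS_MODEL]);
        }
}
#endif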
3326
3327
int
3328
vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
3329
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
3330
{
3331
return (vm_page_reclaim_contig_domain_ext(domain, req, npages, low,
3332
high, alignment, boundary, 1));
3333
}
3334
3335
int
3336
vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
3337
u_long alignment, vm_paddr_t boundary)
3338
{
3339
struct vm_domainset_iter di;
3340
int domain, ret, status;
3341
3342
ret = ERANGE;
3343
3344
if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
3345
return (ret);
3346
3347
do {
3348
status = vm_page_reclaim_contig_domain(domain, req, npages, low,
3349
high, alignment, boundary);
3350
if (status == 0)
3351
return (0);
3352
else if (status == ERANGE)
3353
vm_domainset_iter_ignore(&di, domain);
3354
else {
3355
KASSERT(status == ENOMEM, ("Unrecognized error %d "
3356
"from vm_page_reclaim_contig_domain()", status));
3357
ret = ENOMEM;
3358
}
3359
} while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0);
3360
3361
return (ret);
3362
}
3363
3364
/*
3365
* Set the domain in the appropriate page level domainset.
3366
*/
3367
void
3368
vm_domain_set(struct vm_domain *vmd)
3369
{
3370
3371
mtx_lock(&vm_domainset_lock);
3372
if (!vmd->vmd_minset && vm_paging_min(vmd)) {
3373
vmd->vmd_minset = 1;
3374
DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains);
3375
}
3376
if (!vmd->vmd_severeset && vm_paging_severe(vmd)) {
3377
vmd->vmd_severeset = 1;
3378
DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains);
3379
}
3380
mtx_unlock(&vm_domainset_lock);
3381
}
3382
3383
/*
3384
* Clear the domain from the appropriate page level domainset.
3385
*/
3386
void
3387
vm_domain_clear(struct vm_domain *vmd)
3388
{
3389
3390
mtx_lock(&vm_domainset_lock);
3391
if (vmd->vmd_minset && !vm_paging_min(vmd)) {
3392
vmd->vmd_minset = 0;
3393
DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains);
3394
if (vm_min_waiters != 0) {
3395
vm_min_waiters = 0;
3396
wakeup(&vm_min_domains);
3397
}
3398
}
3399
if (vmd->vmd_severeset && !vm_paging_severe(vmd)) {
3400
vmd->vmd_severeset = 0;
3401
DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains);
3402
if (vm_severe_waiters != 0) {
3403
vm_severe_waiters = 0;
3404
wakeup(&vm_severe_domains);
3405
}
3406
}
3407
3408
/*
3409
* If pageout daemon needs pages, then tell it that there are
3410
* some free.
3411
*/
3412
if (vmd->vmd_pageout_pages_needed &&
3413
vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
3414
wakeup(&vmd->vmd_pageout_pages_needed);
3415
vmd->vmd_pageout_pages_needed = 0;
3416
}
3417
3418
/* See comments in vm_wait_doms(). */
3419
if (vm_pageproc_waiters) {
3420
vm_pageproc_waiters = 0;
3421
wakeup(&vm_pageproc_waiters);
3422
}
3423
mtx_unlock(&vm_domainset_lock);
3424
}
3425
3426
/*
3427
* Wait for free pages to exceed the min threshold globally.
3428
*/
3429
void
3430
vm_wait_min(void)
3431
{
3432
3433
mtx_lock(&vm_domainset_lock);
3434
while (vm_page_count_min()) {
3435
vm_min_waiters++;
3436
msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0);
3437
}
3438
mtx_unlock(&vm_domainset_lock);
3439
}
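/*
 * Illustrative sketch, not part of this file: a userland pthread model of the
 * sleep pattern in vm_wait_min() above: take the lock, re-check the predicate
 * in a loop, and sleep on a wait channel until a producer clears the shortage
 * and wakes the waiters.  The names free_lock, free_cv, free_pages and
 * min_free are hypothetical.
 */
#if 0
#include <pthread.h>

static pthread_mutex_t  free_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t   free_cv = PTHREAD_COND_INITIALIZER;
static unsigned long    free_pages, min_free;

static void
wait_min(void)
{
        pthread_mutex_lock(&free_lock);
        while (free_pages < min_free)
                pthread_cond_wait(&free_cv, &free_lock);
        pthread_mutex_unlock(&free_lock);
}

static void
pages_freed(unsigned long n)
{
        pthread_mutex_lock(&free_lock);
        free_pages += n;
        if (free_pages >= min_free)
                pthread_cond_broadcast(&free_cv);
        pthread_mutex_unlock(&free_lock);
}
#endif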
3440
3441
/*
3442
* Wait for free pages to exceed the severe threshold globally.
3443
*/
3444
void
3445
vm_wait_severe(void)
3446
{
3447
3448
mtx_lock(&vm_domainset_lock);
3449
while (vm_page_count_severe()) {
3450
vm_severe_waiters++;
3451
msleep(&vm_severe_domains, &vm_domainset_lock, PVM,
3452
"vmwait", 0);
3453
}
3454
mtx_unlock(&vm_domainset_lock);
3455
}
3456
3457
u_int
3458
vm_wait_count(void)
3459
{
3460
3461
return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters);
3462
}
3463
3464
int
3465
vm_wait_doms(const domainset_t *wdoms, int mflags)
3466
{
3467
int error;
3468
3469
error = 0;
3470
3471
/*
* We use racy wakeup synchronization to avoid expensive global
* locking for the pageproc when sleeping with a non-specific vm_wait.
* To handle this, we only sleep for one tick in this instance. It
* is expected that most allocations for the pageproc will come from
* kmem or vm_page_grab* which will use the more specific and
* race-free vm_wait_domain().
*/
3479
if (curproc == pageproc) {
3480
mtx_lock(&vm_domainset_lock);
3481
vm_pageproc_waiters++;
3482
error = msleep(&vm_pageproc_waiters, &vm_domainset_lock,
3483
PVM | PDROP | mflags, "pageprocwait", 1);
3484
} else {
3485
/*
3486
* XXX Ideally we would wait only until the allocation could
3487
* be satisfied. This condition can cause new allocators to
3488
* consume all freed pages while old allocators wait.
3489
*/
3490
mtx_lock(&vm_domainset_lock);
3491
if (vm_page_count_min_set(wdoms)) {
3492
if (pageproc == NULL)
3493
panic("vm_wait in early boot");
3494
vm_min_waiters++;
3495
error = msleep(&vm_min_domains, &vm_domainset_lock,
3496
PVM | PDROP | mflags, "vmwait", 0);
3497
} else
3498
mtx_unlock(&vm_domainset_lock);
3499
}
3500
return (error);
3501
}
3502
3503
/*
3504
* vm_wait_domain:
3505
*
3506
* Sleep until free pages are available for allocation.
3507
* - Called in various places after failed memory allocations.
3508
*/
3509
void
3510
vm_wait_domain(int domain)
3511
{
3512
struct vm_domain *vmd;
3513
domainset_t wdom;
3514
3515
vmd = VM_DOMAIN(domain);
3516
vm_domain_free_assert_unlocked(vmd);
3517
3518
if (curproc == pageproc) {
3519
mtx_lock(&vm_domainset_lock);
3520
if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) {
3521
vmd->vmd_pageout_pages_needed = 1;
3522
msleep(&vmd->vmd_pageout_pages_needed,
3523
&vm_domainset_lock, PDROP | PSWP, "VMWait", 0);
3524
} else
3525
mtx_unlock(&vm_domainset_lock);
3526
} else {
3527
DOMAINSET_ZERO(&wdom);
3528
DOMAINSET_SET(vmd->vmd_domain, &wdom);
3529
vm_wait_doms(&wdom, 0);
3530
}
3531
}
3532
3533
static int
3534
vm_wait_flags(vm_object_t obj, int mflags)
3535
{
3536
struct domainset *d;
3537
3538
d = NULL;
3539
3540
/*
* Carefully fetch pointers only once: the struct domainset
* itself is immutable but the pointer might change.
*/
3544
if (obj != NULL)
3545
d = obj->domain.dr_policy;
3546
if (d == NULL)
3547
d = curthread->td_domain.dr_policy;
3548
3549
return (vm_wait_doms(&d->ds_mask, mflags));
3550
}
3551
3552
/*
3553
* vm_wait:
3554
*
3555
* Sleep until free pages are available for allocation in the
3556
* affinity domains of the obj. If obj is NULL, the domain set
3557
* for the calling thread is used.
3558
* Called in various places after failed memory allocations.
3559
*/
3560
void
3561
vm_wait(vm_object_t obj)
3562
{
3563
(void)vm_wait_flags(obj, 0);
3564
}
3565
3566
int
3567
vm_wait_intr(vm_object_t obj)
3568
{
3569
return (vm_wait_flags(obj, PCATCH));
3570
}
3571
3572
/*
* vm_domain_alloc_fail:
*
* Called when a page allocation function fails. Informs the
* pagedaemon and performs the requested wait. Requires the
* object lock on entry; the domain free lock must not be held.
* Returns with the object lock held. Returns an error when a
* retry is necessary.
*/
3582
static int
3583
vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
3584
{
3585
3586
vm_domain_free_assert_unlocked(vmd);
3587
3588
atomic_add_int(&vmd->vmd_pageout_deficit,
3589
max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
3590
if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
3591
if (object != NULL)
3592
VM_OBJECT_WUNLOCK(object);
3593
vm_wait_domain(vmd->vmd_domain);
3594
if (object != NULL)
3595
VM_OBJECT_WLOCK(object);
3596
if (req & VM_ALLOC_WAITOK)
3597
return (EAGAIN);
3598
}
3599
3600
return (0);
3601
}
3602
3603
/*
3604
* vm_waitpfault:
3605
*
3606
* Sleep until free pages are available for allocation.
3607
* - Called only in vm_fault so that processes page faulting
3608
* can be easily tracked.
3609
* - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
3610
* processes will be able to grab memory first. Do not change
3611
* this balance without careful testing first.
3612
*/
3613
void
3614
vm_waitpfault(struct domainset *dset, int timo)
3615
{
3616
3617
/*
3618
* XXX Ideally we would wait only until the allocation could
3619
* be satisfied. This condition can cause new allocators to
3620
* consume all freed pages while old allocators wait.
3621
*/
3622
mtx_lock(&vm_domainset_lock);
3623
if (vm_page_count_min_set(&dset->ds_mask)) {
3624
vm_min_waiters++;
3625
msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP,
3626
"pfault", timo);
3627
} else
3628
mtx_unlock(&vm_domainset_lock);
3629
}
3630
3631
static struct vm_pagequeue *
3632
_vm_page_pagequeue(vm_page_t m, uint8_t queue)
3633
{
3634
3635
return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]);
3636
}
3637
3638
#ifdef INVARIANTS
3639
static struct vm_pagequeue *
3640
vm_page_pagequeue(vm_page_t m)
3641
{
3642
3643
return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue));
3644
}
3645
#endif
3646
3647
static __always_inline bool
3648
vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old,
3649
vm_page_astate_t new)
3650
{
3651
vm_page_astate_t tmp;
3652
3653
tmp = *old;
3654
do {
3655
if (__predict_true(vm_page_astate_fcmpset(m, old, new)))
3656
return (true);
3657
counter_u64_add(pqstate_commit_retries, 1);
3658
} while (old->_bits == tmp._bits);
3659
3660
return (false);
3661
}
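/*
 * Illustrative sketch, not part of this file: a standalone stdatomic model of
 * vm_page_pqstate_fcmpset() above.  A weak compare-exchange may fail
 * spuriously, so the loop retries as long as the value it read back is still
 * the value the caller expected; only a genuine change is reported as a
 * failure, letting the caller recompute its update.  state_fcmpset() is
 * hypothetical.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool
state_fcmpset(_Atomic uint32_t *state, uint32_t *old, uint32_t new)
{
        uint32_t expected;

        expected = *old;
        do {
                if (atomic_compare_exchange_weak_explicit(state, old, new,
                    memory_order_acq_rel, memory_order_relaxed))
                        return (true);
        } while (*old == expected);     /* spurious failure: just retry */

        return (false);                 /* the state really changed */
}
#endif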
3662
3663
/*
3664
* Do the work of committing a queue state update that moves the page out of
3665
* its current queue.
3666
*/
3667
static bool
3668
_vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m,
3669
vm_page_astate_t *old, vm_page_astate_t new)
3670
{
3671
vm_page_t next;
3672
3673
vm_pagequeue_assert_locked(pq);
3674
KASSERT(vm_page_pagequeue(m) == pq,
3675
("%s: queue %p does not match page %p", __func__, pq, m));
3676
KASSERT(old->queue != PQ_NONE && new.queue != old->queue,
3677
("%s: invalid queue indices %d %d",
3678
__func__, old->queue, new.queue));
3679
3680
/*
3681
* Once the queue index of the page changes there is nothing
3682
* synchronizing with further updates to the page's physical
3683
* queue state. Therefore we must speculatively remove the page
3684
* from the queue now and be prepared to roll back if the queue
3685
* state update fails. If the page is not physically enqueued then
3686
* we just update its queue index.
3687
*/
3688
if ((old->flags & PGA_ENQUEUED) != 0) {
3689
new.flags &= ~PGA_ENQUEUED;
3690
next = TAILQ_NEXT(m, plinks.q);
3691
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
3692
vm_pagequeue_cnt_dec(pq);
3693
if (!vm_page_pqstate_fcmpset(m, old, new)) {
3694
if (next == NULL)
3695
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
3696
else
3697
TAILQ_INSERT_BEFORE(next, m, plinks.q);
3698
vm_pagequeue_cnt_inc(pq);
3699
return (false);
3700
} else {
3701
return (true);
3702
}
3703
} else {
3704
return (vm_page_pqstate_fcmpset(m, old, new));
3705
}
3706
}
3707
3708
static bool
3709
vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old,
3710
vm_page_astate_t new)
3711
{
3712
struct vm_pagequeue *pq;
3713
vm_page_astate_t as;
3714
bool ret;
3715
3716
pq = _vm_page_pagequeue(m, old->queue);
3717
3718
/*
3719
* The queue field and PGA_ENQUEUED flag are stable only so long as the
3720
* corresponding page queue lock is held.
3721
*/
3722
vm_pagequeue_lock(pq);
3723
as = vm_page_astate_load(m);
3724
if (__predict_false(as._bits != old->_bits)) {
3725
*old = as;
3726
ret = false;
3727
} else {
3728
ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new);
3729
}
3730
vm_pagequeue_unlock(pq);
3731
return (ret);
3732
}
3733
3734
/*
3735
* Commit a queue state update that enqueues or requeues a page.
3736
*/
3737
static bool
3738
_vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m,
3739
vm_page_astate_t *old, vm_page_astate_t new)
3740
{
3741
struct vm_domain *vmd;
3742
3743
vm_pagequeue_assert_locked(pq);
3744
KASSERT(old->queue != PQ_NONE && new.queue == old->queue,
3745
("%s: invalid queue indices %d %d",
3746
__func__, old->queue, new.queue));
3747
3748
new.flags |= PGA_ENQUEUED;
3749
if (!vm_page_pqstate_fcmpset(m, old, new))
3750
return (false);
3751
3752
if ((old->flags & PGA_ENQUEUED) != 0)
3753
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
3754
else
3755
vm_pagequeue_cnt_inc(pq);
3756
3757
/*
3758
* Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In particular, if
3759
* both flags are set in close succession, only PGA_REQUEUE_HEAD will be
3760
* applied, even if it was set first.
3761
*/
3762
if ((old->flags & PGA_REQUEUE_HEAD) != 0) {
3763
vmd = vm_pagequeue_domain(m);
3764
KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE],
3765
("%s: invalid page queue for page %p", __func__, m));
3766
TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
3767
} else {
3768
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
3769
}
3770
return (true);
3771
}
3772
3773
/*
3774
* Commit a queue state update that encodes a request for a deferred queue
3775
* operation.
3776
*/
3777
static bool
3778
vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old,
3779
vm_page_astate_t new)
3780
{
3781
3782
KASSERT(old->queue == new.queue || new.queue != PQ_NONE,
3783
("%s: invalid state, queue %d flags %x",
3784
__func__, new.queue, new.flags));
3785
3786
if (old->_bits != new._bits &&
3787
!vm_page_pqstate_fcmpset(m, old, new))
3788
return (false);
3789
vm_page_pqbatch_submit(m, new.queue);
3790
return (true);
3791
}
3792
3793
/*
3794
* A generic queue state update function. This handles more cases than the
3795
* specialized functions above.
3796
*/
3797
bool
3798
vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new)
3799
{
3800
3801
if (old->_bits == new._bits)
3802
return (true);
3803
3804
if (old->queue != PQ_NONE && new.queue != old->queue) {
3805
if (!vm_page_pqstate_commit_dequeue(m, old, new))
3806
return (false);
3807
if (new.queue != PQ_NONE)
3808
vm_page_pqbatch_submit(m, new.queue);
3809
} else {
3810
if (!vm_page_pqstate_fcmpset(m, old, new))
3811
return (false);
3812
if (new.queue != PQ_NONE &&
3813
((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0)
3814
vm_page_pqbatch_submit(m, new.queue);
3815
}
3816
return (true);
3817
}
3818
3819
/*
3820
* Apply deferred queue state updates to a page.
3821
*/
3822
static inline void
3823
vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue)
3824
{
3825
vm_page_astate_t new, old;
3826
3827
CRITICAL_ASSERT(curthread);
3828
vm_pagequeue_assert_locked(pq);
3829
KASSERT(queue < PQ_COUNT,
3830
("%s: invalid queue index %d", __func__, queue));
3831
KASSERT(pq == _vm_page_pagequeue(m, queue),
3832
("%s: page %p does not belong to queue %p", __func__, m, pq));
3833
3834
for (old = vm_page_astate_load(m);;) {
3835
if (__predict_false(old.queue != queue ||
3836
(old.flags & PGA_QUEUE_OP_MASK) == 0)) {
3837
counter_u64_add(queue_nops, 1);
3838
break;
3839
}
3840
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3841
("%s: page %p is unmanaged", __func__, m));
3842
3843
new = old;
3844
if ((old.flags & PGA_DEQUEUE) != 0) {
3845
new.flags &= ~PGA_QUEUE_OP_MASK;
3846
new.queue = PQ_NONE;
3847
if (__predict_true(_vm_page_pqstate_commit_dequeue(pq,
3848
m, &old, new))) {
3849
counter_u64_add(queue_ops, 1);
3850
break;
3851
}
3852
} else {
3853
new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD);
3854
if (__predict_true(_vm_page_pqstate_commit_requeue(pq,
3855
m, &old, new))) {
3856
counter_u64_add(queue_ops, 1);
3857
break;
3858
}
3859
}
3860
}
3861
}
3862
3863
static void
3864
vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
3865
uint8_t queue)
3866
{
3867
int i;
3868
3869
for (i = 0; i < bq->bq_cnt; i++)
3870
vm_pqbatch_process_page(pq, bq->bq_pa[i], queue);
3871
vm_batchqueue_init(bq);
3872
}
3873
3874
/*
3875
* vm_page_pqbatch_submit: [ internal use only ]
3876
*
3877
* Enqueue a page in the specified page queue's batched work queue.
3878
* The caller must have encoded the requested operation in the page
3879
* structure's a.flags field.
3880
*/
3881
void
3882
vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
3883
{
3884
struct vm_batchqueue *bq;
3885
struct vm_pagequeue *pq;
3886
int domain, slots_remaining;
3887
3888
KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
3889
3890
domain = vm_page_domain(m);
3891
critical_enter();
3892
bq = DPCPU_PTR(pqbatch[domain][queue]);
3893
slots_remaining = vm_batchqueue_insert(bq, m);
3894
if (slots_remaining > (VM_BATCHQUEUE_SIZE >> 1)) {
3895
/* keep building the bq */
3896
critical_exit();
3897
return;
3898
} else if (slots_remaining > 0) {
3899
/* Try to process the bq if we can get the lock */
3900
pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
3901
if (vm_pagequeue_trylock(pq)) {
3902
vm_pqbatch_process(pq, bq, queue);
3903
vm_pagequeue_unlock(pq);
3904
}
3905
critical_exit();
3906
return;
3907
}
3908
critical_exit();
3909
3910
/* if we make it here, the bq is full so wait for the lock */
3911
3912
pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
3913
vm_pagequeue_lock(pq);
3914
critical_enter();
3915
bq = DPCPU_PTR(pqbatch[domain][queue]);
3916
vm_pqbatch_process(pq, bq, queue);
3917
vm_pqbatch_process_page(pq, m, queue);
3918
vm_pagequeue_unlock(pq);
3919
critical_exit();
3920
}
3921
3922
/*
3923
* vm_page_pqbatch_drain: [ internal use only ]
3924
*
3925
* Force all per-CPU page queue batch queues to be drained. This is
3926
* intended for use in severe memory shortages, to ensure that pages
3927
* do not remain stuck in the batch queues.
3928
*/
3929
void
3930
vm_page_pqbatch_drain(void)
3931
{
3932
struct thread *td;
3933
struct vm_domain *vmd;
3934
struct vm_pagequeue *pq;
3935
int cpu, domain, queue;
3936
3937
td = curthread;
3938
CPU_FOREACH(cpu) {
3939
thread_lock(td);
3940
sched_bind(td, cpu);
3941
thread_unlock(td);
3942
3943
for (domain = 0; domain < vm_ndomains; domain++) {
3944
vmd = VM_DOMAIN(domain);
3945
for (queue = 0; queue < PQ_COUNT; queue++) {
3946
pq = &vmd->vmd_pagequeues[queue];
3947
vm_pagequeue_lock(pq);
3948
critical_enter();
3949
vm_pqbatch_process(pq,
3950
DPCPU_PTR(pqbatch[domain][queue]), queue);
3951
critical_exit();
3952
vm_pagequeue_unlock(pq);
3953
}
3954
}
3955
}
3956
thread_lock(td);
3957
sched_unbind(td);
3958
thread_unlock(td);
3959
}
3960
3961
/*
3962
* vm_page_dequeue_deferred: [ internal use only ]
3963
*
3964
* Request removal of the given page from its current page
3965
* queue. Physical removal from the queue may be deferred
3966
* indefinitely.
3967
*/
3968
void
3969
vm_page_dequeue_deferred(vm_page_t m)
3970
{
3971
vm_page_astate_t new, old;
3972
3973
old = vm_page_astate_load(m);
3974
do {
3975
if (old.queue == PQ_NONE) {
3976
KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
3977
("%s: page %p has unexpected queue state",
3978
__func__, m));
3979
break;
3980
}
3981
new = old;
3982
new.flags |= PGA_DEQUEUE;
3983
} while (!vm_page_pqstate_commit_request(m, &old, new));
3984
}
3985
3986
/*
3987
* vm_page_dequeue:
3988
*
3989
* Remove the page from whichever page queue it's in, if any, before
3990
* returning.
3991
*/
3992
void
3993
vm_page_dequeue(vm_page_t m)
3994
{
3995
vm_page_astate_t new, old;
3996
3997
old = vm_page_astate_load(m);
3998
do {
3999
if (__predict_true(old.queue == PQ_NONE)) {
4000
KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
4001
("%s: page %p has unexpected queue state",
4002
__func__, m));
4003
break;
4004
}
4005
new = old;
4006
new.flags &= ~PGA_QUEUE_OP_MASK;
4007
new.queue = PQ_NONE;
4008
} while (!vm_page_pqstate_commit_dequeue(m, &old, new));
4009
4010
}
4011
4012
/*
4013
* Schedule the given page for insertion into the specified page queue.
4014
* Physical insertion of the page may be deferred indefinitely.
4015
*/
4016
static void
4017
vm_page_enqueue(vm_page_t m, uint8_t queue)
4018
{
4019
4020
KASSERT(m->a.queue == PQ_NONE &&
4021
(m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
4022
("%s: page %p is already enqueued", __func__, m));
4023
KASSERT(m->ref_count > 0,
4024
("%s: page %p does not carry any references", __func__, m));
4025
4026
m->a.queue = queue;
4027
if ((m->a.flags & PGA_REQUEUE) == 0)
4028
vm_page_aflag_set(m, PGA_REQUEUE);
4029
vm_page_pqbatch_submit(m, queue);
4030
}
4031
4032
/*
4033
* vm_page_free_prep:
4034
*
4035
* Prepares the given page to be put on the free list,
4036
* disassociating it from any VM object. The caller may return
4037
* the page to the free list only if this function returns true.
4038
*
4039
* The object, if it exists, must be locked, and the page must then be
* exclusively busied (xbusy). Otherwise the page must not be busied. A
* managed page must be unmapped.
4042
*/
4043
static bool
4044
vm_page_free_prep(vm_page_t m)
4045
{
4046
4047
/*
4048
* Synchronize with threads that have dropped a reference to this
4049
* page.
4050
*/
4051
atomic_thread_fence_acq();
4052
4053
#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
4054
if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) {
4055
uint64_t *p;
4056
int i;
4057
p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4058
for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
4059
KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
4060
m, i, (uintmax_t)*p));
4061
}
4062
#endif
4063
if ((m->oflags & VPO_UNMANAGED) == 0) {
4064
KASSERT(!pmap_page_is_mapped(m),
4065
("vm_page_free_prep: freeing mapped page %p", m));
4066
KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0,
4067
("vm_page_free_prep: mapping flags set in page %p", m));
4068
} else {
4069
KASSERT(m->a.queue == PQ_NONE,
4070
("vm_page_free_prep: unmanaged page %p is queued", m));
4071
}
4072
VM_CNT_INC(v_tfree);
4073
4074
if (m->object != NULL) {
4075
vm_page_radix_remove(m);
4076
vm_page_free_object_prep(m);
4077
} else
4078
vm_page_assert_unbusied(m);
4079
4080
vm_page_busy_free(m);
4081
4082
/*
4083
* If the page is fictitious, remove the object association and
* return.
4085
*/
4086
if ((m->flags & PG_FICTITIOUS) != 0) {
4087
KASSERT(m->ref_count == 1,
4088
("fictitious page %p is referenced", m));
4089
KASSERT(m->a.queue == PQ_NONE,
4090
("fictitious page %p is queued", m));
4091
return (false);
4092
}
4093
4094
/*
4095
* Pages need not be dequeued before they are returned to the physical
4096
* memory allocator, but they must at least be marked for a deferred
4097
* dequeue.
4098
*/
4099
if ((m->oflags & VPO_UNMANAGED) == 0)
4100
vm_page_dequeue_deferred(m);
4101
4102
m->valid = 0;
4103
vm_page_undirty(m);
4104
4105
if (m->ref_count != 0)
4106
panic("vm_page_free_prep: page %p has references", m);
4107
4108
/*
4109
* Restore the default memory attribute to the page.
4110
*/
4111
if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
4112
pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
4113
4114
#if VM_NRESERVLEVEL > 0
4115
/*
4116
* Determine whether the page belongs to a reservation. If the page was
4117
* allocated from a per-CPU cache, it cannot belong to a reservation, so
4118
* as an optimization, we avoid the check in that case.
4119
*/
4120
if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m))
4121
return (false);
4122
#endif
4123
4124
return (true);
4125
}
4126
4127
/*
4128
* vm_page_free_toq:
4129
*
4130
* Returns the given page to the free list, disassociating it
4131
* from any VM object.
4132
*
4133
* The object must be locked. The page must be exclusively busied if it
4134
* belongs to an object.
4135
*/
4136
static void
4137
vm_page_free_toq(vm_page_t m)
4138
{
4139
struct vm_domain *vmd;
4140
uma_zone_t zone;
4141
4142
if (!vm_page_free_prep(m))
4143
return;
4144
4145
vmd = vm_pagequeue_domain(m);
4146
if (__predict_false((m->flags & PG_NOFREE) != 0)) {
4147
vm_page_free_nofree(vmd, m);
4148
return;
4149
}
4150
zone = vmd->vmd_pgcache[m->pool].zone;
4151
if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) {
4152
uma_zfree(zone, m);
4153
return;
4154
}
4155
vm_domain_free_lock(vmd);
4156
vm_phys_free_pages(m, m->pool, 0);
4157
vm_domain_free_unlock(vmd);
4158
vm_domain_freecnt_inc(vmd, 1);
4159
}
4160
4161
/*
4162
* vm_page_free_pages_toq:
4163
*
4164
* Returns a list of pages to the free list, disassociating each page
* from any VM object. In other words, this is equivalent to
* calling vm_page_free_toq() for each page of the list.
4167
*/
4168
int
4169
vm_page_free_pages_toq(struct spglist *free, bool update_wire_count)
4170
{
4171
vm_page_t m;
4172
int count;
4173
4174
if (SLIST_EMPTY(free))
4175
return (0);
4176
4177
count = 0;
4178
while ((m = SLIST_FIRST(free)) != NULL) {
4179
count++;
4180
SLIST_REMOVE_HEAD(free, plinks.s.ss);
4181
vm_page_free_toq(m);
4182
}
4183
4184
if (update_wire_count)
4185
vm_wire_sub(count);
4186
return (count);
4187
}
4188
4189
/*
4190
* Mark this page as wired down. For managed pages, this prevents reclamation
4191
* by the page daemon, or when the containing object, if any, is destroyed.
4192
*/
4193
void
4194
vm_page_wire(vm_page_t m)
4195
{
4196
u_int old;
4197
4198
#ifdef INVARIANTS
4199
if (m->object != NULL && !vm_page_busied(m) &&
4200
!vm_object_busied(m->object))
4201
VM_OBJECT_ASSERT_LOCKED(m->object);
4202
#endif
4203
KASSERT((m->flags & PG_FICTITIOUS) == 0 ||
4204
VPRC_WIRE_COUNT(m->ref_count) >= 1,
4205
("vm_page_wire: fictitious page %p has zero wirings", m));
4206
4207
old = atomic_fetchadd_int(&m->ref_count, 1);
4208
KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX,
4209
("vm_page_wire: counter overflow for page %p", m));
4210
if (VPRC_WIRE_COUNT(old) == 0) {
4211
if ((m->oflags & VPO_UNMANAGED) == 0)
4212
vm_page_aflag_set(m, PGA_DEQUEUE);
4213
vm_wire_add(1);
4214
}
4215
}
4216
4217
/*
4218
* Attempt to wire a mapped page following a pmap lookup of that page.
4219
* This may fail if a thread is concurrently tearing down mappings of the page.
4220
* The transient failure is acceptable because it translates to the
* failure of the caller, pmap_extract_and_hold(), which should then be
* followed by the vm_fault() fallback; see e.g. vm_fault_quick_hold_pages().
4223
*/
4224
bool
4225
vm_page_wire_mapped(vm_page_t m)
4226
{
4227
u_int old;
4228
4229
old = atomic_load_int(&m->ref_count);
4230
do {
4231
KASSERT(old > 0,
4232
("vm_page_wire_mapped: wiring unreferenced page %p", m));
4233
if ((old & VPRC_BLOCKED) != 0)
4234
return (false);
4235
} while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1));
4236
4237
if (VPRC_WIRE_COUNT(old) == 0) {
4238
if ((m->oflags & VPO_UNMANAGED) == 0)
4239
vm_page_aflag_set(m, PGA_DEQUEUE);
4240
vm_wire_add(1);
4241
}
4242
return (true);
4243
}
4244
4245
/*
4246
* Release a wiring reference to a managed page. If the page still belongs to
4247
* an object, update its position in the page queues to reflect the reference.
4248
* If the wiring was the last reference to the page, free the page.
4249
*/
4250
static void
4251
vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse)
4252
{
4253
u_int old;
4254
4255
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4256
("%s: page %p is unmanaged", __func__, m));
4257
4258
/*
4259
* Update LRU state before releasing the wiring reference.
4260
* Use a release store when updating the reference count to
4261
* synchronize with vm_page_free_prep().
4262
*/
4263
old = atomic_load_int(&m->ref_count);
4264
do {
4265
u_int count;
4266
4267
KASSERT(VPRC_WIRE_COUNT(old) > 0,
4268
("vm_page_unwire: wire count underflow for page %p", m));
4269
4270
count = old & ~VPRC_BLOCKED;
4271
if (count > VPRC_OBJREF + 1) {
4272
/*
4273
* The page has at least one other wiring reference. An
4274
* earlier iteration of this loop may have called
4275
* vm_page_release_toq() and cleared PGA_DEQUEUE, so
4276
* re-set it if necessary.
4277
*/
4278
if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0)
4279
vm_page_aflag_set(m, PGA_DEQUEUE);
4280
} else if (count == VPRC_OBJREF + 1) {
4281
/*
4282
* This is the last wiring. Clear PGA_DEQUEUE and
4283
* update the page's queue state to reflect the
4284
* reference. If the page does not belong to an object
4285
* (i.e., the VPRC_OBJREF bit is clear), we only need to
4286
* clear leftover queue state.
4287
*/
4288
vm_page_release_toq(m, nqueue, noreuse);
4289
} else if (count == 1) {
4290
vm_page_aflag_clear(m, PGA_DEQUEUE);
4291
}
4292
} while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1));
4293
4294
if (VPRC_WIRE_COUNT(old) == 1) {
4295
vm_wire_sub(1);
4296
if (old == 1)
4297
vm_page_free(m);
4298
}
4299
}
4300
4301
/*
4302
* Release one wiring of the specified page, potentially allowing it to be
4303
* paged out.
4304
*
4305
* Only managed pages belonging to an object can be paged out. If the number
4306
* of wirings transitions to zero and the page is eligible for page out, then
4307
* the page is added to the specified paging queue. If the released wiring
4308
* represented the last reference to the page, the page is freed.
4309
*/
4310
void
4311
vm_page_unwire(vm_page_t m, uint8_t nqueue)
4312
{
4313
4314
KASSERT(nqueue < PQ_COUNT,
4315
("vm_page_unwire: invalid queue %u request for page %p",
4316
nqueue, m));
4317
4318
if ((m->oflags & VPO_UNMANAGED) != 0) {
4319
if (vm_page_unwire_noq(m) && m->ref_count == 0)
4320
vm_page_free(m);
4321
return;
4322
}
4323
vm_page_unwire_managed(m, nqueue, false);
4324
}
4325
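/*
 * Illustrative sketch (added for this edit, not part of the original source):
 * a common pairing of vm_page_wire() and vm_page_unwire().  The wiring keeps
 * the page from being reclaimed while it is used outside the object lock; the
 * final unwire either re-enqueues the page (here into PQ_ACTIVE) or frees it
 * if the wiring was the last reference.  The caller is assumed to hold the
 * object lock or have the page busied when wiring; the function name is
 * hypothetical.
 */
#if 0
static void
example_wire_for_io(vm_page_t m)
{
	vm_page_wire(m);		/* Pin the page across the operation. */
	/* ... access the page without holding the object lock ... */
	vm_page_unwire(m, PQ_ACTIVE);	/* Re-enqueue or free on last unwire. */
}
#endif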
4326
/*
4327
* Unwire a page without (re-)inserting it into a page queue. It is up
4328
* to the caller to enqueue, requeue, or free the page as appropriate.
4329
* In most cases involving managed pages, vm_page_unwire() should be used
4330
* instead.
4331
*/
4332
bool
4333
vm_page_unwire_noq(vm_page_t m)
4334
{
4335
u_int old;
4336
4337
old = vm_page_drop(m, 1);
4338
KASSERT(VPRC_WIRE_COUNT(old) != 0,
4339
("%s: counter underflow for page %p", __func__, m));
4340
KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1,
4341
("%s: missing ref on fictitious page %p", __func__, m));
4342
4343
if (VPRC_WIRE_COUNT(old) > 1)
4344
return (false);
4345
if ((m->oflags & VPO_UNMANAGED) == 0)
4346
vm_page_aflag_clear(m, PGA_DEQUEUE);
4347
vm_wire_sub(1);
4348
return (true);
4349
}
4350
4351
/*
4352
* Ensure that the page ends up in the specified page queue. If the page is
4353
* active or being moved to the active queue, ensure that its act_count is
4354
* at least ACT_INIT but do not otherwise mess with it.
4355
*/
4356
static __always_inline void
4357
vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag)
4358
{
4359
vm_page_astate_t old, new;
4360
4361
KASSERT(m->ref_count > 0,
4362
("%s: page %p does not carry any references", __func__, m));
4363
KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD,
4364
("%s: invalid flags %x", __func__, nflag));
4365
4366
if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
4367
return;
4368
4369
old = vm_page_astate_load(m);
4370
do {
4371
if ((old.flags & PGA_DEQUEUE) != 0)
4372
break;
4373
new = old;
4374
new.flags &= ~PGA_QUEUE_OP_MASK;
4375
if (nqueue == PQ_ACTIVE)
4376
new.act_count = max(old.act_count, ACT_INIT);
4377
if (old.queue == nqueue) {
4378
/*
4379
* There is no need to requeue pages already in the
4380
* active queue.
4381
*/
4382
if (nqueue != PQ_ACTIVE ||
4383
(old.flags & PGA_ENQUEUED) == 0)
4384
new.flags |= nflag;
4385
} else {
4386
new.flags |= nflag;
4387
new.queue = nqueue;
4388
}
4389
} while (!vm_page_pqstate_commit(m, &old, new));
4390
}
4391
4392
/*
4393
* Put the specified page on the active list (if appropriate).
4394
*/
4395
void
4396
vm_page_activate(vm_page_t m)
4397
{
4398
4399
vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE);
4400
}
4401
4402
/*
4403
* Move the specified page to the tail of the inactive queue, or requeue
4404
* the page if it is already in the inactive queue.
4405
*/
4406
void
4407
vm_page_deactivate(vm_page_t m)
4408
{
4409
4410
vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE);
4411
}
4412
4413
void
4414
vm_page_deactivate_noreuse(vm_page_t m)
4415
{
4416
4417
vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD);
4418
}
4419
4420
/*
4421
* Put a page in the laundry, or requeue it if it is already there.
4422
*/
4423
void
4424
vm_page_launder(vm_page_t m)
4425
{
4426
4427
vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE);
4428
}
4429
4430
/*
4431
* Put a page in the PQ_UNSWAPPABLE holding queue.
4432
*/
4433
void
4434
vm_page_unswappable(vm_page_t m)
4435
{
4436
4437
VM_OBJECT_ASSERT_LOCKED(m->object);
4438
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4439
("page %p already unswappable", m));
4440
4441
vm_page_dequeue(m);
4442
vm_page_enqueue(m, PQ_UNSWAPPABLE);
4443
}
4444
4445
/*
4446
* Release a page back to the page queues in preparation for unwiring.
4447
*/
4448
static void
4449
vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse)
4450
{
4451
vm_page_astate_t old, new;
4452
uint16_t nflag;
4453
4454
/*
4455
* Use a check of the valid bits to determine whether we should
4456
* accelerate reclamation of the page. The object lock might not be
4457
* held here, in which case the check is racy. At worst we will either
4458
* accelerate reclamation of a valid page and violate LRU, or
4459
* unnecessarily defer reclamation of an invalid page.
4460
*
4461
* If we were asked not to cache the page, place it near the head of the
* inactive queue so that it is reclaimed sooner.
4463
*/
4464
if (noreuse || vm_page_none_valid(m)) {
4465
nqueue = PQ_INACTIVE;
4466
nflag = PGA_REQUEUE_HEAD;
4467
} else {
4468
nflag = PGA_REQUEUE;
4469
}
4470
4471
old = vm_page_astate_load(m);
4472
do {
4473
new = old;
4474
4475
/*
4476
* If the page is already in the active queue and we are not
4477
* trying to accelerate reclamation, simply mark it as
4478
* referenced and avoid any queue operations.
4479
*/
4480
new.flags &= ~PGA_QUEUE_OP_MASK;
4481
if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE &&
4482
(old.flags & PGA_ENQUEUED) != 0)
4483
new.flags |= PGA_REFERENCED;
4484
else {
4485
new.flags |= nflag;
4486
new.queue = nqueue;
4487
}
4488
} while (!vm_page_pqstate_commit(m, &old, new));
4489
}
4490
4491
/*
4492
* Unwire a page and either attempt to free it or re-add it to the page queues.
4493
*/
4494
void
4495
vm_page_release(vm_page_t m, int flags)
4496
{
4497
vm_object_t object;
4498
4499
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4500
("vm_page_release: page %p is unmanaged", m));
4501
4502
if ((flags & VPR_TRYFREE) != 0) {
4503
for (;;) {
4504
object = atomic_load_ptr(&m->object);
4505
if (object == NULL)
4506
break;
4507
/* Depends on type-stability. */
4508
if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object))
4509
break;
4510
if (object == m->object) {
4511
vm_page_release_locked(m, flags);
4512
VM_OBJECT_WUNLOCK(object);
4513
return;
4514
}
4515
VM_OBJECT_WUNLOCK(object);
4516
}
4517
}
4518
vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0);
4519
}
4520
4521
/* See vm_page_release(). */
4522
void
4523
vm_page_release_locked(vm_page_t m, int flags)
4524
{
4525
4526
VM_OBJECT_ASSERT_WLOCKED(m->object);
4527
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4528
("vm_page_release_locked: page %p is unmanaged", m));
4529
4530
if (vm_page_unwire_noq(m)) {
4531
if ((flags & VPR_TRYFREE) != 0 &&
4532
(m->object->ref_count == 0 || !pmap_page_is_mapped(m)) &&
4533
m->dirty == 0 && vm_page_tryxbusy(m)) {
4534
/*
4535
* An unlocked lookup may have wired the page before the
4536
* busy lock was acquired, in which case the page must
4537
* not be freed.
4538
*/
4539
if (__predict_true(!vm_page_wired(m))) {
4540
vm_page_free(m);
4541
return;
4542
}
4543
vm_page_xunbusy(m);
4544
} else {
4545
vm_page_release_toq(m, PQ_INACTIVE, flags != 0);
4546
}
4547
}
4548
}
4549
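/*
 * Illustrative sketch (added for this edit, not part of the original source):
 * dropping the wiring taken by an earlier lookup.  VPR_TRYFREE asks
 * vm_page_release() to opportunistically free a clean, unmapped page instead
 * of merely moving it to the inactive queue.  The function name is
 * hypothetical.
 */
#if 0
static void
example_release(vm_page_t m)
{
	/* The caller previously wired m, e.g. via vm_page_wire_mapped(). */
	vm_page_release(m, VPR_TRYFREE);
}
#endif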
4550
static bool
4551
vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t))
4552
{
4553
u_int old;
4554
4555
KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0,
4556
("vm_page_try_blocked_op: page %p has no object", m));
4557
KASSERT(vm_page_busied(m),
4558
("vm_page_try_blocked_op: page %p is not busy", m));
4559
VM_OBJECT_ASSERT_LOCKED(m->object);
4560
4561
old = atomic_load_int(&m->ref_count);
4562
do {
4563
KASSERT(old != 0,
4564
("vm_page_try_blocked_op: page %p has no references", m));
4565
KASSERT((old & VPRC_BLOCKED) == 0,
4566
("vm_page_try_blocked_op: page %p blocks wirings", m));
4567
if (VPRC_WIRE_COUNT(old) != 0)
4568
return (false);
4569
} while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED));
4570
4571
(op)(m);
4572
4573
/*
4574
* If the object is read-locked, new wirings may be created via an
4575
* object lookup.
4576
*/
4577
old = vm_page_drop(m, VPRC_BLOCKED);
4578
KASSERT(!VM_OBJECT_WOWNED(m->object) ||
4579
old == (VPRC_BLOCKED | VPRC_OBJREF),
4580
("vm_page_try_blocked_op: unexpected refcount value %u for %p",
4581
old, m));
4582
return (true);
4583
}
4584
4585
/*
4586
* Atomically check for wirings and remove all mappings of the page.
4587
*/
4588
bool
4589
vm_page_try_remove_all(vm_page_t m)
4590
{
4591
4592
return (vm_page_try_blocked_op(m, pmap_remove_all));
4593
}
4594
4595
/*
4596
* Atomically check for wirings and remove all writeable mappings of the page.
4597
*/
4598
bool
4599
vm_page_try_remove_write(vm_page_t m)
4600
{
4601
4602
return (vm_page_try_blocked_op(m, pmap_remove_write));
4603
}
4604
4605
/*
4606
* vm_page_advise
4607
*
4608
* Apply the specified advice to the given page.
4609
*/
4610
void
4611
vm_page_advise(vm_page_t m, int advice)
4612
{
4613
4614
VM_OBJECT_ASSERT_WLOCKED(m->object);
4615
vm_page_assert_xbusied(m);
4616
4617
if (advice == MADV_FREE)
4618
/*
4619
* Mark the page clean. This will allow the page to be freed
4620
* without first paging it out. MADV_FREE pages are often
4621
* quickly reused by malloc(3), so we do not do anything that
4622
* would result in a page fault on a later access.
4623
*/
4624
vm_page_undirty(m);
4625
else if (advice != MADV_DONTNEED) {
4626
if (advice == MADV_WILLNEED)
4627
vm_page_activate(m);
4628
return;
4629
}
4630
4631
if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
4632
vm_page_dirty(m);
4633
4634
/*
4635
* Clear any references to the page. Otherwise, the page daemon will
4636
* immediately reactivate the page.
4637
*/
4638
vm_page_aflag_clear(m, PGA_REFERENCED);
4639
4640
/*
4641
* Place clean pages near the head of the inactive queue rather than
4642
* the tail, thus defeating the queue's LRU operation and ensuring that
4643
* the page will be reused quickly. Dirty pages not already in the
4644
* laundry are moved there.
4645
*/
4646
if (m->dirty == 0)
4647
vm_page_deactivate_noreuse(m);
4648
else if (!vm_page_in_laundry(m))
4649
vm_page_launder(m);
4650
}
4651
4652
/*
4653
* vm_page_grab_release
4654
*
4655
* Helper routine for grab functions to release busy on return.
4656
*/
4657
static inline void
4658
vm_page_grab_release(vm_page_t m, int allocflags)
4659
{
4660
4661
if ((allocflags & VM_ALLOC_NOBUSY) != 0) {
4662
if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0)
4663
vm_page_sunbusy(m);
4664
else
4665
vm_page_xunbusy(m);
4666
}
4667
}
4668
4669
/*
4670
* vm_page_grab_sleep
4671
*
4672
* Sleep for busy according to VM_ALLOC_ parameters. Returns true
4673
* if the caller should retry and false otherwise.
4674
*
4675
* If the object is locked on entry, it will be unlocked when false is
* returned; when true is returned it is still locked, though the lock
* may have been dropped and reacquired.
4678
*/
4679
static bool
4680
vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex,
4681
const char *wmesg, int allocflags, bool locked)
4682
{
4683
4684
if ((allocflags & VM_ALLOC_NOWAIT) != 0)
4685
return (false);
4686
4687
/*
4688
* Reference the page before unlocking and sleeping so that
4689
* the page daemon is less likely to reclaim it.
4690
*/
4691
if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0)
4692
vm_page_reference(m);
4693
4694
if (_vm_page_busy_sleep(object, m, pindex, wmesg, allocflags, locked) &&
4695
locked)
4696
VM_OBJECT_WLOCK(object);
4697
if ((allocflags & VM_ALLOC_WAITFAIL) != 0)
4698
return (false);
4699
4700
return (true);
4701
}
4702
4703
/*
4704
* Assert that the grab flags are valid.
4705
*/
4706
static inline void
4707
vm_page_grab_check(int allocflags)
4708
{
4709
4710
KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
4711
(allocflags & VM_ALLOC_WIRED) != 0,
4712
("vm_page_grab*: the pages must be busied or wired"));
4713
4714
KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
4715
(allocflags & VM_ALLOC_IGN_SBUSY) != 0,
4716
("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
4717
}
4718
4719
/*
4720
* Calculate the page allocation flags for grab.
4721
*/
4722
static inline int
4723
vm_page_grab_pflags(int allocflags)
4724
{
4725
int pflags;
4726
4727
pflags = allocflags &
4728
~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL |
4729
VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT);
4730
if ((allocflags & VM_ALLOC_NOWAIT) == 0)
4731
pflags |= VM_ALLOC_WAITFAIL;
4732
if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0)
4733
pflags |= VM_ALLOC_SBUSY;
4734
4735
return (pflags);
4736
}
4737
4738
/*
4739
* Grab a page, waiting until we are woken up due to the page changing state.
4740
* We keep on waiting as long as the page continues to be in the object,
* unless allocflags forbid waiting.
4742
*
4743
* The object must be locked on entry. This routine may sleep. The lock will,
4744
* however, be released and reacquired if the routine sleeps.
4745
*
4746
* Return a grabbed page, or NULL. Set *found if a page was found, whether or
4747
* not it was grabbed.
4748
*/
4749
static inline vm_page_t
4750
vm_page_grab_lookup(vm_object_t object, vm_pindex_t pindex, int allocflags,
4751
bool *found, struct pctrie_iter *pages)
4752
{
4753
vm_page_t m;
4754
4755
while ((*found = (m = vm_radix_iter_lookup(pages, pindex)) != NULL) &&
4756
!vm_page_tryacquire(m, allocflags)) {
4757
if (!vm_page_grab_sleep(object, m, pindex, "pgrbwt",
4758
allocflags, true))
4759
return (NULL);
4760
pctrie_iter_reset(pages);
4761
}
4762
return (m);
4763
}
4764
4765
/*
4766
* Grab a page. Use an iterator parameter. Keep on waiting, as long as the page
4767
* exists in the object. If the page doesn't exist, first allocate it and then
4768
* conditionally zero it.
4769
*
4770
* The object must be locked on entry. This routine may sleep. The lock will,
4771
* however, be released and reacquired if the routine sleeps.
4772
*/
4773
vm_page_t
4774
vm_page_grab_iter(vm_object_t object, vm_pindex_t pindex, int allocflags,
4775
struct pctrie_iter *pages)
4776
{
4777
vm_page_t m;
4778
bool found;
4779
4780
VM_OBJECT_ASSERT_WLOCKED(object);
4781
vm_page_grab_check(allocflags);
4782
4783
while ((m = vm_page_grab_lookup(
4784
object, pindex, allocflags, &found, pages)) == NULL) {
4785
if ((allocflags & VM_ALLOC_NOCREAT) != 0)
4786
return (NULL);
4787
if (found &&
4788
(allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0)
4789
return (NULL);
4790
m = vm_page_alloc_iter(object, pindex,
4791
vm_page_grab_pflags(allocflags), pages);
4792
if (m != NULL) {
4793
if ((allocflags & VM_ALLOC_ZERO) != 0 &&
4794
(m->flags & PG_ZERO) == 0)
4795
pmap_zero_page(m);
4796
break;
4797
}
4798
if ((allocflags &
4799
(VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0)
4800
return (NULL);
4801
}
4802
vm_page_grab_release(m, allocflags);
4803
4804
return (m);
4805
}
4806
4807
/*
4808
* Grab a page. Keep on waiting, as long as the page exists in the object. If
4809
* the page doesn't exist, first allocate it and then conditionally zero it.
4810
*
4811
* The object must be locked on entry. This routine may sleep. The lock will,
4812
* however, be released and reacquired if the routine sleeps.
4813
*/
4814
vm_page_t
4815
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
4816
{
4817
struct pctrie_iter pages;
4818
4819
VM_OBJECT_ASSERT_WLOCKED(object);
4820
vm_page_iter_init(&pages, object);
4821
return (vm_page_grab_iter(object, pindex, allocflags, &pages));
4822
}
4823
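/*
 * Illustrative sketch (added for this edit, not part of the original source):
 * a minimal vm_page_grab() caller.  The returned page is exclusively busied,
 * and VM_ALLOC_ZERO ensures a newly allocated page is zeroed; the caller
 * drops the busy lock when done.  The function name is hypothetical.
 */
#if 0
static vm_page_t
example_grab(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_WLOCK(object);
	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_ZERO);
	VM_OBJECT_WUNLOCK(object);
	if (m != NULL) {
		/* ... access the page while it is exclusively busied ... */
		vm_page_xunbusy(m);
	}
	return (m);
}
#endif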
4824
/*
4825
* Attempt to validate a page, locklessly acquiring it if necessary, given an
* (object, pindex) tuple and either an invalidated page or NULL. The resulting
* page will be validated against the identity tuple, and busied or wired as
* requested. A NULL return guarantees that the page was not present in the
* radix tree at the time of the call, but callers must perform higher-level
* synchronization or retry the operation under a lock if they require an
* atomic answer. This is the only lock-free validation routine; other
* routines can depend on the resulting page state.
4833
*
4834
* The return value PAGE_NOT_ACQUIRED indicates that the operation failed due to
4835
* caller flags.
4836
*/
4837
#define PAGE_NOT_ACQUIRED ((vm_page_t)1)
4838
static vm_page_t
4839
vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex, vm_page_t m,
4840
int allocflags)
4841
{
4842
if (m == NULL)
4843
m = vm_page_lookup_unlocked(object, pindex);
4844
for (; m != NULL; m = vm_page_lookup_unlocked(object, pindex)) {
4845
if (vm_page_trybusy(m, allocflags)) {
4846
if (m->object == object && m->pindex == pindex) {
4847
if ((allocflags & VM_ALLOC_WIRED) != 0)
4848
vm_page_wire(m);
4849
vm_page_grab_release(m, allocflags);
4850
break;
4851
}
4852
/* relookup. */
4853
vm_page_busy_release(m);
4854
cpu_spinwait();
4855
continue;
4856
}
4857
if (!vm_page_grab_sleep(object, m, pindex, "pgnslp",
4858
allocflags, false))
4859
return (PAGE_NOT_ACQUIRED);
4860
}
4861
return (m);
4862
}
4863
4864
/*
4865
* Try to locklessly grab a page and fall back to the object lock if NOCREAT
4866
* is not set.
4867
*/
4868
vm_page_t
4869
vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags)
4870
{
4871
vm_page_t m;
4872
4873
vm_page_grab_check(allocflags);
4874
m = vm_page_acquire_unlocked(object, pindex, NULL, allocflags);
4875
if (m == PAGE_NOT_ACQUIRED)
4876
return (NULL);
4877
if (m != NULL)
4878
return (m);
4879
4880
/*
4881
* The radix lockless lookup should never return false negatives. If
* the user specifies NOCREAT, they are guaranteed that there was no
* page present at the instant of the call. A NOCREAT caller
4884
* must handle create races gracefully.
4885
*/
4886
if ((allocflags & VM_ALLOC_NOCREAT) != 0)
4887
return (NULL);
4888
4889
VM_OBJECT_WLOCK(object);
4890
m = vm_page_grab(object, pindex, allocflags);
4891
VM_OBJECT_WUNLOCK(object);
4892
4893
return (m);
4894
}
4895
4896
/*
4897
* Grab a page and make it valid, paging in if necessary. Use an iterator
4898
* parameter. Pages missing from their pager are zero filled and validated. If
4899
* a VM_ALLOC_COUNT is supplied and the page is not valid, as many as
4900
* VM_INITIAL_PAGEIN pages can be brought in simultaneously. Additional pages
4901
* will be left on a paging queue but will neither be wired nor busy regardless
4902
* of allocflags.
4903
*/
4904
int
4905
vm_page_grab_valid_iter(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex,
4906
int allocflags, struct pctrie_iter *pages)
4907
{
4908
vm_page_t m;
4909
vm_page_t ma[VM_INITIAL_PAGEIN];
4910
int after, ahead, i, pflags, rv;
4911
4912
KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
4913
(allocflags & VM_ALLOC_IGN_SBUSY) != 0,
4914
("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
4915
KASSERT((allocflags &
4916
(VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
4917
("vm_page_grab_valid: Invalid flags 0x%X", allocflags));
4918
VM_OBJECT_ASSERT_WLOCKED(object);
4919
pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY |
4920
VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY);
4921
pflags |= VM_ALLOC_WAITFAIL;
4922
4923
retrylookup:
4924
if ((m = vm_radix_iter_lookup(pages, pindex)) != NULL) {
4925
/*
4926
* If the page is fully valid it can only become invalid
4927
* with the object lock held. If it is not valid it can
4928
* become valid with the busy lock held. Therefore, we
4929
* may unnecessarily lock the exclusive busy here if we
4930
* race with I/O completion not using the object lock.
4931
* However, we will not end up with an invalid page and a
4932
* shared lock.
4933
*/
4934
if (!vm_page_trybusy(m,
4935
vm_page_all_valid(m) ? allocflags : 0)) {
4936
(void)vm_page_grab_sleep(object, m, pindex, "pgrbwt",
4937
allocflags, true);
4938
pctrie_iter_reset(pages);
4939
goto retrylookup;
4940
}
4941
if (vm_page_all_valid(m))
4942
goto out;
4943
if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
4944
vm_page_busy_release(m);
4945
*mp = NULL;
4946
return (VM_PAGER_FAIL);
4947
}
4948
} else if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
4949
*mp = NULL;
4950
return (VM_PAGER_FAIL);
4951
} else {
4952
m = vm_page_alloc_iter(object, pindex, pflags, pages);
4953
if (m == NULL) {
4954
if (!vm_pager_can_alloc_page(object, pindex)) {
4955
*mp = NULL;
4956
return (VM_PAGER_AGAIN);
4957
}
4958
goto retrylookup;
4959
}
4960
}
4961
4962
vm_page_assert_xbusied(m);
4963
if (vm_pager_has_page(object, pindex, NULL, &after)) {
4964
after = MIN(after, VM_INITIAL_PAGEIN);
4965
after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT);
4966
after = MAX(after, 1);
4967
ma[0] = m;
4968
pctrie_iter_reset(pages);
4969
for (i = 1; i < after; i++) {
4970
m = vm_radix_iter_lookup_ge(pages, pindex + i);
4971
ahead = after;
4972
if (m != NULL)
4973
ahead = MIN(ahead, m->pindex - pindex);
4974
for (; i < ahead; i++) {
4975
ma[i] = vm_page_alloc_iter(object, pindex + i,
4976
VM_ALLOC_NORMAL, pages);
4977
if (ma[i] == NULL)
4978
break;
4979
}
4980
if (m == NULL || m->pindex != pindex + i ||
4981
vm_page_any_valid(m) || !vm_page_tryxbusy(m))
4982
break;
4983
ma[i] = m;
4984
}
4985
after = i;
4986
vm_object_pip_add(object, after);
4987
VM_OBJECT_WUNLOCK(object);
4988
rv = vm_pager_get_pages(object, ma, after, NULL, NULL);
4989
pctrie_iter_reset(pages);
4990
VM_OBJECT_WLOCK(object);
4991
vm_object_pip_wakeupn(object, after);
4992
/* Pager may have replaced a page. */
4993
m = ma[0];
4994
if (rv != VM_PAGER_OK) {
4995
for (i = 0; i < after; i++) {
4996
if (!vm_page_wired(ma[i]))
4997
vm_page_free(ma[i]);
4998
else
4999
vm_page_xunbusy(ma[i]);
5000
}
5001
*mp = NULL;
5002
return (rv);
5003
}
5004
for (i = 1; i < after; i++)
5005
vm_page_readahead_finish(ma[i]);
5006
MPASS(vm_page_all_valid(m));
5007
} else {
5008
vm_page_zero_invalid(m, TRUE);
5009
pctrie_iter_reset(pages);
5010
}
5011
out:
5012
if ((allocflags & VM_ALLOC_WIRED) != 0)
5013
vm_page_wire(m);
5014
if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m))
5015
vm_page_busy_downgrade(m);
5016
else if ((allocflags & VM_ALLOC_NOBUSY) != 0)
5017
vm_page_busy_release(m);
5018
*mp = m;
5019
return (VM_PAGER_OK);
5020
}
5021
5022
/*
5023
* Grab a page and make it valid, paging in if necessary. Pages missing from
5024
* their pager are zero filled and validated. If a VM_ALLOC_COUNT is supplied
5025
* and the page is not valid, as many as VM_INITIAL_PAGEIN pages can be brought
5026
* in simultaneously. Additional pages will be left on a paging queue but
5027
* will neither be wired nor busy regardless of allocflags.
5028
*/
5029
int
5030
vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex,
5031
int allocflags)
5032
{
5033
struct pctrie_iter pages;
5034
5035
VM_OBJECT_ASSERT_WLOCKED(object);
5036
vm_page_iter_init(&pages, object);
5037
return (vm_page_grab_valid_iter(mp, object, pindex, allocflags,
5038
&pages));
5039
}
5040
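/*
 * Illustrative sketch (added for this edit, not part of the original source):
 * paging in a single page with vm_page_grab_valid().  With the flags below a
 * VM_PAGER_OK return yields a fully valid page that is wired but not busied.
 * The function name is hypothetical.
 */
#if 0
static int
example_grab_valid(vm_object_t object, vm_pindex_t pindex, vm_page_t *mp)
{
	int rv;

	VM_OBJECT_WLOCK(object);
	rv = vm_page_grab_valid(mp, object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY);
	VM_OBJECT_WUNLOCK(object);
	return (rv == VM_PAGER_OK ? 0 : EIO);
}
#endif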
5041
/*
5042
* Grab a page. Keep on waiting, as long as the page exists in the object. If
5043
* the page doesn't exist, and the pager has it, allocate it and zero part of
5044
* it.
5045
*
5046
* The object must be locked on entry. This routine may sleep. The lock will,
5047
* however, be released and reacquired if the routine sleeps.
5048
*/
5049
int
5050
vm_page_grab_zero_partial(vm_object_t object, vm_pindex_t pindex, int base,
5051
int end)
5052
{
5053
struct pctrie_iter pages;
5054
vm_page_t m;
5055
int allocflags, rv;
5056
bool found;
5057
5058
VM_OBJECT_ASSERT_WLOCKED(object);
5059
KASSERT(base >= 0, ("%s: base %d", __func__, base));
5060
KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base,
5061
end));
5062
5063
allocflags = VM_ALLOC_NOCREAT | VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL;
5064
vm_page_iter_init(&pages, object);
5065
while ((m = vm_page_grab_lookup(
5066
object, pindex, allocflags, &found, &pages)) == NULL) {
5067
if (!vm_pager_has_page(object, pindex, NULL, NULL))
5068
return (0);
5069
m = vm_page_alloc_iter(object, pindex,
5070
vm_page_grab_pflags(allocflags), &pages);
5071
if (m != NULL) {
5072
vm_object_pip_add(object, 1);
5073
VM_OBJECT_WUNLOCK(object);
5074
rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
5075
VM_OBJECT_WLOCK(object);
5076
vm_object_pip_wakeup(object);
5077
if (rv != VM_PAGER_OK) {
5078
vm_page_free(m);
5079
return (EIO);
5080
}
5081
5082
/*
5083
* Since the page was not resident, and therefore not
5084
* recently accessed, immediately enqueue it for
5085
* asynchronous laundering. The current operation is
5086
* not regarded as an access.
5087
*/
5088
vm_page_launder(m);
5089
break;
5090
}
5091
}
5092
5093
pmap_zero_page_area(m, base, end - base);
5094
KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid", __func__, m));
5095
vm_page_set_dirty(m);
5096
vm_page_xunbusy(m);
5097
return (0);
5098
}
5099
5100
/*
5101
* Locklessly grab a valid page. If the page is not valid or not yet
5102
* allocated this will fall back to the object lock method.
5103
*/
5104
int
5105
vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object,
5106
vm_pindex_t pindex, int allocflags)
5107
{
5108
vm_page_t m;
5109
int flags;
5110
int error;
5111
5112
KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
5113
(allocflags & VM_ALLOC_IGN_SBUSY) != 0,
5114
("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY "
5115
"mismatch"));
5116
KASSERT((allocflags &
5117
(VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
5118
("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags));
5119
5120
/*
5121
* Attempt a lockless lookup and busy. We need at least an sbusy
5122
* before we can inspect the valid field and return a wired page.
5123
*/
5124
flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
5125
vm_page_grab_check(flags);
5126
m = vm_page_acquire_unlocked(object, pindex, NULL, flags);
5127
if (m == PAGE_NOT_ACQUIRED)
5128
return (VM_PAGER_FAIL);
5129
if (m != NULL) {
5130
if (vm_page_all_valid(m)) {
5131
if ((allocflags & VM_ALLOC_WIRED) != 0)
5132
vm_page_wire(m);
5133
vm_page_grab_release(m, allocflags);
5134
*mp = m;
5135
return (VM_PAGER_OK);
5136
}
5137
vm_page_busy_release(m);
5138
}
5139
if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
5140
*mp = NULL;
5141
return (VM_PAGER_FAIL);
5142
}
5143
VM_OBJECT_WLOCK(object);
5144
error = vm_page_grab_valid(mp, object, pindex, allocflags);
5145
VM_OBJECT_WUNLOCK(object);
5146
5147
return (error);
5148
}
5149
5150
/*
5151
* Return the specified range of pages from the given object. For each
5152
* page offset within the range, if a page already exists within the object
5153
* at that offset and it is busy, then wait for it to change state. If,
5154
* instead, the page doesn't exist, then allocate it.
5155
*
5156
* The caller must always specify an allocation class.
5157
*
5158
* allocation classes:
5159
* VM_ALLOC_NORMAL normal process request
5160
* VM_ALLOC_SYSTEM system *really* needs the pages
5161
* VM_ALLOC_INTERRUPT interrupt time request
5162
*
5163
* The caller must always specify that the pages are to be busied and/or
5164
* wired.
5165
*
5166
* optional allocation flags:
5167
* VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages
5168
* VM_ALLOC_NOBUSY do not exclusive busy the pages
5169
* VM_ALLOC_NODUMP do not include the pages in a kernel core dump
5170
* VM_ALLOC_NOFREE pages will never be freed
5171
* VM_ALLOC_NOWAIT do not sleep
5172
* VM_ALLOC_SBUSY set pages to sbusy state
5173
* VM_ALLOC_WAITFAIL in case of failure, sleep before returning
5174
* VM_ALLOC_WAITOK ignored (default behavior)
5175
* VM_ALLOC_WIRED wire the pages
5176
* VM_ALLOC_ZERO zero and validate any invalid pages
5177
*
5178
* If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it
5179
* may return a partial prefix of the requested range.
5180
*/
5181
int
5182
vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
5183
vm_page_t *ma, int count)
5184
{
5185
struct pctrie_iter pages;
5186
vm_page_t m;
5187
int pflags;
5188
int ahead, i;
5189
5190
VM_OBJECT_ASSERT_WLOCKED(object);
5191
KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
5192
("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
5193
KASSERT(count > 0,
5194
("vm_page_grab_pages: invalid page count %d", count));
5195
vm_page_grab_check(allocflags);
5196
5197
pflags = vm_page_grab_pflags(allocflags);
5198
i = 0;
5199
vm_page_iter_init(&pages, object);
5200
retrylookup:
5201
ahead = -1;
5202
for (; i < count; i++) {
5203
if (ahead < 0) {
5204
ahead = vm_radix_iter_lookup_range(
5205
&pages, pindex + i, &ma[i], count - i);
5206
}
5207
if (ahead-- > 0) {
5208
m = ma[i];
5209
if (!vm_page_tryacquire(m, allocflags)) {
5210
if (vm_page_grab_sleep(object, m, pindex + i,
5211
"grbmaw", allocflags, true)) {
5212
pctrie_iter_reset(&pages);
5213
goto retrylookup;
5214
}
5215
break;
5216
}
5217
} else {
5218
if ((allocflags & VM_ALLOC_NOCREAT) != 0)
5219
break;
5220
m = vm_page_alloc_iter(object, pindex + i,
5221
pflags | VM_ALLOC_COUNT(count - i), &pages);
5222
/* pages was reset if alloc_iter lost the lock. */
5223
if (m == NULL) {
5224
if ((allocflags & (VM_ALLOC_NOWAIT |
5225
VM_ALLOC_WAITFAIL)) != 0)
5226
break;
5227
goto retrylookup;
5228
}
5229
ma[i] = m;
5230
}
5231
if (vm_page_none_valid(m) &&
5232
(allocflags & VM_ALLOC_ZERO) != 0) {
5233
if ((m->flags & PG_ZERO) == 0)
5234
pmap_zero_page(m);
5235
vm_page_valid(m);
5236
}
5237
vm_page_grab_release(m, allocflags);
5238
}
5239
return (i);
5240
}
5241
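/*
 * Illustrative sketch (added for this edit, not part of the original source):
 * grabbing a short run of pages wired but not busied, then releasing the
 * wirings.  With VM_ALLOC_NOWAIT the routine may return fewer pages than
 * requested.  The function name and the run length are hypothetical.
 */
#if 0
static void
example_grab_run(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t ma[4];
	int i, n;

	VM_OBJECT_WLOCK(object);
	n = vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL |
	    VM_ALLOC_WIRED | VM_ALLOC_NOBUSY | VM_ALLOC_NOWAIT, ma, nitems(ma));
	VM_OBJECT_WUNLOCK(object);
	for (i = 0; i < n; i++) {
		/* ... use ma[i]; it is wired but not busied ... */
		vm_page_unwire(ma[i], PQ_ACTIVE);
	}
}
#endif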
5242
/*
5243
* Unlocked variant of vm_page_grab_pages(). This accepts the same flags
5244
* and will fall back to the locked variant to handle allocation.
5245
*/
5246
int
5247
vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex,
5248
int allocflags, vm_page_t *ma, int count)
5249
{
5250
vm_page_t m;
5251
int flags;
5252
int i, num_fetched;
5253
5254
KASSERT(count > 0,
5255
("vm_page_grab_pages_unlocked: invalid page count %d", count));
5256
vm_page_grab_check(allocflags);
5257
5258
/*
5259
* Modify flags for lockless acquire to hold the page until we
5260
* set it valid if necessary.
5261
*/
5262
flags = allocflags & ~VM_ALLOC_NOBUSY;
5263
vm_page_grab_check(flags);
5264
num_fetched = vm_radix_lookup_range_unlocked(&object->rtree, pindex,
5265
ma, count);
5266
for (i = 0; i < num_fetched; i++, pindex++) {
5267
m = vm_page_acquire_unlocked(object, pindex, ma[i], flags);
5268
if (m == PAGE_NOT_ACQUIRED)
5269
return (i);
5270
if (m == NULL)
5271
break;
5272
if ((flags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) {
5273
if ((m->flags & PG_ZERO) == 0)
5274
pmap_zero_page(m);
5275
vm_page_valid(m);
5276
}
5277
/* m will still be wired or busy according to flags. */
5278
vm_page_grab_release(m, allocflags);
5279
/* vm_page_acquire_unlocked() may not return ma[i]. */
5280
ma[i] = m;
5281
}
5282
if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0)
5283
return (i);
5284
count -= i;
5285
VM_OBJECT_WLOCK(object);
5286
i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count);
5287
VM_OBJECT_WUNLOCK(object);
5288
5289
return (i);
5290
}
5291
5292
/*
5293
* Mapping function for valid or dirty bits in a page.
5294
*
5295
* Inputs are required to range within a page.
5296
*/
5297
vm_page_bits_t
5298
vm_page_bits(int base, int size)
5299
{
5300
int first_bit;
5301
int last_bit;
5302
5303
KASSERT(
5304
base + size <= PAGE_SIZE,
5305
("vm_page_bits: illegal base/size %d/%d", base, size)
5306
);
5307
5308
if (size == 0) /* handle degenerate case */
5309
return (0);
5310
5311
first_bit = base >> DEV_BSHIFT;
5312
last_bit = (base + size - 1) >> DEV_BSHIFT;
5313
5314
return (((vm_page_bits_t)2 << last_bit) -
5315
((vm_page_bits_t)1 << first_bit));
5316
}
5317
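/*
 * Worked example (added for illustration): with DEV_BSIZE == 512
 * (DEV_BSHIFT == 9), vm_page_bits(1024, 1536) spans blocks 2 through 4:
 * first_bit = 1024 >> 9 = 2, last_bit = (1024 + 1536 - 1) >> 9 = 4, so the
 * result is (2 << 4) - (1 << 2) = 32 - 4 = 0x1c, i.e., bits 2, 3 and 4 set.
 */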
5318
void
5319
vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set)
5320
{
5321
5322
#if PAGE_SIZE == 32768
5323
atomic_set_64((uint64_t *)bits, set);
5324
#elif PAGE_SIZE == 16384
5325
atomic_set_32((uint32_t *)bits, set);
5326
#elif (PAGE_SIZE == 8192) && defined(atomic_set_16)
5327
atomic_set_16((uint16_t *)bits, set);
5328
#elif (PAGE_SIZE == 4096) && defined(atomic_set_8)
5329
atomic_set_8((uint8_t *)bits, set);
5330
#else /* PAGE_SIZE <= 8192 */
5331
uintptr_t addr;
5332
int shift;
5333
5334
addr = (uintptr_t)bits;
5335
/*
5336
* Use a trick to perform a 32-bit atomic on the
5337
* containing aligned word, to not depend on the existence
5338
* of atomic_{set, clear}_{8, 16}.
5339
*/
5340
shift = addr & (sizeof(uint32_t) - 1);
5341
#if BYTE_ORDER == BIG_ENDIAN
5342
shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5343
#else
5344
shift *= NBBY;
5345
#endif
5346
addr &= ~(sizeof(uint32_t) - 1);
5347
atomic_set_32((uint32_t *)addr, set << shift);
5348
#endif /* PAGE_SIZE */
5349
}
5350
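/*
 * Worked example (added for illustration) of the sub-word atomic fallback
 * above: assume a 16-bit vm_page_bits_t at byte offset 2 within its aligned
 * 32-bit word.  On a little-endian machine shift = 2 * NBBY = 16, so
 * "set << shift" targets the upper half of the word; on a big-endian machine
 * shift = (4 - 2 - 2) * NBBY = 0, which selects the same two bytes.  The
 * containing aligned word is then updated with atomic_set_32().
 */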
5351
static inline void
5352
vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear)
5353
{
5354
5355
#if PAGE_SIZE == 32768
5356
atomic_clear_64((uint64_t *)bits, clear);
5357
#elif PAGE_SIZE == 16384
5358
atomic_clear_32((uint32_t *)bits, clear);
5359
#elif (PAGE_SIZE == 8192) && defined(atomic_clear_16)
5360
atomic_clear_16((uint16_t *)bits, clear);
5361
#elif (PAGE_SIZE == 4096) && defined(atomic_clear_8)
5362
atomic_clear_8((uint8_t *)bits, clear);
5363
#else /* PAGE_SIZE <= 8192 */
5364
uintptr_t addr;
5365
int shift;
5366
5367
addr = (uintptr_t)bits;
5368
/*
5369
* Use a trick to perform a 32-bit atomic on the
5370
* containing aligned word, to not depend on the existence
5371
* of atomic_{set, clear}_{8, 16}.
5372
*/
5373
shift = addr & (sizeof(uint32_t) - 1);
5374
#if BYTE_ORDER == BIG_ENDIAN
5375
shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5376
#else
5377
shift *= NBBY;
5378
#endif
5379
addr &= ~(sizeof(uint32_t) - 1);
5380
atomic_clear_32((uint32_t *)addr, clear << shift);
5381
#endif /* PAGE_SIZE */
5382
}
5383
5384
static inline vm_page_bits_t
5385
vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits)
5386
{
5387
#if PAGE_SIZE == 32768
5388
uint64_t old;
5389
5390
old = *bits;
5391
while (atomic_fcmpset_64(bits, &old, newbits) == 0);
5392
return (old);
5393
#elif PAGE_SIZE == 16384
5394
uint32_t old;
5395
5396
old = *bits;
5397
while (atomic_fcmpset_32(bits, &old, newbits) == 0);
5398
return (old);
5399
#elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16)
5400
uint16_t old;
5401
5402
old = *bits;
5403
while (atomic_fcmpset_16(bits, &old, newbits) == 0);
5404
return (old);
5405
#elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8)
5406
uint8_t old;
5407
5408
old = *bits;
5409
while (atomic_fcmpset_8(bits, &old, newbits) == 0);
5410
return (old);
5411
#else /* PAGE_SIZE <= 4096*/
5412
uintptr_t addr;
5413
uint32_t old, new, mask;
5414
int shift;
5415
5416
addr = (uintptr_t)bits;
5417
/*
5418
* Use a trick to perform a 32-bit atomic on the
5419
* containing aligned word, to not depend on the existence
5420
* of atomic_{set, swap, clear}_{8, 16}.
5421
*/
5422
shift = addr & (sizeof(uint32_t) - 1);
5423
#if BYTE_ORDER == BIG_ENDIAN
5424
shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5425
#else
5426
shift *= NBBY;
5427
#endif
5428
addr &= ~(sizeof(uint32_t) - 1);
5429
mask = VM_PAGE_BITS_ALL << shift;
5430
5431
old = *bits;
5432
do {
5433
new = old & ~mask;
5434
new |= newbits << shift;
5435
} while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0);
5436
return (old >> shift);
5437
#endif /* PAGE_SIZE */
5438
}
5439
5440
/*
5441
* vm_page_set_valid_range:
5442
*
5443
* Sets portions of a page valid. The arguments are expected
5444
* to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
5445
* of any partial chunks touched by the range. The invalid portion of
5446
* such chunks will be zeroed.
5447
*
5448
* (base + size) must be less than or equal to PAGE_SIZE.
5449
*/
5450
void
5451
vm_page_set_valid_range(vm_page_t m, int base, int size)
5452
{
5453
int endoff, frag;
5454
vm_page_bits_t pagebits;
5455
5456
vm_page_assert_busied(m);
5457
if (size == 0) /* handle degenerate case */
5458
return;
5459
5460
/*
5461
* If the base is not DEV_BSIZE aligned and the valid
5462
* bit is clear, we have to zero out a portion of the
5463
* first block.
5464
*/
5465
if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
5466
(m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
5467
pmap_zero_page_area(m, frag, base - frag);
5468
5469
/*
5470
* If the ending offset is not DEV_BSIZE aligned and the
5471
* valid bit is clear, we have to zero out a portion of
5472
* the last block.
5473
*/
5474
endoff = base + size;
5475
if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
5476
(m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
5477
pmap_zero_page_area(m, endoff,
5478
DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
5479
5480
/*
5481
* Assert that no previously invalid block that is now being validated
5482
* is already dirty.
5483
*/
5484
KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
5485
("vm_page_set_valid_range: page %p is dirty", m));
5486
5487
/*
5488
* Set valid bits inclusive of any overlap.
5489
*/
5490
pagebits = vm_page_bits(base, size);
5491
if (vm_page_xbusied(m))
5492
m->valid |= pagebits;
5493
else
5494
vm_page_bits_set(m, &m->valid, pagebits);
5495
}
5496
5497
/*
5498
* Set the page dirty bits and free the invalid swap space if
5499
* present. Returns the previous dirty bits.
5500
*/
5501
vm_page_bits_t
5502
vm_page_set_dirty(vm_page_t m)
5503
{
5504
vm_page_bits_t old;
5505
5506
VM_PAGE_OBJECT_BUSY_ASSERT(m);
5507
5508
if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) {
5509
old = m->dirty;
5510
m->dirty = VM_PAGE_BITS_ALL;
5511
} else
5512
old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL);
5513
if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0)
5514
vm_pager_page_unswapped(m);
5515
5516
return (old);
5517
}
5518
5519
/*
5520
* Clear the given bits from the specified page's dirty field.
5521
*/
5522
static __inline void
5523
vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
5524
{
5525
5526
vm_page_assert_busied(m);
5527
5528
/*
5529
* If the page is xbusied and not write mapped, we are the
* only thread that can modify dirty bits. Otherwise, the pmap
* layer can call vm_page_dirty() without holding a distinguished
* lock. The combination of page busy and atomic operations
* suffices to guarantee consistency of the page dirty field.
5534
*/
5535
if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
5536
m->dirty &= ~pagebits;
5537
else
5538
vm_page_bits_clear(m, &m->dirty, pagebits);
5539
}
5540
5541
/*
5542
* vm_page_set_validclean:
5543
*
5544
* Sets portions of a page valid and clean. The arguments are expected
5545
* to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
5546
* of any partial chunks touched by the range. The invalid portion of
5547
* such chunks will be zero'd.
5548
*
5549
* (base + size) must be less than or equal to PAGE_SIZE.
5550
*/
5551
void
5552
vm_page_set_validclean(vm_page_t m, int base, int size)
5553
{
5554
vm_page_bits_t oldvalid, pagebits;
5555
int endoff, frag;
5556
5557
vm_page_assert_busied(m);
5558
if (size == 0) /* handle degenerate case */
5559
return;
5560
5561
/*
5562
* If the base is not DEV_BSIZE aligned and the valid
5563
* bit is clear, we have to zero out a portion of the
5564
* first block.
5565
*/
5566
if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
5567
(m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
5568
pmap_zero_page_area(m, frag, base - frag);
5569
5570
/*
5571
* If the ending offset is not DEV_BSIZE aligned and the
5572
* valid bit is clear, we have to zero out a portion of
5573
* the last block.
5574
*/
5575
endoff = base + size;
5576
if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
5577
(m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
5578
pmap_zero_page_area(m, endoff,
5579
DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
5580
5581
/*
5582
* Set valid, clear dirty bits. If validating the entire
5583
* page we can safely clear the pmap modify bit. We also
5584
* use this opportunity to clear the PGA_NOSYNC flag. If a process
5585
* takes a write fault on a MAP_NOSYNC memory area the flag will
5586
* be set again.
5587
*
5588
* We set valid bits inclusive of any overlap, but we can only
5589
* clear dirty bits for DEV_BSIZE chunks that are fully within
5590
* the range.
5591
*/
5592
oldvalid = m->valid;
5593
pagebits = vm_page_bits(base, size);
5594
if (vm_page_xbusied(m))
5595
m->valid |= pagebits;
5596
else
5597
vm_page_bits_set(m, &m->valid, pagebits);
5598
#if 0 /* NOT YET */
5599
if ((frag = base & (DEV_BSIZE - 1)) != 0) {
5600
frag = DEV_BSIZE - frag;
5601
base += frag;
5602
size -= frag;
5603
if (size < 0)
5604
size = 0;
5605
}
5606
pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
5607
#endif
5608
if (base == 0 && size == PAGE_SIZE) {
5609
/*
5610
* The page can only be modified within the pmap if it is
5611
* mapped, and it can only be mapped if it was previously
5612
* fully valid.
5613
*/
5614
if (oldvalid == VM_PAGE_BITS_ALL)
5615
/*
5616
* Perform the pmap_clear_modify() first. Otherwise,
5617
* a concurrent pmap operation, such as
5618
* pmap_protect(), could clear a modification in the
5619
* pmap and set the dirty field on the page before
5620
* pmap_clear_modify() had begun and after the dirty
5621
* field was cleared here.
5622
*/
5623
pmap_clear_modify(m);
5624
m->dirty = 0;
5625
vm_page_aflag_clear(m, PGA_NOSYNC);
5626
} else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m))
5627
m->dirty &= ~pagebits;
5628
else
5629
vm_page_clear_dirty_mask(m, pagebits);
5630
}
5631
5632
void
5633
vm_page_clear_dirty(vm_page_t m, int base, int size)
5634
{
5635
5636
vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
5637
}
5638
5639
/*
5640
* vm_page_set_invalid:
5641
*
5642
* Invalidates DEV_BSIZE'd chunks within a page. Both the
5643
* valid and dirty bits for the affected areas are cleared.
5644
*/
5645
void
5646
vm_page_set_invalid(vm_page_t m, int base, int size)
5647
{
5648
vm_page_bits_t bits;
5649
vm_object_t object;
5650
5651
/*
5652
* The object lock is required so that pages can't be mapped
5653
* read-only while we're in the process of invalidating them.
5654
*/
5655
object = m->object;
5656
VM_OBJECT_ASSERT_WLOCKED(object);
5657
vm_page_assert_busied(m);
5658
5659
if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
5660
size >= object->un_pager.vnp.vnp_size)
5661
bits = VM_PAGE_BITS_ALL;
5662
else
5663
bits = vm_page_bits(base, size);
5664
if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0)
5665
pmap_remove_all(m);
5666
KASSERT((bits == 0 && vm_page_all_valid(m)) ||
5667
!pmap_page_is_mapped(m),
5668
("vm_page_set_invalid: page %p is mapped", m));
5669
if (vm_page_xbusied(m)) {
5670
m->valid &= ~bits;
5671
m->dirty &= ~bits;
5672
} else {
5673
vm_page_bits_clear(m, &m->valid, bits);
5674
vm_page_bits_clear(m, &m->dirty, bits);
5675
}
5676
}
5677
5678
/*
5679
* vm_page_invalid:
5680
*
5681
* Invalidates the entire page. The page must be busy, unmapped, and
5682
* the enclosing object must be locked. The object lock protects
* against a concurrent read-only pmap enter, which is done without
* the busy lock.
5685
*/
5686
void
5687
vm_page_invalid(vm_page_t m)
5688
{
5689
5690
vm_page_assert_busied(m);
5691
VM_OBJECT_ASSERT_WLOCKED(m->object);
5692
MPASS(!pmap_page_is_mapped(m));
5693
5694
if (vm_page_xbusied(m))
5695
m->valid = 0;
5696
else
5697
vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL);
5698
}
5699
5700
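#if 0
/*
 * Illustrative sketch only (hypothetical, never compiled): discarding the
 * contents of a busied page after a failed pagein so that a later fault
 * re-reads it from backing store.  vm_page_invalid() requires the page to
 * be unmapped, hence the preceding pmap_remove_all().
 */
static void
example_discard_page(vm_page_t m)
{

	VM_OBJECT_ASSERT_WLOCKED(m->object);
	vm_page_assert_busied(m);
	pmap_remove_all(m);
	vm_page_invalid(m);
}
#endif
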
/*
 * vm_page_zero_invalid()
 *
 *	The kernel assumes that the invalid portions of a page contain
 *	garbage, but such pages can be mapped into memory by user code.
 *	When this occurs, we must zero out the non-valid portions of the
 *	page so user code sees what it expects.
 *
 *	Pages are most often semi-valid when the end of a file is mapped
 *	into memory and the file's size is not page aligned.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
	int b;
	int i;

	/*
	 * Scan the valid bits looking for invalid sections that
	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
	 * valid bit may be set) have already been zeroed by
	 * vm_page_set_validclean().
	 */
	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
		if (i == (PAGE_SIZE / DEV_BSIZE) ||
		    (m->valid & ((vm_page_bits_t)1 << i))) {
			if (i > b) {
				pmap_zero_page_area(m,
				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
			}
			b = i + 1;
		}
	}

	/*
	 * setvalid is TRUE when we can safely set the zero'd areas
	 * as being valid.  We can do this if there are no cache consistency
	 * issues, e.g. it is OK to do so with UFS, but not OK with NFS.
	 */
	if (setvalid)
		vm_page_valid(m);
}

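/*
 * Worked example for vm_page_zero_invalid(), assuming PAGE_SIZE == 4096 and
 * DEV_BSIZE == 512 (eight valid bits per page): if m->valid == 0x81, only
 * chunks 0 and 7 hold data, so the scan above makes a single
 * pmap_zero_page_area(m, 512, 3072) call to zero chunks 1 through 6.
 */
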
/*
 * vm_page_is_valid:
 *
 *	Is (partial) page valid?  Note that the case where size == 0
 *	will return FALSE in the degenerate case where the page is
 *	entirely invalid, and TRUE otherwise.
 *
 *	Some callers invoke this routine without the busy lock held and
 *	handle races via higher level locks.  Typical callers should
 *	hold a busy lock to prevent invalidation.
 */
int
vm_page_is_valid(vm_page_t m, int base, int size)
{
	vm_page_bits_t bits;

	bits = vm_page_bits(base, size);
	return (vm_page_any_valid(m) && (m->valid & bits) == bits);
}

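#if 0
/*
 * Illustrative sketch only (hypothetical, never compiled): a caller holding
 * the page busy could check whether the first 1KB is already backed by
 * valid data before deciding to issue a read for it.
 */
static bool
example_first_1k_valid(vm_page_t m)
{

	vm_page_assert_busied(m);
	/* With DEV_BSIZE == 512 this tests valid bits 0 and 1. */
	return (vm_page_is_valid(m, 0, 1024) != 0);
}
#endif
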
/*
 * Returns true if all of the specified predicates are true for the entire
 * (super)page and false otherwise.
 */
bool
vm_page_ps_test(vm_page_t m, int psind, int flags, vm_page_t skip_m)
{
	vm_object_t object;
	int i, npages;

	object = m->object;
	if (skip_m != NULL && skip_m->object != object)
		return (false);
	VM_OBJECT_ASSERT_LOCKED(object);
	KASSERT(psind <= m->psind,
	    ("psind %d > psind %d of m %p", psind, m->psind, m));
	npages = atop(pagesizes[psind]);

	/*
	 * The physically contiguous pages that make up a superpage, i.e., a
	 * page with a page size index ("psind") greater than zero, will
	 * occupy adjacent entries in vm_page_array[].
	 */
	for (i = 0; i < npages; i++) {
		/* Always test object consistency, including "skip_m". */
		if (m[i].object != object)
			return (false);
		if (&m[i] == skip_m)
			continue;
		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
			return (false);
		if ((flags & PS_ALL_DIRTY) != 0) {
			/*
			 * Calling vm_page_test_dirty() or pmap_is_modified()
			 * might stop this case from spuriously returning
			 * "false".  However, that would require a write lock
			 * on the object containing "m[i]".
			 */
			if (m[i].dirty != VM_PAGE_BITS_ALL)
				return (false);
		}
		if ((flags & PS_ALL_VALID) != 0 &&
		    m[i].valid != VM_PAGE_BITS_ALL)
			return (false);
	}
	return (true);
}

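#if 0
/*
 * Illustrative sketch only (hypothetical, never compiled): before promoting
 * the base-page mappings of a candidate superpage (psind 1), a caller that
 * holds the object lock might require every constituent page to be fully
 * valid and none of them to be busied.
 */
static bool
example_can_promote(vm_page_t m)
{

	VM_OBJECT_ASSERT_LOCKED(m->object);
	return (vm_page_ps_test(m, 1, PS_ALL_VALID | PS_NONE_BUSY, NULL));
}
#endif
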
/*
 * Set the page's dirty bits if the page is modified.
 */
void
vm_page_test_dirty(vm_page_t m)
{

	vm_page_assert_busied(m);
	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
		vm_page_dirty(m);
}

void
vm_page_valid(vm_page_t m)
{

	vm_page_assert_busied(m);
	if (vm_page_xbusied(m))
		m->valid = VM_PAGE_BITS_ALL;
	else
		vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL);
}

#ifdef INVARIANTS
void
vm_page_object_busy_assert(vm_page_t m)
{

	/*
	 * Certain of the page's fields may only be modified by the
	 * holder of a page or object busy.
	 */
	if (m->object != NULL && !vm_page_busied(m))
		VM_OBJECT_ASSERT_BUSY(m->object);
}

void
vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits)
{

	if ((bits & PGA_WRITEABLE) == 0)
		return;

	/*
	 * The PGA_WRITEABLE flag can only be set if the page is
	 * managed, is exclusively busied or the object is locked.
	 * Currently, this flag is only set by pmap_enter().
	 */
	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("PGA_WRITEABLE on unmanaged page"));
	if (!vm_page_xbusied(m))
		VM_OBJECT_ASSERT_BUSY(m->object);
}
#endif

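/*
 * Note (editorial, hedged): vm_page_assert_pga_writeable() is meant to be
 * evaluated whenever PGA_WRITEABLE is set on a managed page, catching a
 * pmap that sets the flag without holding the page exclusively busied or
 * the object busy, consistent with the comment above that only
 * pmap_enter() sets this flag.
 */
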
#include "opt_ddb.h"
5867
#ifdef DDB
5868
#include <sys/kernel.h>
5869
5870
#include <ddb/ddb.h>
5871
5872
DB_SHOW_COMMAND_FLAGS(page, vm_page_print_page_info, DB_CMD_MEMSAFE)
5873
{
5874
5875
db_printf("vm_cnt.v_free_count: %d\n", vm_free_count());
5876
db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count());
5877
db_printf("vm_cnt.v_active_count: %d\n", vm_active_count());
5878
db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count());
5879
db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count());
5880
db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
5881
db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
5882
db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
5883
db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
5884
}
5885
5886
DB_SHOW_COMMAND_FLAGS(pageq, vm_page_print_pageq_info, DB_CMD_MEMSAFE)
5887
{
5888
int dom;
5889
5890
db_printf("pq_free %d\n", vm_free_count());
5891
for (dom = 0; dom < vm_ndomains; dom++) {
5892
db_printf(
5893
"dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n",
5894
dom,
5895
vm_dom[dom].vmd_page_count,
5896
vm_dom[dom].vmd_free_count,
5897
vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
5898
vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
5899
vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt,
5900
vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt);
5901
}
5902
}
5903
5904
DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
5905
{
5906
vm_page_t m;
5907
boolean_t phys, virt;
5908
5909
if (!have_addr) {
5910
db_printf("show pginfo addr\n");
5911
return;
5912
}
5913
5914
phys = strchr(modif, 'p') != NULL;
5915
virt = strchr(modif, 'v') != NULL;
5916
if (virt)
5917
m = PHYS_TO_VM_PAGE(pmap_kextract(addr));
5918
else if (phys)
5919
m = PHYS_TO_VM_PAGE(addr);
5920
else
5921
m = (vm_page_t)addr;
5922
db_printf(
5923
"page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n"
5924
" af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
5925
m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
5926
m->a.queue, m->ref_count, m->a.flags, m->oflags,
5927
m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty);
5928
}
5929
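
/*
 * Example usage from the DDB prompt (illustrative; addresses are
 * placeholders):
 *
 *	show page		- global free/active/inactive counters
 *	show pageq		- per-domain page queue lengths
 *	show pginfo <addr>	- decode a struct vm_page pointer
 *	show pginfo/p <addr>	- look the page up by physical address
 *	show pginfo/v <addr>	- look the page up by kernel virtual address
 *
 * The /p and /v modifiers select the "phys" and "virt" cases parsed from
 * "modif" in vm_page_print_pginfo() above.
 */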
#endif /* DDB */