freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/vm/vm_page.c
1
/*-
2
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3
*
4
* Copyright (c) 1991 Regents of the University of California.
5
* All rights reserved.
6
* Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
7
*
8
* This code is derived from software contributed to Berkeley by
9
* The Mach Operating System project at Carnegie-Mellon University.
10
*
11
* Redistribution and use in source and binary forms, with or without
12
* modification, are permitted provided that the following conditions
13
* are met:
14
* 1. Redistributions of source code must retain the above copyright
15
* notice, this list of conditions and the following disclaimer.
16
* 2. Redistributions in binary form must reproduce the above copyright
17
* notice, this list of conditions and the following disclaimer in the
18
* documentation and/or other materials provided with the distribution.
19
* 3. Neither the name of the University nor the names of its contributors
20
* may be used to endorse or promote products derived from this software
21
* without specific prior written permission.
22
*
23
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33
* SUCH DAMAGE.
34
*/
35
36
/*-
37
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
38
* All rights reserved.
39
*
40
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
41
*
42
* Permission to use, copy, modify and distribute this software and
43
* its documentation is hereby granted, provided that both the copyright
44
* notice and this permission notice appear in all copies of the
45
* software, derivative works or modified versions, and any portions
46
* thereof, and that both notices appear in supporting documentation.
47
*
48
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51
*
52
* Carnegie Mellon requests users of this software to return to
53
*
54
* Software Distribution Coordinator or [email protected]
55
* School of Computer Science
56
* Carnegie Mellon University
57
* Pittsburgh PA 15213-3890
58
*
59
* any improvements or extensions that they make and grant Carnegie the
60
* rights to redistribute these changes.
61
*/
62
63
/*
64
* Resident memory management module.
65
*/
66
67
#include <sys/cdefs.h>
68
#include "opt_vm.h"
69
70
#include <sys/param.h>
71
#include <sys/systm.h>
72
#include <sys/counter.h>
73
#include <sys/domainset.h>
74
#include <sys/kernel.h>
75
#include <sys/limits.h>
76
#include <sys/linker.h>
77
#include <sys/lock.h>
78
#include <sys/malloc.h>
79
#include <sys/mman.h>
80
#include <sys/msgbuf.h>
81
#include <sys/mutex.h>
82
#include <sys/proc.h>
83
#include <sys/rwlock.h>
84
#include <sys/sleepqueue.h>
85
#include <sys/sbuf.h>
86
#include <sys/sched.h>
87
#include <sys/sf_buf.h>
88
#include <sys/smp.h>
89
#include <sys/sysctl.h>
90
#include <sys/vmmeter.h>
91
#include <sys/vnode.h>
92
93
#include <vm/vm.h>
94
#include <vm/pmap.h>
95
#include <vm/vm_param.h>
96
#include <vm/vm_domainset.h>
97
#include <vm/vm_kern.h>
98
#include <vm/vm_map.h>
99
#include <vm/vm_object.h>
100
#include <vm/vm_page.h>
101
#include <vm/vm_pageout.h>
102
#include <vm/vm_phys.h>
103
#include <vm/vm_pagequeue.h>
104
#include <vm/vm_pager.h>
105
#include <vm/vm_radix.h>
106
#include <vm/vm_reserv.h>
107
#include <vm/vm_extern.h>
108
#include <vm/vm_dumpset.h>
109
#include <vm/uma.h>
110
#include <vm/uma_int.h>
111
112
#include <machine/md_var.h>
113
114
struct vm_domain vm_dom[MAXMEMDOM];
115
116
DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);
117
118
struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
119
/* The following fields are protected by the domainset lock. */
120
domainset_t __exclusive_cache_line vm_min_domains;
121
domainset_t __exclusive_cache_line vm_severe_domains;
122
static int vm_min_waiters;
123
static int vm_severe_waiters;
124
static int vm_pageproc_waiters;
125
126
static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
127
"VM page statistics");
128
129
static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries);
130
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries,
131
CTLFLAG_RD, &pqstate_commit_retries,
132
"Number of failed per-page atomic queue state updates");
133
134
static COUNTER_U64_DEFINE_EARLY(queue_ops);
135
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops,
136
CTLFLAG_RD, &queue_ops,
137
"Number of batched queue operations");
138
139
static COUNTER_U64_DEFINE_EARLY(queue_nops);
140
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops,
141
CTLFLAG_RD, &queue_nops,
142
"Number of batched queue operations with no effects");
143
144
static unsigned long nofreeq_size;
145
SYSCTL_ULONG(_vm_stats_page, OID_AUTO, nofreeq_size, CTLFLAG_RD,
146
&nofreeq_size, 0,
147
"Size of the nofree queue");
148
149
#ifdef INVARIANTS
150
bool vm_check_pg_zero = false;
151
SYSCTL_BOOL(_debug, OID_AUTO, vm_check_pg_zero, CTLFLAG_RWTUN,
152
&vm_check_pg_zero, 0,
153
"verify content of freed zero-filled pages");
154
#endif
155
156
/*
157
* bogus page -- for I/O to/from partially complete buffers,
158
* or for paging into sparsely invalid regions.
159
*/
160
vm_page_t bogus_page;
161
162
vm_page_t vm_page_array;
163
long vm_page_array_size;
164
long first_page;
165
166
struct bitset *vm_page_dump;
167
long vm_page_dump_pages;
168
169
static TAILQ_HEAD(, vm_page) blacklist_head;
170
static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
171
SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
172
CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
173
174
static uma_zone_t fakepg_zone;
175
176
static void vm_page_alloc_check(vm_page_t m);
177
static vm_page_t vm_page_alloc_nofree_domain(int domain, int req);
178
static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m,
179
vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked);
180
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
181
static void vm_page_enqueue(vm_page_t m, uint8_t queue);
182
static bool vm_page_free_prep(vm_page_t m);
183
static void vm_page_free_toq(vm_page_t m);
184
static void vm_page_init(void *dummy);
185
static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object);
186
static void vm_page_mvqueue(vm_page_t m, const uint8_t queue,
187
const uint16_t nflag);
188
static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
189
vm_page_t m_run, vm_paddr_t high);
190
static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse);
191
static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
192
int req);
193
static int vm_page_zone_import(void *arg, void **store, int cnt, int domain,
194
int flags);
195
static void vm_page_zone_release(void *arg, void **store, int cnt);
196
197
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);
198
199
static void
200
vm_page_init(void *dummy)
201
{
202
203
fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
204
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
205
bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_NOFREE);
206
}
207
208
static int pgcache_zone_max_pcpu;
209
SYSCTL_INT(_vm, OID_AUTO, pgcache_zone_max_pcpu,
210
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pgcache_zone_max_pcpu, 0,
211
"Per-CPU page cache size");
212
213
/*
214
* The cache page zone is initialized later since we need to be able to allocate
215
* pages before UMA is fully initialized.
216
*/
217
static void
218
vm_page_init_cache_zones(void *dummy __unused)
219
{
220
struct vm_domain *vmd;
221
struct vm_pgcache *pgcache;
222
int cache, domain, maxcache, pool;
223
224
TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &pgcache_zone_max_pcpu);
225
maxcache = pgcache_zone_max_pcpu * mp_ncpus;
226
for (domain = 0; domain < vm_ndomains; domain++) {
227
vmd = VM_DOMAIN(domain);
228
for (pool = 0; pool < VM_NFREEPOOL; pool++) {
229
#ifdef VM_FREEPOOL_LAZYINIT
230
if (pool == VM_FREEPOOL_LAZYINIT)
231
continue;
232
#endif
233
pgcache = &vmd->vmd_pgcache[pool];
234
pgcache->domain = domain;
235
pgcache->pool = pool;
236
pgcache->zone = uma_zcache_create("vm pgcache",
237
PAGE_SIZE, NULL, NULL, NULL, NULL,
238
vm_page_zone_import, vm_page_zone_release, pgcache,
239
UMA_ZONE_VM);
240
241
/*
242
* Limit each pool's zone to 0.1% of the pages in the
243
* domain.
244
*/
245
cache = maxcache != 0 ? maxcache :
246
vmd->vmd_page_count / 1000;
247
uma_zone_set_maxcache(pgcache->zone, cache);
248
}
249
}
250
}
251
SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
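/*
 * Worked example (editorial note with hypothetical numbers): when the
 * vm.pgcache_zone_max_pcpu tunable is left unset, maxcache is 0 and each
 * per-pool cache zone is capped at vmd_page_count / 1000.  A domain holding
 * 4,194,304 pages (16 GB of 4 KB pages) therefore caches at most 4,194
 * pages per pool.  Setting the tunable to 64 on a 32-CPU machine would
 * instead cap every zone at 64 * 32 = 2048 pages, independent of domain
 * size.
 */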
252
253
/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
254
#if PAGE_SIZE == 32768
255
#ifdef CTASSERT
256
CTASSERT(sizeof(u_long) >= 8);
257
#endif
258
#endif
259
260
/*
261
* vm_set_page_size:
262
*
263
* Sets the page size, perhaps based upon the memory
264
* size. Must be called before any use of page-size
265
* dependent functions.
266
*/
267
void
268
vm_set_page_size(void)
269
{
270
if (vm_cnt.v_page_size == 0)
271
vm_cnt.v_page_size = PAGE_SIZE;
272
if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
273
panic("vm_set_page_size: page size not a power of two");
274
}
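/*
 * Illustrative sketch (editorial addition, compiled out): the check above is
 * the usual bit trick for detecting powers of two.  For a power of two x,
 * x - 1 sets exactly the bits below the single set bit, so x & (x - 1) is 0;
 * any other nonzero value shares at least one bit with x - 1.
 */
#if 0
static bool
example_is_power_of_two(u_int x)
{

	return (x != 0 && (x & (x - 1)) == 0);
}
#endif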
275
276
/*
277
* vm_page_blacklist_next:
278
*
279
* Find the next entry in the provided string of blacklist
280
* addresses. Entries are separated by space, comma, or newline.
281
* If an invalid integer is encountered then the rest of the
282
* string is skipped. Updates the list pointer to the next
283
* character, or NULL if the string is exhausted or invalid.
284
*/
285
static vm_paddr_t
286
vm_page_blacklist_next(char **list, char *end)
287
{
288
vm_paddr_t bad;
289
char *cp, *pos;
290
291
if (list == NULL || *list == NULL)
292
return (0);
293
if (**list == '\0') {
294
*list = NULL;
295
return (0);
296
}
297
298
/*
299
* If there's no end pointer then the buffer is coming from
300
* the kenv and we know it's null-terminated.
301
*/
302
if (end == NULL)
303
end = *list + strlen(*list);
304
305
/* Ensure that strtoq() won't walk off the end */
306
if (*end != '\0') {
307
if (*end == '\n' || *end == ' ' || *end == ',')
308
*end = '\0';
309
else {
310
printf("Blacklist not terminated, skipping\n");
311
*list = NULL;
312
return (0);
313
}
314
}
315
316
for (pos = *list; *pos != '\0'; pos = cp) {
317
bad = strtoq(pos, &cp, 0);
318
if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
319
if (bad == 0) {
320
if (++cp < end)
321
continue;
322
else
323
break;
324
}
325
} else
326
break;
327
if (*cp == '\0' || ++cp >= end)
328
*list = NULL;
329
else
330
*list = cp;
331
return (trunc_page(bad));
332
}
333
printf("Garbage in RAM blacklist, skipping\n");
334
*list = NULL;
335
return (0);
336
}
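/*
 * Editorial note (not part of the original file): the strings parsed here
 * come from the "vm.blacklist" kernel environment variable or a preloaded
 * "ram_blacklist" file, both handed in from vm_page_startup() below.  A
 * hypothetical setting illustrating the accepted format, with entries in
 * any base accepted by strtoq() and separated by spaces, commas, or
 * newlines, would be:
 *
 *	vm.blacklist="0x10203000,0x7fe24000"
 *
 * Each address is truncated to its page boundary and that page is pulled
 * out of the free lists at boot.
 */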
337
338
bool
339
vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
340
{
341
struct vm_domain *vmd;
342
vm_page_t m;
343
bool found;
344
345
m = vm_phys_paddr_to_vm_page(pa);
346
if (m == NULL)
347
return (true); /* page does not exist, no failure */
348
349
vmd = VM_DOMAIN(vm_phys_domain(pa));
350
vm_domain_free_lock(vmd);
351
found = vm_phys_unfree_page(pa);
352
vm_domain_free_unlock(vmd);
353
if (found) {
354
vm_domain_freecnt_inc(vmd, -1);
355
TAILQ_INSERT_TAIL(&blacklist_head, m, plinks.q);
356
if (verbose)
357
printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
358
}
359
return (found);
360
}
361
362
/*
363
* vm_page_blacklist_check:
364
*
365
* Iterate through the provided string of blacklist addresses, pulling
366
* each entry out of the physical allocator free list and putting it
367
* onto a list for reporting via the vm.page_blacklist sysctl.
368
*/
369
static void
370
vm_page_blacklist_check(char *list, char *end)
371
{
372
vm_paddr_t pa;
373
char *next;
374
375
next = list;
376
while (next != NULL) {
377
if ((pa = vm_page_blacklist_next(&next, end)) == 0)
378
continue;
379
vm_page_blacklist_add(pa, bootverbose);
380
}
381
}
382
383
/*
384
* vm_page_blacklist_load:
385
*
386
* Search for a special module named "ram_blacklist". It'll be a
387
* plain text file provided by the user via the loader directive
388
* of the same name.
389
*/
390
static void
391
vm_page_blacklist_load(char **list, char **end)
392
{
393
void *mod;
394
u_char *ptr;
395
u_int len;
396
397
mod = NULL;
398
ptr = NULL;
399
400
mod = preload_search_by_type("ram_blacklist");
401
if (mod != NULL) {
402
ptr = preload_fetch_addr(mod);
403
len = preload_fetch_size(mod);
404
}
405
406
if (ptr != NULL && len > 0) {
407
*list = ptr;
408
*end = ptr + len - 1;
409
} else {
410
*list = NULL;
411
*end = NULL;
412
}
413
414
return;
415
}
416
417
static int
418
sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
419
{
420
vm_page_t m;
421
struct sbuf sbuf;
422
int error, first;
423
424
first = 1;
425
error = sysctl_wire_old_buffer(req, 0);
426
if (error != 0)
427
return (error);
428
sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
429
TAILQ_FOREACH(m, &blacklist_head, plinks.q) {
430
sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
431
(uintmax_t)m->phys_addr);
432
first = 0;
433
}
434
error = sbuf_finish(&sbuf);
435
sbuf_delete(&sbuf);
436
return (error);
437
}
438
439
/*
440
* Initialize a dummy page for use in scans of the specified paging queue.
441
* In principle, this function only needs to set the flag PG_MARKER.
442
* Nonetheless, it write busies the page as a safety precaution.
443
*/
444
void
445
vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags)
446
{
447
448
bzero(marker, sizeof(*marker));
449
marker->flags = PG_MARKER;
450
marker->a.flags = aflags;
451
marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
452
marker->a.queue = queue;
453
}
454
455
static void
456
vm_page_domain_init(int domain)
457
{
458
struct vm_domain *vmd;
459
struct vm_pagequeue *pq;
460
int i;
461
462
vmd = VM_DOMAIN(domain);
463
bzero(vmd, sizeof(*vmd));
464
*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
465
"vm inactive pagequeue";
466
*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
467
"vm active pagequeue";
468
*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
469
"vm laundry pagequeue";
470
*__DECONST(const char **,
471
&vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) =
472
"vm unswappable pagequeue";
473
vmd->vmd_domain = domain;
474
vmd->vmd_page_count = 0;
475
vmd->vmd_free_count = 0;
476
vmd->vmd_segs = 0;
477
vmd->vmd_oom = false;
478
vmd->vmd_helper_threads_enabled = true;
479
for (i = 0; i < PQ_COUNT; i++) {
480
pq = &vmd->vmd_pagequeues[i];
481
TAILQ_INIT(&pq->pq_pl);
482
mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
483
MTX_DEF | MTX_DUPOK);
484
pq->pq_pdpages = 0;
485
vm_page_init_marker(&vmd->vmd_markers[i], i, 0);
486
}
487
mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
488
mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
489
snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
490
491
/*
492
* inacthead is used to provide FIFO ordering for LRU-bypassing
493
* insertions.
494
*/
495
vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
496
TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
497
&vmd->vmd_inacthead, plinks.q);
498
499
/*
500
* The clock pages are used to implement active queue scanning without
501
* requeues. Scans start at clock[0], which is advanced after the scan
502
* ends. When the two clock hands meet, they are reset and scanning
503
* resumes from the head of the queue.
504
*/
505
vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
506
vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
507
TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
508
&vmd->vmd_clock[0], plinks.q);
509
TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
510
&vmd->vmd_clock[1], plinks.q);
511
}
512
513
/*
514
* Initialize a physical page in preparation for adding it to the free
515
* lists.
516
*/
517
void
518
vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind, int pool)
519
{
520
m->object = NULL;
521
m->ref_count = 0;
522
m->busy_lock = VPB_FREED;
523
m->flags = m->a.flags = 0;
524
m->phys_addr = pa;
525
m->a.queue = PQ_NONE;
526
m->psind = 0;
527
m->segind = segind;
528
m->order = VM_NFREEORDER;
529
m->pool = pool;
530
m->valid = m->dirty = 0;
531
pmap_page_init(m);
532
}
533
534
#ifndef PMAP_HAS_PAGE_ARRAY
535
static vm_paddr_t
536
vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range)
537
{
538
vm_paddr_t new_end;
539
540
/*
541
* Reserve an unmapped guard page to trap access to vm_page_array[-1].
542
* However, because this page is allocated from KVM, out-of-bounds
543
* accesses using the direct map will not be trapped.
544
*/
545
*vaddr += PAGE_SIZE;
546
547
/*
548
* Allocate physical memory for the page structures, and map it.
549
*/
550
new_end = trunc_page(end - page_range * sizeof(struct vm_page));
551
vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end,
552
VM_PROT_READ | VM_PROT_WRITE);
553
vm_page_array_size = page_range;
554
555
return (new_end);
556
}
557
#endif
558
559
/*
560
* vm_page_startup:
561
*
562
* Initializes the resident memory module. Allocates physical memory for
563
* bootstrapping UMA and some data structures that are used to manage
564
* physical pages. Initializes these structures, and populates the free
565
* page queues.
566
*/
567
vm_offset_t
568
vm_page_startup(vm_offset_t vaddr)
569
{
570
struct vm_phys_seg *seg;
571
struct vm_domain *vmd;
572
vm_page_t m;
573
char *list, *listend;
574
vm_paddr_t end, high_avail, low_avail, new_end, size;
575
vm_paddr_t page_range __unused;
576
vm_paddr_t last_pa, pa, startp, endp;
577
u_long pagecount;
578
#if MINIDUMP_PAGE_TRACKING
579
u_long vm_page_dump_size;
580
#endif
581
int biggestone, i, segind;
582
#ifdef WITNESS
583
vm_offset_t mapped;
584
int witness_size;
585
#endif
586
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
587
long ii;
588
#endif
589
int pool;
590
#ifdef VM_FREEPOOL_LAZYINIT
591
int lazyinit;
592
#endif
593
594
vaddr = round_page(vaddr);
595
596
vm_phys_early_startup();
597
biggestone = vm_phys_avail_largest();
598
end = phys_avail[biggestone+1];
599
600
/*
601
* Initialize the page and queue locks.
602
*/
603
mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF);
604
for (i = 0; i < vm_ndomains; i++)
605
vm_page_domain_init(i);
606
607
new_end = end;
608
#ifdef WITNESS
609
witness_size = round_page(witness_startup_count());
610
new_end -= witness_size;
611
mapped = pmap_map(&vaddr, new_end, new_end + witness_size,
612
VM_PROT_READ | VM_PROT_WRITE);
613
bzero((void *)mapped, witness_size);
614
witness_startup((void *)mapped);
615
#endif
616
617
#if MINIDUMP_PAGE_TRACKING
618
/*
619
* Allocate a bitmap to indicate that a random physical page
620
* needs to be included in a minidump.
621
*
622
* The amd64 port needs this to indicate which direct map pages
623
* need to be dumped, via calls to dump_add_page()/dump_drop_page().
624
*
625
* However, i386 still needs this workspace internally within the
626
* minidump code. In theory, they are not needed on i386, but are
627
* included should the sf_buf code decide to use them.
628
*/
629
last_pa = 0;
630
vm_page_dump_pages = 0;
631
for (i = 0; dump_avail[i + 1] != 0; i += 2) {
632
vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) -
633
dump_avail[i] / PAGE_SIZE;
634
if (dump_avail[i + 1] > last_pa)
635
last_pa = dump_avail[i + 1];
636
}
637
vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages));
638
new_end -= vm_page_dump_size;
639
vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
640
new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
641
bzero((void *)vm_page_dump, vm_page_dump_size);
642
#if MINIDUMP_STARTUP_PAGE_TRACKING
643
/*
644
* Include the UMA bootstrap pages, witness pages and vm_page_dump
645
* in a crash dump. When pmap_map() uses the direct map, they are
646
* not automatically included.
647
*/
648
for (pa = new_end; pa < end; pa += PAGE_SIZE)
649
dump_add_page(pa);
650
#endif
651
#else
652
(void)last_pa;
653
#endif
654
phys_avail[biggestone + 1] = new_end;
655
#ifdef __amd64__
656
/*
657
* Request that the physical pages underlying the message buffer be
658
* included in a crash dump. Since the message buffer is accessed
659
* through the direct map, they are not automatically included.
660
*/
661
pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
662
last_pa = pa + round_page(msgbufsize);
663
while (pa < last_pa) {
664
dump_add_page(pa);
665
pa += PAGE_SIZE;
666
}
667
#else
668
(void)pa;
669
#endif
670
671
/*
672
* Determine the lowest and highest physical addresses and, in the case
673
* of VM_PHYSSEG_SPARSE, the exact size of the available physical
674
* memory. vm_phys_early_startup() already checked that phys_avail[]
675
* has at least one element.
676
*/
677
#ifdef VM_PHYSSEG_SPARSE
678
size = phys_avail[1] - phys_avail[0];
679
#endif
680
low_avail = phys_avail[0];
681
high_avail = phys_avail[1];
682
for (i = 2; phys_avail[i + 1] != 0; i += 2) {
683
#ifdef VM_PHYSSEG_SPARSE
684
size += phys_avail[i + 1] - phys_avail[i];
685
#endif
686
if (phys_avail[i] < low_avail)
687
low_avail = phys_avail[i];
688
if (phys_avail[i + 1] > high_avail)
689
high_avail = phys_avail[i + 1];
690
}
691
for (i = 0; i < vm_phys_nsegs; i++) {
692
#ifdef VM_PHYSSEG_SPARSE
693
size += vm_phys_segs[i].end - vm_phys_segs[i].start;
694
#endif
695
if (vm_phys_segs[i].start < low_avail)
696
low_avail = vm_phys_segs[i].start;
697
if (vm_phys_segs[i].end > high_avail)
698
high_avail = vm_phys_segs[i].end;
699
}
700
first_page = low_avail / PAGE_SIZE;
701
#ifdef VM_PHYSSEG_DENSE
702
size = high_avail - low_avail;
703
#endif
704
705
#ifdef PMAP_HAS_PAGE_ARRAY
706
pmap_page_array_startup(size / PAGE_SIZE);
707
biggestone = vm_phys_avail_largest();
708
end = new_end = phys_avail[biggestone + 1];
709
#else
710
#ifdef VM_PHYSSEG_DENSE
711
/*
712
* In the VM_PHYSSEG_DENSE case, the number of pages can account for
713
* the overhead of a page structure per page only if vm_page_array is
714
* allocated from the last physical memory chunk. Otherwise, we must
715
* allocate page structures representing the physical memory
716
* underlying vm_page_array, even though they will not be used.
717
*/
718
if (new_end != high_avail)
719
page_range = size / PAGE_SIZE;
720
else
721
#endif
722
{
723
page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
724
725
/*
726
* If the partial bytes remaining are large enough for
727
* a page (PAGE_SIZE) without a corresponding
728
* 'struct vm_page', then new_end will contain an
729
* extra page after subtracting the length of the VM
730
* page array. Compensate by subtracting an extra
731
* page from new_end.
732
*/
733
if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
734
if (new_end == high_avail)
735
high_avail -= PAGE_SIZE;
736
new_end -= PAGE_SIZE;
737
}
738
}
739
end = new_end;
740
new_end = vm_page_array_alloc(&vaddr, end, page_range);
741
#endif
742
743
#if VM_NRESERVLEVEL > 0
744
/*
745
* Allocate physical memory for the reservation management system's
746
* data structures, and map it.
747
*/
748
new_end = vm_reserv_startup(&vaddr, new_end);
749
#endif
750
#if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING
751
/*
752
* Include vm_page_array and vm_reserv_array in a crash dump.
753
*/
754
for (pa = new_end; pa < end; pa += PAGE_SIZE)
755
dump_add_page(pa);
756
#endif
757
phys_avail[biggestone + 1] = new_end;
758
759
/*
760
* Add physical memory segments corresponding to the available
761
* physical pages.
762
*/
763
for (i = 0; phys_avail[i + 1] != 0; i += 2)
764
vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
765
766
/*
767
* Initialize the physical memory allocator.
768
*/
769
vm_phys_init();
770
771
pool = VM_FREEPOOL_DEFAULT;
772
#ifdef VM_FREEPOOL_LAZYINIT
773
lazyinit = 1;
774
TUNABLE_INT_FETCH("debug.vm.lazy_page_init", &lazyinit);
775
if (lazyinit)
776
pool = VM_FREEPOOL_LAZYINIT;
777
#endif
778
779
/*
780
* Initialize the page structures and add every available page to the
781
* physical memory allocator's free lists.
782
*/
783
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
784
for (ii = 0; ii < vm_page_array_size; ii++) {
785
m = &vm_page_array[ii];
786
vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0,
787
VM_FREEPOOL_DEFAULT);
788
m->flags = PG_FICTITIOUS;
789
}
790
#endif
791
vm_cnt.v_page_count = 0;
792
for (segind = 0; segind < vm_phys_nsegs; segind++) {
793
seg = &vm_phys_segs[segind];
794
795
/*
796
* Initialize pages not covered by phys_avail[], since they
797
* might be freed to the allocator at some future point, e.g.,
798
* by kmem_bootstrap_free().
799
*/
800
startp = seg->start;
801
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
802
if (startp >= seg->end)
803
break;
804
if (phys_avail[i + 1] < startp)
805
continue;
806
if (phys_avail[i] <= startp) {
807
startp = phys_avail[i + 1];
808
continue;
809
}
810
m = vm_phys_seg_paddr_to_vm_page(seg, startp);
811
for (endp = MIN(phys_avail[i], seg->end);
812
startp < endp; startp += PAGE_SIZE, m++) {
813
vm_page_init_page(m, startp, segind,
814
VM_FREEPOOL_DEFAULT);
815
}
816
}
817
818
/*
819
* Add the segment's pages that are covered by one of
820
* phys_avail's ranges to the free lists.
821
*/
822
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
823
if (seg->end <= phys_avail[i] ||
824
seg->start >= phys_avail[i + 1])
825
continue;
826
827
startp = MAX(seg->start, phys_avail[i]);
828
endp = MIN(seg->end, phys_avail[i + 1]);
829
pagecount = (u_long)atop(endp - startp);
830
if (pagecount == 0)
831
continue;
832
833
/*
834
* If lazy vm_page initialization is not enabled, simply
835
* initialize all of the pages in the segment covered by
836
* phys_avail. Otherwise, initialize only the first
837
* page of each run of free pages handed to the vm_phys
838
* allocator, which in turn defers initialization of
839
* pages until they are needed.
840
*
841
* This avoids blocking the boot process for long
842
* periods, which may be relevant for VMs (which ought
843
* to boot as quickly as possible) and/or systems with
844
* large amounts of physical memory.
845
*/
846
m = vm_phys_seg_paddr_to_vm_page(seg, startp);
847
vm_page_init_page(m, startp, segind, pool);
848
if (pool == VM_FREEPOOL_DEFAULT) {
849
for (u_long j = 1; j < pagecount; j++) {
850
vm_page_init_page(&m[j],
851
startp + ptoa((vm_paddr_t)j),
852
segind, pool);
853
}
854
}
855
vmd = VM_DOMAIN(seg->domain);
856
vm_domain_free_lock(vmd);
857
vm_phys_enqueue_contig(m, pool, pagecount);
858
vm_domain_free_unlock(vmd);
859
vm_domain_freecnt_inc(vmd, pagecount);
860
vm_cnt.v_page_count += (u_int)pagecount;
861
vmd->vmd_page_count += (u_int)pagecount;
862
vmd->vmd_segs |= 1UL << segind;
863
}
864
}
865
866
/*
867
* Remove blacklisted pages from the physical memory allocator.
868
*/
869
TAILQ_INIT(&blacklist_head);
870
vm_page_blacklist_load(&list, &listend);
871
vm_page_blacklist_check(list, listend);
872
873
list = kern_getenv("vm.blacklist");
874
vm_page_blacklist_check(list, NULL);
875
876
freeenv(list);
877
#if VM_NRESERVLEVEL > 0
878
/*
879
* Initialize the reservation management system.
880
*/
881
vm_reserv_init();
882
#endif
883
884
return (vaddr);
885
}
886
887
void
888
vm_page_reference(vm_page_t m)
889
{
890
891
vm_page_aflag_set(m, PGA_REFERENCED);
892
}
893
894
/*
895
* vm_page_trybusy
896
*
897
* Helper routine for grab functions to trylock busy.
898
*
899
* Returns true on success and false on failure.
900
*/
901
static bool
902
vm_page_trybusy(vm_page_t m, int allocflags)
903
{
904
905
if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0)
906
return (vm_page_trysbusy(m));
907
else
908
return (vm_page_tryxbusy(m));
909
}
910
911
/*
912
* vm_page_tryacquire
913
*
914
* Helper routine for grab functions to trylock busy and wire.
915
*
916
* Returns true on success and false on failure.
917
*/
918
static inline bool
919
vm_page_tryacquire(vm_page_t m, int allocflags)
920
{
921
bool locked;
922
923
locked = vm_page_trybusy(m, allocflags);
924
if (locked && (allocflags & VM_ALLOC_WIRED) != 0)
925
vm_page_wire(m);
926
return (locked);
927
}
928
929
/*
930
* vm_page_busy_acquire:
931
*
932
* Acquire the busy lock as described by VM_ALLOC_* flags. Will loop
933
* and drop the object lock if necessary.
934
*/
935
bool
936
vm_page_busy_acquire(vm_page_t m, int allocflags)
937
{
938
vm_object_t obj;
939
bool locked;
940
941
/*
942
* The page-specific object must be cached because page
943
* identity can change during the sleep, causing the
944
* re-lock of a different object.
945
* It is assumed that a reference to the object is already
946
* held by the callers.
947
*/
948
obj = atomic_load_ptr(&m->object);
949
for (;;) {
950
if (vm_page_tryacquire(m, allocflags))
951
return (true);
952
if ((allocflags & VM_ALLOC_NOWAIT) != 0)
953
return (false);
954
if (obj != NULL)
955
locked = VM_OBJECT_WOWNED(obj);
956
else
957
locked = false;
958
MPASS(locked || vm_page_wired(m));
959
if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags,
960
locked) && locked)
961
VM_OBJECT_WLOCK(obj);
962
if ((allocflags & VM_ALLOC_WAITFAIL) != 0)
963
return (false);
964
KASSERT(m->object == obj || m->object == NULL,
965
("vm_page_busy_acquire: page %p does not belong to %p",
966
m, obj));
967
}
968
}
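/*
 * Minimal usage sketch (editorial addition, compiled out; the helper name is
 * hypothetical): look up a page under the object lock and try to exclusive
 * busy it without sleeping.  VM_ALLOC_NOWAIT makes vm_page_busy_acquire()
 * fail immediately instead of dropping the object lock to sleep; on success
 * the page is returned exclusive busied and the caller is expected to
 * vm_page_xunbusy() it when done.
 */
#if 0
static vm_page_t
example_xbusy_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_WLOCK(object);
	m = vm_page_lookup(object, pindex);
	if (m != NULL && !vm_page_busy_acquire(m, VM_ALLOC_NOWAIT))
		m = NULL;		/* Busy; caller may retry later. */
	VM_OBJECT_WUNLOCK(object);
	return (m);
}
#endif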
969
970
/*
971
* vm_page_busy_downgrade:
972
*
973
* Downgrade an exclusive busy page into a single shared busy page.
974
*/
975
void
976
vm_page_busy_downgrade(vm_page_t m)
977
{
978
u_int x;
979
980
vm_page_assert_xbusied(m);
981
982
x = vm_page_busy_fetch(m);
983
for (;;) {
984
if (atomic_fcmpset_rel_int(&m->busy_lock,
985
&x, VPB_SHARERS_WORD(1)))
986
break;
987
}
988
if ((x & VPB_BIT_WAITERS) != 0)
989
wakeup(m);
990
}
991
992
/*
993
*
994
* vm_page_busy_tryupgrade:
995
*
996
* Attempt to upgrade a single shared busy into an exclusive busy.
997
*/
998
int
999
vm_page_busy_tryupgrade(vm_page_t m)
1000
{
1001
u_int ce, x;
1002
1003
vm_page_assert_sbusied(m);
1004
1005
x = vm_page_busy_fetch(m);
1006
ce = VPB_CURTHREAD_EXCLUSIVE;
1007
for (;;) {
1008
if (VPB_SHARERS(x) > 1)
1009
return (0);
1010
KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
1011
("vm_page_busy_tryupgrade: invalid lock state"));
1012
if (!atomic_fcmpset_acq_int(&m->busy_lock, &x,
1013
ce | (x & VPB_BIT_WAITERS)))
1014
continue;
1015
return (1);
1016
}
1017
}
1018
1019
/*
1020
* vm_page_sbusied:
1021
*
1022
* Return a positive value if the page is shared busied, 0 otherwise.
1023
*/
1024
int
1025
vm_page_sbusied(vm_page_t m)
1026
{
1027
u_int x;
1028
1029
x = vm_page_busy_fetch(m);
1030
return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
1031
}
1032
1033
/*
1034
* vm_page_sunbusy:
1035
*
1036
* Shared unbusy a page.
1037
*/
1038
void
1039
vm_page_sunbusy(vm_page_t m)
1040
{
1041
u_int x;
1042
1043
vm_page_assert_sbusied(m);
1044
1045
x = vm_page_busy_fetch(m);
1046
for (;;) {
1047
KASSERT(x != VPB_FREED,
1048
("vm_page_sunbusy: Unlocking freed page."));
1049
if (VPB_SHARERS(x) > 1) {
1050
if (atomic_fcmpset_int(&m->busy_lock, &x,
1051
x - VPB_ONE_SHARER))
1052
break;
1053
continue;
1054
}
1055
KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
1056
("vm_page_sunbusy: invalid lock state"));
1057
if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
1058
continue;
1059
if ((x & VPB_BIT_WAITERS) == 0)
1060
break;
1061
wakeup(m);
1062
break;
1063
}
1064
}
1065
1066
/*
1067
* vm_page_busy_sleep:
1068
*
1069
* Sleep if the page is busy, using the page pointer as wchan.
1070
* This is used to implement the hard-path of the busying mechanism.
1071
*
1072
* If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function
1073
* will not sleep if the page is shared-busy.
1074
*
1075
* The object lock must be held on entry.
1076
*
1077
* Returns true if it slept and dropped the object lock, or false
1078
* if there was no sleep and the lock is still held.
1079
*/
1080
bool
1081
vm_page_busy_sleep(vm_page_t m, const char *wmesg, int allocflags)
1082
{
1083
vm_object_t obj;
1084
1085
obj = m->object;
1086
VM_OBJECT_ASSERT_LOCKED(obj);
1087
1088
return (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, allocflags,
1089
true));
1090
}
1091
1092
/*
1093
* vm_page_busy_sleep_unlocked:
1094
*
1095
* Sleep if the page is busy, using the page pointer as wchan.
1096
* This is used to implement the hard-path of the busying mechanism.
1097
*
1098
* If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function
1099
* will not sleep if the page is shared-busy.
1100
*
1101
* The object lock must not be held on entry. The operation will
1102
* return if the page changes identity.
1103
*/
1104
void
1105
vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
1106
const char *wmesg, int allocflags)
1107
{
1108
VM_OBJECT_ASSERT_UNLOCKED(obj);
1109
1110
(void)_vm_page_busy_sleep(obj, m, pindex, wmesg, allocflags, false);
1111
}
1112
1113
/*
1114
* _vm_page_busy_sleep:
1115
*
1116
* Internal busy sleep function. Verifies the page identity and
1117
* lockstate against parameters. Returns true if it sleeps and
1118
* false otherwise.
1119
*
1120
* allocflags uses VM_ALLOC_* flags to specify the lock required.
1121
*
1122
* If locked is true the lock will be dropped for any true returns
1123
* and held for any false returns.
1124
*/
1125
static bool
1126
_vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
1127
const char *wmesg, int allocflags, bool locked)
1128
{
1129
bool xsleep;
1130
u_int x;
1131
1132
/*
1133
* If the object is busy we must wait for that to drain to zero
1134
* before trying the page again.
1135
*/
1136
if (obj != NULL && vm_object_busied(obj)) {
1137
if (locked)
1138
VM_OBJECT_DROP(obj);
1139
vm_object_busy_wait(obj, wmesg);
1140
return (true);
1141
}
1142
1143
if (!vm_page_busied(m))
1144
return (false);
1145
1146
xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0;
1147
sleepq_lock(m);
1148
x = vm_page_busy_fetch(m);
1149
do {
1150
/*
1151
* If the page changes objects or becomes unlocked we can
1152
* simply return.
1153
*/
1154
if (x == VPB_UNBUSIED ||
1155
(xsleep && (x & VPB_BIT_SHARED) != 0) ||
1156
m->object != obj || m->pindex != pindex) {
1157
sleepq_release(m);
1158
return (false);
1159
}
1160
if ((x & VPB_BIT_WAITERS) != 0)
1161
break;
1162
} while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS));
1163
if (locked)
1164
VM_OBJECT_DROP(obj);
1165
DROP_GIANT();
1166
sleepq_add(m, NULL, wmesg, 0, 0);
1167
sleepq_wait(m, PVM);
1168
PICKUP_GIANT();
1169
return (true);
1170
}
1171
1172
/*
1173
* vm_page_trysbusy:
1174
*
1175
* Try to shared busy a page.
1176
* If the operation succeeds 1 is returned otherwise 0.
1177
* The operation never sleeps.
1178
*/
1179
int
1180
vm_page_trysbusy(vm_page_t m)
1181
{
1182
vm_object_t obj;
1183
u_int x;
1184
1185
obj = m->object;
1186
x = vm_page_busy_fetch(m);
1187
for (;;) {
1188
if ((x & VPB_BIT_SHARED) == 0)
1189
return (0);
1190
/*
1191
* Reduce the window for transient busies that will trigger
1192
* false negatives in vm_page_ps_test().
1193
*/
1194
if (obj != NULL && vm_object_busied(obj))
1195
return (0);
1196
if (atomic_fcmpset_acq_int(&m->busy_lock, &x,
1197
x + VPB_ONE_SHARER))
1198
break;
1199
}
1200
1201
/* Refetch the object now that we're guaranteed that it is stable. */
1202
obj = m->object;
1203
if (obj != NULL && vm_object_busied(obj)) {
1204
vm_page_sunbusy(m);
1205
return (0);
1206
}
1207
return (1);
1208
}
1209
1210
/*
1211
* vm_page_tryxbusy:
1212
*
1213
* Try to exclusive busy a page.
1214
* If the operation succeeds 1 is returned otherwise 0.
1215
* The operation never sleeps.
1216
*/
1217
int
1218
vm_page_tryxbusy(vm_page_t m)
1219
{
1220
vm_object_t obj;
1221
1222
if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,
1223
VPB_CURTHREAD_EXCLUSIVE) == 0)
1224
return (0);
1225
1226
obj = m->object;
1227
if (obj != NULL && vm_object_busied(obj)) {
1228
vm_page_xunbusy(m);
1229
return (0);
1230
}
1231
return (1);
1232
}
1233
1234
static void
1235
vm_page_xunbusy_hard_tail(vm_page_t m)
1236
{
1237
atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
1238
/* Wake the waiter. */
1239
wakeup(m);
1240
}
1241
1242
/*
1243
* vm_page_xunbusy_hard:
1244
*
1245
* Called when unbusy has failed because there is a waiter.
1246
*/
1247
void
1248
vm_page_xunbusy_hard(vm_page_t m)
1249
{
1250
vm_page_assert_xbusied(m);
1251
vm_page_xunbusy_hard_tail(m);
1252
}
1253
1254
void
1255
vm_page_xunbusy_hard_unchecked(vm_page_t m)
1256
{
1257
vm_page_assert_xbusied_unchecked(m);
1258
vm_page_xunbusy_hard_tail(m);
1259
}
1260
1261
static void
1262
vm_page_busy_free(vm_page_t m)
1263
{
1264
u_int x;
1265
1266
atomic_thread_fence_rel();
1267
x = atomic_swap_int(&m->busy_lock, VPB_FREED);
1268
if ((x & VPB_BIT_WAITERS) != 0)
1269
wakeup(m);
1270
}
1271
1272
/*
1273
* vm_page_unhold_pages:
1274
*
1275
* Unhold each of the pages that is referenced by the given array.
1276
*/
1277
void
1278
vm_page_unhold_pages(vm_page_t *ma, int count)
1279
{
1280
1281
for (; count != 0; count--) {
1282
vm_page_unwire(*ma, PQ_ACTIVE);
1283
ma++;
1284
}
1285
}
1286
1287
vm_page_t
1288
PHYS_TO_VM_PAGE(vm_paddr_t pa)
1289
{
1290
vm_page_t m;
1291
1292
#ifdef VM_PHYSSEG_SPARSE
1293
m = vm_phys_paddr_to_vm_page(pa);
1294
if (m == NULL)
1295
m = vm_phys_fictitious_to_vm_page(pa);
1296
return (m);
1297
#elif defined(VM_PHYSSEG_DENSE)
1298
long pi;
1299
1300
pi = atop(pa);
1301
if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1302
m = &vm_page_array[pi - first_page];
1303
return (m);
1304
}
1305
return (vm_phys_fictitious_to_vm_page(pa));
1306
#else
1307
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
1308
#endif
1309
}
1310
1311
/*
1312
* vm_page_getfake:
1313
*
1314
* Create a fictitious page with the specified physical address and
1315
* memory attribute. The memory attribute is the only machine-
1316
* dependent aspect of a fictitious page that must be initialized.
1317
*/
1318
vm_page_t
1319
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
1320
{
1321
vm_page_t m;
1322
1323
m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
1324
vm_page_initfake(m, paddr, memattr);
1325
return (m);
1326
}
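/*
 * Minimal usage sketch (editorial addition, compiled out; names and the
 * memory attribute are illustrative only): fictitious pages are how device
 * memory with no backing struct vm_page, such as a PCI BAR, gets wrapped so
 * it can be mapped like ordinary memory.  The page is released again with
 * vm_page_putfake().
 */
#if 0
static vm_page_t
example_wrap_device_page(vm_paddr_t bar_paddr)
{
	vm_page_t m;

	/* Pick the memattr that matches the underlying hardware. */
	m = vm_page_getfake(bar_paddr, VM_MEMATTR_UNCACHEABLE);
	/* ... insert into a device object, enter mappings, ... */
	return (m);
}
#endif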
1327
1328
void
1329
vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1330
{
1331
1332
if ((m->flags & PG_FICTITIOUS) != 0) {
1333
/*
1334
* The page's memattr might have changed since the
1335
* previous initialization. Update the pmap to the
1336
* new memattr.
1337
*/
1338
goto memattr;
1339
}
1340
m->phys_addr = paddr;
1341
m->a.queue = PQ_NONE;
1342
/* Fictitious pages don't use "segind". */
1343
m->flags = PG_FICTITIOUS;
1344
/* Fictitious pages don't use "order" or "pool". */
1345
m->oflags = VPO_UNMANAGED;
1346
m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
1347
/* Fictitious pages are unevictable. */
1348
m->ref_count = 1;
1349
pmap_page_init(m);
1350
memattr:
1351
pmap_page_set_memattr(m, memattr);
1352
}
1353
1354
/*
1355
* vm_page_putfake:
1356
*
1357
* Release a fictitious page.
1358
*/
1359
void
1360
vm_page_putfake(vm_page_t m)
1361
{
1362
1363
KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
1364
KASSERT((m->flags & PG_FICTITIOUS) != 0,
1365
("vm_page_putfake: bad page %p", m));
1366
vm_page_assert_xbusied(m);
1367
vm_page_busy_free(m);
1368
uma_zfree(fakepg_zone, m);
1369
}
1370
1371
/*
1372
* vm_page_updatefake:
1373
*
1374
* Update the given fictitious page to the specified physical address and
1375
* memory attribute.
1376
*/
1377
void
1378
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1379
{
1380
1381
KASSERT((m->flags & PG_FICTITIOUS) != 0,
1382
("vm_page_updatefake: bad page %p", m));
1383
m->phys_addr = paddr;
1384
pmap_page_set_memattr(m, memattr);
1385
}
1386
1387
/*
1388
* vm_page_free:
1389
*
1390
* Free a page.
1391
*/
1392
void
1393
vm_page_free(vm_page_t m)
1394
{
1395
1396
m->flags &= ~PG_ZERO;
1397
vm_page_free_toq(m);
1398
}
1399
1400
/*
1401
* vm_page_free_zero:
1402
*
1403
* Free a page to the zeroed-pages queue.
1404
*/
1405
void
1406
vm_page_free_zero(vm_page_t m)
1407
{
1408
1409
m->flags |= PG_ZERO;
1410
vm_page_free_toq(m);
1411
}
1412
1413
/*
1414
* Unbusy and handle the page queueing for a page from a getpages request that
1415
* was optionally read ahead or behind.
1416
*/
1417
void
1418
vm_page_readahead_finish(vm_page_t m)
1419
{
1420
1421
/* We shouldn't put invalid pages on queues. */
1422
KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m));
1423
1424
/*
1425
* Since the page is not the actually needed one, whether it should
1426
* be activated or deactivated is not obvious. Empirical results
1427
* have shown that deactivating the page is usually the best choice,
1428
* unless the page is wanted by another thread.
1429
*/
1430
if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0)
1431
vm_page_activate(m);
1432
else
1433
vm_page_deactivate(m);
1434
vm_page_xunbusy_unchecked(m);
1435
}
1436
1437
/*
1438
* Destroy the identity of an invalid page and free it if possible.
1439
* This is intended to be used when reading a page from backing store fails.
1440
*/
1441
void
1442
vm_page_free_invalid(vm_page_t m)
1443
{
1444
1445
KASSERT(vm_page_none_valid(m), ("page %p is valid", m));
1446
KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m));
1447
KASSERT(m->object != NULL, ("page %p has no object", m));
1448
VM_OBJECT_ASSERT_WLOCKED(m->object);
1449
1450
/*
1451
* We may be attempting to free the page as part of the handling for an
1452
* I/O error, in which case the page was xbusied by a different thread.
1453
*/
1454
vm_page_xbusy_claim(m);
1455
1456
/*
1457
* If someone has wired this page while the object lock
1458
* was not held, then the thread that unwires is responsible
1459
* for freeing the page. Otherwise just free the page now.
1460
* The wire count of this unmapped page cannot change while
1461
* we have the page xbusy and the page's object wlocked.
1462
*/
1463
if (vm_page_remove(m))
1464
vm_page_free(m);
1465
}
1466
1467
/*
1468
* vm_page_dirty_KBI: [ internal use only ]
1469
*
1470
* Set all bits in the page's dirty field.
1471
*
1472
* The object containing the specified page must be locked if the
1473
* call is made from the machine-independent layer.
1474
*
1475
* See vm_page_clear_dirty_mask().
1476
*
1477
* This function should only be called by vm_page_dirty().
1478
*/
1479
void
1480
vm_page_dirty_KBI(vm_page_t m)
1481
{
1482
1483
/* Refer to this operation by its public name. */
1484
KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!"));
1485
m->dirty = VM_PAGE_BITS_ALL;
1486
}
1487
1488
/*
1489
* Insert the given page into the given object at the given pindex.
1490
*
1491
* The procedure is marked __always_inline to encourage the compiler to
1492
* eliminate the iter parameter and the associated alternate branch.
1493
*/
1494
static __always_inline int
1495
vm_page_insert_lookup(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
1496
bool iter, struct pctrie_iter *pages)
1497
{
1498
int error;
1499
1500
VM_OBJECT_ASSERT_WLOCKED(object);
1501
KASSERT(m->object == NULL,
1502
("vm_page_insert: page %p already inserted", m));
1503
1504
/*
1505
* Record the object/offset pair in this page.
1506
*/
1507
m->object = object;
1508
m->pindex = pindex;
1509
m->ref_count |= VPRC_OBJREF;
1510
1511
/*
1512
* Add this page to the object's radix tree.
1513
*/
1514
if (iter)
1515
error = vm_radix_iter_insert(pages, m);
1516
else
1517
error = vm_radix_insert(&object->rtree, m);
1518
if (__predict_false(error != 0)) {
1519
m->object = NULL;
1520
m->pindex = 0;
1521
m->ref_count &= ~VPRC_OBJREF;
1522
return (1);
1523
}
1524
1525
vm_page_insert_radixdone(m, object);
1526
vm_pager_page_inserted(object, m);
1527
return (0);
1528
}
1529
1530
/*
1531
* vm_page_insert: [ internal use only ]
1532
*
1533
* Inserts the given mem entry into the object and object list.
1534
*
1535
* The object must be locked.
1536
*/
1537
int
1538
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1539
{
1540
return (vm_page_insert_lookup(m, object, pindex, false, NULL));
1541
}
1542
1543
/*
1544
* vm_page_iter_insert:
1545
*
1546
* Tries to insert the page "m" into the specified object at offset
1547
* "pindex" using the iterator "pages". Returns 0 if the insertion was
1548
* successful.
1549
*
1550
* The object must be locked.
1551
*/
1552
int
1553
vm_page_iter_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
1554
struct pctrie_iter *pages)
1555
{
1556
return (vm_page_insert_lookup(m, object, pindex, true, pages));
1557
}
1558
1559
/*
1560
* vm_page_insert_radixdone:
1561
*
1562
* Complete page "m" insertion into the specified object after the
1563
* radix trie hooking.
1564
*
1565
* The object must be locked.
1566
*/
1567
static void
1568
vm_page_insert_radixdone(vm_page_t m, vm_object_t object)
1569
{
1570
1571
VM_OBJECT_ASSERT_WLOCKED(object);
1572
KASSERT(object != NULL && m->object == object,
1573
("vm_page_insert_radixdone: page %p has inconsistent object", m));
1574
KASSERT((m->ref_count & VPRC_OBJREF) != 0,
1575
("vm_page_insert_radixdone: page %p is missing object ref", m));
1576
1577
/*
1578
* Show that the object has one more resident page.
1579
*/
1580
object->resident_page_count++;
1581
1582
/*
1583
* Hold the vnode until the last page is released.
1584
*/
1585
if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
1586
vhold(object->handle);
1587
1588
/*
1589
* Since we are inserting a new and possibly dirty page,
1590
* update the object's generation count.
1591
*/
1592
if (pmap_page_is_write_mapped(m))
1593
vm_object_set_writeable_dirty(object);
1594
}
1595
1596
/*
1597
* vm_page_remove_radixdone
1598
*
1599
* Complete page "m" removal from the specified object after the radix trie
1600
* unhooking.
1601
*
1602
* The caller is responsible for updating the page's fields to reflect this
1603
* removal.
1604
*/
1605
static void
1606
vm_page_remove_radixdone(vm_page_t m)
1607
{
1608
vm_object_t object;
1609
1610
vm_page_assert_xbusied(m);
1611
object = m->object;
1612
VM_OBJECT_ASSERT_WLOCKED(object);
1613
KASSERT((m->ref_count & VPRC_OBJREF) != 0,
1614
("page %p is missing its object ref", m));
1615
1616
/* Deferred free of swap space. */
1617
if ((m->a.flags & PGA_SWAP_FREE) != 0)
1618
vm_pager_page_unswapped(m);
1619
1620
vm_pager_page_removed(object, m);
1621
m->object = NULL;
1622
1623
/*
1624
* And show that the object has one fewer resident page.
1625
*/
1626
object->resident_page_count--;
1627
1628
/*
1629
* The vnode may now be recycled.
1630
*/
1631
if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
1632
vdrop(object->handle);
1633
}
1634
1635
/*
1636
* vm_page_free_object_prep:
1637
*
1638
* Disassociates the given page from its VM object.
1639
*
1640
* The object must be locked, and the page must be xbusy.
1641
*/
1642
static void
1643
vm_page_free_object_prep(vm_page_t m)
1644
{
1645
KASSERT(((m->oflags & VPO_UNMANAGED) != 0) ==
1646
((m->object->flags & OBJ_UNMANAGED) != 0),
1647
("%s: managed flag mismatch for page %p",
1648
__func__, m));
1649
vm_page_assert_xbusied(m);
1650
1651
/*
1652
* The object reference can be released without an atomic
1653
* operation.
1654
*/
1655
KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
1656
m->ref_count == VPRC_OBJREF,
1657
("%s: page %p has unexpected ref_count %u",
1658
__func__, m, m->ref_count));
1659
vm_page_remove_radixdone(m);
1660
m->ref_count -= VPRC_OBJREF;
1661
}
1662
1663
/*
1664
* vm_page_iter_free:
1665
*
1666
* Free the given page, and use the iterator to remove it from the radix
1667
* tree.
1668
*/
1669
void
1670
vm_page_iter_free(struct pctrie_iter *pages, vm_page_t m)
1671
{
1672
vm_radix_iter_remove(pages);
1673
vm_page_free_object_prep(m);
1674
vm_page_xunbusy(m);
1675
m->flags &= ~PG_ZERO;
1676
vm_page_free_toq(m);
1677
}
1678
1679
/*
1680
* vm_page_remove:
1681
*
1682
* Removes the specified page from its containing object, but does not
1683
* invalidate any backing storage. Returns true if the object's reference
1684
* was the last reference to the page, and false otherwise.
1685
*
1686
* The object must be locked and the page must be exclusively busied.
1687
* The exclusive busy will be released on return. If this is not the
1688
* final ref and the caller does not hold a wire reference it may not
1689
* continue to access the page.
1690
*/
1691
bool
1692
vm_page_remove(vm_page_t m)
1693
{
1694
bool dropped;
1695
1696
dropped = vm_page_remove_xbusy(m);
1697
vm_page_xunbusy(m);
1698
1699
return (dropped);
1700
}
1701
1702
/*
1703
* vm_page_iter_remove:
1704
*
1705
* Remove the current page, and use the iterator to remove it from the
1706
* radix tree.
1707
*/
1708
bool
1709
vm_page_iter_remove(struct pctrie_iter *pages, vm_page_t m)
1710
{
1711
bool dropped;
1712
1713
vm_radix_iter_remove(pages);
1714
vm_page_remove_radixdone(m);
1715
dropped = (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
1716
vm_page_xunbusy(m);
1717
1718
return (dropped);
1719
}
1720
1721
/*
1722
* vm_page_radix_remove
1723
*
1724
* Removes the specified page from the radix tree.
1725
*/
1726
static void
1727
vm_page_radix_remove(vm_page_t m)
1728
{
1729
vm_page_t mrem __diagused;
1730
1731
mrem = vm_radix_remove(&m->object->rtree, m->pindex);
1732
KASSERT(mrem == m,
1733
("removed page %p, expected page %p", mrem, m));
1734
}
1735
1736
/*
1737
* vm_page_remove_xbusy
1738
*
1739
* Removes the page but leaves the xbusy held. Returns true if this
1740
* removed the final ref and false otherwise.
1741
*/
1742
bool
1743
vm_page_remove_xbusy(vm_page_t m)
1744
{
1745
1746
vm_page_radix_remove(m);
1747
vm_page_remove_radixdone(m);
1748
return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
1749
}
1750
1751
/*
1752
* vm_page_lookup:
1753
*
1754
* Returns the page associated with the object/offset
1755
* pair specified; if none is found, NULL is returned.
1756
*
1757
* The object must be locked.
1758
*/
1759
vm_page_t
1760
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1761
{
1762
1763
VM_OBJECT_ASSERT_LOCKED(object);
1764
return (vm_radix_lookup(&object->rtree, pindex));
1765
}
1766
1767
/*
1768
* vm_page_iter_init:
1769
*
1770
* Initialize iterator for vm pages.
1771
*/
1772
void
1773
vm_page_iter_init(struct pctrie_iter *pages, vm_object_t object)
1774
{
1775
1776
vm_radix_iter_init(pages, &object->rtree);
1777
}
1778
1779
/*
1780
* vm_page_iter_limit_init:
1781
*
1782
* Initialize iterator for vm pages, bounded by the given pindex limit.
1783
*/
1784
void
1785
vm_page_iter_limit_init(struct pctrie_iter *pages, vm_object_t object,
1786
vm_pindex_t limit)
1787
{
1788
1789
vm_radix_iter_limit_init(pages, &object->rtree, limit);
1790
}
1791
1792
/*
1793
* vm_page_lookup_unlocked:
1794
*
1795
* Returns the page associated with the object/offset pair specified;
1796
* if none is found, NULL is returned. The page may no longer be
1797
* present in the object at the time that this function returns. Only
1798
* useful for opportunistic checks such as inmem().
1799
*/
1800
vm_page_t
1801
vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex)
1802
{
1803
1804
return (vm_radix_lookup_unlocked(&object->rtree, pindex));
1805
}
1806
1807
/*
1808
* vm_page_relookup:
1809
*
1810
* Returns a page that must already have been busied by
1811
* the caller. Used for bogus page replacement.
1812
*/
1813
vm_page_t
1814
vm_page_relookup(vm_object_t object, vm_pindex_t pindex)
1815
{
1816
vm_page_t m;
1817
1818
m = vm_page_lookup_unlocked(object, pindex);
1819
KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) &&
1820
m->object == object && m->pindex == pindex,
1821
("vm_page_relookup: Invalid page %p", m));
1822
return (m);
1823
}
1824
1825
/*
1826
* This should only be used by lockless functions for releasing transient
1827
* incorrect acquires. The page may have been freed after we acquired a
1828
* busy lock. In this case busy_lock == VPB_FREED and we have nothing
1829
* further to do.
1830
*/
1831
static void
1832
vm_page_busy_release(vm_page_t m)
1833
{
1834
u_int x;
1835
1836
x = vm_page_busy_fetch(m);
1837
for (;;) {
1838
if (x == VPB_FREED)
1839
break;
1840
if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) {
1841
if (atomic_fcmpset_int(&m->busy_lock, &x,
1842
x - VPB_ONE_SHARER))
1843
break;
1844
continue;
1845
}
1846
KASSERT((x & VPB_BIT_SHARED) != 0 ||
1847
(x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE,
1848
("vm_page_busy_release: %p xbusy not owned.", m));
1849
if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
1850
continue;
1851
if ((x & VPB_BIT_WAITERS) != 0)
1852
wakeup(m);
1853
break;
1854
}
1855
}
1856
1857
/*
1858
* Uses the page mnew as a replacement for an existing page at index
1859
* pindex which must be already present in the object.
1860
*
1861
* Both pages must be exclusively busied on enter. The old page is
1862
* unbusied on exit.
1863
*
1864
* A return value of true means mold is now free. If this is not the
1865
* final ref and the caller does not hold a wire reference it may not
1866
* continue to access the page.
1867
*/
1868
static bool
1869
vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
1870
vm_page_t mold)
1871
{
1872
vm_page_t mret __diagused;
1873
bool dropped;
1874
1875
VM_OBJECT_ASSERT_WLOCKED(object);
1876
vm_page_assert_xbusied(mold);
1877
KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0,
1878
("vm_page_replace: page %p already in object", mnew));
1879
1880
/*
1881
* This function mostly follows vm_page_insert() and
1882
* vm_page_remove() without the radix, object count and vnode
1883
* dance. Double check such functions for more comments.
1884
*/
1885
1886
mnew->object = object;
1887
mnew->pindex = pindex;
1888
atomic_set_int(&mnew->ref_count, VPRC_OBJREF);
1889
mret = vm_radix_replace(&object->rtree, mnew);
1890
KASSERT(mret == mold,
1891
("invalid page replacement, mold=%p, mret=%p", mold, mret));
1892
KASSERT((mold->oflags & VPO_UNMANAGED) ==
1893
(mnew->oflags & VPO_UNMANAGED),
1894
("vm_page_replace: mismatched VPO_UNMANAGED"));
1895
1896
mold->object = NULL;
1897
1898
/*
1899
* The object's resident_page_count does not change because we have
1900
* swapped one page for another, but the generation count should
1901
* change if the page is dirty.
1902
*/
1903
if (pmap_page_is_write_mapped(mnew))
1904
vm_object_set_writeable_dirty(object);
1905
dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF;
1906
vm_page_xunbusy(mold);
1907
1908
return (dropped);
1909
}
1910
1911
void
1912
vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
1913
vm_page_t mold)
1914
{
1915
1916
vm_page_assert_xbusied(mnew);
1917
1918
if (vm_page_replace_hold(mnew, object, pindex, mold))
1919
vm_page_free(mold);
1920
}
1921
1922
/*
1923
* vm_page_iter_rename:
1924
*
1925
* Tries to move the specified page from its current object to a new object
1926
* and pindex, using the given iterator to remove the page from its current
1927
* object. Returns true if the move was successful, and false if the move
1928
* was aborted due to a failed memory allocation.
1929
*
1930
* Panics if a page already resides in the new object at the new pindex.
1931
*
1932
* This routine dirties the page if it is valid, as callers are expected to
1933
* transfer backing storage only after moving the page. Dirtying the page
1934
* ensures that the destination object retains the most recent copy of the
1935
* page.
1936
*
1937
* The objects must be locked.
1938
*/
1939
bool
1940
vm_page_iter_rename(struct pctrie_iter *old_pages, vm_page_t m,
1941
vm_object_t new_object, vm_pindex_t new_pindex)
1942
{
1943
vm_pindex_t opidx;
1944
1945
KASSERT((m->ref_count & VPRC_OBJREF) != 0,
1946
("%s: page %p is missing object ref", __func__, m));
1947
VM_OBJECT_ASSERT_WLOCKED(m->object);
1948
VM_OBJECT_ASSERT_WLOCKED(new_object);
1949
1950
/*
1951
* Create a custom version of vm_page_insert() which does not depend
1952
* by m_prev and can cheat on the implementation aspects of the
1953
* function.
1954
*/
1955
opidx = m->pindex;
1956
m->pindex = new_pindex;
1957
if (vm_radix_insert(&new_object->rtree, m) != 0) {
1958
m->pindex = opidx;
1959
return (false);
1960
}
1961
1962
/*
1963
* The operation cannot fail anymore.
1964
*/
1965
m->pindex = opidx;
1966
vm_radix_iter_remove(old_pages);
1967
vm_page_remove_radixdone(m);
1968
1969
	/* Return to the new pindex to complete vm_page_insert(). */
1970
m->pindex = new_pindex;
1971
m->object = new_object;
1972
1973
vm_page_insert_radixdone(m, new_object);
1974
if (vm_page_any_valid(m))
1975
vm_page_dirty(m);
1976
vm_pager_page_inserted(new_object, m);
1977
return (true);
1978
}
1979
1980
/*
1981
* vm_page_alloc:
1982
*
1983
* Allocate and return a page that is associated with the specified
1984
* object and offset pair. By default, this page is exclusive busied.
1985
*
1986
* The caller must always specify an allocation class.
1987
*
1988
* allocation classes:
1989
* VM_ALLOC_NORMAL normal process request
1990
* VM_ALLOC_SYSTEM system *really* needs a page
1991
* VM_ALLOC_INTERRUPT interrupt time request
1992
*
1993
* optional allocation flags:
1994
* VM_ALLOC_COUNT(number) the number of additional pages that the caller
1995
* intends to allocate
1996
* VM_ALLOC_NOBUSY do not exclusive busy the page
1997
* VM_ALLOC_NODUMP do not include the page in a kernel core dump
1998
* VM_ALLOC_NOFREE page will never be freed
1999
* VM_ALLOC_NOWAIT ignored (default behavior)
2000
* VM_ALLOC_SBUSY shared busy the allocated page
2001
* VM_ALLOC_WAITFAIL in case of failure, sleep before returning
2002
* VM_ALLOC_WIRED wire the allocated page
2003
* VM_ALLOC_ZERO prefer a zeroed page
2004
*/
2005
vm_page_t
2006
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
2007
{
2008
struct pctrie_iter pages;
2009
2010
vm_page_iter_init(&pages, object);
2011
return (vm_page_alloc_iter(object, pindex, req, &pages));
2012
}
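
/*
 * Illustrative sketch, not part of this file's implementation: a typical
 * caller-side retry loop around vm_page_alloc().  The helper name and the
 * way the page is consumed are hypothetical; the locking and vm_wait()
 * pattern follows the allocation flag documentation above.  VM_ALLOC_ZERO
 * only expresses a preference, so the caller still checks PG_ZERO before
 * relying on zeroed contents.
 */
static vm_page_t
vm_page_alloc_retry_sketch(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_WLOCK(object);
	while ((m = vm_page_alloc(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		/* No page was available; sleep until the domain has some. */
		VM_OBJECT_WUNLOCK(object);
		vm_wait(object);
		VM_OBJECT_WLOCK(object);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);
	/* The page is returned exclusively busied; release that here. */
	vm_page_xunbusy(m);
	VM_OBJECT_WUNLOCK(object);
	return (m);
}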
2013
2014
/*
2015
* Allocate a page in the specified object with the given page index. If the
2016
* object lock is dropped and regained, the pages iter is reset.
2017
*/
2018
vm_page_t
2019
vm_page_alloc_iter(vm_object_t object, vm_pindex_t pindex, int req,
2020
struct pctrie_iter *pages)
2021
{
2022
struct vm_domainset_iter di;
2023
vm_page_t m;
2024
int domain;
2025
2026
if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
2027
return (NULL);
2028
2029
do {
2030
m = vm_page_alloc_domain_iter(object, pindex, domain, req,
2031
pages);
2032
if (m != NULL)
2033
break;
2034
} while (vm_domainset_iter_page(&di, object, &domain, pages) == 0);
2035
2036
return (m);
2037
}
2038
2039
/*
 * Attempts to reserve "npages" free pages.  Returns true if the reservation
 * succeeds, i.e., the free page count does not fall below the limit for the
 * request class, and false otherwise.
 */
2043
static int
2044
_vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages)
2045
{
2046
u_int limit, old, new;
2047
2048
if (req_class == VM_ALLOC_INTERRUPT)
2049
limit = 0;
2050
else if (req_class == VM_ALLOC_SYSTEM)
2051
limit = vmd->vmd_interrupt_free_min;
2052
else
2053
limit = vmd->vmd_free_reserved;
2054
2055
/*
2056
* Attempt to reserve the pages. Fail if we're below the limit.
2057
*/
2058
limit += npages;
2059
old = atomic_load_int(&vmd->vmd_free_count);
2060
do {
2061
if (old < limit)
2062
return (0);
2063
new = old - npages;
2064
} while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0);
2065
2066
/* Wake the page daemon if we've crossed the threshold. */
2067
if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
2068
pagedaemon_wakeup(vmd->vmd_domain);
2069
2070
/* Only update bitsets on transitions. */
2071
if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
2072
(old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
2073
vm_domain_set(vmd);
2074
2075
return (1);
2076
}
2077
2078
int
2079
vm_domain_allocate(struct vm_domain *vmd, int req, int npages)
2080
{
2081
int req_class;
2082
2083
/*
2084
* The page daemon is allowed to dig deeper into the free page list.
2085
*/
2086
req_class = req & VM_ALLOC_CLASS_MASK;
2087
if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
2088
req_class = VM_ALLOC_SYSTEM;
2089
return (_vm_domain_allocate(vmd, req_class, npages));
2090
}
2091
2092
vm_page_t
2093
vm_page_alloc_domain_iter(vm_object_t object, vm_pindex_t pindex, int domain,
2094
int req, struct pctrie_iter *pages)
2095
{
2096
struct vm_domain *vmd;
2097
vm_page_t m;
2098
int flags;
2099
2100
#define VM_ALLOC_COMMON (VM_ALLOC_CLASS_MASK | VM_ALLOC_NODUMP | \
2101
VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | \
2102
VM_ALLOC_WIRED | VM_ALLOC_ZERO)
2103
#define VPA_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \
2104
VM_ALLOC_NOBUSY | VM_ALLOC_NOFREE | \
2105
VM_ALLOC_SBUSY)
2106
KASSERT((req & ~VPA_FLAGS) == 0,
2107
("invalid request %#x", req));
2108
KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
2109
(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
2110
("invalid request %#x", req));
2111
VM_OBJECT_ASSERT_WLOCKED(object);
2112
2113
flags = 0;
2114
m = NULL;
2115
if (!vm_pager_can_alloc_page(object, pindex))
2116
return (NULL);
2117
#if VM_NRESERVLEVEL > 0
2118
again:
2119
#endif
2120
if (__predict_false((req & VM_ALLOC_NOFREE) != 0)) {
2121
m = vm_page_alloc_nofree_domain(domain, req);
2122
if (m != NULL)
2123
goto found;
2124
}
2125
#if VM_NRESERVLEVEL > 0
2126
/*
2127
* Can we allocate the page from a reservation?
2128
*/
2129
if (vm_object_reserv(object) &&
2130
(m = vm_reserv_alloc_page(object, pindex, domain, req, pages)) !=
2131
NULL) {
2132
goto found;
2133
}
2134
#endif
2135
vmd = VM_DOMAIN(domain);
2136
if (vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone != NULL) {
2137
m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone,
2138
M_NOWAIT | M_NOVM);
2139
if (m != NULL) {
2140
flags |= PG_PCPU_CACHE;
2141
goto found;
2142
}
2143
}
2144
if (vm_domain_allocate(vmd, req, 1)) {
2145
/*
2146
* If not, allocate it from the free page queues.
2147
*/
2148
vm_domain_free_lock(vmd);
2149
m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0);
2150
vm_domain_free_unlock(vmd);
2151
if (m == NULL) {
2152
vm_domain_freecnt_inc(vmd, 1);
2153
#if VM_NRESERVLEVEL > 0
2154
if (vm_reserv_reclaim_inactive(domain))
2155
goto again;
2156
#endif
2157
}
2158
}
2159
if (m == NULL) {
2160
/*
2161
* Not allocatable, give up.
2162
*/
2163
(void)vm_domain_alloc_fail(vmd, object, req);
2164
if ((req & VM_ALLOC_WAITFAIL) != 0)
2165
pctrie_iter_reset(pages);
2166
return (NULL);
2167
}
2168
2169
/*
2170
* At this point we had better have found a good page.
2171
*/
2172
found:
2173
vm_page_dequeue(m);
2174
vm_page_alloc_check(m);
2175
2176
/*
2177
* Initialize the page. Only the PG_ZERO flag is inherited.
2178
*/
2179
flags |= m->flags & PG_ZERO;
2180
if ((req & VM_ALLOC_NODUMP) != 0)
2181
flags |= PG_NODUMP;
2182
if ((req & VM_ALLOC_NOFREE) != 0)
2183
flags |= PG_NOFREE;
2184
m->flags = flags;
2185
m->a.flags = 0;
2186
m->oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0;
2187
m->pool = VM_FREEPOOL_DEFAULT;
2188
if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
2189
m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
2190
else if ((req & VM_ALLOC_SBUSY) != 0)
2191
m->busy_lock = VPB_SHARERS_WORD(1);
2192
else
2193
m->busy_lock = VPB_UNBUSIED;
2194
if (req & VM_ALLOC_WIRED) {
2195
vm_wire_add(1);
2196
m->ref_count = 1;
2197
}
2198
m->a.act_count = 0;
2199
2200
if (vm_page_iter_insert(m, object, pindex, pages)) {
2201
if (req & VM_ALLOC_WIRED) {
2202
vm_wire_sub(1);
2203
m->ref_count = 0;
2204
}
2205
KASSERT(m->object == NULL, ("page %p has object", m));
2206
m->oflags = VPO_UNMANAGED;
2207
m->busy_lock = VPB_UNBUSIED;
2208
/* Don't change PG_ZERO. */
2209
vm_page_free_toq(m);
2210
if (req & VM_ALLOC_WAITFAIL) {
2211
VM_OBJECT_WUNLOCK(object);
2212
vm_radix_wait();
2213
pctrie_iter_reset(pages);
2214
VM_OBJECT_WLOCK(object);
2215
}
2216
return (NULL);
2217
}
2218
2219
/* Ignore device objects; the pager sets "memattr" for them. */
2220
if (object->memattr != VM_MEMATTR_DEFAULT &&
2221
(object->flags & OBJ_FICTITIOUS) == 0)
2222
pmap_page_set_memattr(m, object->memattr);
2223
2224
return (m);
2225
}
2226
2227
/*
2228
* vm_page_alloc_contig:
2229
*
2230
* Allocate a contiguous set of physical pages of the given size "npages"
2231
* from the free lists. All of the physical pages must be at or above
2232
* the given physical address "low" and below the given physical address
2233
* "high". The given value "alignment" determines the alignment of the
2234
* first physical page in the set. If the given value "boundary" is
2235
* non-zero, then the set of physical pages cannot cross any physical
2236
* address boundary that is a multiple of that value. Both "alignment"
2237
* and "boundary" must be a power of two.
2238
*
2239
* If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
2240
* then the memory attribute setting for the physical pages is configured
2241
* to the object's memory attribute setting. Otherwise, the memory
2242
* attribute setting for the physical pages is configured to "memattr",
2243
* overriding the object's memory attribute setting. However, if the
2244
* object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
2245
* memory attribute setting for the physical pages cannot be configured
2246
* to VM_MEMATTR_DEFAULT.
2247
*
2248
* The specified object may not contain fictitious pages.
2249
*
2250
* The caller must always specify an allocation class.
2251
*
2252
* allocation classes:
2253
* VM_ALLOC_NORMAL normal process request
2254
* VM_ALLOC_SYSTEM system *really* needs the pages
2255
* VM_ALLOC_INTERRUPT interrupt time request
2256
*
2257
* optional allocation flags:
2258
* VM_ALLOC_NOBUSY do not exclusive busy the pages
2259
* VM_ALLOC_NODUMP do not include the pages in a kernel core dump
2260
* VM_ALLOC_NORECLAIM do not reclaim after initial failure
2261
* VM_ALLOC_NOWAIT ignored (default behavior)
2262
* VM_ALLOC_SBUSY shared busy the allocated pages
2263
* VM_ALLOC_WAITFAIL in case of failure, sleep before returning
2264
* VM_ALLOC_WIRED wire the allocated pages
2265
* VM_ALLOC_ZERO prefer zeroed pages
2266
*/
2267
vm_page_t
2268
vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
2269
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
2270
vm_paddr_t boundary, vm_memattr_t memattr)
2271
{
2272
struct vm_domainset_iter di;
2273
vm_page_t bounds[2];
2274
vm_page_t m;
2275
int domain;
2276
int start_segind;
2277
2278
start_segind = -1;
2279
2280
if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
2281
return (NULL);
2282
2283
do {
2284
m = vm_page_alloc_contig_domain(object, pindex, domain, req,
2285
npages, low, high, alignment, boundary, memattr);
2286
if (m != NULL)
2287
break;
2288
if (start_segind == -1)
2289
start_segind = vm_phys_lookup_segind(low);
2290
if (vm_phys_find_range(bounds, start_segind, domain,
2291
npages, low, high) == -1) {
2292
vm_domainset_iter_ignore(&di, domain);
2293
}
2294
} while (vm_domainset_iter_page(&di, object, &domain, NULL) == 0);
2295
2296
return (m);
2297
}
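
/*
 * Illustrative sketch, not part of this file's implementation: one possible
 * vm_page_alloc_contig() request, asking for a run of "npages" wired pages
 * that lies in the low 4GB of physical memory, is aligned to its own size,
 * and does not cross a 4MB boundary.  The helper name and the particular
 * constraints are hypothetical placeholders; "npages" is assumed to be a
 * power of two so that the run can be aligned to its own size.
 */
static vm_page_t
vm_page_alloc_contig_sketch(vm_object_t object, vm_pindex_t pindex,
    u_long npages)
{
	VM_OBJECT_ASSERT_WLOCKED(object);
	return (vm_page_alloc_contig(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO, npages,
	    0, (vm_paddr_t)0xffffffff, ptoa(npages), 4 * 1024 * 1024,
	    VM_MEMATTR_DEFAULT));
}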
2298
2299
static vm_page_t
2300
vm_page_find_contig_domain(int domain, int req, u_long npages, vm_paddr_t low,
2301
vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
2302
{
2303
struct vm_domain *vmd;
2304
vm_page_t m_ret;
2305
2306
/*
2307
* Can we allocate the pages without the number of free pages falling
2308
* below the lower bound for the allocation class?
2309
*/
2310
vmd = VM_DOMAIN(domain);
2311
if (!vm_domain_allocate(vmd, req, npages))
2312
return (NULL);
2313
/*
2314
* Try to allocate the pages from the free page queues.
2315
*/
2316
vm_domain_free_lock(vmd);
2317
m_ret = vm_phys_alloc_contig(domain, npages, low, high,
2318
alignment, boundary);
2319
vm_domain_free_unlock(vmd);
2320
if (m_ret != NULL)
2321
return (m_ret);
2322
#if VM_NRESERVLEVEL > 0
2323
/*
2324
* Try to break a reservation to allocate the pages.
2325
*/
2326
if ((req & VM_ALLOC_NORECLAIM) == 0) {
2327
m_ret = vm_reserv_reclaim_contig(domain, npages, low,
2328
high, alignment, boundary);
2329
if (m_ret != NULL)
2330
return (m_ret);
2331
}
2332
#endif
2333
vm_domain_freecnt_inc(vmd, npages);
2334
return (NULL);
2335
}
2336
2337
vm_page_t
2338
vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
2339
int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
2340
vm_paddr_t boundary, vm_memattr_t memattr)
2341
{
2342
struct pctrie_iter pages;
2343
vm_page_t m, m_ret, mpred;
2344
u_int busy_lock, flags, oflags;
2345
2346
#define VPAC_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \
2347
VM_ALLOC_NOBUSY | VM_ALLOC_NORECLAIM | \
2348
VM_ALLOC_SBUSY)
2349
KASSERT((req & ~VPAC_FLAGS) == 0,
2350
("invalid request %#x", req));
2351
KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
2352
(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
2353
("invalid request %#x", req));
2354
VM_OBJECT_ASSERT_WLOCKED(object);
2355
KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
2356
("vm_page_alloc_contig: object %p has fictitious pages",
2357
object));
2358
KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
2359
2360
vm_page_iter_init(&pages, object);
2361
m_ret = NULL;
2362
#if VM_NRESERVLEVEL > 0
2363
/*
2364
* Can we allocate the pages from a reservation?
2365
*/
2366
if (vm_object_reserv(object)) {
2367
m_ret = vm_reserv_alloc_contig(object, pindex, domain,
2368
req, npages, low, high, alignment, boundary, &pages);
2369
}
2370
#endif
2371
if (m_ret == NULL) {
2372
m_ret = vm_page_find_contig_domain(domain, req, npages,
2373
low, high, alignment, boundary);
2374
}
2375
if (m_ret == NULL) {
2376
(void)vm_domain_alloc_fail(VM_DOMAIN(domain), object, req);
2377
return (NULL);
2378
}
2379
2380
/*
2381
* Initialize the pages. Only the PG_ZERO flag is inherited.
2382
*/
2383
flags = PG_ZERO;
2384
if ((req & VM_ALLOC_NODUMP) != 0)
2385
flags |= PG_NODUMP;
2386
oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0;
2387
if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
2388
busy_lock = VPB_CURTHREAD_EXCLUSIVE;
2389
else if ((req & VM_ALLOC_SBUSY) != 0)
2390
busy_lock = VPB_SHARERS_WORD(1);
2391
else
2392
busy_lock = VPB_UNBUSIED;
2393
if ((req & VM_ALLOC_WIRED) != 0)
2394
vm_wire_add(npages);
2395
if (object->memattr != VM_MEMATTR_DEFAULT &&
2396
memattr == VM_MEMATTR_DEFAULT)
2397
memattr = object->memattr;
2398
for (m = m_ret; m < &m_ret[npages]; m++) {
2399
vm_page_dequeue(m);
2400
vm_page_alloc_check(m);
2401
m->a.flags = 0;
2402
m->flags = (m->flags | PG_NODUMP) & flags;
2403
m->busy_lock = busy_lock;
2404
if ((req & VM_ALLOC_WIRED) != 0)
2405
m->ref_count = 1;
2406
m->a.act_count = 0;
2407
m->oflags = oflags;
2408
m->pool = VM_FREEPOOL_DEFAULT;
2409
if (vm_page_iter_insert(m, object, pindex, &pages)) {
2410
if ((req & VM_ALLOC_WIRED) != 0)
2411
vm_wire_sub(npages);
2412
KASSERT(m->object == NULL,
2413
("page %p has object", m));
2414
mpred = m;
2415
for (m = m_ret; m < &m_ret[npages]; m++) {
2416
if (m <= mpred &&
2417
(req & VM_ALLOC_WIRED) != 0)
2418
m->ref_count = 0;
2419
m->oflags = VPO_UNMANAGED;
2420
m->busy_lock = VPB_UNBUSIED;
2421
/* Don't change PG_ZERO. */
2422
vm_page_free_toq(m);
2423
}
2424
if (req & VM_ALLOC_WAITFAIL) {
2425
VM_OBJECT_WUNLOCK(object);
2426
vm_radix_wait();
2427
VM_OBJECT_WLOCK(object);
2428
}
2429
return (NULL);
2430
}
2431
if (memattr != VM_MEMATTR_DEFAULT)
2432
pmap_page_set_memattr(m, memattr);
2433
pindex++;
2434
}
2435
return (m_ret);
2436
}
2437
2438
/*
2439
* Allocate a physical page that is not intended to be inserted into a VM
2440
* object.
2441
*/
2442
vm_page_t
2443
vm_page_alloc_noobj_domain(int domain, int req)
2444
{
2445
struct vm_domain *vmd;
2446
vm_page_t m;
2447
int flags;
2448
2449
#define VPAN_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \
2450
VM_ALLOC_NOFREE | VM_ALLOC_WAITOK)
2451
KASSERT((req & ~VPAN_FLAGS) == 0,
2452
("invalid request %#x", req));
2453
2454
flags = ((req & VM_ALLOC_NODUMP) != 0 ? PG_NODUMP : 0) |
2455
((req & VM_ALLOC_NOFREE) != 0 ? PG_NOFREE : 0);
2456
vmd = VM_DOMAIN(domain);
2457
again:
2458
if (__predict_false((req & VM_ALLOC_NOFREE) != 0)) {
2459
m = vm_page_alloc_nofree_domain(domain, req);
2460
if (m != NULL)
2461
goto found;
2462
}
2463
2464
if (vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) {
2465
m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone,
2466
M_NOWAIT | M_NOVM);
2467
if (m != NULL) {
2468
flags |= PG_PCPU_CACHE;
2469
goto found;
2470
}
2471
}
2472
2473
if (vm_domain_allocate(vmd, req, 1)) {
2474
vm_domain_free_lock(vmd);
2475
m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0);
2476
vm_domain_free_unlock(vmd);
2477
if (m == NULL) {
2478
vm_domain_freecnt_inc(vmd, 1);
2479
#if VM_NRESERVLEVEL > 0
2480
if (vm_reserv_reclaim_inactive(domain))
2481
goto again;
2482
#endif
2483
}
2484
}
2485
if (m == NULL) {
2486
if (!vm_domain_alloc_fail(vmd, NULL, req))
2487
return (NULL);
2488
goto again;
2489
}
2490
2491
found:
2492
/*
2493
* If the page comes from the free page cache, then it might still
2494
* have a pending deferred dequeue. Specifically, when the page is
2495
* imported from a different pool by vm_phys_alloc_npages(), the
2496
* second, third, etc. pages in a non-zero order set could have
2497
* pending deferred dequeues.
2498
*/
2499
vm_page_dequeue(m);
2500
vm_page_alloc_check(m);
2501
2502
/*
2503
* Consumers should not rely on a useful default pindex value.
2504
*/
2505
m->pindex = 0xdeadc0dedeadc0de;
2506
m->flags = (m->flags & PG_ZERO) | flags;
2507
m->a.flags = 0;
2508
m->oflags = VPO_UNMANAGED;
2509
m->pool = VM_FREEPOOL_DIRECT;
2510
m->busy_lock = VPB_UNBUSIED;
2511
if ((req & VM_ALLOC_WIRED) != 0) {
2512
vm_wire_add(1);
2513
m->ref_count = 1;
2514
}
2515
2516
if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
2517
pmap_zero_page(m);
2518
2519
return (m);
2520
}
2521
2522
#if VM_NRESERVLEVEL > 1
2523
#define VM_NOFREE_IMPORT_ORDER (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER)
2524
#elif VM_NRESERVLEVEL > 0
2525
#define VM_NOFREE_IMPORT_ORDER VM_LEVEL_0_ORDER
2526
#else
2527
#define VM_NOFREE_IMPORT_ORDER 8
2528
#endif
2529
2530
/*
 * Allocate a single NOFREE page.
 *
 * This routine hands out NOFREE pages from higher-order
 * physical memory blocks in order to reduce memory fragmentation.
 * When a domain's NOFREE chunk is used up,
 * the routine will try to fetch a new one from the freelists
 * and discard the old one.
 */
2539
static vm_page_t __noinline
2540
vm_page_alloc_nofree_domain(int domain, int req)
2541
{
2542
vm_page_t m;
2543
struct vm_domain *vmd;
2544
2545
KASSERT((req & VM_ALLOC_NOFREE) != 0, ("invalid request %#x", req));
2546
2547
vmd = VM_DOMAIN(domain);
2548
vm_domain_free_lock(vmd);
2549
if (TAILQ_EMPTY(&vmd->vmd_nofreeq)) {
2550
int count;
2551
2552
count = 1 << VM_NOFREE_IMPORT_ORDER;
2553
if (!vm_domain_allocate(vmd, req, count)) {
2554
vm_domain_free_unlock(vmd);
2555
return (NULL);
2556
}
2557
m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
2558
VM_NOFREE_IMPORT_ORDER);
2559
if (m == NULL) {
2560
vm_domain_freecnt_inc(vmd, count);
2561
vm_domain_free_unlock(vmd);
2562
return (NULL);
2563
}
2564
m->ref_count = count - 1;
2565
TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m, plinks.q);
2566
atomic_add_long(&nofreeq_size, count);
2567
}
2568
m = TAILQ_FIRST(&vmd->vmd_nofreeq);
2569
TAILQ_REMOVE(&vmd->vmd_nofreeq, m, plinks.q);
2570
if (m->ref_count > 0) {
2571
vm_page_t m_next;
2572
2573
m_next = &m[1];
2574
vm_page_dequeue(m_next);
2575
m_next->ref_count = m->ref_count - 1;
2576
TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m_next, plinks.q);
2577
m->ref_count = 0;
2578
}
2579
vm_domain_free_unlock(vmd);
2580
atomic_add_long(&nofreeq_size, -1);
2581
VM_CNT_INC(v_nofree_count);
2582
2583
return (m);
2584
}
2585
2586
/*
 * Though NOFREE pages by definition should not be freed, we support putting
 * them aside for future NOFREE allocations.  This enables code which allocates
 * NOFREE pages for some purpose but then encounters an error and must release
 * its resources.
 */
2592
static void __noinline
2593
vm_page_free_nofree(struct vm_domain *vmd, vm_page_t m)
2594
{
2595
VM_CNT_ADD(v_nofree_count, -1);
2596
atomic_add_long(&nofreeq_size, 1);
2597
vm_domain_free_lock(vmd);
2598
MPASS(m->ref_count == 0);
2599
TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m, plinks.q);
2600
vm_domain_free_unlock(vmd);
2601
}
2602
2603
vm_page_t
2604
vm_page_alloc_noobj(int req)
2605
{
2606
struct vm_domainset_iter di;
2607
vm_page_t m;
2608
int domain;
2609
2610
if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
2611
return (NULL);
2612
2613
do {
2614
m = vm_page_alloc_noobj_domain(domain, req);
2615
if (m != NULL)
2616
break;
2617
} while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0);
2618
2619
return (m);
2620
}
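
/*
 * Illustrative sketch, not part of this file's implementation: allocating a
 * wired page that will never be inserted into a VM object, e.g. for a
 * private kernel data structure.  With VM_ALLOC_WAITOK the allocator sleeps
 * until a page is available.  Unlike vm_page_alloc(), the noobj variants
 * zero the page themselves when VM_ALLOC_ZERO is requested, so no PG_ZERO
 * check is needed here.  The helper name is hypothetical.
 */
static vm_page_t
vm_page_alloc_noobj_sketch(void)
{
	return (vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO | VM_ALLOC_NODUMP));
}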
2621
2622
vm_page_t
2623
vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low,
2624
vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
2625
vm_memattr_t memattr)
2626
{
2627
struct vm_domainset_iter di;
2628
vm_page_t m;
2629
int domain;
2630
2631
if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
2632
return (NULL);
2633
2634
do {
2635
m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low,
2636
high, alignment, boundary, memattr);
2637
if (m != NULL)
2638
break;
2639
} while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0);
2640
2641
return (m);
2642
}
2643
2644
vm_page_t
2645
vm_page_alloc_noobj_contig_domain(int domain, int req, u_long npages,
2646
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
2647
vm_memattr_t memattr)
2648
{
2649
vm_page_t m, m_ret;
2650
u_int flags;
2651
2652
#define VPANC_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \
2653
VM_ALLOC_NORECLAIM | VM_ALLOC_WAITOK)
2654
KASSERT((req & ~VPANC_FLAGS) == 0,
2655
("invalid request %#x", req));
2656
KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) !=
2657
(VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM),
2658
("invalid request %#x", req));
2659
KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
2660
2661
while ((m_ret = vm_page_find_contig_domain(domain, req, npages,
2662
low, high, alignment, boundary)) == NULL) {
2663
if (!vm_domain_alloc_fail(VM_DOMAIN(domain), NULL, req))
2664
return (NULL);
2665
}
2666
2667
/*
2668
* Initialize the pages. Only the PG_ZERO flag is inherited.
2669
*/
2670
flags = PG_ZERO;
2671
if ((req & VM_ALLOC_NODUMP) != 0)
2672
flags |= PG_NODUMP;
2673
if ((req & VM_ALLOC_WIRED) != 0)
2674
vm_wire_add(npages);
2675
for (m = m_ret; m < &m_ret[npages]; m++) {
2676
vm_page_dequeue(m);
2677
vm_page_alloc_check(m);
2678
2679
/*
2680
* Consumers should not rely on a useful default pindex value.
2681
*/
2682
m->pindex = 0xdeadc0dedeadc0de;
2683
m->a.flags = 0;
2684
m->flags = (m->flags | PG_NODUMP) & flags;
2685
m->busy_lock = VPB_UNBUSIED;
2686
if ((req & VM_ALLOC_WIRED) != 0)
2687
m->ref_count = 1;
2688
m->a.act_count = 0;
2689
m->oflags = VPO_UNMANAGED;
2690
m->pool = VM_FREEPOOL_DIRECT;
2691
2692
/*
2693
* Zero the page before updating any mappings since the page is
2694
* not yet shared with any devices which might require the
2695
* non-default memory attribute. pmap_page_set_memattr()
2696
* flushes data caches before returning.
2697
*/
2698
if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
2699
pmap_zero_page(m);
2700
if (memattr != VM_MEMATTR_DEFAULT)
2701
pmap_page_set_memattr(m, memattr);
2702
}
2703
return (m_ret);
2704
}
2705
2706
/*
2707
* Check a page that has been freshly dequeued from a freelist.
2708
*/
2709
static void
2710
vm_page_alloc_check(vm_page_t m)
2711
{
2712
2713
KASSERT(m->object == NULL, ("page %p has object", m));
2714
KASSERT(m->a.queue == PQ_NONE &&
2715
(m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
2716
("page %p has unexpected queue %d, flags %#x",
2717
m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK)));
2718
KASSERT(m->ref_count == 0, ("page %p has references", m));
2719
KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m));
2720
KASSERT(m->dirty == 0, ("page %p is dirty", m));
2721
KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
2722
("page %p has unexpected memattr %d",
2723
m, pmap_page_get_memattr(m)));
2724
KASSERT(vm_page_none_valid(m), ("free page %p is valid", m));
2725
pmap_vm_page_alloc_check(m);
2726
}
2727
2728
static int
2729
vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags)
2730
{
2731
struct vm_domain *vmd;
2732
struct vm_pgcache *pgcache;
2733
int i;
2734
2735
pgcache = arg;
2736
vmd = VM_DOMAIN(pgcache->domain);
2737
2738
/*
2739
* The page daemon should avoid creating extra memory pressure since its
2740
* main purpose is to replenish the store of free pages.
2741
*/
2742
if (vmd->vmd_severeset || curproc == pageproc ||
2743
!_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt))
2744
return (0);
2745
domain = vmd->vmd_domain;
2746
vm_domain_free_lock(vmd);
2747
i = vm_phys_alloc_npages(domain, pgcache->pool, cnt,
2748
(vm_page_t *)store);
2749
vm_domain_free_unlock(vmd);
2750
if (cnt != i)
2751
vm_domain_freecnt_inc(vmd, cnt - i);
2752
2753
return (i);
2754
}
2755
2756
static void
2757
vm_page_zone_release(void *arg, void **store, int cnt)
2758
{
2759
struct vm_domain *vmd;
2760
struct vm_pgcache *pgcache;
2761
vm_page_t m;
2762
int i;
2763
2764
pgcache = arg;
2765
vmd = VM_DOMAIN(pgcache->domain);
2766
vm_domain_free_lock(vmd);
2767
for (i = 0; i < cnt; i++) {
2768
m = (vm_page_t)store[i];
2769
vm_phys_free_pages(m, pgcache->pool, 0);
2770
}
2771
vm_domain_free_unlock(vmd);
2772
vm_domain_freecnt_inc(vmd, cnt);
2773
}
2774
2775
#define VPSC_ANY 0 /* No restrictions. */
2776
#define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */
2777
#define VPSC_NOSUPER 2 /* Skip superpages. */
2778
2779
/*
2780
* vm_page_scan_contig:
2781
*
2782
* Scan vm_page_array[] between the specified entries "m_start" and
2783
* "m_end" for a run of contiguous physical pages that satisfy the
2784
* specified conditions, and return the lowest page in the run. The
2785
* specified "alignment" determines the alignment of the lowest physical
2786
* page in the run. If the specified "boundary" is non-zero, then the
2787
* run of physical pages cannot span a physical address that is a
2788
* multiple of "boundary".
2789
*
2790
* "m_end" is never dereferenced, so it need not point to a vm_page
2791
* structure within vm_page_array[].
2792
*
2793
* "npages" must be greater than zero. "m_start" and "m_end" must not
2794
* span a hole (or discontiguity) in the physical address space. Both
2795
* "alignment" and "boundary" must be a power of two.
2796
*/
2797
static vm_page_t
2798
vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
2799
u_long alignment, vm_paddr_t boundary, int options)
2800
{
2801
vm_object_t object;
2802
vm_paddr_t pa;
2803
vm_page_t m, m_run;
2804
#if VM_NRESERVLEVEL > 0
2805
int level;
2806
#endif
2807
int m_inc, order, run_ext, run_len;
2808
2809
KASSERT(npages > 0, ("npages is 0"));
2810
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
2811
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
2812
m_run = NULL;
2813
run_len = 0;
2814
for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
2815
KASSERT((m->flags & PG_MARKER) == 0,
2816
("page %p is PG_MARKER", m));
2817
KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1,
2818
("fictitious page %p has invalid ref count", m));
2819
2820
/*
2821
* If the current page would be the start of a run, check its
2822
* physical address against the end, alignment, and boundary
2823
* conditions. If it doesn't satisfy these conditions, either
2824
* terminate the scan or advance to the next page that
2825
* satisfies the failed condition.
2826
*/
2827
if (run_len == 0) {
2828
KASSERT(m_run == NULL, ("m_run != NULL"));
2829
if (m + npages > m_end)
2830
break;
2831
pa = VM_PAGE_TO_PHYS(m);
2832
if (!vm_addr_align_ok(pa, alignment)) {
2833
m_inc = atop(roundup2(pa, alignment) - pa);
2834
continue;
2835
}
2836
if (!vm_addr_bound_ok(pa, ptoa(npages), boundary)) {
2837
m_inc = atop(roundup2(pa, boundary) - pa);
2838
continue;
2839
}
2840
} else
2841
KASSERT(m_run != NULL, ("m_run == NULL"));
2842
2843
retry:
2844
m_inc = 1;
2845
if (vm_page_wired(m))
2846
run_ext = 0;
2847
#if VM_NRESERVLEVEL > 0
2848
else if ((level = vm_reserv_level(m)) >= 0 &&
2849
(options & VPSC_NORESERV) != 0) {
2850
run_ext = 0;
2851
/* Advance to the end of the reservation. */
2852
pa = VM_PAGE_TO_PHYS(m);
2853
m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
2854
pa);
2855
}
2856
#endif
2857
else if ((object = atomic_load_ptr(&m->object)) != NULL) {
2858
/*
2859
* The page is considered eligible for relocation if
2860
* and only if it could be laundered or reclaimed by
2861
* the page daemon.
2862
*/
2863
VM_OBJECT_RLOCK(object);
2864
if (object != m->object) {
2865
VM_OBJECT_RUNLOCK(object);
2866
goto retry;
2867
}
2868
/* Don't care: PG_NODUMP, PG_ZERO. */
2869
if ((object->flags & OBJ_SWAP) == 0 &&
2870
object->type != OBJT_VNODE) {
2871
run_ext = 0;
2872
#if VM_NRESERVLEVEL > 0
2873
} else if ((options & VPSC_NOSUPER) != 0 &&
2874
(level = vm_reserv_level_iffullpop(m)) >= 0) {
2875
run_ext = 0;
2876
/* Advance to the end of the superpage. */
2877
pa = VM_PAGE_TO_PHYS(m);
2878
m_inc = atop(roundup2(pa + 1,
2879
vm_reserv_size(level)) - pa);
2880
#endif
2881
} else if (object->memattr == VM_MEMATTR_DEFAULT &&
2882
vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) {
2883
/*
2884
* The page is allocated but eligible for
2885
* relocation. Extend the current run by one
2886
* page.
2887
*/
2888
KASSERT(pmap_page_get_memattr(m) ==
2889
VM_MEMATTR_DEFAULT,
2890
("page %p has an unexpected memattr", m));
2891
KASSERT((m->oflags & (VPO_SWAPINPROG |
2892
VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
2893
("page %p has unexpected oflags", m));
2894
/* Don't care: PGA_NOSYNC. */
2895
run_ext = 1;
2896
} else
2897
run_ext = 0;
2898
VM_OBJECT_RUNLOCK(object);
2899
#if VM_NRESERVLEVEL > 0
2900
} else if (level >= 0) {
2901
/*
2902
* The page is reserved but not yet allocated. In
2903
* other words, it is still free. Extend the current
2904
* run by one page.
2905
*/
2906
run_ext = 1;
2907
#endif
2908
} else if ((order = m->order) < VM_NFREEORDER) {
2909
/*
2910
* The page is enqueued in the physical memory
2911
* allocator's free page queues. Moreover, it is the
2912
* first page in a power-of-two-sized run of
2913
* contiguous free pages. Add these pages to the end
2914
* of the current run, and jump ahead.
2915
*/
2916
run_ext = 1 << order;
2917
m_inc = 1 << order;
2918
} else {
2919
/*
2920
* Skip the page for one of the following reasons: (1)
2921
* It is enqueued in the physical memory allocator's
2922
* free page queues. However, it is not the first
2923
* page in a run of contiguous free pages. (This case
2924
* rarely occurs because the scan is performed in
2925
* ascending order.) (2) It is not reserved, and it is
2926
* transitioning from free to allocated. (Conversely,
2927
* the transition from allocated to free for managed
2928
* pages is blocked by the page busy lock.) (3) It is
2929
* allocated but not contained by an object and not
2930
* wired, e.g., allocated by Xen's balloon driver.
2931
*/
2932
run_ext = 0;
2933
}
2934
2935
/*
2936
* Extend or reset the current run of pages.
2937
*/
2938
if (run_ext > 0) {
2939
if (run_len == 0)
2940
m_run = m;
2941
run_len += run_ext;
2942
} else {
2943
if (run_len > 0) {
2944
m_run = NULL;
2945
run_len = 0;
2946
}
2947
}
2948
}
2949
if (run_len >= npages)
2950
return (m_run);
2951
return (NULL);
2952
}
2953
2954
/*
2955
* vm_page_reclaim_run:
2956
*
2957
* Try to relocate each of the allocated virtual pages within the
2958
* specified run of physical pages to a new physical address. Free the
2959
* physical pages underlying the relocated virtual pages. A virtual page
2960
* is relocatable if and only if it could be laundered or reclaimed by
2961
* the page daemon. Whenever possible, a virtual page is relocated to a
2962
* physical address above "high".
2963
*
2964
* Returns 0 if every physical page within the run was already free or
2965
* just freed by a successful relocation. Otherwise, returns a non-zero
2966
* value indicating why the last attempt to relocate a virtual page was
2967
* unsuccessful.
2968
*
2969
* "req_class" must be an allocation class.
2970
*/
2971
static int
2972
vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
2973
vm_paddr_t high)
2974
{
2975
struct vm_domain *vmd;
2976
struct spglist free;
2977
vm_object_t object;
2978
vm_paddr_t pa;
2979
vm_page_t m, m_end, m_new;
2980
int error, order, req;
2981
2982
KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
2983
("req_class is not an allocation class"));
2984
SLIST_INIT(&free);
2985
error = 0;
2986
m = m_run;
2987
m_end = m_run + npages;
2988
for (; error == 0 && m < m_end; m++) {
2989
KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
2990
("page %p is PG_FICTITIOUS or PG_MARKER", m));
2991
2992
/*
2993
* Racily check for wirings. Races are handled once the object
2994
* lock is held and the page is unmapped.
2995
*/
2996
if (vm_page_wired(m))
2997
error = EBUSY;
2998
else if ((object = atomic_load_ptr(&m->object)) != NULL) {
2999
/*
3000
* The page is relocated if and only if it could be
3001
* laundered or reclaimed by the page daemon.
3002
*/
3003
VM_OBJECT_WLOCK(object);
3004
/* Don't care: PG_NODUMP, PG_ZERO. */
3005
if (m->object != object ||
3006
((object->flags & OBJ_SWAP) == 0 &&
3007
object->type != OBJT_VNODE))
3008
error = EINVAL;
3009
else if (object->memattr != VM_MEMATTR_DEFAULT)
3010
error = EINVAL;
3011
else if (vm_page_queue(m) != PQ_NONE &&
3012
vm_page_tryxbusy(m) != 0) {
3013
if (vm_page_wired(m)) {
3014
vm_page_xunbusy(m);
3015
error = EBUSY;
3016
goto unlock;
3017
}
3018
KASSERT(pmap_page_get_memattr(m) ==
3019
VM_MEMATTR_DEFAULT,
3020
("page %p has an unexpected memattr", m));
3021
KASSERT(m->oflags == 0,
3022
("page %p has unexpected oflags", m));
3023
/* Don't care: PGA_NOSYNC. */
3024
if (!vm_page_none_valid(m)) {
3025
/*
3026
* First, try to allocate a new page
3027
* that is above "high". Failing
3028
* that, try to allocate a new page
3029
* that is below "m_run". Allocate
3030
* the new page between the end of
3031
* "m_run" and "high" only as a last
3032
* resort.
3033
*/
3034
req = req_class;
3035
if ((m->flags & PG_NODUMP) != 0)
3036
req |= VM_ALLOC_NODUMP;
3037
if (trunc_page(high) !=
3038
~(vm_paddr_t)PAGE_MASK) {
3039
m_new =
3040
vm_page_alloc_noobj_contig(
3041
req, 1, round_page(high),
3042
~(vm_paddr_t)0, PAGE_SIZE,
3043
0, VM_MEMATTR_DEFAULT);
3044
} else
3045
m_new = NULL;
3046
if (m_new == NULL) {
3047
pa = VM_PAGE_TO_PHYS(m_run);
3048
m_new =
3049
vm_page_alloc_noobj_contig(
3050
req, 1, 0, pa - 1,
3051
PAGE_SIZE, 0,
3052
VM_MEMATTR_DEFAULT);
3053
}
3054
if (m_new == NULL) {
3055
pa += ptoa(npages);
3056
m_new =
3057
vm_page_alloc_noobj_contig(
3058
req, 1, pa, high, PAGE_SIZE,
3059
0, VM_MEMATTR_DEFAULT);
3060
}
3061
if (m_new == NULL) {
3062
vm_page_xunbusy(m);
3063
error = ENOMEM;
3064
goto unlock;
3065
}
3066
3067
/*
3068
* Unmap the page and check for new
3069
* wirings that may have been acquired
3070
* through a pmap lookup.
3071
*/
3072
if (object->ref_count != 0 &&
3073
!vm_page_try_remove_all(m)) {
3074
vm_page_xunbusy(m);
3075
vm_page_free(m_new);
3076
error = EBUSY;
3077
goto unlock;
3078
}
3079
3080
/*
3081
* Replace "m" with the new page. For
3082
* vm_page_replace(), "m" must be busy
3083
* and dequeued. Finally, change "m"
3084
* as if vm_page_free() was called.
3085
*/
3086
m_new->a.flags = m->a.flags &
3087
~PGA_QUEUE_STATE_MASK;
3088
KASSERT(m_new->oflags == VPO_UNMANAGED,
3089
("page %p is managed", m_new));
3090
m_new->oflags = 0;
3091
pmap_copy_page(m, m_new);
3092
m_new->valid = m->valid;
3093
m_new->dirty = m->dirty;
3094
m->flags &= ~PG_ZERO;
3095
vm_page_dequeue(m);
3096
if (vm_page_replace_hold(m_new, object,
3097
m->pindex, m) &&
3098
vm_page_free_prep(m))
3099
SLIST_INSERT_HEAD(&free, m,
3100
plinks.s.ss);
3101
3102
/*
3103
* The new page must be deactivated
3104
* before the object is unlocked.
3105
*/
3106
vm_page_deactivate(m_new);
3107
} else {
3108
m->flags &= ~PG_ZERO;
3109
vm_page_dequeue(m);
3110
if (vm_page_free_prep(m))
3111
SLIST_INSERT_HEAD(&free, m,
3112
plinks.s.ss);
3113
KASSERT(m->dirty == 0,
3114
("page %p is dirty", m));
3115
}
3116
} else
3117
error = EBUSY;
3118
unlock:
3119
VM_OBJECT_WUNLOCK(object);
3120
} else {
3121
MPASS(vm_page_domain(m) == domain);
3122
vmd = VM_DOMAIN(domain);
3123
vm_domain_free_lock(vmd);
3124
order = m->order;
3125
if (order < VM_NFREEORDER) {
3126
/*
3127
* The page is enqueued in the physical memory
3128
* allocator's free page queues. Moreover, it
3129
* is the first page in a power-of-two-sized
3130
* run of contiguous free pages. Jump ahead
3131
* to the last page within that run, and
3132
* continue from there.
3133
*/
3134
m += (1 << order) - 1;
3135
}
3136
#if VM_NRESERVLEVEL > 0
3137
else if (vm_reserv_is_page_free(m))
3138
order = 0;
3139
#endif
3140
vm_domain_free_unlock(vmd);
3141
if (order == VM_NFREEORDER)
3142
error = EINVAL;
3143
}
3144
}
3145
if ((m = SLIST_FIRST(&free)) != NULL) {
3146
int cnt;
3147
3148
vmd = VM_DOMAIN(domain);
3149
cnt = 0;
3150
vm_domain_free_lock(vmd);
3151
do {
3152
MPASS(vm_page_domain(m) == domain);
3153
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3154
vm_phys_free_pages(m, m->pool, 0);
3155
cnt++;
3156
} while ((m = SLIST_FIRST(&free)) != NULL);
3157
vm_domain_free_unlock(vmd);
3158
vm_domain_freecnt_inc(vmd, cnt);
3159
}
3160
return (error);
3161
}
3162
3163
#define NRUNS 16
3164
3165
#define RUN_INDEX(count, nruns) ((count) % (nruns))
3166
3167
#define MIN_RECLAIM 8
3168
3169
/*
3170
* vm_page_reclaim_contig:
3171
*
3172
* Reclaim allocated, contiguous physical memory satisfying the specified
3173
* conditions by relocating the virtual pages using that physical memory.
3174
* Returns 0 if reclamation is successful, ERANGE if the specified domain
3175
* can't possibly satisfy the reclamation request, or ENOMEM if not
3176
* currently able to reclaim the requested number of pages. Since
3177
* relocation requires the allocation of physical pages, reclamation may
3178
* fail with ENOMEM due to a shortage of free pages. When reclamation
3179
* fails in this manner, callers are expected to perform vm_wait() before
3180
* retrying a failed allocation operation, e.g., vm_page_alloc_contig().
3181
*
3182
* The caller must always specify an allocation class through "req".
3183
*
3184
* allocation classes:
3185
* VM_ALLOC_NORMAL normal process request
3186
* VM_ALLOC_SYSTEM system *really* needs a page
3187
* VM_ALLOC_INTERRUPT interrupt time request
3188
*
3189
* The optional allocation flags are ignored.
3190
*
3191
* "npages" must be greater than zero. Both "alignment" and "boundary"
3192
* must be a power of two.
3193
*/
3194
int
3195
vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
3196
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
3197
int desired_runs)
3198
{
3199
struct vm_domain *vmd;
3200
vm_page_t bounds[2], m_run, _m_runs[NRUNS], *m_runs;
3201
u_long count, minalign, reclaimed;
3202
int error, i, min_reclaim, nruns, options, req_class;
3203
int segind, start_segind;
3204
int ret;
3205
3206
KASSERT(npages > 0, ("npages is 0"));
3207
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
3208
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
3209
3210
ret = ENOMEM;
3211
3212
/*
3213
* If the caller wants to reclaim multiple runs, try to allocate
3214
* space to store the runs. If that fails, fall back to the old
3215
* behavior of just reclaiming MIN_RECLAIM pages.
3216
*/
3217
if (desired_runs > 1)
3218
m_runs = malloc((NRUNS + desired_runs) * sizeof(*m_runs),
3219
M_TEMP, M_NOWAIT);
3220
else
3221
m_runs = NULL;
3222
3223
if (m_runs == NULL) {
3224
m_runs = _m_runs;
3225
nruns = NRUNS;
3226
} else {
3227
nruns = NRUNS + desired_runs - 1;
3228
}
3229
min_reclaim = MAX(desired_runs * npages, MIN_RECLAIM);
3230
3231
/*
3232
* The caller will attempt an allocation after some runs have been
3233
* reclaimed and added to the vm_phys buddy lists. Due to limitations
3234
* of vm_phys_alloc_contig(), round up the requested length to the next
3235
* power of two or maximum chunk size, and ensure that each run is
3236
* suitably aligned.
3237
*/
3238
minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1);
3239
npages = roundup2(npages, minalign);
3240
if (alignment < ptoa(minalign))
3241
alignment = ptoa(minalign);
3242
3243
/*
3244
* The page daemon is allowed to dig deeper into the free page list.
3245
*/
3246
req_class = req & VM_ALLOC_CLASS_MASK;
3247
if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
3248
req_class = VM_ALLOC_SYSTEM;
3249
3250
start_segind = vm_phys_lookup_segind(low);
3251
3252
/*
3253
* Return if the number of free pages cannot satisfy the requested
3254
* allocation.
3255
*/
3256
vmd = VM_DOMAIN(domain);
3257
count = vmd->vmd_free_count;
3258
if (count < npages + vmd->vmd_free_reserved || (count < npages +
3259
vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
3260
(count < npages && req_class == VM_ALLOC_INTERRUPT))
3261
goto done;
3262
3263
/*
3264
* Scan up to three times, relaxing the restrictions ("options") on
3265
* the reclamation of reservations and superpages each time.
3266
*/
3267
for (options = VPSC_NORESERV;;) {
3268
bool phys_range_exists = false;
3269
3270
/*
3271
* Find the highest runs that satisfy the given constraints
3272
* and restrictions, and record them in "m_runs".
3273
*/
3274
count = 0;
3275
segind = start_segind;
3276
while ((segind = vm_phys_find_range(bounds, segind, domain,
3277
npages, low, high)) != -1) {
3278
phys_range_exists = true;
3279
while ((m_run = vm_page_scan_contig(npages, bounds[0],
3280
bounds[1], alignment, boundary, options))) {
3281
bounds[0] = m_run + npages;
3282
m_runs[RUN_INDEX(count, nruns)] = m_run;
3283
count++;
3284
}
3285
segind++;
3286
}
3287
3288
if (!phys_range_exists) {
3289
ret = ERANGE;
3290
goto done;
3291
}
3292
3293
/*
3294
* Reclaim the highest runs in LIFO (descending) order until
3295
* the number of reclaimed pages, "reclaimed", is at least
3296
* "min_reclaim". Reset "reclaimed" each time because each
3297
* reclamation is idempotent, and runs will (likely) recur
3298
* from one scan to the next as restrictions are relaxed.
3299
*/
3300
reclaimed = 0;
3301
for (i = 0; count > 0 && i < nruns; i++) {
3302
count--;
3303
m_run = m_runs[RUN_INDEX(count, nruns)];
3304
error = vm_page_reclaim_run(req_class, domain, npages,
3305
m_run, high);
3306
if (error == 0) {
3307
reclaimed += npages;
3308
if (reclaimed >= min_reclaim) {
3309
ret = 0;
3310
goto done;
3311
}
3312
}
3313
}
3314
3315
/*
3316
* Either relax the restrictions on the next scan or return if
3317
* the last scan had no restrictions.
3318
*/
3319
if (options == VPSC_NORESERV)
3320
options = VPSC_NOSUPER;
3321
else if (options == VPSC_NOSUPER)
3322
options = VPSC_ANY;
3323
else if (options == VPSC_ANY) {
3324
if (reclaimed != 0)
3325
ret = 0;
3326
goto done;
3327
}
3328
}
3329
done:
3330
if (m_runs != _m_runs)
3331
free(m_runs, M_TEMP);
3332
return (ret);
3333
}
3334
3335
int
3336
vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
3337
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
3338
{
3339
return (vm_page_reclaim_contig_domain_ext(domain, req, npages, low,
3340
high, alignment, boundary, 1));
3341
}
3342
3343
int
3344
vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
3345
u_long alignment, vm_paddr_t boundary)
3346
{
3347
struct vm_domainset_iter di;
3348
int domain, ret, status;
3349
3350
ret = ERANGE;
3351
3352
if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
3353
return (ret);
3354
3355
do {
3356
status = vm_page_reclaim_contig_domain(domain, req, npages, low,
3357
high, alignment, boundary);
3358
if (status == 0)
3359
return (0);
3360
else if (status == ERANGE)
3361
vm_domainset_iter_ignore(&di, domain);
3362
else {
3363
KASSERT(status == ENOMEM, ("Unrecognized error %d "
3364
"from vm_page_reclaim_contig_domain()", status));
3365
ret = ENOMEM;
3366
}
3367
} while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0);
3368
3369
return (ret);
3370
}
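
/*
 * Illustrative sketch, not part of this file's implementation: the retry
 * pattern described above for contiguous allocations.  When the allocation
 * fails, try to reclaim a suitable run; ENOMEM from the reclaim indicates a
 * transient shortage, so perform vm_wait() and retry, while ERANGE means the
 * request can never be satisfied.  The helper name is hypothetical.
 */
static vm_page_t
vm_page_contig_retry_sketch(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_page_t m;
	int error;

	for (;;) {
		m = vm_page_alloc_noobj_contig(
		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED, npages, low, high,
		    alignment, boundary, VM_MEMATTR_DEFAULT);
		if (m != NULL)
			return (m);
		error = vm_page_reclaim_contig(VM_ALLOC_NORMAL, npages, low,
		    high, alignment, boundary);
		if (error == ERANGE)
			return (NULL);
		if (error == ENOMEM)
			vm_wait(NULL);
		/* error == 0: a run was freed back to vm_phys; retry. */
	}
}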
3371
3372
/*
3373
* Set the domain in the appropriate page level domainset.
3374
*/
3375
void
3376
vm_domain_set(struct vm_domain *vmd)
3377
{
3378
3379
mtx_lock(&vm_domainset_lock);
3380
if (!vmd->vmd_minset && vm_paging_min(vmd)) {
3381
vmd->vmd_minset = 1;
3382
DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains);
3383
}
3384
if (!vmd->vmd_severeset && vm_paging_severe(vmd)) {
3385
vmd->vmd_severeset = 1;
3386
DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains);
3387
}
3388
mtx_unlock(&vm_domainset_lock);
3389
}
3390
3391
/*
3392
* Clear the domain from the appropriate page level domainset.
3393
*/
3394
void
3395
vm_domain_clear(struct vm_domain *vmd)
3396
{
3397
3398
mtx_lock(&vm_domainset_lock);
3399
if (vmd->vmd_minset && !vm_paging_min(vmd)) {
3400
vmd->vmd_minset = 0;
3401
DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains);
3402
if (vm_min_waiters != 0) {
3403
vm_min_waiters = 0;
3404
wakeup(&vm_min_domains);
3405
}
3406
}
3407
if (vmd->vmd_severeset && !vm_paging_severe(vmd)) {
3408
vmd->vmd_severeset = 0;
3409
DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains);
3410
if (vm_severe_waiters != 0) {
3411
vm_severe_waiters = 0;
3412
wakeup(&vm_severe_domains);
3413
}
3414
}
3415
3416
	/*
	 * If the pageout daemon needs pages, then tell it that there are
	 * some free.
	 */
3420
if (vmd->vmd_pageout_pages_needed &&
3421
vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
3422
wakeup(&vmd->vmd_pageout_pages_needed);
3423
vmd->vmd_pageout_pages_needed = 0;
3424
}
3425
3426
/* See comments in vm_wait_doms(). */
3427
if (vm_pageproc_waiters) {
3428
vm_pageproc_waiters = 0;
3429
wakeup(&vm_pageproc_waiters);
3430
}
3431
mtx_unlock(&vm_domainset_lock);
3432
}
3433
3434
/*
3435
* Wait for free pages to exceed the min threshold globally.
3436
*/
3437
void
3438
vm_wait_min(void)
3439
{
3440
3441
mtx_lock(&vm_domainset_lock);
3442
while (vm_page_count_min()) {
3443
vm_min_waiters++;
3444
msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0);
3445
}
3446
mtx_unlock(&vm_domainset_lock);
3447
}
3448
3449
/*
3450
* Wait for free pages to exceed the severe threshold globally.
3451
*/
3452
void
3453
vm_wait_severe(void)
3454
{
3455
3456
mtx_lock(&vm_domainset_lock);
3457
while (vm_page_count_severe()) {
3458
vm_severe_waiters++;
3459
msleep(&vm_severe_domains, &vm_domainset_lock, PVM,
3460
"vmwait", 0);
3461
}
3462
mtx_unlock(&vm_domainset_lock);
3463
}
3464
3465
u_int
3466
vm_wait_count(void)
3467
{
3468
3469
return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters);
3470
}
3471
3472
int
3473
vm_wait_doms(const domainset_t *wdoms, int mflags)
3474
{
3475
int error;
3476
3477
error = 0;
3478
3479
	/*
	 * We use racy wakeup synchronization to avoid expensive global
	 * locking for the pageproc when sleeping with a non-specific vm_wait.
	 * To handle this, we only sleep for one tick in this instance.  It
	 * is expected that most allocations for the pageproc will come from
	 * kmem or vm_page_grab* which will use the more specific and
	 * race-free vm_wait_domain().
	 */
3487
if (curproc == pageproc) {
3488
mtx_lock(&vm_domainset_lock);
3489
vm_pageproc_waiters++;
3490
error = msleep(&vm_pageproc_waiters, &vm_domainset_lock,
3491
PVM | PDROP | mflags, "pageprocwait", 1);
3492
} else {
3493
/*
3494
* XXX Ideally we would wait only until the allocation could
3495
* be satisfied. This condition can cause new allocators to
3496
* consume all freed pages while old allocators wait.
3497
*/
3498
mtx_lock(&vm_domainset_lock);
3499
if (vm_page_count_min_set(wdoms)) {
3500
if (pageproc == NULL)
3501
panic("vm_wait in early boot");
3502
vm_min_waiters++;
3503
error = msleep(&vm_min_domains, &vm_domainset_lock,
3504
PVM | PDROP | mflags, "vmwait", 0);
3505
} else
3506
mtx_unlock(&vm_domainset_lock);
3507
}
3508
return (error);
3509
}
3510
3511
/*
3512
* vm_wait_domain:
3513
*
3514
* Sleep until free pages are available for allocation.
3515
* - Called in various places after failed memory allocations.
3516
*/
3517
void
3518
vm_wait_domain(int domain)
3519
{
3520
struct vm_domain *vmd;
3521
domainset_t wdom;
3522
3523
vmd = VM_DOMAIN(domain);
3524
vm_domain_free_assert_unlocked(vmd);
3525
3526
if (curproc == pageproc) {
3527
mtx_lock(&vm_domainset_lock);
3528
if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) {
3529
vmd->vmd_pageout_pages_needed = 1;
3530
msleep(&vmd->vmd_pageout_pages_needed,
3531
&vm_domainset_lock, PDROP | PSWP, "VMWait", 0);
3532
} else
3533
mtx_unlock(&vm_domainset_lock);
3534
} else {
3535
DOMAINSET_ZERO(&wdom);
3536
DOMAINSET_SET(vmd->vmd_domain, &wdom);
3537
vm_wait_doms(&wdom, 0);
3538
}
3539
}
3540
3541
static int
3542
vm_wait_flags(vm_object_t obj, int mflags)
3543
{
3544
struct domainset *d;
3545
3546
d = NULL;
3547
3548
	/*
	 * Carefully fetch pointers only once: the struct domainset
	 * itself is immutable but the pointer might change.
	 */
3552
if (obj != NULL)
3553
d = obj->domain.dr_policy;
3554
if (d == NULL)
3555
d = curthread->td_domain.dr_policy;
3556
3557
return (vm_wait_doms(&d->ds_mask, mflags));
3558
}
3559
3560
/*
3561
* vm_wait:
3562
*
3563
* Sleep until free pages are available for allocation in the
3564
* affinity domains of the obj. If obj is NULL, the domain set
3565
* for the calling thread is used.
3566
* Called in various places after failed memory allocations.
3567
*/
3568
void
3569
vm_wait(vm_object_t obj)
3570
{
3571
(void)vm_wait_flags(obj, 0);
3572
}
3573
3574
int
3575
vm_wait_intr(vm_object_t obj)
3576
{
3577
return (vm_wait_flags(obj, PCATCH));
3578
}
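
/*
 * Illustrative sketch, not part of this file's implementation: an
 * interruptible variant of the usual allocation retry loop.  If the sleep
 * in vm_wait_intr() is interrupted by a signal, the allocation is abandoned
 * and NULL is returned.  The helper name is hypothetical.
 */
static vm_page_t
vm_page_alloc_intr_sketch(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_WLOCK(object);
	while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL)) == NULL) {
		VM_OBJECT_WUNLOCK(object);
		if (vm_wait_intr(object) != 0)
			return (NULL);
		VM_OBJECT_WLOCK(object);
	}
	VM_OBJECT_WUNLOCK(object);
	return (m);
}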
3579
3580
/*
 *	vm_domain_alloc_fail:
 *
 *	Called when a page allocation function fails.  Informs the
 *	pagedaemon and performs the requested wait.  Requires the object
 *	lock on entry; the domain free lock must not be held.  Returns with
 *	the object lock held, though it may be dropped and reacquired around
 *	the wait.  Returns an error when a retry is necessary.
 */
3590
static int
3591
vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
3592
{
3593
3594
vm_domain_free_assert_unlocked(vmd);
3595
3596
atomic_add_int(&vmd->vmd_pageout_deficit,
3597
max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
3598
if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
3599
if (object != NULL)
3600
VM_OBJECT_WUNLOCK(object);
3601
vm_wait_domain(vmd->vmd_domain);
3602
if (object != NULL)
3603
VM_OBJECT_WLOCK(object);
3604
if (req & VM_ALLOC_WAITOK)
3605
return (EAGAIN);
3606
}
3607
3608
return (0);
3609
}
3610
3611
/*
3612
* vm_waitpfault:
3613
*
3614
* Sleep until free pages are available for allocation.
3615
* - Called only in vm_fault so that processes page faulting
3616
* can be easily tracked.
3617
* - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
3618
* processes will be able to grab memory first. Do not change
3619
* this balance without careful testing first.
3620
*/
3621
void
3622
vm_waitpfault(struct domainset *dset, int timo)
3623
{
3624
3625
/*
3626
* XXX Ideally we would wait only until the allocation could
3627
* be satisfied. This condition can cause new allocators to
3628
* consume all freed pages while old allocators wait.
3629
*/
3630
mtx_lock(&vm_domainset_lock);
3631
if (vm_page_count_min_set(&dset->ds_mask)) {
3632
vm_min_waiters++;
3633
msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP,
3634
"pfault", timo);
3635
} else
3636
mtx_unlock(&vm_domainset_lock);
3637
}
3638
3639
static struct vm_pagequeue *
3640
_vm_page_pagequeue(vm_page_t m, uint8_t queue)
3641
{
3642
3643
return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]);
3644
}
3645
3646
#ifdef INVARIANTS
3647
static struct vm_pagequeue *
3648
vm_page_pagequeue(vm_page_t m)
3649
{
3650
3651
return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue));
3652
}
3653
#endif
3654
3655
static __always_inline bool
3656
vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old,
3657
vm_page_astate_t new)
3658
{
3659
vm_page_astate_t tmp;
3660
3661
tmp = *old;
3662
do {
3663
if (__predict_true(vm_page_astate_fcmpset(m, old, new)))
3664
return (true);
3665
counter_u64_add(pqstate_commit_retries, 1);
3666
} while (old->_bits == tmp._bits);
3667
3668
return (false);
3669
}
3670
3671
/*
3672
* Do the work of committing a queue state update that moves the page out of
3673
* its current queue.
3674
*/
3675
static bool
3676
_vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m,
3677
vm_page_astate_t *old, vm_page_astate_t new)
3678
{
3679
vm_page_t next;
3680
3681
vm_pagequeue_assert_locked(pq);
3682
KASSERT(vm_page_pagequeue(m) == pq,
3683
("%s: queue %p does not match page %p", __func__, pq, m));
3684
KASSERT(old->queue != PQ_NONE && new.queue != old->queue,
3685
("%s: invalid queue indices %d %d",
3686
__func__, old->queue, new.queue));
3687
3688
/*
3689
* Once the queue index of the page changes there is nothing
3690
* synchronizing with further updates to the page's physical
3691
* queue state. Therefore we must speculatively remove the page
3692
* from the queue now and be prepared to roll back if the queue
3693
* state update fails. If the page is not physically enqueued then
3694
* we just update its queue index.
3695
*/
3696
if ((old->flags & PGA_ENQUEUED) != 0) {
3697
new.flags &= ~PGA_ENQUEUED;
3698
next = TAILQ_NEXT(m, plinks.q);
3699
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
3700
vm_pagequeue_cnt_dec(pq);
3701
if (!vm_page_pqstate_fcmpset(m, old, new)) {
3702
if (next == NULL)
3703
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
3704
else
3705
TAILQ_INSERT_BEFORE(next, m, plinks.q);
3706
vm_pagequeue_cnt_inc(pq);
3707
return (false);
3708
} else {
3709
return (true);
3710
}
3711
} else {
3712
return (vm_page_pqstate_fcmpset(m, old, new));
3713
}
3714
}
3715
3716
static bool
3717
vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old,
3718
vm_page_astate_t new)
3719
{
3720
struct vm_pagequeue *pq;
3721
vm_page_astate_t as;
3722
bool ret;
3723
3724
pq = _vm_page_pagequeue(m, old->queue);
3725
3726
/*
3727
* The queue field and PGA_ENQUEUED flag are stable only so long as the
3728
* corresponding page queue lock is held.
3729
*/
3730
vm_pagequeue_lock(pq);
3731
as = vm_page_astate_load(m);
3732
if (__predict_false(as._bits != old->_bits)) {
3733
*old = as;
3734
ret = false;
3735
} else {
3736
ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new);
3737
}
3738
vm_pagequeue_unlock(pq);
3739
return (ret);
3740
}
3741
3742
/*
3743
* Commit a queue state update that enqueues or requeues a page.
3744
*/
3745
static bool
3746
_vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m,
3747
vm_page_astate_t *old, vm_page_astate_t new)
3748
{
3749
struct vm_domain *vmd;
3750
3751
vm_pagequeue_assert_locked(pq);
3752
KASSERT(old->queue != PQ_NONE && new.queue == old->queue,
3753
("%s: invalid queue indices %d %d",
3754
__func__, old->queue, new.queue));
3755
3756
new.flags |= PGA_ENQUEUED;
3757
if (!vm_page_pqstate_fcmpset(m, old, new))
3758
return (false);
3759
3760
if ((old->flags & PGA_ENQUEUED) != 0)
3761
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
3762
else
3763
vm_pagequeue_cnt_inc(pq);
3764
3765
/*
3766
* Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In particular, if
3767
* both flags are set in close succession, only PGA_REQUEUE_HEAD will be
3768
* applied, even if it was set first.
3769
*/
3770
if ((old->flags & PGA_REQUEUE_HEAD) != 0) {
3771
vmd = vm_pagequeue_domain(m);
3772
KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE],
3773
("%s: invalid page queue for page %p", __func__, m));
3774
TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
3775
} else {
3776
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
3777
}
3778
return (true);
3779
}
3780
3781
/*
3782
* Commit a queue state update that encodes a request for a deferred queue
3783
* operation.
3784
*/
3785
static bool
3786
vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old,
3787
vm_page_astate_t new)
3788
{
3789
3790
KASSERT(old->queue == new.queue || new.queue != PQ_NONE,
3791
("%s: invalid state, queue %d flags %x",
3792
__func__, new.queue, new.flags));
3793
3794
if (old->_bits != new._bits &&
3795
!vm_page_pqstate_fcmpset(m, old, new))
3796
return (false);
3797
vm_page_pqbatch_submit(m, new.queue);
3798
return (true);
3799
}
3800
3801
/*
3802
* A generic queue state update function. This handles more cases than the
3803
* specialized functions above.
3804
*/
3805
bool
3806
vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new)
3807
{
3808
3809
if (old->_bits == new._bits)
3810
return (true);
3811
3812
if (old->queue != PQ_NONE && new.queue != old->queue) {
3813
if (!vm_page_pqstate_commit_dequeue(m, old, new))
3814
return (false);
3815
if (new.queue != PQ_NONE)
3816
vm_page_pqbatch_submit(m, new.queue);
3817
} else {
3818
if (!vm_page_pqstate_fcmpset(m, old, new))
3819
return (false);
3820
if (new.queue != PQ_NONE &&
3821
((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0)
3822
vm_page_pqbatch_submit(m, new.queue);
3823
}
3824
return (true);
3825
}
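
/*
 * An illustrative, hypothetical caller of vm_page_pqstate_commit(),
 * showing the expected retry-loop contract: on failure the routine
 * refreshes *old with the page's current atomic state, so the caller
 * rebuilds the desired new state and retries.  This mirrors what
 * vm_page_mvqueue() does below; the helper name is made up.
 */
#if 0
static void
example_requeue_active(vm_page_t m)
{
	vm_page_astate_t old, new;

	old = vm_page_astate_load(m);
	do {
		if ((old.flags & PGA_DEQUEUE) != 0)
			break;		/* A dequeue is already pending. */
		new = old;
		new.flags &= ~PGA_QUEUE_OP_MASK;
		new.flags |= PGA_REQUEUE;
		new.queue = PQ_ACTIVE;
	} while (!vm_page_pqstate_commit(m, &old, new));
}
#endif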
3826
3827
/*
3828
* Apply deferred queue state updates to a page.
3829
*/
3830
static inline void
3831
vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue)
3832
{
3833
vm_page_astate_t new, old;
3834
3835
CRITICAL_ASSERT(curthread);
3836
vm_pagequeue_assert_locked(pq);
3837
KASSERT(queue < PQ_COUNT,
3838
("%s: invalid queue index %d", __func__, queue));
3839
KASSERT(pq == _vm_page_pagequeue(m, queue),
3840
("%s: page %p does not belong to queue %p", __func__, m, pq));
3841
3842
for (old = vm_page_astate_load(m);;) {
3843
if (__predict_false(old.queue != queue ||
3844
(old.flags & PGA_QUEUE_OP_MASK) == 0)) {
3845
counter_u64_add(queue_nops, 1);
3846
break;
3847
}
3848
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3849
("%s: page %p is unmanaged", __func__, m));
3850
3851
new = old;
3852
if ((old.flags & PGA_DEQUEUE) != 0) {
3853
new.flags &= ~PGA_QUEUE_OP_MASK;
3854
new.queue = PQ_NONE;
3855
if (__predict_true(_vm_page_pqstate_commit_dequeue(pq,
3856
m, &old, new))) {
3857
counter_u64_add(queue_ops, 1);
3858
break;
3859
}
3860
} else {
3861
new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD);
3862
if (__predict_true(_vm_page_pqstate_commit_requeue(pq,
3863
m, &old, new))) {
3864
counter_u64_add(queue_ops, 1);
3865
break;
3866
}
3867
}
3868
}
3869
}
3870
3871
static void
3872
vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
3873
uint8_t queue)
3874
{
3875
int i;
3876
3877
for (i = 0; i < bq->bq_cnt; i++)
3878
vm_pqbatch_process_page(pq, bq->bq_pa[i], queue);
3879
vm_batchqueue_init(bq);
3880
}
3881
3882
/*
3883
* vm_page_pqbatch_submit: [ internal use only ]
3884
*
3885
* Enqueue a page in the specified page queue's batched work queue.
3886
* The caller must have encoded the requested operation in the page
3887
* structure's a.flags field.
3888
*/
3889
void
3890
vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
3891
{
3892
struct vm_batchqueue *bq;
3893
struct vm_pagequeue *pq;
3894
int domain, slots_remaining;
3895
3896
KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
3897
3898
domain = vm_page_domain(m);
3899
critical_enter();
3900
bq = DPCPU_PTR(pqbatch[domain][queue]);
3901
slots_remaining = vm_batchqueue_insert(bq, m);
3902
if (slots_remaining > (VM_BATCHQUEUE_SIZE >> 1)) {
3903
/* keep building the bq */
3904
critical_exit();
3905
return;
3906
	} else if (slots_remaining > 0) {
3907
/* Try to process the bq if we can get the lock */
3908
pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
3909
if (vm_pagequeue_trylock(pq)) {
3910
vm_pqbatch_process(pq, bq, queue);
3911
vm_pagequeue_unlock(pq);
3912
}
3913
critical_exit();
3914
return;
3915
}
3916
critical_exit();
3917
3918
/* if we make it here, the bq is full so wait for the lock */
3919
3920
pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
3921
vm_pagequeue_lock(pq);
3922
critical_enter();
3923
bq = DPCPU_PTR(pqbatch[domain][queue]);
3924
vm_pqbatch_process(pq, bq, queue);
3925
vm_pqbatch_process_page(pq, m, queue);
3926
vm_pagequeue_unlock(pq);
3927
critical_exit();
3928
}
3929
3930
/*
3931
* vm_page_pqbatch_drain: [ internal use only ]
3932
*
3933
* Force all per-CPU page queue batch queues to be drained. This is
3934
* intended for use in severe memory shortages, to ensure that pages
3935
* do not remain stuck in the batch queues.
3936
*/
3937
void
3938
vm_page_pqbatch_drain(void)
3939
{
3940
struct thread *td;
3941
struct vm_domain *vmd;
3942
struct vm_pagequeue *pq;
3943
int cpu, domain, queue;
3944
3945
td = curthread;
3946
CPU_FOREACH(cpu) {
3947
thread_lock(td);
3948
sched_bind(td, cpu);
3949
thread_unlock(td);
3950
3951
for (domain = 0; domain < vm_ndomains; domain++) {
3952
vmd = VM_DOMAIN(domain);
3953
for (queue = 0; queue < PQ_COUNT; queue++) {
3954
pq = &vmd->vmd_pagequeues[queue];
3955
vm_pagequeue_lock(pq);
3956
critical_enter();
3957
vm_pqbatch_process(pq,
3958
DPCPU_PTR(pqbatch[domain][queue]), queue);
3959
critical_exit();
3960
vm_pagequeue_unlock(pq);
3961
}
3962
}
3963
}
3964
thread_lock(td);
3965
sched_unbind(td);
3966
thread_unlock(td);
3967
}
3968
3969
/*
3970
* vm_page_dequeue_deferred: [ internal use only ]
3971
*
3972
* Request removal of the given page from its current page
3973
* queue. Physical removal from the queue may be deferred
3974
* indefinitely.
3975
*/
3976
void
3977
vm_page_dequeue_deferred(vm_page_t m)
3978
{
3979
vm_page_astate_t new, old;
3980
3981
old = vm_page_astate_load(m);
3982
do {
3983
if (old.queue == PQ_NONE) {
3984
KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
3985
("%s: page %p has unexpected queue state",
3986
__func__, m));
3987
break;
3988
}
3989
new = old;
3990
new.flags |= PGA_DEQUEUE;
3991
} while (!vm_page_pqstate_commit_request(m, &old, new));
3992
}
3993
3994
/*
3995
* vm_page_dequeue:
3996
*
3997
* Remove the page from whichever page queue it's in, if any, before
3998
* returning.
3999
*/
4000
void
4001
vm_page_dequeue(vm_page_t m)
4002
{
4003
vm_page_astate_t new, old;
4004
4005
old = vm_page_astate_load(m);
4006
do {
4007
if (__predict_true(old.queue == PQ_NONE)) {
4008
KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
4009
("%s: page %p has unexpected queue state",
4010
__func__, m));
4011
break;
4012
}
4013
new = old;
4014
new.flags &= ~PGA_QUEUE_OP_MASK;
4015
new.queue = PQ_NONE;
4016
} while (!vm_page_pqstate_commit_dequeue(m, &old, new));
4017
4018
}
4019
4020
/*
4021
* Schedule the given page for insertion into the specified page queue.
4022
* Physical insertion of the page may be deferred indefinitely.
4023
*/
4024
static void
4025
vm_page_enqueue(vm_page_t m, uint8_t queue)
4026
{
4027
4028
KASSERT(m->a.queue == PQ_NONE &&
4029
(m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
4030
("%s: page %p is already enqueued", __func__, m));
4031
KASSERT(m->ref_count > 0,
4032
("%s: page %p does not carry any references", __func__, m));
4033
4034
m->a.queue = queue;
4035
if ((m->a.flags & PGA_REQUEUE) == 0)
4036
vm_page_aflag_set(m, PGA_REQUEUE);
4037
vm_page_pqbatch_submit(m, queue);
4038
}
4039
4040
/*
4041
* vm_page_free_prep:
4042
*
4043
* Prepares the given page to be put on the free list,
4044
* disassociating it from any VM object. The caller may return
4045
* the page to the free list only if this function returns true.
4046
*
4047
* The object, if it exists, must be locked, and then the page must
4048
 * be xbusy. Otherwise the page must not be busied. A managed
4049
* page must be unmapped.
4050
*/
4051
static bool
4052
vm_page_free_prep(vm_page_t m)
4053
{
4054
4055
/*
4056
* Synchronize with threads that have dropped a reference to this
4057
* page.
4058
*/
4059
atomic_thread_fence_acq();
4060
4061
#ifdef INVARIANTS
4062
if (vm_check_pg_zero && (m->flags & PG_ZERO) != 0) {
4063
struct sf_buf *sf;
4064
unsigned long *p;
4065
int i;
4066
4067
sched_pin();
4068
sf = sf_buf_alloc(m, SFB_CPUPRIVATE | SFB_NOWAIT);
4069
if (sf != NULL) {
4070
p = (unsigned long *)sf_buf_kva(sf);
4071
for (i = 0; i < PAGE_SIZE / sizeof(*p); i++, p++) {
4072
KASSERT(*p == 0,
4073
("zerocheck failed page %p PG_ZERO %d %jx",
4074
m, i, (uintmax_t)*p));
4075
}
4076
sf_buf_free(sf);
4077
}
4078
sched_unpin();
4079
}
4080
#endif
4081
if ((m->oflags & VPO_UNMANAGED) == 0) {
4082
KASSERT(!pmap_page_is_mapped(m),
4083
("vm_page_free_prep: freeing mapped page %p", m));
4084
KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0,
4085
("vm_page_free_prep: mapping flags set in page %p", m));
4086
} else {
4087
KASSERT(m->a.queue == PQ_NONE,
4088
("vm_page_free_prep: unmanaged page %p is queued", m));
4089
}
4090
VM_CNT_INC(v_tfree);
4091
4092
if (m->object != NULL) {
4093
vm_page_radix_remove(m);
4094
vm_page_free_object_prep(m);
4095
} else
4096
vm_page_assert_unbusied(m);
4097
4098
vm_page_busy_free(m);
4099
4100
/*
4101
	 * If fictitious, remove object association and
4102
* return.
4103
*/
4104
if ((m->flags & PG_FICTITIOUS) != 0) {
4105
KASSERT(m->ref_count == 1,
4106
("fictitious page %p is referenced", m));
4107
KASSERT(m->a.queue == PQ_NONE,
4108
("fictitious page %p is queued", m));
4109
return (false);
4110
}
4111
4112
/*
4113
* Pages need not be dequeued before they are returned to the physical
4114
* memory allocator, but they must at least be marked for a deferred
4115
* dequeue.
4116
*/
4117
if ((m->oflags & VPO_UNMANAGED) == 0)
4118
vm_page_dequeue_deferred(m);
4119
4120
m->valid = 0;
4121
vm_page_undirty(m);
4122
4123
if (m->ref_count != 0)
4124
panic("vm_page_free_prep: page %p has references", m);
4125
4126
/*
4127
* Restore the default memory attribute to the page.
4128
*/
4129
if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
4130
pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
4131
4132
#if VM_NRESERVLEVEL > 0
4133
/*
4134
* Determine whether the page belongs to a reservation. If the page was
4135
* allocated from a per-CPU cache, it cannot belong to a reservation, so
4136
* as an optimization, we avoid the check in that case.
4137
*/
4138
if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m))
4139
return (false);
4140
#endif
4141
4142
return (true);
4143
}
4144
4145
/*
4146
* vm_page_free_toq:
4147
*
4148
* Returns the given page to the free list, disassociating it
4149
* from any VM object.
4150
*
4151
* The object must be locked. The page must be exclusively busied if it
4152
* belongs to an object.
4153
*/
4154
static void
4155
vm_page_free_toq(vm_page_t m)
4156
{
4157
struct vm_domain *vmd;
4158
uma_zone_t zone;
4159
4160
if (!vm_page_free_prep(m))
4161
return;
4162
4163
vmd = vm_pagequeue_domain(m);
4164
if (__predict_false((m->flags & PG_NOFREE) != 0)) {
4165
vm_page_free_nofree(vmd, m);
4166
return;
4167
}
4168
zone = vmd->vmd_pgcache[m->pool].zone;
4169
if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) {
4170
uma_zfree(zone, m);
4171
return;
4172
}
4173
vm_domain_free_lock(vmd);
4174
vm_phys_free_pages(m, m->pool, 0);
4175
vm_domain_free_unlock(vmd);
4176
vm_domain_freecnt_inc(vmd, 1);
4177
}
4178
4179
/*
4180
* vm_page_free_pages_toq:
4181
*
4182
* Returns a list of pages to the free list, disassociating it
4183
* from any VM object. In other words, this is equivalent to
4184
* calling vm_page_free_toq() for each page of a list of VM objects.
4185
*/
4186
int
4187
vm_page_free_pages_toq(struct spglist *free, bool update_wire_count)
4188
{
4189
vm_page_t m;
4190
int count;
4191
4192
if (SLIST_EMPTY(free))
4193
return (0);
4194
4195
count = 0;
4196
while ((m = SLIST_FIRST(free)) != NULL) {
4197
count++;
4198
SLIST_REMOVE_HEAD(free, plinks.s.ss);
4199
vm_page_free_toq(m);
4200
}
4201
4202
if (update_wire_count)
4203
vm_wire_sub(count);
4204
return (count);
4205
}
4206
4207
/*
4208
* Mark this page as wired down. For managed pages, this prevents reclamation
4209
* by the page daemon, or when the containing object, if any, is destroyed.
4210
*/
4211
void
4212
vm_page_wire(vm_page_t m)
4213
{
4214
u_int old;
4215
4216
#ifdef INVARIANTS
4217
if (m->object != NULL && !vm_page_busied(m) &&
4218
!vm_object_busied(m->object))
4219
VM_OBJECT_ASSERT_LOCKED(m->object);
4220
#endif
4221
KASSERT((m->flags & PG_FICTITIOUS) == 0 ||
4222
VPRC_WIRE_COUNT(m->ref_count) >= 1,
4223
("vm_page_wire: fictitious page %p has zero wirings", m));
4224
4225
old = atomic_fetchadd_int(&m->ref_count, 1);
4226
KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX,
4227
("vm_page_wire: counter overflow for page %p", m));
4228
if (VPRC_WIRE_COUNT(old) == 0) {
4229
if ((m->oflags & VPO_UNMANAGED) == 0)
4230
vm_page_aflag_set(m, PGA_DEQUEUE);
4231
vm_wire_add(1);
4232
}
4233
}
4234
4235
/*
4236
* Attempt to wire a mapped page following a pmap lookup of that page.
4237
* This may fail if a thread is concurrently tearing down mappings of the page.
4238
* The transient failure is acceptable because it translates to the
4239
* failure of the caller pmap_extract_and_hold(), which should be then
4240
* followed by the vm_fault() fallback, see e.g. vm_fault_quick_hold_pages().
4241
*/
4242
bool
4243
vm_page_wire_mapped(vm_page_t m)
4244
{
4245
u_int old;
4246
4247
old = atomic_load_int(&m->ref_count);
4248
do {
4249
KASSERT(old > 0,
4250
("vm_page_wire_mapped: wiring unreferenced page %p", m));
4251
if ((old & VPRC_BLOCKED) != 0)
4252
return (false);
4253
} while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1));
4254
4255
if (VPRC_WIRE_COUNT(old) == 0) {
4256
if ((m->oflags & VPO_UNMANAGED) == 0)
4257
vm_page_aflag_set(m, PGA_DEQUEUE);
4258
vm_wire_add(1);
4259
}
4260
return (true);
4261
}
4262
4263
/*
4264
* Release a wiring reference to a managed page. If the page still belongs to
4265
* an object, update its position in the page queues to reflect the reference.
4266
* If the wiring was the last reference to the page, free the page.
4267
*/
4268
static void
4269
vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse)
4270
{
4271
u_int old;
4272
4273
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4274
("%s: page %p is unmanaged", __func__, m));
4275
4276
/*
4277
* Update LRU state before releasing the wiring reference.
4278
* Use a release store when updating the reference count to
4279
* synchronize with vm_page_free_prep().
4280
*/
4281
old = atomic_load_int(&m->ref_count);
4282
do {
4283
u_int count;
4284
4285
KASSERT(VPRC_WIRE_COUNT(old) > 0,
4286
("vm_page_unwire: wire count underflow for page %p", m));
4287
4288
count = old & ~VPRC_BLOCKED;
4289
if (count > VPRC_OBJREF + 1) {
4290
/*
4291
* The page has at least one other wiring reference. An
4292
* earlier iteration of this loop may have called
4293
* vm_page_release_toq() and cleared PGA_DEQUEUE, so
4294
* re-set it if necessary.
4295
*/
4296
if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0)
4297
vm_page_aflag_set(m, PGA_DEQUEUE);
4298
} else if (count == VPRC_OBJREF + 1) {
4299
/*
4300
* This is the last wiring. Clear PGA_DEQUEUE and
4301
* update the page's queue state to reflect the
4302
* reference. If the page does not belong to an object
4303
* (i.e., the VPRC_OBJREF bit is clear), we only need to
4304
* clear leftover queue state.
4305
*/
4306
vm_page_release_toq(m, nqueue, noreuse);
4307
} else if (count == 1) {
4308
vm_page_aflag_clear(m, PGA_DEQUEUE);
4309
}
4310
} while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1));
4311
4312
if (VPRC_WIRE_COUNT(old) == 1) {
4313
vm_wire_sub(1);
4314
if (old == 1)
4315
vm_page_free(m);
4316
}
4317
}
4318
4319
/*
4320
* Release one wiring of the specified page, potentially allowing it to be
4321
* paged out.
4322
*
4323
* Only managed pages belonging to an object can be paged out. If the number
4324
* of wirings transitions to zero and the page is eligible for page out, then
4325
* the page is added to the specified paging queue. If the released wiring
4326
* represented the last reference to the page, the page is freed.
4327
*/
4328
void
4329
vm_page_unwire(vm_page_t m, uint8_t nqueue)
4330
{
4331
4332
KASSERT(nqueue < PQ_COUNT,
4333
("vm_page_unwire: invalid queue %u request for page %p",
4334
nqueue, m));
4335
4336
if ((m->oflags & VPO_UNMANAGED) != 0) {
4337
if (vm_page_unwire_noq(m) && m->ref_count == 0)
4338
vm_page_free(m);
4339
return;
4340
}
4341
vm_page_unwire_managed(m, nqueue, false);
4342
}
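
/*
 * An illustrative, hypothetical wire/unwire pairing.  The wiring keeps the
 * page resident while it is used, and vm_page_unwire() then re-inserts the
 * page into a page queue or frees it if the wiring was the last reference.
 * vm_page_wire() expects the caller to hold the object lock or have the
 * page busied; the helper name is made up.
 */
#if 0
static void
example_wire_for_use(vm_page_t m)
{
	vm_page_wire(m);		/* Object locked or page busied. */
	/* ... use the page while it cannot be reclaimed ... */
	vm_page_unwire(m, PQ_ACTIVE);	/* Treat it as recently used. */
}
#endif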
4343
4344
/*
4345
* Unwire a page without (re-)inserting it into a page queue. It is up
4346
* to the caller to enqueue, requeue, or free the page as appropriate.
4347
* In most cases involving managed pages, vm_page_unwire() should be used
4348
* instead.
4349
*/
4350
bool
4351
vm_page_unwire_noq(vm_page_t m)
4352
{
4353
u_int old;
4354
4355
old = vm_page_drop(m, 1);
4356
KASSERT(VPRC_WIRE_COUNT(old) != 0,
4357
("%s: counter underflow for page %p", __func__, m));
4358
KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1,
4359
("%s: missing ref on fictitious page %p", __func__, m));
4360
4361
if (VPRC_WIRE_COUNT(old) > 1)
4362
return (false);
4363
if ((m->oflags & VPO_UNMANAGED) == 0)
4364
vm_page_aflag_clear(m, PGA_DEQUEUE);
4365
vm_wire_sub(1);
4366
return (true);
4367
}
4368
4369
/*
4370
* Ensure that the page ends up in the specified page queue. If the page is
4371
* active or being moved to the active queue, ensure that its act_count is
4372
* at least ACT_INIT but do not otherwise mess with it.
4373
*/
4374
static __always_inline void
4375
vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag)
4376
{
4377
vm_page_astate_t old, new;
4378
4379
KASSERT(m->ref_count > 0,
4380
("%s: page %p does not carry any references", __func__, m));
4381
KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD,
4382
("%s: invalid flags %x", __func__, nflag));
4383
4384
if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
4385
return;
4386
4387
old = vm_page_astate_load(m);
4388
do {
4389
if ((old.flags & PGA_DEQUEUE) != 0)
4390
break;
4391
new = old;
4392
new.flags &= ~PGA_QUEUE_OP_MASK;
4393
if (nqueue == PQ_ACTIVE)
4394
new.act_count = max(old.act_count, ACT_INIT);
4395
if (old.queue == nqueue) {
4396
/*
4397
* There is no need to requeue pages already in the
4398
* active queue.
4399
*/
4400
if (nqueue != PQ_ACTIVE ||
4401
(old.flags & PGA_ENQUEUED) == 0)
4402
new.flags |= nflag;
4403
} else {
4404
new.flags |= nflag;
4405
new.queue = nqueue;
4406
}
4407
} while (!vm_page_pqstate_commit(m, &old, new));
4408
}
4409
4410
/*
4411
* Put the specified page on the active list (if appropriate).
4412
*/
4413
void
4414
vm_page_activate(vm_page_t m)
4415
{
4416
4417
vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE);
4418
}
4419
4420
/*
4421
* Move the specified page to the tail of the inactive queue, or requeue
4422
* the page if it is already in the inactive queue.
4423
*/
4424
void
4425
vm_page_deactivate(vm_page_t m)
4426
{
4427
4428
vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE);
4429
}
4430
4431
void
4432
vm_page_deactivate_noreuse(vm_page_t m)
4433
{
4434
4435
vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD);
4436
}
4437
4438
/*
4439
* Put a page in the laundry, or requeue it if it is already there.
4440
*/
4441
void
4442
vm_page_launder(vm_page_t m)
4443
{
4444
4445
vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE);
4446
}
4447
4448
/*
4449
* Put a page in the PQ_UNSWAPPABLE holding queue.
4450
*/
4451
void
4452
vm_page_unswappable(vm_page_t m)
4453
{
4454
4455
VM_OBJECT_ASSERT_LOCKED(m->object);
4456
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4457
("page %p already unswappable", m));
4458
4459
vm_page_dequeue(m);
4460
vm_page_enqueue(m, PQ_UNSWAPPABLE);
4461
}
4462
4463
/*
4464
* Release a page back to the page queues in preparation for unwiring.
4465
*/
4466
static void
4467
vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse)
4468
{
4469
vm_page_astate_t old, new;
4470
uint16_t nflag;
4471
4472
/*
4473
* Use a check of the valid bits to determine whether we should
4474
* accelerate reclamation of the page. The object lock might not be
4475
* held here, in which case the check is racy. At worst we will either
4476
* accelerate reclamation of a valid page and violate LRU, or
4477
* unnecessarily defer reclamation of an invalid page.
4478
*
4479
* If we were asked to not cache the page, place it near the head of the
4480
	 * inactive queue so that it is reclaimed sooner.
4481
*/
4482
if (noreuse || vm_page_none_valid(m)) {
4483
nqueue = PQ_INACTIVE;
4484
nflag = PGA_REQUEUE_HEAD;
4485
} else {
4486
nflag = PGA_REQUEUE;
4487
}
4488
4489
old = vm_page_astate_load(m);
4490
do {
4491
new = old;
4492
4493
/*
4494
* If the page is already in the active queue and we are not
4495
* trying to accelerate reclamation, simply mark it as
4496
* referenced and avoid any queue operations.
4497
*/
4498
new.flags &= ~PGA_QUEUE_OP_MASK;
4499
if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE &&
4500
(old.flags & PGA_ENQUEUED) != 0)
4501
new.flags |= PGA_REFERENCED;
4502
else {
4503
new.flags |= nflag;
4504
new.queue = nqueue;
4505
}
4506
} while (!vm_page_pqstate_commit(m, &old, new));
4507
}
4508
4509
/*
4510
* Unwire a page and either attempt to free it or re-add it to the page queues.
4511
*/
4512
void
4513
vm_page_release(vm_page_t m, int flags)
4514
{
4515
vm_object_t object;
4516
4517
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4518
("vm_page_release: page %p is unmanaged", m));
4519
4520
if ((flags & VPR_TRYFREE) != 0) {
4521
for (;;) {
4522
object = atomic_load_ptr(&m->object);
4523
if (object == NULL)
4524
break;
4525
/* Depends on type-stability. */
4526
if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object))
4527
break;
4528
if (object == m->object) {
4529
vm_page_release_locked(m, flags);
4530
VM_OBJECT_WUNLOCK(object);
4531
return;
4532
}
4533
VM_OBJECT_WUNLOCK(object);
4534
}
4535
}
4536
vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0);
4537
}
4538
4539
/* See vm_page_release(). */
4540
void
4541
vm_page_release_locked(vm_page_t m, int flags)
4542
{
4543
4544
VM_OBJECT_ASSERT_WLOCKED(m->object);
4545
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4546
("vm_page_release_locked: page %p is unmanaged", m));
4547
4548
if (vm_page_unwire_noq(m)) {
4549
if ((flags & VPR_TRYFREE) != 0 &&
4550
(m->object->ref_count == 0 || !pmap_page_is_mapped(m)) &&
4551
m->dirty == 0 && vm_page_tryxbusy(m)) {
4552
/*
4553
* An unlocked lookup may have wired the page before the
4554
* busy lock was acquired, in which case the page must
4555
* not be freed.
4556
*/
4557
if (__predict_true(!vm_page_wired(m))) {
4558
vm_page_free(m);
4559
return;
4560
}
4561
vm_page_xunbusy(m);
4562
} else {
4563
vm_page_release_toq(m, PQ_INACTIVE, flags != 0);
4564
}
4565
}
4566
}
4567
4568
static bool
4569
vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t))
4570
{
4571
u_int old;
4572
4573
KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0,
4574
("vm_page_try_blocked_op: page %p has no object", m));
4575
KASSERT(vm_page_busied(m),
4576
("vm_page_try_blocked_op: page %p is not busy", m));
4577
VM_OBJECT_ASSERT_LOCKED(m->object);
4578
4579
old = atomic_load_int(&m->ref_count);
4580
do {
4581
KASSERT(old != 0,
4582
("vm_page_try_blocked_op: page %p has no references", m));
4583
KASSERT((old & VPRC_BLOCKED) == 0,
4584
("vm_page_try_blocked_op: page %p blocks wirings", m));
4585
if (VPRC_WIRE_COUNT(old) != 0)
4586
return (false);
4587
} while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED));
4588
4589
(op)(m);
4590
4591
/*
4592
* If the object is read-locked, new wirings may be created via an
4593
* object lookup.
4594
*/
4595
old = vm_page_drop(m, VPRC_BLOCKED);
4596
KASSERT(!VM_OBJECT_WOWNED(m->object) ||
4597
old == (VPRC_BLOCKED | VPRC_OBJREF),
4598
("vm_page_try_blocked_op: unexpected refcount value %u for %p",
4599
old, m));
4600
return (true);
4601
}
4602
4603
/*
4604
* Atomically check for wirings and remove all mappings of the page.
4605
*/
4606
bool
4607
vm_page_try_remove_all(vm_page_t m)
4608
{
4609
4610
return (vm_page_try_blocked_op(m, pmap_remove_all));
4611
}
4612
4613
/*
4614
* Atomically check for wirings and remove all writeable mappings of the page.
4615
*/
4616
bool
4617
vm_page_try_remove_write(vm_page_t m)
4618
{
4619
4620
return (vm_page_try_blocked_op(m, pmap_remove_write));
4621
}
4622
4623
/*
4624
* vm_page_advise
4625
*
4626
* Apply the specified advice to the given page.
4627
*/
4628
void
4629
vm_page_advise(vm_page_t m, int advice)
4630
{
4631
4632
VM_OBJECT_ASSERT_WLOCKED(m->object);
4633
vm_page_assert_xbusied(m);
4634
4635
if (advice == MADV_FREE)
4636
/*
4637
* Mark the page clean. This will allow the page to be freed
4638
* without first paging it out. MADV_FREE pages are often
4639
* quickly reused by malloc(3), so we do not do anything that
4640
* would result in a page fault on a later access.
4641
*/
4642
vm_page_undirty(m);
4643
else if (advice != MADV_DONTNEED) {
4644
if (advice == MADV_WILLNEED)
4645
vm_page_activate(m);
4646
return;
4647
}
4648
4649
if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
4650
vm_page_dirty(m);
4651
4652
/*
4653
* Clear any references to the page. Otherwise, the page daemon will
4654
* immediately reactivate the page.
4655
*/
4656
vm_page_aflag_clear(m, PGA_REFERENCED);
4657
4658
/*
4659
* Place clean pages near the head of the inactive queue rather than
4660
* the tail, thus defeating the queue's LRU operation and ensuring that
4661
* the page will be reused quickly. Dirty pages not already in the
4662
* laundry are moved there.
4663
*/
4664
if (m->dirty == 0)
4665
vm_page_deactivate_noreuse(m);
4666
else if (!vm_page_in_laundry(m))
4667
vm_page_launder(m);
4668
}
4669
4670
/*
4671
* vm_page_grab_release
4672
*
4673
* Helper routine for grab functions to release busy on return.
4674
*/
4675
static inline void
4676
vm_page_grab_release(vm_page_t m, int allocflags)
4677
{
4678
4679
if ((allocflags & VM_ALLOC_NOBUSY) != 0) {
4680
if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0)
4681
vm_page_sunbusy(m);
4682
else
4683
vm_page_xunbusy(m);
4684
}
4685
}
4686
4687
/*
4688
* vm_page_grab_sleep
4689
*
4690
* Sleep for busy according to VM_ALLOC_ parameters. Returns true
4691
* if the caller should retry and false otherwise.
4692
*
4693
 * If the object is locked on entry, it will be unlocked when false is
 * returned; when true is returned, the object is still locked, though the
 * lock may have been dropped and reacquired in the meantime.
4696
*/
4697
static bool
4698
vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex,
4699
const char *wmesg, int allocflags, bool locked)
4700
{
4701
4702
if ((allocflags & VM_ALLOC_NOWAIT) != 0)
4703
return (false);
4704
4705
/*
4706
* Reference the page before unlocking and sleeping so that
4707
* the page daemon is less likely to reclaim it.
4708
*/
4709
if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0)
4710
vm_page_reference(m);
4711
4712
if (_vm_page_busy_sleep(object, m, pindex, wmesg, allocflags, locked) &&
4713
locked)
4714
VM_OBJECT_WLOCK(object);
4715
if ((allocflags & VM_ALLOC_WAITFAIL) != 0)
4716
return (false);
4717
4718
return (true);
4719
}
4720
4721
/*
4722
* Assert that the grab flags are valid.
4723
*/
4724
static inline void
4725
vm_page_grab_check(int allocflags)
4726
{
4727
4728
KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
4729
(allocflags & VM_ALLOC_WIRED) != 0,
4730
("vm_page_grab*: the pages must be busied or wired"));
4731
4732
KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
4733
(allocflags & VM_ALLOC_IGN_SBUSY) != 0,
4734
("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
4735
}
4736
4737
/*
4738
* Calculate the page allocation flags for grab.
4739
*/
4740
static inline int
4741
vm_page_grab_pflags(int allocflags)
4742
{
4743
int pflags;
4744
4745
pflags = allocflags &
4746
~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL |
4747
VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT);
4748
if ((allocflags & VM_ALLOC_NOWAIT) == 0)
4749
pflags |= VM_ALLOC_WAITFAIL;
4750
if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0)
4751
pflags |= VM_ALLOC_SBUSY;
4752
4753
return (pflags);
4754
}
4755
4756
/*
4757
* Grab a page, waiting until we are woken up due to the page changing state.
4758
 * We keep waiting as long as the page remains in the object, unless
4759
* allocflags forbid waiting.
4760
*
4761
* The object must be locked on entry. This routine may sleep. The lock will,
4762
* however, be released and reacquired if the routine sleeps.
4763
*
4764
* Return a grabbed page, or NULL. Set *found if a page was found, whether or
4765
* not it was grabbed.
4766
*/
4767
static inline vm_page_t
4768
vm_page_grab_lookup(vm_object_t object, vm_pindex_t pindex, int allocflags,
4769
bool *found, struct pctrie_iter *pages)
4770
{
4771
vm_page_t m;
4772
4773
while ((*found = (m = vm_radix_iter_lookup(pages, pindex)) != NULL) &&
4774
!vm_page_tryacquire(m, allocflags)) {
4775
if (!vm_page_grab_sleep(object, m, pindex, "pgrbwt",
4776
allocflags, true))
4777
return (NULL);
4778
pctrie_iter_reset(pages);
4779
}
4780
return (m);
4781
}
4782
4783
/*
4784
* Grab a page. Use an iterator parameter. Keep on waiting, as long as the page
4785
* exists in the object. If the page doesn't exist, first allocate it and then
4786
* conditionally zero it.
4787
*
4788
* The object must be locked on entry. This routine may sleep. The lock will,
4789
* however, be released and reacquired if the routine sleeps.
4790
*/
4791
vm_page_t
4792
vm_page_grab_iter(vm_object_t object, vm_pindex_t pindex, int allocflags,
4793
struct pctrie_iter *pages)
4794
{
4795
vm_page_t m;
4796
bool found;
4797
4798
VM_OBJECT_ASSERT_WLOCKED(object);
4799
vm_page_grab_check(allocflags);
4800
4801
while ((m = vm_page_grab_lookup(
4802
object, pindex, allocflags, &found, pages)) == NULL) {
4803
if ((allocflags & VM_ALLOC_NOCREAT) != 0)
4804
return (NULL);
4805
if (found &&
4806
(allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0)
4807
return (NULL);
4808
m = vm_page_alloc_iter(object, pindex,
4809
vm_page_grab_pflags(allocflags), pages);
4810
if (m != NULL) {
4811
if ((allocflags & VM_ALLOC_ZERO) != 0 &&
4812
(m->flags & PG_ZERO) == 0)
4813
pmap_zero_page(m);
4814
break;
4815
}
4816
if ((allocflags &
4817
(VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0)
4818
return (NULL);
4819
}
4820
vm_page_grab_release(m, allocflags);
4821
4822
return (m);
4823
}
4824
4825
/*
4826
* Grab a page. Keep on waiting, as long as the page exists in the object. If
4827
* the page doesn't exist, first allocate it and then conditionally zero it.
4828
*
4829
* The object must be locked on entry. This routine may sleep. The lock will,
4830
* however, be released and reacquired if the routine sleeps.
4831
*/
4832
vm_page_t
4833
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
4834
{
4835
struct pctrie_iter pages;
4836
4837
VM_OBJECT_ASSERT_WLOCKED(object);
4838
vm_page_iter_init(&pages, object);
4839
return (vm_page_grab_iter(object, pindex, allocflags, &pages));
4840
}
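
/*
 * An illustrative, hypothetical vm_page_grab() caller.  With the object
 * write-locked, the page at "pindex" is looked up or allocated, zero-filled
 * if it was newly allocated and not already pre-zeroed, wired, and returned
 * exclusive-busied; the busy lock is dropped here so that only the wiring
 * is kept.  The helper name is made up.
 */
#if 0
static vm_page_t
example_grab_wired_zeroed(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_WLOCK(object);
	m = vm_page_grab(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	VM_OBJECT_WUNLOCK(object);
	if (m != NULL)
		vm_page_xunbusy(m);
	return (m);
}
#endif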
4841
4842
/*
4843
 * Attempt to validate a page, locklessly acquiring it if necessary, given a
 * (object, pindex) tuple and either an invalidated page or NULL. The
 * resulting page will be validated against the identity tuple, and busied or
 * wired as requested. A NULL return guarantees that the page was not present
 * in the radix tree at the time of the call, but callers must perform higher
 * level synchronization or retry the operation under a lock if they require
 * an atomic answer. This is the only lock-free validation routine; other
 * routines can depend on the resulting page state.
 *
 * The return value PAGE_NOT_ACQUIRED indicates that the operation failed due
 * to caller flags.
 */
4855
#define PAGE_NOT_ACQUIRED ((vm_page_t)1)
4856
static vm_page_t
4857
vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex, vm_page_t m,
4858
int allocflags)
4859
{
4860
if (m == NULL)
4861
m = vm_page_lookup_unlocked(object, pindex);
4862
for (; m != NULL; m = vm_page_lookup_unlocked(object, pindex)) {
4863
if (vm_page_trybusy(m, allocflags)) {
4864
if (m->object == object && m->pindex == pindex) {
4865
if ((allocflags & VM_ALLOC_WIRED) != 0)
4866
vm_page_wire(m);
4867
vm_page_grab_release(m, allocflags);
4868
break;
4869
}
4870
/* relookup. */
4871
vm_page_busy_release(m);
4872
cpu_spinwait();
4873
continue;
4874
}
4875
if (!vm_page_grab_sleep(object, m, pindex, "pgnslp",
4876
allocflags, false))
4877
return (PAGE_NOT_ACQUIRED);
4878
}
4879
return (m);
4880
}
4881
4882
/*
4883
* Try to locklessly grab a page and fall back to the object lock if NOCREAT
4884
* is not set.
4885
*/
4886
vm_page_t
4887
vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags)
4888
{
4889
vm_page_t m;
4890
4891
vm_page_grab_check(allocflags);
4892
m = vm_page_acquire_unlocked(object, pindex, NULL, allocflags);
4893
if (m == PAGE_NOT_ACQUIRED)
4894
return (NULL);
4895
if (m != NULL)
4896
return (m);
4897
4898
/*
4899
	 * The radix lockless lookup should never return false negative
4900
* errors. If the user specifies NOCREAT they are guaranteed there
4901
* was no page present at the instant of the call. A NOCREAT caller
4902
* must handle create races gracefully.
4903
*/
4904
if ((allocflags & VM_ALLOC_NOCREAT) != 0)
4905
return (NULL);
4906
4907
VM_OBJECT_WLOCK(object);
4908
m = vm_page_grab(object, pindex, allocflags);
4909
VM_OBJECT_WUNLOCK(object);
4910
4911
return (m);
4912
}
4913
4914
/*
4915
* Grab a page and make it valid, paging in if necessary. Use an iterator
4916
* parameter. Pages missing from their pager are zero filled and validated. If
4917
* a VM_ALLOC_COUNT is supplied and the page is not valid as many as
4918
* VM_INITIAL_PAGEIN pages can be brought in simultaneously. Additional pages
4919
* will be left on a paging queue but will neither be wired nor busy regardless
4920
* of allocflags.
4921
*/
4922
int
4923
vm_page_grab_valid_iter(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex,
4924
int allocflags, struct pctrie_iter *pages)
4925
{
4926
vm_page_t m;
4927
vm_page_t ma[VM_INITIAL_PAGEIN];
4928
int after, ahead, i, pflags, rv;
4929
4930
KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
4931
(allocflags & VM_ALLOC_IGN_SBUSY) != 0,
4932
("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
4933
KASSERT((allocflags &
4934
(VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
4935
("vm_page_grab_valid: Invalid flags 0x%X", allocflags));
4936
VM_OBJECT_ASSERT_WLOCKED(object);
4937
pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY |
4938
VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY);
4939
pflags |= VM_ALLOC_WAITFAIL;
4940
4941
retrylookup:
4942
if ((m = vm_radix_iter_lookup(pages, pindex)) != NULL) {
4943
/*
4944
* If the page is fully valid it can only become invalid
4945
* with the object lock held. If it is not valid it can
4946
* become valid with the busy lock held. Therefore, we
4947
* may unnecessarily lock the exclusive busy here if we
4948
* race with I/O completion not using the object lock.
4949
* However, we will not end up with an invalid page and a
4950
* shared lock.
4951
*/
4952
if (!vm_page_trybusy(m,
4953
vm_page_all_valid(m) ? allocflags : 0)) {
4954
(void)vm_page_grab_sleep(object, m, pindex, "pgrbwt",
4955
allocflags, true);
4956
pctrie_iter_reset(pages);
4957
goto retrylookup;
4958
}
4959
if (vm_page_all_valid(m))
4960
goto out;
4961
if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
4962
vm_page_busy_release(m);
4963
*mp = NULL;
4964
return (VM_PAGER_FAIL);
4965
}
4966
} else if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
4967
*mp = NULL;
4968
return (VM_PAGER_FAIL);
4969
} else {
4970
m = vm_page_alloc_iter(object, pindex, pflags, pages);
4971
if (m == NULL) {
4972
if (!vm_pager_can_alloc_page(object, pindex)) {
4973
*mp = NULL;
4974
return (VM_PAGER_AGAIN);
4975
}
4976
goto retrylookup;
4977
}
4978
}
4979
4980
vm_page_assert_xbusied(m);
4981
if (vm_pager_has_page(object, pindex, NULL, &after)) {
4982
after = MIN(after, VM_INITIAL_PAGEIN);
4983
after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT);
4984
after = MAX(after, 1);
4985
ma[0] = m;
4986
pctrie_iter_reset(pages);
4987
for (i = 1; i < after; i++) {
4988
m = vm_radix_iter_lookup_ge(pages, pindex + i);
4989
ahead = after;
4990
if (m != NULL)
4991
ahead = MIN(ahead, m->pindex - pindex);
4992
for (; i < ahead; i++) {
4993
ma[i] = vm_page_alloc_iter(object, pindex + i,
4994
VM_ALLOC_NORMAL, pages);
4995
if (ma[i] == NULL)
4996
break;
4997
}
4998
if (m == NULL || m->pindex != pindex + i ||
4999
vm_page_any_valid(m) || !vm_page_tryxbusy(m))
5000
break;
5001
ma[i] = m;
5002
}
5003
after = i;
5004
vm_object_pip_add(object, after);
5005
VM_OBJECT_WUNLOCK(object);
5006
rv = vm_pager_get_pages(object, ma, after, NULL, NULL);
5007
pctrie_iter_reset(pages);
5008
VM_OBJECT_WLOCK(object);
5009
vm_object_pip_wakeupn(object, after);
5010
/* Pager may have replaced a page. */
5011
m = ma[0];
5012
if (rv != VM_PAGER_OK) {
5013
for (i = 0; i < after; i++) {
5014
if (!vm_page_wired(ma[i]))
5015
vm_page_free(ma[i]);
5016
else
5017
vm_page_xunbusy(ma[i]);
5018
}
5019
*mp = NULL;
5020
return (rv);
5021
}
5022
for (i = 1; i < after; i++)
5023
vm_page_readahead_finish(ma[i]);
5024
MPASS(vm_page_all_valid(m));
5025
} else {
5026
vm_page_zero_invalid(m, TRUE);
5027
pctrie_iter_reset(pages);
5028
}
5029
out:
5030
if ((allocflags & VM_ALLOC_WIRED) != 0)
5031
vm_page_wire(m);
5032
if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m))
5033
vm_page_busy_downgrade(m);
5034
else if ((allocflags & VM_ALLOC_NOBUSY) != 0)
5035
vm_page_busy_release(m);
5036
*mp = m;
5037
return (VM_PAGER_OK);
5038
}
5039
5040
/*
5041
* Grab a page and make it valid, paging in if necessary. Pages missing from
5042
* their pager are zero filled and validated. If a VM_ALLOC_COUNT is supplied
5043
* and the page is not valid as many as VM_INITIAL_PAGEIN pages can be brought
5044
* in simultaneously. Additional pages will be left on a paging queue but
5045
* will neither be wired nor busy regardless of allocflags.
5046
*/
5047
int
5048
vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex,
5049
int allocflags)
5050
{
5051
struct pctrie_iter pages;
5052
5053
VM_OBJECT_ASSERT_WLOCKED(object);
5054
vm_page_iter_init(&pages, object);
5055
return (vm_page_grab_valid_iter(mp, object, pindex, allocflags,
5056
&pages));
5057
}
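
/*
 * An illustrative, hypothetical vm_page_grab_valid() caller that obtains a
 * resident, fully valid page, paging it in from the backing pager when
 * necessary.  With VM_ALLOC_WIRED and VM_ALLOC_NOBUSY the page is returned
 * wired but not busied.  The helper name is made up.
 */
#if 0
static int
example_grab_valid_wired(vm_object_t object, vm_pindex_t pindex,
    vm_page_t *mp)
{
	int rv;

	VM_OBJECT_WLOCK(object);
	rv = vm_page_grab_valid(mp, object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY);
	VM_OBJECT_WUNLOCK(object);
	return (rv == VM_PAGER_OK ? 0 : EIO);
}
#endif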
5058
5059
/*
5060
* Grab a page. Keep on waiting, as long as the page exists in the object. If
5061
* the page doesn't exist, and the pager has it, allocate it and zero part of
5062
* it.
5063
*
5064
* The object must be locked on entry. This routine may sleep. The lock will,
5065
* however, be released and reacquired if the routine sleeps.
5066
*/
5067
int
5068
vm_page_grab_zero_partial(vm_object_t object, vm_pindex_t pindex, int base,
5069
int end)
5070
{
5071
struct pctrie_iter pages;
5072
vm_page_t m;
5073
int allocflags, rv;
5074
bool found;
5075
5076
VM_OBJECT_ASSERT_WLOCKED(object);
5077
KASSERT(base >= 0, ("%s: base %d", __func__, base));
5078
KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base,
5079
end));
5080
5081
allocflags = VM_ALLOC_NOCREAT | VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL;
5082
vm_page_iter_init(&pages, object);
5083
while ((m = vm_page_grab_lookup(
5084
object, pindex, allocflags, &found, &pages)) == NULL) {
5085
if (!vm_pager_has_page(object, pindex, NULL, NULL))
5086
return (0);
5087
m = vm_page_alloc_iter(object, pindex,
5088
vm_page_grab_pflags(allocflags), &pages);
5089
if (m != NULL) {
5090
vm_object_pip_add(object, 1);
5091
VM_OBJECT_WUNLOCK(object);
5092
rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
5093
VM_OBJECT_WLOCK(object);
5094
vm_object_pip_wakeup(object);
5095
if (rv != VM_PAGER_OK) {
5096
vm_page_free(m);
5097
return (EIO);
5098
}
5099
5100
/*
5101
* Since the page was not resident, and therefore not
5102
* recently accessed, immediately enqueue it for
5103
* asynchronous laundering. The current operation is
5104
* not regarded as an access.
5105
*/
5106
vm_page_launder(m);
5107
break;
5108
}
5109
}
5110
5111
pmap_zero_page_area(m, base, end - base);
5112
KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid", __func__, m));
5113
vm_page_set_dirty(m);
5114
vm_page_xunbusy(m);
5115
return (0);
5116
}
5117
5118
/*
5119
* Locklessly grab a valid page. If the page is not valid or not yet
5120
* allocated this will fall back to the object lock method.
5121
*/
5122
int
5123
vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object,
5124
vm_pindex_t pindex, int allocflags)
5125
{
5126
vm_page_t m;
5127
int flags;
5128
int error;
5129
5130
KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
5131
(allocflags & VM_ALLOC_IGN_SBUSY) != 0,
5132
("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY "
5133
"mismatch"));
5134
KASSERT((allocflags &
5135
(VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
5136
("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags));
5137
5138
/*
5139
* Attempt a lockless lookup and busy. We need at least an sbusy
5140
* before we can inspect the valid field and return a wired page.
5141
*/
5142
flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
5143
vm_page_grab_check(flags);
5144
m = vm_page_acquire_unlocked(object, pindex, NULL, flags);
5145
if (m == PAGE_NOT_ACQUIRED)
5146
return (VM_PAGER_FAIL);
5147
if (m != NULL) {
5148
if (vm_page_all_valid(m)) {
5149
if ((allocflags & VM_ALLOC_WIRED) != 0)
5150
vm_page_wire(m);
5151
vm_page_grab_release(m, allocflags);
5152
*mp = m;
5153
return (VM_PAGER_OK);
5154
}
5155
vm_page_busy_release(m);
5156
}
5157
if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
5158
*mp = NULL;
5159
return (VM_PAGER_FAIL);
5160
}
5161
VM_OBJECT_WLOCK(object);
5162
error = vm_page_grab_valid(mp, object, pindex, allocflags);
5163
VM_OBJECT_WUNLOCK(object);
5164
5165
return (error);
5166
}
5167
5168
/*
5169
* Return the specified range of pages from the given object. For each
5170
* page offset within the range, if a page already exists within the object
5171
* at that offset and it is busy, then wait for it to change state. If,
5172
* instead, the page doesn't exist, then allocate it.
5173
*
5174
* The caller must always specify an allocation class.
5175
*
5176
* allocation classes:
5177
* VM_ALLOC_NORMAL normal process request
5178
* VM_ALLOC_SYSTEM system *really* needs the pages
5179
* VM_ALLOC_INTERRUPT interrupt time request
5180
*
5181
* The caller must always specify that the pages are to be busied and/or
5182
* wired.
5183
*
5184
* optional allocation flags:
5185
* VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages
5186
* VM_ALLOC_NOBUSY do not exclusive busy the pages
5187
* VM_ALLOC_NODUMP do not include the pages in a kernel core dump
5188
* VM_ALLOC_NOFREE pages will never be freed
5189
* VM_ALLOC_NOWAIT do not sleep
5190
* VM_ALLOC_SBUSY set pages to sbusy state
5191
* VM_ALLOC_WAITFAIL in case of failure, sleep before returning
5192
* VM_ALLOC_WAITOK ignored (default behavior)
5193
* VM_ALLOC_WIRED wire the pages
5194
* VM_ALLOC_ZERO zero and validate any invalid pages
5195
*
5196
* If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it
5197
* may return a partial prefix of the requested range.
5198
*/
5199
int
5200
vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
5201
vm_page_t *ma, int count)
5202
{
5203
struct pctrie_iter pages;
5204
vm_page_t m;
5205
int pflags;
5206
int ahead, i;
5207
5208
VM_OBJECT_ASSERT_WLOCKED(object);
5209
KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
5210
("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed"));
5211
KASSERT(count > 0,
5212
("vm_page_grab_pages: invalid page count %d", count));
5213
vm_page_grab_check(allocflags);
5214
5215
pflags = vm_page_grab_pflags(allocflags);
5216
i = 0;
5217
vm_page_iter_init(&pages, object);
5218
retrylookup:
5219
ahead = -1;
5220
for (; i < count; i++) {
5221
if (ahead < 0) {
5222
ahead = vm_radix_iter_lookup_range(
5223
&pages, pindex + i, &ma[i], count - i);
5224
}
5225
if (ahead-- > 0) {
5226
m = ma[i];
5227
if (!vm_page_tryacquire(m, allocflags)) {
5228
if (vm_page_grab_sleep(object, m, pindex + i,
5229
"grbmaw", allocflags, true)) {
5230
pctrie_iter_reset(&pages);
5231
goto retrylookup;
5232
}
5233
break;
5234
}
5235
} else {
5236
if ((allocflags & VM_ALLOC_NOCREAT) != 0)
5237
break;
5238
m = vm_page_alloc_iter(object, pindex + i,
5239
pflags | VM_ALLOC_COUNT(count - i), &pages);
5240
/* pages was reset if alloc_iter lost the lock. */
5241
if (m == NULL) {
5242
if ((allocflags & (VM_ALLOC_NOWAIT |
5243
VM_ALLOC_WAITFAIL)) != 0)
5244
break;
5245
goto retrylookup;
5246
}
5247
ma[i] = m;
5248
}
5249
if (vm_page_none_valid(m) &&
5250
(allocflags & VM_ALLOC_ZERO) != 0) {
5251
if ((m->flags & PG_ZERO) == 0)
5252
pmap_zero_page(m);
5253
vm_page_valid(m);
5254
}
5255
vm_page_grab_release(m, allocflags);
5256
}
5257
return (i);
5258
}
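
/*
 * An illustrative, hypothetical vm_page_grab_pages() caller that grabs a
 * short run of pages.  Without VM_ALLOC_NOWAIT the routine sleeps as
 * needed, so the full run is returned; each page comes back
 * exclusive-busied and is released after use.  The helper name is made up.
 */
#if 0
static void
example_grab_run(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t ma[4];
	int i, n;

	VM_OBJECT_WLOCK(object);
	n = vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL, ma,
	    nitems(ma));
	VM_OBJECT_WUNLOCK(object);
	for (i = 0; i < n; i++) {
		/* ... use ma[i] ... */
		vm_page_xunbusy(ma[i]);
	}
}
#endif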
5259
5260
/*
5261
* Unlocked variant of vm_page_grab_pages(). This accepts the same flags
5262
* and will fall back to the locked variant to handle allocation.
5263
*/
5264
int
5265
vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex,
5266
int allocflags, vm_page_t *ma, int count)
5267
{
5268
vm_page_t m;
5269
int flags;
5270
int i, num_fetched;
5271
5272
KASSERT(count > 0,
5273
("vm_page_grab_pages_unlocked: invalid page count %d", count));
5274
vm_page_grab_check(allocflags);
5275
5276
/*
5277
* Modify flags for lockless acquire to hold the page until we
5278
* set it valid if necessary.
5279
*/
5280
flags = allocflags & ~VM_ALLOC_NOBUSY;
5281
vm_page_grab_check(flags);
5282
num_fetched = vm_radix_lookup_range_unlocked(&object->rtree, pindex,
5283
ma, count);
5284
for (i = 0; i < num_fetched; i++, pindex++) {
5285
m = vm_page_acquire_unlocked(object, pindex, ma[i], flags);
5286
if (m == PAGE_NOT_ACQUIRED)
5287
return (i);
5288
if (m == NULL)
5289
break;
5290
if ((flags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) {
5291
if ((m->flags & PG_ZERO) == 0)
5292
pmap_zero_page(m);
5293
vm_page_valid(m);
5294
}
5295
/* m will still be wired or busy according to flags. */
5296
vm_page_grab_release(m, allocflags);
5297
/* vm_page_acquire_unlocked() may not return ma[i]. */
5298
ma[i] = m;
5299
}
5300
if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0)
5301
return (i);
5302
count -= i;
5303
VM_OBJECT_WLOCK(object);
5304
i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count);
5305
VM_OBJECT_WUNLOCK(object);
5306
5307
return (i);
5308
}
5309
5310
/*
5311
* Mapping function for valid or dirty bits in a page.
5312
*
5313
* Inputs are required to range within a page.
5314
*/
5315
vm_page_bits_t
5316
vm_page_bits(int base, int size)
5317
{
5318
int first_bit;
5319
int last_bit;
5320
5321
KASSERT(
5322
base + size <= PAGE_SIZE,
5323
("vm_page_bits: illegal base/size %d/%d", base, size)
5324
);
5325
5326
if (size == 0) /* handle degenerate case */
5327
return (0);
5328
5329
first_bit = base >> DEV_BSHIFT;
5330
last_bit = (base + size - 1) >> DEV_BSHIFT;
5331
5332
return (((vm_page_bits_t)2 << last_bit) -
5333
((vm_page_bits_t)1 << first_bit));
5334
}
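
/*
 * Worked example, assuming the usual DEV_BSIZE of 512 (DEV_BSHIFT == 9):
 * vm_page_bits(1024, 1536) covers byte offsets 1024..2559, so
 * first_bit == 2 and last_bit == 4, and the result is
 * (2 << 4) - (1 << 2) == 0x1c, i.e., one bit for each of the DEV_BSIZE
 * blocks 2, 3 and 4 touched by the range.
 */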
5335
5336
void
5337
vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set)
5338
{
5339
5340
#if PAGE_SIZE == 32768
5341
atomic_set_64((uint64_t *)bits, set);
5342
#elif PAGE_SIZE == 16384
5343
atomic_set_32((uint32_t *)bits, set);
5344
#elif (PAGE_SIZE == 8192) && defined(atomic_set_16)
5345
atomic_set_16((uint16_t *)bits, set);
5346
#elif (PAGE_SIZE == 4096) && defined(atomic_set_8)
5347
atomic_set_8((uint8_t *)bits, set);
5348
#else /* PAGE_SIZE <= 8192 */
5349
uintptr_t addr;
5350
int shift;
5351
5352
addr = (uintptr_t)bits;
5353
/*
5354
* Use a trick to perform a 32-bit atomic on the
5355
* containing aligned word, to not depend on the existence
5356
* of atomic_{set, clear}_{8, 16}.
5357
*/
5358
shift = addr & (sizeof(uint32_t) - 1);
5359
#if BYTE_ORDER == BIG_ENDIAN
5360
shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5361
#else
5362
shift *= NBBY;
5363
#endif
5364
addr &= ~(sizeof(uint32_t) - 1);
5365
atomic_set_32((uint32_t *)addr, set << shift);
5366
#endif /* PAGE_SIZE */
5367
}
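
/*
 * Worked example, assuming PAGE_SIZE == 4096 (vm_page_bits_t is a uint8_t)
 * on a little-endian machine without atomic_set_8(): if "bits" lies at byte
 * offset 2 within its aligned 32-bit word, then shift == 2 * NBBY == 16,
 * and setting 0x0f in the field becomes atomic_set_32(word, 0x0f << 16).
 * The other bytes of the word are unaffected because their mask bits are
 * zero.
 */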
5368
5369
static inline void
5370
vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear)
5371
{
5372
5373
#if PAGE_SIZE == 32768
5374
atomic_clear_64((uint64_t *)bits, clear);
5375
#elif PAGE_SIZE == 16384
5376
atomic_clear_32((uint32_t *)bits, clear);
5377
#elif (PAGE_SIZE == 8192) && defined(atomic_clear_16)
5378
atomic_clear_16((uint16_t *)bits, clear);
5379
#elif (PAGE_SIZE == 4096) && defined(atomic_clear_8)
5380
atomic_clear_8((uint8_t *)bits, clear);
5381
#else /* PAGE_SIZE <= 8192 */
5382
uintptr_t addr;
5383
int shift;
5384
5385
addr = (uintptr_t)bits;
5386
/*
5387
* Use a trick to perform a 32-bit atomic on the
5388
* containing aligned word, to not depend on the existence
5389
* of atomic_{set, clear}_{8, 16}.
5390
*/
5391
shift = addr & (sizeof(uint32_t) - 1);
5392
#if BYTE_ORDER == BIG_ENDIAN
5393
shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5394
#else
5395
shift *= NBBY;
5396
#endif
5397
addr &= ~(sizeof(uint32_t) - 1);
5398
atomic_clear_32((uint32_t *)addr, clear << shift);
5399
#endif /* PAGE_SIZE */
5400
}
5401
5402
static inline vm_page_bits_t
5403
vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits)
5404
{
5405
#if PAGE_SIZE == 32768
5406
uint64_t old;
5407
5408
old = *bits;
5409
while (atomic_fcmpset_64(bits, &old, newbits) == 0);
5410
return (old);
5411
#elif PAGE_SIZE == 16384
5412
uint32_t old;
5413
5414
old = *bits;
5415
while (atomic_fcmpset_32(bits, &old, newbits) == 0);
5416
return (old);
5417
#elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16)
5418
uint16_t old;
5419
5420
old = *bits;
5421
while (atomic_fcmpset_16(bits, &old, newbits) == 0);
5422
return (old);
5423
#elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8)
5424
uint8_t old;
5425
5426
old = *bits;
5427
while (atomic_fcmpset_8(bits, &old, newbits) == 0);
5428
return (old);
5429
#else /* PAGE_SIZE <= 4096*/
5430
uintptr_t addr;
5431
uint32_t old, new, mask;
5432
int shift;
5433
5434
addr = (uintptr_t)bits;
5435
/*
5436
* Use a trick to perform a 32-bit atomic on the
5437
* containing aligned word, to not depend on the existence
5438
* of atomic_{set, swap, clear}_{8, 16}.
5439
*/
5440
shift = addr & (sizeof(uint32_t) - 1);
5441
#if BYTE_ORDER == BIG_ENDIAN
5442
shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5443
#else
5444
shift *= NBBY;
5445
#endif
5446
addr &= ~(sizeof(uint32_t) - 1);
5447
mask = VM_PAGE_BITS_ALL << shift;
5448
5449
old = *bits;
5450
do {
5451
new = old & ~mask;
5452
new |= newbits << shift;
5453
} while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0);
5454
return (old >> shift);
5455
#endif /* PAGE_SIZE */
5456
}
5457
5458
/*
5459
* vm_page_set_valid_range:
5460
*
5461
* Sets portions of a page valid. The arguments are expected
5462
* to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
5463
* of any partial chunks touched by the range. The invalid portion of
5464
* such chunks will be zeroed.
5465
*
5466
 * (base + size) must be less than or equal to PAGE_SIZE.
5467
*/
5468
void
5469
vm_page_set_valid_range(vm_page_t m, int base, int size)
5470
{
5471
int endoff, frag;
5472
vm_page_bits_t pagebits;
5473
5474
vm_page_assert_busied(m);
5475
if (size == 0) /* handle degenerate case */
5476
return;
5477
5478
/*
5479
* If the base is not DEV_BSIZE aligned and the valid
5480
* bit is clear, we have to zero out a portion of the
5481
* first block.
5482
*/
5483
if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
5484
(m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
5485
pmap_zero_page_area(m, frag, base - frag);
5486
5487
/*
5488
* If the ending offset is not DEV_BSIZE aligned and the
5489
* valid bit is clear, we have to zero out a portion of
5490
* the last block.
5491
*/
5492
endoff = base + size;
5493
if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
5494
(m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
5495
pmap_zero_page_area(m, endoff,
5496
DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
5497
5498
/*
5499
* Assert that no previously invalid block that is now being validated
5500
* is already dirty.
5501
*/
5502
KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
5503
("vm_page_set_valid_range: page %p is dirty", m));
5504
5505
/*
5506
* Set valid bits inclusive of any overlap.
5507
*/
5508
pagebits = vm_page_bits(base, size);
5509
if (vm_page_xbusied(m))
5510
m->valid |= pagebits;
5511
else
5512
vm_page_bits_set(m, &m->valid, pagebits);
5513
}
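
/*
 * Worked example, assuming DEV_BSIZE == 512:
 * vm_page_set_valid_range(m, 700, 900) covers byte offsets 700..1599.  If
 * block 1 is invalid, the fringe [512, 700) is zeroed; if block 3 is
 * invalid, the fringe [1600, 2048) is zeroed.  Then vm_page_bits(700, 900)
 * == 0x0e is set in m->valid, marking DEV_BSIZE blocks 1 through 3 valid.
 */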
5514
5515
/*
5516
* Set the page dirty bits and free the invalid swap space if
5517
* present. Returns the previous dirty bits.
5518
*/
5519
vm_page_bits_t
5520
vm_page_set_dirty(vm_page_t m)
5521
{
5522
vm_page_bits_t old;
5523
5524
VM_PAGE_OBJECT_BUSY_ASSERT(m);
5525
5526
if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) {
5527
old = m->dirty;
5528
m->dirty = VM_PAGE_BITS_ALL;
5529
} else
5530
old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL);
5531
if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0)
5532
vm_pager_page_unswapped(m);
5533
5534
return (old);
5535
}
5536
5537
/*
5538
* Clear the given bits from the specified page's dirty field.
5539
*/
5540
static __inline void
5541
vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
5542
{
5543
5544
vm_page_assert_busied(m);
5545
5546
/*
5547
* If the page is xbusied and not write mapped we are the
5548
	 * only thread that can modify dirty bits. Otherwise, the pmap
	 * layer can call vm_page_dirty() without holding a distinguished
	 * lock. The combination of page busy and atomic operations
	 * suffices to guarantee consistency of the page dirty field.
5552
*/
5553
if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
5554
m->dirty &= ~pagebits;
5555
else
5556
vm_page_bits_clear(m, &m->dirty, pagebits);
5557
}
5558
5559
/*
5560
* vm_page_set_validclean:
5561
*
5562
* Sets portions of a page valid and clean. The arguments are expected
5563
* to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
5564
* of any partial chunks touched by the range. The invalid portion of
5565
* such chunks will be zero'd.
5566
*
5567
 * (base + size) must be less than or equal to PAGE_SIZE.
5568
*/
5569
void
5570
vm_page_set_validclean(vm_page_t m, int base, int size)
5571
{
5572
vm_page_bits_t oldvalid, pagebits;
5573
int endoff, frag;
5574
5575
vm_page_assert_busied(m);
5576
if (size == 0) /* handle degenerate case */
5577
return;
5578
5579
/*
5580
* If the base is not DEV_BSIZE aligned and the valid
5581
* bit is clear, we have to zero out a portion of the
5582
* first block.
5583
*/
5584
if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
5585
(m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
5586
pmap_zero_page_area(m, frag, base - frag);
5587
5588
/*
5589
* If the ending offset is not DEV_BSIZE aligned and the
5590
* valid bit is clear, we have to zero out a portion of
5591
* the last block.
5592
*/
5593
endoff = base + size;
5594
if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
5595
(m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
5596
pmap_zero_page_area(m, endoff,
5597
DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
5598
5599
/*
5600
* Set valid, clear dirty bits. If validating the entire
5601
* page we can safely clear the pmap modify bit. We also
5602
* use this opportunity to clear the PGA_NOSYNC flag. If a process
5603
* takes a write fault on a MAP_NOSYNC memory area the flag will
5604
* be set again.
5605
*
5606
* We set valid bits inclusive of any overlap, but we can only
5607
* clear dirty bits for DEV_BSIZE chunks that are fully within
5608
* the range.
5609
*/
5610
oldvalid = m->valid;
5611
pagebits = vm_page_bits(base, size);
5612
if (vm_page_xbusied(m))
5613
m->valid |= pagebits;
5614
else
5615
vm_page_bits_set(m, &m->valid, pagebits);
5616
#if 0 /* NOT YET */
5617
if ((frag = base & (DEV_BSIZE - 1)) != 0) {
5618
frag = DEV_BSIZE - frag;
5619
base += frag;
5620
size -= frag;
5621
if (size < 0)
5622
size = 0;
5623
}
5624
pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
5625
#endif
5626
if (base == 0 && size == PAGE_SIZE) {
5627
/*
5628
* The page can only be modified within the pmap if it is
5629
* mapped, and it can only be mapped if it was previously
5630
* fully valid.
5631
*/
5632
if (oldvalid == VM_PAGE_BITS_ALL)
5633
/*
5634
* Perform the pmap_clear_modify() first. Otherwise,
5635
* a concurrent pmap operation, such as
5636
* pmap_protect(), could clear a modification in the
5637
* pmap and set the dirty field on the page before
5638
* pmap_clear_modify() had begun and after the dirty
5639
* field was cleared here.
5640
*/
5641
pmap_clear_modify(m);
5642
m->dirty = 0;
5643
vm_page_aflag_clear(m, PGA_NOSYNC);
5644
} else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m))
5645
m->dirty &= ~pagebits;
5646
else
5647
vm_page_clear_dirty_mask(m, pagebits);
5648
}
5649
5650
void
5651
vm_page_clear_dirty(vm_page_t m, int base, int size)
5652
{
5653
5654
vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
5655
}
5656
5657
/*
5658
* vm_page_set_invalid:
5659
*
5660
* Invalidates DEV_BSIZE'd chunks within a page. Both the
5661
 * valid and dirty bits for the affected areas are cleared.
5662
*/
5663
void
5664
vm_page_set_invalid(vm_page_t m, int base, int size)
5665
{
5666
vm_page_bits_t bits;
5667
vm_object_t object;
5668
5669
/*
5670
* The object lock is required so that pages can't be mapped
5671
* read-only while we're in the process of invalidating them.
5672
*/
5673
object = m->object;
5674
VM_OBJECT_ASSERT_WLOCKED(object);
5675
vm_page_assert_busied(m);
5676
5677
if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
5678
size >= object->un_pager.vnp.vnp_size)
5679
bits = VM_PAGE_BITS_ALL;
5680
else
5681
bits = vm_page_bits(base, size);
5682
	if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0)
		pmap_remove_all(m);
	KASSERT((bits == 0 && vm_page_all_valid(m)) ||
	    !pmap_page_is_mapped(m),
	    ("vm_page_set_invalid: page %p is mapped", m));
	if (vm_page_xbusied(m)) {
		m->valid &= ~bits;
		m->dirty &= ~bits;
	} else {
		vm_page_bits_clear(m, &m->valid, bits);
		vm_page_bits_clear(m, &m->dirty, bits);
	}
}

/*
 * vm_page_invalid:
 *
 *	Invalidates the entire page.  The page must be busy, unmapped, and
 *	the enclosing object must be locked.  The object lock protects
 *	against a concurrent read-only pmap_enter(), which is done without
 *	holding the page busy.
 */
void
vm_page_invalid(vm_page_t m)
{

	vm_page_assert_busied(m);
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	MPASS(!pmap_page_is_mapped(m));

	if (vm_page_xbusied(m))
		m->valid = 0;
	else
		vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL);
}

/*
 * vm_page_zero_invalid()
 *
 *	The kernel assumes that the invalid portions of a page contain
 *	garbage, but such pages can be mapped into memory by user code.
 *	When this occurs, we must zero out the non-valid portions of the
 *	page so user code sees what it expects.
 *
 *	Pages are most often semi-valid when the end of a file is mapped
 *	into memory and the file's size is not page aligned.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
	int b;
	int i;

	/*
	 * Scan the valid bits looking for invalid sections that
	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
	 * valid bit may be set) have already been zeroed by
	 * vm_page_set_validclean().
	 */
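	/*
	 * Note that the scan deliberately runs one step past the last
	 * block so that a trailing run of invalid blocks is also zeroed.
	 */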
	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
		if (i == (PAGE_SIZE / DEV_BSIZE) ||
		    (m->valid & ((vm_page_bits_t)1 << i))) {
			if (i > b) {
				pmap_zero_page_area(m,
				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
			}
			b = i + 1;
		}
	}

	/*
	 * setvalid is TRUE when we can safely set the zero'd areas
	 * as being valid.  We can do this if there are no cache consistency
	 * issues, e.g. it is ok to do with UFS, but not ok to do with NFS.
	 */
	if (setvalid)
		vm_page_valid(m);
}

/*
 * vm_page_is_valid:
 *
 *	Is (partial) page valid?  Note that the case where size == 0
 *	will return FALSE in the degenerate case where the page is
 *	entirely invalid, and TRUE otherwise.
 *
 *	Some callers invoke this routine without the busy lock held and
 *	handle races via higher level locks.  Typical callers should
 *	hold a busy lock to prevent invalidation.
 */
int
vm_page_is_valid(vm_page_t m, int base, int size)
{
	vm_page_bits_t bits;

	bits = vm_page_bits(base, size);
	return (vm_page_any_valid(m) && (m->valid & bits) == bits);
}

/*
 * Returns true if all of the specified predicates are true for the entire
 * (super)page and false otherwise.
 */
bool
vm_page_ps_test(vm_page_t m, int psind, int flags, vm_page_t skip_m)
{
	vm_object_t object;
	int i, npages;

	object = m->object;
	if (skip_m != NULL && skip_m->object != object)
		return (false);
	VM_OBJECT_ASSERT_LOCKED(object);
	KASSERT(psind <= m->psind,
	    ("psind %d > psind %d of m %p", psind, m->psind, m));
	npages = atop(pagesizes[psind]);

	/*
	 * The physically contiguous pages that make up a superpage, i.e., a
	 * page with a page size index ("psind") greater than zero, will
	 * occupy adjacent entries in vm_page_array[].
	 */
	for (i = 0; i < npages; i++) {
		/* Always test object consistency, including "skip_m". */
		if (m[i].object != object)
			return (false);
		if (&m[i] == skip_m)
			continue;
		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
			return (false);
		if ((flags & PS_ALL_DIRTY) != 0) {
			/*
			 * Calling vm_page_test_dirty() or pmap_is_modified()
			 * might stop this case from spuriously returning
			 * "false".  However, that would require a write lock
			 * on the object containing "m[i]".
			 */
			if (m[i].dirty != VM_PAGE_BITS_ALL)
				return (false);
		}
		if ((flags & PS_ALL_VALID) != 0 &&
		    m[i].valid != VM_PAGE_BITS_ALL)
			return (false);
	}
	return (true);
}

/*
 * Set the page's dirty bits if the page is modified.
 */
void
vm_page_test_dirty(vm_page_t m)
{

	vm_page_assert_busied(m);
	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
		vm_page_dirty(m);
}

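/*
 * vm_page_valid:
 *
 *	Mark the entire page valid.  The page must be busied; an exclusive
 *	busy holder may store the valid bits directly, otherwise they are
 *	set atomically.
 */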
void
vm_page_valid(vm_page_t m)
{

	vm_page_assert_busied(m);
	if (vm_page_xbusied(m))
		m->valid = VM_PAGE_BITS_ALL;
	else
		vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL);
}

#ifdef INVARIANTS
void
vm_page_object_busy_assert(vm_page_t m)
{

	/*
	 * Certain of the page's fields may only be modified by the
	 * holder of a page or object busy.
	 */
	if (m->object != NULL && !vm_page_busied(m))
		VM_OBJECT_ASSERT_BUSY(m->object);
}

void
vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits)
{

	if ((bits & PGA_WRITEABLE) == 0)
		return;

	/*
	 * The PGA_WRITEABLE flag can only be set if the page is
	 * managed, is exclusively busied or the object is locked.
	 * Currently, this flag is only set by pmap_enter().
	 */
	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("PGA_WRITEABLE on unmanaged page"));
	if (!vm_page_xbusied(m))
		VM_OBJECT_ASSERT_BUSY(m->object);
}
#endif

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

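/*
 * DDB "show page": print the global page counters and paging targets.
 */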
DB_SHOW_COMMAND_FLAGS(page, vm_page_print_page_info, DB_CMD_MEMSAFE)
{

	db_printf("vm_cnt.v_free_count: %d\n", vm_free_count());
	db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count());
	db_printf("vm_cnt.v_active_count: %d\n", vm_active_count());
	db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count());
	db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count());
	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
}

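/*
 * DDB "show pageq": print the free page count and the per-domain page
 * queue lengths.
 */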
DB_SHOW_COMMAND_FLAGS(pageq, vm_page_print_pageq_info, DB_CMD_MEMSAFE)
{
	int dom;

	db_printf("pq_free %d\n", vm_free_count());
	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf(
    "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n",
		    dom,
		    vm_dom[dom].vmd_page_count,
		    vm_dom[dom].vmd_free_count,
		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt,
		    vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt);
	}
}

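/*
 * DDB "show pginfo addr": print the fields of the vm_page at "addr".  With
 * the "p" modifier, "addr" is a physical address; with the "v" modifier, it
 * is a kernel virtual address; otherwise it is a vm_page pointer.
 */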
DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
{
	vm_page_t m;
	boolean_t phys, virt;

	if (!have_addr) {
		db_printf("show pginfo addr\n");
		return;
	}

	phys = strchr(modif, 'p') != NULL;
	virt = strchr(modif, 'v') != NULL;
	if (virt)
		m = PHYS_TO_VM_PAGE(pmap_kextract(addr));
	else if (phys)
		m = PHYS_TO_VM_PAGE(addr);
	else
		m = (vm_page_t)addr;
	db_printf(
	    "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n"
	    " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
	    m->a.queue, m->ref_count, m->a.flags, m->oflags,
	    m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty);
}
#endif /* DDB */
