GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/vm/swap_pager.c
1
/*-
2
* SPDX-License-Identifier: BSD-4-Clause
3
*
4
* Copyright (c) 1998 Matthew Dillon,
5
* Copyright (c) 1994 John S. Dyson
6
* Copyright (c) 1990 University of Utah.
7
* Copyright (c) 1982, 1986, 1989, 1993
8
* The Regents of the University of California. All rights reserved.
9
*
10
* This code is derived from software contributed to Berkeley by
11
* the Systems Programming Group of the University of Utah Computer
12
* Science Department.
13
*
14
* Redistribution and use in source and binary forms, with or without
15
* modification, are permitted provided that the following conditions
16
* are met:
17
* 1. Redistributions of source code must retain the above copyright
18
* notice, this list of conditions and the following disclaimer.
19
* 2. Redistributions in binary form must reproduce the above copyright
20
* notice, this list of conditions and the following disclaimer in the
21
* documentation and/or other materials provided with the distribution.
22
* 3. All advertising materials mentioning features or use of this software
23
* must display the following acknowledgement:
24
* This product includes software developed by the University of
25
* California, Berkeley and its contributors.
26
* 4. Neither the name of the University nor the names of its contributors
27
* may be used to endorse or promote products derived from this software
28
* without specific prior written permission.
29
*
30
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40
* SUCH DAMAGE.
41
*
42
* New Swap System
43
* Matthew Dillon
44
*
45
* Radix Bitmap 'blists'.
46
*
47
* - The new swapper uses the new radix bitmap code. This should scale
48
* to arbitrarily small or arbitrarily large swap spaces and an almost
49
* arbitrary degree of fragmentation.
50
*
51
* Features:
52
*
53
* - on the fly reallocation of swap during putpages. The new system
54
* does not try to keep previously allocated swap blocks for dirty
55
* pages.
56
*
57
* - on the fly deallocation of swap
58
*
59
* - No more garbage collection required. Unnecessarily allocated swap
60
* blocks only exist for dirty vm_page_t's now and these are already
61
* cycled (in a high-load system) by the pager. We also do on-the-fly
62
* removal of invalidated swap blocks when a page is destroyed
63
* or renamed.
64
*
65
* from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
66
*/
67
68
#include "opt_vm.h"
69
70
#define EXTERR_CATEGORY EXTERR_CAT_SWAP
71
#include <sys/param.h>
72
#include <sys/bio.h>
73
#include <sys/blist.h>
74
#include <sys/buf.h>
75
#include <sys/conf.h>
76
#include <sys/disk.h>
77
#include <sys/disklabel.h>
78
#include <sys/eventhandler.h>
79
#include <sys/exterrvar.h>
80
#include <sys/fcntl.h>
81
#include <sys/limits.h>
82
#include <sys/lock.h>
83
#include <sys/kernel.h>
84
#include <sys/mount.h>
85
#include <sys/namei.h>
86
#include <sys/malloc.h>
87
#include <sys/pctrie.h>
88
#include <sys/priv.h>
89
#include <sys/proc.h>
90
#include <sys/racct.h>
91
#include <sys/resource.h>
92
#include <sys/resourcevar.h>
93
#include <sys/rwlock.h>
94
#include <sys/sbuf.h>
95
#include <sys/sysctl.h>
96
#include <sys/sysproto.h>
97
#include <sys/systm.h>
98
#include <sys/sx.h>
99
#include <sys/unistd.h>
100
#include <sys/user.h>
101
#include <sys/vmmeter.h>
102
#include <sys/vnode.h>
103
104
#include <security/mac/mac_framework.h>
105
106
#include <vm/vm.h>
107
#include <vm/pmap.h>
108
#include <vm/vm_map.h>
109
#include <vm/vm_kern.h>
110
#include <vm/vm_object.h>
111
#include <vm/vm_page.h>
112
#include <vm/vm_pager.h>
113
#include <vm/vm_pageout.h>
114
#include <vm/vm_param.h>
115
#include <vm/vm_radix.h>
116
#include <vm/swap_pager.h>
117
#include <vm/vm_extern.h>
118
#include <vm/uma.h>
119
120
#include <geom/geom.h>
121
122
/*
123
* MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64.
124
* The 64-page limit is due to the radix code (kern/subr_blist.c).
125
*/
126
#ifndef MAX_PAGEOUT_CLUSTER
127
#define MAX_PAGEOUT_CLUSTER 32
128
#endif
129
130
#if !defined(SWB_NPAGES)
131
#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
132
#endif
133
134
#define SWAP_META_PAGES PCTRIE_COUNT
135
136
/*
137
* A swblk structure maps each page index within a
138
* SWAP_META_PAGES-aligned and sized range to the address of an
139
* on-disk swap block (or SWAPBLK_NONE). The collection of these
140
* mappings for an entire vm object is implemented as a pc-trie.
141
*/
142
struct swblk {
143
vm_pindex_t p;
144
daddr_t d[SWAP_META_PAGES];
145
};
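/*
 * A minimal standalone sketch, not part of this file, of how a page index
 * splits into the pc-trie key and the slot within swblk::d[].  The kernel
 * value of SWAP_META_PAGES is PCTRIE_COUNT; 32 is assumed here purely for
 * illustration.
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <stdio.h>

#define EX_SWAP_META_PAGES	32	/* assumed value for the sketch */

int
main(void)
{
	unsigned long pindex = 1000;
	/* Key used to find the swblk in the trie (cf. swblk_lookup()). */
	unsigned long key = pindex - (pindex % EX_SWAP_META_PAGES);
	/* Slot within swblk::d[] holding the on-disk block address. */
	int slot = (int)(pindex % EX_SWAP_META_PAGES);

	printf("pindex %lu -> trie key %lu, slot %d\n", pindex, key, slot);
	return (0);
}
#endif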
146
147
/*
148
* A page_range structure records the start address and length of a sequence of
149
* mapped page addresses.
150
*/
151
struct page_range {
152
daddr_t start;
153
daddr_t num;
154
};
155
156
static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
157
static struct mtx sw_dev_mtx;
158
static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
159
static struct swdevt *swdevhd; /* Allocate from here next */
160
static int nswapdev; /* Number of swap devices */
161
int swap_pager_avail;
162
static struct sx swdev_syscall_lock; /* serialize swap(on|off) */
163
164
static __exclusive_cache_line u_long swap_reserved;
165
static u_long swap_total;
166
static int sysctl_page_shift(SYSCTL_HANDLER_ARGS);
167
168
static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
169
"VM swap stats");
170
171
SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
172
&swap_reserved, 0, sysctl_page_shift, "QU",
173
"Amount of swap storage needed to back all allocated anonymous memory.");
174
SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
175
&swap_total, 0, sysctl_page_shift, "QU",
176
"Total amount of available swap storage.");
177
178
int vm_overcommit __read_mostly = 0;
179
SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &vm_overcommit, 0,
180
"Configure virtual memory overcommit behavior. See tuning(7) "
181
"for details.");
182
static unsigned long swzone;
183
SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0,
184
"Actual size of swap metadata zone");
185
static unsigned long swap_maxpages;
186
SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
187
"Maximum amount of swap supported");
188
189
static COUNTER_U64_DEFINE_EARLY(swap_free_deferred);
190
SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred,
191
CTLFLAG_RD, &swap_free_deferred,
192
"Number of pages that deferred freeing swap space");
193
194
static COUNTER_U64_DEFINE_EARLY(swap_free_completed);
195
SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed,
196
CTLFLAG_RD, &swap_free_completed,
197
"Number of deferred frees completed");
198
199
static int
200
sysctl_page_shift(SYSCTL_HANDLER_ARGS)
201
{
202
uint64_t newval;
203
u_long value = *(u_long *)arg1;
204
205
newval = ((uint64_t)value) << PAGE_SHIFT;
206
return (sysctl_handle_64(oidp, &newval, 0, req));
207
}
208
209
static bool
210
swap_reserve_by_cred_rlimit(u_long pincr, struct ucred *cred, int oc)
211
{
212
struct uidinfo *uip;
213
u_long prev;
214
215
uip = cred->cr_ruidinfo;
216
217
prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr);
218
if ((oc & SWAP_RESERVE_RLIMIT_ON) != 0 &&
219
prev + pincr > lim_cur(curthread, RLIMIT_SWAP) &&
220
priv_check(curthread, PRIV_VM_SWAP_NORLIMIT) != 0) {
221
prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr);
222
KASSERT(prev >= pincr,
223
("negative vmsize for uid %d\n", uip->ui_uid));
224
return (false);
225
}
226
return (true);
227
}
228
229
static void
230
swap_release_by_cred_rlimit(u_long pdecr, struct ucred *cred)
231
{
232
struct uidinfo *uip;
233
#ifdef INVARIANTS
234
u_long prev;
235
#endif
236
237
uip = cred->cr_ruidinfo;
238
239
#ifdef INVARIANTS
240
prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr);
241
KASSERT(prev >= pdecr,
242
("negative vmsize for uid %d, prev %#jx decr %#jx\n",
243
uip->ui_uid, (uintmax_t)prev, (uintmax_t)pdecr));
244
#else
245
atomic_subtract_long(&uip->ui_vmsize, pdecr);
246
#endif
247
}
248
249
static void
250
swap_reserve_force_rlimit(u_long pincr, struct ucred *cred)
251
{
252
struct uidinfo *uip;
253
254
uip = cred->cr_ruidinfo;
255
atomic_add_long(&uip->ui_vmsize, pincr);
256
}
257
258
bool
259
swap_reserve(vm_ooffset_t incr)
260
{
261
262
return (swap_reserve_by_cred(incr, curthread->td_ucred));
263
}
264
265
bool
266
swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
267
{
268
u_long r, s, prev, pincr;
269
#ifdef RACCT
270
int error;
271
#endif
272
int oc;
273
static int curfail;
274
static struct timeval lastfail;
275
276
KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK",
277
__func__, (uintmax_t)incr));
278
279
#ifdef RACCT
280
if (RACCT_ENABLED()) {
281
PROC_LOCK(curproc);
282
error = racct_add(curproc, RACCT_SWAP, incr);
283
PROC_UNLOCK(curproc);
284
if (error != 0)
285
return (false);
286
}
287
#endif
288
289
pincr = atop(incr);
290
prev = atomic_fetchadd_long(&swap_reserved, pincr);
291
r = prev + pincr;
292
s = swap_total;
293
oc = atomic_load_int(&vm_overcommit);
294
if (r > s && (oc & SWAP_RESERVE_ALLOW_NONWIRED) != 0) {
295
s += vm_cnt.v_page_count - vm_cnt.v_free_reserved -
296
vm_wire_count();
297
}
298
if ((oc & SWAP_RESERVE_FORCE_ON) != 0 && r > s &&
299
priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) != 0) {
300
prev = atomic_fetchadd_long(&swap_reserved, -pincr);
301
KASSERT(prev >= pincr,
302
("swap_reserved < incr on overcommit fail"));
303
goto out_error;
304
}
305
306
if (!swap_reserve_by_cred_rlimit(pincr, cred, oc)) {
307
prev = atomic_fetchadd_long(&swap_reserved, -pincr);
308
KASSERT(prev >= pincr,
309
("swap_reserved < incr on overcommit fail"));
310
goto out_error;
311
}
312
313
return (true);
314
315
out_error:
316
if (ppsratecheck(&lastfail, &curfail, 1)) {
317
printf("uid %d, pid %d: swap reservation "
318
"for %jd bytes failed\n",
319
cred->cr_ruidinfo->ui_uid, curproc->p_pid, incr);
320
}
321
#ifdef RACCT
322
if (RACCT_ENABLED()) {
323
PROC_LOCK(curproc);
324
racct_sub(curproc, RACCT_SWAP, incr);
325
PROC_UNLOCK(curproc);
326
}
327
#endif
328
329
return (false);
330
}
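/*
 * A standalone C11 sketch, not part of this file, of the reserve-then-roll-back
 * pattern used by swap_reserve_by_cred().  The kernel uses
 * atomic_fetchadd_long(); atomic_fetch_add() plays the same role here, and a
 * fixed limit stands in for the overcommit policy checks.
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong reserved;			/* pages reserved so far */
static const unsigned long limit = 1024;	/* assumed commit limit, pages */

static bool
reserve_pages(unsigned long pincr)
{
	unsigned long prev;

	/* Optimistically charge the pages... */
	prev = atomic_fetch_add(&reserved, pincr);
	if (prev + pincr > limit) {
		/* ...and roll the charge back if the limit was exceeded. */
		atomic_fetch_sub(&reserved, pincr);
		return (false);
	}
	return (true);
}

int
main(void)
{
	printf("reserve 512 pages: %d\n", reserve_pages(512));
	printf("reserve 600 pages: %d\n", reserve_pages(600));	/* fails */
	return (0);
}
#endif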
331
332
void
333
swap_reserve_force_by_cred(vm_ooffset_t incr, struct ucred *cred)
334
{
335
u_long pincr;
336
337
KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK",
338
__func__, (uintmax_t)incr));
339
340
#ifdef RACCT
341
if (RACCT_ENABLED()) {
342
PROC_LOCK(curproc);
343
racct_add_force(curproc, RACCT_SWAP, incr);
344
PROC_UNLOCK(curproc);
345
}
346
#endif
347
pincr = atop(incr);
348
atomic_add_long(&swap_reserved, pincr);
349
swap_reserve_force_rlimit(pincr, cred);
350
}
351
352
void
353
swap_reserve_force(vm_ooffset_t incr)
354
{
355
swap_reserve_force_by_cred(incr, curthread->td_ucred);
356
}
357
358
void
359
swap_release(vm_ooffset_t decr)
360
{
361
struct ucred *cred;
362
363
PROC_LOCK(curproc);
364
cred = curproc->p_ucred;
365
swap_release_by_cred(decr, cred);
366
PROC_UNLOCK(curproc);
367
}
368
369
void
370
swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
371
{
372
u_long pdecr;
373
#ifdef INVARIANTS
374
u_long prev;
375
#endif
376
377
KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK",
378
__func__, (uintmax_t)decr));
379
380
pdecr = atop(decr);
381
#ifdef INVARIANTS
382
prev = atomic_fetchadd_long(&swap_reserved, -pdecr);
383
KASSERT(prev >= pdecr, ("swap_reserved %#jx < decr %#jx",
384
(uintmax_t)prev, (uintmax_t)pdecr));
385
#else
386
atomic_subtract_long(&swap_reserved, pdecr);
387
#endif
388
389
swap_release_by_cred_rlimit(pdecr, cred);
390
#ifdef RACCT
391
if (racct_enable)
392
racct_sub_cred(cred, RACCT_SWAP, decr);
393
#endif
394
}
395
396
static bool swap_pager_full = true; /* swap space exhaustion (task killing) */
397
bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
398
static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */
399
static int nsw_wcount_async; /* limit async write buffers */
400
static int nsw_wcount_async_max;/* assigned maximum */
401
int nsw_cluster_max; /* maximum VOP I/O allowed */
402
403
static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
404
SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
405
CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
406
"Maximum running async swap ops");
407
static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
408
SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
409
CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
410
"Swap Fragmentation Info");
411
412
static struct sx sw_alloc_sx;
413
414
/*
415
* "named" and "unnamed" anon region objects. Try to reduce the overhead
416
* of searching a named list by hashing it just a little.
417
*/
418
419
#define NOBJLISTS 8
420
421
#define NOBJLIST(handle) \
422
(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
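/*
 * A standalone sketch, not part of this file, of the bucket computation done
 * by NOBJLIST().  The handle's low bits are mostly alignment, so it is
 * shifted before being masked into one of the NOBJLISTS lists.
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <stdint.h>
#include <stdio.h>

#define EX_NOBJLISTS	8

int
main(void)
{
	void *handle = (void *)(intptr_t)0x1a2b30;	/* arbitrary handle */
	int bucket = ((int)(intptr_t)handle >> 4) & (EX_NOBJLISTS - 1);

	printf("handle %p hashes to list %d\n", handle, bucket);
	return (0);
}
#endif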
423
424
static struct pagerlst swap_pager_object_list[NOBJLISTS];
425
static uma_zone_t swwbuf_zone;
426
static uma_zone_t swrbuf_zone;
427
static uma_zone_t swblk_zone;
428
static uma_zone_t swpctrie_zone;
429
430
/*
431
* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
432
* calls hooked from other parts of the VM system and do not appear here.
433
* (see vm/swap_pager.h).
434
*/
435
static vm_object_t
436
swap_pager_alloc(void *handle, vm_ooffset_t size,
437
vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
438
static void swap_pager_dealloc(vm_object_t object);
439
static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *,
440
int *);
441
static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
442
int *, pgo_getpages_iodone_t, void *);
443
static void swap_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
444
static boolean_t
445
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
446
static void swap_pager_init(void);
447
static void swap_pager_unswapped(vm_page_t);
448
static void swap_pager_swapoff(struct swdevt *sp);
449
static void swap_pager_update_writecount(vm_object_t object,
450
vm_offset_t start, vm_offset_t end);
451
static void swap_pager_release_writecount(vm_object_t object,
452
vm_offset_t start, vm_offset_t end);
453
static void swap_pager_freespace_pgo(vm_object_t object, vm_pindex_t start,
454
vm_size_t size);
455
456
const struct pagerops swappagerops = {
457
.pgo_kvme_type = KVME_TYPE_SWAP,
458
.pgo_init = swap_pager_init, /* early system initialization of pager */
459
.pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */
460
.pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
461
.pgo_getpages = swap_pager_getpages, /* pagein */
462
.pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */
463
.pgo_putpages = swap_pager_putpages, /* pageout */
464
.pgo_haspage = swap_pager_haspage, /* get backing store status for page */
465
.pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */
466
.pgo_update_writecount = swap_pager_update_writecount,
467
.pgo_release_writecount = swap_pager_release_writecount,
468
.pgo_freespace = swap_pager_freespace_pgo,
469
};
470
471
/*
472
* swap_*() routines are externally accessible. swp_*() routines are
473
* internal.
474
*/
475
static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */
476
static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */
477
478
SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0,
479
"Maximum size of a swap block in pages");
480
481
static void swp_sizecheck(void);
482
static void swp_pager_async_iodone(struct buf *bp);
483
static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit);
484
static void swp_pager_free_empty_swblk(vm_object_t, struct swblk *sb);
485
static int swapongeom(struct vnode *);
486
static int swaponvp(struct thread *, struct vnode *, u_long);
487
static int swapoff_one(struct swdevt *sp, struct ucred *cred,
488
u_int flags);
489
490
/*
491
* Swap bitmap functions
492
*/
493
static void swp_pager_freeswapspace(const struct page_range *range);
494
static daddr_t swp_pager_getswapspace(int *npages);
495
496
/*
497
* Metadata functions
498
*/
499
static daddr_t swp_pager_meta_build(struct pctrie_iter *, vm_object_t object,
500
vm_pindex_t, daddr_t, bool);
501
static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t,
502
vm_size_t *);
503
static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst,
504
vm_pindex_t pindex, vm_pindex_t count);
505
static void swp_pager_meta_free_all(vm_object_t);
506
static daddr_t swp_pager_meta_lookup(struct pctrie_iter *, vm_pindex_t);
507
508
static void
509
swp_pager_init_freerange(struct page_range *range)
510
{
511
range->start = SWAPBLK_NONE;
512
range->num = 0;
513
}
514
515
static void
516
swp_pager_update_freerange(struct page_range *range, daddr_t addr)
517
{
518
if (range->start + range->num == addr) {
519
range->num++;
520
} else {
521
swp_pager_freeswapspace(range);
522
range->start = addr;
523
range->num = 1;
524
}
525
}
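/*
 * A standalone sketch, not part of this file, of the free-range coalescing
 * done by swp_pager_update_freerange().  Adjacent block addresses extend the
 * pending range; a discontiguity flushes it, modelled here by printing it.
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <stdio.h>

struct ex_range {
	long start;
	long num;
};

static void
ex_flush(struct ex_range *r)
{
	if (r->num != 0)
		printf("free blocks [%ld, %ld)\n", r->start, r->start + r->num);
}

static void
ex_update(struct ex_range *r, long addr)
{
	if (r->start + r->num == addr) {
		r->num++;
	} else {
		ex_flush(r);
		r->start = addr;
		r->num = 1;
	}
}

int
main(void)
{
	struct ex_range r = { -1, 0 };
	long addrs[] = { 100, 101, 102, 200, 201 };

	for (int i = 0; i < 5; i++)
		ex_update(&r, addrs[i]);	/* flushes [100, 103) at 200 */
	ex_flush(&r);				/* flushes the trailing [200, 202) */
	return (0);
}
#endif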
526
527
static void *
528
swblk_trie_alloc(struct pctrie *ptree)
529
{
530
531
return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ?
532
M_USE_RESERVE : 0)));
533
}
534
535
static void
536
swblk_trie_free(struct pctrie *ptree, void *node)
537
{
538
539
uma_zfree(swpctrie_zone, node);
540
}
541
542
static int
543
swblk_start(struct swblk *sb, vm_pindex_t pindex)
544
{
545
return (sb == NULL || sb->p >= pindex ?
546
0 : pindex % SWAP_META_PAGES);
547
}
548
549
PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
550
551
static struct swblk *
552
swblk_lookup(vm_object_t object, vm_pindex_t pindex)
553
{
554
return (SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
555
rounddown(pindex, SWAP_META_PAGES)));
556
}
557
558
static void
559
swblk_lookup_remove(vm_object_t object, struct swblk *sb)
560
{
561
SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
562
}
563
564
static bool
565
swblk_is_empty(vm_object_t object)
566
{
567
return (pctrie_is_empty(&object->un_pager.swp.swp_blks));
568
}
569
570
static struct swblk *
571
swblk_iter_lookup_ge(struct pctrie_iter *blks, vm_pindex_t pindex)
572
{
573
return (SWAP_PCTRIE_ITER_LOOKUP_GE(blks,
574
rounddown(pindex, SWAP_META_PAGES)));
575
}
576
577
static void
578
swblk_iter_init_only(struct pctrie_iter *blks, vm_object_t object)
579
{
580
VM_OBJECT_ASSERT_LOCKED(object);
581
MPASS((object->flags & OBJ_SWAP) != 0);
582
pctrie_iter_init(blks, &object->un_pager.swp.swp_blks);
583
}
584
585
586
static struct swblk *
587
swblk_iter_init(struct pctrie_iter *blks, vm_object_t object,
588
vm_pindex_t pindex)
589
{
590
swblk_iter_init_only(blks, object);
591
return (swblk_iter_lookup_ge(blks, pindex));
592
}
593
594
static struct swblk *
595
swblk_iter_reinit(struct pctrie_iter *blks, vm_object_t object,
596
vm_pindex_t pindex)
597
{
598
swblk_iter_init_only(blks, object);
599
return (SWAP_PCTRIE_ITER_LOOKUP(blks,
600
rounddown(pindex, SWAP_META_PAGES)));
601
}
602
603
static struct swblk *
604
swblk_iter_limit_init(struct pctrie_iter *blks, vm_object_t object,
605
vm_pindex_t pindex, vm_pindex_t limit)
606
{
607
VM_OBJECT_ASSERT_LOCKED(object);
608
MPASS((object->flags & OBJ_SWAP) != 0);
609
pctrie_iter_limit_init(blks, &object->un_pager.swp.swp_blks, limit);
610
return (swblk_iter_lookup_ge(blks, pindex));
611
}
612
613
static struct swblk *
614
swblk_iter_next(struct pctrie_iter *blks)
615
{
616
return (SWAP_PCTRIE_ITER_JUMP_GE(blks, SWAP_META_PAGES));
617
}
618
619
static struct swblk *
620
swblk_iter_lookup(struct pctrie_iter *blks, vm_pindex_t pindex)
621
{
622
return (SWAP_PCTRIE_ITER_LOOKUP(blks,
623
rounddown(pindex, SWAP_META_PAGES)));
624
}
625
626
static int
627
swblk_iter_insert(struct pctrie_iter *blks, struct swblk *sb)
628
{
629
return (SWAP_PCTRIE_ITER_INSERT(blks, sb));
630
}
631
632
static void
633
swblk_iter_remove(struct pctrie_iter *blks)
634
{
635
SWAP_PCTRIE_ITER_REMOVE(blks);
636
}
637
638
/*
639
* SWP_SIZECHECK() - update swap_pager_full indication
640
*
641
* update the swap_pager_almost_full indication and warn when we are
642
* about to run out of swap space, using lowat/hiwat hysteresis.
643
*
644
* Clear swap_pager_full ( task killing ) indication when lowat is met.
645
*
646
* No restrictions on call
647
* This routine may not block.
648
*/
649
static void
650
swp_sizecheck(void)
651
{
652
653
if (swap_pager_avail < nswap_lowat) {
654
if (!swap_pager_almost_full) {
655
printf("swap_pager: out of swap space\n");
656
swap_pager_almost_full = true;
657
}
658
} else {
659
swap_pager_full = false;
660
if (swap_pager_avail > nswap_hiwat)
661
swap_pager_almost_full = false;
662
}
663
}
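/*
 * A standalone sketch, not part of this file, of the lowat/hiwat hysteresis
 * applied to swap_pager_almost_full by swp_sizecheck().  The thresholds are
 * the defaults declared below (128 and 512 pages).
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <stdbool.h>
#include <stdio.h>

static const int lowat = 128, hiwat = 512;

static bool
almost_full_update(bool almost_full, int avail)
{
	if (avail < lowat)
		return (true);		/* latch on below the low watermark */
	if (avail > hiwat)
		return (false);		/* release only above the high one */
	return (almost_full);		/* in between, keep the old state */
}

int
main(void)
{
	bool af = false;
	int samples[] = { 600, 200, 100, 200, 400, 600 };

	for (int i = 0; i < 6; i++) {
		af = almost_full_update(af, samples[i]);
		printf("avail %4d -> almost_full %d\n", samples[i], af);
	}
	return (0);
}
#endif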
664
665
/*
666
* SWAP_PAGER_INIT() - initialize the swap pager!
667
*
668
* Expected to be started from system init. NOTE: This code is run
669
* before much else so be careful what you depend on. Most of the VM
670
* system has yet to be initialized at this point.
671
*/
672
static void
673
swap_pager_init(void)
674
{
675
/*
676
* Initialize object lists
677
*/
678
int i;
679
680
for (i = 0; i < NOBJLISTS; ++i)
681
TAILQ_INIT(&swap_pager_object_list[i]);
682
mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
683
sx_init(&sw_alloc_sx, "swspsx");
684
sx_init(&swdev_syscall_lock, "swsysc");
685
686
/*
687
* The nsw_cluster_max is constrained by the bp->b_pages[]
688
* array, which has maxphys / PAGE_SIZE entries, and our locally
689
* defined MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
690
* constrained by the swap device interleave stripe size.
691
*
692
* Initialized early so that GEOM_ELI can see it.
693
*/
694
nsw_cluster_max = min(maxphys / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
695
}
696
697
/*
698
* SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
699
*
700
* Expected to be started from pageout process once, prior to entering
701
* its main loop.
702
*/
703
void
704
swap_pager_swap_init(void)
705
{
706
unsigned long n, n2;
707
708
/*
709
* Number of in-transit swap bp operations. Don't
710
* exhaust the pbufs completely. Make sure we
711
* initialize workable values (0 will work for hysteresis
712
* but it isn't very efficient).
713
*
714
* Currently we hardwire nsw_wcount_async to 4. This limit is
715
* designed to prevent other I/O from having high latencies due to
716
* our pageout I/O. The value 4 works well for one or two active swap
717
* devices but is probably a little low if you have more. Even so,
718
* a higher value would probably generate only a limited improvement
719
* with three or four active swap devices since the system does not
720
* typically have to pageout at extreme bandwidths. We will want
721
* at least 2 per swap device, and 4 is a pretty good value if you
722
* have one NFS swap device due to the command/ack latency over NFS.
723
* So it all works out pretty well.
724
*
725
* nsw_cluster_max is initialized in swap_pager_init().
726
*/
727
728
nsw_wcount_async = 4;
729
nsw_wcount_async_max = nsw_wcount_async;
730
mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF);
731
732
swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4);
733
swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2);
734
735
/*
736
* Initialize our zone, taking the user's requested size or
737
* estimating the number we need based on the number of pages
738
* in the system.
739
*/
740
n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) :
741
vm_cnt.v_page_count / 2;
742
swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
743
pctrie_zone_init, NULL, UMA_ALIGN_PTR, 0);
744
swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
745
NULL, NULL, _Alignof(struct swblk) - 1, 0);
746
n2 = n;
747
do {
748
if (uma_zone_reserve_kva(swblk_zone, n))
749
break;
750
/*
751
* if the allocation failed, try a zone two thirds the
752
* size of the previous attempt.
753
*/
754
n -= ((n + 2) / 3);
755
} while (n > 0);
756
757
/*
758
* Often uma_zone_reserve_kva() cannot reserve exactly the
759
* requested size. Account for the difference when
760
* calculating swap_maxpages.
761
*/
762
n = uma_zone_get_max(swblk_zone);
763
764
if (n < n2)
765
printf("Swap blk zone entries changed from %lu to %lu.\n",
766
n2, n);
767
/* absolute maximum we can handle assuming 100% efficiency */
768
swap_maxpages = n * SWAP_META_PAGES;
769
swzone = n * sizeof(struct swblk);
770
if (!uma_zone_reserve_kva(swpctrie_zone, n))
771
printf("Cannot reserve swap pctrie zone, "
772
"reduce kern.maxswzone.\n");
773
}
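/*
 * A standalone sketch, not part of this file, of the "shrink by a third and
 * retry" loop used above when reserving KVA for the swblk zone.  The
 * fake_reserve() stand-in for uma_zone_reserve_kva() is assumed to accept at
 * most 10000 entries.
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <stdbool.h>
#include <stdio.h>

static bool
fake_reserve(unsigned long n)
{
	return (n <= 10000);
}

int
main(void)
{
	unsigned long n = 40000;

	do {
		if (fake_reserve(n)) {
			printf("reserved %lu entries\n", n);
			break;
		}
		n -= (n + 2) / 3;	/* try a zone two thirds the size */
	} while (n > 0);
	return (0);
}
#endif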
774
775
bool
776
swap_pager_init_object(vm_object_t object, void *handle, struct ucred *cred,
777
vm_ooffset_t size, vm_ooffset_t offset)
778
{
779
if (cred != NULL) {
780
if (!swap_reserve_by_cred(size, cred))
781
return (false);
782
crhold(cred);
783
}
784
785
object->un_pager.swp.writemappings = 0;
786
object->handle = handle;
787
object->cred = cred;
788
return (true);
789
}
790
791
static vm_object_t
792
swap_pager_alloc_init(objtype_t otype, void *handle, struct ucred *cred,
793
vm_ooffset_t size, vm_ooffset_t offset)
794
{
795
vm_object_t object;
796
797
/*
798
* The un_pager.swp.swp_blks trie is initialized by
799
* vm_object_allocate() to ensure the correct order of
800
* visibility to other threads.
801
*/
802
object = vm_object_allocate(otype, OFF_TO_IDX(offset +
803
PAGE_MASK + size));
804
805
if (!swap_pager_init_object(object, handle, cred, size, offset)) {
806
vm_object_deallocate(object);
807
return (NULL);
808
}
809
return (object);
810
}
811
812
/*
813
* SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
814
* its metadata structures.
815
*
816
* This routine is called from the mmap and fork code to create a new
817
* OBJT_SWAP object.
818
*
819
* This routine must ensure that no live duplicate is created for
820
* the named object request, which is protected against by
821
* holding the sw_alloc_sx lock in case handle != NULL.
822
*/
823
static vm_object_t
824
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
825
vm_ooffset_t offset, struct ucred *cred)
826
{
827
vm_object_t object;
828
829
if (handle != NULL) {
830
/*
831
* Reference existing named region or allocate new one. There
832
* should not be a race here against swp_pager_meta_build()
833
* as called from vm_page_remove() in regards to the lookup
834
* of the handle.
835
*/
836
sx_xlock(&sw_alloc_sx);
837
object = vm_pager_object_lookup(NOBJLIST(handle), handle);
838
if (object == NULL) {
839
object = swap_pager_alloc_init(OBJT_SWAP, handle, cred,
840
size, offset);
841
if (object != NULL) {
842
TAILQ_INSERT_TAIL(NOBJLIST(object->handle),
843
object, pager_object_list);
844
}
845
}
846
sx_xunlock(&sw_alloc_sx);
847
} else {
848
object = swap_pager_alloc_init(OBJT_SWAP, handle, cred,
849
size, offset);
850
}
851
return (object);
852
}
853
854
/*
855
* SWAP_PAGER_DEALLOC() - remove swap metadata from object
856
*
857
* The swap backing for the object is destroyed. The code is
858
* designed such that we can reinstantiate it later, but this
859
* routine is typically called only when the entire object is
860
* about to be destroyed.
861
*
862
* The object must be locked.
863
*/
864
static void
865
swap_pager_dealloc(vm_object_t object)
866
{
867
868
VM_OBJECT_ASSERT_WLOCKED(object);
869
KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj"));
870
871
/*
872
* Remove from list right away so lookups will fail if we block for
873
* pageout completion.
874
*/
875
if ((object->flags & OBJ_ANON) == 0 && object->handle != NULL) {
876
VM_OBJECT_WUNLOCK(object);
877
sx_xlock(&sw_alloc_sx);
878
TAILQ_REMOVE(NOBJLIST(object->handle), object,
879
pager_object_list);
880
sx_xunlock(&sw_alloc_sx);
881
VM_OBJECT_WLOCK(object);
882
}
883
884
vm_object_pip_wait(object, "swpdea");
885
886
/*
887
* Free all remaining metadata. We only bother to free it from
888
* the swap meta data. We do not attempt to free swapblk's still
889
* associated with vm_page_t's for this object. We do not care
890
* if paging is still in progress on some objects.
891
*/
892
swp_pager_meta_free_all(object);
893
object->handle = NULL;
894
object->type = OBJT_DEAD;
895
896
/*
897
* Release the allocation charge.
898
*/
899
if (object->cred != NULL) {
900
swap_release_by_cred(ptoa(object->size), object->cred);
901
crfree(object->cred);
902
object->cred = NULL;
903
}
904
905
/*
906
* Hide the object from swap_pager_swapoff().
907
*/
908
vm_object_clear_flag(object, OBJ_SWAP);
909
}
910
911
/************************************************************************
912
* SWAP PAGER BITMAP ROUTINES *
913
************************************************************************/
914
915
/*
916
* SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
917
*
918
* Allocate swap for up to the requested number of pages. The
919
* starting swap block number (a page index) is returned or
920
* SWAPBLK_NONE if the allocation failed.
921
*
922
* Also has the side effect of advising that somebody made a mistake
923
* when they configured swap and didn't configure enough.
924
*
925
* This routine may not sleep.
926
*
927
* We allocate in round-robin fashion from the configured devices.
928
*/
929
static daddr_t
930
swp_pager_getswapspace(int *io_npages)
931
{
932
daddr_t blk;
933
struct swdevt *sp;
934
int mpages, npages;
935
936
KASSERT(*io_npages >= 1,
937
("%s: npages not positive", __func__));
938
blk = SWAPBLK_NONE;
939
mpages = *io_npages;
940
npages = imin(BLIST_MAX_ALLOC, mpages);
941
mtx_lock(&sw_dev_mtx);
942
sp = swdevhd;
943
while (!TAILQ_EMPTY(&swtailq)) {
944
if (sp == NULL)
945
sp = TAILQ_FIRST(&swtailq);
946
if ((sp->sw_flags & SW_CLOSING) == 0)
947
blk = blist_alloc(sp->sw_blist, &npages, mpages);
948
if (blk != SWAPBLK_NONE)
949
break;
950
sp = TAILQ_NEXT(sp, sw_list);
951
if (swdevhd == sp) {
952
if (npages == 1)
953
break;
954
mpages = npages - 1;
955
npages >>= 1;
956
}
957
}
958
if (blk != SWAPBLK_NONE) {
959
*io_npages = npages;
960
blk += sp->sw_first;
961
sp->sw_used += npages;
962
swap_pager_avail -= npages;
963
swp_sizecheck();
964
swdevhd = TAILQ_NEXT(sp, sw_list);
965
} else {
966
if (!swap_pager_full) {
967
printf("swp_pager_getswapspace(%d): failed\n",
968
*io_npages);
969
swap_pager_full = swap_pager_almost_full = true;
970
}
971
swdevhd = NULL;
972
}
973
mtx_unlock(&sw_dev_mtx);
974
return (blk);
975
}
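/*
 * A standalone sketch, not part of this file, of the request back-off in
 * swp_pager_getswapspace(): after a failed pass over the devices the request
 * is roughly halved until it reaches a single page.  Device round-robin is
 * omitted, and ex_try() is an assumed stand-in for blist_alloc() that only
 * satisfies requests of at most 4 contiguous pages.
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <stdbool.h>
#include <stdio.h>

static bool
ex_try(int npages)
{
	return (npages <= 4);
}

int
main(void)
{
	int npages = 32;

	for (;;) {
		if (ex_try(npages)) {
			printf("allocated %d contiguous pages\n", npages);
			break;
		}
		if (npages == 1) {
			printf("allocation failed\n");
			break;
		}
		npages >>= 1;	/* halve the request and retry */
	}
	return (0);
}
#endif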
976
977
static bool
978
swp_pager_isondev(daddr_t blk, struct swdevt *sp)
979
{
980
981
return (blk >= sp->sw_first && blk < sp->sw_end);
982
}
983
984
static void
985
swp_pager_strategy(struct buf *bp)
986
{
987
struct swdevt *sp;
988
989
mtx_lock(&sw_dev_mtx);
990
TAILQ_FOREACH(sp, &swtailq, sw_list) {
991
if (swp_pager_isondev(bp->b_blkno, sp)) {
992
mtx_unlock(&sw_dev_mtx);
993
if ((sp->sw_flags & SW_UNMAPPED) != 0 &&
994
unmapped_buf_allowed) {
995
bp->b_data = unmapped_buf;
996
bp->b_offset = 0;
997
} else {
998
pmap_qenter((vm_offset_t)bp->b_data,
999
&bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
1000
}
1001
sp->sw_strategy(bp, sp);
1002
return;
1003
}
1004
}
1005
panic("Swapdev not found");
1006
}
1007
1008
/*
1009
* SWP_PAGER_FREESWAPSPACE() - free raw swap space
1010
*
1011
* This routine returns the specified swap blocks back to the bitmap.
1012
*
1013
* This routine may not sleep.
1014
*/
1015
static void
1016
swp_pager_freeswapspace(const struct page_range *range)
1017
{
1018
daddr_t blk, npages;
1019
struct swdevt *sp;
1020
1021
blk = range->start;
1022
npages = range->num;
1023
if (npages == 0)
1024
return;
1025
mtx_lock(&sw_dev_mtx);
1026
TAILQ_FOREACH(sp, &swtailq, sw_list) {
1027
if (swp_pager_isondev(blk, sp)) {
1028
sp->sw_used -= npages;
1029
/*
1030
* If we are attempting to stop swapping on
1031
* this device, we don't want to mark any
1032
* blocks free lest they be reused.
1033
*/
1034
if ((sp->sw_flags & SW_CLOSING) == 0) {
1035
blist_free(sp->sw_blist, blk - sp->sw_first,
1036
npages);
1037
swap_pager_avail += npages;
1038
swp_sizecheck();
1039
}
1040
mtx_unlock(&sw_dev_mtx);
1041
return;
1042
}
1043
}
1044
panic("Swapdev not found");
1045
}
1046
1047
/*
1048
* SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats
1049
*/
1050
static int
1051
sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
1052
{
1053
struct sbuf sbuf;
1054
struct swdevt *sp;
1055
const char *devname;
1056
int error;
1057
1058
error = sysctl_wire_old_buffer(req, 0);
1059
if (error != 0)
1060
return (error);
1061
sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
1062
mtx_lock(&sw_dev_mtx);
1063
TAILQ_FOREACH(sp, &swtailq, sw_list) {
1064
if (vn_isdisk(sp->sw_vp))
1065
devname = devtoname(sp->sw_vp->v_rdev);
1066
else
1067
devname = "[file]";
1068
sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
1069
blist_stats(sp->sw_blist, &sbuf);
1070
}
1071
mtx_unlock(&sw_dev_mtx);
1072
error = sbuf_finish(&sbuf);
1073
sbuf_delete(&sbuf);
1074
return (error);
1075
}
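/*
 * A standalone userland sketch, not part of this file, reading the string
 * sysctl backed by this handler.  The same text is available from a shell
 * via "sysctl vm.swap_fragmentation".
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len = 0;
	char *buf;

	/* First call sizes the buffer, second call fetches the text. */
	if (sysctlbyname("vm.swap_fragmentation", NULL, &len, NULL, 0) != 0)
		return (1);
	buf = malloc(len + 1);
	if (buf == NULL ||
	    sysctlbyname("vm.swap_fragmentation", buf, &len, NULL, 0) != 0)
		return (1);
	buf[len] = '\0';
	printf("%s\n", buf);
	free(buf);
	return (0);
}
#endif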
1076
1077
/*
1078
* SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
1079
* range within an object.
1080
*
1081
* This routine removes swapblk assignments from swap metadata.
1082
*
1083
* The external callers of this routine typically have already destroyed
1084
* or renamed vm_page_t's associated with this range in the object so
1085
* we should be ok.
1086
*
1087
* The object must be locked.
1088
*/
1089
void
1090
swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size,
1091
vm_size_t *freed)
1092
{
1093
MPASS((object->flags & OBJ_SWAP) != 0);
1094
1095
swp_pager_meta_free(object, start, size, freed);
1096
}
1097
1098
static void
1099
swap_pager_freespace_pgo(vm_object_t object, vm_pindex_t start, vm_size_t size)
1100
{
1101
MPASS((object->flags & OBJ_SWAP) != 0);
1102
1103
swp_pager_meta_free(object, start, size, NULL);
1104
}
1105
1106
/*
1107
* SWAP_PAGER_RESERVE() - reserve swap blocks in object
1108
*
1109
* Assigns swap blocks to the specified range within the object. The
1110
* swap blocks are not zeroed. Any previous swap assignment is destroyed.
1111
*
1112
* Returns 0 on success, -1 on failure.
1113
*/
1114
int
1115
swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
1116
{
1117
struct pctrie_iter blks;
1118
struct page_range range;
1119
daddr_t addr, blk;
1120
vm_pindex_t i, j;
1121
int n;
1122
1123
swp_pager_init_freerange(&range);
1124
VM_OBJECT_WLOCK(object);
1125
swblk_iter_init_only(&blks, object);
1126
for (i = 0; i < size; i += n) {
1127
n = MIN(size - i, INT_MAX);
1128
blk = swp_pager_getswapspace(&n);
1129
if (blk == SWAPBLK_NONE) {
1130
swp_pager_meta_free(object, start, i, NULL);
1131
VM_OBJECT_WUNLOCK(object);
1132
return (-1);
1133
}
1134
for (j = 0; j < n; ++j) {
1135
addr = swp_pager_meta_build(&blks, object,
1136
start + i + j, blk + j, false);
1137
if (addr != SWAPBLK_NONE)
1138
swp_pager_update_freerange(&range, addr);
1139
}
1140
}
1141
swp_pager_freeswapspace(&range);
1142
VM_OBJECT_WUNLOCK(object);
1143
return (0);
1144
}
1145
1146
/*
1147
* SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
1148
* and destroy the source.
1149
*
1150
* Copy any valid swapblks from the source to the destination. In
1151
* cases where both the source and destination have a valid swapblk,
1152
* we keep the destination's.
1153
*
1154
* This routine is allowed to sleep. It may sleep allocating metadata
1155
* indirectly through swp_pager_meta_build().
1156
*
1157
* The source object contains no vm_page_t's (which is just as well)
1158
*
1159
* The source and destination objects must be locked.
1160
* Both object locks may temporarily be released.
1161
*/
1162
void
1163
swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
1164
vm_pindex_t offset, int destroysource)
1165
{
1166
VM_OBJECT_ASSERT_WLOCKED(srcobject);
1167
VM_OBJECT_ASSERT_WLOCKED(dstobject);
1168
1169
/*
1170
* If destroysource is set, we remove the source object from the
1171
* swap_pager internal queue now.
1172
*/
1173
if (destroysource && (srcobject->flags & OBJ_ANON) == 0 &&
1174
srcobject->handle != NULL) {
1175
VM_OBJECT_WUNLOCK(srcobject);
1176
VM_OBJECT_WUNLOCK(dstobject);
1177
sx_xlock(&sw_alloc_sx);
1178
TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
1179
pager_object_list);
1180
sx_xunlock(&sw_alloc_sx);
1181
VM_OBJECT_WLOCK(dstobject);
1182
VM_OBJECT_WLOCK(srcobject);
1183
}
1184
1185
/*
1186
* Transfer source to destination.
1187
*/
1188
swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size);
1189
1190
/*
1191
* Free left over swap blocks in source.
1192
*/
1193
if (destroysource)
1194
swp_pager_meta_free_all(srcobject);
1195
}
1196
1197
/*
1198
* SWP_PAGER_HASPAGE_ITER() - determine if we have good backing store for
1199
* the requested page, accessed with the given
1200
* iterator.
1201
*
1202
* We determine whether good backing store exists for the requested
1203
* page and return TRUE if it does, FALSE if it doesn't.
1204
*
1205
* If TRUE, we also try to determine how much valid, contiguous backing
1206
* store exists before and after the requested page.
1207
*/
1208
static boolean_t
1209
swp_pager_haspage_iter(vm_pindex_t pindex, int *before, int *after,
1210
struct pctrie_iter *blks)
1211
{
1212
daddr_t blk, blk0;
1213
int i;
1214
1215
/*
1216
* do we have good backing store at the requested index ?
1217
*/
1218
blk0 = swp_pager_meta_lookup(blks, pindex);
1219
if (blk0 == SWAPBLK_NONE) {
1220
if (before)
1221
*before = 0;
1222
if (after)
1223
*after = 0;
1224
return (FALSE);
1225
}
1226
1227
/*
1228
* find backwards-looking contiguous good backing store
1229
*/
1230
if (before != NULL) {
1231
for (i = 1; i < SWB_NPAGES; i++) {
1232
if (i > pindex)
1233
break;
1234
blk = swp_pager_meta_lookup(blks, pindex - i);
1235
if (blk != blk0 - i)
1236
break;
1237
}
1238
*before = i - 1;
1239
}
1240
1241
/*
1242
* find forward-looking contiguous good backing store
1243
*/
1244
if (after != NULL) {
1245
for (i = 1; i < SWB_NPAGES; i++) {
1246
blk = swp_pager_meta_lookup(blks, pindex + i);
1247
if (blk != blk0 + i)
1248
break;
1249
}
1250
*after = i - 1;
1251
}
1252
return (TRUE);
1253
}
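/*
 * A standalone sketch, not part of this file, of the backward/forward scan
 * for contiguous backing store in swp_pager_haspage_iter().  A small array
 * stands in for swp_pager_meta_lookup(), -1 plays the role of SWAPBLK_NONE,
 * and EX_NPAGES is an assumed stand-in for SWB_NPAGES.
 */
#if 0	/* illustrative only; build separately as a userland program */
#include <stdio.h>

#define EX_NONE		(-1L)
#define EX_NPAGES	8

static long blocks[] = { EX_NONE, 40, 41, 42, 43, 90, EX_NONE, EX_NONE };

static long
lookup(int pindex)
{
	return (pindex >= 0 && pindex < 8 ? blocks[pindex] : EX_NONE);
}

int
main(void)
{
	int pindex = 3, i, before, after;
	long blk0 = lookup(pindex);

	if (blk0 == EX_NONE)
		return (1);
	for (i = 1; i < EX_NPAGES; i++)
		if (i > pindex || lookup(pindex - i) != blk0 - i)
			break;
	before = i - 1;
	for (i = 1; i < EX_NPAGES; i++)
		if (lookup(pindex + i) != blk0 + i)
			break;
	after = i - 1;
	printf("pindex %d: %d contiguous before, %d after\n",
	    pindex, before, after);
	return (0);
}
#endif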
1254
1255
/*
1256
* SWAP_PAGER_HASPAGE() - determine if we have good backing store for
1257
* the requested page, in the given object.
1258
*
1259
* We determine whether good backing store exists for the requested
1260
* page and return TRUE if it does, FALSE if it doesn't.
1261
*
1262
* If TRUE, we also try to determine how much valid, contiguous backing
1263
* store exists before and after the requested page.
1264
*/
1265
static boolean_t
1266
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
1267
int *after)
1268
{
1269
struct pctrie_iter blks;
1270
1271
swblk_iter_init_only(&blks, object);
1272
return (swp_pager_haspage_iter(pindex, before, after, &blks));
1273
}
1274
1275
static void
1276
swap_pager_unswapped_acct(vm_page_t m)
1277
{
1278
KASSERT((m->object->flags & OBJ_SWAP) != 0,
1279
("Free object not swappable"));
1280
if ((m->a.flags & PGA_SWAP_FREE) != 0)
1281
counter_u64_add(swap_free_completed, 1);
1282
vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE);
1283
1284
/*
1285
* The meta data only exists if the object is OBJT_SWAP
1286
* and even then might not be allocated yet.
1287
*/
1288
}
1289
1290
/*
1291
* SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
1292
*
1293
* This removes any associated swap backing store, whether valid or
1294
* not, from the page.
1295
*
1296
* This routine is typically called when a page is made dirty, at
1297
* which point any associated swap can be freed. MADV_FREE also
1298
* calls us in a special-case situation
1299
*
1300
* NOTE!!! If the page is clean and the swap was valid, the caller
1301
* should make the page dirty before calling this routine. This routine
1302
* does NOT change the m->dirty status of the page. Also: MADV_FREE
1303
* depends on it.
1304
*
1305
* This routine may not sleep.
1306
*
1307
* The object containing the page may be locked.
1308
*/
1309
static void
1310
swap_pager_unswapped(vm_page_t m)
1311
{
1312
struct page_range range;
1313
struct swblk *sb;
1314
vm_object_t obj;
1315
1316
/*
1317
* Handle enqueuing deferred frees first. If we do not have the
1318
* object lock we wait for the page daemon to clear the space.
1319
*/
1320
obj = m->object;
1321
if (!VM_OBJECT_WOWNED(obj)) {
1322
VM_PAGE_OBJECT_BUSY_ASSERT(m);
1323
/*
1324
* The caller is responsible for synchronization but we
1325
* will harmlessly handle races. This is typically provided
1326
* by only calling unswapped() when a page transitions from
1327
* clean to dirty.
1328
*/
1329
if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) ==
1330
PGA_SWAP_SPACE) {
1331
vm_page_aflag_set(m, PGA_SWAP_FREE);
1332
counter_u64_add(swap_free_deferred, 1);
1333
}
1334
return;
1335
}
1336
swap_pager_unswapped_acct(m);
1337
1338
sb = swblk_lookup(m->object, m->pindex);
1339
if (sb == NULL)
1340
return;
1341
range.start = sb->d[m->pindex % SWAP_META_PAGES];
1342
if (range.start == SWAPBLK_NONE)
1343
return;
1344
range.num = 1;
1345
swp_pager_freeswapspace(&range);
1346
sb->d[m->pindex % SWAP_META_PAGES] = SWAPBLK_NONE;
1347
swp_pager_free_empty_swblk(m->object, sb);
1348
}
1349
1350
/*
1351
* swap_pager_getpages_locked() - bring pages in from swap
1352
*
1353
* Attempt to page in the pages in array "ma" of length "count". The
1354
* caller may optionally specify that additional pages preceding and
1355
* succeeding the specified range be paged in. The number of such pages
1356
* is returned in the "a_rbehind" and "a_rahead" parameters, and they will
1357
* be in the inactive queue upon return.
1358
*
1359
* The pages in "ma" must be busied and will remain busied upon return.
1360
*/
1361
static int
1362
swap_pager_getpages_locked(struct pctrie_iter *blks, vm_object_t object,
1363
vm_page_t *ma, int count, int *a_rbehind, int *a_rahead, struct buf *bp)
1364
{
1365
vm_page_t m;
1366
vm_pindex_t pindex;
1367
int i, rahead, rbehind;
1368
1369
VM_OBJECT_ASSERT_WLOCKED(object);
1370
1371
KASSERT((object->flags & OBJ_SWAP) != 0,
1372
("%s: object not swappable", __func__));
1373
for (pindex = 0, i = 0; i < count; i++) {
1374
m = ma[i];
1375
if (m != bogus_page) {
1376
pindex = m->pindex - i;
1377
break;
1378
}
1379
}
1380
MPASS(i != count);
1381
if (!swp_pager_haspage_iter(pindex, &rbehind, &rahead, blks)) {
1382
VM_OBJECT_WUNLOCK(object);
1383
uma_zfree(swrbuf_zone, bp);
1384
return (VM_PAGER_FAIL);
1385
}
1386
1387
KASSERT(count - 1 <= rahead,
1388
("page count %d extends beyond swap block", count));
1389
1390
/*
1391
* Do not transfer any pages other than those that are xbusied
1392
* when running during a split or collapse operation. This
1393
* prevents clustering from re-creating pages which are being
1394
* moved into another object.
1395
*/
1396
if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
1397
rahead = count - 1;
1398
rbehind = 0;
1399
}
1400
/* Clip readbehind/ahead ranges to exclude already resident pages. */
1401
rbehind = a_rbehind != NULL ? imin(*a_rbehind, rbehind) : 0;
1402
rahead = a_rahead != NULL ? imin(*a_rahead, rahead - count + 1) : 0;
1403
/* Allocate pages. */
1404
vm_object_prepare_buf_pages(object, bp->b_pages, count, &rbehind,
1405
&rahead, ma);
1406
bp->b_npages = rbehind + count + rahead;
1407
KASSERT(bp->b_npages <= PBUF_PAGES,
1408
("bp_npages %d (rb %d c %d ra %d) not less than PBUF_PAGES %jd",
1409
bp->b_npages, rbehind, count, rahead, (uintmax_t)PBUF_PAGES));
1410
for (i = 0; i < bp->b_npages; i++) {
1411
m = bp->b_pages[i];
1412
if (m != bogus_page)
1413
m->oflags |= VPO_SWAPINPROG;
1414
}
1415
bp->b_blkno = swp_pager_meta_lookup(blks, pindex - rbehind);
1416
KASSERT(bp->b_blkno != SWAPBLK_NONE,
1417
("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));
1418
1419
vm_object_pip_add(object, bp->b_npages);
1420
VM_OBJECT_WUNLOCK(object);
1421
MPASS((bp->b_flags & B_MAXPHYS) != 0);
1422
1423
/* Report back actual behind/ahead read. */
1424
if (a_rbehind != NULL)
1425
*a_rbehind = rbehind;
1426
if (a_rahead != NULL)
1427
*a_rahead = rahead;
1428
1429
bp->b_flags |= B_PAGING;
1430
bp->b_iocmd = BIO_READ;
1431
bp->b_iodone = swp_pager_async_iodone;
1432
bp->b_rcred = crhold(thread0.td_ucred);
1433
bp->b_wcred = crhold(thread0.td_ucred);
1434
bp->b_bufsize = bp->b_bcount = ptoa(bp->b_npages);
1435
bp->b_pgbefore = rbehind;
1436
bp->b_pgafter = rahead;
1437
1438
VM_CNT_INC(v_swapin);
1439
VM_CNT_ADD(v_swappgsin, bp->b_npages);
1440
1441
/*
1442
* perform the I/O. NOTE!!! bp cannot be considered valid after
1443
* this point because we automatically release it on completion.
1444
* Instead, we look at the one page we are interested in which we
1445
* still hold a lock on even through the I/O completion.
1446
*
1447
* The other pages in our ma[] array are also released on completion,
1448
* so we cannot assume they are valid anymore either.
1449
*
1450
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
1451
*/
1452
BUF_KERNPROC(bp);
1453
swp_pager_strategy(bp);
1454
1455
/*
1456
* Wait for the pages we want to complete. VPO_SWAPINPROG is always
1457
* cleared on completion. If an I/O error occurs, SWAPBLK_NONE
1458
* is set in the metadata for each page in the request.
1459
*/
1460
VM_OBJECT_WLOCK(object);
1461
/* This could be implemented more efficiently with aflags */
1462
for (i = 0; i < count; i++) {
1463
m = ma[i];
1464
if (m != bogus_page)
1465
break;
1466
}
1467
MPASS(i != count);
1468
while ((m->oflags & VPO_SWAPINPROG) != 0) {
1469
m->oflags |= VPO_SWAPSLEEP;
1470
VM_CNT_INC(v_intrans);
1471
if (VM_OBJECT_SLEEP(object, &object->handle, PSWP,
1472
"swread", hz * 20)) {
1473
printf(
1474
"swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
1475
bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
1476
}
1477
}
1478
VM_OBJECT_WUNLOCK(object);
1479
1480
/*
1481
* If we had an unrecoverable read error pages will not be valid.
1482
*/
1483
for (i = 0; i < count; i++) {
1484
if (ma[i] != bogus_page && ma[i]->valid != VM_PAGE_BITS_ALL)
1485
return (VM_PAGER_ERROR);
1486
}
1487
1488
return (VM_PAGER_OK);
1489
1490
/*
1491
* A final note: in a low swap situation, we cannot deallocate swap
1492
* and mark a page dirty here because the caller is likely to mark
1493
* the page clean when we return, causing the page to possibly revert
1494
* to all-zero's later.
1495
*/
1496
}
1497
1498
static int
1499
swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count,
1500
int *rbehind, int *rahead)
1501
{
1502
struct buf *bp;
1503
struct pctrie_iter blks;
1504
1505
bp = uma_zalloc(swrbuf_zone, M_WAITOK);
1506
VM_OBJECT_WLOCK(object);
1507
swblk_iter_init_only(&blks, object);
1508
return (swap_pager_getpages_locked(&blks, object, ma, count, rbehind,
1509
rahead, bp));
1510
}
1511
1512
/*
1513
* swap_pager_getpages_async():
1514
*
1515
* Right now this is emulation of asynchronous operation on top of
1516
* swap_pager_getpages().
1517
*/
1518
static int
1519
swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
1520
int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
1521
{
1522
int r, error;
1523
1524
r = swap_pager_getpages(object, ma, count, rbehind, rahead);
1525
switch (r) {
1526
case VM_PAGER_OK:
1527
error = 0;
1528
break;
1529
case VM_PAGER_ERROR:
1530
error = EIO;
1531
break;
1532
case VM_PAGER_FAIL:
1533
error = EINVAL;
1534
break;
1535
default:
1536
panic("unhandled swap_pager_getpages() error %d", r);
1537
}
1538
(iodone)(arg, ma, count, error);
1539
1540
return (r);
1541
}
1542
1543
/*
1544
* swap_pager_putpages:
1545
*
1546
* Assign swap (if necessary) and initiate I/O on the specified pages.
1547
*
1548
* In a low memory situation we may block in VOP_STRATEGY(), but the new
1549
* vm_page reservation system coupled with properly written VFS devices
1550
* should ensure that no low-memory deadlock occurs. This is an area
1551
* which needs work.
1552
*
1553
* The parent has N vm_object_pip_add() references prior to
1554
* calling us and will remove references for rtvals[] that are
1555
* not set to VM_PAGER_PEND. We need to remove the rest on I/O
1556
* completion.
1557
*
1558
* The parent has soft-busy'd the pages it passes us and will unbusy
1559
* those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
1560
* We need to unbusy the rest on I/O completion.
1561
*/
1562
static void
1563
swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
1564
int flags, int *rtvals)
1565
{
1566
struct pctrie_iter blks;
1567
struct page_range range;
1568
struct buf *bp;
1569
daddr_t addr, blk;
1570
vm_page_t mreq;
1571
int i, j, n;
1572
bool async;
1573
1574
KASSERT(count == 0 || ma[0]->object == object,
1575
("%s: object mismatch %p/%p",
1576
__func__, object, ma[0]->object));
1577
1578
VM_OBJECT_WUNLOCK(object);
1579
async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0;
1580
swp_pager_init_freerange(&range);
1581
1582
/*
1583
* Assign swap blocks and issue I/O. We reallocate swap on the fly.
1584
* The page is left dirty until the pageout operation completes
1585
* successfully.
1586
*/
1587
for (i = 0; i < count; i += n) {
1588
/* Maximum I/O size is limited by maximum swap block size. */
1589
n = min(count - i, nsw_cluster_max);
1590
1591
if (async) {
1592
mtx_lock(&swbuf_mtx);
1593
while (nsw_wcount_async == 0)
1594
msleep(&nsw_wcount_async, &swbuf_mtx, PVM,
1595
"swbufa", 0);
1596
nsw_wcount_async--;
1597
mtx_unlock(&swbuf_mtx);
1598
}
1599
1600
/* Get a block of swap of size up to size n. */
1601
blk = swp_pager_getswapspace(&n);
1602
if (blk == SWAPBLK_NONE) {
1603
mtx_lock(&swbuf_mtx);
1604
if (++nsw_wcount_async == 1)
1605
wakeup(&nsw_wcount_async);
1606
mtx_unlock(&swbuf_mtx);
1607
for (j = 0; j < n; ++j)
1608
rtvals[i + j] = VM_PAGER_FAIL;
1609
continue;
1610
}
1611
VM_OBJECT_WLOCK(object);
1612
swblk_iter_init_only(&blks, object);
1613
for (j = 0; j < n; ++j) {
1614
mreq = ma[i + j];
1615
vm_page_aflag_clear(mreq, PGA_SWAP_FREE);
1616
KASSERT(mreq->object == object,
1617
("%s: object mismatch %p/%p",
1618
__func__, mreq->object, object));
1619
addr = swp_pager_meta_build(&blks, object,
1620
mreq->pindex, blk + j, false);
1621
if (addr != SWAPBLK_NONE)
1622
swp_pager_update_freerange(&range, addr);
1623
MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
1624
mreq->oflags |= VPO_SWAPINPROG;
1625
}
1626
VM_OBJECT_WUNLOCK(object);
1627
1628
bp = uma_zalloc(swwbuf_zone, M_WAITOK);
1629
MPASS((bp->b_flags & B_MAXPHYS) != 0);
1630
if (async)
1631
bp->b_flags |= B_ASYNC;
1632
bp->b_flags |= B_PAGING;
1633
bp->b_iocmd = BIO_WRITE;
1634
1635
bp->b_rcred = crhold(thread0.td_ucred);
1636
bp->b_wcred = crhold(thread0.td_ucred);
1637
bp->b_bcount = PAGE_SIZE * n;
1638
bp->b_bufsize = PAGE_SIZE * n;
1639
bp->b_blkno = blk;
1640
for (j = 0; j < n; j++)
1641
bp->b_pages[j] = ma[i + j];
1642
bp->b_npages = n;
1643
1644
/*
1645
* Must set dirty range for NFS to work.
1646
*/
1647
bp->b_dirtyoff = 0;
1648
bp->b_dirtyend = bp->b_bcount;
1649
1650
VM_CNT_INC(v_swapout);
1651
VM_CNT_ADD(v_swappgsout, bp->b_npages);
1652
1653
/*
1654
* We unconditionally set rtvals[] to VM_PAGER_PEND so that we
1655
* can call the async completion routine at the end of a
1656
* synchronous I/O operation. Otherwise, our caller would
1657
* perform duplicate unbusy and wakeup operations on the page
1658
* and object, respectively.
1659
*/
1660
for (j = 0; j < n; j++)
1661
rtvals[i + j] = VM_PAGER_PEND;
1662
1663
/*
1664
* asynchronous
1665
*
1666
* NOTE: b_blkno is destroyed by the call to swapdev_strategy.
1667
*/
1668
if (async) {
1669
bp->b_iodone = swp_pager_async_iodone;
1670
BUF_KERNPROC(bp);
1671
swp_pager_strategy(bp);
1672
continue;
1673
}
1674
1675
/*
1676
* synchronous
1677
*
1678
* NOTE: b_blkno is destroyed by the call to swapdev_strategy.
1679
*/
1680
bp->b_iodone = bdone;
1681
swp_pager_strategy(bp);
1682
1683
/*
1684
* Wait for the sync I/O to complete.
1685
*/
1686
bwait(bp, PVM, "swwrt");
1687
1688
/*
1689
* Now that we are through with the bp, we can call the
1690
* normal async completion, which frees everything up.
1691
*/
1692
swp_pager_async_iodone(bp);
1693
}
1694
swp_pager_freeswapspace(&range);
1695
VM_OBJECT_WLOCK(object);
1696
}
1697
1698
/*
1699
* swp_pager_async_iodone:
1700
*
1701
* Completion routine for asynchronous reads and writes from/to swap.
1702
* Also called manually by synchronous code to finish up a bp.
1703
*
1704
* This routine may not sleep.
1705
*/
1706
static void
1707
swp_pager_async_iodone(struct buf *bp)
1708
{
1709
int i;
1710
vm_object_t object = NULL;
1711
1712
/*
1713
* Report error - unless we ran out of memory, in which case
1714
* we've already logged it in swapgeom_strategy().
1715
*/
1716
if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) {
1717
printf(
1718
"swap_pager: I/O error - %s failed; blkno %ld,"
1719
"size %ld, error %d\n",
1720
((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
1721
(long)bp->b_blkno,
1722
(long)bp->b_bcount,
1723
bp->b_error
1724
);
1725
}
1726
1727
/*
1728
* remove the mapping for kernel virtual
1729
*/
1730
if (buf_mapped(bp))
1731
pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
1732
else
1733
bp->b_data = bp->b_kvabase;
1734
1735
if (bp->b_npages) {
1736
object = bp->b_pages[0]->object;
1737
VM_OBJECT_WLOCK(object);
1738
}
1739
1740
/*
1741
* cleanup pages. If an error occurs writing to swap, we are in
1742
* very serious trouble. If it happens to be a disk error, though,
1743
* we may be able to recover by reassigning the swap later on. So
1744
* in this case we remove the m->swapblk assignment for the page
1745
* but do not free it in the rlist. The erroneous block(s) are thus
1746
* never reallocated as swap. Redirty the page and continue.
1747
*/
1748
for (i = 0; i < bp->b_npages; ++i) {
1749
vm_page_t m = bp->b_pages[i];
1750
1751
if (m == bogus_page)
1752
continue;
1753
1754
m->oflags &= ~VPO_SWAPINPROG;
1755
if (m->oflags & VPO_SWAPSLEEP) {
1756
m->oflags &= ~VPO_SWAPSLEEP;
1757
wakeup(&object->handle);
1758
}
1759
1760
/* We always have space after I/O, successful or not. */
1761
vm_page_aflag_set(m, PGA_SWAP_SPACE);
1762
1763
if (bp->b_ioflags & BIO_ERROR) {
1764
/*
1765
* If an error occurs I'd love to throw the swapblk
1766
* away without freeing it back to swapspace, so it
1767
* can never be used again. But I can't from an
1768
* interrupt.
1769
*/
1770
if (bp->b_iocmd == BIO_READ) {
1771
/*
1772
* NOTE: for reads, m->dirty will probably
1773
* be overridden by the original caller of
1774
* getpages so don't play cute tricks here.
1775
*/
1776
vm_page_invalid(m);
1777
if (i < bp->b_pgbefore ||
1778
i >= bp->b_npages - bp->b_pgafter)
1779
vm_page_free_invalid(m);
1780
} else {
1781
/*
1782
* If a write error occurs, reactivate page
1783
* so it doesn't clog the inactive list,
1784
* then finish the I/O.
1785
*/
1786
MPASS(m->dirty == VM_PAGE_BITS_ALL);
1787
1788
/* PQ_UNSWAPPABLE? */
1789
vm_page_activate(m);
1790
vm_page_sunbusy(m);
1791
}
1792
} else if (bp->b_iocmd == BIO_READ) {
1793
/*
1794
* NOTE: for reads, m->dirty will probably be
1795
* overridden by the original caller of getpages so
1796
* we cannot set them in order to free the underlying
1797
* swap in a low-swap situation. I don't think we'd
1798
* want to do that anyway, but it was an optimization
1799
* that existed in the old swapper for a time before
1800
* it got ripped out due to precisely this problem.
1801
*/
1802
KASSERT(!pmap_page_is_mapped(m),
1803
("swp_pager_async_iodone: page %p is mapped", m));
1804
KASSERT(m->dirty == 0,
1805
("swp_pager_async_iodone: page %p is dirty", m));
1806
1807
vm_page_valid(m);
1808
if (i < bp->b_pgbefore ||
1809
i >= bp->b_npages - bp->b_pgafter)
1810
vm_page_readahead_finish(m);
1811
} else {
1812
/*
1813
* For write success, clear the dirty
1814
* status, then finish the I/O ( which decrements the
1815
* busy count and possibly wakes waiters up ).
1816
* A page is only written to swap after a period of
1817
* inactivity. Therefore, we do not expect it to be
1818
* reused.
1819
*/
1820
KASSERT(!pmap_page_is_write_mapped(m),
1821
("swp_pager_async_iodone: page %p is not write"
1822
" protected", m));
1823
vm_page_undirty(m);
1824
vm_page_deactivate_noreuse(m);
1825
vm_page_sunbusy(m);
1826
}
1827
}
1828
1829
/*
1830
* adjust pip. NOTE: the original parent may still have its own
1831
* pip refs on the object.
1832
*/
1833
if (object != NULL) {
1834
vm_object_pip_wakeupn(object, bp->b_npages);
1835
VM_OBJECT_WUNLOCK(object);
1836
}
1837
1838
/*
1839
* swapdev_strategy() manually sets b_vp and b_bufobj before calling
1840
* bstrategy(). Set them back to NULL now that we're done with it, or we'll
1841
* trigger a KASSERT in relpbuf().
1842
*/
1843
if (bp->b_vp) {
1844
bp->b_vp = NULL;
1845
bp->b_bufobj = NULL;
1846
}
1847
/*
1848
* release the physical I/O buffer
1849
*/
1850
if (bp->b_flags & B_ASYNC) {
1851
mtx_lock(&swbuf_mtx);
1852
if (++nsw_wcount_async == 1)
1853
wakeup(&nsw_wcount_async);
1854
mtx_unlock(&swbuf_mtx);
1855
}
1856
uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp);
1857
}
1858
1859
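/*
* Return the number of configured swap devices.
*/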
int
1860
swap_pager_nswapdev(void)
1861
{
1862
1863
return (nswapdev);
1864
}
1865
1866
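/*
* Mark the page dirty, release its swap block into the pending free
* range, and put the page in the laundry so that its contents are
* written out again later.
*/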
static void
1867
swp_pager_force_dirty(struct page_range *range, vm_page_t m, daddr_t *blk)
1868
{
1869
vm_page_dirty(m);
1870
swap_pager_unswapped_acct(m);
1871
swp_pager_update_freerange(range, *blk);
1872
*blk = SWAPBLK_NONE;
1873
vm_page_launder(m);
1874
}
1875
1876
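/*
* Return the number of pages of the object that are backed by
* allocated swap blocks.
*/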
u_long
1877
swap_pager_swapped_pages(vm_object_t object)
1878
{
1879
struct pctrie_iter blks;
1880
struct swblk *sb;
1881
u_long res;
1882
int i;
1883
1884
VM_OBJECT_ASSERT_LOCKED(object);
1885
1886
if (swblk_is_empty(object))
1887
return (0);
1888
1889
res = 0;
1890
for (sb = swblk_iter_init(&blks, object, 0); sb != NULL;
1891
sb = swblk_iter_next(&blks)) {
1892
for (i = 0; i < SWAP_META_PAGES; i++) {
1893
if (sb->d[i] != SWAPBLK_NONE)
1894
res++;
1895
}
1896
}
1897
return (res);
1898
}
1899
1900
/*
1901
* swap_pager_swapoff_object:
1902
*
1903
* Page in all of the pages that have been paged out for an object
1904
* to a swap device.
1905
*/
1906
static void
1907
swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object,
1908
struct buf **bp)
1909
{
1910
struct pctrie_iter blks, pages;
1911
struct page_range range;
1912
struct swblk *sb;
1913
vm_page_t m;
1914
int i, rahead, rv;
1915
bool sb_empty;
1916
1917
VM_OBJECT_ASSERT_WLOCKED(object);
1918
KASSERT((object->flags & OBJ_SWAP) != 0,
1919
("%s: Object not swappable", __func__));
1920
KASSERT((object->flags & OBJ_DEAD) == 0,
1921
("%s: Object already dead", __func__));
1922
KASSERT((sp->sw_flags & SW_CLOSING) != 0,
1923
("%s: Device not blocking further allocations", __func__));
1924
1925
vm_page_iter_init(&pages, object);
1926
swp_pager_init_freerange(&range);
1927
sb = swblk_iter_init(&blks, object, 0);
1928
while (sb != NULL) {
1929
sb_empty = true;
1930
for (i = 0; i < SWAP_META_PAGES; i++) {
1931
/* Skip an invalid block. */
1932
if (sb->d[i] == SWAPBLK_NONE)
1933
continue;
1934
/* Skip a block not of this device. */
1935
if (!swp_pager_isondev(sb->d[i], sp)) {
1936
sb_empty = false;
1937
continue;
1938
}
1939
1940
/*
1941
* Look for a page corresponding to this block. If the
1942
* found page has pending operations, sleep and restart
1943
* the scan.
1944
*/
1945
m = vm_radix_iter_lookup(&pages, blks.index + i);
1946
if (m != NULL && (m->oflags & VPO_SWAPINPROG) != 0) {
1947
m->oflags |= VPO_SWAPSLEEP;
1948
VM_OBJECT_SLEEP(object, &object->handle, PSWP,
1949
"swpoff", 0);
1950
break;
1951
}
1952
1953
/*
1954
* If the found page is valid, mark it dirty and free
1955
* the swap block.
1956
*/
1957
if (m != NULL && vm_page_all_valid(m)) {
1958
swp_pager_force_dirty(&range, m, &sb->d[i]);
1959
continue;
1960
}
1961
/* Is there a page we can acquire or allocate? */
1962
if (m != NULL) {
1963
if (!vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL))
1964
break;
1965
} else {
1966
m = vm_page_alloc_iter(object, blks.index + i,
1967
VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL,
1968
&pages);
1969
if (m == NULL)
1970
break;
1971
}
1972
1973
/* Get the page from swap, and restart the scan. */
1974
vm_object_pip_add(object, 1);
1975
rahead = SWAP_META_PAGES;
1976
rv = swap_pager_getpages_locked(&blks, object, &m, 1,
1977
NULL, &rahead, *bp);
1978
if (rv != VM_PAGER_OK)
1979
panic("%s: read from swap failed: %d",
1980
__func__, rv);
1981
*bp = uma_zalloc(swrbuf_zone, M_WAITOK);
1982
VM_OBJECT_WLOCK(object);
1983
vm_object_pip_wakeupn(object, 1);
1984
KASSERT(vm_page_all_valid(m),
1985
("%s: Page %p not all valid", __func__, m));
1986
vm_page_deactivate_noreuse(m);
1987
vm_page_xunbusy(m);
1988
break;
1989
}
1990
if (i < SWAP_META_PAGES) {
1991
/*
1992
* The object lock has been released and regained.
1993
* Perhaps the object is now dead.
1994
*/
1995
if ((object->flags & OBJ_DEAD) != 0) {
1996
/*
1997
* Make sure that pending writes finish before
1998
* returning.
1999
*/
2000
vm_object_pip_wait(object, "swpoff");
2001
swp_pager_meta_free_all(object);
2002
break;
2003
}
2004
2005
/*
2006
* The swapblk could have been freed, so reset the pages
2007
* iterator and search again for the first swblk at or
2008
* after blks.index.
2009
*/
2010
pctrie_iter_reset(&pages);
2011
sb = swblk_iter_init(&blks, object, blks.index);
2012
continue;
2013
}
2014
if (sb_empty) {
2015
swblk_iter_remove(&blks);
2016
uma_zfree(swblk_zone, sb);
2017
}
2018
2019
/*
2020
* It is safe to advance to the next block. No allocations
2021
* before blks.index have happened, even with the lock released,
2022
* because allocations on this device are blocked.
2023
*/
2024
sb = swblk_iter_next(&blks);
2025
}
2026
swp_pager_freeswapspace(&range);
2027
}
2028
2029
/*
2030
* swap_pager_swapoff:
2031
*
2032
* Page in all of the pages that have been paged out to the
2033
* given device. The corresponding blocks in the bitmap must be
2034
* marked as allocated and the device must be flagged SW_CLOSING.
2035
* There may be no processes swapped out to the device.
2036
*
2037
* This routine may block.
2038
*/
2039
static void
2040
swap_pager_swapoff(struct swdevt *sp)
2041
{
2042
vm_object_t object;
2043
struct buf *bp;
2044
int retries;
2045
2046
sx_assert(&swdev_syscall_lock, SA_XLOCKED);
2047
2048
retries = 0;
2049
full_rescan:
2050
bp = uma_zalloc(swrbuf_zone, M_WAITOK);
2051
mtx_lock(&vm_object_list_mtx);
2052
TAILQ_FOREACH(object, &vm_object_list, object_list) {
2053
if ((object->flags & OBJ_SWAP) == 0)
2054
continue;
2055
mtx_unlock(&vm_object_list_mtx);
2056
/* Depends on type-stability. */
2057
VM_OBJECT_WLOCK(object);
2058
2059
/*
2060
* Dead objects are eventually terminated on their own.
2061
*/
2062
if ((object->flags & OBJ_DEAD) != 0)
2063
goto next_obj;
2064
2065
/*
2066
* Sync with fences placed after pctrie
2067
* initialization. We must not access pctrie below
2068
* until we have checked that our object is swap and not
2069
* dead.
2070
*/
2071
atomic_thread_fence_acq();
2072
if ((object->flags & OBJ_SWAP) == 0)
2073
goto next_obj;
2074
2075
swap_pager_swapoff_object(sp, object, &bp);
2076
next_obj:
2077
VM_OBJECT_WUNLOCK(object);
2078
mtx_lock(&vm_object_list_mtx);
2079
}
2080
mtx_unlock(&vm_object_list_mtx);
2081
uma_zfree(swrbuf_zone, bp);
2082
2083
if (sp->sw_used) {
2084
/*
2085
* Objects may be locked or paging to the device being
2086
* removed, so we will miss their pages and need to
2087
* make another pass. We have marked this device as
2088
* SW_CLOSING, so the activity should finish soon.
2089
*/
2090
retries++;
2091
if (retries > 100) {
2092
panic("swapoff: failed to locate %d swap blocks",
2093
sp->sw_used);
2094
}
2095
pause("swpoff", hz / 20);
2096
goto full_rescan;
2097
}
2098
EVENTHANDLER_INVOKE(swapoff, sp);
2099
}
2100
2101
/************************************************************************
2102
* SWAP META DATA *
2103
************************************************************************
2104
*
2105
* These routines manipulate the swap metadata stored in the
2106
* OBJT_SWAP object.
2107
*
2108
* Swap metadata is stored in a per-object radix tree (pctrie) of
2109
* struct swblk entries rooted at un_pager.swp.swp_blks; each swblk
2110
* records the swap blocks for SWAP_META_PAGES consecutive pages.
2111
*/
2112
2113
/*
2114
* SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free?
2115
*/
2116
static bool
2117
swp_pager_swblk_empty(struct swblk *sb, int start, int limit)
2118
{
2119
int i;
2120
2121
MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES);
2122
for (i = start; i < limit; i++) {
2123
if (sb->d[i] != SWAPBLK_NONE)
2124
return (false);
2125
}
2126
return (true);
2127
}
2128
2129
/*
2130
* SWP_PAGER_FREE_EMPTY_SWBLK() - free the swblk if it maps no swap blocks
2131
*
2132
* Nothing is done if the block is still in use.
2133
*/
2134
static void
2135
swp_pager_free_empty_swblk(vm_object_t object, struct swblk *sb)
2136
{
2137
2138
if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
2139
swblk_lookup_remove(object, sb);
2140
uma_zfree(swblk_zone, sb);
2141
}
2142
}
2143
2144
/*
2145
* SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
2146
*
2147
* Try to add the specified swapblk to the object's swap metadata. If
2148
* nowait_noreplace is set, add the specified swapblk only if there is no
2149
* previously assigned swapblk at pindex. If the swapblk is invalid, and
2150
* replaces a valid swapblk, empty swap metadata is freed. If memory
2151
* allocation fails, and nowait_noreplace is set, return the specified
2152
* swapblk immediately to indicate failure; otherwise, wait and retry until
2153
* memory allocation succeeds. Return the previously assigned swapblk, if
2154
* any.
2155
*/
2156
static daddr_t
2157
swp_pager_meta_build(struct pctrie_iter *blks, vm_object_t object,
2158
vm_pindex_t pindex, daddr_t swapblk, bool nowait_noreplace)
2159
{
2160
static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
2161
struct swblk *sb, *sb1;
2162
vm_pindex_t modpi;
2163
daddr_t prev_swapblk;
2164
int error, i;
2165
2166
VM_OBJECT_ASSERT_WLOCKED(object);
2167
2168
sb = swblk_iter_lookup(blks, pindex);
2169
if (sb == NULL) {
2170
if (swapblk == SWAPBLK_NONE)
2171
return (SWAPBLK_NONE);
2172
for (;;) {
2173
sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
2174
pageproc ? M_USE_RESERVE : 0));
2175
if (sb != NULL) {
2176
sb->p = rounddown(pindex, SWAP_META_PAGES);
2177
for (i = 0; i < SWAP_META_PAGES; i++)
2178
sb->d[i] = SWAPBLK_NONE;
2179
if (atomic_cmpset_int(&swblk_zone_exhausted,
2180
1, 0))
2181
printf("swblk zone ok\n");
2182
break;
2183
}
2184
if (nowait_noreplace)
2185
return (swapblk);
2186
VM_OBJECT_WUNLOCK(object);
2187
if (uma_zone_exhausted(swblk_zone)) {
2188
if (atomic_cmpset_int(&swblk_zone_exhausted,
2189
0, 1))
2190
printf("swap blk zone exhausted, "
2191
"increase kern.maxswzone\n");
2192
vm_pageout_oom(VM_OOM_SWAPZ);
2193
pause("swzonxb", 10);
2194
} else
2195
uma_zwait(swblk_zone);
2196
VM_OBJECT_WLOCK(object);
2197
sb = swblk_iter_reinit(blks, object, pindex);
2198
if (sb != NULL)
2199
/*
2200
* Somebody swapped out a nearby page,
2201
* allocating a swblk covering this pindex,
2202
* while we dropped the object lock.
2203
*/
2204
goto allocated;
2205
}
2206
for (;;) {
2207
error = swblk_iter_insert(blks, sb);
2208
if (error == 0) {
2209
if (atomic_cmpset_int(&swpctrie_zone_exhausted,
2210
1, 0))
2211
printf("swpctrie zone ok\n");
2212
break;
2213
}
2214
if (nowait_noreplace) {
2215
uma_zfree(swblk_zone, sb);
2216
return (swapblk);
2217
}
2218
VM_OBJECT_WUNLOCK(object);
2219
if (uma_zone_exhausted(swpctrie_zone)) {
2220
if (atomic_cmpset_int(&swpctrie_zone_exhausted,
2221
0, 1))
2222
printf("swap pctrie zone exhausted, "
2223
"increase kern.maxswzone\n");
2224
vm_pageout_oom(VM_OOM_SWAPZ);
2225
pause("swzonxp", 10);
2226
} else
2227
uma_zwait(swpctrie_zone);
2228
VM_OBJECT_WLOCK(object);
2229
sb1 = swblk_iter_reinit(blks, object, pindex);
2230
if (sb1 != NULL) {
2231
uma_zfree(swblk_zone, sb);
2232
sb = sb1;
2233
goto allocated;
2234
}
2235
}
2236
}
2237
allocated:
2238
MPASS(sb->p == rounddown(pindex, SWAP_META_PAGES));
2239
2240
modpi = pindex % SWAP_META_PAGES;
2241
/* Return prior contents of metadata. */
2242
prev_swapblk = sb->d[modpi];
2243
if (!nowait_noreplace || prev_swapblk == SWAPBLK_NONE) {
2244
/* Enter block into metadata. */
2245
sb->d[modpi] = swapblk;
2246
2247
/*
2248
* Free the swblk if we end up with the empty page run.
2249
*/
2250
if (swapblk == SWAPBLK_NONE &&
2251
swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
2252
swblk_iter_remove(blks);
2253
uma_zfree(swblk_zone, sb);
2254
}
2255
}
2256
return (prev_swapblk);
2257
}
2258
2259
/*
2260
* SWP_PAGER_META_TRANSFER() - transfer a range of blocks in the srcobject's
2261
* swap metadata into dstobject.
2262
*
2263
* Blocks in src that correspond to holes in dst are transferred. Blocks
2264
* in src that correspond to blocks in dst are freed.
2265
*/
2266
static void
2267
swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
2268
vm_pindex_t pindex, vm_pindex_t count)
2269
{
2270
struct pctrie_iter dstblks, srcblks;
2271
struct page_range range;
2272
struct swblk *sb;
2273
daddr_t blk, d[SWAP_META_PAGES];
2274
vm_pindex_t last;
2275
int d_mask, i, limit, start;
2276
_Static_assert(8 * sizeof(d_mask) >= SWAP_META_PAGES,
2277
"d_mask not big enough");
2278
2279
VM_OBJECT_ASSERT_WLOCKED(srcobject);
2280
VM_OBJECT_ASSERT_WLOCKED(dstobject);
2281
2282
if (count == 0 || swblk_is_empty(srcobject))
2283
return;
2284
2285
swp_pager_init_freerange(&range);
2286
d_mask = 0;
2287
last = pindex + count;
2288
swblk_iter_init_only(&dstblks, dstobject);
2289
for (sb = swblk_iter_limit_init(&srcblks, srcobject, pindex, last),
2290
start = swblk_start(sb, pindex);
2291
sb != NULL; sb = swblk_iter_next(&srcblks), start = 0) {
2292
limit = MIN(last - srcblks.index, SWAP_META_PAGES);
2293
for (i = start; i < limit; i++) {
2294
if (sb->d[i] == SWAPBLK_NONE)
2295
continue;
2296
blk = swp_pager_meta_build(&dstblks, dstobject,
2297
srcblks.index + i - pindex, sb->d[i], true);
2298
if (blk == sb->d[i]) {
2299
/*
2300
* Failed memory allocation stopped transfer;
2301
* save this block for transfer with lock
2302
* released.
2303
*/
2304
d[i] = blk;
2305
d_mask |= 1 << i;
2306
} else if (blk != SWAPBLK_NONE) {
2307
/* Dst has a block at pindex, so free block. */
2308
swp_pager_update_freerange(&range, sb->d[i]);
2309
}
2310
sb->d[i] = SWAPBLK_NONE;
2311
}
2312
if (swp_pager_swblk_empty(sb, 0, start) &&
2313
swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
2314
swblk_iter_remove(&srcblks);
2315
uma_zfree(swblk_zone, sb);
2316
}
2317
if (d_mask != 0) {
2318
/* Finish block transfer, with the lock released. */
2319
VM_OBJECT_WUNLOCK(srcobject);
2320
do {
2321
i = ffs(d_mask) - 1;
2322
swp_pager_meta_build(&dstblks, dstobject,
2323
srcblks.index + i - pindex, d[i], false);
2324
d_mask &= ~(1 << i);
2325
} while (d_mask != 0);
2326
VM_OBJECT_WLOCK(srcobject);
2327
2328
/*
2329
* While the lock was not held, the iterator path could
2330
* have become stale, so discard it.
2331
*/
2332
pctrie_iter_reset(&srcblks);
2333
}
2334
}
2335
swp_pager_freeswapspace(&range);
2336
}
2337
2338
/*
2339
* SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
2340
*
2341
* Return freed swap blocks to the swap bitmap, and free emptied swblk
2342
* metadata. With 'freed' set, provide a count of freed blocks that were
2343
* not associated with valid resident pages.
2344
*/
2345
static void
2346
swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count,
2347
vm_size_t *freed)
2348
{
2349
struct pctrie_iter blks, pages;
2350
struct page_range range;
2351
struct swblk *sb;
2352
vm_page_t m;
2353
vm_pindex_t last;
2354
vm_size_t fc;
2355
int i, limit, start;
2356
2357
VM_OBJECT_ASSERT_WLOCKED(object);
2358
2359
fc = 0;
2360
if (count == 0 || swblk_is_empty(object))
2361
goto out;
2362
2363
swp_pager_init_freerange(&range);
2364
vm_page_iter_init(&pages, object);
2365
last = pindex + count;
2366
for (sb = swblk_iter_limit_init(&blks, object, pindex, last),
2367
start = swblk_start(sb, pindex);
2368
sb != NULL; sb = swblk_iter_next(&blks), start = 0) {
2369
limit = MIN(last - blks.index, SWAP_META_PAGES);
2370
for (i = start; i < limit; i++) {
2371
if (sb->d[i] == SWAPBLK_NONE)
2372
continue;
2373
swp_pager_update_freerange(&range, sb->d[i]);
2374
if (freed != NULL) {
2375
m = vm_radix_iter_lookup(&pages, blks.index + i);
2376
if (m == NULL || vm_page_none_valid(m))
2377
fc++;
2378
}
2379
sb->d[i] = SWAPBLK_NONE;
2380
}
2381
if (swp_pager_swblk_empty(sb, 0, start) &&
2382
swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
2383
swblk_iter_remove(&blks);
2384
uma_zfree(swblk_zone, sb);
2385
}
2386
}
2387
swp_pager_freeswapspace(&range);
2388
out:
2389
if (freed != NULL)
2390
*freed = fc;
2391
}
2392
2393
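/*
* Reclaim callback for the swap metadata pctrie: queue every allocated
* block in the swblk for release and free the swblk itself.
*/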
static void
2394
swp_pager_meta_free_block(struct swblk *sb, void *rangev)
2395
{
2396
struct page_range *range = rangev;
2397
2398
for (int i = 0; i < SWAP_META_PAGES; i++) {
2399
if (sb->d[i] != SWAPBLK_NONE)
2400
swp_pager_update_freerange(range, sb->d[i]);
2401
}
2402
uma_zfree(swblk_zone, sb);
2403
}
2404
2405
/*
2406
* SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
2407
*
2408
* This routine locates and destroys all swap metadata associated with
2409
* an object.
2410
*/
2411
static void
2412
swp_pager_meta_free_all(vm_object_t object)
2413
{
2414
struct page_range range;
2415
2416
VM_OBJECT_ASSERT_WLOCKED(object);
2417
2418
swp_pager_init_freerange(&range);
2419
SWAP_PCTRIE_RECLAIM_CALLBACK(&object->un_pager.swp.swp_blks,
2420
swp_pager_meta_free_block, &range);
2421
swp_pager_freeswapspace(&range);
2422
}
2423
2424
/*
2425
* SWP_PAGER_META_LOOKUP() - look up a swapblk assignment.
2426
*
2427
* This routine looks up the swapblk assigned to the given page
2428
* index in the object's swap metadata and returns it, or
2429
* SWAPBLK_NONE if no block is assigned.
2430
*
2431
* When acting on a busy resident page and paging is in progress, we
2432
* have to wait until paging is complete but otherwise can act on the
2433
* busy page.
2434
*/
2435
static daddr_t
2436
swp_pager_meta_lookup(struct pctrie_iter *blks, vm_pindex_t pindex)
2437
{
2438
struct swblk *sb;
2439
2440
sb = swblk_iter_lookup(blks, pindex);
2441
if (sb == NULL)
2442
return (SWAPBLK_NONE);
2443
return (sb->d[pindex % SWAP_META_PAGES]);
2444
}
2445
2446
/*
2447
* Returns the least page index which is greater than or equal to the parameter
2448
* pindex and for which there is a swap block allocated. Returns OBJ_MAX_SIZE
2449
* if there are no allocated swap blocks for the object after the requested pindex.
2450
*/
2451
static vm_pindex_t
2452
swap_pager_iter_find_least(struct pctrie_iter *blks, vm_pindex_t pindex)
2453
{
2454
struct swblk *sb;
2455
int i;
2456
2457
if ((sb = swblk_iter_lookup_ge(blks, pindex)) == NULL)
2458
return (OBJ_MAX_SIZE);
2459
if (blks->index < pindex) {
2460
for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) {
2461
if (sb->d[i] != SWAPBLK_NONE)
2462
return (blks->index + i);
2463
}
2464
if ((sb = swblk_iter_next(blks)) == NULL)
2465
return (OBJ_MAX_SIZE);
2466
}
2467
for (i = 0; i < SWAP_META_PAGES; i++) {
2468
if (sb->d[i] != SWAPBLK_NONE)
2469
return (blks->index + i);
2470
}
2471
2472
/*
2473
* We get here if a swblk is present in the trie but it
2474
* doesn't map any blocks.
2475
*/
2476
MPASS(0);
2477
return (OBJ_MAX_SIZE);
2478
}
2479
2480
/*
2481
* Find the first index >= pindex that has either a valid page or a swap
2482
* block.
2483
*/
2484
vm_pindex_t
2485
swap_pager_seek_data(vm_object_t object, vm_pindex_t pindex)
2486
{
2487
struct pctrie_iter blks, pages;
2488
vm_page_t m;
2489
vm_pindex_t swap_index;
2490
2491
VM_OBJECT_ASSERT_LOCKED(object);
2492
KASSERT((object->flags & OBJ_SWAP) != 0, ("non-swap obj %p", object));
2493
vm_page_iter_init(&pages, object);
2494
m = vm_radix_iter_lookup_ge(&pages, pindex);
2495
if (m != NULL && pages.index == pindex && vm_page_any_valid(m))
2496
return (pages.index);
2497
swblk_iter_init_only(&blks, object);
2498
swap_index = swap_pager_iter_find_least(&blks, pindex);
2499
if (swap_index == pindex)
2500
return (swap_index);
2501
2502
/*
2503
* Find the first resident page after m, before swap_index.
2504
*/
2505
while (m != NULL && pages.index < swap_index) {
2506
if (vm_page_any_valid(m))
2507
return (pages.index);
2508
m = vm_radix_iter_step(&pages);
2509
}
2510
return (swap_index);
2511
}
2512
2513
/*
2514
* Find the first index >= pindex that has neither a valid page nor a swap
2515
* block.
2516
*/
2517
vm_pindex_t
2518
swap_pager_seek_hole(vm_object_t object, vm_pindex_t pindex)
2519
{
2520
struct pctrie_iter blks, pages;
2521
struct swblk *sb;
2522
vm_page_t m;
2523
2524
VM_OBJECT_ASSERT_RLOCKED(object);
2525
vm_page_iter_init(&pages, object);
2526
swblk_iter_init_only(&blks, object);
2527
while (((m = vm_radix_iter_lookup(&pages, pindex)) != NULL &&
2528
vm_page_any_valid(m)) ||
2529
((sb = swblk_iter_lookup(&blks, pindex)) != NULL &&
2530
sb->d[pindex % SWAP_META_PAGES] != SWAPBLK_NONE))
2531
pindex++;
2532
return (pindex);
2533
}
2534
2535
/*
2536
* Is every page in the backing object or swap shadowed in the parent, and
2537
* unbusy and valid in swap?
2538
*/
2539
bool
2540
swap_pager_scan_all_shadowed(vm_object_t object)
2541
{
2542
struct pctrie_iter backing_blks, backing_pages, blks, pages;
2543
vm_object_t backing_object;
2544
vm_page_t p, pp;
2545
vm_pindex_t backing_offset_index, new_pindex, pi, pi_ubound, ps, pv;
2546
2547
VM_OBJECT_ASSERT_WLOCKED(object);
2548
VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
2549
2550
backing_object = object->backing_object;
2551
2552
if ((backing_object->flags & OBJ_ANON) == 0)
2553
return (false);
2554
2555
KASSERT((object->flags & OBJ_ANON) != 0,
2556
("Shadow object is not anonymous"));
2557
backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
2558
pi_ubound = MIN(backing_object->size,
2559
backing_offset_index + object->size);
2560
vm_page_iter_init(&pages, object);
2561
vm_page_iter_init(&backing_pages, backing_object);
2562
swblk_iter_init_only(&blks, object);
2563
swblk_iter_init_only(&backing_blks, backing_object);
2564
2565
/*
2566
* Only check pages inside the parent object's range and inside the
2567
* parent object's mapping of the backing object.
2568
*/
2569
pv = ps = pi = backing_offset_index - 1;
2570
for (;;) {
2571
if (pi == pv) {
2572
p = vm_radix_iter_lookup_ge(&backing_pages, pv + 1);
2573
pv = p != NULL ? p->pindex : backing_object->size;
2574
}
2575
if (pi == ps)
2576
ps = swap_pager_iter_find_least(&backing_blks, ps + 1);
2577
pi = MIN(pv, ps);
2578
if (pi >= pi_ubound)
2579
break;
2580
2581
if (pi == pv) {
2582
/*
2583
* If the backing object page is busy, a grandparent or
2584
* older page may still be undergoing CoW. It is not
2585
* safe to collapse the backing object until it is
2586
* quiesced.
2587
*/
2588
if (vm_page_tryxbusy(p) == 0)
2589
return (false);
2590
2591
/*
2592
* We raced with the fault handler that left newly
2593
* allocated invalid page on the object queue and
2594
* retried.
2595
*/
2596
if (!vm_page_all_valid(p))
2597
break;
2598
2599
/*
2600
* Busy of p disallows fault handler to validate parent
2601
* page (pp, below).
2602
*/
2603
}
2604
2605
/*
2606
* See if the parent has the page or if the parent's object
2607
* pager has the page. If the parent has the page but the page
2608
* is not valid, the parent's object pager must have the page.
2609
*
2610
* If this fails, the parent does not completely shadow the
2611
* object and we might as well give up now.
2612
*/
2613
new_pindex = pi - backing_offset_index;
2614
pp = vm_radix_iter_lookup(&pages, new_pindex);
2615
2616
/*
2617
* The valid check here is stable due to object lock being
2618
* required to clear valid and initiate paging.
2619
*/
2620
if ((pp == NULL || vm_page_none_valid(pp)) &&
2621
!swp_pager_haspage_iter(new_pindex, NULL, NULL, &blks))
2622
break;
2623
if (pi == pv)
2624
vm_page_xunbusy(p);
2625
}
2626
if (pi < pi_ubound) {
2627
if (pi == pv)
2628
vm_page_xunbusy(p);
2629
return (false);
2630
}
2631
return (true);
2632
}
2633
2634
/*
2635
* System call swapon(name) enables swapping on device name,
2636
* which must be a disk device or a network-filesystem regular file. Return EBUSY
2637
* if already swapping on this device.
2638
*/
2639
#ifndef _SYS_SYSPROTO_H_
2640
struct swapon_args {
2641
char *name;
2642
};
2643
#endif
2644
2645
int
2646
sys_swapon(struct thread *td, struct swapon_args *uap)
2647
{
2648
struct vattr attr;
2649
struct vnode *vp;
2650
struct nameidata nd;
2651
int error;
2652
2653
error = priv_check(td, PRIV_SWAPON);
2654
if (error)
2655
return (error);
2656
2657
sx_xlock(&swdev_syscall_lock);
2658
2659
/*
2660
* Swap metadata may not fit in the KVM if we have physical
2661
* memory of >1GB.
2662
*/
2663
if (swblk_zone == NULL) {
2664
error = ENOMEM;
2665
goto done;
2666
}
2667
2668
NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
2669
UIO_USERSPACE, uap->name);
2670
error = namei(&nd);
2671
if (error)
2672
goto done;
2673
2674
NDFREE_PNBUF(&nd);
2675
vp = nd.ni_vp;
2676
2677
if (vn_isdisk_error(vp, &error)) {
2678
error = swapongeom(vp);
2679
} else if (vp->v_type == VREG &&
2680
(vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
2681
(error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
2682
/*
2683
* Allow direct swapping to NFS regular files in the same
2684
* way that nfs_mountroot() sets up diskless swapping.
2685
*/
2686
error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
2687
}
2688
2689
if (error != 0)
2690
vput(vp);
2691
else
2692
VOP_UNLOCK(vp);
2693
done:
2694
sx_xunlock(&swdev_syscall_lock);
2695
return (error);
2696
}
2697
2698
/*
2699
* Check that the total amount of swap currently configured does not
2700
* exceed half the theoretical maximum. If it does, print a warning
2701
* message.
2702
*/
2703
static void
2704
swapon_check_swzone(void)
2705
{
2706
2707
/* recommend using no more than half that amount */
2708
if (swap_total > swap_maxpages / 2) {
2709
printf("warning: total configured swap (%lu pages) "
2710
"exceeds maximum recommended amount (%lu pages).\n",
2711
swap_total, swap_maxpages / 2);
2712
printf("warning: increase kern.maxswzone "
2713
"or reduce amount of swap.\n");
2714
}
2715
}
2716
2717
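/*
* Common back end for swapon: size the device in pages, allocate and
* initialize its swdevt and block allocation list, place it after the
* existing devices in the unified swap block space, and announce it.
*/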
static int
2718
swaponsomething(struct vnode *vp, void *id, u_long nblks,
2719
sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
2720
{
2721
struct swdevt *sp, *tsp;
2722
daddr_t dvbase;
2723
2724
/*
2725
* nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
2726
* First chop nblks off to page-align it, then convert.
2727
*
2728
* sw->sw_nblks is in page-sized chunks now too.
2729
*/
2730
nblks &= ~(ctodb(1) - 1);
2731
nblks = dbtoc(nblks);
2732
if (nblks == 0)
2733
return (EXTERROR(EINVAL, "swap device too small"));
2734
2735
sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
2736
sp->sw_blist = blist_create(nblks, M_WAITOK);
2737
sp->sw_vp = vp;
2738
sp->sw_id = id;
2739
sp->sw_dev = dev;
2740
sp->sw_nblks = nblks;
2741
sp->sw_used = 0;
2742
sp->sw_strategy = strategy;
2743
sp->sw_close = close;
2744
sp->sw_flags = flags;
2745
2746
/*
2747
* Do not free the first blocks in order to avoid overwriting
2748
* any BSD label at the front of the partition.
2749
*/
2750
blist_free(sp->sw_blist, howmany(BBSIZE, PAGE_SIZE),
2751
nblks - howmany(BBSIZE, PAGE_SIZE));
2752
2753
dvbase = 0;
2754
mtx_lock(&sw_dev_mtx);
2755
TAILQ_FOREACH(tsp, &swtailq, sw_list) {
2756
if (tsp->sw_end >= dvbase) {
2757
/*
2758
* We put one uncovered page between the devices
2759
* in order to definitively prevent any cross-device
2760
* I/O requests
2761
*/
2762
dvbase = tsp->sw_end + 1;
2763
}
2764
}
2765
sp->sw_first = dvbase;
2766
sp->sw_end = dvbase + nblks;
2767
TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
2768
nswapdev++;
2769
swap_pager_avail += nblks - howmany(BBSIZE, PAGE_SIZE);
2770
swap_total += nblks;
2771
swapon_check_swzone();
2772
swp_sizecheck();
2773
mtx_unlock(&sw_dev_mtx);
2774
EVENTHANDLER_INVOKE(swapon, sp);
2775
2776
return (0);
2777
}
2778
2779
/*
2780
* SYSCALL: swapoff(devname)
2781
*
2782
* Disable swapping on the given device.
2783
*
2784
* XXX: Badly designed system call: it should use a device index
2785
* rather than filename as specification. We keep sw_vp around
2786
* only to make this work.
2787
*/
2788
static int
2789
kern_swapoff(struct thread *td, const char *name, enum uio_seg name_seg,
2790
u_int flags)
2791
{
2792
struct vnode *vp;
2793
struct nameidata nd;
2794
struct swdevt *sp;
2795
int error;
2796
2797
error = priv_check(td, PRIV_SWAPOFF);
2798
if (error != 0)
2799
return (error);
2800
if ((flags & ~(SWAPOFF_FORCE)) != 0)
2801
return (EINVAL);
2802
2803
sx_xlock(&swdev_syscall_lock);
2804
2805
NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, name_seg, name);
2806
error = namei(&nd);
2807
if (error)
2808
goto done;
2809
NDFREE_PNBUF(&nd);
2810
vp = nd.ni_vp;
2811
2812
mtx_lock(&sw_dev_mtx);
2813
TAILQ_FOREACH(sp, &swtailq, sw_list) {
2814
if (sp->sw_vp == vp)
2815
break;
2816
}
2817
mtx_unlock(&sw_dev_mtx);
2818
if (sp == NULL) {
2819
error = EINVAL;
2820
goto done;
2821
}
2822
error = swapoff_one(sp, td->td_ucred, flags);
2823
done:
2824
sx_xunlock(&swdev_syscall_lock);
2825
return (error);
2826
}
2827
2828
2829
#ifdef COMPAT_FREEBSD13
2830
int
2831
freebsd13_swapoff(struct thread *td, struct freebsd13_swapoff_args *uap)
2832
{
2833
return (kern_swapoff(td, uap->name, UIO_USERSPACE, 0));
2834
}
2835
#endif
2836
2837
int
2838
sys_swapoff(struct thread *td, struct swapoff_args *uap)
2839
{
2840
return (kern_swapoff(td, uap->name, UIO_USERSPACE, uap->flags));
2841
}
2842
2843
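/*
* Disable a single swap device: unless SWAPOFF_FORCE is given, check
* that its contents will fit into free memory plus the remaining swap,
* then page everything back in and tear the device down.
*/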
static int
2844
swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags)
2845
{
2846
u_long nblks;
2847
#ifdef MAC
2848
int error;
2849
#endif
2850
2851
sx_assert(&swdev_syscall_lock, SA_XLOCKED);
2852
#ifdef MAC
2853
(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
2854
error = mac_system_check_swapoff(cred, sp->sw_vp);
2855
(void) VOP_UNLOCK(sp->sw_vp);
2856
if (error != 0)
2857
return (error);
2858
#endif
2859
nblks = sp->sw_nblks;
2860
2861
/*
2862
* We can turn off this swap device safely only if the
2863
* available virtual memory in the system will fit the amount
2864
* of data we will have to page back in, plus an epsilon so
2865
* the system doesn't become critically low on swap space.
2866
* The vm_free_count() part does not account e.g. for clean
2867
* pages that can be immediately reclaimed without paging, so
2868
* this is a very rough estimation.
2869
*
2870
* On the other hand, not turning swap off on swapoff_all()
2871
* means that we can lose swap data when filesystems go away,
2872
* which is arguably worse.
2873
*/
2874
if ((flags & SWAPOFF_FORCE) == 0 &&
2875
vm_free_count() + swap_pager_avail < nblks + nswap_lowat)
2876
return (ENOMEM);
2877
2878
/*
2879
* Prevent further allocations on this device.
2880
*/
2881
mtx_lock(&sw_dev_mtx);
2882
sp->sw_flags |= SW_CLOSING;
2883
swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks);
2884
swap_total -= nblks;
2885
mtx_unlock(&sw_dev_mtx);
2886
2887
/*
2888
* Page in the contents of the device and close it.
2889
*/
2890
swap_pager_swapoff(sp);
2891
2892
sp->sw_close(curthread, sp);
2893
mtx_lock(&sw_dev_mtx);
2894
sp->sw_id = NULL;
2895
TAILQ_REMOVE(&swtailq, sp, sw_list);
2896
nswapdev--;
2897
if (nswapdev == 0)
2898
swap_pager_full = swap_pager_almost_full = true;
2899
if (swdevhd == sp)
2900
swdevhd = NULL;
2901
mtx_unlock(&sw_dev_mtx);
2902
blist_destroy(sp->sw_blist);
2903
free(sp, M_VMPGDATA);
2904
return (0);
2905
}
2906
2907
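/*
* Forcibly remove all configured swap devices, logging any device that
* cannot be removed.
*/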
void
2908
swapoff_all(void)
2909
{
2910
struct swdevt *sp, *spt;
2911
const char *devname;
2912
int error;
2913
2914
sx_xlock(&swdev_syscall_lock);
2915
2916
mtx_lock(&sw_dev_mtx);
2917
TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
2918
mtx_unlock(&sw_dev_mtx);
2919
if (vn_isdisk(sp->sw_vp))
2920
devname = devtoname(sp->sw_vp->v_rdev);
2921
else
2922
devname = "[file]";
2923
error = swapoff_one(sp, thread0.td_ucred, SWAPOFF_FORCE);
2924
if (error != 0) {
2925
printf("Cannot remove swap device %s (error=%d), "
2926
"skipping.\n", devname, error);
2927
} else if (bootverbose) {
2928
printf("Swap device %s removed.\n", devname);
2929
}
2930
mtx_lock(&sw_dev_mtx);
2931
}
2932
mtx_unlock(&sw_dev_mtx);
2933
2934
sx_xunlock(&swdev_syscall_lock);
2935
}
2936
2937
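/*
* Report total and used swap space, in pages; the blocks reserved at
* the front of each device for the disk label are not counted as used.
*/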
void
2938
swap_pager_status(int *total, int *used)
2939
{
2940
2941
*total = swap_total;
2942
*used = swap_total - swap_pager_avail -
2943
nswapdev * howmany(BBSIZE, PAGE_SIZE);
2944
}
2945
2946
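/*
* Copy out the parameters of the name'th swap device into *xs and,
* optionally, its device name; return ENOENT if no such device exists.
*/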
int
2947
swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len)
2948
{
2949
struct swdevt *sp;
2950
const char *tmp_devname;
2951
int error, n;
2952
2953
n = 0;
2954
error = ENOENT;
2955
mtx_lock(&sw_dev_mtx);
2956
TAILQ_FOREACH(sp, &swtailq, sw_list) {
2957
if (n != name) {
2958
n++;
2959
continue;
2960
}
2961
xs->xsw_version = XSWDEV_VERSION;
2962
xs->xsw_dev = sp->sw_dev;
2963
xs->xsw_flags = sp->sw_flags;
2964
xs->xsw_nblks = sp->sw_nblks;
2965
xs->xsw_used = sp->sw_used;
2966
if (devname != NULL) {
2967
if (vn_isdisk(sp->sw_vp))
2968
tmp_devname = devtoname(sp->sw_vp->v_rdev);
2969
else
2970
tmp_devname = "[file]";
2971
strncpy(devname, tmp_devname, len);
2972
}
2973
error = 0;
2974
break;
2975
}
2976
mtx_unlock(&sw_dev_mtx);
2977
return (error);
2978
}
2979
2980
#if defined(COMPAT_FREEBSD11)
2981
#define XSWDEV_VERSION_11 1
2982
struct xswdev11 {
2983
u_int xsw_version;
2984
uint32_t xsw_dev;
2985
int xsw_flags;
2986
int xsw_nblks;
2987
int xsw_used;
2988
};
2989
#endif
2990
2991
#if defined(__amd64__) && defined(COMPAT_FREEBSD32)
2992
struct xswdev32 {
2993
u_int xsw_version;
2994
u_int xsw_dev1, xsw_dev2;
2995
int xsw_flags;
2996
int xsw_nblks;
2997
int xsw_used;
2998
};
2999
#endif
3000
3001
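/*
* Sysctl handler exporting per-device swap statistics, with support for
* the older xswdev layouts kept for 32-bit and FreeBSD 11 compatibility.
*/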
static int
3002
sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
3003
{
3004
struct xswdev xs;
3005
#if defined(__amd64__) && defined(COMPAT_FREEBSD32)
3006
struct xswdev32 xs32;
3007
#endif
3008
#if defined(COMPAT_FREEBSD11)
3009
struct xswdev11 xs11;
3010
#endif
3011
int error;
3012
3013
if (arg2 != 1) /* name length */
3014
return (EINVAL);
3015
3016
memset(&xs, 0, sizeof(xs));
3017
error = swap_dev_info(*(int *)arg1, &xs, NULL, 0);
3018
if (error != 0)
3019
return (error);
3020
#if defined(__amd64__) && defined(COMPAT_FREEBSD32)
3021
if (req->oldlen == sizeof(xs32)) {
3022
memset(&xs32, 0, sizeof(xs32));
3023
xs32.xsw_version = XSWDEV_VERSION;
3024
xs32.xsw_dev1 = xs.xsw_dev;
3025
xs32.xsw_dev2 = xs.xsw_dev >> 32;
3026
xs32.xsw_flags = xs.xsw_flags;
3027
xs32.xsw_nblks = xs.xsw_nblks;
3028
xs32.xsw_used = xs.xsw_used;
3029
error = SYSCTL_OUT(req, &xs32, sizeof(xs32));
3030
return (error);
3031
}
3032
#endif
3033
#if defined(COMPAT_FREEBSD11)
3034
if (req->oldlen == sizeof(xs11)) {
3035
memset(&xs11, 0, sizeof(xs11));
3036
xs11.xsw_version = XSWDEV_VERSION_11;
3037
xs11.xsw_dev = xs.xsw_dev; /* truncation */
3038
xs11.xsw_flags = xs.xsw_flags;
3039
xs11.xsw_nblks = xs.xsw_nblks;
3040
xs11.xsw_used = xs.xsw_used;
3041
error = SYSCTL_OUT(req, &xs11, sizeof(xs11));
3042
return (error);
3043
}
3044
#endif
3045
error = SYSCTL_OUT(req, &xs, sizeof(xs));
3046
return (error);
3047
}
3048
3049
SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
3050
"Number of swap devices");
3051
SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
3052
sysctl_vm_swap_info,
3053
"Swap statistics by device");
3054
3055
/*
3056
* Count the approximate swap usage in pages for a vmspace. Swap
3057
* blocks that are shadowed or not yet copied on write are not counted.
3058
* The map must be locked.
3059
*/
3060
long
3061
vmspace_swap_count(struct vmspace *vmspace)
3062
{
3063
struct pctrie_iter blks;
3064
vm_map_t map;
3065
vm_map_entry_t cur;
3066
vm_object_t object;
3067
struct swblk *sb;
3068
vm_pindex_t e, pi;
3069
long count;
3070
int i, limit, start;
3071
3072
map = &vmspace->vm_map;
3073
count = 0;
3074
3075
VM_MAP_ENTRY_FOREACH(cur, map) {
3076
if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3077
continue;
3078
object = cur->object.vm_object;
3079
if (object == NULL || (object->flags & OBJ_SWAP) == 0)
3080
continue;
3081
VM_OBJECT_RLOCK(object);
3082
if ((object->flags & OBJ_SWAP) == 0)
3083
goto unlock;
3084
pi = OFF_TO_IDX(cur->offset);
3085
e = pi + OFF_TO_IDX(cur->end - cur->start);
3086
for (sb = swblk_iter_limit_init(&blks, object, pi, e),
3087
start = swblk_start(sb, pi);
3088
sb != NULL; sb = swblk_iter_next(&blks), start = 0) {
3089
limit = MIN(e - blks.index, SWAP_META_PAGES);
3090
for (i = start; i < limit; i++) {
3091
if (sb->d[i] != SWAPBLK_NONE)
3092
count++;
3093
}
3094
}
3095
unlock:
3096
VM_OBJECT_RUNLOCK(object);
3097
}
3098
return (count);
3099
}
3100
3101
/*
3102
* GEOM backend
3103
*
3104
* Swapping onto disk devices.
3105
*
3106
*/
3107
3108
static g_orphan_t swapgeom_orphan;
3109
3110
static struct g_class g_swap_class = {
3111
.name = "SWAP",
3112
.version = G_VERSION,
3113
.orphan = swapgeom_orphan,
3114
};
3115
3116
DECLARE_GEOM_CLASS(g_swap_class, g_class);
3117
3118
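/*
* GEOM event handler that releases our access counts and destroys the
* swap consumer; topology changes must run in the GEOM event thread.
*/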
static void
3119
swapgeom_close_ev(void *arg, int flags)
3120
{
3121
struct g_consumer *cp;
3122
3123
cp = arg;
3124
g_access(cp, -1, -1, 0);
3125
g_detach(cp);
3126
g_destroy_consumer(cp);
3127
}
3128
3129
/*
3130
* Add a reference to the g_consumer for an inflight transaction.
3131
*/
3132
static void
3133
swapgeom_acquire(struct g_consumer *cp)
3134
{
3135
3136
mtx_assert(&sw_dev_mtx, MA_OWNED);
3137
cp->index++;
3138
}
3139
3140
/*
3141
* Remove a reference from the g_consumer. Post a close event if all
3142
* references go away, since the function might be called from the
3143
* biodone context.
3144
*/
3145
static void
3146
swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
3147
{
3148
3149
mtx_assert(&sw_dev_mtx, MA_OWNED);
3150
cp->index--;
3151
if (cp->index == 0) {
3152
if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
3153
sp->sw_id = NULL;
3154
}
3155
}
3156
3157
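/*
* GEOM completion callback: propagate the I/O status back into the swap
* buf, finish it, and drop the in-flight reference on the consumer.
*/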
static void
3158
swapgeom_done(struct bio *bp2)
3159
{
3160
struct swdevt *sp;
3161
struct buf *bp;
3162
struct g_consumer *cp;
3163
3164
bp = bp2->bio_caller2;
3165
cp = bp2->bio_from;
3166
bp->b_ioflags = bp2->bio_flags;
3167
if (bp2->bio_error)
3168
bp->b_ioflags |= BIO_ERROR;
3169
bp->b_resid = bp->b_bcount - bp2->bio_completed;
3170
bp->b_error = bp2->bio_error;
3171
bp->b_caller1 = NULL;
3172
bufdone(bp);
3173
sp = bp2->bio_caller1;
3174
mtx_lock(&sw_dev_mtx);
3175
swapgeom_release(cp, sp);
3176
mtx_unlock(&sw_dev_mtx);
3177
g_destroy_bio(bp2);
3178
}
3179
3180
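/*
* Translate a swap buf into a GEOM bio and issue it to the consumer for
* the swap device, failing the buf if the device is going away or no
* bio can be allocated.
*/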
static void
3181
swapgeom_strategy(struct buf *bp, struct swdevt *sp)
3182
{
3183
struct bio *bio;
3184
struct g_consumer *cp;
3185
3186
mtx_lock(&sw_dev_mtx);
3187
cp = sp->sw_id;
3188
if (cp == NULL) {
3189
mtx_unlock(&sw_dev_mtx);
3190
bp->b_error = ENXIO;
3191
bp->b_ioflags |= BIO_ERROR;
3192
bufdone(bp);
3193
return;
3194
}
3195
swapgeom_acquire(cp);
3196
mtx_unlock(&sw_dev_mtx);
3197
if (bp->b_iocmd == BIO_WRITE)
3198
bio = g_new_bio();
3199
else
3200
bio = g_alloc_bio();
3201
if (bio == NULL) {
3202
mtx_lock(&sw_dev_mtx);
3203
swapgeom_release(cp, sp);
3204
mtx_unlock(&sw_dev_mtx);
3205
bp->b_error = ENOMEM;
3206
bp->b_ioflags |= BIO_ERROR;
3207
printf("swap_pager: cannot allocate bio\n");
3208
bufdone(bp);
3209
return;
3210
}
3211
3212
bp->b_caller1 = bio;
3213
bio->bio_caller1 = sp;
3214
bio->bio_caller2 = bp;
3215
bio->bio_cmd = bp->b_iocmd;
3216
bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
3217
bio->bio_length = bp->b_bcount;
3218
bio->bio_done = swapgeom_done;
3219
bio->bio_flags |= BIO_SWAP;
3220
if (!buf_mapped(bp)) {
3221
bio->bio_ma = bp->b_pages;
3222
bio->bio_data = unmapped_buf;
3223
bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
3224
bio->bio_ma_n = bp->b_npages;
3225
bio->bio_flags |= BIO_UNMAPPED;
3226
} else {
3227
bio->bio_data = bp->b_data;
3228
bio->bio_ma = NULL;
3229
}
3230
g_io_request(bio, cp);
3231
return;
3232
}
3233
3234
static void
3235
swapgeom_orphan(struct g_consumer *cp)
3236
{
3237
struct swdevt *sp;
3238
int destroy;
3239
3240
mtx_lock(&sw_dev_mtx);
3241
TAILQ_FOREACH(sp, &swtailq, sw_list) {
3242
if (sp->sw_id == cp) {
3243
sp->sw_flags |= SW_CLOSING;
3244
break;
3245
}
3246
}
3247
/*
3248
* Drop the reference we were created with. Do this directly since we're in a
3249
* special context where we don't have to queue the call to
3250
* swapgeom_close_ev().
3251
*/
3252
cp->index--;
3253
destroy = ((sp != NULL) && (cp->index == 0));
3254
if (destroy)
3255
sp->sw_id = NULL;
3256
mtx_unlock(&sw_dev_mtx);
3257
if (destroy)
3258
swapgeom_close_ev(cp, 0);
3259
}
3260
3261
static void
3262
swapgeom_close(struct thread *td, struct swdevt *sw)
3263
{
3264
struct g_consumer *cp;
3265
3266
mtx_lock(&sw_dev_mtx);
3267
cp = sw->sw_id;
3268
sw->sw_id = NULL;
3269
mtx_unlock(&sw_dev_mtx);
3270
3271
/*
3272
* swapgeom_close() may be called from the biodone context,
3273
* where we cannot perform topology changes. Delegate the
3274
* work to the events thread.
3275
*/
3276
if (cp != NULL)
3277
g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
3278
}
3279
3280
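/*
* Attach a GEOM consumer to the provider behind the given device and
* register it as a swap device. Called with the GEOM topology lock held.
*/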
static int
3281
swapongeom_locked(struct cdev *dev, struct vnode *vp)
3282
{
3283
struct g_provider *pp;
3284
struct g_consumer *cp;
3285
static struct g_geom *gp;
3286
struct swdevt *sp;
3287
u_long nblks;
3288
int error;
3289
3290
pp = g_dev_getprovider(dev);
3291
if (pp == NULL)
3292
return (ENODEV);
3293
mtx_lock(&sw_dev_mtx);
3294
TAILQ_FOREACH(sp, &swtailq, sw_list) {
3295
cp = sp->sw_id;
3296
if (cp != NULL && cp->provider == pp) {
3297
mtx_unlock(&sw_dev_mtx);
3298
return (EBUSY);
3299
}
3300
}
3301
mtx_unlock(&sw_dev_mtx);
3302
if (gp == NULL)
3303
gp = g_new_geomf(&g_swap_class, "swap");
3304
cp = g_new_consumer(gp);
3305
cp->index = 1; /* Number of active I/Os, plus one for being active. */
3306
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
3307
g_attach(cp, pp);
3308
3309
/*
3310
* XXX: Every time you think you can improve the margin for
3311
* footshooting, somebody depends on the ability to do so:
3312
* savecore(8) wants to write to our swapdev so we cannot
3313
* set an exclusive count :-(
3314
*/
3315
error = g_access(cp, 1, 1, 0);
3316
3317
if (error == 0) {
3318
nblks = pp->mediasize / DEV_BSIZE;
3319
error = swaponsomething(vp, cp, nblks, swapgeom_strategy,
3320
swapgeom_close, dev2udev(dev),
3321
(pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
3322
if (error != 0)
3323
g_access(cp, -1, -1, 0);
3324
}
3325
if (error != 0) {
3326
g_detach(cp);
3327
g_destroy_consumer(cp);
3328
}
3329
return (error);
3330
}
3331
3332
static int
3333
swapongeom(struct vnode *vp)
3334
{
3335
int error;
3336
3337
ASSERT_VOP_ELOCKED(vp, "swapongeom");
3338
if (vp->v_type != VCHR || VN_IS_DOOMED(vp)) {
3339
error = ENOENT;
3340
} else {
3341
g_topology_lock();
3342
error = swapongeom_locked(vp->v_rdev, vp);
3343
g_topology_unlock();
3344
}
3345
return (error);
3346
}
3347
3348
/*
3349
* VNODE backend
3350
*
3351
* This is used mainly for network filesystem (read: probably only tested
3352
* with NFS) swapfiles.
3353
*
3354
*/
3355
3356
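/*
* Issue swap I/O through the backing vnode: translate the swap-relative
* block number into an offset on the vnode and hand the buf to its
* buffer object.
*/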
static void
3357
swapdev_strategy(struct buf *bp, struct swdevt *sp)
3358
{
3359
struct vnode *vp2;
3360
3361
bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
3362
3363
vp2 = sp->sw_id;
3364
vhold(vp2);
3365
if (bp->b_iocmd == BIO_WRITE) {
3366
vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
3367
if (bp->b_bufobj)
3368
bufobj_wdrop(bp->b_bufobj);
3369
bufobj_wref(&vp2->v_bufobj);
3370
} else {
3371
vn_lock(vp2, LK_SHARED | LK_RETRY);
3372
}
3373
if (bp->b_bufobj != &vp2->v_bufobj)
3374
bp->b_bufobj = &vp2->v_bufobj;
3375
bp->b_vp = vp2;
3376
bp->b_iooffset = dbtob(bp->b_blkno);
3377
bstrategy(bp);
3378
VOP_UNLOCK(vp2);
3379
}
3380
3381
static void
3382
swapdev_close(struct thread *td, struct swdevt *sp)
3383
{
3384
struct vnode *vp;
3385
3386
vp = sp->sw_vp;
3387
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3388
VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
3389
vput(vp);
3390
}
3391
3392
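/*
* Configure a vnode-backed swap area (e.g. a regular file on NFS) of
* nblks DEV_BSIZE-sized blocks.
*/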
static int
3393
swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
3394
{
3395
struct swdevt *sp;
3396
int error;
3397
3398
ASSERT_VOP_ELOCKED(vp, "swaponvp");
3399
if (nblks == 0)
3400
return (ENXIO);
3401
mtx_lock(&sw_dev_mtx);
3402
TAILQ_FOREACH(sp, &swtailq, sw_list) {
3403
if (sp->sw_id == vp) {
3404
mtx_unlock(&sw_dev_mtx);
3405
return (EBUSY);
3406
}
3407
}
3408
mtx_unlock(&sw_dev_mtx);
3409
3410
#ifdef MAC
3411
error = mac_system_check_swapon(td->td_ucred, vp);
3412
if (error == 0)
3413
#endif
3414
error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
3415
if (error != 0)
3416
return (error);
3417
3418
error = swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
3419
NODEV, 0);
3420
if (error != 0)
3421
VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
3422
return (error);
3423
}
3424
3425
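/*
* Sysctl handler adjusting nsw_wcount_async_max, the limit on the number
* of in-flight asynchronous swap writes.
*/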
static int
3426
sysctl_swap_async_max(SYSCTL_HANDLER_ARGS)
3427
{
3428
int error, new, n;
3429
3430
new = nsw_wcount_async_max;
3431
error = sysctl_handle_int(oidp, &new, 0, req);
3432
if (error != 0 || req->newptr == NULL)
3433
return (error);
3434
3435
if (new > nswbuf / 2 || new < 1)
3436
return (EINVAL);
3437
3438
mtx_lock(&swbuf_mtx);
3439
while (nsw_wcount_async_max != new) {
3440
/*
3441
* Adjust difference. If the current async count is too low,
3442
* we will need to squeeze our update slowly in. Sleep with a
3443
* higher priority than getpbuf() to finish faster.
3444
*/
3445
n = new - nsw_wcount_async_max;
3446
if (nsw_wcount_async + n >= 0) {
3447
nsw_wcount_async += n;
3448
nsw_wcount_async_max += n;
3449
wakeup(&nsw_wcount_async);
3450
} else {
3451
nsw_wcount_async_max -= nsw_wcount_async;
3452
nsw_wcount_async = 0;
3453
msleep(&nsw_wcount_async, &swbuf_mtx, PSWP,
3454
"swpsysctl", 0);
3455
}
3456
}
3457
mtx_unlock(&swbuf_mtx);
3458
3459
return (0);
3460
}
3461
3462
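/*
* Record that the range [start, end) of the object has gained a
* writeable mapping.
*/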
static void
3463
swap_pager_update_writecount(vm_object_t object, vm_offset_t start,
3464
vm_offset_t end)
3465
{
3466
3467
VM_OBJECT_WLOCK(object);
3468
KASSERT((object->flags & OBJ_ANON) == 0,
3469
("Splittable object with writecount"));
3470
object->un_pager.swp.writemappings += (vm_ooffset_t)end - start;
3471
VM_OBJECT_WUNLOCK(object);
3472
}
3473
3474
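/*
* Record that the range [start, end) of the object has lost a
* writeable mapping.
*/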
static void
3475
swap_pager_release_writecount(vm_object_t object, vm_offset_t start,
3476
vm_offset_t end)
3477
{
3478
3479
VM_OBJECT_WLOCK(object);
3480
KASSERT((object->flags & OBJ_ANON) == 0,
3481
("Splittable object with writecount"));
3482
KASSERT(object->un_pager.swp.writemappings >= (vm_ooffset_t)end - start,
3483
("swap obj %p writecount %jx dec %jx", object,
3484
(uintmax_t)object->un_pager.swp.writemappings,
3485
(uintmax_t)((vm_ooffset_t)end - start)));
3486
object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start;
3487
VM_OBJECT_WUNLOCK(object);
3488
}
3489
3490