/*-
 * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005 Yahoo! Technologies Norway AS
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * The proverbial page-out daemon.
 */

#include <sys/cdefs.h>
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/blockcount.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_radix.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static void vm_pageout_init(void);
static int vm_pageout_clean(vm_page_t m, int *numpagedout);
static int vm_pageout_cluster(vm_page_t m);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
    int starting_page_shortage);

SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
    NULL);

struct proc *pageproc;

static struct kproc_desc page_kp = {
	"pagedaemon",
	vm_pageout,
	&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
    &page_kp);

SDT_PROVIDER_DEFINE(vm);
SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);

/* Pagedaemon activity rates, in subdivisions of one second. */
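/*
 * These rates divide the system clock frequency, hz: for example, the
 * laundry thread paces a laundering run with
 * pause("laundp", hz / VM_LAUNDER_RATE), and the pageout worker sleeps for
 * hz / VM_INACT_SCAN_RATE ticks between scans, so both loops run roughly
 * ten times per second with the defaults below.
 */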
#define VM_LAUNDER_RATE		10
#define VM_INACT_SCAN_RATE	10

static int swapdev_enabled;
int vm_pageout_page_count = 32;

static int vm_panic_on_oom = 0;
SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
    CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
    "Panic on the given number of out-of-memory errors instead of "
    "killing the largest process");

static int vm_pageout_update_period;
SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
    CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
    "Maximum active LRU update period");

static int pageout_cpus_per_thread = 16;
SYSCTL_INT(_vm, OID_AUTO, pageout_cpus_per_thread, CTLFLAG_RDTUN,
    &pageout_cpus_per_thread, 0,
    "Number of CPUs per pagedaemon worker thread");

static int lowmem_period = 10;
SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
    "Low memory callback period");

static int disable_swap_pageouts;
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
    CTLFLAG_RWTUN, &disable_swap_pageouts, 0,
    "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
    CTLFLAG_RD, &pageout_lock_miss, 0,
    "vget() lock misses during pageout");

static int vm_pageout_oom_seq = 12;
SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
    CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
    "back-to-back calls to oom detector to start OOM");

static int
sysctl_laundry_weight(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = *(int *)arg1;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val < arg2 || val > 100)
		return (EINVAL);
	*(int *)arg1 = val;
	return (0);
}

static int act_scan_laundry_weight = 3;
SYSCTL_PROC(_vm, OID_AUTO, act_scan_laundry_weight,
    CTLTYPE_INT | CTLFLAG_RWTUN, &act_scan_laundry_weight, 1,
    sysctl_laundry_weight, "I",
    "weight given to clean vs. dirty pages in active queue scans");

static int inact_scan_laundry_weight = 1;
SYSCTL_PROC(_vm, OID_AUTO, inact_scan_laundry_weight,
    CTLTYPE_INT | CTLFLAG_RWTUN, &inact_scan_laundry_weight, 0,
    sysctl_laundry_weight, "I",
    "weight given to clean vs. dirty pages in inactive queue scans");

static u_int vm_background_launder_rate = 4096;
SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
    &vm_background_launder_rate, 0,
    "background laundering rate, in kilobytes per second");

static u_int vm_background_launder_max = 20 * 1024;
SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
    &vm_background_launder_max, 0,
    "background laundering cap, in kilobytes");

u_long vm_page_max_user_wired;
SYSCTL_ULONG(_vm, OID_AUTO, max_user_wired, CTLFLAG_RW,
    &vm_page_max_user_wired, 0,
    "system-wide limit to user-wired page count");

static u_int isqrt(u_int num);
static int vm_pageout_launder(struct vm_domain *vmd, int launder,
    bool in_shortfall);
static void vm_pageout_laundry_worker(void *arg);

struct scan_state {
	struct vm_batchqueue bq;
	struct vm_pagequeue *pq;
	vm_page_t marker;
	int maxscan;
	int scanned;
};

static void
vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
    vm_page_t marker, vm_page_t after, int maxscan)
{

	vm_pagequeue_assert_locked(pq);
	KASSERT((marker->a.flags & PGA_ENQUEUED) == 0,
	    ("marker %p already enqueued", marker));

	if (after == NULL)
		TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
	else
		TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
	vm_page_aflag_set(marker, PGA_ENQUEUED);

	vm_batchqueue_init(&ss->bq);
	ss->pq = pq;
	ss->marker = marker;
	ss->maxscan = maxscan;
	ss->scanned = 0;
	vm_pagequeue_unlock(pq);
}

static void
vm_pageout_end_scan(struct scan_state *ss)
{
	struct vm_pagequeue *pq;

	pq = ss->pq;
	vm_pagequeue_assert_locked(pq);
	KASSERT((ss->marker->a.flags & PGA_ENQUEUED) != 0,
	    ("marker %p not enqueued", ss->marker));

	TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
	vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
	pq->pq_pdpages += ss->scanned;
}

/*
 * Add a small number of queued pages to a batch queue for later processing
 * without the corresponding queue lock held.  The caller must have enqueued a
 * marker page at the desired start point for the scan.  Pages will be
 * physically dequeued if the caller so requests.
Otherwise, the returned284* batch may contain marker pages, and it is up to the caller to handle them.285*286* When processing the batch queue, vm_pageout_defer() must be used to287* determine whether the page has been logically dequeued since the batch was288* collected.289*/290static __always_inline void291vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)292{293struct vm_pagequeue *pq;294vm_page_t m, marker, n;295296marker = ss->marker;297pq = ss->pq;298299KASSERT((marker->a.flags & PGA_ENQUEUED) != 0,300("marker %p not enqueued", ss->marker));301302vm_pagequeue_lock(pq);303for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&304ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;305m = n, ss->scanned++) {306n = TAILQ_NEXT(m, plinks.q);307if ((m->flags & PG_MARKER) == 0) {308KASSERT((m->a.flags & PGA_ENQUEUED) != 0,309("page %p not enqueued", m));310KASSERT((m->flags & PG_FICTITIOUS) == 0,311("Fictitious page %p cannot be in page queue", m));312KASSERT((m->oflags & VPO_UNMANAGED) == 0,313("Unmanaged page %p cannot be in page queue", m));314} else if (dequeue)315continue;316317(void)vm_batchqueue_insert(&ss->bq, m);318if (dequeue) {319TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);320vm_page_aflag_clear(m, PGA_ENQUEUED);321}322}323TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);324if (__predict_true(m != NULL))325TAILQ_INSERT_BEFORE(m, marker, plinks.q);326else327TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);328if (dequeue)329vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);330vm_pagequeue_unlock(pq);331}332333/*334* Return the next page to be scanned, or NULL if the scan is complete.335*/336static __always_inline vm_page_t337vm_pageout_next(struct scan_state *ss, const bool dequeue)338{339340if (ss->bq.bq_cnt == 0)341vm_pageout_collect_batch(ss, dequeue);342return (vm_batchqueue_pop(&ss->bq));343}344345/*346* Determine whether processing of a page should be deferred and ensure that any347* outstanding queue operations are processed.348*/349static __always_inline bool350vm_pageout_defer(vm_page_t m, const uint8_t queue, const bool enqueued)351{352vm_page_astate_t as;353354as = vm_page_astate_load(m);355if (__predict_false(as.queue != queue ||356((as.flags & PGA_ENQUEUED) != 0) != enqueued))357return (true);358if ((as.flags & PGA_QUEUE_OP_MASK) != 0) {359vm_page_pqbatch_submit(m, queue);360return (true);361}362return (false);363}364365/*366* We can cluster only if the page is not clean, busy, or held, and the page is367* in the laundry queue.368*/369static bool370vm_pageout_flushable(vm_page_t m)371{372if (vm_page_tryxbusy(m) == 0)373return (false);374if (!vm_page_wired(m)) {375vm_page_test_dirty(m);376if (m->dirty != 0 && vm_page_in_laundry(m) &&377vm_page_try_remove_write(m))378return (true);379}380vm_page_xunbusy(m);381return (false);382}383384/*385* Scan for pages at adjacent offsets within the given page's object that are386* eligible for laundering, form a cluster of these pages and the given page,387* and launder that cluster.388*/389static int390vm_pageout_cluster(vm_page_t m)391{392struct pctrie_iter pages;393vm_page_t mc[2 * vm_pageout_page_count - 1];394int alignment, page_base, pageout_count;395396VM_OBJECT_ASSERT_WLOCKED(m->object);397398vm_page_assert_xbusied(m);399400vm_page_iter_init(&pages, m->object);401alignment = m->pindex % vm_pageout_page_count;402page_base = nitems(mc) / 2;403pageout_count = 1;404mc[page_base] = m;405406/*407* During heavy mmap/modification loads the pageout408* daemon can really fragment the underlying file409* due to flushing pages out of 
	 * order and not trying to
	 * align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 *
	 * If we are at an alignment boundary, stop here, and switch directions.
	 */
	if (alignment > 0) {
		pages.index = mc[page_base]->pindex;
		do {
			m = vm_radix_iter_prev(&pages);
			if (m == NULL || !vm_pageout_flushable(m))
				break;
			mc[--page_base] = m;
		} while (pageout_count++ < alignment);
	}
	if (pageout_count < vm_pageout_page_count) {
		pages.index = mc[page_base + pageout_count - 1]->pindex;
		do {
			m = vm_radix_iter_next(&pages);
			if (m == NULL || !vm_pageout_flushable(m))
				break;
			mc[page_base + pageout_count] = m;
		} while (++pageout_count < vm_pageout_page_count);
	}
	if (pageout_count < vm_pageout_page_count &&
	    alignment == nitems(mc) / 2 - page_base) {
		/* Resume the reverse scan. */
		pages.index = mc[page_base]->pindex;
		do {
			m = vm_radix_iter_prev(&pages);
			if (m == NULL || !vm_pageout_flushable(m))
				break;
			mc[--page_base] = m;
		} while (++pageout_count < vm_pageout_page_count);
	}

	return (vm_pageout_flush(&mc[page_base], pageout_count,
	    VM_PAGER_PUT_NOREUSE, NULL));
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we set up for the start of
 * I/O ( i.e. busy the page ), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * If eio is not NULL, returns the count of pages between 0 and first page
 * with status VM_PAGER_AGAIN.  *eio is set to true if pager returned
 * VM_PAGER_ERROR or VM_PAGER_FAIL for any page in that set.
 *
 * Otherwise, returns the number of paged-out pages.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags, bool *eio)
{
	vm_object_t object = mc[0]->object;
	int pageout_status[count];
	int numpagedout = 0;
	int i, runlen;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Initiate I/O.  Mark the pages shared busy and verify that they're
	 * valid and read-only.
	 *
	 * We do not have to fix up the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(vm_page_all_valid(mc[i]),
		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
		    mc[i], i, count));
		KASSERT((mc[i]->a.flags & PGA_WRITEABLE) == 0,
		    ("vm_pageout_flush: writeable page %p", mc[i]));
		vm_page_busy_downgrade(mc[i]);
	}
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count, flags, pageout_status);

	runlen = count;
	if (eio != NULL)
		*eio = false;
	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
		    !pmap_page_is_write_mapped(mt),
		    ("vm_pageout_flush: page %p is not write protected", mt));
		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			/*
			 * The page may have moved since laundering started, in
			 * which case it should be left alone.
			 */
			if (vm_page_in_laundry(mt))
				vm_page_deactivate_noreuse(mt);
			/* FALLTHROUGH */
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * The page is outside the object's range.
We pretend522* that the page out worked and clean the page, so the523* changes will be lost if the page is reclaimed by524* the page daemon.525*/526vm_page_undirty(mt);527if (vm_page_in_laundry(mt))528vm_page_deactivate_noreuse(mt);529break;530case VM_PAGER_ERROR:531case VM_PAGER_FAIL:532/*533* If the page couldn't be paged out to swap because the534* pager wasn't able to find space, place the page in535* the PQ_UNSWAPPABLE holding queue. This is an536* optimization that prevents the page daemon from537* wasting CPU cycles on pages that cannot be reclaimed538* because no swap device is configured.539*540* Otherwise, reactivate the page so that it doesn't541* clog the laundry and inactive queues. (We will try542* paging it out again later.)543*/544if ((object->flags & OBJ_SWAP) != 0 &&545pageout_status[i] == VM_PAGER_FAIL) {546vm_page_unswappable(mt);547numpagedout++;548} else549vm_page_activate(mt);550if (eio != NULL)551*eio = true;552break;553case VM_PAGER_AGAIN:554if (runlen == count)555runlen = i;556break;557}558559/*560* If the operation is still going, leave the page busy to561* block all other accesses. Also, leave the paging in562* progress indicator set so that we don't attempt an object563* collapse.564*/565if (pageout_status[i] != VM_PAGER_PEND) {566vm_object_pip_wakeup(object);567vm_page_sunbusy(mt);568}569}570if (eio != NULL)571return (runlen);572return (numpagedout);573}574575static void576vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)577{578579atomic_store_rel_int(&swapdev_enabled, 1);580}581582static void583vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)584{585586if (swap_pager_nswapdev() == 1)587atomic_store_rel_int(&swapdev_enabled, 0);588}589590/*591* Attempt to acquire all of the necessary locks to launder a page and592* then call through the clustering layer to PUTPAGES. Wait a short593* time for a vnode lock.594*595* Requires the page and object lock on entry, releases both before return.596* Returns 0 on success and an errno otherwise.597*/598static int599vm_pageout_clean(vm_page_t m, int *numpagedout)600{601struct vnode *vp;602struct mount *mp;603vm_object_t object;604vm_pindex_t pindex;605int error;606607object = m->object;608VM_OBJECT_ASSERT_WLOCKED(object);609error = 0;610vp = NULL;611mp = NULL;612613/*614* The object is already known NOT to be dead. It615* is possible for the vget() to block the whole616* pageout daemon, but the new low-memory handling617* code should prevent it.618*619* We can't wait forever for the vnode lock, we might620* deadlock due to a vn_read() getting stuck in621* vm_wait while holding this vnode. 
We skip the622* vnode if we can't get it in a reasonable amount623* of time.624*/625if (object->type == OBJT_VNODE) {626vm_page_xunbusy(m);627vp = object->handle;628if (vp->v_type == VREG &&629vn_start_write(vp, &mp, V_NOWAIT) != 0) {630mp = NULL;631error = EDEADLK;632goto unlock_all;633}634KASSERT(mp != NULL,635("vp %p with NULL v_mount", vp));636vm_object_reference_locked(object);637pindex = m->pindex;638VM_OBJECT_WUNLOCK(object);639if (vget(vp, vn_lktype_write(NULL, vp) | LK_TIMELOCK) != 0) {640vp = NULL;641error = EDEADLK;642goto unlock_mp;643}644VM_OBJECT_WLOCK(object);645646/*647* Ensure that the object and vnode were not disassociated648* while locks were dropped.649*/650if (vp->v_object != object) {651error = ENOENT;652goto unlock_all;653}654655/*656* While the object was unlocked, the page may have been:657* (1) moved to a different queue,658* (2) reallocated to a different object,659* (3) reallocated to a different offset, or660* (4) cleaned.661*/662if (!vm_page_in_laundry(m) || m->object != object ||663m->pindex != pindex || m->dirty == 0) {664error = ENXIO;665goto unlock_all;666}667668/*669* The page may have been busied while the object lock was670* released.671*/672if (vm_page_tryxbusy(m) == 0) {673error = EBUSY;674goto unlock_all;675}676}677678/*679* Remove all writeable mappings, failing if the page is wired.680*/681if (!vm_page_try_remove_write(m)) {682vm_page_xunbusy(m);683error = EBUSY;684goto unlock_all;685}686687/*688* If a page is dirty, then it is either being washed689* (but not yet cleaned) or it is still in the690* laundry. If it is still in the laundry, then we691* start the cleaning operation.692*/693if ((*numpagedout = vm_pageout_cluster(m)) == 0)694error = EIO;695696unlock_all:697VM_OBJECT_WUNLOCK(object);698699unlock_mp:700if (mp != NULL) {701if (vp != NULL)702vput(vp);703vm_object_deallocate(object);704vn_finished_write(mp);705}706707return (error);708}709710/*711* Attempt to launder the specified number of pages.712*713* Returns the number of pages successfully laundered.714*/715static int716vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)717{718struct scan_state ss;719struct vm_pagequeue *pq;720vm_object_t object;721vm_page_t m, marker;722vm_page_astate_t new, old;723int act_delta, error, numpagedout, queue, refs, starting_target;724int vnodes_skipped;725bool pageout_ok;726727object = NULL;728starting_target = launder;729vnodes_skipped = 0;730731/*732* Scan the laundry queues for pages eligible to be laundered. We stop733* once the target number of dirty pages have been laundered, or once734* we've reached the end of the queue. A single iteration of this loop735* may cause more than one page to be laundered because of clustering.736*737* As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no738* swap devices are configured.739*/740if (atomic_load_acq_int(&swapdev_enabled))741queue = PQ_UNSWAPPABLE;742else743queue = PQ_LAUNDRY;744745scan:746marker = &vmd->vmd_markers[queue];747pq = &vmd->vmd_pagequeues[queue];748vm_pagequeue_lock(pq);749vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);750while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {751if (__predict_false((m->flags & PG_MARKER) != 0))752continue;753754/*755* Don't touch a page that was removed from the queue after the756* page queue lock was released. 
Otherwise, ensure that any757* pending queue operations, such as dequeues for wired pages,758* are handled.759*/760if (vm_pageout_defer(m, queue, true))761continue;762763/*764* Lock the page's object.765*/766if (object == NULL || object != m->object) {767if (object != NULL)768VM_OBJECT_WUNLOCK(object);769object = atomic_load_ptr(&m->object);770if (__predict_false(object == NULL))771/* The page is being freed by another thread. */772continue;773774/* Depends on type-stability. */775VM_OBJECT_WLOCK(object);776if (__predict_false(m->object != object)) {777VM_OBJECT_WUNLOCK(object);778object = NULL;779continue;780}781}782783if (vm_page_tryxbusy(m) == 0)784continue;785786/*787* Check for wirings now that we hold the object lock and have788* exclusively busied the page. If the page is mapped, it may789* still be wired by pmap lookups. The call to790* vm_page_try_remove_all() below atomically checks for such791* wirings and removes mappings. If the page is unmapped, the792* wire count is guaranteed not to increase after this check.793*/794if (__predict_false(vm_page_wired(m)))795goto skip_page;796797/*798* Invalid pages can be easily freed. They cannot be799* mapped; vm_page_free() asserts this.800*/801if (vm_page_none_valid(m))802goto free_page;803804refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;805806for (old = vm_page_astate_load(m);;) {807/*808* Check to see if the page has been removed from the809* queue since the first such check. Leave it alone if810* so, discarding any references collected by811* pmap_ts_referenced().812*/813if (__predict_false(_vm_page_queue(old) == PQ_NONE))814goto skip_page;815816new = old;817act_delta = refs;818if ((old.flags & PGA_REFERENCED) != 0) {819new.flags &= ~PGA_REFERENCED;820act_delta++;821}822if (act_delta == 0) {823;824} else if (object->ref_count != 0) {825/*826* Increase the activation count if the page was827* referenced while in the laundry queue. This828* makes it less likely that the page will be829* returned prematurely to the laundry queue.830*/831new.act_count += ACT_ADVANCE +832act_delta;833if (new.act_count > ACT_MAX)834new.act_count = ACT_MAX;835836new.flags &= ~PGA_QUEUE_OP_MASK;837new.flags |= PGA_REQUEUE;838new.queue = PQ_ACTIVE;839if (!vm_page_pqstate_commit(m, &old, new))840continue;841842/*843* If this was a background laundering, count844* activated pages towards our target. The845* purpose of background laundering is to ensure846* that pages are eventually cycled through the847* laundry queue, and an activation is a valid848* way out.849*/850if (!in_shortfall)851launder--;852VM_CNT_INC(v_reactivated);853goto skip_page;854} else if ((object->flags & OBJ_DEAD) == 0) {855new.flags |= PGA_REQUEUE;856if (!vm_page_pqstate_commit(m, &old, new))857continue;858goto skip_page;859}860break;861}862863/*864* If the page appears to be clean at the machine-independent865* layer, then remove all of its mappings from the pmap in866* anticipation of freeing it. If, however, any of the page's867* mappings allow write access, then the page may still be868* modified until the last of those mappings are removed.869*/870if (object->ref_count != 0) {871vm_page_test_dirty(m);872if (m->dirty == 0 && !vm_page_try_remove_all(m))873goto skip_page;874}875876/*877* Clean pages are freed, and dirty pages are paged out unless878* they belong to a dead object. 
Requeueing dirty pages from879* dead objects is pointless, as they are being paged out and880* freed by the thread that destroyed the object.881*/882if (m->dirty == 0) {883free_page:884/*885* Now we are guaranteed that no other threads are886* manipulating the page, check for a last-second887* reference.888*/889if (vm_pageout_defer(m, queue, true))890goto skip_page;891vm_page_free(m);892VM_CNT_INC(v_dfree);893} else if ((object->flags & OBJ_DEAD) == 0) {894if ((object->flags & OBJ_SWAP) != 0)895pageout_ok = disable_swap_pageouts == 0;896else897pageout_ok = true;898if (!pageout_ok) {899vm_page_launder(m);900goto skip_page;901}902903/*904* Form a cluster with adjacent, dirty pages from the905* same object, and page out that entire cluster.906*907* The adjacent, dirty pages must also be in the908* laundry. However, their mappings are not checked909* for new references. Consequently, a recently910* referenced page may be paged out. However, that911* page will not be prematurely reclaimed. After page912* out, the page will be placed in the inactive queue,913* where any new references will be detected and the914* page reactivated.915*/916error = vm_pageout_clean(m, &numpagedout);917if (error == 0) {918launder -= numpagedout;919ss.scanned += numpagedout;920} else if (error == EDEADLK) {921pageout_lock_miss++;922vnodes_skipped++;923}924object = NULL;925} else {926skip_page:927vm_page_xunbusy(m);928}929}930if (object != NULL) {931VM_OBJECT_WUNLOCK(object);932object = NULL;933}934vm_pagequeue_lock(pq);935vm_pageout_end_scan(&ss);936vm_pagequeue_unlock(pq);937938if (launder > 0 && queue == PQ_UNSWAPPABLE) {939queue = PQ_LAUNDRY;940goto scan;941}942943/*944* Wakeup the sync daemon if we skipped a vnode in a writeable object945* and we didn't launder enough pages.946*/947if (vnodes_skipped > 0 && launder > 0)948(void)speedup_syncer();949950return (starting_target - launder);951}952953/*954* Compute the integer square root.955*/956static u_int957isqrt(u_int num)958{959u_int bit, root, tmp;960961bit = num != 0 ? (1u << ((fls(num) - 1) & ~1)) : 0;962root = 0;963while (bit != 0) {964tmp = root + bit;965root >>= 1;966if (num >= tmp) {967num -= tmp;968root += bit;969}970bit >>= 2;971}972return (root);973}974975/*976* Perform the work of the laundry thread: periodically wake up and determine977* whether any pages need to be laundered. 
If so, determine the number of pages978* that need to be laundered, and launder them.979*/980static void981vm_pageout_laundry_worker(void *arg)982{983struct vm_domain *vmd;984struct vm_pagequeue *pq;985uint64_t nclean, ndirty, nfreed;986int domain, last_target, launder, shortfall, shortfall_cycle, target;987bool in_shortfall;988989domain = (uintptr_t)arg;990vmd = VM_DOMAIN(domain);991pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];992KASSERT(vmd->vmd_segs != 0, ("domain without segments"));993994shortfall = 0;995in_shortfall = false;996shortfall_cycle = 0;997last_target = target = 0;998nfreed = 0;9991000/*1001* Calls to these handlers are serialized by the swap syscall lock.1002*/1003(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,1004EVENTHANDLER_PRI_ANY);1005(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,1006EVENTHANDLER_PRI_ANY);10071008/*1009* The pageout laundry worker is never done, so loop forever.1010*/1011for (;;) {1012KASSERT(target >= 0, ("negative target %d", target));1013KASSERT(shortfall_cycle >= 0,1014("negative cycle %d", shortfall_cycle));1015launder = 0;10161017/*1018* First determine whether we need to launder pages to meet a1019* shortage of free pages.1020*/1021if (shortfall > 0) {1022in_shortfall = true;1023shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;1024target = shortfall;1025} else if (!in_shortfall)1026goto trybackground;1027else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {1028/*1029* We recently entered shortfall and began laundering1030* pages. If we have completed that laundering run1031* (and we are no longer in shortfall) or we have met1032* our laundry target through other activity, then we1033* can stop laundering pages.1034*/1035in_shortfall = false;1036target = 0;1037goto trybackground;1038}1039launder = target / shortfall_cycle--;1040goto dolaundry;10411042/*1043* There's no immediate need to launder any pages; see if we1044* meet the conditions to perform background laundering:1045*1046* 1. The ratio of dirty to clean inactive pages exceeds the1047* background laundering threshold, or1048* 2. we haven't yet reached the target of the current1049* background laundering run.1050*1051* The background laundering threshold is not a constant.1052* Instead, it is a slowly growing function of the number of1053* clean pages freed by the page daemon since the last1054* background laundering. Thus, as the ratio of dirty to1055* clean inactive pages grows, the amount of memory pressure1056* required to trigger laundering decreases. We ensure1057* that the threshold is non-zero after an inactive queue1058* scan, even if that scan failed to free a single clean page.1059*/1060trybackground:1061nclean = vmd->vmd_free_count +1062vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;1063ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;1064if (target == 0 && ndirty * isqrt(howmany(nfreed + 1,1065vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) {1066target = vmd->vmd_background_launder_target;1067}10681069/*1070* We have a non-zero background laundering target. If we've1071* laundered up to our maximum without observing a page daemon1072* request, just stop. This is a safety belt that ensures we1073* don't launder an excessive amount if memory pressure is low1074* and the ratio of dirty to clean pages is large. 
Otherwise,1075* proceed at the background laundering rate.1076*/1077if (target > 0) {1078if (nfreed > 0) {1079nfreed = 0;1080last_target = target;1081} else if (last_target - target >=1082vm_background_launder_max * PAGE_SIZE / 1024) {1083target = 0;1084}1085launder = vm_background_launder_rate * PAGE_SIZE / 1024;1086launder /= VM_LAUNDER_RATE;1087if (launder > target)1088launder = target;1089}10901091dolaundry:1092if (launder > 0) {1093/*1094* Because of I/O clustering, the number of laundered1095* pages could exceed "target" by the maximum size of1096* a cluster minus one.1097*/1098target -= min(vm_pageout_launder(vmd, launder,1099in_shortfall), target);1100pause("laundp", hz / VM_LAUNDER_RATE);1101}11021103/*1104* If we're not currently laundering pages and the page daemon1105* hasn't posted a new request, sleep until the page daemon1106* kicks us.1107*/1108vm_pagequeue_lock(pq);1109if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)1110(void)mtx_sleep(&vmd->vmd_laundry_request,1111vm_pagequeue_lockptr(pq), PVM, "launds", 0);11121113/*1114* If the pagedaemon has indicated that it's in shortfall, start1115* a shortfall laundering unless we're already in the middle of1116* one. This may preempt a background laundering.1117*/1118if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&1119(!in_shortfall || shortfall_cycle == 0)) {1120shortfall = vm_laundry_target(vmd) +1121vmd->vmd_pageout_deficit;1122target = 0;1123} else1124shortfall = 0;11251126if (target == 0)1127vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;1128nfreed += vmd->vmd_clean_pages_freed;1129vmd->vmd_clean_pages_freed = 0;1130vm_pagequeue_unlock(pq);1131}1132}11331134/*1135* Compute the number of pages we want to try to move from the1136* active queue to either the inactive or laundry queue.1137*1138* When scanning active pages during a shortage, we make clean pages1139* count more heavily towards the page shortage than dirty pages.1140* This is because dirty pages must be laundered before they can be1141* reused and thus have less utility when attempting to quickly1142* alleviate a free page shortage. However, this weighting also1143* causes the scan to deactivate dirty pages more aggressively,1144* improving the effectiveness of clustering.1145*/1146static int1147vm_pageout_active_target(struct vm_domain *vmd)1148{1149int shortage;11501151shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) -1152(vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +1153vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight);1154shortage *= act_scan_laundry_weight;1155return (shortage);1156}11571158/*1159* Scan the active queue. 
If there is no shortage of inactive pages, scan a1160* small portion of the queue in order to maintain quasi-LRU.1161*/1162static void1163vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)1164{1165struct scan_state ss;1166vm_object_t object;1167vm_page_t m, marker;1168struct vm_pagequeue *pq;1169vm_page_astate_t old, new;1170long min_scan;1171int act_delta, max_scan, ps_delta, refs, scan_tick;1172uint8_t nqueue;11731174marker = &vmd->vmd_markers[PQ_ACTIVE];1175pq = &vmd->vmd_pagequeues[PQ_ACTIVE];1176vm_pagequeue_lock(pq);11771178/*1179* If we're just idle polling attempt to visit every1180* active page within 'update_period' seconds.1181*/1182scan_tick = ticks;1183if (vm_pageout_update_period != 0) {1184min_scan = pq->pq_cnt;1185min_scan *= scan_tick - vmd->vmd_last_active_scan;1186min_scan /= hz * vm_pageout_update_period;1187} else1188min_scan = 0;1189if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0))1190vmd->vmd_last_active_scan = scan_tick;11911192/*1193* Scan the active queue for pages that can be deactivated. Update1194* the per-page activity counter and use it to identify deactivation1195* candidates. Held pages may be deactivated.1196*1197* To avoid requeuing each page that remains in the active queue, we1198* implement the CLOCK algorithm. To keep the implementation of the1199* enqueue operation consistent for all page queues, we use two hands,1200* represented by marker pages. Scans begin at the first hand, which1201* precedes the second hand in the queue. When the two hands meet,1202* they are moved back to the head and tail of the queue, respectively,1203* and scanning resumes.1204*/1205max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan;1206act_scan:1207vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);1208while ((m = vm_pageout_next(&ss, false)) != NULL) {1209if (__predict_false(m == &vmd->vmd_clock[1])) {1210vm_pagequeue_lock(pq);1211TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);1212TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);1213TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],1214plinks.q);1215TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],1216plinks.q);1217max_scan -= ss.scanned;1218vm_pageout_end_scan(&ss);1219goto act_scan;1220}1221if (__predict_false((m->flags & PG_MARKER) != 0))1222continue;12231224/*1225* Don't touch a page that was removed from the queue after the1226* page queue lock was released. Otherwise, ensure that any1227* pending queue operations, such as dequeues for wired pages,1228* are handled.1229*/1230if (vm_pageout_defer(m, PQ_ACTIVE, true))1231continue;12321233/*1234* A page's object pointer may be set to NULL before1235* the object lock is acquired.1236*/1237object = atomic_load_ptr(&m->object);1238if (__predict_false(object == NULL))1239/*1240* The page has been removed from its object.1241*/1242continue;12431244/* Deferred free of swap space. */1245if ((m->a.flags & PGA_SWAP_FREE) != 0 &&1246VM_OBJECT_TRYWLOCK(object)) {1247if (m->object == object)1248vm_pager_page_unswapped(m);1249VM_OBJECT_WUNLOCK(object);1250}12511252/*1253* Check to see "how much" the page has been used.1254*1255* Test PGA_REFERENCED after calling pmap_ts_referenced() so1256* that a reference from a concurrently destroyed mapping is1257* observed here and now.1258*1259* Perform an unsynchronized object ref count check. 
While1260* the page lock ensures that the page is not reallocated to1261* another object, in particular, one with unmanaged mappings1262* that cannot support pmap_ts_referenced(), two races are,1263* nonetheless, possible:1264* 1) The count was transitioning to zero, but we saw a non-1265* zero value. pmap_ts_referenced() will return zero1266* because the page is not mapped.1267* 2) The count was transitioning to one, but we saw zero.1268* This race delays the detection of a new reference. At1269* worst, we will deactivate and reactivate the page.1270*/1271refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;12721273old = vm_page_astate_load(m);1274do {1275/*1276* Check to see if the page has been removed from the1277* queue since the first such check. Leave it alone if1278* so, discarding any references collected by1279* pmap_ts_referenced().1280*/1281if (__predict_false(_vm_page_queue(old) == PQ_NONE)) {1282ps_delta = 0;1283break;1284}12851286/*1287* Advance or decay the act_count based on recent usage.1288*/1289new = old;1290act_delta = refs;1291if ((old.flags & PGA_REFERENCED) != 0) {1292new.flags &= ~PGA_REFERENCED;1293act_delta++;1294}1295if (act_delta != 0) {1296new.act_count += ACT_ADVANCE + act_delta;1297if (new.act_count > ACT_MAX)1298new.act_count = ACT_MAX;1299} else {1300new.act_count -= min(new.act_count,1301ACT_DECLINE);1302}13031304if (new.act_count > 0) {1305/*1306* Adjust the activation count and keep the page1307* in the active queue. The count might be left1308* unchanged if it is saturated. The page may1309* have been moved to a different queue since we1310* started the scan, in which case we move it1311* back.1312*/1313ps_delta = 0;1314if (old.queue != PQ_ACTIVE) {1315new.flags &= ~PGA_QUEUE_OP_MASK;1316new.flags |= PGA_REQUEUE;1317new.queue = PQ_ACTIVE;1318}1319} else {1320/*1321* When not short for inactive pages, let dirty1322* pages go through the inactive queue before1323* moving to the laundry queue. This gives them1324* some extra time to be reactivated,1325* potentially avoiding an expensive pageout.1326* However, during a page shortage, the inactive1327* queue is necessarily small, and so dirty1328* pages would only spend a trivial amount of1329* time in the inactive queue. Therefore, we1330* might as well place them directly in the1331* laundry queue to reduce queuing overhead.1332*1333* Calling vm_page_test_dirty() here would1334* require acquisition of the object's write1335* lock. However, during a page shortage,1336* directing dirty pages into the laundry queue1337* is only an optimization and not a1338* requirement. 
Therefore, we simply rely on1339* the opportunistic updates to the page's dirty1340* field by the pmap.1341*/1342if (page_shortage <= 0) {1343nqueue = PQ_INACTIVE;1344ps_delta = 0;1345} else if (m->dirty == 0) {1346nqueue = PQ_INACTIVE;1347ps_delta = act_scan_laundry_weight;1348} else {1349nqueue = PQ_LAUNDRY;1350ps_delta = 1;1351}13521353new.flags &= ~PGA_QUEUE_OP_MASK;1354new.flags |= PGA_REQUEUE;1355new.queue = nqueue;1356}1357} while (!vm_page_pqstate_commit(m, &old, new));13581359page_shortage -= ps_delta;1360}1361vm_pagequeue_lock(pq);1362TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);1363TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);1364vm_pageout_end_scan(&ss);1365vm_pagequeue_unlock(pq);1366}13671368static int1369vm_pageout_reinsert_inactive_page(struct vm_pagequeue *pq, vm_page_t marker,1370vm_page_t m)1371{1372vm_page_astate_t as;13731374vm_pagequeue_assert_locked(pq);13751376as = vm_page_astate_load(m);1377if (as.queue != PQ_INACTIVE || (as.flags & PGA_ENQUEUED) != 0)1378return (0);1379vm_page_aflag_set(m, PGA_ENQUEUED);1380TAILQ_INSERT_BEFORE(marker, m, plinks.q);1381return (1);1382}13831384/*1385* Re-add stuck pages to the inactive queue. We will examine them again1386* during the next scan. If the queue state of a page has changed since1387* it was physically removed from the page queue in1388* vm_pageout_collect_batch(), don't do anything with that page.1389*/1390static void1391vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,1392vm_page_t m)1393{1394struct vm_pagequeue *pq;1395vm_page_t marker;1396int delta;13971398delta = 0;1399marker = ss->marker;1400pq = ss->pq;14011402if (m != NULL) {1403if (vm_batchqueue_insert(bq, m) != 0)1404return;1405vm_pagequeue_lock(pq);1406delta += vm_pageout_reinsert_inactive_page(pq, marker, m);1407} else1408vm_pagequeue_lock(pq);1409while ((m = vm_batchqueue_pop(bq)) != NULL)1410delta += vm_pageout_reinsert_inactive_page(pq, marker, m);1411vm_pagequeue_cnt_add(pq, delta);1412vm_pagequeue_unlock(pq);1413vm_batchqueue_init(bq);1414}14151416static void1417vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage)1418{1419struct timeval start, end;1420struct scan_state ss;1421struct vm_batchqueue rq;1422struct vm_page marker_page;1423vm_page_t m, marker;1424struct vm_pagequeue *pq;1425vm_object_t object;1426vm_page_astate_t old, new;1427int act_delta, addl_page_shortage, dirty_count, dirty_thresh;1428int starting_page_shortage, refs;14291430object = NULL;1431vm_batchqueue_init(&rq);1432getmicrouptime(&start);14331434/*1435* The addl_page_shortage is an estimate of the number of temporarily1436* stuck pages in the inactive queue. In other words, the1437* number of pages from the inactive count that should be1438* discounted in setting the target for the active queue scan.1439*/1440addl_page_shortage = 0;14411442/*1443* dirty_count is the number of pages encountered that require1444* laundering before reclamation is possible. If we encounter a large1445* number of dirty pages, we may abort the scan without meeting the page1446* shortage in the hope that laundering will allow a future scan to meet1447* the target.1448*/1449dirty_count = 0;1450dirty_thresh = inact_scan_laundry_weight * page_shortage;1451if (dirty_thresh == 0)1452dirty_thresh = INT_MAX;14531454/*1455* Start scanning the inactive queue for pages that we can free. The1456* scan will stop when we reach the target or we have scanned the1457* entire queue. 
(Note that m->a.act_count is not used to make1458* decisions for the inactive queue, only for the active queue.)1459*/1460starting_page_shortage = page_shortage;1461marker = &marker_page;1462vm_page_init_marker(marker, PQ_INACTIVE, 0);1463pq = &vmd->vmd_pagequeues[PQ_INACTIVE];1464vm_pagequeue_lock(pq);1465vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);1466while (page_shortage > 0 && dirty_count < dirty_thresh) {1467/*1468* If we need to refill the scan batch queue, release any1469* optimistically held object lock. This gives someone else a1470* chance to grab the lock, and also avoids holding it while we1471* do unrelated work.1472*/1473if (object != NULL && vm_batchqueue_empty(&ss.bq)) {1474VM_OBJECT_WUNLOCK(object);1475object = NULL;1476}14771478m = vm_pageout_next(&ss, true);1479if (m == NULL)1480break;1481KASSERT((m->flags & PG_MARKER) == 0,1482("marker page %p was dequeued", m));14831484/*1485* Don't touch a page that was removed from the queue after the1486* page queue lock was released. Otherwise, ensure that any1487* pending queue operations, such as dequeues for wired pages,1488* are handled.1489*/1490if (vm_pageout_defer(m, PQ_INACTIVE, false))1491continue;14921493/*1494* Lock the page's object.1495*/1496if (object == NULL || object != m->object) {1497if (object != NULL)1498VM_OBJECT_WUNLOCK(object);1499object = atomic_load_ptr(&m->object);1500if (__predict_false(object == NULL))1501/* The page is being freed by another thread. */1502continue;15031504/* Depends on type-stability. */1505VM_OBJECT_WLOCK(object);1506if (__predict_false(m->object != object)) {1507VM_OBJECT_WUNLOCK(object);1508object = NULL;1509goto reinsert;1510}1511}15121513if (vm_page_tryxbusy(m) == 0) {1514/*1515* Don't mess with busy pages. Leave them at1516* the front of the queue. Most likely, they1517* are being paged out and will leave the1518* queue shortly after the scan finishes. So,1519* they ought to be discounted from the1520* inactive count.1521*/1522addl_page_shortage++;1523goto reinsert;1524}15251526/* Deferred free of swap space. */1527if ((m->a.flags & PGA_SWAP_FREE) != 0)1528vm_pager_page_unswapped(m);15291530/*1531* Check for wirings now that we hold the object lock and have1532* exclusively busied the page. If the page is mapped, it may1533* still be wired by pmap lookups. The call to1534* vm_page_try_remove_all() below atomically checks for such1535* wirings and removes mappings. If the page is unmapped, the1536* wire count is guaranteed not to increase after this check.1537*/1538if (__predict_false(vm_page_wired(m)))1539goto skip_page;15401541/*1542* Invalid pages can be easily freed. They cannot be1543* mapped, vm_page_free() asserts this.1544*/1545if (vm_page_none_valid(m))1546goto free_page;15471548refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;15491550for (old = vm_page_astate_load(m);;) {1551/*1552* Check to see if the page has been removed from the1553* queue since the first such check. Leave it alone if1554* so, discarding any references collected by1555* pmap_ts_referenced().1556*/1557if (__predict_false(_vm_page_queue(old) == PQ_NONE))1558goto skip_page;15591560new = old;1561act_delta = refs;1562if ((old.flags & PGA_REFERENCED) != 0) {1563new.flags &= ~PGA_REFERENCED;1564act_delta++;1565}1566if (act_delta == 0) {1567;1568} else if (object->ref_count != 0) {1569/*1570* Increase the activation count if the1571* page was referenced while in the1572* inactive queue. 
This makes it less1573* likely that the page will be returned1574* prematurely to the inactive queue.1575*/1576new.act_count += ACT_ADVANCE +1577act_delta;1578if (new.act_count > ACT_MAX)1579new.act_count = ACT_MAX;15801581new.flags &= ~PGA_QUEUE_OP_MASK;1582new.flags |= PGA_REQUEUE;1583new.queue = PQ_ACTIVE;1584if (!vm_page_pqstate_commit(m, &old, new))1585continue;15861587VM_CNT_INC(v_reactivated);1588goto skip_page;1589} else if ((object->flags & OBJ_DEAD) == 0) {1590new.queue = PQ_INACTIVE;1591new.flags |= PGA_REQUEUE;1592if (!vm_page_pqstate_commit(m, &old, new))1593continue;1594goto skip_page;1595}1596break;1597}15981599/*1600* If the page appears to be clean at the machine-independent1601* layer, then remove all of its mappings from the pmap in1602* anticipation of freeing it. If, however, any of the page's1603* mappings allow write access, then the page may still be1604* modified until the last of those mappings are removed.1605*/1606if (object->ref_count != 0) {1607vm_page_test_dirty(m);1608if (m->dirty == 0 && !vm_page_try_remove_all(m))1609goto skip_page;1610}16111612/*1613* Clean pages can be freed, but dirty pages must be sent back1614* to the laundry, unless they belong to a dead object.1615* Requeueing dirty pages from dead objects is pointless, as1616* they are being paged out and freed by the thread that1617* destroyed the object.1618*/1619if (m->dirty == 0) {1620free_page:1621/*1622* Now we are guaranteed that no other threads are1623* manipulating the page, check for a last-second1624* reference that would save it from doom.1625*/1626if (vm_pageout_defer(m, PQ_INACTIVE, false))1627goto skip_page;16281629/*1630* Because we dequeued the page and have already checked1631* for pending dequeue and enqueue requests, we can1632* safely disassociate the page from the inactive queue1633* without holding the queue lock.1634*/1635m->a.queue = PQ_NONE;1636vm_page_free(m);1637page_shortage--;1638continue;1639}1640if ((object->flags & OBJ_DEAD) == 0) {1641vm_page_launder(m);16421643/*1644* If the page would be paged out to a swap device, and1645* no devices are configured or they are all nearly1646* full, then don't count it against our threshold,1647* since it most likely can't be used to meet our1648* target.1649*/1650if ((object->flags & OBJ_SWAP) == 0 ||1651!atomic_load_bool(&swap_pager_almost_full))1652dirty_count++;1653}1654skip_page:1655vm_page_xunbusy(m);1656continue;1657reinsert:1658vm_pageout_reinsert_inactive(&ss, &rq, m);1659}1660if (object != NULL)1661VM_OBJECT_WUNLOCK(object);1662vm_pageout_reinsert_inactive(&ss, &rq, NULL);1663vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);1664vm_pagequeue_lock(pq);1665vm_pageout_end_scan(&ss);1666vm_pagequeue_unlock(pq);16671668/*1669* Record the remaining shortage and the progress and rate it was made.1670*/1671atomic_add_int(&vmd->vmd_addl_shortage, addl_page_shortage);1672getmicrouptime(&end);1673timevalsub(&end, &start);1674atomic_add_int(&vmd->vmd_inactive_us,1675end.tv_sec * 1000000 + end.tv_usec);1676atomic_add_int(&vmd->vmd_inactive_freed,1677starting_page_shortage - page_shortage);1678}16791680/*1681* Dispatch a number of inactive threads according to load and collect the1682* results to present a coherent view of paging activity on this domain.1683*/1684static int1685vm_pageout_inactive_dispatch(struct vm_domain *vmd, int shortage)1686{1687u_int freed, pps, slop, threads, us;16881689vmd->vmd_inactive_shortage = shortage;1690slop = 0;16911692/*1693* If we have more work than we can do in a quarter of our interval, we1694* 
fire off multiple threads to process it.1695*/1696if ((threads = vmd->vmd_inactive_threads) > 1 &&1697vmd->vmd_helper_threads_enabled &&1698vmd->vmd_inactive_pps != 0 &&1699shortage > vmd->vmd_inactive_pps / VM_INACT_SCAN_RATE / 4) {1700vmd->vmd_inactive_shortage /= threads;1701slop = shortage % threads;1702vm_domain_pageout_lock(vmd);1703blockcount_acquire(&vmd->vmd_inactive_starting, threads - 1);1704blockcount_acquire(&vmd->vmd_inactive_running, threads - 1);1705wakeup(&vmd->vmd_inactive_shortage);1706vm_domain_pageout_unlock(vmd);1707}17081709/* Run the local thread scan. */1710vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage + slop);17111712/*1713* Block until helper threads report results and then accumulate1714* totals.1715*/1716blockcount_wait(&vmd->vmd_inactive_running, NULL, "vmpoid", PVM);1717freed = atomic_readandclear_int(&vmd->vmd_inactive_freed);1718VM_CNT_ADD(v_dfree, freed);17191720/*1721* Calculate the per-thread paging rate with an exponential decay of1722* prior results. Careful to avoid integer rounding errors with large1723* us values.1724*/1725us = max(atomic_readandclear_int(&vmd->vmd_inactive_us), 1);1726if (us > 1000000)1727/* Keep rounding to tenths */1728pps = (freed * 10) / ((us * 10) / 1000000);1729else1730pps = (1000000 / us) * freed;1731vmd->vmd_inactive_pps = (vmd->vmd_inactive_pps / 2) + (pps / 2);17321733return (shortage - freed);1734}17351736/*1737* Attempt to reclaim the requested number of pages from the inactive queue.1738* Returns true if the shortage was addressed.1739*/1740static int1741vm_pageout_inactive(struct vm_domain *vmd, int shortage, int *addl_shortage)1742{1743struct vm_pagequeue *pq;1744u_int addl_page_shortage, deficit, page_shortage;1745u_int starting_page_shortage;17461747/*1748* vmd_pageout_deficit counts the number of pages requested in1749* allocations that failed because of a free page shortage. We assume1750* that the allocations will be reattempted and thus include the deficit1751* in our scan target.1752*/1753deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);1754starting_page_shortage = shortage + deficit;17551756/*1757* Run the inactive scan on as many threads as is necessary.1758*/1759page_shortage = vm_pageout_inactive_dispatch(vmd, starting_page_shortage);1760addl_page_shortage = atomic_readandclear_int(&vmd->vmd_addl_shortage);17611762/*1763* Wake up the laundry thread so that it can perform any needed1764* laundering. If we didn't meet our target, we're in shortfall and1765* need to launder more aggressively. 
	 * If PQ_LAUNDRY is empty and no
	 * swap devices are configured, the laundry thread has no work to do, so
	 * don't bother waking it up.
	 *
	 * The laundry thread uses the number of inactive queue scans elapsed
	 * since the last laundering to determine whether to launder again, so
	 * keep count.
	 */
	if (starting_page_shortage > 0) {
		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
		vm_pagequeue_lock(pq);
		if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
		    (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
			if (page_shortage > 0) {
				vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
				VM_CNT_INC(v_pdshortfalls);
			} else if (vmd->vmd_laundry_request !=
			    VM_LAUNDRY_SHORTFALL)
				vmd->vmd_laundry_request =
				    VM_LAUNDRY_BACKGROUND;
			wakeup(&vmd->vmd_laundry_request);
		}
		vmd->vmd_clean_pages_freed +=
		    starting_page_shortage - page_shortage;
		vm_pagequeue_unlock(pq);
	}

	/*
	 * If the inactive queue scan fails repeatedly to meet its
	 * target, kill the largest process.
	 */
	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);

	/*
	 * See the description of addl_page_shortage above.
	 */
	*addl_shortage = addl_page_shortage + deficit;

	return (page_shortage <= 0);
}

static int vm_pageout_oom_vote;

/*
 * The pagedaemon threads randomly select one to perform the
 * OOM.  Trying to kill processes before all pagedaemons have
 * failed to reach the free page target is premature.
 */
static void
vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
    int starting_page_shortage)
{
	int old_vote;

	/*
	 * Do not trigger an OOM kill if the page daemon is able to make
	 * progress, or if there is no instantaneous shortage.  The latter case
	 * can happen if the PID controller is still reacting to an acute
	 * shortage, and the inactive queue is full of dirty pages.
	 */
	if (starting_page_shortage <= 0 || starting_page_shortage !=
	    page_shortage || !vm_paging_needed(vmd, vmd->vmd_free_count))
		vmd->vmd_oom_seq = 0;
	else
		vmd->vmd_oom_seq++;
	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
		if (vmd->vmd_oom) {
			vmd->vmd_oom = false;
			atomic_subtract_int(&vm_pageout_oom_vote, 1);
		}
		return;
	}

	/*
	 * Do not follow the call sequence until the OOM condition is
	 * cleared.
	 */
	vmd->vmd_oom_seq = 0;

	if (vmd->vmd_oom)
		return;

	vmd->vmd_oom = true;
	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
	if (old_vote != vm_ndomains - 1)
		return;

	/*
	 * The current pagedaemon thread is the last in the quorum to
	 * start OOM.  Initiate the selection and signaling of the
	 * victim.
	 */
	vm_pageout_oom(VM_OOM_MEM);

	/*
	 * After one round of OOM terror, recall our vote.  On the
	 * next pass, the current pagedaemon would vote again if the low
	 * memory condition is still there, due to vmd_oom being
	 * false.
	 */
	vmd->vmd_oom = false;
	atomic_subtract_int(&vm_pageout_oom_vote, 1);
}

/*
 * The OOM killer is the page daemon's action of last resort when
 * memory allocation requests have been stalled for a prolonged period
 * of time because it cannot reclaim memory.
This function computes1873* the approximate number of physical pages that could be reclaimed if1874* the specified address space is destroyed.1875*1876* Private, anonymous memory owned by the address space is the1877* principal resource that we expect to recover after an OOM kill.1878* Since the physical pages mapped by the address space's COW entries1879* are typically shared pages, they are unlikely to be released and so1880* they are not counted.1881*1882* To get to the point where the page daemon runs the OOM killer, its1883* efforts to write-back vnode-backed pages may have stalled. This1884* could be caused by a memory allocation deadlock in the write path1885* that might be resolved by an OOM kill. Therefore, physical pages1886* belonging to vnode-backed objects are counted, because they might1887* be freed without being written out first if the address space holds1888* the last reference to an unlinked vnode.1889*1890* Similarly, physical pages belonging to OBJT_PHYS objects are1891* counted because the address space might hold the last reference to1892* the object.1893*/1894static long1895vm_pageout_oom_pagecount(struct vmspace *vmspace)1896{1897vm_map_t map;1898vm_map_entry_t entry;1899vm_object_t obj;1900long res;19011902map = &vmspace->vm_map;1903KASSERT(!vm_map_is_system(map), ("system map"));1904sx_assert(&map->lock, SA_LOCKED);1905res = 0;1906VM_MAP_ENTRY_FOREACH(entry, map) {1907if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)1908continue;1909obj = entry->object.vm_object;1910if (obj == NULL)1911continue;1912if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&1913obj->ref_count != 1)1914continue;1915if (obj->type == OBJT_PHYS || obj->type == OBJT_VNODE ||1916(obj->flags & OBJ_SWAP) != 0)1917res += obj->resident_page_count;1918}1919return (res);1920}19211922static int vm_oom_ratelim_last;1923static int vm_oom_pf_secs = 10;1924SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,1925"");1926static struct mtx vm_oom_ratelim_mtx;19271928void1929vm_pageout_oom(int shortage)1930{1931const char *reason;1932struct proc *p, *bigproc;1933vm_offset_t size, bigsize;1934struct thread *td;1935struct vmspace *vm;1936int now;1937bool breakout;19381939/*1940* For OOM requests originating from vm_fault(), there is a high1941* chance that a single large process faults simultaneously in1942* several threads. Also, on an active system running many1943* processes of middle-size, like buildworld, all of them1944* could fault almost simultaneously as well.1945*1946* To avoid killing too many processes, rate-limit OOMs1947* initiated by vm_fault() time-outs on the waits for free1948* pages.1949*/1950mtx_lock(&vm_oom_ratelim_mtx);1951now = ticks;1952if (shortage == VM_OOM_MEM_PF &&1953(u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) {1954mtx_unlock(&vm_oom_ratelim_mtx);1955return;1956}1957vm_oom_ratelim_last = now;1958mtx_unlock(&vm_oom_ratelim_mtx);19591960/*1961* We keep the process bigproc locked once we find it to keep anyone1962* from messing with it; however, there is a possibility of1963* deadlock if process B is bigproc and one of its child processes1964* attempts to propagate a signal to B while we are waiting for A's1965* lock while walking this list. 

	/*
	 * We keep the process bigproc locked once we find it to keep anyone
	 * from messing with it; however, there is a possibility of
	 * deadlock if process B is bigproc and one of its child processes
	 * attempts to propagate a signal to B while we are waiting for the
	 * lock of another process while walking this list.  To avoid this,
	 * we don't block on the process lock but just skip a process if it
	 * is already locked.
	 */
	bigproc = NULL;
	bigsize = 0;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);

		/*
		 * If this is a system, protected, or killed process, skip it.
		 */
		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
		    p->p_pid == 1 || P_KILLED(p) ||
		    (p->p_pid < 48 && swap_pager_avail != 0)) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * If the process is in a non-running type state,
		 * don't touch it.  Check all the threads individually.
		 */
		breakout = false;
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			if (!TD_ON_RUNQ(td) &&
			    !TD_IS_RUNNING(td) &&
			    !TD_IS_SLEEPING(td) &&
			    !TD_IS_SUSPENDED(td)) {
				thread_unlock(td);
				breakout = true;
				break;
			}
			thread_unlock(td);
		}
		if (breakout) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * Get the process size.
		 */
		vm = vmspace_acquire_ref(p);
		if (vm == NULL) {
			PROC_UNLOCK(p);
			continue;
		}
		_PHOLD(p);
		PROC_UNLOCK(p);
		sx_sunlock(&allproc_lock);
		if (!vm_map_trylock_read(&vm->vm_map)) {
			vmspace_free(vm);
			sx_slock(&allproc_lock);
			PRELE(p);
			continue;
		}
		size = vmspace_swap_count(vm);
		if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
			size += vm_pageout_oom_pagecount(vm);
		vm_map_unlock_read(&vm->vm_map);
		vmspace_free(vm);
		sx_slock(&allproc_lock);

		/*
		 * If this process is bigger than the biggest one,
		 * remember it.
		 */
		if (size > bigsize) {
			if (bigproc != NULL)
				PRELE(bigproc);
			bigproc = p;
			bigsize = size;
		} else {
			PRELE(p);
		}
	}
	sx_sunlock(&allproc_lock);

	if (bigproc != NULL) {
		switch (shortage) {
		case VM_OOM_MEM:
			reason = "failed to reclaim memory";
			break;
		case VM_OOM_MEM_PF:
			reason = "a thread waited too long to allocate a page";
			break;
		case VM_OOM_SWAPZ:
			reason = "out of swap space";
			break;
		default:
			panic("unknown OOM reason %d", shortage);
		}
		if (vm_panic_on_oom != 0 && --vm_panic_on_oom == 0)
			panic("%s", reason);
		PROC_LOCK(bigproc);
		killproc(bigproc, reason);
		sched_nice(bigproc, PRIO_MIN);
		_PRELE(bigproc);
		PROC_UNLOCK(bigproc);
	}
}

/*
 * Signal a free page shortage to subsystems that have registered an event
 * handler.  Reclaim memory from UMA in the event of a severe shortage.
 * Return true if the free page count should be re-evaluated.
 */
static bool
vm_pageout_lowmem(void)
{
	static int lowmem_ticks = 0;
	int last;
	bool ret;

	ret = false;

	last = atomic_load_int(&lowmem_ticks);
	while ((u_int)(ticks - last) / hz >= lowmem_period) {
		if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0)
			continue;

		/*
		 * Decrease registered cache sizes.
		 */
		SDT_PROBE0(vm, , , vm__lowmem_scan);
		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);

		/*
		 * We do this explicitly after the caches have been
		 * drained above.
		 */
		uma_reclaim(UMA_RECLAIM_TRIM);
		ret = true;
		break;
	}
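
	/*
	 * Only one thread can win the atomic_fcmpset_int() race above per
	 * lowmem_period interval, so the vm_lowmem handlers and the UMA
	 * trim run at most once per period even when several pagedaemon
	 * threads call here concurrently.
	 */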

	/*
	 * Kick off an asynchronous reclaim of cached memory if one of the
	 * page daemons is failing to keep up with demand.  Use the "severe"
	 * threshold instead of "min" to ensure that we do not blow away the
	 * caches if a subset of the NUMA domains is depleted by kernel memory
	 * allocations; the domainset iterators automatically skip domains
	 * below the "min" threshold on the first pass.
	 *
	 * The UMA reclaim worker has its own rate-limiting mechanism, so
	 * don't worry about kicking it too often.
	 */
	if (vm_page_count_severe())
		uma_reclaim_wakeup();

	return (ret);
}

static void
vm_pageout_worker(void *arg)
{
	struct vm_domain *vmd;
	u_int ofree;
	int addl_shortage, domain, shortage;
	bool target_met;

	domain = (uintptr_t)arg;
	vmd = VM_DOMAIN(domain);
	shortage = 0;
	target_met = true;

	/*
	 * XXXKIB It could be useful to bind pageout daemon threads to
	 * the cores belonging to the domain from which vm_page_array
	 * is allocated.
	 */

	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
	vmd->vmd_last_active_scan = ticks;

	/*
	 * The pageout daemon worker is never done, so loop forever.
	 */
	while (TRUE) {
		vm_domain_pageout_lock(vmd);

		/*
		 * We need to clear wanted before we check the limits.  This
		 * prevents races with wakers who will check wanted after they
		 * reach the limit.
		 */
		atomic_store_int(&vmd->vmd_pageout_wanted, 0);

		/*
		 * Might the page daemon need to run again?
		 */
		if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
			/*
			 * Yes.  If the scan failed to produce enough free
			 * pages, sleep uninterruptibly for some time in the
			 * hope that the laundry thread will clean some pages.
			 */
			vm_domain_pageout_unlock(vmd);
			if (!target_met)
				pause("pwait", hz / VM_INACT_SCAN_RATE);
		} else {
			/*
			 * No, sleep until the next wakeup or until pages
			 * need to have their reference stats updated.
			 */
			if (mtx_sleep(&vmd->vmd_pageout_wanted,
			    vm_domain_pageout_lockptr(vmd), PDROP | PVM,
			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
				VM_CNT_INC(v_pdwakeups);
		}

		/* Prevent spurious wakeups by ensuring that wanted is set. */
		atomic_store_int(&vmd->vmd_pageout_wanted, 1);

		/*
		 * Use the controller to calculate how many pages to free in
		 * this interval, and scan the inactive queue.  If the lowmem
		 * handlers appear to have freed up some pages, subtract the
		 * difference from the inactive queue scan target.
		 */
		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
		if (shortage > 0) {
			ofree = vmd->vmd_free_count;
			if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
				shortage -= min(vmd->vmd_free_count - ofree,
				    (u_int)shortage);
			target_met = vm_pageout_inactive(vmd, shortage,
			    &addl_shortage);
		} else
			addl_shortage = 0;
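
		/*
		 * For example (numbers purely illustrative): if the PID
		 * controller asks for 1000 pages and the vm_lowmem handlers
		 * free 400 of them, only the remaining 600 are requested
		 * from the inactive queue scan; the min() above keeps the
		 * adjusted shortage from going negative.
		 */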

		/*
		 * Scan the active queue.  A positive value for shortage
		 * indicates that we must aggressively deactivate pages to
		 * avoid a shortfall.
		 */
		shortage = vm_pageout_active_target(vmd) + addl_shortage;
		vm_pageout_scan_active(vmd, shortage);
	}
}

/*
 * vm_pageout_helper runs additional pageout daemons in times of high paging
 * activity.
 */
static void
vm_pageout_helper(void *arg)
{
	struct vm_domain *vmd;
	int domain;

	domain = (uintptr_t)arg;
	vmd = VM_DOMAIN(domain);

	vm_domain_pageout_lock(vmd);
	for (;;) {
		msleep(&vmd->vmd_inactive_shortage,
		    vm_domain_pageout_lockptr(vmd), PVM, "psleep", 0);
		blockcount_release(&vmd->vmd_inactive_starting, 1);

		vm_domain_pageout_unlock(vmd);
		vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage);
		vm_domain_pageout_lock(vmd);

		/*
		 * Release the running count while the pageout lock is held to
		 * prevent wakeup races.
		 */
		blockcount_release(&vmd->vmd_inactive_running, 1);
	}
}

static int
get_pageout_threads_per_domain(const struct vm_domain *vmd)
{
	unsigned total_pageout_threads, eligible_cpus, domain_cpus;

	if (VM_DOMAIN_EMPTY(vmd->vmd_domain))
		return (0);

	/*
	 * Semi-arbitrarily constrain pagedaemon threads to less than half the
	 * total number of CPUs in the system as an upper limit.
	 */
	if (pageout_cpus_per_thread < 2)
		pageout_cpus_per_thread = 2;
	else if (pageout_cpus_per_thread > mp_ncpus)
		pageout_cpus_per_thread = mp_ncpus;

	total_pageout_threads = howmany(mp_ncpus, pageout_cpus_per_thread);
	domain_cpus = CPU_COUNT(&cpuset_domain[vmd->vmd_domain]);

	/* Pagedaemons are not run in empty domains. */
	eligible_cpus = mp_ncpus;
	for (unsigned i = 0; i < vm_ndomains; i++)
		if (VM_DOMAIN_EMPTY(i))
			eligible_cpus -= CPU_COUNT(&cpuset_domain[i]);

	/*
	 * Assign a portion of the total pageout threads to this domain
	 * corresponding to the fraction of pagedaemon-eligible CPUs in the
	 * domain.  In asymmetric NUMA systems, domains with more CPUs may be
	 * allocated more threads than domains with fewer CPUs.
	 */
	return (howmany(total_pageout_threads * domain_cpus, eligible_cpus));
}

/*
 * Initialize basic pageout daemon settings.  See the comment above the
 * definition of vm_domain for some explanation of how these thresholds are
 * used.
 */
static void
vm_pageout_init_domain(int domain)
{
	struct vm_domain *vmd;
	struct sysctl_oid *oid;

	vmd = VM_DOMAIN(domain);
	vmd->vmd_interrupt_free_min = 2;

	/*
	 * v_free_reserved needs to include enough for the largest
	 * swap pager structures plus enough for any pv_entry structs
	 * when paging.
	 */
	vmd->vmd_pageout_free_min = 2 * MAXBSIZE / PAGE_SIZE +
	    vmd->vmd_interrupt_free_min;
	vmd->vmd_free_reserved = vm_pageout_page_count +
	    vmd->vmd_pageout_free_min + vmd->vmd_page_count / 768;
	vmd->vmd_free_min = vmd->vmd_page_count / 200;
	vmd->vmd_free_severe = vmd->vmd_free_min / 2;
	vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
	vmd->vmd_free_min += vmd->vmd_free_reserved;
	vmd->vmd_free_severe += vmd->vmd_free_reserved;
	vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
	if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
		vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
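
	/*
	 * As a rough illustration (hypothetical domain of 4 GiB with 4 KiB
	 * pages, i.e. vmd_page_count == 1048576, and the usual defaults of
	 * vm_pageout_page_count == 32 and MAXBSIZE == 64 KiB): the
	 * computations above yield vmd_free_reserved ~= 1431 pages,
	 * vmd_free_min ~= 6673, vmd_free_severe ~= 4052, and
	 * vmd_free_target ~= 22399 pages, or roughly 87 MiB kept free.
	 */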

	/*
	 * Set the default wakeup threshold to be 10% below the paging
	 * target.  This keeps the steady state out of shortfall.
	 */
	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;

	/*
	 * Target amount of memory to move out of the laundry queue during a
	 * background laundering.  This is proportional to the amount of system
	 * memory.
	 */
	vmd->vmd_background_launder_target = (vmd->vmd_free_target -
	    vmd->vmd_free_min) / 10;

	/* Initialize the pageout daemon pid controller. */
	pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
	    vmd->vmd_free_target, PIDCTRL_BOUND,
	    PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
	    "pidctrl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));

	vmd->vmd_inactive_threads = get_pageout_threads_per_domain(vmd);
	SYSCTL_ADD_BOOL(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
	    "pageout_helper_threads_enabled", CTLFLAG_RWTUN,
	    &vmd->vmd_helper_threads_enabled, 0,
	    "Enable multi-threaded inactive queue scanning");
}

static void
vm_pageout_init(void)
{
	u_long freecount;
	int i;

	/*
	 * Initialize some paging parameters.
	 */
	freecount = 0;
	for (i = 0; i < vm_ndomains; i++) {
		struct vm_domain *vmd;

		vm_pageout_init_domain(i);
		vmd = VM_DOMAIN(i);
		vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
		vm_cnt.v_free_target += vmd->vmd_free_target;
		vm_cnt.v_free_min += vmd->vmd_free_min;
		vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
		vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
		vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
		vm_cnt.v_free_severe += vmd->vmd_free_severe;
		freecount += vmd->vmd_free_count;
	}

	/*
	 * Set the interval in seconds for the active scan.  We want to visit
	 * each page at least once every ten minutes.  This is to prevent
	 * worst-case paging behaviors with a stale active LRU.
	 */
	if (vm_pageout_update_period == 0)
		vm_pageout_update_period = 600;

	/*
	 * Set the maximum number of user-wired virtual pages.  Historically
	 * the main source of such pages was mlock(2) and mlockall(2).
	 * Hypervisors may also request user-wired memory.
	 */
	if (vm_page_max_user_wired == 0)
		vm_page_max_user_wired = 4 * freecount / 5;
}

/*
 * vm_pageout is the high level pageout daemon.
 */
static void
vm_pageout(void)
{
	struct proc *p;
	struct thread *td;
	int error, first, i, j, pageout_threads;

	p = curproc;
	td = curthread;

	mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
	swap_pager_swap_init();
	for (first = -1, i = 0; i < vm_ndomains; i++) {
		if (VM_DOMAIN_EMPTY(i)) {
			if (bootverbose)
				printf("domain %d empty; skipping pageout\n",
				    i);
			continue;
		}
		if (first == -1)
			first = i;
		else {
			error = kthread_add(vm_pageout_worker,
			    (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
			if (error != 0)
				panic("starting pageout for domain %d: %d\n",
				    i, error);
		}
		pageout_threads = VM_DOMAIN(i)->vmd_inactive_threads;
		for (j = 0; j < pageout_threads - 1; j++) {
			error = kthread_add(vm_pageout_helper,
			    (void *)(uintptr_t)i, p, NULL, 0, 0,
			    "dom%d helper%d", i, j);
			if (error != 0)
				panic("starting pageout helper %d for domain "
				    "%d: %d\n", j, i, error);
		}
		error = kthread_add(vm_pageout_laundry_worker,
		    (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
		if (error != 0)
			panic("starting laundry for domain %d: %d", i, error);
	}
	error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma");
	if (error != 0)
		panic("starting uma_reclaim helper, error %d\n", error);

	snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
	vm_pageout_worker((void *)(uintptr_t)first);
}

/*
 * Perform an advisory wakeup of the page daemon.
 */
void
pagedaemon_wakeup(int domain)
{
	struct vm_domain *vmd;

	vmd = VM_DOMAIN(domain);
	vm_domain_pageout_assert_unlocked(vmd);
	if (curproc == pageproc)
		return;

	if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
		vm_domain_pageout_lock(vmd);
		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
		wakeup(&vmd->vmd_pageout_wanted);
		vm_domain_pageout_unlock(vmd);
	}
}
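
/*
 * A note on the wakeup protocol above: vmd_pageout_wanted acts as a
 * saturating flag.  The first waker to transition it from zero takes the
 * pageout lock and issues the wakeup; concurrent wakers see a non-zero
 * value returned by the fetchadd and return without touching the lock.
 * The worker clears the flag while holding the pageout lock before
 * re-checking the free page count, which closes the race with late wakers.
 */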