/*-1* SPDX-License-Identifier: BSD-2-Clause2*3* Copyright (c) 2017, Jeffrey Roberson <[email protected]>4* All rights reserved.5*6* Redistribution and use in source and binary forms, with or without7* modification, are permitted provided that the following conditions8* are met:9* 1. Redistributions of source code must retain the above copyright10* notice unmodified, this list of conditions, and the following11* disclaimer.12* 2. Redistributions in binary form must reproduce the above copyright13* notice, this list of conditions and the following disclaimer in the14* documentation and/or other materials provided with the distribution.15*16* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR17* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES18* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.19* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,20* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT21* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,22* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY23* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT24* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF25* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.26*27*/2829#include <sys/cdefs.h>30#include "opt_vm.h"3132#include <sys/param.h>33#include <sys/systm.h>34#include <sys/bitset.h>35#include <sys/domainset.h>36#include <sys/proc.h>37#include <sys/lock.h>38#include <sys/mutex.h>39#include <sys/malloc.h>40#include <sys/rwlock.h>41#include <sys/pctrie.h>42#include <sys/vmmeter.h>4344#include <vm/vm.h>45#include <vm/vm_param.h>46#include <vm/vm_domainset.h>47#include <vm/vm_object.h>48#include <vm/vm_page.h>49#include <vm/vm_phys.h>5051#ifdef NUMA52/*53* Iterators are written such that the first nowait pass has as short a54* codepath as possible to eliminate bloat from the allocator. It is55* assumed that most allocations are successful.56*/5758static int vm_domainset_default_stride = 64;5960static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);616263/*64* Determine which policy is to be used for this allocation.65*/66static void67vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,68int *iter, struct vm_object *obj, vm_pindex_t pindex)69{7071di->di_domain = ds;72di->di_iter = iter;73di->di_policy = ds->ds_policy;74DOMAINSET_COPY(&ds->ds_mask, &di->di_valid_mask);75if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) {76#if VM_NRESERVLEVEL > 077if (vm_object_reserv(obj)) {78/*79* Color the pindex so we end up on the correct80* reservation boundary.81*/82pindex += obj->pg_color;83#if VM_NRESERVLEVEL > 184pindex >>= VM_LEVEL_1_ORDER;85#endif86pindex >>= VM_LEVEL_0_ORDER;87} else88#endif89pindex /= vm_domainset_default_stride;90/*91* Offset pindex so the first page of each object does92* not end up in domain 0.93*/94if (obj != NULL)95pindex += (((uintptr_t)obj) / sizeof(*obj));96di->di_offset = pindex;97}98}99100static void101vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)102{103104/* Grab the next domain in 'ds_order'. */105*domain = di->di_domain->ds_order[106(*di->di_iter)++ % di->di_domain->ds_cnt];107}108109static void110vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)111{112int d;113114d = di->di_offset % di->di_domain->ds_cnt;115*domain = di->di_domain->ds_order[d];116}117118/*119* Internal function determining the current phase's first candidate domain.120*121* Returns whether these is an eligible domain, which is returned through122* '*domain'. '*domain' can be modified even if there is no eligible domain.123*124* See herald comment of vm_domainset_iter_first() below about phases.125*/126static bool127vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)128{129switch (di->di_policy) {130case DOMAINSET_POLICY_FIRSTTOUCH:131*domain = PCPU_GET(domain);132break;133case DOMAINSET_POLICY_ROUNDROBIN:134vm_domainset_iter_rr(di, domain);135break;136case DOMAINSET_POLICY_PREFER:137*domain = di->di_domain->ds_prefer;138break;139case DOMAINSET_POLICY_INTERLEAVE:140vm_domainset_iter_interleave(di, domain);141break;142default:143panic("%s: Unknown policy %d", __func__, di->di_policy);144}145KASSERT(*domain < vm_ndomains,146("%s: Invalid domain %d", __func__, *domain));147148/*149* Has the policy's start domain already been visited?150*/151if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))152return (vm_domainset_iter_next(di, domain));153154DOMAINSET_CLR(*domain, &di->di_remain_mask);155156/* Does it have enough free pages (phase 1)? */157if (di->di_minskip && vm_page_count_min_domain(*domain)) {158/* Mark the domain as eligible for phase 2. */159DOMAINSET_SET(*domain, &di->di_min_mask);160return (vm_domainset_iter_next(di, domain));161}162163return (true);164}165166/*167* Resets an iterator to point to the first candidate domain.168*169* Returns whether there is an eligible domain to start with. '*domain' may be170* modified even if there is none.171*172* There must have been one call to vm_domainset_iter_init() before.173*174* This function must be called at least once before calling175* vm_domainset_iter_next(). Note that functions wrapping176* vm_domainset_iter_init() usually do that themselves.177*178* This function may be called again to reset the iterator to the policy's first179* candidate domain. After each reset, the iterator will visit the same domains180* as in the previous iteration minus those on which vm_domainset_iter_ignore()181* has been called. Note that the first candidate domain may change at each182* reset (at time of this writing, only on the DOMAINSET_POLICY_ROUNDROBIN183* policy).184*185* Domains which have a number of free pages over 'v_free_min' are always186* visited first (this is called the "phase 1" in comments, "phase 2" being the187* examination of the remaining domains; no domains are ever visited twice).188*/189static bool190vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)191{192/* Initialize the mask of domains to visit. */193DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);194/*195* No candidate domains for phase 2 at start. This will be filled by196* phase 1.197*/198DOMAINSET_ZERO(&di->di_min_mask);199/* Skip domains below 'v_free_min' on phase 1. */200di->di_minskip = true;201202return (vm_domainset_iter_phase_first(di, domain));203}204205/*206* Advances the iterator to the next candidate domain.207*208* Returns whether there was another domain to visit. '*domain' may be modified209* even if there is none.210*211* vm_domainset_iter_first() must have been called at least once before using212* this function (see its herald comment for more details on iterators).213*/214static bool215vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)216{217/* Loop while there remains domains to visit in the current phase. */218while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {219/* Grab the next domain in 'ds_order'. */220vm_domainset_iter_rr(di, domain);221KASSERT(*domain < vm_ndomains,222("%s: Invalid domain %d", __func__, *domain));223224if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {225DOMAINSET_CLR(*domain, &di->di_remain_mask);226if (!di->di_minskip || !vm_page_count_min_domain(*domain))227return (true);228DOMAINSET_SET(*domain, &di->di_min_mask);229}230}231232/*233* If phase 1 (skip low memory domains) is over, start phase 2 (consider234* low memory domains).235*/236if (di->di_minskip) {237di->di_minskip = false;238/* Browse domains that were under 'v_free_min'. */239DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);240return (vm_domainset_iter_phase_first(di, domain));241}242243return (false);244}245246int247vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,248vm_pindex_t pindex, int *domain, int *req)249{250struct domainset_ref *dr;251252di->di_flags = *req;253*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |254VM_ALLOC_NOWAIT;255256/*257* Object policy takes precedence over thread policy. The policies258* are immutable and unsynchronized. Updates can race but pointer259* loads are assumed to be atomic.260*/261if (obj != NULL && obj->domain.dr_policy != NULL) {262/*263* This write lock protects non-atomic increments of the264* iterator index in vm_domainset_iter_rr().265*/266VM_OBJECT_ASSERT_WLOCKED(obj);267dr = &obj->domain;268} else269dr = &curthread->td_domain;270271vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);272/*273* XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was274* passed?275*/276return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);277}278279int280vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,281int *domain, struct pctrie_iter *pages)282{283if (vm_domainset_iter_next(di, domain))284return (0);285286/* If we visited all domains and this was a NOWAIT we return error. */287if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)288return (ENOMEM);289290/* Wait for one of the domains to accumulate some free pages. */291if (obj != NULL) {292VM_OBJECT_WUNLOCK(obj);293if (pages != NULL)294pctrie_iter_reset(pages);295}296vm_wait_doms(&di->di_valid_mask, 0);297if (obj != NULL)298VM_OBJECT_WLOCK(obj);299if ((di->di_flags & VM_ALLOC_WAITFAIL) != 0)300return (ENOMEM);301302/* Restart the search. */303/* XXXOC: Shouldn't we just panic on 'false'? */304return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);305}306307static int308_vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,309int *flags)310{311di->di_flags = *flags;312*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;313/* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */314return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);315}316317int318vm_domainset_iter_policy_init(struct vm_domainset_iter *di,319struct domainset *ds, int *domain, int *flags)320{321322vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);323return (_vm_domainset_iter_policy_init(di, domain, flags));324}325326int327vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,328struct domainset_ref *dr, int *domain, int *flags)329{330331vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);332return (_vm_domainset_iter_policy_init(di, domain, flags));333}334335int336vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)337{338if (vm_domainset_iter_next(di, domain))339return (0);340341/* If we visited all domains and this was a NOWAIT we return error. */342if ((di->di_flags & M_WAITOK) == 0)343return (ENOMEM);344345/* Wait for one of the domains to accumulate some free pages. */346vm_wait_doms(&di->di_valid_mask, 0);347348/* Restart the search. */349/* XXXOC: Shouldn't we just panic on 'false'? */350return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);351}352353void354vm_domainset_iter_ignore(struct vm_domainset_iter *di, int domain)355{356KASSERT(DOMAINSET_ISSET(domain, &di->di_valid_mask),357("%s: domain %d not present in di_valid_mask for di %p",358__func__, domain, di));359DOMAINSET_CLR(domain, &di->di_valid_mask);360}361362#else /* !NUMA */363364int365vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,366int *domain, struct pctrie_iter *pages)367{368369return (EJUSTRETURN);370}371372int373vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,374vm_pindex_t pindex, int *domain, int *flags)375{376*domain = 0;377return (0);378}379380int381vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)382{383384return (EJUSTRETURN);385}386387int388vm_domainset_iter_policy_init(struct vm_domainset_iter *di,389struct domainset *ds, int *domain, int *flags)390{391*domain = 0;392return (0);393}394395int396vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,397struct domainset_ref *dr, int *domain, int *flags)398{399*domain = 0;400return (0);401}402403void404vm_domainset_iter_ignore(struct vm_domainset_iter *di __unused,405int domain __unused)406{407}408409#endif /* NUMA */410411412