/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017, Jeffrey Roberson <[email protected]>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/domainset.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/rwlock.h>
#include <sys/pctrie.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_domainset.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>

#ifdef NUMA
/*
 * Iterators are written such that the first nowait pass has as short a
 * codepath as possible to eliminate bloat from the allocator.  It is
 * assumed that most allocations are successful.
 */

/*
 * Page-index divisor used for DOMAINSET_POLICY_INTERLEAVE when the
 * reservation-based computation in vm_domainset_iter_init() does not apply.
 */
static int vm_domainset_default_stride = 64;

/*
 * Forward declaration: vm_domainset_iter_phase_first() and
 * vm_domainset_iter_next() call each other.
 */
static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);


/*
 * Determine which policy is to be used for this allocation.
 *
 * Records the domainset 'ds', the iterator counter 'iter' and the policy of
 * 'ds' in 'di', and copies the domainset's mask into 'di_valid_mask'.
 *
 * For DOMAINSET_POLICY_INTERLEAVE only, 'di_offset' is additionally computed
 * from 'pindex' (and from 'obj' when it is not NULL); other policies leave
 * 'di_offset' untouched.  'obj' may be NULL (see the policy_init wrappers
 * below).
 *
 * This function does not select a domain; vm_domainset_iter_first() must be
 * called afterwards.
 */
static void
vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
    int *iter, struct vm_object *obj, vm_pindex_t pindex)
{

	di->di_domain = ds;
	di->di_iter = iter;
	di->di_policy = ds->ds_policy;
	DOMAINSET_COPY(&ds->ds_mask, &di->di_valid_mask);
	if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) {
#if VM_NRESERVLEVEL > 0
		if (vm_object_reserv(obj)) {
			/*
			 * Color the pindex so we end up on the correct
			 * reservation boundary.
			 */
			pindex += obj->pg_color;
#if VM_NRESERVLEVEL > 1
			pindex >>= VM_LEVEL_1_ORDER;
#endif
			pindex >>= VM_LEVEL_0_ORDER;
		} else
#endif
			/* No reservation in use: fall back to a fixed stride. */
			pindex /= vm_domainset_default_stride;
		/*
		 * Offset pindex so the first page of each object does
		 * not end up in domain 0.
		 */
		if (obj != NULL)
			pindex += (((uintptr_t)obj) / sizeof(*obj));
		di->di_offset = pindex;
	}
}

/*
 * Round-robin step: returns through '*domain' the entry of 'ds_order' indexed
 * by the counter pointed to by 'di_iter' modulo 'ds_cnt', and post-increments
 * that counter.
 *
 * Every call advances the counter, so callers must not invoke this more often
 * than they intend to move forward.
 */
static void
vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
{

	/* Grab the next domain in 'ds_order'. */
	*domain = di->di_domain->ds_order[
	    (*di->di_iter)++ % di->di_domain->ds_cnt];
}

/*
 * Interleave step: maps 'di_offset' (computed by vm_domainset_iter_init())
 * onto a position in 'ds_order' and returns the corresponding domain through
 * '*domain'.
 *
 * The position is also stored into the counter pointed to by 'di_iter', so
 * that subsequent vm_domainset_iter_rr() calls start from that position.
 */
static void
vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
{
	int d;

	d = di->di_offset % di->di_domain->ds_cnt;
	*di->di_iter = d;
	*domain = di->di_domain->ds_order[d];
}

/*
 * Internal function determining the current phase's first candidate domain.
 *
 * Returns whether there is an eligible domain, which is returned through
 * '*domain'.  '*domain' can be modified even if there is no eligible domain.
 *
 * If the policy's start domain is not usable in the current phase, the search
 * is handed over to vm_domainset_iter_next().
 *
 * See herald comment of vm_domainset_iter_first() below about phases.
 */
static bool
vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)
{
	/* Ask the policy for its preferred starting domain. */
	switch (di->di_policy) {
	case DOMAINSET_POLICY_FIRSTTOUCH:
		*domain = PCPU_GET(domain);
		break;
	case DOMAINSET_POLICY_ROUNDROBIN:
		vm_domainset_iter_rr(di, domain);
		break;
	case DOMAINSET_POLICY_PREFER:
		*domain = di->di_domain->ds_prefer;
		break;
	case DOMAINSET_POLICY_INTERLEAVE:
		vm_domainset_iter_interleave(di, domain);
		break;
	default:
		panic("%s: Unknown policy %d", __func__, di->di_policy);
	}
	KASSERT(*domain < vm_ndomains,
	    ("%s: Invalid domain %d", __func__, *domain));

	/*
	 * Has the policy's start domain already been visited?
	 */
	if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))
		return (vm_domainset_iter_next(di, domain));

	/* Mark the start domain as visited for the current phase. */
	DOMAINSET_CLR(*domain, &di->di_remain_mask);

	/* Does it have enough free pages (phase 1)? */
	if (di->di_minskip && vm_page_count_min_domain(*domain)) {
		/* Mark the domain as eligible for phase 2. */
		DOMAINSET_SET(*domain, &di->di_min_mask);
		return (vm_domainset_iter_next(di, domain));
	}

	return (true);
}

/*
 * Resets an iterator to point to the first candidate domain.
 *
 * Returns whether there is an eligible domain to start with.  '*domain' may be
 * modified even if there is none.
 *
 * There must have been one call to vm_domainset_iter_init() before.
 *
 * This function must be called at least once before calling
 * vm_domainset_iter_next().  Note that functions wrapping
 * vm_domainset_iter_init() usually do that themselves.
 *
 * This function may be called again to reset the iterator to the policy's first
 * candidate domain.  After each reset, the iterator will visit the same domains
 * as in the previous iteration minus those on which vm_domainset_iter_ignore()
 * has been called.  Note that the first candidate domain may change at each
 * reset (at time of this writing, only on the DOMAINSET_POLICY_ROUNDROBIN
 * policy).
 *
 * Domains which have a number of free pages over 'v_free_min' are always
 * visited first (this is called the "phase 1" in comments, "phase 2" being the
 * examination of the remaining domains; no domains are ever visited twice).
 */
static bool
vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
{
	/* Initialize the mask of domains to visit. */
	DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
	/*
	 * No candidate domains for phase 2 at start.  This will be filled by
	 * phase 1.
	 */
	DOMAINSET_ZERO(&di->di_min_mask);
	/* Skip domains below 'v_free_min' on phase 1. */
	di->di_minskip = true;

	return (vm_domainset_iter_phase_first(di, domain));
}

/*
 * Advances the iterator to the next candidate domain.
 *
 * Returns whether there was another domain to visit.  '*domain' may be modified
 * even if there is none.
 *
 * When phase 1 runs out of domains, this function switches to phase 2 by
 * itself, restarting from the policy's first candidate domain among those that
 * were skipped during phase 1.
 *
 * vm_domainset_iter_first() must have been called at least once before using
 * this function (see its herald comment for more details on iterators).
 */
static bool
vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
{
	/* Loop while there remain domains to visit in the current phase. */
	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
		/* Grab the next domain in 'ds_order'. */
		vm_domainset_iter_rr(di, domain);
		KASSERT(*domain < vm_ndomains,
		    ("%s: Invalid domain %d", __func__, *domain));

		/* Only consider domains not yet visited in this phase. */
		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
			DOMAINSET_CLR(*domain, &di->di_remain_mask);
			if (!di->di_minskip || !vm_page_count_min_domain(*domain))
				return (true);
			/* Skipped in phase 1: remember it for phase 2. */
			DOMAINSET_SET(*domain, &di->di_min_mask);
		}
	}

	/*
	 * If phase 1 (skip low memory domains) is over, start phase 2 (consider
	 * low memory domains).
	 */
	if (di->di_minskip) {
		di->di_minskip = false;
		/* Browse domains that were under 'v_free_min'. */
		DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
		return (vm_domainset_iter_phase_first(di, domain));
	}

	return (false);
}

/*
 * Initializes 'di' for a page allocation and selects the first candidate
 * domain.
 *
 * The caller's request flags are saved in 'di_flags', and '*req' is rewritten
 * with VM_ALLOC_WAITOK and VM_ALLOC_WAITFAIL cleared and VM_ALLOC_NOWAIT set;
 * any waiting is done by vm_domainset_iter_page() based on the saved flags.
 *
 * The domainset reference of 'obj' is used if 'obj' is not NULL and has a
 * policy, else that of the current thread.
 *
 * Returns 0 with the candidate in '*domain', or ENOMEM if there is no eligible
 * domain.
 */
int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
    vm_pindex_t pindex, int *domain, int *req)
{
	struct domainset_ref *dr;

	di->di_flags = *req;
	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
	    VM_ALLOC_NOWAIT;

	/*
	 * Object policy takes precedence over thread policy.  The policies
	 * are immutable and unsynchronized.  Updates can race but pointer
	 * loads are assumed to be atomic.
	 */
	if (obj != NULL && obj->domain.dr_policy != NULL)
		dr = &obj->domain;
	else
		dr = &curthread->td_domain;

	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);
	/*
	 * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was
	 * passed?
	 */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

/*
 * Moves a page-allocation iterator to the next candidate domain.
 *
 * Returns 0 with the candidate in '*domain' if there is one.  Once all domains
 * have been visited:
 * - If neither VM_ALLOC_WAITOK nor VM_ALLOC_WAITFAIL was in the flags saved by
 *   vm_domainset_iter_page_init(), returns ENOMEM.
 * - Else, waits on the domains of 'di_valid_mask'.  Around the wait, 'obj' (if
 *   not NULL) is unlocked with VM_OBJECT_WUNLOCK() and relocked with
 *   VM_OBJECT_WLOCK(), and 'pages' (if not NULL) is reset while unlocking.
 *   After the wait, returns ENOMEM if VM_ALLOC_WAITFAIL was requested, else
 *   restarts the iteration and returns 0 or ENOMEM depending on whether an
 *   eligible domain was found.
 */
int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
    int *domain, struct pctrie_iter *pages)
{
	if (vm_domainset_iter_next(di, domain))
		return (0);

	/* If we visited all domains and this was a NOWAIT we return error. */
	if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
		return (ENOMEM);

	/* Wait for one of the domains to accumulate some free pages. */
	if (obj != NULL) {
		VM_OBJECT_WUNLOCK(obj);
		if (pages != NULL)
			pctrie_iter_reset(pages);
	}
	vm_wait_doms(&di->di_valid_mask, 0);
	if (obj != NULL)
		VM_OBJECT_WLOCK(obj);
	/* WAITFAIL callers get an error after the wait instead of a retry. */
	if ((di->di_flags & VM_ALLOC_WAITFAIL) != 0)
		return (ENOMEM);

	/* Restart the search. */
	/* XXXOC: Shouldn't we just panic on 'false'? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

/*
 * Common tail of the vm_domainset_iter_policy*_init() functions.
 *
 * Saves the caller's flags in 'di_flags', rewrites '*flags' with M_WAITOK
 * cleared and M_NOWAIT set, and selects the first candidate domain.
 * vm_domainset_iter_init() must already have been called on 'di'.
 *
 * Returns 0 with the candidate in '*domain', or ENOMEM if there is no eligible
 * domain.
 */
static int
_vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,
    int *flags)
{
	di->di_flags = *flags;
	*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
	/* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

/*
 * Initializes 'di' to iterate over the domainset 'ds', using the current
 * thread's 'td_domain.dr_iter' as the iterator counter and no VM object.
 *
 * See _vm_domainset_iter_policy_init() for the handling of 'flags' and the
 * return value.
 */
int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
    struct domainset *ds, int *domain, int *flags)
{

	vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);
	return (_vm_domainset_iter_policy_init(di, domain, flags));
}

/*
 * Initializes 'di' to iterate over the policy referenced by 'dr', using
 * 'dr->dr_iter' as the iterator counter and no VM object.
 *
 * See _vm_domainset_iter_policy_init() for the handling of 'flags' and the
 * return value.
 */
int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
    struct domainset_ref *dr, int *domain, int *flags)
{

	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);
	return (_vm_domainset_iter_policy_init(di, domain, flags));
}

/*
 * Moves an iterator set up by one of the vm_domainset_iter_policy*_init()
 * functions to the next candidate domain.
 *
 * Returns 0 with the candidate in '*domain' if there is one.  Once all domains
 * have been visited, returns ENOMEM unless M_WAITOK was in the saved flags, in
 * which case it waits on the domains of 'di_valid_mask', restarts the
 * iteration and returns 0 or ENOMEM depending on whether an eligible domain
 * was found.
 */
int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{
	if (vm_domainset_iter_next(di, domain))
		return (0);

	/* If we visited all domains and this was a NOWAIT we return error. */
	if ((di->di_flags & M_WAITOK) == 0)
		return (ENOMEM);

	/* Wait for one of the domains to accumulate some free pages. */
	vm_wait_doms(&di->di_valid_mask, 0);

	/* Restart the search. */
	/* XXXOC: Shouldn't we just panic on 'false'? */
	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
}

/*
 * Removes 'domain' from the set of domains the iterator may visit.
 *
 * Only 'di_valid_mask' is changed here; it is copied into the mask of domains
 * to visit by vm_domainset_iter_first(), so the removal applies from the next
 * reset of the iterator.  'domain' must currently be present in
 * 'di_valid_mask' (asserted).
 */
void
vm_domainset_iter_ignore(struct vm_domainset_iter *di, int domain)
{
	KASSERT(DOMAINSET_ISSET(domain, &di->di_valid_mask),
	    ("%s: domain %d not present in di_valid_mask for di %p",
	    __func__, domain, di));
	DOMAINSET_CLR(domain, &di->di_valid_mask);
}

#else /* !NUMA */

/*
 * Stubs used when NUMA is not configured: the *_init() functions always select
 * domain 0 and succeed, the iteration functions always return EJUSTRETURN, and
 * vm_domainset_iter_ignore() does nothing.  None of them touches 'di'.
 */

int
vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
    int *domain, struct pctrie_iter *pages)
{

	return (EJUSTRETURN);
}

int
vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
    vm_pindex_t pindex, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

int
vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
{

	return (EJUSTRETURN);
}

int
vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
    struct domainset *ds, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

int
vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
    struct domainset_ref *dr, int *domain, int *flags)
{
	*domain = 0;
	return (0);
}

void
vm_domainset_iter_ignore(struct vm_domainset_iter *di __unused,
    int domain __unused)
{
}

#endif /* NUMA */