// SPDX-License-Identifier: GPL-2.01/*2* kernel/cpuset.c3*4* Processor and Memory placement constraints for sets of tasks.5*6* Copyright (C) 2003 BULL SA.7* Copyright (C) 2004-2007 Silicon Graphics, Inc.8* Copyright (C) 2006 Google, Inc9*10* Portions derived from Patrick Mochel's sysfs code.11* sysfs is Copyright (c) 2001-3 Patrick Mochel12*13* 2003-10-10 Written by Simon Derr.14* 2003-10-22 Updates by Stephen Hemminger.15* 2004 May-July Rework by Paul Jackson.16* 2006 Rework by Paul Menage to use generic cgroups17* 2008 Rework of the scheduler domains and CPU hotplug handling18* by Max Krasnyansky19*/20#include "cpuset-internal.h"2122#include <linux/init.h>23#include <linux/interrupt.h>24#include <linux/kernel.h>25#include <linux/mempolicy.h>26#include <linux/mm.h>27#include <linux/memory.h>28#include <linux/export.h>29#include <linux/rcupdate.h>30#include <linux/sched.h>31#include <linux/sched/deadline.h>32#include <linux/sched/mm.h>33#include <linux/sched/task.h>34#include <linux/security.h>35#include <linux/oom.h>36#include <linux/sched/isolation.h>37#include <linux/wait.h>38#include <linux/workqueue.h>39#include <linux/task_work.h>4041DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);42DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);4344/*45* There could be abnormal cpuset configurations for cpu or memory46* node binding, add this key to provide a quick low-cost judgment47* of the situation.48*/49DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);5051static const char * const perr_strings[] = {52[PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",53[PERR_INVPARENT] = "Parent is an invalid partition root",54[PERR_NOTPART] = "Parent is not a partition root",55[PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",56[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",57[PERR_HOTPLUG] = "No cpu available due to hotplug",58[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",59[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",60[PERR_ACCESS] = "Enable partition not permitted",61[PERR_REMOTE] = "Have remote partition underneath",62};6364/*65* For local partitions, update to subpartitions_cpus & isolated_cpus is done66* in update_parent_effective_cpumask(). For remote partitions, it is done in67* the remote_partition_*() and remote_cpus_update() helpers.68*/69/*70* Exclusive CPUs distributed out to local or remote sub-partitions of71* top_cpuset72*/73static cpumask_var_t subpartitions_cpus;7475/*76* Exclusive CPUs in isolated partitions77*/78static cpumask_var_t isolated_cpus;7980/*81* isolated_cpus updating flag (protected by cpuset_mutex)82* Set if isolated_cpus is going to be updated in the current83* cpuset_mutex crtical section.84*/85static bool isolated_cpus_updating;8687/*88* Housekeeping (HK_TYPE_DOMAIN) CPUs at boot89*/90static cpumask_var_t boot_hk_cpus;91static bool have_boot_isolcpus;9293/*94* A flag to force sched domain rebuild at the end of an operation.95* It can be set in96* - update_partition_sd_lb()97* - update_cpumasks_hier()98* - cpuset_update_flag()99* - cpuset_hotplug_update_tasks()100* - cpuset_handle_hotplug()101*102* Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.103*104* Note that update_relax_domain_level() in cpuset-v1.c can still call105* rebuild_sched_domains_locked() directly without using this flag.106*/107static bool force_sd_rebuild;108109/*110* Partition root states:111*112* 0 - member (not a partition root)113* 1 - partition root114* 2 - partition root without load balancing (isolated)115* -1 - invalid partition root116* -2 - invalid isolated partition root117*118* There are 2 types of partitions - local or remote. Local partitions are119* those whose parents are partition root themselves. Setting of120* cpuset.cpus.exclusive are optional in setting up local partitions.121* Remote partitions are those whose parents are not partition roots. Passing122* down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor123* nodes are mandatory in creating a remote partition.124*125* For simplicity, a local partition can be created under a local or remote126* partition but a remote partition cannot have any partition root in its127* ancestor chain except the cgroup root.128*/129#define PRS_MEMBER 0130#define PRS_ROOT 1131#define PRS_ISOLATED 2132#define PRS_INVALID_ROOT -1133#define PRS_INVALID_ISOLATED -2134135/*136* Temporary cpumasks for working with partitions that are passed among137* functions to avoid memory allocation in inner functions.138*/139struct tmpmasks {140cpumask_var_t addmask, delmask; /* For partition root */141cpumask_var_t new_cpus; /* For update_cpumasks_hier() */142};143144void inc_dl_tasks_cs(struct task_struct *p)145{146struct cpuset *cs = task_cs(p);147148cs->nr_deadline_tasks++;149}150151void dec_dl_tasks_cs(struct task_struct *p)152{153struct cpuset *cs = task_cs(p);154155cs->nr_deadline_tasks--;156}157158static inline bool is_partition_valid(const struct cpuset *cs)159{160return cs->partition_root_state > 0;161}162163static inline bool is_partition_invalid(const struct cpuset *cs)164{165return cs->partition_root_state < 0;166}167168static inline bool cs_is_member(const struct cpuset *cs)169{170return cs->partition_root_state == PRS_MEMBER;171}172173/*174* Callers should hold callback_lock to modify partition_root_state.175*/176static inline void make_partition_invalid(struct cpuset *cs)177{178if (cs->partition_root_state > 0)179cs->partition_root_state = -cs->partition_root_state;180}181182/*183* Send notification event of whenever partition_root_state changes.184*/185static inline void notify_partition_change(struct cpuset *cs, int old_prs)186{187if (old_prs == cs->partition_root_state)188return;189cgroup_file_notify(&cs->partition_file);190191/* Reset prs_err if not invalid */192if (is_partition_valid(cs))193WRITE_ONCE(cs->prs_err, PERR_NONE);194}195196/*197* The top_cpuset is always synchronized to cpu_active_mask and we should avoid198* using cpu_online_mask as much as possible. An active CPU is always an online199* CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ200* during hotplug operations. A CPU is marked active at the last stage of CPU201* bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code202* will be called to update the sched domains so that the scheduler can move203* a normal task to a newly active CPU or remove tasks away from a newly204* inactivated CPU. The online bit is set much earlier in the CPU bringup205* process and cleared much later in CPU teardown.206*207* If cpu_online_mask is used while a hotunplug operation is happening in208* parallel, we may leave an offline CPU in cpu_allowed or some other masks.209*/210static struct cpuset top_cpuset = {211.flags = BIT(CS_CPU_EXCLUSIVE) |212BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),213.partition_root_state = PRS_ROOT,214.relax_domain_level = -1,215.remote_partition = false,216};217218/*219* There are two global locks guarding cpuset structures - cpuset_mutex and220* callback_lock. The cpuset code uses only cpuset_mutex. Other kernel221* subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset222* structures. Note that cpuset_mutex needs to be a mutex as it is used in223* paths that rely on priority inheritance (e.g. scheduler - on RT) for224* correctness.225*226* A task must hold both locks to modify cpusets. If a task holds227* cpuset_mutex, it blocks others, ensuring that it is the only task able to228* also acquire callback_lock and be able to modify cpusets. It can perform229* various checks on the cpuset structure first, knowing nothing will change.230* It can also allocate memory while just holding cpuset_mutex. While it is231* performing these checks, various callback routines can briefly acquire232* callback_lock to query cpusets. Once it is ready to make the changes, it233* takes callback_lock, blocking everyone else.234*235* Calls to the kernel memory allocator can not be made while holding236* callback_lock, as that would risk double tripping on callback_lock237* from one of the callbacks into the cpuset code from within238* __alloc_pages().239*240* If a task is only holding callback_lock, then it has read-only241* access to cpusets.242*243* Now, the task_struct fields mems_allowed and mempolicy may be changed244* by other task, we use alloc_lock in the task_struct fields to protect245* them.246*247* The cpuset_common_seq_show() handlers only hold callback_lock across248* small pieces of code, such as when reading out possibly multi-word249* cpumasks and nodemasks.250*/251252static DEFINE_MUTEX(cpuset_mutex);253254/**255* cpuset_lock - Acquire the global cpuset mutex256*257* This locks the global cpuset mutex to prevent modifications to cpuset258* hierarchy and configurations. This helper is not enough to make modification.259*/260void cpuset_lock(void)261{262mutex_lock(&cpuset_mutex);263}264265void cpuset_unlock(void)266{267mutex_unlock(&cpuset_mutex);268}269270/**271* cpuset_full_lock - Acquire full protection for cpuset modification272*273* Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex274* to safely modify cpuset data.275*/276void cpuset_full_lock(void)277{278cpus_read_lock();279mutex_lock(&cpuset_mutex);280}281282void cpuset_full_unlock(void)283{284mutex_unlock(&cpuset_mutex);285cpus_read_unlock();286}287288static DEFINE_SPINLOCK(callback_lock);289290void cpuset_callback_lock_irq(void)291{292spin_lock_irq(&callback_lock);293}294295void cpuset_callback_unlock_irq(void)296{297spin_unlock_irq(&callback_lock);298}299300static struct workqueue_struct *cpuset_migrate_mm_wq;301302static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);303304static inline void check_insane_mems_config(nodemask_t *nodes)305{306if (!cpusets_insane_config() &&307movable_only_nodes(nodes)) {308static_branch_enable_cpuslocked(&cpusets_insane_config_key);309pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"310"Cpuset allocations might fail even with a lot of memory available.\n",311nodemask_pr_args(nodes));312}313}314315/*316* decrease cs->attach_in_progress.317* wake_up cpuset_attach_wq if cs->attach_in_progress==0.318*/319static inline void dec_attach_in_progress_locked(struct cpuset *cs)320{321lockdep_assert_held(&cpuset_mutex);322323cs->attach_in_progress--;324if (!cs->attach_in_progress)325wake_up(&cpuset_attach_wq);326}327328static inline void dec_attach_in_progress(struct cpuset *cs)329{330mutex_lock(&cpuset_mutex);331dec_attach_in_progress_locked(cs);332mutex_unlock(&cpuset_mutex);333}334335static inline bool cpuset_v2(void)336{337return !IS_ENABLED(CONFIG_CPUSETS_V1) ||338cgroup_subsys_on_dfl(cpuset_cgrp_subsys);339}340341/*342* Cgroup v2 behavior is used on the "cpus" and "mems" control files when343* on default hierarchy or when the cpuset_v2_mode flag is set by mounting344* the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.345* With v2 behavior, "cpus" and "mems" are always what the users have346* requested and won't be changed by hotplug events. Only the effective347* cpus or mems will be affected.348*/349static inline bool is_in_v2_mode(void)350{351return cpuset_v2() ||352(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);353}354355static inline bool cpuset_is_populated(struct cpuset *cs)356{357lockdep_assert_held(&cpuset_mutex);358359/* Cpusets in the process of attaching should be considered as populated */360return cgroup_is_populated(cs->css.cgroup) ||361cs->attach_in_progress;362}363364/**365* partition_is_populated - check if partition has tasks366* @cs: partition root to be checked367* @excluded_child: a child cpuset to be excluded in task checking368* Return: true if there are tasks, false otherwise369*370* @cs should be a valid partition root or going to become a partition root.371* @excluded_child should be non-NULL when this cpuset is going to become a372* partition itself.373*374* Note that a remote partition is not allowed underneath a valid local375* or remote partition. So if a non-partition root child is populated,376* the whole partition is considered populated.377*/378static inline bool partition_is_populated(struct cpuset *cs,379struct cpuset *excluded_child)380{381struct cpuset *cp;382struct cgroup_subsys_state *pos_css;383384/*385* We cannot call cs_is_populated(cs) directly, as386* nr_populated_domain_children may include populated387* csets from descendants that are partitions.388*/389if (cs->css.cgroup->nr_populated_csets ||390cs->attach_in_progress)391return true;392393rcu_read_lock();394cpuset_for_each_descendant_pre(cp, pos_css, cs) {395if (cp == cs || cp == excluded_child)396continue;397398if (is_partition_valid(cp)) {399pos_css = css_rightmost_descendant(pos_css);400continue;401}402403if (cpuset_is_populated(cp)) {404rcu_read_unlock();405return true;406}407}408rcu_read_unlock();409return false;410}411412/*413* Return in pmask the portion of a task's cpusets's cpus_allowed that414* are online and are capable of running the task. If none are found,415* walk up the cpuset hierarchy until we find one that does have some416* appropriate cpus.417*418* One way or another, we guarantee to return some non-empty subset419* of cpu_active_mask.420*421* Call with callback_lock or cpuset_mutex held.422*/423static void guarantee_active_cpus(struct task_struct *tsk,424struct cpumask *pmask)425{426const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);427struct cpuset *cs;428429if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))430cpumask_copy(pmask, cpu_active_mask);431432rcu_read_lock();433cs = task_cs(tsk);434435while (!cpumask_intersects(cs->effective_cpus, pmask))436cs = parent_cs(cs);437438cpumask_and(pmask, pmask, cs->effective_cpus);439rcu_read_unlock();440}441442/*443* Return in *pmask the portion of a cpusets's mems_allowed that444* are online, with memory. If none are online with memory, walk445* up the cpuset hierarchy until we find one that does have some446* online mems. The top cpuset always has some mems online.447*448* One way or another, we guarantee to return some non-empty subset449* of node_states[N_MEMORY].450*451* Call with callback_lock or cpuset_mutex held.452*/453static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)454{455while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))456cs = parent_cs(cs);457nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);458}459460/**461* alloc_cpumasks - Allocate an array of cpumask variables462* @pmasks: Pointer to array of cpumask_var_t pointers463* @size: Number of cpumasks to allocate464* Return: 0 if successful, -ENOMEM otherwise.465*466* Allocates @size cpumasks and initializes them to empty. Returns 0 on467* success, -ENOMEM on allocation failure. On failure, any previously468* allocated cpumasks are freed.469*/470static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)471{472int i;473474for (i = 0; i < size; i++) {475if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {476while (--i >= 0)477free_cpumask_var(*pmasks[i]);478return -ENOMEM;479}480}481return 0;482}483484/**485* alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.486* @tmp: Pointer to tmpmasks structure to populate487* Return: 0 on success, -ENOMEM on allocation failure488*/489static inline int alloc_tmpmasks(struct tmpmasks *tmp)490{491/*492* Array of pointers to the three cpumask_var_t fields in tmpmasks.493* Note: Array size must match actual number of masks (3)494*/495cpumask_var_t *pmask[3] = {496&tmp->new_cpus,497&tmp->addmask,498&tmp->delmask499};500501return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));502}503504/**505* free_tmpmasks - free cpumasks in a tmpmasks structure506* @tmp: the tmpmasks structure pointer507*/508static inline void free_tmpmasks(struct tmpmasks *tmp)509{510if (!tmp)511return;512513free_cpumask_var(tmp->new_cpus);514free_cpumask_var(tmp->addmask);515free_cpumask_var(tmp->delmask);516}517518/**519* dup_or_alloc_cpuset - Duplicate or allocate a new cpuset520* @cs: Source cpuset to duplicate (NULL for a fresh allocation)521*522* Creates a new cpuset by either:523* 1. Duplicating an existing cpuset (if @cs is non-NULL), or524* 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)525*526* Return: Pointer to newly allocated cpuset on success, NULL on failure527*/528static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)529{530struct cpuset *trial;531532/* Allocate base structure */533trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :534kzalloc(sizeof(*cs), GFP_KERNEL);535if (!trial)536return NULL;537538/* Setup cpumask pointer array */539cpumask_var_t *pmask[4] = {540&trial->cpus_allowed,541&trial->effective_cpus,542&trial->effective_xcpus,543&trial->exclusive_cpus544};545546if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {547kfree(trial);548return NULL;549}550551/* Copy masks if duplicating */552if (cs) {553cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);554cpumask_copy(trial->effective_cpus, cs->effective_cpus);555cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);556cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);557}558559return trial;560}561562/**563* free_cpuset - free the cpuset564* @cs: the cpuset to be freed565*/566static inline void free_cpuset(struct cpuset *cs)567{568free_cpumask_var(cs->cpus_allowed);569free_cpumask_var(cs->effective_cpus);570free_cpumask_var(cs->effective_xcpus);571free_cpumask_var(cs->exclusive_cpus);572kfree(cs);573}574575/* Return user specified exclusive CPUs */576static inline struct cpumask *user_xcpus(struct cpuset *cs)577{578return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed579: cs->exclusive_cpus;580}581582static inline bool xcpus_empty(struct cpuset *cs)583{584return cpumask_empty(cs->cpus_allowed) &&585cpumask_empty(cs->exclusive_cpus);586}587588/*589* cpusets_are_exclusive() - check if two cpusets are exclusive590*591* Return true if exclusive, false if not592*/593static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)594{595struct cpumask *xcpus1 = user_xcpus(cs1);596struct cpumask *xcpus2 = user_xcpus(cs2);597598if (cpumask_intersects(xcpus1, xcpus2))599return false;600return true;601}602603/**604* cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts605* @cs1: first cpuset to check606* @cs2: second cpuset to check607*608* Returns: true if CPU exclusivity conflict exists, false otherwise609*610* Conflict detection rules:611* 1. If either cpuset is CPU exclusive, they must be mutually exclusive612* 2. exclusive_cpus masks cannot intersect between cpusets613* 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs614*/615static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)616{617/* If either cpuset is exclusive, check if they are mutually exclusive */618if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))619return !cpusets_are_exclusive(cs1, cs2);620621/* Exclusive_cpus cannot intersect */622if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))623return true;624625/* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */626if (!cpumask_empty(cs1->cpus_allowed) &&627cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))628return true;629630if (!cpumask_empty(cs2->cpus_allowed) &&631cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))632return true;633634return false;635}636637static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)638{639if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))640return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);641return false;642}643644/*645* validate_change() - Used to validate that any proposed cpuset change646* follows the structural rules for cpusets.647*648* If we replaced the flag and mask values of the current cpuset649* (cur) with those values in the trial cpuset (trial), would650* our various subset and exclusive rules still be valid? Presumes651* cpuset_mutex held.652*653* 'cur' is the address of an actual, in-use cpuset. Operations654* such as list traversal that depend on the actual address of the655* cpuset in the list must use cur below, not trial.656*657* 'trial' is the address of bulk structure copy of cur, with658* perhaps one or more of the fields cpus_allowed, mems_allowed,659* or flags changed to new, trial values.660*661* Return 0 if valid, -errno if not.662*/663664static int validate_change(struct cpuset *cur, struct cpuset *trial)665{666struct cgroup_subsys_state *css;667struct cpuset *c, *par;668int ret = 0;669670rcu_read_lock();671672if (!is_in_v2_mode())673ret = cpuset1_validate_change(cur, trial);674if (ret)675goto out;676677/* Remaining checks don't apply to root cpuset */678if (cur == &top_cpuset)679goto out;680681par = parent_cs(cur);682683/*684* Cpusets with tasks - existing or newly being attached - can't685* be changed to have empty cpus_allowed or mems_allowed.686*/687ret = -ENOSPC;688if (cpuset_is_populated(cur)) {689if (!cpumask_empty(cur->cpus_allowed) &&690cpumask_empty(trial->cpus_allowed))691goto out;692if (!nodes_empty(cur->mems_allowed) &&693nodes_empty(trial->mems_allowed))694goto out;695}696697/*698* We can't shrink if we won't have enough room for SCHED_DEADLINE699* tasks. This check is not done when scheduling is disabled as the700* users should know what they are doing.701*702* For v1, effective_cpus == cpus_allowed & user_xcpus() returns703* cpus_allowed.704*705* For v2, is_cpu_exclusive() & is_sched_load_balance() are true only706* for non-isolated partition root. At this point, the target707* effective_cpus isn't computed yet. user_xcpus() is the best708* approximation.709*710* TBD: May need to precompute the real effective_cpus here in case711* incorrect scheduling of SCHED_DEADLINE tasks in a partition712* becomes an issue.713*/714ret = -EBUSY;715if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&716!cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))717goto out;718719/*720* If either I or some sibling (!= me) is exclusive, we can't721* overlap. exclusive_cpus cannot overlap with each other if set.722*/723ret = -EINVAL;724cpuset_for_each_child(c, css, par) {725if (c == cur)726continue;727if (cpus_excl_conflict(trial, c))728goto out;729if (mems_excl_conflict(trial, c))730goto out;731}732733ret = 0;734out:735rcu_read_unlock();736return ret;737}738739#ifdef CONFIG_SMP740/*741* Helper routine for generate_sched_domains().742* Do cpusets a, b have overlapping effective cpus_allowed masks?743*/744static int cpusets_overlap(struct cpuset *a, struct cpuset *b)745{746return cpumask_intersects(a->effective_cpus, b->effective_cpus);747}748749static void750update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)751{752if (dattr->relax_domain_level < c->relax_domain_level)753dattr->relax_domain_level = c->relax_domain_level;754return;755}756757static void update_domain_attr_tree(struct sched_domain_attr *dattr,758struct cpuset *root_cs)759{760struct cpuset *cp;761struct cgroup_subsys_state *pos_css;762763rcu_read_lock();764cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {765/* skip the whole subtree if @cp doesn't have any CPU */766if (cpumask_empty(cp->cpus_allowed)) {767pos_css = css_rightmost_descendant(pos_css);768continue;769}770771if (is_sched_load_balance(cp))772update_domain_attr(dattr, cp);773}774rcu_read_unlock();775}776777/* Must be called with cpuset_mutex held. */778static inline int nr_cpusets(void)779{780/* jump label reference count + the top-level cpuset */781return static_key_count(&cpusets_enabled_key.key) + 1;782}783784/*785* generate_sched_domains()786*787* This function builds a partial partition of the systems CPUs788* A 'partial partition' is a set of non-overlapping subsets whose789* union is a subset of that set.790* The output of this function needs to be passed to kernel/sched/core.c791* partition_sched_domains() routine, which will rebuild the scheduler's792* load balancing domains (sched domains) as specified by that partial793* partition.794*795* See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst796* for a background explanation of this.797*798* Does not return errors, on the theory that the callers of this799* routine would rather not worry about failures to rebuild sched800* domains when operating in the severe memory shortage situations801* that could cause allocation failures below.802*803* Must be called with cpuset_mutex held.804*805* The three key local variables below are:806* cp - cpuset pointer, used (together with pos_css) to perform a807* top-down scan of all cpusets. For our purposes, rebuilding808* the schedulers sched domains, we can ignore !is_sched_load_809* balance cpusets.810* csa - (for CpuSet Array) Array of pointers to all the cpusets811* that need to be load balanced, for convenient iterative812* access by the subsequent code that finds the best partition,813* i.e the set of domains (subsets) of CPUs such that the814* cpus_allowed of every cpuset marked is_sched_load_balance815* is a subset of one of these domains, while there are as816* many such domains as possible, each as small as possible.817* doms - Conversion of 'csa' to an array of cpumasks, for passing to818* the kernel/sched/core.c routine partition_sched_domains() in a819* convenient format, that can be easily compared to the prior820* value to determine what partition elements (sched domains)821* were changed (added or removed.)822*823* Finding the best partition (set of domains):824* The double nested loops below over i, j scan over the load825* balanced cpusets (using the array of cpuset pointers in csa[])826* looking for pairs of cpusets that have overlapping cpus_allowed827* and merging them using a union-find algorithm.828*829* The union of the cpus_allowed masks from the set of all cpusets830* having the same root then form the one element of the partition831* (one sched domain) to be passed to partition_sched_domains().832*833*/834static int generate_sched_domains(cpumask_var_t **domains,835struct sched_domain_attr **attributes)836{837struct cpuset *cp; /* top-down scan of cpusets */838struct cpuset **csa; /* array of all cpuset ptrs */839int csn; /* how many cpuset ptrs in csa so far */840int i, j; /* indices for partition finding loops */841cpumask_var_t *doms; /* resulting partition; i.e. sched domains */842struct sched_domain_attr *dattr; /* attributes for custom domains */843int ndoms = 0; /* number of sched domains in result */844int nslot; /* next empty doms[] struct cpumask slot */845struct cgroup_subsys_state *pos_css;846bool root_load_balance = is_sched_load_balance(&top_cpuset);847bool cgrpv2 = cpuset_v2();848int nslot_update;849850doms = NULL;851dattr = NULL;852csa = NULL;853854/* Special case for the 99% of systems with one, full, sched domain */855if (root_load_balance && cpumask_empty(subpartitions_cpus)) {856single_root_domain:857ndoms = 1;858doms = alloc_sched_domains(ndoms);859if (!doms)860goto done;861862dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);863if (dattr) {864*dattr = SD_ATTR_INIT;865update_domain_attr_tree(dattr, &top_cpuset);866}867cpumask_and(doms[0], top_cpuset.effective_cpus,868housekeeping_cpumask(HK_TYPE_DOMAIN));869870goto done;871}872873csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);874if (!csa)875goto done;876csn = 0;877878rcu_read_lock();879if (root_load_balance)880csa[csn++] = &top_cpuset;881cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {882if (cp == &top_cpuset)883continue;884885if (cgrpv2)886goto v2;887888/*889* v1:890* Continue traversing beyond @cp iff @cp has some CPUs and891* isn't load balancing. The former is obvious. The892* latter: All child cpusets contain a subset of the893* parent's cpus, so just skip them, and then we call894* update_domain_attr_tree() to calc relax_domain_level of895* the corresponding sched domain.896*/897if (!cpumask_empty(cp->cpus_allowed) &&898!(is_sched_load_balance(cp) &&899cpumask_intersects(cp->cpus_allowed,900housekeeping_cpumask(HK_TYPE_DOMAIN))))901continue;902903if (is_sched_load_balance(cp) &&904!cpumask_empty(cp->effective_cpus))905csa[csn++] = cp;906907/* skip @cp's subtree */908pos_css = css_rightmost_descendant(pos_css);909continue;910911v2:912/*913* Only valid partition roots that are not isolated and with914* non-empty effective_cpus will be saved into csn[].915*/916if ((cp->partition_root_state == PRS_ROOT) &&917!cpumask_empty(cp->effective_cpus))918csa[csn++] = cp;919920/*921* Skip @cp's subtree if not a partition root and has no922* exclusive CPUs to be granted to child cpusets.923*/924if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))925pos_css = css_rightmost_descendant(pos_css);926}927rcu_read_unlock();928929/*930* If there are only isolated partitions underneath the cgroup root,931* we can optimize out unneeded sched domains scanning.932*/933if (root_load_balance && (csn == 1))934goto single_root_domain;935936for (i = 0; i < csn; i++)937uf_node_init(&csa[i]->node);938939/* Merge overlapping cpusets */940for (i = 0; i < csn; i++) {941for (j = i + 1; j < csn; j++) {942if (cpusets_overlap(csa[i], csa[j])) {943/*944* Cgroup v2 shouldn't pass down overlapping945* partition root cpusets.946*/947WARN_ON_ONCE(cgrpv2);948uf_union(&csa[i]->node, &csa[j]->node);949}950}951}952953/* Count the total number of domains */954for (i = 0; i < csn; i++) {955if (uf_find(&csa[i]->node) == &csa[i]->node)956ndoms++;957}958959/*960* Now we know how many domains to create.961* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.962*/963doms = alloc_sched_domains(ndoms);964if (!doms)965goto done;966967/*968* The rest of the code, including the scheduler, can deal with969* dattr==NULL case. No need to abort if alloc fails.970*/971dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),972GFP_KERNEL);973974/*975* Cgroup v2 doesn't support domain attributes, just set all of them976* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a977* subset of HK_TYPE_DOMAIN housekeeping CPUs.978*/979if (cgrpv2) {980for (i = 0; i < ndoms; i++) {981/*982* The top cpuset may contain some boot time isolated983* CPUs that need to be excluded from the sched domain.984*/985if (csa[i] == &top_cpuset)986cpumask_and(doms[i], csa[i]->effective_cpus,987housekeeping_cpumask(HK_TYPE_DOMAIN));988else989cpumask_copy(doms[i], csa[i]->effective_cpus);990if (dattr)991dattr[i] = SD_ATTR_INIT;992}993goto done;994}995996for (nslot = 0, i = 0; i < csn; i++) {997nslot_update = 0;998for (j = i; j < csn; j++) {999if (uf_find(&csa[j]->node) == &csa[i]->node) {1000struct cpumask *dp = doms[nslot];10011002if (i == j) {1003nslot_update = 1;1004cpumask_clear(dp);1005if (dattr)1006*(dattr + nslot) = SD_ATTR_INIT;1007}1008cpumask_or(dp, dp, csa[j]->effective_cpus);1009cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));1010if (dattr)1011update_domain_attr_tree(dattr + nslot, csa[j]);1012}1013}1014if (nslot_update)1015nslot++;1016}1017BUG_ON(nslot != ndoms);10181019done:1020kfree(csa);10211022/*1023* Fallback to the default domain if kmalloc() failed.1024* See comments in partition_sched_domains().1025*/1026if (doms == NULL)1027ndoms = 1;10281029*domains = doms;1030*attributes = dattr;1031return ndoms;1032}10331034static void dl_update_tasks_root_domain(struct cpuset *cs)1035{1036struct css_task_iter it;1037struct task_struct *task;10381039if (cs->nr_deadline_tasks == 0)1040return;10411042css_task_iter_start(&cs->css, 0, &it);10431044while ((task = css_task_iter_next(&it)))1045dl_add_task_root_domain(task);10461047css_task_iter_end(&it);1048}10491050void dl_rebuild_rd_accounting(void)1051{1052struct cpuset *cs = NULL;1053struct cgroup_subsys_state *pos_css;1054int cpu;1055u64 cookie = ++dl_cookie;10561057lockdep_assert_held(&cpuset_mutex);1058lockdep_assert_cpus_held();1059lockdep_assert_held(&sched_domains_mutex);10601061rcu_read_lock();10621063for_each_possible_cpu(cpu) {1064if (dl_bw_visited(cpu, cookie))1065continue;10661067dl_clear_root_domain_cpu(cpu);1068}10691070cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {10711072if (cpumask_empty(cs->effective_cpus)) {1073pos_css = css_rightmost_descendant(pos_css);1074continue;1075}10761077css_get(&cs->css);10781079rcu_read_unlock();10801081dl_update_tasks_root_domain(cs);10821083rcu_read_lock();1084css_put(&cs->css);1085}1086rcu_read_unlock();1087}10881089/*1090* Rebuild scheduler domains.1091*1092* If the flag 'sched_load_balance' of any cpuset with non-empty1093* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset1094* which has that flag enabled, or if any cpuset with a non-empty1095* 'cpus' is removed, then call this routine to rebuild the1096* scheduler's dynamic sched domains.1097*1098* Call with cpuset_mutex held. Takes cpus_read_lock().1099*/1100void rebuild_sched_domains_locked(void)1101{1102struct cgroup_subsys_state *pos_css;1103struct sched_domain_attr *attr;1104cpumask_var_t *doms;1105struct cpuset *cs;1106int ndoms;11071108lockdep_assert_cpus_held();1109lockdep_assert_held(&cpuset_mutex);1110force_sd_rebuild = false;11111112/*1113* If we have raced with CPU hotplug, return early to avoid1114* passing doms with offlined cpu to partition_sched_domains().1115* Anyways, cpuset_handle_hotplug() will rebuild sched domains.1116*1117* With no CPUs in any subpartitions, top_cpuset's effective CPUs1118* should be the same as the active CPUs, so checking only top_cpuset1119* is enough to detect racing CPU offlines.1120*/1121if (cpumask_empty(subpartitions_cpus) &&1122!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))1123return;11241125/*1126* With subpartition CPUs, however, the effective CPUs of a partition1127* root should be only a subset of the active CPUs. Since a CPU in any1128* partition root could be offlined, all must be checked.1129*/1130if (!cpumask_empty(subpartitions_cpus)) {1131rcu_read_lock();1132cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {1133if (!is_partition_valid(cs)) {1134pos_css = css_rightmost_descendant(pos_css);1135continue;1136}1137if (!cpumask_subset(cs->effective_cpus,1138cpu_active_mask)) {1139rcu_read_unlock();1140return;1141}1142}1143rcu_read_unlock();1144}11451146/* Generate domain masks and attrs */1147ndoms = generate_sched_domains(&doms, &attr);11481149/* Have scheduler rebuild the domains */1150partition_sched_domains(ndoms, doms, attr);1151}1152#else /* !CONFIG_SMP */1153void rebuild_sched_domains_locked(void)1154{1155}1156#endif /* CONFIG_SMP */11571158static void rebuild_sched_domains_cpuslocked(void)1159{1160mutex_lock(&cpuset_mutex);1161rebuild_sched_domains_locked();1162mutex_unlock(&cpuset_mutex);1163}11641165void rebuild_sched_domains(void)1166{1167cpus_read_lock();1168rebuild_sched_domains_cpuslocked();1169cpus_read_unlock();1170}11711172void cpuset_reset_sched_domains(void)1173{1174mutex_lock(&cpuset_mutex);1175partition_sched_domains(1, NULL, NULL);1176mutex_unlock(&cpuset_mutex);1177}11781179/**1180* cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.1181* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed1182* @new_cpus: the temp variable for the new effective_cpus mask1183*1184* Iterate through each task of @cs updating its cpus_allowed to the1185* effective cpuset's. As this function is called with cpuset_mutex held,1186* cpuset membership stays stable.1187*1188* For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus1189* to make sure all offline CPUs are also included as hotplug code won't1190* update cpumasks for tasks in top_cpuset.1191*1192* As task_cpu_possible_mask() can be task dependent in arm64, we have to1193* do cpu masking per task instead of doing it once for all.1194*/1195void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)1196{1197struct css_task_iter it;1198struct task_struct *task;1199bool top_cs = cs == &top_cpuset;12001201css_task_iter_start(&cs->css, 0, &it);1202while ((task = css_task_iter_next(&it))) {1203const struct cpumask *possible_mask = task_cpu_possible_mask(task);12041205if (top_cs) {1206/*1207* PF_NO_SETAFFINITY tasks are ignored.1208* All per cpu kthreads should have PF_NO_SETAFFINITY1209* flag set, see kthread_set_per_cpu().1210*/1211if (task->flags & PF_NO_SETAFFINITY)1212continue;1213cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);1214} else {1215cpumask_and(new_cpus, possible_mask, cs->effective_cpus);1216}1217set_cpus_allowed_ptr(task, new_cpus);1218}1219css_task_iter_end(&it);1220}12211222/**1223* compute_effective_cpumask - Compute the effective cpumask of the cpuset1224* @new_cpus: the temp variable for the new effective_cpus mask1225* @cs: the cpuset the need to recompute the new effective_cpus mask1226* @parent: the parent cpuset1227*1228* The result is valid only if the given cpuset isn't a partition root.1229*/1230static void compute_effective_cpumask(struct cpumask *new_cpus,1231struct cpuset *cs, struct cpuset *parent)1232{1233cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);1234}12351236/*1237* Commands for update_parent_effective_cpumask1238*/1239enum partition_cmd {1240partcmd_enable, /* Enable partition root */1241partcmd_enablei, /* Enable isolated partition root */1242partcmd_disable, /* Disable partition root */1243partcmd_update, /* Update parent's effective_cpus */1244partcmd_invalidate, /* Make partition invalid */1245};12461247static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,1248struct tmpmasks *tmp);12491250/*1251* Update partition exclusive flag1252*1253* Return: 0 if successful, an error code otherwise1254*/1255static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs)1256{1257bool exclusive = (new_prs > PRS_MEMBER);12581259if (exclusive && !is_cpu_exclusive(cs)) {1260if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))1261return PERR_NOTEXCL;1262} else if (!exclusive && is_cpu_exclusive(cs)) {1263/* Turning off CS_CPU_EXCLUSIVE will not return error */1264cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);1265}1266return 0;1267}12681269/*1270* Update partition load balance flag and/or rebuild sched domain1271*1272* Changing load balance flag will automatically call1273* rebuild_sched_domains_locked().1274* This function is for cgroup v2 only.1275*/1276static void update_partition_sd_lb(struct cpuset *cs, int old_prs)1277{1278int new_prs = cs->partition_root_state;1279bool rebuild_domains = (new_prs > 0) || (old_prs > 0);1280bool new_lb;12811282/*1283* If cs is not a valid partition root, the load balance state1284* will follow its parent.1285*/1286if (new_prs > 0) {1287new_lb = (new_prs != PRS_ISOLATED);1288} else {1289new_lb = is_sched_load_balance(parent_cs(cs));1290}1291if (new_lb != !!is_sched_load_balance(cs)) {1292rebuild_domains = true;1293if (new_lb)1294set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);1295else1296clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);1297}12981299if (rebuild_domains)1300cpuset_force_rebuild();1301}13021303/*1304* tasks_nocpu_error - Return true if tasks will have no effective_cpus1305*/1306static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,1307struct cpumask *xcpus)1308{1309/*1310* A populated partition (cs or parent) can't have empty effective_cpus1311*/1312return (cpumask_subset(parent->effective_cpus, xcpus) &&1313partition_is_populated(parent, cs)) ||1314(!cpumask_intersects(xcpus, cpu_active_mask) &&1315partition_is_populated(cs, NULL));1316}13171318static void reset_partition_data(struct cpuset *cs)1319{1320struct cpuset *parent = parent_cs(cs);13211322if (!cpuset_v2())1323return;13241325lockdep_assert_held(&callback_lock);13261327if (cpumask_empty(cs->exclusive_cpus)) {1328cpumask_clear(cs->effective_xcpus);1329if (is_cpu_exclusive(cs))1330clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);1331}1332if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))1333cpumask_copy(cs->effective_cpus, parent->effective_cpus);1334}13351336/*1337* isolated_cpus_update - Update the isolated_cpus mask1338* @old_prs: old partition_root_state1339* @new_prs: new partition_root_state1340* @xcpus: exclusive CPUs with state change1341*/1342static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)1343{1344WARN_ON_ONCE(old_prs == new_prs);1345if (new_prs == PRS_ISOLATED)1346cpumask_or(isolated_cpus, isolated_cpus, xcpus);1347else1348cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);13491350isolated_cpus_updating = true;1351}13521353/*1354* partition_xcpus_add - Add new exclusive CPUs to partition1355* @new_prs: new partition_root_state1356* @parent: parent cpuset1357* @xcpus: exclusive CPUs to be added1358*1359* Remote partition if parent == NULL1360*/1361static void partition_xcpus_add(int new_prs, struct cpuset *parent,1362struct cpumask *xcpus)1363{1364WARN_ON_ONCE(new_prs < 0);1365lockdep_assert_held(&callback_lock);1366if (!parent)1367parent = &top_cpuset;136813691370if (parent == &top_cpuset)1371cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);13721373if (new_prs != parent->partition_root_state)1374isolated_cpus_update(parent->partition_root_state, new_prs,1375xcpus);13761377cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);1378}13791380/*1381* partition_xcpus_del - Remove exclusive CPUs from partition1382* @old_prs: old partition_root_state1383* @parent: parent cpuset1384* @xcpus: exclusive CPUs to be removed1385*1386* Remote partition if parent == NULL1387*/1388static void partition_xcpus_del(int old_prs, struct cpuset *parent,1389struct cpumask *xcpus)1390{1391WARN_ON_ONCE(old_prs < 0);1392lockdep_assert_held(&callback_lock);1393if (!parent)1394parent = &top_cpuset;13951396if (parent == &top_cpuset)1397cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);13981399if (old_prs != parent->partition_root_state)1400isolated_cpus_update(old_prs, parent->partition_root_state,1401xcpus);14021403cpumask_and(xcpus, xcpus, cpu_active_mask);1404cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);1405}14061407/*1408* isolated_cpus_can_update - check for isolated & nohz_full conflicts1409* @add_cpus: cpu mask for cpus that are going to be isolated1410* @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL1411* Return: false if there is conflict, true otherwise1412*1413* If nohz_full is enabled and we have isolated CPUs, their combination must1414* still leave housekeeping CPUs.1415*1416* TBD: Should consider merging this function into1417* prstate_housekeeping_conflict().1418*/1419static bool isolated_cpus_can_update(struct cpumask *add_cpus,1420struct cpumask *del_cpus)1421{1422cpumask_var_t full_hk_cpus;1423int res = true;14241425if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE))1426return true;14271428if (del_cpus && cpumask_weight_and(del_cpus,1429housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)))1430return true;14311432if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL))1433return false;14341435cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),1436housekeeping_cpumask(HK_TYPE_DOMAIN));1437cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus);1438cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask);1439if (!cpumask_weight_andnot(full_hk_cpus, add_cpus))1440res = false;14411442free_cpumask_var(full_hk_cpus);1443return res;1444}14451446/*1447* prstate_housekeeping_conflict - check for partition & housekeeping conflicts1448* @prstate: partition root state to be checked1449* @new_cpus: cpu mask1450* Return: true if there is conflict, false otherwise1451*1452* CPUs outside of boot_hk_cpus, if defined, can only be used in an1453* isolated partition.1454*/1455static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)1456{1457if (!have_boot_isolcpus)1458return false;14591460if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))1461return true;14621463return false;1464}14651466/*1467* update_isolation_cpumasks - Update external isolation related CPU masks1468*1469* The following external CPU masks will be updated if necessary:1470* - workqueue unbound cpumask1471*/1472static void update_isolation_cpumasks(void)1473{1474int ret;14751476if (!isolated_cpus_updating)1477return;14781479lockdep_assert_cpus_held();14801481ret = workqueue_unbound_exclude_cpumask(isolated_cpus);1482WARN_ON_ONCE(ret < 0);14831484ret = tmigr_isolated_exclude_cpumask(isolated_cpus);1485WARN_ON_ONCE(ret < 0);14861487isolated_cpus_updating = false;1488}14891490/**1491* cpuset_cpu_is_isolated - Check if the given CPU is isolated1492* @cpu: the CPU number to be checked1493* Return: true if CPU is used in an isolated partition, false otherwise1494*/1495bool cpuset_cpu_is_isolated(int cpu)1496{1497return cpumask_test_cpu(cpu, isolated_cpus);1498}1499EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);15001501/**1502* rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets1503* @parent: Parent cpuset containing all siblings1504* @cs: Current cpuset (will be skipped)1505* @excpus: exclusive effective CPU mask to modify1506*1507* This function ensures the given @excpus mask doesn't include any CPUs that1508* are exclusively allocated to sibling cpusets. It walks through all siblings1509* of @cs under @parent and removes their exclusive CPUs from @excpus.1510*/1511static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,1512struct cpumask *excpus)1513{1514struct cgroup_subsys_state *css;1515struct cpuset *sibling;1516int retval = 0;15171518if (cpumask_empty(excpus))1519return retval;15201521/*1522* Exclude exclusive CPUs from siblings1523*/1524rcu_read_lock();1525cpuset_for_each_child(sibling, css, parent) {1526if (sibling == cs)1527continue;15281529if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {1530cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);1531retval++;1532continue;1533}1534if (cpumask_intersects(excpus, sibling->effective_xcpus)) {1535cpumask_andnot(excpus, excpus, sibling->effective_xcpus);1536retval++;1537}1538}1539rcu_read_unlock();15401541return retval;1542}15431544/*1545* compute_excpus - compute effective exclusive CPUs1546* @cs: cpuset1547* @xcpus: effective exclusive CPUs value to be set1548* Return: 0 if there is no sibling conflict, > 0 otherwise1549*1550* If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets1551* and exclude their exclusive_cpus or effective_xcpus as well.1552*/1553static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)1554{1555struct cpuset *parent = parent_cs(cs);15561557cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);15581559if (!cpumask_empty(cs->exclusive_cpus))1560return 0;15611562return rm_siblings_excl_cpus(parent, cs, excpus);1563}15641565/*1566* compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset1567* @trialcs: The trial cpuset containing the proposed new configuration1568* @cs: The original cpuset that the trial configuration is based on1569* Return: 0 if successful with no sibling conflict, >0 if a conflict is found1570*1571* Computes the effective_xcpus for a trial configuration. @cs is provided to represent1572* the real cs.1573*/1574static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)1575{1576struct cpuset *parent = parent_cs(trialcs);1577struct cpumask *excpus = trialcs->effective_xcpus;15781579/* trialcs is member, cpuset.cpus has no impact to excpus */1580if (cs_is_member(cs))1581cpumask_and(excpus, trialcs->exclusive_cpus,1582parent->effective_xcpus);1583else1584cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);15851586return rm_siblings_excl_cpus(parent, cs, excpus);1587}15881589static inline bool is_remote_partition(struct cpuset *cs)1590{1591return cs->remote_partition;1592}15931594static inline bool is_local_partition(struct cpuset *cs)1595{1596return is_partition_valid(cs) && !is_remote_partition(cs);1597}15981599/*1600* remote_partition_enable - Enable current cpuset as a remote partition root1601* @cs: the cpuset to update1602* @new_prs: new partition_root_state1603* @tmp: temporary masks1604* Return: 0 if successful, errcode if error1605*1606* Enable the current cpuset to become a remote partition root taking CPUs1607* directly from the top cpuset. cpuset_mutex must be held by the caller.1608*/1609static int remote_partition_enable(struct cpuset *cs, int new_prs,1610struct tmpmasks *tmp)1611{1612/*1613* The user must have sysadmin privilege.1614*/1615if (!capable(CAP_SYS_ADMIN))1616return PERR_ACCESS;16171618/*1619* The requested exclusive_cpus must not be allocated to other1620* partitions and it can't use up all the root's effective_cpus.1621*1622* The effective_xcpus mask can contain offline CPUs, but there must1623* be at least one or more online CPUs present before it can be enabled.1624*1625* Note that creating a remote partition with any local partition root1626* above it or remote partition root underneath it is not allowed.1627*/1628compute_excpus(cs, tmp->new_cpus);1629WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));1630if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||1631cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))1632return PERR_INVCPUS;1633if (((new_prs == PRS_ISOLATED) &&1634!isolated_cpus_can_update(tmp->new_cpus, NULL)) ||1635prstate_housekeeping_conflict(new_prs, tmp->new_cpus))1636return PERR_HKEEPING;16371638spin_lock_irq(&callback_lock);1639partition_xcpus_add(new_prs, NULL, tmp->new_cpus);1640cs->remote_partition = true;1641cpumask_copy(cs->effective_xcpus, tmp->new_cpus);1642spin_unlock_irq(&callback_lock);1643update_isolation_cpumasks();1644cpuset_force_rebuild();1645cs->prs_err = 0;16461647/*1648* Propagate changes in top_cpuset's effective_cpus down the hierarchy.1649*/1650cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);1651update_sibling_cpumasks(&top_cpuset, NULL, tmp);1652return 0;1653}16541655/*1656* remote_partition_disable - Remove current cpuset from remote partition list1657* @cs: the cpuset to update1658* @tmp: temporary masks1659*1660* The effective_cpus is also updated.1661*1662* cpuset_mutex must be held by the caller.1663*/1664static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)1665{1666WARN_ON_ONCE(!is_remote_partition(cs));1667/*1668* When a CPU is offlined, top_cpuset may end up with no available CPUs,1669* which should clear subpartitions_cpus. We should not emit a warning for this1670* scenario: the hierarchy is updated from top to bottom, so subpartitions_cpus1671* may already be cleared when disabling the partition.1672*/1673WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus) &&1674!cpumask_empty(subpartitions_cpus));16751676spin_lock_irq(&callback_lock);1677cs->remote_partition = false;1678partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus);1679if (cs->prs_err)1680cs->partition_root_state = -cs->partition_root_state;1681else1682cs->partition_root_state = PRS_MEMBER;16831684/* effective_xcpus may need to be changed */1685compute_excpus(cs, cs->effective_xcpus);1686reset_partition_data(cs);1687spin_unlock_irq(&callback_lock);1688update_isolation_cpumasks();1689cpuset_force_rebuild();16901691/*1692* Propagate changes in top_cpuset's effective_cpus down the hierarchy.1693*/1694cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);1695update_sibling_cpumasks(&top_cpuset, NULL, tmp);1696}16971698/*1699* remote_cpus_update - cpus_exclusive change of remote partition1700* @cs: the cpuset to be updated1701* @xcpus: the new exclusive_cpus mask, if non-NULL1702* @excpus: the new effective_xcpus mask1703* @tmp: temporary masks1704*1705* top_cpuset and subpartitions_cpus will be updated or partition can be1706* invalidated.1707*/1708static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,1709struct cpumask *excpus, struct tmpmasks *tmp)1710{1711bool adding, deleting;1712int prs = cs->partition_root_state;17131714if (WARN_ON_ONCE(!is_remote_partition(cs)))1715return;17161717WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));17181719if (cpumask_empty(excpus)) {1720cs->prs_err = PERR_CPUSEMPTY;1721goto invalidate;1722}17231724adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);1725deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);17261727/*1728* Additions of remote CPUs is only allowed if those CPUs are1729* not allocated to other partitions and there are effective_cpus1730* left in the top cpuset.1731*/1732if (adding) {1733WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));1734if (!capable(CAP_SYS_ADMIN))1735cs->prs_err = PERR_ACCESS;1736else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||1737cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))1738cs->prs_err = PERR_NOCPUS;1739else if ((prs == PRS_ISOLATED) &&1740!isolated_cpus_can_update(tmp->addmask, tmp->delmask))1741cs->prs_err = PERR_HKEEPING;1742if (cs->prs_err)1743goto invalidate;1744}17451746spin_lock_irq(&callback_lock);1747if (adding)1748partition_xcpus_add(prs, NULL, tmp->addmask);1749if (deleting)1750partition_xcpus_del(prs, NULL, tmp->delmask);1751/*1752* Need to update effective_xcpus and exclusive_cpus now as1753* update_sibling_cpumasks() below may iterate back to the same cs.1754*/1755cpumask_copy(cs->effective_xcpus, excpus);1756if (xcpus)1757cpumask_copy(cs->exclusive_cpus, xcpus);1758spin_unlock_irq(&callback_lock);1759update_isolation_cpumasks();1760if (adding || deleting)1761cpuset_force_rebuild();17621763/*1764* Propagate changes in top_cpuset's effective_cpus down the hierarchy.1765*/1766cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);1767update_sibling_cpumasks(&top_cpuset, NULL, tmp);1768return;17691770invalidate:1771remote_partition_disable(cs, tmp);1772}17731774/**1775* update_parent_effective_cpumask - update effective_cpus mask of parent cpuset1776* @cs: The cpuset that requests change in partition root state1777* @cmd: Partition root state change command1778* @newmask: Optional new cpumask for partcmd_update1779* @tmp: Temporary addmask and delmask1780* Return: 0 or a partition root state error code1781*1782* For partcmd_enable*, the cpuset is being transformed from a non-partition1783* root to a partition root. The effective_xcpus (cpus_allowed if1784* effective_xcpus not set) mask of the given cpuset will be taken away from1785* parent's effective_cpus. The function will return 0 if all the CPUs listed1786* in effective_xcpus can be granted or an error code will be returned.1787*1788* For partcmd_disable, the cpuset is being transformed from a partition1789* root back to a non-partition root. Any CPUs in effective_xcpus will be1790* given back to parent's effective_cpus. 0 will always be returned.1791*1792* For partcmd_update, if the optional newmask is specified, the cpu list is1793* to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is1794* assumed to remain the same. The cpuset should either be a valid or invalid1795* partition root. The partition root state may change from valid to invalid1796* or vice versa. An error code will be returned if transitioning from1797* invalid to valid violates the exclusivity rule.1798*1799* For partcmd_invalidate, the current partition will be made invalid.1800*1801* The partcmd_enable* and partcmd_disable commands are used by1802* update_prstate(). An error code may be returned and the caller will check1803* for error.1804*1805* The partcmd_update command is used by update_cpumasks_hier() with newmask1806* NULL and update_cpumask() with newmask set. The partcmd_invalidate is used1807* by update_cpumask() with NULL newmask. In both cases, the callers won't1808* check for error and so partition_root_state and prs_err will be updated1809* directly.1810*/1811static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,1812struct cpumask *newmask,1813struct tmpmasks *tmp)1814{1815struct cpuset *parent = parent_cs(cs);1816int adding; /* Adding cpus to parent's effective_cpus */1817int deleting; /* Deleting cpus from parent's effective_cpus */1818int old_prs, new_prs;1819int part_error = PERR_NONE; /* Partition error? */1820struct cpumask *xcpus = user_xcpus(cs);1821int parent_prs = parent->partition_root_state;1822bool nocpu;18231824lockdep_assert_held(&cpuset_mutex);1825WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */18261827/*1828* new_prs will only be changed for the partcmd_update and1829* partcmd_invalidate commands.1830*/1831adding = deleting = false;1832old_prs = new_prs = cs->partition_root_state;18331834if (cmd == partcmd_invalidate) {1835if (is_partition_invalid(cs))1836return 0;18371838/*1839* Make the current partition invalid.1840*/1841if (is_partition_valid(parent))1842adding = cpumask_and(tmp->addmask,1843xcpus, parent->effective_xcpus);1844if (old_prs > 0)1845new_prs = -old_prs;18461847goto write_error;1848}18491850/*1851* The parent must be a partition root.1852* The new cpumask, if present, or the current cpus_allowed must1853* not be empty.1854*/1855if (!is_partition_valid(parent)) {1856return is_partition_invalid(parent)1857? PERR_INVPARENT : PERR_NOTPART;1858}1859if (!newmask && xcpus_empty(cs))1860return PERR_CPUSEMPTY;18611862nocpu = tasks_nocpu_error(parent, cs, xcpus);18631864if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {1865/*1866* Need to call compute_excpus() in case1867* exclusive_cpus not set. Sibling conflict should only happen1868* if exclusive_cpus isn't set.1869*/1870xcpus = tmp->delmask;1871if (compute_excpus(cs, xcpus))1872WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));1873new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;18741875/*1876* Enabling partition root is not allowed if its1877* effective_xcpus is empty.1878*/1879if (cpumask_empty(xcpus))1880return PERR_INVCPUS;18811882if (prstate_housekeeping_conflict(new_prs, xcpus))1883return PERR_HKEEPING;18841885if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) &&1886!isolated_cpus_can_update(xcpus, NULL))1887return PERR_HKEEPING;18881889if (tasks_nocpu_error(parent, cs, xcpus))1890return PERR_NOCPUS;18911892/*1893* This function will only be called when all the preliminary1894* checks have passed. At this point, the following condition1895* should hold.1896*1897* (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus1898*1899* Warn if it is not the case.1900*/1901cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);1902WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));19031904deleting = true;1905} else if (cmd == partcmd_disable) {1906/*1907* May need to add cpus back to parent's effective_cpus1908* (and maybe removed from subpartitions_cpus/isolated_cpus)1909* for valid partition root. xcpus may contain CPUs that1910* shouldn't be removed from the two global cpumasks.1911*/1912if (is_partition_valid(cs)) {1913cpumask_copy(tmp->addmask, cs->effective_xcpus);1914adding = true;1915}1916new_prs = PRS_MEMBER;1917} else if (newmask) {1918/*1919* Empty cpumask is not allowed1920*/1921if (cpumask_empty(newmask)) {1922part_error = PERR_CPUSEMPTY;1923goto write_error;1924}19251926/* Check newmask again, whether cpus are available for parent/cs */1927nocpu |= tasks_nocpu_error(parent, cs, newmask);19281929/*1930* partcmd_update with newmask:1931*1932* Compute add/delete mask to/from effective_cpus1933*1934* For valid partition:1935* addmask = exclusive_cpus & ~newmask1936* & parent->effective_xcpus1937* delmask = newmask & ~exclusive_cpus1938* & parent->effective_xcpus1939*1940* For invalid partition:1941* delmask = newmask & parent->effective_xcpus1942* The partition may become valid soon.1943*/1944if (is_partition_invalid(cs)) {1945adding = false;1946deleting = cpumask_and(tmp->delmask,1947newmask, parent->effective_xcpus);1948} else {1949cpumask_andnot(tmp->addmask, xcpus, newmask);1950adding = cpumask_and(tmp->addmask, tmp->addmask,1951parent->effective_xcpus);19521953cpumask_andnot(tmp->delmask, newmask, xcpus);1954deleting = cpumask_and(tmp->delmask, tmp->delmask,1955parent->effective_xcpus);1956}19571958/*1959* TBD: Invalidate a currently valid child root partition may1960* still break isolated_cpus_can_update() rule if parent is an1961* isolated partition.1962*/1963if (is_partition_valid(cs) && (old_prs != parent_prs)) {1964if ((parent_prs == PRS_ROOT) &&1965/* Adding to parent means removing isolated CPUs */1966!isolated_cpus_can_update(tmp->delmask, tmp->addmask))1967part_error = PERR_HKEEPING;1968if ((parent_prs == PRS_ISOLATED) &&1969/* Adding to parent means adding isolated CPUs */1970!isolated_cpus_can_update(tmp->addmask, tmp->delmask))1971part_error = PERR_HKEEPING;1972}19731974/*1975* The new CPUs to be removed from parent's effective CPUs1976* must be present.1977*/1978if (deleting) {1979cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);1980WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));1981}19821983/*1984* Make partition invalid if parent's effective_cpus could1985* become empty and there are tasks in the parent.1986*/1987if (nocpu && (!adding ||1988!cpumask_intersects(tmp->addmask, cpu_active_mask))) {1989part_error = PERR_NOCPUS;1990deleting = false;1991adding = cpumask_and(tmp->addmask,1992xcpus, parent->effective_xcpus);1993}1994} else {1995/*1996* partcmd_update w/o newmask1997*1998* delmask = effective_xcpus & parent->effective_cpus1999*2000* This can be called from:2001* 1) update_cpumasks_hier()2002* 2) cpuset_hotplug_update_tasks()2003*2004* Check to see if it can be transitioned from valid to2005* invalid partition or vice versa.2006*2007* A partition error happens when parent has tasks and all2008* its effective CPUs will have to be distributed out.2009*/2010if (nocpu) {2011part_error = PERR_NOCPUS;2012if (is_partition_valid(cs))2013adding = cpumask_and(tmp->addmask,2014xcpus, parent->effective_xcpus);2015} else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&2016cpumask_subset(xcpus, parent->effective_xcpus)) {2017struct cgroup_subsys_state *css;2018struct cpuset *child;2019bool exclusive = true;20202021/*2022* Convert invalid partition to valid has to2023* pass the cpu exclusivity test.2024*/2025rcu_read_lock();2026cpuset_for_each_child(child, css, parent) {2027if (child == cs)2028continue;2029if (!cpusets_are_exclusive(cs, child)) {2030exclusive = false;2031break;2032}2033}2034rcu_read_unlock();2035if (exclusive)2036deleting = cpumask_and(tmp->delmask,2037xcpus, parent->effective_cpus);2038else2039part_error = PERR_NOTEXCL;2040}2041}20422043write_error:2044if (part_error)2045WRITE_ONCE(cs->prs_err, part_error);20462047if (cmd == partcmd_update) {2048/*2049* Check for possible transition between valid and invalid2050* partition root.2051*/2052switch (cs->partition_root_state) {2053case PRS_ROOT:2054case PRS_ISOLATED:2055if (part_error)2056new_prs = -old_prs;2057break;2058case PRS_INVALID_ROOT:2059case PRS_INVALID_ISOLATED:2060if (!part_error)2061new_prs = -old_prs;2062break;2063}2064}20652066if (!adding && !deleting && (new_prs == old_prs))2067return 0;20682069/*2070* Transitioning between invalid to valid or vice versa may require2071* changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,2072* validate_change() has already been successfully called and2073* CPU lists in cs haven't been updated yet. So defer it to later.2074*/2075if ((old_prs != new_prs) && (cmd != partcmd_update)) {2076int err = update_partition_exclusive_flag(cs, new_prs);20772078if (err)2079return err;2080}20812082/*2083* Change the parent's effective_cpus & effective_xcpus (top cpuset2084* only).2085*2086* Newly added CPUs will be removed from effective_cpus and2087* newly deleted ones will be added back to effective_cpus.2088*/2089spin_lock_irq(&callback_lock);2090if (old_prs != new_prs)2091cs->partition_root_state = new_prs;20922093/*2094* Adding to parent's effective_cpus means deletion CPUs from cs2095* and vice versa.2096*/2097if (adding)2098partition_xcpus_del(old_prs, parent, tmp->addmask);2099if (deleting)2100partition_xcpus_add(new_prs, parent, tmp->delmask);21012102spin_unlock_irq(&callback_lock);2103update_isolation_cpumasks();21042105if ((old_prs != new_prs) && (cmd == partcmd_update))2106update_partition_exclusive_flag(cs, new_prs);21072108if (adding || deleting) {2109cpuset_update_tasks_cpumask(parent, tmp->addmask);2110update_sibling_cpumasks(parent, cs, tmp);2111}21122113/*2114* For partcmd_update without newmask, it is being called from2115* cpuset_handle_hotplug(). Update the load balance flag and2116* scheduling domain accordingly.2117*/2118if ((cmd == partcmd_update) && !newmask)2119update_partition_sd_lb(cs, old_prs);21202121notify_partition_change(cs, old_prs);2122return 0;2123}21242125/**2126* compute_partition_effective_cpumask - compute effective_cpus for partition2127* @cs: partition root cpuset2128* @new_ecpus: previously computed effective_cpus to be updated2129*2130* Compute the effective_cpus of a partition root by scanning effective_xcpus2131* of child partition roots and excluding their effective_xcpus.2132*2133* This has the side effect of invalidating valid child partition roots,2134* if necessary. Since it is called from either cpuset_hotplug_update_tasks()2135* or update_cpumasks_hier() where parent and children are modified2136* successively, we don't need to call update_parent_effective_cpumask()2137* and the child's effective_cpus will be updated in later iterations.2138*2139* Note that rcu_read_lock() is assumed to be held.2140*/2141static void compute_partition_effective_cpumask(struct cpuset *cs,2142struct cpumask *new_ecpus)2143{2144struct cgroup_subsys_state *css;2145struct cpuset *child;2146bool populated = partition_is_populated(cs, NULL);21472148/*2149* Check child partition roots to see if they should be2150* invalidated when2151* 1) child effective_xcpus not a subset of new2152* excluisve_cpus2153* 2) All the effective_cpus will be used up and cp2154* has tasks2155*/2156compute_excpus(cs, new_ecpus);2157cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);21582159rcu_read_lock();2160cpuset_for_each_child(child, css, cs) {2161if (!is_partition_valid(child))2162continue;21632164/*2165* There shouldn't be a remote partition underneath another2166* partition root.2167*/2168WARN_ON_ONCE(is_remote_partition(child));2169child->prs_err = 0;2170if (!cpumask_subset(child->effective_xcpus,2171cs->effective_xcpus))2172child->prs_err = PERR_INVCPUS;2173else if (populated &&2174cpumask_subset(new_ecpus, child->effective_xcpus))2175child->prs_err = PERR_NOCPUS;21762177if (child->prs_err) {2178int old_prs = child->partition_root_state;21792180/*2181* Invalidate child partition2182*/2183spin_lock_irq(&callback_lock);2184make_partition_invalid(child);2185spin_unlock_irq(&callback_lock);2186notify_partition_change(child, old_prs);2187continue;2188}2189cpumask_andnot(new_ecpus, new_ecpus,2190child->effective_xcpus);2191}2192rcu_read_unlock();2193}21942195/*2196* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree2197* @cs: the cpuset to consider2198* @tmp: temp variables for calculating effective_cpus & partition setup2199* @force: don't skip any descendant cpusets if set2200*2201* When configured cpumask is changed, the effective cpumasks of this cpuset2202* and all its descendants need to be updated.2203*2204* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.2205*2206* Called with cpuset_mutex held2207*/2208static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,2209bool force)2210{2211struct cpuset *cp;2212struct cgroup_subsys_state *pos_css;2213int old_prs, new_prs;22142215rcu_read_lock();2216cpuset_for_each_descendant_pre(cp, pos_css, cs) {2217struct cpuset *parent = parent_cs(cp);2218bool remote = is_remote_partition(cp);2219bool update_parent = false;22202221old_prs = new_prs = cp->partition_root_state;22222223/*2224* For child remote partition root (!= cs), we need to call2225* remote_cpus_update() if effective_xcpus will be changed.2226* Otherwise, we can skip the whole subtree.2227*2228* remote_cpus_update() will reuse tmp->new_cpus only after2229* its value is being processed.2230*/2231if (remote && (cp != cs)) {2232compute_excpus(cp, tmp->new_cpus);2233if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {2234pos_css = css_rightmost_descendant(pos_css);2235continue;2236}2237rcu_read_unlock();2238remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);2239rcu_read_lock();22402241/* Remote partition may be invalidated */2242new_prs = cp->partition_root_state;2243remote = (new_prs == old_prs);2244}22452246if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))2247compute_partition_effective_cpumask(cp, tmp->new_cpus);2248else2249compute_effective_cpumask(tmp->new_cpus, cp, parent);22502251if (remote)2252goto get_css; /* Ready to update cpuset data */22532254/*2255* A partition with no effective_cpus is allowed as long as2256* there is no task associated with it. Call2257* update_parent_effective_cpumask() to check it.2258*/2259if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {2260update_parent = true;2261goto update_parent_effective;2262}22632264/*2265* If it becomes empty, inherit the effective mask of the2266* parent, which is guaranteed to have some CPUs unless2267* it is a partition root that has explicitly distributed2268* out all its CPUs.2269*/2270if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))2271cpumask_copy(tmp->new_cpus, parent->effective_cpus);22722273/*2274* Skip the whole subtree if2275* 1) the cpumask remains the same,2276* 2) has no partition root state,2277* 3) force flag not set, and2278* 4) for v2 load balance state same as its parent.2279*/2280if (!cp->partition_root_state && !force &&2281cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&2282(!cpuset_v2() ||2283(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {2284pos_css = css_rightmost_descendant(pos_css);2285continue;2286}22872288update_parent_effective:2289/*2290* update_parent_effective_cpumask() should have been called2291* for cs already in update_cpumask(). We should also call2292* cpuset_update_tasks_cpumask() again for tasks in the parent2293* cpuset if the parent's effective_cpus changes.2294*/2295if ((cp != cs) && old_prs) {2296switch (parent->partition_root_state) {2297case PRS_ROOT:2298case PRS_ISOLATED:2299update_parent = true;2300break;23012302default:2303/*2304* When parent is not a partition root or is2305* invalid, child partition roots become2306* invalid too.2307*/2308if (is_partition_valid(cp))2309new_prs = -cp->partition_root_state;2310WRITE_ONCE(cp->prs_err,2311is_partition_invalid(parent)2312? PERR_INVPARENT : PERR_NOTPART);2313break;2314}2315}2316get_css:2317if (!css_tryget_online(&cp->css))2318continue;2319rcu_read_unlock();23202321if (update_parent) {2322update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);2323/*2324* The cpuset partition_root_state may become2325* invalid. Capture it.2326*/2327new_prs = cp->partition_root_state;2328}23292330spin_lock_irq(&callback_lock);2331cpumask_copy(cp->effective_cpus, tmp->new_cpus);2332cp->partition_root_state = new_prs;2333if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))2334compute_excpus(cp, cp->effective_xcpus);23352336/*2337* Make sure effective_xcpus is properly set for a valid2338* partition root.2339*/2340if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))2341cpumask_and(cp->effective_xcpus,2342cp->cpus_allowed, parent->effective_xcpus);2343else if (new_prs < 0)2344reset_partition_data(cp);2345spin_unlock_irq(&callback_lock);23462347notify_partition_change(cp, old_prs);23482349WARN_ON(!is_in_v2_mode() &&2350!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));23512352cpuset_update_tasks_cpumask(cp, cp->effective_cpus);23532354/*2355* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE2356* from parent if current cpuset isn't a valid partition root2357* and their load balance states differ.2358*/2359if (cpuset_v2() && !is_partition_valid(cp) &&2360(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {2361if (is_sched_load_balance(parent))2362set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);2363else2364clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);2365}23662367/*2368* On legacy hierarchy, if the effective cpumask of any non-2369* empty cpuset is changed, we need to rebuild sched domains.2370* On default hierarchy, the cpuset needs to be a partition2371* root as well.2372*/2373if (!cpumask_empty(cp->cpus_allowed) &&2374is_sched_load_balance(cp) &&2375(!cpuset_v2() || is_partition_valid(cp)))2376cpuset_force_rebuild();23772378rcu_read_lock();2379css_put(&cp->css);2380}2381rcu_read_unlock();2382}23832384/**2385* update_sibling_cpumasks - Update siblings cpumasks2386* @parent: Parent cpuset2387* @cs: Current cpuset2388* @tmp: Temp variables2389*/2390static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,2391struct tmpmasks *tmp)2392{2393struct cpuset *sibling;2394struct cgroup_subsys_state *pos_css;23952396lockdep_assert_held(&cpuset_mutex);23972398/*2399* Check all its siblings and call update_cpumasks_hier()2400* if their effective_cpus will need to be changed.2401*2402* It is possible a change in parent's effective_cpus2403* due to a change in a child partition's effective_xcpus will impact2404* its siblings even if they do not inherit parent's effective_cpus2405* directly.2406*2407* The update_cpumasks_hier() function may sleep. So we have to2408* release the RCU read lock before calling it.2409*/2410rcu_read_lock();2411cpuset_for_each_child(sibling, pos_css, parent) {2412if (sibling == cs)2413continue;2414if (!is_partition_valid(sibling)) {2415compute_effective_cpumask(tmp->new_cpus, sibling,2416parent);2417if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))2418continue;2419} else if (is_remote_partition(sibling)) {2420/*2421* Change in a sibling cpuset won't affect a remote2422* partition root.2423*/2424continue;2425}24262427if (!css_tryget_online(&sibling->css))2428continue;24292430rcu_read_unlock();2431update_cpumasks_hier(sibling, tmp, false);2432rcu_read_lock();2433css_put(&sibling->css);2434}2435rcu_read_unlock();2436}24372438static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)2439{2440int retval;24412442retval = cpulist_parse(buf, out_mask);2443if (retval < 0)2444return retval;2445if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))2446return -EINVAL;24472448return 0;2449}24502451/**2452* validate_partition - Validate a cpuset partition configuration2453* @cs: The cpuset to validate2454* @trialcs: The trial cpuset containing proposed configuration changes2455*2456* If any validation check fails, the appropriate error code is set in the2457* cpuset's prs_err field.2458*2459* Return: PRS error code (0 if valid, non-zero error code if invalid)2460*/2461static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)2462{2463struct cpuset *parent = parent_cs(cs);24642465if (cs_is_member(trialcs))2466return PERR_NONE;24672468if (cpumask_empty(trialcs->effective_xcpus))2469return PERR_INVCPUS;24702471if (prstate_housekeeping_conflict(trialcs->partition_root_state,2472trialcs->effective_xcpus))2473return PERR_HKEEPING;24742475if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))2476return PERR_NOCPUS;24772478return PERR_NONE;2479}24802481static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,2482struct tmpmasks *tmp)2483{2484int retval;2485struct cpuset *parent = parent_cs(cs);24862487retval = validate_change(cs, trialcs);24882489if ((retval == -EINVAL) && cpuset_v2()) {2490struct cgroup_subsys_state *css;2491struct cpuset *cp;24922493/*2494* The -EINVAL error code indicates that partition sibling2495* CPU exclusivity rule has been violated. We still allow2496* the cpumask change to proceed while invalidating the2497* partition. However, any conflicting sibling partitions2498* have to be marked as invalid too.2499*/2500trialcs->prs_err = PERR_NOTEXCL;2501rcu_read_lock();2502cpuset_for_each_child(cp, css, parent) {2503struct cpumask *xcpus = user_xcpus(trialcs);25042505if (is_partition_valid(cp) &&2506cpumask_intersects(xcpus, cp->effective_xcpus)) {2507rcu_read_unlock();2508update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);2509rcu_read_lock();2510}2511}2512rcu_read_unlock();2513retval = 0;2514}2515return retval;2516}25172518/**2519* partition_cpus_change - Handle partition state changes due to CPU mask updates2520* @cs: The target cpuset being modified2521* @trialcs: The trial cpuset containing proposed configuration changes2522* @tmp: Temporary masks for intermediate calculations2523*2524* This function handles partition state transitions triggered by CPU mask changes.2525* CPU modifications may cause a partition to be disabled or require state updates.2526*/2527static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,2528struct tmpmasks *tmp)2529{2530enum prs_errcode prs_err;25312532if (cs_is_member(cs))2533return;25342535prs_err = validate_partition(cs, trialcs);2536if (prs_err)2537trialcs->prs_err = cs->prs_err = prs_err;25382539if (is_remote_partition(cs)) {2540if (trialcs->prs_err)2541remote_partition_disable(cs, tmp);2542else2543remote_cpus_update(cs, trialcs->exclusive_cpus,2544trialcs->effective_xcpus, tmp);2545} else {2546if (trialcs->prs_err)2547update_parent_effective_cpumask(cs, partcmd_invalidate,2548NULL, tmp);2549else2550update_parent_effective_cpumask(cs, partcmd_update,2551trialcs->effective_xcpus, tmp);2552}2553}25542555/**2556* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it2557* @cs: the cpuset to consider2558* @trialcs: trial cpuset2559* @buf: buffer of cpu numbers written to this cpuset2560*/2561static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,2562const char *buf)2563{2564int retval;2565struct tmpmasks tmp;2566bool force = false;2567int old_prs = cs->partition_root_state;25682569retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);2570if (retval < 0)2571return retval;25722573/* Nothing to do if the cpus didn't change */2574if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))2575return 0;25762577if (alloc_tmpmasks(&tmp))2578return -ENOMEM;25792580compute_trialcs_excpus(trialcs, cs);2581trialcs->prs_err = PERR_NONE;25822583retval = cpus_allowed_validate_change(cs, trialcs, &tmp);2584if (retval < 0)2585goto out_free;25862587/*2588* Check all the descendants in update_cpumasks_hier() if2589* effective_xcpus is to be changed.2590*/2591force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);25922593partition_cpus_change(cs, trialcs, &tmp);25942595spin_lock_irq(&callback_lock);2596cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);2597cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);2598if ((old_prs > 0) && !is_partition_valid(cs))2599reset_partition_data(cs);2600spin_unlock_irq(&callback_lock);26012602/* effective_cpus/effective_xcpus will be updated here */2603update_cpumasks_hier(cs, &tmp, force);26042605/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */2606if (cs->partition_root_state)2607update_partition_sd_lb(cs, old_prs);2608out_free:2609free_tmpmasks(&tmp);2610return retval;2611}26122613/**2614* update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset2615* @cs: the cpuset to consider2616* @trialcs: trial cpuset2617* @buf: buffer of cpu numbers written to this cpuset2618*2619* The tasks' cpumask will be updated if cs is a valid partition root.2620*/2621static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,2622const char *buf)2623{2624int retval;2625struct tmpmasks tmp;2626bool force = false;2627int old_prs = cs->partition_root_state;26282629retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);2630if (retval < 0)2631return retval;26322633/* Nothing to do if the CPUs didn't change */2634if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))2635return 0;26362637/*2638* Reject the change if there is exclusive CPUs conflict with2639* the siblings.2640*/2641if (compute_trialcs_excpus(trialcs, cs))2642return -EINVAL;26432644/*2645* Check all the descendants in update_cpumasks_hier() if2646* effective_xcpus is to be changed.2647*/2648force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);26492650retval = validate_change(cs, trialcs);2651if (retval)2652return retval;26532654if (alloc_tmpmasks(&tmp))2655return -ENOMEM;26562657trialcs->prs_err = PERR_NONE;2658partition_cpus_change(cs, trialcs, &tmp);26592660spin_lock_irq(&callback_lock);2661cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);2662cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);2663if ((old_prs > 0) && !is_partition_valid(cs))2664reset_partition_data(cs);2665spin_unlock_irq(&callback_lock);26662667/*2668* Call update_cpumasks_hier() to update effective_cpus/effective_xcpus2669* of the subtree when it is a valid partition root or effective_xcpus2670* is updated.2671*/2672if (is_partition_valid(cs) || force)2673update_cpumasks_hier(cs, &tmp, force);26742675/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */2676if (cs->partition_root_state)2677update_partition_sd_lb(cs, old_prs);26782679free_tmpmasks(&tmp);2680return 0;2681}26822683/*2684* Migrate memory region from one set of nodes to another. This is2685* performed asynchronously as it can be called from process migration path2686* holding locks involved in process management. All mm migrations are2687* performed in the queued order and can be waited for by flushing2688* cpuset_migrate_mm_wq.2689*/26902691struct cpuset_migrate_mm_work {2692struct work_struct work;2693struct mm_struct *mm;2694nodemask_t from;2695nodemask_t to;2696};26972698static void cpuset_migrate_mm_workfn(struct work_struct *work)2699{2700struct cpuset_migrate_mm_work *mwork =2701container_of(work, struct cpuset_migrate_mm_work, work);27022703/* on a wq worker, no need to worry about %current's mems_allowed */2704do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);2705mmput(mwork->mm);2706kfree(mwork);2707}27082709static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,2710const nodemask_t *to)2711{2712struct cpuset_migrate_mm_work *mwork;27132714if (nodes_equal(*from, *to)) {2715mmput(mm);2716return;2717}27182719mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);2720if (mwork) {2721mwork->mm = mm;2722mwork->from = *from;2723mwork->to = *to;2724INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);2725queue_work(cpuset_migrate_mm_wq, &mwork->work);2726} else {2727mmput(mm);2728}2729}27302731static void flush_migrate_mm_task_workfn(struct callback_head *head)2732{2733flush_workqueue(cpuset_migrate_mm_wq);2734kfree(head);2735}27362737static void schedule_flush_migrate_mm(void)2738{2739struct callback_head *flush_cb;27402741flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);2742if (!flush_cb)2743return;27442745init_task_work(flush_cb, flush_migrate_mm_task_workfn);27462747if (task_work_add(current, flush_cb, TWA_RESUME))2748kfree(flush_cb);2749}27502751/*2752* cpuset_change_task_nodemask - change task's mems_allowed and mempolicy2753* @tsk: the task to change2754* @newmems: new nodes that the task will be set2755*2756* We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed2757* and rebind an eventual tasks' mempolicy. If the task is allocating in2758* parallel, it might temporarily see an empty intersection, which results in2759* a seqlock check and retry before OOM or allocation failure.2760*/2761static void cpuset_change_task_nodemask(struct task_struct *tsk,2762nodemask_t *newmems)2763{2764task_lock(tsk);27652766local_irq_disable();2767write_seqcount_begin(&tsk->mems_allowed_seq);27682769nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);2770mpol_rebind_task(tsk, newmems);2771tsk->mems_allowed = *newmems;27722773write_seqcount_end(&tsk->mems_allowed_seq);2774local_irq_enable();27752776task_unlock(tsk);2777}27782779static void *cpuset_being_rebound;27802781/**2782* cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.2783* @cs: the cpuset in which each task's mems_allowed mask needs to be changed2784*2785* Iterate through each task of @cs updating its mems_allowed to the2786* effective cpuset's. As this function is called with cpuset_mutex held,2787* cpuset membership stays stable.2788*/2789void cpuset_update_tasks_nodemask(struct cpuset *cs)2790{2791static nodemask_t newmems; /* protected by cpuset_mutex */2792struct css_task_iter it;2793struct task_struct *task;27942795cpuset_being_rebound = cs; /* causes mpol_dup() rebind */27962797guarantee_online_mems(cs, &newmems);27982799/*2800* The mpol_rebind_mm() call takes mmap_lock, which we couldn't2801* take while holding tasklist_lock. Forks can happen - the2802* mpol_dup() cpuset_being_rebound check will catch such forks,2803* and rebind their vma mempolicies too. Because we still hold2804* the global cpuset_mutex, we know that no other rebind effort2805* will be contending for the global variable cpuset_being_rebound.2806* It's ok if we rebind the same mm twice; mpol_rebind_mm()2807* is idempotent. Also migrate pages in each mm to new nodes.2808*/2809css_task_iter_start(&cs->css, 0, &it);2810while ((task = css_task_iter_next(&it))) {2811struct mm_struct *mm;2812bool migrate;28132814cpuset_change_task_nodemask(task, &newmems);28152816mm = get_task_mm(task);2817if (!mm)2818continue;28192820migrate = is_memory_migrate(cs);28212822mpol_rebind_mm(mm, &cs->mems_allowed);2823if (migrate)2824cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);2825else2826mmput(mm);2827}2828css_task_iter_end(&it);28292830/*2831* All the tasks' nodemasks have been updated, update2832* cs->old_mems_allowed.2833*/2834cs->old_mems_allowed = newmems;28352836/* We're done rebinding vmas to this cpuset's new mems_allowed. */2837cpuset_being_rebound = NULL;2838}28392840/*2841* update_nodemasks_hier - Update effective nodemasks and tasks in the subtree2842* @cs: the cpuset to consider2843* @new_mems: a temp variable for calculating new effective_mems2844*2845* When configured nodemask is changed, the effective nodemasks of this cpuset2846* and all its descendants need to be updated.2847*2848* On legacy hierarchy, effective_mems will be the same with mems_allowed.2849*2850* Called with cpuset_mutex held2851*/2852static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)2853{2854struct cpuset *cp;2855struct cgroup_subsys_state *pos_css;28562857rcu_read_lock();2858cpuset_for_each_descendant_pre(cp, pos_css, cs) {2859struct cpuset *parent = parent_cs(cp);28602861nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);28622863/*2864* If it becomes empty, inherit the effective mask of the2865* parent, which is guaranteed to have some MEMs.2866*/2867if (is_in_v2_mode() && nodes_empty(*new_mems))2868*new_mems = parent->effective_mems;28692870/* Skip the whole subtree if the nodemask remains the same. */2871if (nodes_equal(*new_mems, cp->effective_mems)) {2872pos_css = css_rightmost_descendant(pos_css);2873continue;2874}28752876if (!css_tryget_online(&cp->css))2877continue;2878rcu_read_unlock();28792880spin_lock_irq(&callback_lock);2881cp->effective_mems = *new_mems;2882spin_unlock_irq(&callback_lock);28832884WARN_ON(!is_in_v2_mode() &&2885!nodes_equal(cp->mems_allowed, cp->effective_mems));28862887cpuset_update_tasks_nodemask(cp);28882889rcu_read_lock();2890css_put(&cp->css);2891}2892rcu_read_unlock();2893}28942895/*2896* Handle user request to change the 'mems' memory placement2897* of a cpuset. Needs to validate the request, update the2898* cpusets mems_allowed, and for each task in the cpuset,2899* update mems_allowed and rebind task's mempolicy and any vma2900* mempolicies and if the cpuset is marked 'memory_migrate',2901* migrate the tasks pages to the new memory.2902*2903* Call with cpuset_mutex held. May take callback_lock during call.2904* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,2905* lock each such tasks mm->mmap_lock, scan its vma's and rebind2906* their mempolicies to the cpusets new mems_allowed.2907*/2908static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,2909const char *buf)2910{2911int retval;29122913/*2914* An empty mems_allowed is ok iff there are no tasks in the cpuset.2915* The validate_change() call ensures that cpusets with tasks have memory.2916*/2917retval = nodelist_parse(buf, trialcs->mems_allowed);2918if (retval < 0)2919return retval;29202921if (!nodes_subset(trialcs->mems_allowed,2922top_cpuset.mems_allowed))2923return -EINVAL;29242925/* No change? nothing to do */2926if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))2927return 0;29282929retval = validate_change(cs, trialcs);2930if (retval < 0)2931return retval;29322933check_insane_mems_config(&trialcs->mems_allowed);29342935spin_lock_irq(&callback_lock);2936cs->mems_allowed = trialcs->mems_allowed;2937spin_unlock_irq(&callback_lock);29382939/* use trialcs->mems_allowed as a temp variable */2940update_nodemasks_hier(cs, &trialcs->mems_allowed);2941return 0;2942}29432944bool current_cpuset_is_being_rebound(void)2945{2946bool ret;29472948rcu_read_lock();2949ret = task_cs(current) == cpuset_being_rebound;2950rcu_read_unlock();29512952return ret;2953}29542955/*2956* cpuset_update_flag - read a 0 or a 1 in a file and update associated flag2957* bit: the bit to update (see cpuset_flagbits_t)2958* cs: the cpuset to update2959* turning_on: whether the flag is being set or cleared2960*2961* Call with cpuset_mutex held.2962*/29632964int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,2965int turning_on)2966{2967struct cpuset *trialcs;2968int balance_flag_changed;2969int spread_flag_changed;2970int err;29712972trialcs = dup_or_alloc_cpuset(cs);2973if (!trialcs)2974return -ENOMEM;29752976if (turning_on)2977set_bit(bit, &trialcs->flags);2978else2979clear_bit(bit, &trialcs->flags);29802981err = validate_change(cs, trialcs);2982if (err < 0)2983goto out;29842985balance_flag_changed = (is_sched_load_balance(cs) !=2986is_sched_load_balance(trialcs));29872988spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))2989|| (is_spread_page(cs) != is_spread_page(trialcs)));29902991spin_lock_irq(&callback_lock);2992cs->flags = trialcs->flags;2993spin_unlock_irq(&callback_lock);29942995if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {2996if (cpuset_v2())2997cpuset_force_rebuild();2998else2999rebuild_sched_domains_locked();3000}30013002if (spread_flag_changed)3003cpuset1_update_tasks_flags(cs);3004out:3005free_cpuset(trialcs);3006return err;3007}30083009/**3010* update_prstate - update partition_root_state3011* @cs: the cpuset to update3012* @new_prs: new partition root state3013* Return: 0 if successful, != 0 if error3014*3015* Call with cpuset_mutex held.3016*/3017static int update_prstate(struct cpuset *cs, int new_prs)3018{3019int err = PERR_NONE, old_prs = cs->partition_root_state;3020struct cpuset *parent = parent_cs(cs);3021struct tmpmasks tmpmask;3022bool isolcpus_updated = false;30233024if (old_prs == new_prs)3025return 0;30263027/*3028* Treat a previously invalid partition root as if it is a "member".3029*/3030if (new_prs && is_partition_invalid(cs))3031old_prs = PRS_MEMBER;30323033if (alloc_tmpmasks(&tmpmask))3034return -ENOMEM;30353036err = update_partition_exclusive_flag(cs, new_prs);3037if (err)3038goto out;30393040if (!old_prs) {3041/*3042* cpus_allowed and exclusive_cpus cannot be both empty.3043*/3044if (xcpus_empty(cs)) {3045err = PERR_CPUSEMPTY;3046goto out;3047}30483049/*3050* We don't support the creation of a new local partition with3051* a remote partition underneath it. This unsupported3052* setting can happen only if parent is the top_cpuset because3053* a remote partition cannot be created underneath an existing3054* local or remote partition.3055*/3056if ((parent == &top_cpuset) &&3057cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {3058err = PERR_REMOTE;3059goto out;3060}30613062/*3063* If parent is valid partition, enable local partiion.3064* Otherwise, enable a remote partition.3065*/3066if (is_partition_valid(parent)) {3067enum partition_cmd cmd = (new_prs == PRS_ROOT)3068? partcmd_enable : partcmd_enablei;30693070err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);3071} else {3072err = remote_partition_enable(cs, new_prs, &tmpmask);3073}3074} else if (old_prs && new_prs) {3075/*3076* A change in load balance state only, no change in cpumasks.3077* Need to update isolated_cpus.3078*/3079if (((new_prs == PRS_ISOLATED) &&3080!isolated_cpus_can_update(cs->effective_xcpus, NULL)) ||3081prstate_housekeeping_conflict(new_prs, cs->effective_xcpus))3082err = PERR_HKEEPING;3083else3084isolcpus_updated = true;3085} else {3086/*3087* Switching back to member is always allowed even if it3088* disables child partitions.3089*/3090if (is_remote_partition(cs))3091remote_partition_disable(cs, &tmpmask);3092else3093update_parent_effective_cpumask(cs, partcmd_disable,3094NULL, &tmpmask);30953096/*3097* Invalidation of child partitions will be done in3098* update_cpumasks_hier().3099*/3100}3101out:3102/*3103* Make partition invalid & disable CS_CPU_EXCLUSIVE if an error3104* happens.3105*/3106if (err) {3107new_prs = -new_prs;3108update_partition_exclusive_flag(cs, new_prs);3109}31103111spin_lock_irq(&callback_lock);3112cs->partition_root_state = new_prs;3113WRITE_ONCE(cs->prs_err, err);3114if (!is_partition_valid(cs))3115reset_partition_data(cs);3116else if (isolcpus_updated)3117isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);3118spin_unlock_irq(&callback_lock);3119update_isolation_cpumasks();31203121/* Force update if switching back to member & update effective_xcpus */3122update_cpumasks_hier(cs, &tmpmask, !new_prs);31233124/* A newly created partition must have effective_xcpus set */3125WARN_ON_ONCE(!old_prs && (new_prs > 0)3126&& cpumask_empty(cs->effective_xcpus));31273128/* Update sched domains and load balance flag */3129update_partition_sd_lb(cs, old_prs);31303131notify_partition_change(cs, old_prs);3132if (force_sd_rebuild)3133rebuild_sched_domains_locked();3134free_tmpmasks(&tmpmask);3135return 0;3136}31373138static struct cpuset *cpuset_attach_old_cs;31393140/*3141* Check to see if a cpuset can accept a new task3142* For v1, cpus_allowed and mems_allowed can't be empty.3143* For v2, effective_cpus can't be empty.3144* Note that in v1, effective_cpus = cpus_allowed.3145*/3146static int cpuset_can_attach_check(struct cpuset *cs)3147{3148if (cpumask_empty(cs->effective_cpus) ||3149(!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))3150return -ENOSPC;3151return 0;3152}31533154static void reset_migrate_dl_data(struct cpuset *cs)3155{3156cs->nr_migrate_dl_tasks = 0;3157cs->sum_migrate_dl_bw = 0;3158}31593160/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */3161static int cpuset_can_attach(struct cgroup_taskset *tset)3162{3163struct cgroup_subsys_state *css;3164struct cpuset *cs, *oldcs;3165struct task_struct *task;3166bool cpus_updated, mems_updated;3167int ret;31683169/* used later by cpuset_attach() */3170cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));3171oldcs = cpuset_attach_old_cs;3172cs = css_cs(css);31733174mutex_lock(&cpuset_mutex);31753176/* Check to see if task is allowed in the cpuset */3177ret = cpuset_can_attach_check(cs);3178if (ret)3179goto out_unlock;31803181cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);3182mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);31833184cgroup_taskset_for_each(task, css, tset) {3185ret = task_can_attach(task);3186if (ret)3187goto out_unlock;31883189/*3190* Skip rights over task check in v2 when nothing changes,3191* migration permission derives from hierarchy ownership in3192* cgroup_procs_write_permission()).3193*/3194if (!cpuset_v2() || (cpus_updated || mems_updated)) {3195ret = security_task_setscheduler(task);3196if (ret)3197goto out_unlock;3198}31993200if (dl_task(task)) {3201cs->nr_migrate_dl_tasks++;3202cs->sum_migrate_dl_bw += task->dl.dl_bw;3203}3204}32053206if (!cs->nr_migrate_dl_tasks)3207goto out_success;32083209if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {3210int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);32113212if (unlikely(cpu >= nr_cpu_ids)) {3213reset_migrate_dl_data(cs);3214ret = -EINVAL;3215goto out_unlock;3216}32173218ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);3219if (ret) {3220reset_migrate_dl_data(cs);3221goto out_unlock;3222}3223}32243225out_success:3226/*3227* Mark attach is in progress. This makes validate_change() fail3228* changes which zero cpus/mems_allowed.3229*/3230cs->attach_in_progress++;3231out_unlock:3232mutex_unlock(&cpuset_mutex);3233return ret;3234}32353236static void cpuset_cancel_attach(struct cgroup_taskset *tset)3237{3238struct cgroup_subsys_state *css;3239struct cpuset *cs;32403241cgroup_taskset_first(tset, &css);3242cs = css_cs(css);32433244mutex_lock(&cpuset_mutex);3245dec_attach_in_progress_locked(cs);32463247if (cs->nr_migrate_dl_tasks) {3248int cpu = cpumask_any(cs->effective_cpus);32493250dl_bw_free(cpu, cs->sum_migrate_dl_bw);3251reset_migrate_dl_data(cs);3252}32533254mutex_unlock(&cpuset_mutex);3255}32563257/*3258* Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()3259* but we can't allocate it dynamically there. Define it global and3260* allocate from cpuset_init().3261*/3262static cpumask_var_t cpus_attach;3263static nodemask_t cpuset_attach_nodemask_to;32643265static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)3266{3267lockdep_assert_held(&cpuset_mutex);32683269if (cs != &top_cpuset)3270guarantee_active_cpus(task, cpus_attach);3271else3272cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),3273subpartitions_cpus);3274/*3275* can_attach beforehand should guarantee that this doesn't3276* fail. TODO: have a better way to handle failure here3277*/3278WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));32793280cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);3281cpuset1_update_task_spread_flags(cs, task);3282}32833284static void cpuset_attach(struct cgroup_taskset *tset)3285{3286struct task_struct *task;3287struct task_struct *leader;3288struct cgroup_subsys_state *css;3289struct cpuset *cs;3290struct cpuset *oldcs = cpuset_attach_old_cs;3291bool cpus_updated, mems_updated;3292bool queue_task_work = false;32933294cgroup_taskset_first(tset, &css);3295cs = css_cs(css);32963297lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */3298mutex_lock(&cpuset_mutex);3299cpus_updated = !cpumask_equal(cs->effective_cpus,3300oldcs->effective_cpus);3301mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);33023303/*3304* In the default hierarchy, enabling cpuset in the child cgroups3305* will trigger a number of cpuset_attach() calls with no change3306* in effective cpus and mems. In that case, we can optimize out3307* by skipping the task iteration and update.3308*/3309if (cpuset_v2() && !cpus_updated && !mems_updated) {3310cpuset_attach_nodemask_to = cs->effective_mems;3311goto out;3312}33133314guarantee_online_mems(cs, &cpuset_attach_nodemask_to);33153316cgroup_taskset_for_each(task, css, tset)3317cpuset_attach_task(cs, task);33183319/*3320* Change mm for all threadgroup leaders. This is expensive and may3321* sleep and should be moved outside migration path proper. Skip it3322* if there is no change in effective_mems and CS_MEMORY_MIGRATE is3323* not set.3324*/3325cpuset_attach_nodemask_to = cs->effective_mems;3326if (!is_memory_migrate(cs) && !mems_updated)3327goto out;33283329cgroup_taskset_for_each_leader(leader, css, tset) {3330struct mm_struct *mm = get_task_mm(leader);33313332if (mm) {3333mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);33343335/*3336* old_mems_allowed is the same with mems_allowed3337* here, except if this task is being moved3338* automatically due to hotplug. In that case3339* @mems_allowed has been updated and is empty, so3340* @old_mems_allowed is the right nodesets that we3341* migrate mm from.3342*/3343if (is_memory_migrate(cs)) {3344cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,3345&cpuset_attach_nodemask_to);3346queue_task_work = true;3347} else3348mmput(mm);3349}3350}33513352out:3353if (queue_task_work)3354schedule_flush_migrate_mm();3355cs->old_mems_allowed = cpuset_attach_nodemask_to;33563357if (cs->nr_migrate_dl_tasks) {3358cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;3359oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;3360reset_migrate_dl_data(cs);3361}33623363dec_attach_in_progress_locked(cs);33643365mutex_unlock(&cpuset_mutex);3366}33673368/*3369* Common handling for a write to a "cpus" or "mems" file.3370*/3371ssize_t cpuset_write_resmask(struct kernfs_open_file *of,3372char *buf, size_t nbytes, loff_t off)3373{3374struct cpuset *cs = css_cs(of_css(of));3375struct cpuset *trialcs;3376int retval = -ENODEV;33773378/* root is read-only */3379if (cs == &top_cpuset)3380return -EACCES;33813382buf = strstrip(buf);3383cpuset_full_lock();3384if (!is_cpuset_online(cs))3385goto out_unlock;33863387trialcs = dup_or_alloc_cpuset(cs);3388if (!trialcs) {3389retval = -ENOMEM;3390goto out_unlock;3391}33923393switch (of_cft(of)->private) {3394case FILE_CPULIST:3395retval = update_cpumask(cs, trialcs, buf);3396break;3397case FILE_EXCLUSIVE_CPULIST:3398retval = update_exclusive_cpumask(cs, trialcs, buf);3399break;3400case FILE_MEMLIST:3401retval = update_nodemask(cs, trialcs, buf);3402break;3403default:3404retval = -EINVAL;3405break;3406}34073408free_cpuset(trialcs);3409if (force_sd_rebuild)3410rebuild_sched_domains_locked();3411out_unlock:3412cpuset_full_unlock();3413if (of_cft(of)->private == FILE_MEMLIST)3414schedule_flush_migrate_mm();3415return retval ?: nbytes;3416}34173418/*3419* These ascii lists should be read in a single call, by using a user3420* buffer large enough to hold the entire map. If read in smaller3421* chunks, there is no guarantee of atomicity. Since the display format3422* used, list of ranges of sequential numbers, is variable length,3423* and since these maps can change value dynamically, one could read3424* gibberish by doing partial reads while a list was changing.3425*/3426int cpuset_common_seq_show(struct seq_file *sf, void *v)3427{3428struct cpuset *cs = css_cs(seq_css(sf));3429cpuset_filetype_t type = seq_cft(sf)->private;3430int ret = 0;34313432spin_lock_irq(&callback_lock);34333434switch (type) {3435case FILE_CPULIST:3436seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));3437break;3438case FILE_MEMLIST:3439seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));3440break;3441case FILE_EFFECTIVE_CPULIST:3442seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));3443break;3444case FILE_EFFECTIVE_MEMLIST:3445seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));3446break;3447case FILE_EXCLUSIVE_CPULIST:3448seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));3449break;3450case FILE_EFFECTIVE_XCPULIST:3451seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));3452break;3453case FILE_SUBPARTS_CPULIST:3454seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));3455break;3456case FILE_ISOLATED_CPULIST:3457seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));3458break;3459default:3460ret = -EINVAL;3461}34623463spin_unlock_irq(&callback_lock);3464return ret;3465}34663467static int cpuset_partition_show(struct seq_file *seq, void *v)3468{3469struct cpuset *cs = css_cs(seq_css(seq));3470const char *err, *type = NULL;34713472switch (cs->partition_root_state) {3473case PRS_ROOT:3474seq_puts(seq, "root\n");3475break;3476case PRS_ISOLATED:3477seq_puts(seq, "isolated\n");3478break;3479case PRS_MEMBER:3480seq_puts(seq, "member\n");3481break;3482case PRS_INVALID_ROOT:3483type = "root";3484fallthrough;3485case PRS_INVALID_ISOLATED:3486if (!type)3487type = "isolated";3488err = perr_strings[READ_ONCE(cs->prs_err)];3489if (err)3490seq_printf(seq, "%s invalid (%s)\n", type, err);3491else3492seq_printf(seq, "%s invalid\n", type);3493break;3494}3495return 0;3496}34973498static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,3499size_t nbytes, loff_t off)3500{3501struct cpuset *cs = css_cs(of_css(of));3502int val;3503int retval = -ENODEV;35043505buf = strstrip(buf);35063507if (!strcmp(buf, "root"))3508val = PRS_ROOT;3509else if (!strcmp(buf, "member"))3510val = PRS_MEMBER;3511else if (!strcmp(buf, "isolated"))3512val = PRS_ISOLATED;3513else3514return -EINVAL;35153516cpuset_full_lock();3517if (is_cpuset_online(cs))3518retval = update_prstate(cs, val);3519cpuset_full_unlock();3520return retval ?: nbytes;3521}35223523/*3524* This is currently a minimal set for the default hierarchy. It can be3525* expanded later on by migrating more features and control files from v1.3526*/3527static struct cftype dfl_files[] = {3528{3529.name = "cpus",3530.seq_show = cpuset_common_seq_show,3531.write = cpuset_write_resmask,3532.max_write_len = (100U + 6 * NR_CPUS),3533.private = FILE_CPULIST,3534.flags = CFTYPE_NOT_ON_ROOT,3535},35363537{3538.name = "mems",3539.seq_show = cpuset_common_seq_show,3540.write = cpuset_write_resmask,3541.max_write_len = (100U + 6 * MAX_NUMNODES),3542.private = FILE_MEMLIST,3543.flags = CFTYPE_NOT_ON_ROOT,3544},35453546{3547.name = "cpus.effective",3548.seq_show = cpuset_common_seq_show,3549.private = FILE_EFFECTIVE_CPULIST,3550},35513552{3553.name = "mems.effective",3554.seq_show = cpuset_common_seq_show,3555.private = FILE_EFFECTIVE_MEMLIST,3556},35573558{3559.name = "cpus.partition",3560.seq_show = cpuset_partition_show,3561.write = cpuset_partition_write,3562.private = FILE_PARTITION_ROOT,3563.flags = CFTYPE_NOT_ON_ROOT,3564.file_offset = offsetof(struct cpuset, partition_file),3565},35663567{3568.name = "cpus.exclusive",3569.seq_show = cpuset_common_seq_show,3570.write = cpuset_write_resmask,3571.max_write_len = (100U + 6 * NR_CPUS),3572.private = FILE_EXCLUSIVE_CPULIST,3573.flags = CFTYPE_NOT_ON_ROOT,3574},35753576{3577.name = "cpus.exclusive.effective",3578.seq_show = cpuset_common_seq_show,3579.private = FILE_EFFECTIVE_XCPULIST,3580.flags = CFTYPE_NOT_ON_ROOT,3581},35823583{3584.name = "cpus.subpartitions",3585.seq_show = cpuset_common_seq_show,3586.private = FILE_SUBPARTS_CPULIST,3587.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,3588},35893590{3591.name = "cpus.isolated",3592.seq_show = cpuset_common_seq_show,3593.private = FILE_ISOLATED_CPULIST,3594.flags = CFTYPE_ONLY_ON_ROOT,3595},35963597{ } /* terminate */3598};359936003601/**3602* cpuset_css_alloc - Allocate a cpuset css3603* @parent_css: Parent css of the control group that the new cpuset will be3604* part of3605* Return: cpuset css on success, -ENOMEM on failure.3606*3607* Allocate and initialize a new cpuset css, for non-NULL @parent_css, return3608* top cpuset css otherwise.3609*/3610static struct cgroup_subsys_state *3611cpuset_css_alloc(struct cgroup_subsys_state *parent_css)3612{3613struct cpuset *cs;36143615if (!parent_css)3616return &top_cpuset.css;36173618cs = dup_or_alloc_cpuset(NULL);3619if (!cs)3620return ERR_PTR(-ENOMEM);36213622__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);3623fmeter_init(&cs->fmeter);3624cs->relax_domain_level = -1;36253626/* Set CS_MEMORY_MIGRATE for default hierarchy */3627if (cpuset_v2())3628__set_bit(CS_MEMORY_MIGRATE, &cs->flags);36293630return &cs->css;3631}36323633static int cpuset_css_online(struct cgroup_subsys_state *css)3634{3635struct cpuset *cs = css_cs(css);3636struct cpuset *parent = parent_cs(cs);3637struct cpuset *tmp_cs;3638struct cgroup_subsys_state *pos_css;36393640if (!parent)3641return 0;36423643cpuset_full_lock();3644if (is_spread_page(parent))3645set_bit(CS_SPREAD_PAGE, &cs->flags);3646if (is_spread_slab(parent))3647set_bit(CS_SPREAD_SLAB, &cs->flags);3648/*3649* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated3650*/3651if (cpuset_v2() && !is_sched_load_balance(parent))3652clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);36533654cpuset_inc();36553656spin_lock_irq(&callback_lock);3657if (is_in_v2_mode()) {3658cpumask_copy(cs->effective_cpus, parent->effective_cpus);3659cs->effective_mems = parent->effective_mems;3660}3661spin_unlock_irq(&callback_lock);36623663if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))3664goto out_unlock;36653666/*3667* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is3668* set. This flag handling is implemented in cgroup core for3669* historical reasons - the flag may be specified during mount.3670*3671* Currently, if any sibling cpusets have exclusive cpus or mem, we3672* refuse to clone the configuration - thereby refusing the task to3673* be entered, and as a result refusing the sys_unshare() or3674* clone() which initiated it. If this becomes a problem for some3675* users who wish to allow that scenario, then this could be3676* changed to grant parent->cpus_allowed-sibling_cpus_exclusive3677* (and likewise for mems) to the new cgroup.3678*/3679rcu_read_lock();3680cpuset_for_each_child(tmp_cs, pos_css, parent) {3681if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {3682rcu_read_unlock();3683goto out_unlock;3684}3685}3686rcu_read_unlock();36873688spin_lock_irq(&callback_lock);3689cs->mems_allowed = parent->mems_allowed;3690cs->effective_mems = parent->mems_allowed;3691cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);3692cpumask_copy(cs->effective_cpus, parent->cpus_allowed);3693spin_unlock_irq(&callback_lock);3694out_unlock:3695cpuset_full_unlock();3696return 0;3697}36983699/*3700* If the cpuset being removed has its flag 'sched_load_balance'3701* enabled, then simulate turning sched_load_balance off, which3702* will call rebuild_sched_domains_locked(). That is not needed3703* in the default hierarchy where only changes in partition3704* will cause repartitioning.3705*/3706static void cpuset_css_offline(struct cgroup_subsys_state *css)3707{3708struct cpuset *cs = css_cs(css);37093710cpuset_full_lock();3711if (!cpuset_v2() && is_sched_load_balance(cs))3712cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);37133714cpuset_dec();3715cpuset_full_unlock();3716}37173718/*3719* If a dying cpuset has the 'cpus.partition' enabled, turn it off by3720* changing it back to member to free its exclusive CPUs back to the pool to3721* be used by other online cpusets.3722*/3723static void cpuset_css_killed(struct cgroup_subsys_state *css)3724{3725struct cpuset *cs = css_cs(css);37263727cpuset_full_lock();3728/* Reset valid partition back to member */3729if (is_partition_valid(cs))3730update_prstate(cs, PRS_MEMBER);3731cpuset_full_unlock();3732}37333734static void cpuset_css_free(struct cgroup_subsys_state *css)3735{3736struct cpuset *cs = css_cs(css);37373738free_cpuset(cs);3739}37403741static void cpuset_bind(struct cgroup_subsys_state *root_css)3742{3743mutex_lock(&cpuset_mutex);3744spin_lock_irq(&callback_lock);37453746if (is_in_v2_mode()) {3747cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);3748cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);3749top_cpuset.mems_allowed = node_possible_map;3750} else {3751cpumask_copy(top_cpuset.cpus_allowed,3752top_cpuset.effective_cpus);3753top_cpuset.mems_allowed = top_cpuset.effective_mems;3754}37553756spin_unlock_irq(&callback_lock);3757mutex_unlock(&cpuset_mutex);3758}37593760/*3761* In case the child is cloned into a cpuset different from its parent,3762* additional checks are done to see if the move is allowed.3763*/3764static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)3765{3766struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);3767bool same_cs;3768int ret;37693770rcu_read_lock();3771same_cs = (cs == task_cs(current));3772rcu_read_unlock();37733774if (same_cs)3775return 0;37763777lockdep_assert_held(&cgroup_mutex);3778mutex_lock(&cpuset_mutex);37793780/* Check to see if task is allowed in the cpuset */3781ret = cpuset_can_attach_check(cs);3782if (ret)3783goto out_unlock;37843785ret = task_can_attach(task);3786if (ret)3787goto out_unlock;37883789ret = security_task_setscheduler(task);3790if (ret)3791goto out_unlock;37923793/*3794* Mark attach is in progress. This makes validate_change() fail3795* changes which zero cpus/mems_allowed.3796*/3797cs->attach_in_progress++;3798out_unlock:3799mutex_unlock(&cpuset_mutex);3800return ret;3801}38023803static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)3804{3805struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);3806bool same_cs;38073808rcu_read_lock();3809same_cs = (cs == task_cs(current));3810rcu_read_unlock();38113812if (same_cs)3813return;38143815dec_attach_in_progress(cs);3816}38173818/*3819* Make sure the new task conform to the current state of its parent,3820* which could have been changed by cpuset just after it inherits the3821* state from the parent and before it sits on the cgroup's task list.3822*/3823static void cpuset_fork(struct task_struct *task)3824{3825struct cpuset *cs;3826bool same_cs;38273828rcu_read_lock();3829cs = task_cs(task);3830same_cs = (cs == task_cs(current));3831rcu_read_unlock();38323833if (same_cs) {3834if (cs == &top_cpuset)3835return;38363837set_cpus_allowed_ptr(task, current->cpus_ptr);3838task->mems_allowed = current->mems_allowed;3839return;3840}38413842/* CLONE_INTO_CGROUP */3843mutex_lock(&cpuset_mutex);3844guarantee_online_mems(cs, &cpuset_attach_nodemask_to);3845cpuset_attach_task(cs, task);38463847dec_attach_in_progress_locked(cs);3848mutex_unlock(&cpuset_mutex);3849}38503851struct cgroup_subsys cpuset_cgrp_subsys = {3852.css_alloc = cpuset_css_alloc,3853.css_online = cpuset_css_online,3854.css_offline = cpuset_css_offline,3855.css_killed = cpuset_css_killed,3856.css_free = cpuset_css_free,3857.can_attach = cpuset_can_attach,3858.cancel_attach = cpuset_cancel_attach,3859.attach = cpuset_attach,3860.bind = cpuset_bind,3861.can_fork = cpuset_can_fork,3862.cancel_fork = cpuset_cancel_fork,3863.fork = cpuset_fork,3864#ifdef CONFIG_CPUSETS_V13865.legacy_cftypes = cpuset1_files,3866#endif3867.dfl_cftypes = dfl_files,3868.early_init = true,3869.threaded = true,3870};38713872/**3873* cpuset_init - initialize cpusets at system boot3874*3875* Description: Initialize top_cpuset3876**/38773878int __init cpuset_init(void)3879{3880BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));3881BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));3882BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));3883BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));3884BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));3885BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));38863887cpumask_setall(top_cpuset.cpus_allowed);3888nodes_setall(top_cpuset.mems_allowed);3889cpumask_setall(top_cpuset.effective_cpus);3890cpumask_setall(top_cpuset.effective_xcpus);3891cpumask_setall(top_cpuset.exclusive_cpus);3892nodes_setall(top_cpuset.effective_mems);38933894fmeter_init(&top_cpuset.fmeter);38953896BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));38973898have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);3899if (have_boot_isolcpus) {3900BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));3901cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));3902cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);3903}39043905return 0;3906}39073908static void3909hotplug_update_tasks(struct cpuset *cs,3910struct cpumask *new_cpus, nodemask_t *new_mems,3911bool cpus_updated, bool mems_updated)3912{3913/* A partition root is allowed to have empty effective cpus */3914if (cpumask_empty(new_cpus) && !is_partition_valid(cs))3915cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);3916if (nodes_empty(*new_mems))3917*new_mems = parent_cs(cs)->effective_mems;39183919spin_lock_irq(&callback_lock);3920cpumask_copy(cs->effective_cpus, new_cpus);3921cs->effective_mems = *new_mems;3922spin_unlock_irq(&callback_lock);39233924if (cpus_updated)3925cpuset_update_tasks_cpumask(cs, new_cpus);3926if (mems_updated)3927cpuset_update_tasks_nodemask(cs);3928}39293930void cpuset_force_rebuild(void)3931{3932force_sd_rebuild = true;3933}39343935/**3936* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug3937* @cs: cpuset in interest3938* @tmp: the tmpmasks structure pointer3939*3940* Compare @cs's cpu and mem masks against top_cpuset and if some have gone3941* offline, update @cs accordingly. If @cs ends up with no CPU or memory,3942* all its tasks are moved to the nearest ancestor with both resources.3943*/3944static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)3945{3946static cpumask_t new_cpus;3947static nodemask_t new_mems;3948bool cpus_updated;3949bool mems_updated;3950bool remote;3951int partcmd = -1;3952struct cpuset *parent;3953retry:3954wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);39553956mutex_lock(&cpuset_mutex);39573958/*3959* We have raced with task attaching. We wait until attaching3960* is finished, so we won't attach a task to an empty cpuset.3961*/3962if (cs->attach_in_progress) {3963mutex_unlock(&cpuset_mutex);3964goto retry;3965}39663967parent = parent_cs(cs);3968compute_effective_cpumask(&new_cpus, cs, parent);3969nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);39703971if (!tmp || !cs->partition_root_state)3972goto update_tasks;39733974/*3975* Compute effective_cpus for valid partition root, may invalidate3976* child partition roots if necessary.3977*/3978remote = is_remote_partition(cs);3979if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))3980compute_partition_effective_cpumask(cs, &new_cpus);39813982if (remote && (cpumask_empty(subpartitions_cpus) ||3983(cpumask_empty(&new_cpus) &&3984partition_is_populated(cs, NULL)))) {3985cs->prs_err = PERR_HOTPLUG;3986remote_partition_disable(cs, tmp);3987compute_effective_cpumask(&new_cpus, cs, parent);3988remote = false;3989}39903991/*3992* Force the partition to become invalid if either one of3993* the following conditions hold:3994* 1) empty effective cpus but not valid empty partition.3995* 2) parent is invalid or doesn't grant any cpus to child3996* partitions.3997* 3) subpartitions_cpus is empty.3998*/3999if (is_local_partition(cs) &&4000(!is_partition_valid(parent) ||4001tasks_nocpu_error(parent, cs, &new_cpus) ||4002cpumask_empty(subpartitions_cpus)))4003partcmd = partcmd_invalidate;4004/*4005* On the other hand, an invalid partition root may be transitioned4006* back to a regular one with a non-empty effective xcpus.4007*/4008else if (is_partition_valid(parent) && is_partition_invalid(cs) &&4009!cpumask_empty(cs->effective_xcpus))4010partcmd = partcmd_update;40114012if (partcmd >= 0) {4013update_parent_effective_cpumask(cs, partcmd, NULL, tmp);4014if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {4015compute_partition_effective_cpumask(cs, &new_cpus);4016cpuset_force_rebuild();4017}4018}40194020update_tasks:4021cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);4022mems_updated = !nodes_equal(new_mems, cs->effective_mems);4023if (!cpus_updated && !mems_updated)4024goto unlock; /* Hotplug doesn't affect this cpuset */40254026if (mems_updated)4027check_insane_mems_config(&new_mems);40284029if (is_in_v2_mode())4030hotplug_update_tasks(cs, &new_cpus, &new_mems,4031cpus_updated, mems_updated);4032else4033cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,4034cpus_updated, mems_updated);40354036unlock:4037mutex_unlock(&cpuset_mutex);4038}40394040/**4041* cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset4042*4043* This function is called after either CPU or memory configuration has4044* changed and updates cpuset accordingly. The top_cpuset is always4045* synchronized to cpu_active_mask and N_MEMORY, which is necessary in4046* order to make cpusets transparent (of no affect) on systems that are4047* actively using CPU hotplug but making no active use of cpusets.4048*4049* Non-root cpusets are only affected by offlining. If any CPUs or memory4050* nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on4051* all descendants.4052*4053* Note that CPU offlining during suspend is ignored. We don't modify4054* cpusets across suspend/resume cycles at all.4055*4056* CPU / memory hotplug is handled synchronously.4057*/4058static void cpuset_handle_hotplug(void)4059{4060static cpumask_t new_cpus;4061static nodemask_t new_mems;4062bool cpus_updated, mems_updated;4063bool on_dfl = is_in_v2_mode();4064struct tmpmasks tmp, *ptmp = NULL;40654066if (on_dfl && !alloc_tmpmasks(&tmp))4067ptmp = &tmp;40684069lockdep_assert_cpus_held();4070mutex_lock(&cpuset_mutex);40714072/* fetch the available cpus/mems and find out which changed how */4073cpumask_copy(&new_cpus, cpu_active_mask);4074new_mems = node_states[N_MEMORY];40754076/*4077* If subpartitions_cpus is populated, it is likely that the check4078* below will produce a false positive on cpus_updated when the cpu4079* list isn't changed. It is extra work, but it is better to be safe.4080*/4081cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||4082!cpumask_empty(subpartitions_cpus);4083mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);40844085/* For v1, synchronize cpus_allowed to cpu_active_mask */4086if (cpus_updated) {4087cpuset_force_rebuild();4088spin_lock_irq(&callback_lock);4089if (!on_dfl)4090cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);4091/*4092* Make sure that CPUs allocated to child partitions4093* do not show up in effective_cpus. If no CPU is left,4094* we clear the subpartitions_cpus & let the child partitions4095* fight for the CPUs again.4096*/4097if (!cpumask_empty(subpartitions_cpus)) {4098if (cpumask_subset(&new_cpus, subpartitions_cpus)) {4099cpumask_clear(subpartitions_cpus);4100} else {4101cpumask_andnot(&new_cpus, &new_cpus,4102subpartitions_cpus);4103}4104}4105cpumask_copy(top_cpuset.effective_cpus, &new_cpus);4106spin_unlock_irq(&callback_lock);4107/* we don't mess with cpumasks of tasks in top_cpuset */4108}41094110/* synchronize mems_allowed to N_MEMORY */4111if (mems_updated) {4112spin_lock_irq(&callback_lock);4113if (!on_dfl)4114top_cpuset.mems_allowed = new_mems;4115top_cpuset.effective_mems = new_mems;4116spin_unlock_irq(&callback_lock);4117cpuset_update_tasks_nodemask(&top_cpuset);4118}41194120mutex_unlock(&cpuset_mutex);41214122/* if cpus or mems changed, we need to propagate to descendants */4123if (cpus_updated || mems_updated) {4124struct cpuset *cs;4125struct cgroup_subsys_state *pos_css;41264127rcu_read_lock();4128cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {4129if (cs == &top_cpuset || !css_tryget_online(&cs->css))4130continue;4131rcu_read_unlock();41324133cpuset_hotplug_update_tasks(cs, ptmp);41344135rcu_read_lock();4136css_put(&cs->css);4137}4138rcu_read_unlock();4139}41404141/* rebuild sched domains if necessary */4142if (force_sd_rebuild)4143rebuild_sched_domains_cpuslocked();41444145free_tmpmasks(ptmp);4146}41474148void cpuset_update_active_cpus(void)4149{4150/*4151* We're inside cpu hotplug critical region which usually nests4152* inside cgroup synchronization. Bounce actual hotplug processing4153* to a work item to avoid reverse locking order.4154*/4155cpuset_handle_hotplug();4156}41574158/*4159* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].4160* Call this routine anytime after node_states[N_MEMORY] changes.4161* See cpuset_update_active_cpus() for CPU hotplug handling.4162*/4163static int cpuset_track_online_nodes(struct notifier_block *self,4164unsigned long action, void *arg)4165{4166cpuset_handle_hotplug();4167return NOTIFY_OK;4168}41694170/**4171* cpuset_init_smp - initialize cpus_allowed4172*4173* Description: Finish top cpuset after cpu, node maps are initialized4174*/4175void __init cpuset_init_smp(void)4176{4177/*4178* cpus_allowd/mems_allowed set to v2 values in the initial4179* cpuset_bind() call will be reset to v1 values in another4180* cpuset_bind() call when v1 cpuset is mounted.4181*/4182top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;41834184cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);4185top_cpuset.effective_mems = node_states[N_MEMORY];41864187hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);41884189cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);4190BUG_ON(!cpuset_migrate_mm_wq);4191}41924193/*4194* Return cpus_allowed mask from a task's cpuset.4195*/4196static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)4197{4198struct cpuset *cs;41994200cs = task_cs(tsk);4201if (cs != &top_cpuset)4202guarantee_active_cpus(tsk, pmask);4203/*4204* Tasks in the top cpuset won't get update to their cpumasks4205* when a hotplug online/offline event happens. So we include all4206* offline cpus in the allowed cpu list.4207*/4208if ((cs == &top_cpuset) || cpumask_empty(pmask)) {4209const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);42104211/*4212* We first exclude cpus allocated to partitions. If there is no4213* allowable online cpu left, we fall back to all possible cpus.4214*/4215cpumask_andnot(pmask, possible_mask, subpartitions_cpus);4216if (!cpumask_intersects(pmask, cpu_active_mask))4217cpumask_copy(pmask, possible_mask);4218}4219}42204221/**4222* cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.4223* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.4224* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.4225*4226* Similir to cpuset_cpus_allowed() except that the caller must have acquired4227* cpuset_mutex.4228*/4229void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)4230{4231lockdep_assert_held(&cpuset_mutex);4232__cpuset_cpus_allowed_locked(tsk, pmask);4233}42344235/**4236* cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.4237* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.4238* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.4239*4240* Description: Returns the cpumask_var_t cpus_allowed of the cpuset4241* attached to the specified @tsk. Guaranteed to return some non-empty4242* subset of cpu_active_mask, even if this means going outside the4243* tasks cpuset, except when the task is in the top cpuset.4244**/42454246void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)4247{4248unsigned long flags;42494250spin_lock_irqsave(&callback_lock, flags);4251__cpuset_cpus_allowed_locked(tsk, pmask);4252spin_unlock_irqrestore(&callback_lock, flags);4253}42544255/**4256* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.4257* @tsk: pointer to task_struct with which the scheduler is struggling4258*4259* Description: In the case that the scheduler cannot find an allowed cpu in4260* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy4261* mode however, this value is the same as task_cs(tsk)->effective_cpus,4262* which will not contain a sane cpumask during cases such as cpu hotplugging.4263* This is the absolute last resort for the scheduler and it is only used if4264* _every_ other avenue has been traveled.4265*4266* Returns true if the affinity of @tsk was changed, false otherwise.4267**/42684269bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)4270{4271const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);4272const struct cpumask *cs_mask;4273bool changed = false;42744275rcu_read_lock();4276cs_mask = task_cs(tsk)->cpus_allowed;4277if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {4278set_cpus_allowed_force(tsk, cs_mask);4279changed = true;4280}4281rcu_read_unlock();42824283/*4284* We own tsk->cpus_allowed, nobody can change it under us.4285*4286* But we used cs && cs->cpus_allowed lockless and thus can4287* race with cgroup_attach_task() or update_cpumask() and get4288* the wrong tsk->cpus_allowed. However, both cases imply the4289* subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()4290* which takes task_rq_lock().4291*4292* If we are called after it dropped the lock we must see all4293* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary4294* set any mask even if it is not right from task_cs() pov,4295* the pending set_cpus_allowed_ptr() will fix things.4296*4297* select_fallback_rq() will fix things ups and set cpu_possible_mask4298* if required.4299*/4300return changed;4301}43024303void __init cpuset_init_current_mems_allowed(void)4304{4305nodes_setall(current->mems_allowed);4306}43074308/**4309* cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.4310* @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.4311*4312* Description: Returns the nodemask_t mems_allowed of the cpuset4313* attached to the specified @tsk. Guaranteed to return some non-empty4314* subset of node_states[N_MEMORY], even if this means going outside the4315* tasks cpuset.4316**/43174318nodemask_t cpuset_mems_allowed(struct task_struct *tsk)4319{4320nodemask_t mask;4321unsigned long flags;43224323spin_lock_irqsave(&callback_lock, flags);4324guarantee_online_mems(task_cs(tsk), &mask);4325spin_unlock_irqrestore(&callback_lock, flags);43264327return mask;4328}43294330/**4331* cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed4332* @nodemask: the nodemask to be checked4333*4334* Are any of the nodes in the nodemask allowed in current->mems_allowed?4335*/4336int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)4337{4338return nodes_intersects(*nodemask, current->mems_allowed);4339}43404341/*4342* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or4343* mem_hardwall ancestor to the specified cpuset. Call holding4344* callback_lock. If no ancestor is mem_exclusive or mem_hardwall4345* (an unusual configuration), then returns the root cpuset.4346*/4347static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)4348{4349while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))4350cs = parent_cs(cs);4351return cs;4352}43534354/*4355* cpuset_current_node_allowed - Can current task allocate on a memory node?4356* @node: is this an allowed node?4357* @gfp_mask: memory allocation flags4358*4359* If we're in interrupt, yes, we can always allocate. If @node is set in4360* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this4361* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,4362* yes. If current has access to memory reserves as an oom victim, yes.4363* Otherwise, no.4364*4365* GFP_USER allocations are marked with the __GFP_HARDWALL bit,4366* and do not allow allocations outside the current tasks cpuset4367* unless the task has been OOM killed.4368* GFP_KERNEL allocations are not so marked, so can escape to the4369* nearest enclosing hardwalled ancestor cpuset.4370*4371* Scanning up parent cpusets requires callback_lock. The4372* __alloc_pages() routine only calls here with __GFP_HARDWALL bit4373* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the4374* current tasks mems_allowed came up empty on the first pass over4375* the zonelist. So only GFP_KERNEL allocations, if all nodes in the4376* cpuset are short of memory, might require taking the callback_lock.4377*4378* The first call here from mm/page_alloc:get_page_from_freelist()4379* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,4380* so no allocation on a node outside the cpuset is allowed (unless4381* in interrupt, of course).4382*4383* The second pass through get_page_from_freelist() doesn't even call4384* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()4385* variable 'wait' is not set, and the bit ALLOC_CPUSET is not set4386* in alloc_flags. That logic and the checks below have the combined4387* affect that:4388* in_interrupt - any node ok (current task context irrelevant)4389* GFP_ATOMIC - any node ok4390* tsk_is_oom_victim - any node ok4391* GFP_KERNEL - any node in enclosing hardwalled cpuset ok4392* GFP_USER - only nodes in current tasks mems allowed ok.4393*/4394bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)4395{4396struct cpuset *cs; /* current cpuset ancestors */4397bool allowed; /* is allocation in zone z allowed? */4398unsigned long flags;43994400if (in_interrupt())4401return true;4402if (node_isset(node, current->mems_allowed))4403return true;4404/*4405* Allow tasks that have access to memory reserves because they have4406* been OOM killed to get memory anywhere.4407*/4408if (unlikely(tsk_is_oom_victim(current)))4409return true;4410if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */4411return false;44124413if (current->flags & PF_EXITING) /* Let dying task have memory */4414return true;44154416/* Not hardwall and node outside mems_allowed: scan up cpusets */4417spin_lock_irqsave(&callback_lock, flags);44184419cs = nearest_hardwall_ancestor(task_cs(current));4420allowed = node_isset(node, cs->mems_allowed);44214422spin_unlock_irqrestore(&callback_lock, flags);4423return allowed;4424}44254426bool cpuset_node_allowed(struct cgroup *cgroup, int nid)4427{4428struct cgroup_subsys_state *css;4429struct cpuset *cs;4430bool allowed;44314432/*4433* In v1, mem_cgroup and cpuset are unlikely in the same hierarchy4434* and mems_allowed is likely to be empty even if we could get to it,4435* so return true to avoid taking a global lock on the empty check.4436*/4437if (!cpuset_v2())4438return true;44394440css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);4441if (!css)4442return true;44434444/*4445* Normally, accessing effective_mems would require the cpuset_mutex4446* or callback_lock - but node_isset is atomic and the reference4447* taken via cgroup_get_e_css is sufficient to protect css.4448*4449* Since this interface is intended for use by migration paths, we4450* relax locking here to avoid taking global locks - while accepting4451* there may be rare scenarios where the result may be innaccurate.4452*4453* Reclaim and migration are subject to these same race conditions, and4454* cannot make strong isolation guarantees, so this is acceptable.4455*/4456cs = container_of(css, struct cpuset, css);4457allowed = node_isset(nid, cs->effective_mems);4458css_put(css);4459return allowed;4460}44614462/**4463* cpuset_spread_node() - On which node to begin search for a page4464* @rotor: round robin rotor4465*4466* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for4467* tasks in a cpuset with is_spread_page or is_spread_slab set),4468* and if the memory allocation used cpuset_mem_spread_node()4469* to determine on which node to start looking, as it will for4470* certain page cache or slab cache pages such as used for file4471* system buffers and inode caches, then instead of starting on the4472* local node to look for a free page, rather spread the starting4473* node around the tasks mems_allowed nodes.4474*4475* We don't have to worry about the returned node being offline4476* because "it can't happen", and even if it did, it would be ok.4477*4478* The routines calling guarantee_online_mems() are careful to4479* only set nodes in task->mems_allowed that are online. So it4480* should not be possible for the following code to return an4481* offline node. But if it did, that would be ok, as this routine4482* is not returning the node where the allocation must be, only4483* the node where the search should start. The zonelist passed to4484* __alloc_pages() will include all nodes. If the slab allocator4485* is passed an offline node, it will fall back to the local node.4486* See kmem_cache_alloc_node().4487*/4488static int cpuset_spread_node(int *rotor)4489{4490return *rotor = next_node_in(*rotor, current->mems_allowed);4491}44924493/**4494* cpuset_mem_spread_node() - On which node to begin search for a file page4495*/4496int cpuset_mem_spread_node(void)4497{4498if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)4499current->cpuset_mem_spread_rotor =4500node_random(¤t->mems_allowed);45014502return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);4503}45044505/**4506* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?4507* @tsk1: pointer to task_struct of some task.4508* @tsk2: pointer to task_struct of some other task.4509*4510* Description: Return true if @tsk1's mems_allowed intersects the4511* mems_allowed of @tsk2. Used by the OOM killer to determine if4512* one of the task's memory usage might impact the memory available4513* to the other.4514**/45154516int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,4517const struct task_struct *tsk2)4518{4519return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);4520}45214522/**4523* cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed4524*4525* Description: Prints current's name, cpuset name, and cached copy of its4526* mems_allowed to the kernel log.4527*/4528void cpuset_print_current_mems_allowed(void)4529{4530struct cgroup *cgrp;45314532rcu_read_lock();45334534cgrp = task_cs(current)->css.cgroup;4535pr_cont(",cpuset=");4536pr_cont_cgroup_name(cgrp);4537pr_cont(",mems_allowed=%*pbl",4538nodemask_pr_args(¤t->mems_allowed));45394540rcu_read_unlock();4541}45424543/* Display task mems_allowed in /proc/<pid>/status file. */4544void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)4545{4546seq_printf(m, "Mems_allowed:\t%*pb\n",4547nodemask_pr_args(&task->mems_allowed));4548seq_printf(m, "Mems_allowed_list:\t%*pbl\n",4549nodemask_pr_args(&task->mems_allowed));4550}455145524553