/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>

/*
 * Workqueue for cpuset related tasks.
 *
 * Using kevent workqueue may cause deadlock when memory_migrate
 * is set. So we create a separate workqueue thread for cpuset.
 */
static struct workqueue_struct *cpuset_wq;

/*
 * Tracks how many cpusets are currently defined in system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
 */
int number_of_cpusets __read_mostly;

/* Forward declare cgroup structures */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	struct cpuset *parent;		/* my parent */

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking a cpuset hierarchy */
	struct list_head stack_list;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};

/*
 * There are two global mutexes guarding cpuset structures.  The first
 * is the main control groups cgroup_mutex, accessed via
 * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
 * callback_mutex, below.  They can nest.  It is ok to first take
 * cgroup_mutex, then nest callback_mutex.  We also require taking
 * task_lock() when dereferencing a task's cpuset pointer.  See "The
 * task_lock() exception", at the end of this comment.
 *
 * A task must hold both mutexes to modify cpusets.  If a task
 * holds cgroup_mutex, then it blocks others wanting that mutex,
 * ensuring that it is the only task able to also acquire callback_mutex
 * and be able to modify cpusets.  It can perform various checks on
 * the cpuset structure first, knowing nothing will change.  It can
 * also allocate memory while just holding cgroup_mutex.  While it is
 * performing these checks, various callback routines can briefly
 * acquire callback_mutex to query cpusets.  Once it is ready to make
 * the changes, it takes callback_mutex, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_mutex, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(callback_mutex);
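
/*
 * Illustrative shape of a cpuset-modifying section, following the
 * nesting rules above; this mirrors the update paths later in this
 * file (e.g. update_cpumask()) and is a sketch, not a new helper:
 *
 *	cgroup_lock();				(takes cgroup_mutex)
 *	... validate, build a trial copy ...
 *	mutex_lock(&callback_mutex);
 *	... publish new cpus_allowed / mems_allowed ...
 *	mutex_unlock(&callback_mutex);
 *	cgroup_unlock();
 */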

/*
 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
 * buffers.  They are statically allocated to prevent using excess stack
 * when calling cpuset_print_task_mems_allowed().
 */
#define CPUSET_NAME_LEN		(128)
#define	CPUSET_NODELIST_LEN	(256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
 * task, return cpu_online_map.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_map.
 *
 * Call with callback_mutex held.
 */

static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = cs->parent;
	if (cs)
		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
	else
		cpumask_copy(pmask, cpu_online_mask);
	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}
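
/*
 * Illustrative caller pattern for guarantee_online_cpus(), modeled on
 * cpuset_cpus_allowed() later in this file (a sketch, not a new helper):
 *
 *	mutex_lock(&callback_mutex);
 *	task_lock(tsk);
 *	guarantee_online_cpus(task_cs(tsk), pmask);
 *	task_unlock(tsk);
 *	mutex_unlock(&callback_mutex);
 */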

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  If we get all the way to the top and still haven't
 * found any online mems, return node_states[N_HIGH_MEMORY].
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_HIGH_MEMORY].
 *
 * Call with callback_mutex held.
 */

static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
			  node_states[N_HIGH_MEMORY]);
	else
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Called with callback_mutex/cgroup_mutex held
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		tsk->flags |= PF_SPREAD_PAGE;
	else
		tsk->flags &= ~PF_SPREAD_PAGE;
	if (is_spread_slab(cs))
		tsk->flags |= PF_SPREAD_SLAB;
	else
		tsk->flags &= ~PF_SPREAD_SLAB;
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cgroup_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
		kfree(trial);
		return NULL;
	}
	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);

	return trial;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
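
/*
 * Illustrative lifecycle of a trial cpuset, mirroring the update
 * handlers later in this file (e.g. update_flag()); a sketch only:
 *
 *	trialcs = alloc_trial_cpuset(cs);
 *	... modify trialcs->flags / cpus_allowed / mems_allowed ...
 *	err = validate_change(cs, trialcs);
 *	if (!err) {
 *		mutex_lock(&callback_mutex);
 *		... copy the validated values back into cs ...
 *		mutex_unlock(&callback_mutex);
 *	}
 *	free_trial_cpuset(trialcs);
 */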

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cgroup_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
		if (!is_cpuset_subset(cgroup_cs(cont), trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
		c = cgroup_cs(cont);
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
	if (cgroup_task_count(cur->css.cgroup)) {
		if (cpumask_empty(trial->cpus_allowed) ||
		    nodes_empty(trial->mems_allowed)) {
			return -ENOSPC;
		}
	}

	return 0;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
{
	LIST_HEAD(q);

	list_add(&c->stack_list, &q);
	while (!list_empty(&q)) {
		struct cpuset *cp;
		struct cgroup *cont;
		struct cpuset *child;

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpumask_empty(cp->cpus_allowed))
			continue;

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}
}
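
/*
 * A worked example of the partitioning done by generate_sched_domains()
 * below (illustrative only, assuming the root cpuset has
 * sched_load_balance disabled): three load-balanced cpusets with
 * cpus_allowed of {0-1}, {1-2} and {4-5} yield two sched domains: the
 * first two overlap and are merged into {0-2}, while {4-5} forms its
 * own domain.
 */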

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cgroup_lock held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
 *	   top-down scan of all cpusets.  This scan loads a pointer
 *	   to each cpuset marked is_sched_load_balance into the
 *	   array 'csa'.  For our purposes, rebuilding the scheduler's
 *	   sched domains, we can ignore !is_sched_load_balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed, but which don't have the same 'pn' partition
 *	number and gives them the same partition number.  It keeps
 *	looping on the 'restart' label until it can no longer find
 *	any such pairs.
 *
 *	The union of the cpus_allowed masks from the set of
 *	all cpusets having the same 'pn' value then form the one
 *	element of the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	LIST_HEAD(q);		/* queue of cpusets to be scanned */
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e.
sched domains */547struct sched_domain_attr *dattr; /* attributes for custom domains */548int ndoms = 0; /* number of sched domains in result */549int nslot; /* next empty doms[] struct cpumask slot */550551doms = NULL;552dattr = NULL;553csa = NULL;554555/* Special case for the 99% of systems with one, full, sched domain */556if (is_sched_load_balance(&top_cpuset)) {557ndoms = 1;558doms = alloc_sched_domains(ndoms);559if (!doms)560goto done;561562dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);563if (dattr) {564*dattr = SD_ATTR_INIT;565update_domain_attr_tree(dattr, &top_cpuset);566}567cpumask_copy(doms[0], top_cpuset.cpus_allowed);568569goto done;570}571572csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);573if (!csa)574goto done;575csn = 0;576577list_add(&top_cpuset.stack_list, &q);578while (!list_empty(&q)) {579struct cgroup *cont;580struct cpuset *child; /* scans child cpusets of cp */581582cp = list_first_entry(&q, struct cpuset, stack_list);583list_del(q.next);584585if (cpumask_empty(cp->cpus_allowed))586continue;587588/*589* All child cpusets contain a subset of the parent's cpus, so590* just skip them, and then we call update_domain_attr_tree()591* to calc relax_domain_level of the corresponding sched592* domain.593*/594if (is_sched_load_balance(cp)) {595csa[csn++] = cp;596continue;597}598599list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {600child = cgroup_cs(cont);601list_add_tail(&child->stack_list, &q);602}603}604605for (i = 0; i < csn; i++)606csa[i]->pn = i;607ndoms = csn;608609restart:610/* Find the best partition (set of sched domains) */611for (i = 0; i < csn; i++) {612struct cpuset *a = csa[i];613int apn = a->pn;614615for (j = 0; j < csn; j++) {616struct cpuset *b = csa[j];617int bpn = b->pn;618619if (apn != bpn && cpusets_overlap(a, b)) {620for (k = 0; k < csn; k++) {621struct cpuset *c = csa[k];622623if (c->pn == bpn)624c->pn = apn;625}626ndoms--; /* one less element */627goto restart;628}629}630}631632/*633* Now we know how many domains to create.634* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.635*/636doms = alloc_sched_domains(ndoms);637if (!doms)638goto done;639640/*641* The rest of the code, including the scheduler, can deal with642* dattr==NULL case. 
No need to abort if alloc fails.643*/644dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);645646for (nslot = 0, i = 0; i < csn; i++) {647struct cpuset *a = csa[i];648struct cpumask *dp;649int apn = a->pn;650651if (apn < 0) {652/* Skip completed partitions */653continue;654}655656dp = doms[nslot];657658if (nslot == ndoms) {659static int warnings = 10;660if (warnings) {661printk(KERN_WARNING662"rebuild_sched_domains confused:"663" nslot %d, ndoms %d, csn %d, i %d,"664" apn %d\n",665nslot, ndoms, csn, i, apn);666warnings--;667}668continue;669}670671cpumask_clear(dp);672if (dattr)673*(dattr + nslot) = SD_ATTR_INIT;674for (j = i; j < csn; j++) {675struct cpuset *b = csa[j];676677if (apn == b->pn) {678cpumask_or(dp, dp, b->cpus_allowed);679if (dattr)680update_domain_attr_tree(dattr + nslot, b);681682/* Done with this partition */683b->pn = -1;684}685}686nslot++;687}688BUG_ON(nslot != ndoms);689690done:691kfree(csa);692693/*694* Fallback to the default domain if kmalloc() failed.695* See comments in partition_sched_domains().696*/697if (doms == NULL)698ndoms = 1;699700*domains = doms;701*attributes = dattr;702return ndoms;703}704705/*706* Rebuild scheduler domains.707*708* Call with neither cgroup_mutex held nor within get_online_cpus().709* Takes both cgroup_mutex and get_online_cpus().710*711* Cannot be directly called from cpuset code handling changes712* to the cpuset pseudo-filesystem, because it cannot be called713* from code that already holds cgroup_mutex.714*/715static void do_rebuild_sched_domains(struct work_struct *unused)716{717struct sched_domain_attr *attr;718cpumask_var_t *doms;719int ndoms;720721get_online_cpus();722723/* Generate domain masks and attrs */724cgroup_lock();725ndoms = generate_sched_domains(&doms, &attr);726cgroup_unlock();727728/* Have scheduler rebuild the domains */729partition_sched_domains(ndoms, doms, attr);730731put_online_cpus();732}733#else /* !CONFIG_SMP */734static void do_rebuild_sched_domains(struct work_struct *unused)735{736}737738static int generate_sched_domains(cpumask_var_t **domains,739struct sched_domain_attr **attributes)740{741*domains = NULL;742return 1;743}744#endif /* CONFIG_SMP */745746static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);747748/*749* Rebuild scheduler domains, asynchronously via workqueue.750*751* If the flag 'sched_load_balance' of any cpuset with non-empty752* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset753* which has that flag enabled, or if any cpuset with a non-empty754* 'cpus' is removed, then call this routine to rebuild the755* scheduler's dynamic sched domains.756*757* The rebuild_sched_domains() and partition_sched_domains()758* routines must nest cgroup_lock() inside get_online_cpus(),759* but such cpuset changes as these must nest that locking the760* other way, holding cgroup_lock() for much of the code.761*762* So in order to avoid an ABBA deadlock, the cpuset code handling763* these user changes delegates the actual sched domain rebuilding764* to a separate workqueue thread, which ends up processing the765* above do_rebuild_sched_domains() function.766*/767static void async_rebuild_sched_domains(void)768{769queue_work(cpuset_wq, &rebuild_sched_domains_work);770}771772/*773* Accomplishes the same scheduler domain rebuild as the above774* async_rebuild_sched_domains(), however it directly calls the775* rebuild routine synchronously rather than calling it via an776* asynchronous work thread.777*778* This can only be called from code that is not 
holding779* cgroup_mutex (not nested in a cgroup_lock() call.)780*/781void rebuild_sched_domains(void)782{783do_rebuild_sched_domains(NULL);784}785786/**787* cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's788* @tsk: task to test789* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner790*791* Call with cgroup_mutex held. May take callback_mutex during call.792* Called for each task in a cgroup by cgroup_scan_tasks().793* Return nonzero if this tasks's cpus_allowed mask should be changed (in other794* words, if its mask is not equal to its cpuset's mask).795*/796static int cpuset_test_cpumask(struct task_struct *tsk,797struct cgroup_scanner *scan)798{799return !cpumask_equal(&tsk->cpus_allowed,800(cgroup_cs(scan->cg))->cpus_allowed);801}802803/**804* cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's805* @tsk: task to test806* @scan: struct cgroup_scanner containing the cgroup of the task807*808* Called by cgroup_scan_tasks() for each task in a cgroup whose809* cpus_allowed mask needs to be changed.810*811* We don't need to re-check for the cgroup/cpuset membership, since we're812* holding cgroup_lock() at this point.813*/814static void cpuset_change_cpumask(struct task_struct *tsk,815struct cgroup_scanner *scan)816{817set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));818}819820/**821* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.822* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed823* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()824*825* Called with cgroup_mutex held826*827* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,828* calling callback functions for each.829*830* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0831* if @heap != NULL.832*/833static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)834{835struct cgroup_scanner scan;836837scan.cg = cs->css.cgroup;838scan.test_task = cpuset_test_cpumask;839scan.process_task = cpuset_change_cpumask;840scan.heap = heap;841cgroup_scan_tasks(&scan);842}843844/**845* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it846* @cs: the cpuset to consider847* @buf: buffer of cpu numbers written to this cpuset848*/849static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,850const char *buf)851{852struct ptr_heap heap;853int retval;854int is_load_balanced;855856/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */857if (cs == &top_cpuset)858return -EACCES;859860/*861* An empty cpus_allowed is ok only if the cpuset has no tasks.862* Since cpulist_parse() fails on an empty mask, we special case863* that parsing. 
The validate_change() call ensures that cpusets864* with tasks have cpus.865*/866if (!*buf) {867cpumask_clear(trialcs->cpus_allowed);868} else {869retval = cpulist_parse(buf, trialcs->cpus_allowed);870if (retval < 0)871return retval;872873if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))874return -EINVAL;875}876retval = validate_change(cs, trialcs);877if (retval < 0)878return retval;879880/* Nothing to do if the cpus didn't change */881if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))882return 0;883884retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);885if (retval)886return retval;887888is_load_balanced = is_sched_load_balance(trialcs);889890mutex_lock(&callback_mutex);891cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);892mutex_unlock(&callback_mutex);893894/*895* Scan tasks in the cpuset, and update the cpumasks of any896* that need an update.897*/898update_tasks_cpumask(cs, &heap);899900heap_free(&heap);901902if (is_load_balanced)903async_rebuild_sched_domains();904return 0;905}906907/*908* cpuset_migrate_mm909*910* Migrate memory region from one set of nodes to another.911*912* Temporarilly set tasks mems_allowed to target nodes of migration,913* so that the migration code can allocate pages on these nodes.914*915* Call holding cgroup_mutex, so current's cpuset won't change916* during this call, as manage_mutex holds off any cpuset_attach()917* calls. Therefore we don't need to take task_lock around the918* call to guarantee_online_mems(), as we know no one is changing919* our task's cpuset.920*921* While the mm_struct we are migrating is typically from some922* other task, the task_struct mems_allowed that we are hacking923* is for our current task, which must allocate new pages for that924* migrating memory region.925*/926927static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,928const nodemask_t *to)929{930struct task_struct *tsk = current;931932tsk->mems_allowed = *to;933934do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);935936guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);937}938939/*940* cpuset_change_task_nodemask - change task's mems_allowed and mempolicy941* @tsk: the task to change942* @newmems: new nodes that the task will be set943*944* In order to avoid seeing no nodes if the old and new nodes are disjoint,945* we structure updates as setting all new allowed nodes, then clearing newly946* disallowed ones.947*/948static void cpuset_change_task_nodemask(struct task_struct *tsk,949nodemask_t *newmems)950{951repeat:952/*953* Allow tasks that have access to memory reserves because they have954* been OOM killed to get memory anywhere.955*/956if (unlikely(test_thread_flag(TIF_MEMDIE)))957return;958if (current->flags & PF_EXITING) /* Let dying task have memory */959return;960961task_lock(tsk);962nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);963mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);964965966/*967* ensure checking ->mems_allowed_change_disable after setting all new968* allowed nodes.969*970* the read-side task can see an nodemask with new allowed nodes and971* old allowed nodes. 
and if it allocates page when cpuset clears newly972* disallowed ones continuous, it can see the new allowed bits.973*974* And if setting all new allowed nodes is after the checking, setting975* all new allowed nodes and clearing newly disallowed ones will be done976* continuous, and the read-side task may find no node to alloc page.977*/978smp_mb();979980/*981* Allocation of memory is very fast, we needn't sleep when waiting982* for the read-side.983*/984while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {985task_unlock(tsk);986if (!task_curr(tsk))987yield();988goto repeat;989}990991/*992* ensure checking ->mems_allowed_change_disable before clearing all new993* disallowed nodes.994*995* if clearing newly disallowed bits before the checking, the read-side996* task may find no node to alloc page.997*/998smp_mb();9991000mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);1001tsk->mems_allowed = *newmems;1002task_unlock(tsk);1003}10041005/*1006* Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy1007* of it to cpuset's new mems_allowed, and migrate pages to new nodes if1008* memory_migrate flag is set. Called with cgroup_mutex held.1009*/1010static void cpuset_change_nodemask(struct task_struct *p,1011struct cgroup_scanner *scan)1012{1013struct mm_struct *mm;1014struct cpuset *cs;1015int migrate;1016const nodemask_t *oldmem = scan->data;1017static nodemask_t newmems; /* protected by cgroup_mutex */10181019cs = cgroup_cs(scan->cg);1020guarantee_online_mems(cs, &newmems);10211022cpuset_change_task_nodemask(p, &newmems);10231024mm = get_task_mm(p);1025if (!mm)1026return;10271028migrate = is_memory_migrate(cs);10291030mpol_rebind_mm(mm, &cs->mems_allowed);1031if (migrate)1032cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);1033mmput(mm);1034}10351036static void *cpuset_being_rebound;10371038/**1039* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.1040* @cs: the cpuset in which each task's mems_allowed mask needs to be changed1041* @oldmem: old mems_allowed of cpuset cs1042* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()1043*1044* Called with cgroup_mutex held1045* No return value. It's guaranteed that cgroup_scan_tasks() always returns 01046* if @heap != NULL.1047*/1048static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,1049struct ptr_heap *heap)1050{1051struct cgroup_scanner scan;10521053cpuset_being_rebound = cs; /* causes mpol_dup() rebind */10541055scan.cg = cs->css.cgroup;1056scan.test_task = NULL;1057scan.process_task = cpuset_change_nodemask;1058scan.heap = heap;1059scan.data = (nodemask_t *)oldmem;10601061/*1062* The mpol_rebind_mm() call takes mmap_sem, which we couldn't1063* take while holding tasklist_lock. Forks can happen - the1064* mpol_dup() cpuset_being_rebound check will catch such forks,1065* and rebind their vma mempolicies too. Because we still hold1066* the global cgroup_mutex, we know that no other rebind effort1067* will be contending for the global variable cpuset_being_rebound.1068* It's ok if we rebind the same mm twice; mpol_rebind_mm()1069* is idempotent. Also migrate pages in each mm to new nodes.1070*/1071cgroup_scan_tasks(&scan);10721073/* We're done rebinding vmas to this cpuset's new mems_allowed. */1074cpuset_being_rebound = NULL;1075}10761077/*1078* Handle user request to change the 'mems' memory placement1079* of a cpuset. 
Needs to validate the request, update the1080* cpusets mems_allowed, and for each task in the cpuset,1081* update mems_allowed and rebind task's mempolicy and any vma1082* mempolicies and if the cpuset is marked 'memory_migrate',1083* migrate the tasks pages to the new memory.1084*1085* Call with cgroup_mutex held. May take callback_mutex during call.1086* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,1087* lock each such tasks mm->mmap_sem, scan its vma's and rebind1088* their mempolicies to the cpusets new mems_allowed.1089*/1090static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,1091const char *buf)1092{1093NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);1094int retval;1095struct ptr_heap heap;10961097if (!oldmem)1098return -ENOMEM;10991100/*1101* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];1102* it's read-only1103*/1104if (cs == &top_cpuset) {1105retval = -EACCES;1106goto done;1107}11081109/*1110* An empty mems_allowed is ok iff there are no tasks in the cpuset.1111* Since nodelist_parse() fails on an empty mask, we special case1112* that parsing. The validate_change() call ensures that cpusets1113* with tasks have memory.1114*/1115if (!*buf) {1116nodes_clear(trialcs->mems_allowed);1117} else {1118retval = nodelist_parse(buf, trialcs->mems_allowed);1119if (retval < 0)1120goto done;11211122if (!nodes_subset(trialcs->mems_allowed,1123node_states[N_HIGH_MEMORY])) {1124retval = -EINVAL;1125goto done;1126}1127}1128*oldmem = cs->mems_allowed;1129if (nodes_equal(*oldmem, trialcs->mems_allowed)) {1130retval = 0; /* Too easy - nothing to do */1131goto done;1132}1133retval = validate_change(cs, trialcs);1134if (retval < 0)1135goto done;11361137retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);1138if (retval < 0)1139goto done;11401141mutex_lock(&callback_mutex);1142cs->mems_allowed = trialcs->mems_allowed;1143mutex_unlock(&callback_mutex);11441145update_tasks_nodemask(cs, oldmem, &heap);11461147heap_free(&heap);1148done:1149NODEMASK_FREE(oldmem);1150return retval;1151}11521153int current_cpuset_is_being_rebound(void)1154{1155return task_cs(current) == cpuset_being_rebound;1156}11571158static int update_relax_domain_level(struct cpuset *cs, s64 val)1159{1160#ifdef CONFIG_SMP1161if (val < -1 || val >= sched_domain_level_max)1162return -EINVAL;1163#endif11641165if (val != cs->relax_domain_level) {1166cs->relax_domain_level = val;1167if (!cpumask_empty(cs->cpus_allowed) &&1168is_sched_load_balance(cs))1169async_rebuild_sched_domains();1170}11711172return 0;1173}11741175/*1176* cpuset_change_flag - make a task's spread flags the same as its cpuset's1177* @tsk: task to be updated1178* @scan: struct cgroup_scanner containing the cgroup of the task1179*1180* Called by cgroup_scan_tasks() for each task in a cgroup.1181*1182* We don't need to re-check for the cgroup/cpuset membership, since we're1183* holding cgroup_lock() at this point.1184*/1185static void cpuset_change_flag(struct task_struct *tsk,1186struct cgroup_scanner *scan)1187{1188cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);1189}11901191/*1192* update_tasks_flags - update the spread flags of tasks in the cpuset.1193* @cs: the cpuset in which each task's spread flags needs to be changed1194* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()1195*1196* Called with cgroup_mutex held1197*1198* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,1199* calling callback functions for each.1200*1201* No return value. 
It's guaranteed that cgroup_scan_tasks() always returns 01202* if @heap != NULL.1203*/1204static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)1205{1206struct cgroup_scanner scan;12071208scan.cg = cs->css.cgroup;1209scan.test_task = NULL;1210scan.process_task = cpuset_change_flag;1211scan.heap = heap;1212cgroup_scan_tasks(&scan);1213}12141215/*1216* update_flag - read a 0 or a 1 in a file and update associated flag1217* bit: the bit to update (see cpuset_flagbits_t)1218* cs: the cpuset to update1219* turning_on: whether the flag is being set or cleared1220*1221* Call with cgroup_mutex held.1222*/12231224static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,1225int turning_on)1226{1227struct cpuset *trialcs;1228int balance_flag_changed;1229int spread_flag_changed;1230struct ptr_heap heap;1231int err;12321233trialcs = alloc_trial_cpuset(cs);1234if (!trialcs)1235return -ENOMEM;12361237if (turning_on)1238set_bit(bit, &trialcs->flags);1239else1240clear_bit(bit, &trialcs->flags);12411242err = validate_change(cs, trialcs);1243if (err < 0)1244goto out;12451246err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);1247if (err < 0)1248goto out;12491250balance_flag_changed = (is_sched_load_balance(cs) !=1251is_sched_load_balance(trialcs));12521253spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))1254|| (is_spread_page(cs) != is_spread_page(trialcs)));12551256mutex_lock(&callback_mutex);1257cs->flags = trialcs->flags;1258mutex_unlock(&callback_mutex);12591260if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)1261async_rebuild_sched_domains();12621263if (spread_flag_changed)1264update_tasks_flags(cs, &heap);1265heap_free(&heap);1266out:1267free_trial_cpuset(trialcs);1268return err;1269}12701271/*1272* Frequency meter - How fast is some event occurring?1273*1274* These routines manage a digitally filtered, constant time based,1275* event frequency meter. There are four routines:1276* fmeter_init() - initialize a frequency meter.1277* fmeter_markevent() - called each time the event happens.1278* fmeter_getrate() - returns the recent rate of such events.1279* fmeter_update() - internal routine used to update fmeter.1280*1281* A common data structure is passed to each of these routines,1282* which is used to keep track of the state required to manage the1283* frequency meter and its digital filter.1284*1285* The filter works on the number of events marked per unit time.1286* The filter is single-pole low-pass recursive (IIR). The time unit1287* is 1 second. Arithmetic is done using 32-bit integers scaled to1288* simulate 3 decimal digits of precision (multiplied by 1000).1289*1290* With an FM_COEF of 933, and a time base of 1 second, the filter1291* has a half-life of 10 seconds, meaning that if the events quit1292* happening, then the rate returned from the fmeter_getrate()1293* will be cut in half each 10 seconds, until it converges to zero.1294*1295* It is not worth doing a real infinitely recursive filter. If more1296* than FM_MAXTICKS ticks have elapsed since the last filter event,1297* just compute FM_MAXTICKS ticks worth, by which point the level1298* will be stable.1299*1300* Limit the count of unprocessed events to FM_MAXCNT, so as to avoid1301* arithmetic overflow in the fmeter_update() routine.1302*1303* Given the simple 32 bit integer arithmetic used, this meter works1304* best for reporting rates between one per millisecond (msec) and1305* one per 32 (approx) seconds. 
At constant rates faster than one1306* per msec it maxes out at values just under 1,000,000. At constant1307* rates between one per msec, and one per second it will stabilize1308* to a value N*1000, where N is the rate of events per second.1309* At constant rates between one per second and one per 32 seconds,1310* it will be choppy, moving up on the seconds that have an event,1311* and then decaying until the next event. At rates slower than1312* about one in 32 seconds, it decays all the way back to zero between1313* each event.1314*/13151316#define FM_COEF 933 /* coefficient for half-life of 10 secs */1317#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */1318#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */1319#define FM_SCALE 1000 /* faux fixed point scale */13201321/* Initialize a frequency meter */1322static void fmeter_init(struct fmeter *fmp)1323{1324fmp->cnt = 0;1325fmp->val = 0;1326fmp->time = 0;1327spin_lock_init(&fmp->lock);1328}13291330/* Internal meter update - process cnt events and update value */1331static void fmeter_update(struct fmeter *fmp)1332{1333time_t now = get_seconds();1334time_t ticks = now - fmp->time;13351336if (ticks == 0)1337return;13381339ticks = min(FM_MAXTICKS, ticks);1340while (ticks-- > 0)1341fmp->val = (FM_COEF * fmp->val) / FM_SCALE;1342fmp->time = now;13431344fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;1345fmp->cnt = 0;1346}13471348/* Process any previous ticks, then bump cnt by one (times scale). */1349static void fmeter_markevent(struct fmeter *fmp)1350{1351spin_lock(&fmp->lock);1352fmeter_update(fmp);1353fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);1354spin_unlock(&fmp->lock);1355}13561357/* Process any previous ticks, then return current value. */1358static int fmeter_getrate(struct fmeter *fmp)1359{1360int val;13611362spin_lock(&fmp->lock);1363fmeter_update(fmp);1364val = fmp->val;1365spin_unlock(&fmp->lock);1366return val;1367}13681369/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */1370static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,1371struct task_struct *tsk)1372{1373struct cpuset *cs = cgroup_cs(cont);13741375if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))1376return -ENOSPC;13771378/*1379* Kthreads bound to specific cpus cannot be moved to a new cpuset; we1380* cannot change their cpu affinity and isolating such threads by their1381* set of allowed nodes is unnecessary. Thus, cpusets are not1382* applicable for such threads. This prevents checking for success of1383* set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may1384* be changed.1385*/1386if (tsk->flags & PF_THREAD_BOUND)1387return -EINVAL;13881389return 0;1390}13911392static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)1393{1394return security_task_setscheduler(task);1395}13961397/*1398* Protected by cgroup_lock. The nodemasks must be stored globally because1399* dynamically allocating them is not allowed in pre_attach, and they must1400* persist among pre_attach, attach_task, and attach.1401*/1402static cpumask_var_t cpus_attach;1403static nodemask_t cpuset_attach_nodemask_from;1404static nodemask_t cpuset_attach_nodemask_to;14051406/* Set-up work for before attaching each task. 
*/1407static void cpuset_pre_attach(struct cgroup *cont)1408{1409struct cpuset *cs = cgroup_cs(cont);14101411if (cs == &top_cpuset)1412cpumask_copy(cpus_attach, cpu_possible_mask);1413else1414guarantee_online_cpus(cs, cpus_attach);14151416guarantee_online_mems(cs, &cpuset_attach_nodemask_to);1417}14181419/* Per-thread attachment work. */1420static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)1421{1422int err;1423struct cpuset *cs = cgroup_cs(cont);14241425/*1426* can_attach beforehand should guarantee that this doesn't fail.1427* TODO: have a better way to handle failure here1428*/1429err = set_cpus_allowed_ptr(tsk, cpus_attach);1430WARN_ON_ONCE(err);14311432cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);1433cpuset_update_task_spread_flag(cs, tsk);1434}14351436static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,1437struct cgroup *oldcont, struct task_struct *tsk)1438{1439struct mm_struct *mm;1440struct cpuset *cs = cgroup_cs(cont);1441struct cpuset *oldcs = cgroup_cs(oldcont);14421443/*1444* Change mm, possibly for multiple threads in a threadgroup. This is1445* expensive and may sleep.1446*/1447cpuset_attach_nodemask_from = oldcs->mems_allowed;1448cpuset_attach_nodemask_to = cs->mems_allowed;1449mm = get_task_mm(tsk);1450if (mm) {1451mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);1452if (is_memory_migrate(cs))1453cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,1454&cpuset_attach_nodemask_to);1455mmput(mm);1456}1457}14581459/* The various types of files and directories in a cpuset file system */14601461typedef enum {1462FILE_MEMORY_MIGRATE,1463FILE_CPULIST,1464FILE_MEMLIST,1465FILE_CPU_EXCLUSIVE,1466FILE_MEM_EXCLUSIVE,1467FILE_MEM_HARDWALL,1468FILE_SCHED_LOAD_BALANCE,1469FILE_SCHED_RELAX_DOMAIN_LEVEL,1470FILE_MEMORY_PRESSURE_ENABLED,1471FILE_MEMORY_PRESSURE,1472FILE_SPREAD_PAGE,1473FILE_SPREAD_SLAB,1474} cpuset_filetype_t;14751476static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)1477{1478int retval = 0;1479struct cpuset *cs = cgroup_cs(cgrp);1480cpuset_filetype_t type = cft->private;14811482if (!cgroup_lock_live_group(cgrp))1483return -ENODEV;14841485switch (type) {1486case FILE_CPU_EXCLUSIVE:1487retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);1488break;1489case FILE_MEM_EXCLUSIVE:1490retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);1491break;1492case FILE_MEM_HARDWALL:1493retval = update_flag(CS_MEM_HARDWALL, cs, val);1494break;1495case FILE_SCHED_LOAD_BALANCE:1496retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);1497break;1498case FILE_MEMORY_MIGRATE:1499retval = update_flag(CS_MEMORY_MIGRATE, cs, val);1500break;1501case FILE_MEMORY_PRESSURE_ENABLED:1502cpuset_memory_pressure_enabled = !!val;1503break;1504case FILE_MEMORY_PRESSURE:1505retval = -EACCES;1506break;1507case FILE_SPREAD_PAGE:1508retval = update_flag(CS_SPREAD_PAGE, cs, val);1509break;1510case FILE_SPREAD_SLAB:1511retval = update_flag(CS_SPREAD_SLAB, cs, val);1512break;1513default:1514retval = -EINVAL;1515break;1516}1517cgroup_unlock();1518return retval;1519}15201521static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)1522{1523int retval = 0;1524struct cpuset *cs = cgroup_cs(cgrp);1525cpuset_filetype_t type = cft->private;15261527if (!cgroup_lock_live_group(cgrp))1528return -ENODEV;15291530switch (type) {1531case FILE_SCHED_RELAX_DOMAIN_LEVEL:1532retval = update_relax_domain_level(cs, val);1533break;1534default:1535retval = -EINVAL;1536break;1537}1538cgroup_unlock();1539return retval;1540}15411542/*1543* Common 
handling for a write to a "cpus" or "mems" file.1544*/1545static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,1546const char *buf)1547{1548int retval = 0;1549struct cpuset *cs = cgroup_cs(cgrp);1550struct cpuset *trialcs;15511552if (!cgroup_lock_live_group(cgrp))1553return -ENODEV;15541555trialcs = alloc_trial_cpuset(cs);1556if (!trialcs) {1557retval = -ENOMEM;1558goto out;1559}15601561switch (cft->private) {1562case FILE_CPULIST:1563retval = update_cpumask(cs, trialcs, buf);1564break;1565case FILE_MEMLIST:1566retval = update_nodemask(cs, trialcs, buf);1567break;1568default:1569retval = -EINVAL;1570break;1571}15721573free_trial_cpuset(trialcs);1574out:1575cgroup_unlock();1576return retval;1577}15781579/*1580* These ascii lists should be read in a single call, by using a user1581* buffer large enough to hold the entire map. If read in smaller1582* chunks, there is no guarantee of atomicity. Since the display format1583* used, list of ranges of sequential numbers, is variable length,1584* and since these maps can change value dynamically, one could read1585* gibberish by doing partial reads while a list was changing.1586* A single large read to a buffer that crosses a page boundary is1587* ok, because the result being copied to user land is not recomputed1588* across a page fault.1589*/15901591static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)1592{1593size_t count;15941595mutex_lock(&callback_mutex);1596count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);1597mutex_unlock(&callback_mutex);15981599return count;1600}16011602static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)1603{1604size_t count;16051606mutex_lock(&callback_mutex);1607count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);1608mutex_unlock(&callback_mutex);16091610return count;1611}16121613static ssize_t cpuset_common_file_read(struct cgroup *cont,1614struct cftype *cft,1615struct file *file,1616char __user *buf,1617size_t nbytes, loff_t *ppos)1618{1619struct cpuset *cs = cgroup_cs(cont);1620cpuset_filetype_t type = cft->private;1621char *page;1622ssize_t retval = 0;1623char *s;16241625if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))1626return -ENOMEM;16271628s = page;16291630switch (type) {1631case FILE_CPULIST:1632s += cpuset_sprintf_cpulist(s, cs);1633break;1634case FILE_MEMLIST:1635s += cpuset_sprintf_memlist(s, cs);1636break;1637default:1638retval = -EINVAL;1639goto out;1640}1641*s++ = '\n';16421643retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);1644out:1645free_page((unsigned long)page);1646return retval;1647}16481649static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)1650{1651struct cpuset *cs = cgroup_cs(cont);1652cpuset_filetype_t type = cft->private;1653switch (type) {1654case FILE_CPU_EXCLUSIVE:1655return is_cpu_exclusive(cs);1656case FILE_MEM_EXCLUSIVE:1657return is_mem_exclusive(cs);1658case FILE_MEM_HARDWALL:1659return is_mem_hardwall(cs);1660case FILE_SCHED_LOAD_BALANCE:1661return is_sched_load_balance(cs);1662case FILE_MEMORY_MIGRATE:1663return is_memory_migrate(cs);1664case FILE_MEMORY_PRESSURE_ENABLED:1665return cpuset_memory_pressure_enabled;1666case FILE_MEMORY_PRESSURE:1667return fmeter_getrate(&cs->fmeter);1668case FILE_SPREAD_PAGE:1669return is_spread_page(cs);1670case FILE_SPREAD_SLAB:1671return is_spread_slab(cs);1672default:1673BUG();1674}16751676/* Unreachable but makes gcc happy */1677return 0;1678}16791680static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)1681{1682struct cpuset 
*cs = cgroup_cs(cont);1683cpuset_filetype_t type = cft->private;1684switch (type) {1685case FILE_SCHED_RELAX_DOMAIN_LEVEL:1686return cs->relax_domain_level;1687default:1688BUG();1689}16901691/* Unrechable but makes gcc happy */1692return 0;1693}169416951696/*1697* for the common functions, 'private' gives the type of file1698*/16991700static struct cftype files[] = {1701{1702.name = "cpus",1703.read = cpuset_common_file_read,1704.write_string = cpuset_write_resmask,1705.max_write_len = (100U + 6 * NR_CPUS),1706.private = FILE_CPULIST,1707},17081709{1710.name = "mems",1711.read = cpuset_common_file_read,1712.write_string = cpuset_write_resmask,1713.max_write_len = (100U + 6 * MAX_NUMNODES),1714.private = FILE_MEMLIST,1715},17161717{1718.name = "cpu_exclusive",1719.read_u64 = cpuset_read_u64,1720.write_u64 = cpuset_write_u64,1721.private = FILE_CPU_EXCLUSIVE,1722},17231724{1725.name = "mem_exclusive",1726.read_u64 = cpuset_read_u64,1727.write_u64 = cpuset_write_u64,1728.private = FILE_MEM_EXCLUSIVE,1729},17301731{1732.name = "mem_hardwall",1733.read_u64 = cpuset_read_u64,1734.write_u64 = cpuset_write_u64,1735.private = FILE_MEM_HARDWALL,1736},17371738{1739.name = "sched_load_balance",1740.read_u64 = cpuset_read_u64,1741.write_u64 = cpuset_write_u64,1742.private = FILE_SCHED_LOAD_BALANCE,1743},17441745{1746.name = "sched_relax_domain_level",1747.read_s64 = cpuset_read_s64,1748.write_s64 = cpuset_write_s64,1749.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,1750},17511752{1753.name = "memory_migrate",1754.read_u64 = cpuset_read_u64,1755.write_u64 = cpuset_write_u64,1756.private = FILE_MEMORY_MIGRATE,1757},17581759{1760.name = "memory_pressure",1761.read_u64 = cpuset_read_u64,1762.write_u64 = cpuset_write_u64,1763.private = FILE_MEMORY_PRESSURE,1764.mode = S_IRUGO,1765},17661767{1768.name = "memory_spread_page",1769.read_u64 = cpuset_read_u64,1770.write_u64 = cpuset_write_u64,1771.private = FILE_SPREAD_PAGE,1772},17731774{1775.name = "memory_spread_slab",1776.read_u64 = cpuset_read_u64,1777.write_u64 = cpuset_write_u64,1778.private = FILE_SPREAD_SLAB,1779},1780};17811782static struct cftype cft_memory_pressure_enabled = {1783.name = "memory_pressure_enabled",1784.read_u64 = cpuset_read_u64,1785.write_u64 = cpuset_write_u64,1786.private = FILE_MEMORY_PRESSURE_ENABLED,1787};17881789static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)1790{1791int err;17921793err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));1794if (err)1795return err;1796/* memory_pressure_enabled is in root cpuset only */1797if (!cont->parent)1798err = cgroup_add_file(cont, ss,1799&cft_memory_pressure_enabled);1800return err;1801}18021803/*1804* post_clone() is called during cgroup_create() when the1805* clone_children mount argument was specified. The cgroup1806* can not yet have any tasks.1807*1808* Currently we refuse to set up the cgroup - thereby1809* refusing the task to be entered, and as a result refusing1810* the sys_unshare() or clone() which initiated it - if any1811* sibling cpusets have exclusive cpus or mem.1812*1813* If this becomes a problem for some users who wish to1814* allow that scenario, then cpuset_post_clone() could be1815* changed to grant parent->cpus_allowed-sibling_cpus_exclusive1816* (and likewise for mems) to the new cgroup. 
Called with cgroup_mutex1817* held.1818*/1819static void cpuset_post_clone(struct cgroup_subsys *ss,1820struct cgroup *cgroup)1821{1822struct cgroup *parent, *child;1823struct cpuset *cs, *parent_cs;18241825parent = cgroup->parent;1826list_for_each_entry(child, &parent->children, sibling) {1827cs = cgroup_cs(child);1828if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))1829return;1830}1831cs = cgroup_cs(cgroup);1832parent_cs = cgroup_cs(parent);18331834mutex_lock(&callback_mutex);1835cs->mems_allowed = parent_cs->mems_allowed;1836cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);1837mutex_unlock(&callback_mutex);1838return;1839}18401841/*1842* cpuset_create - create a cpuset1843* ss: cpuset cgroup subsystem1844* cont: control group that the new cpuset will be part of1845*/18461847static struct cgroup_subsys_state *cpuset_create(1848struct cgroup_subsys *ss,1849struct cgroup *cont)1850{1851struct cpuset *cs;1852struct cpuset *parent;18531854if (!cont->parent) {1855return &top_cpuset.css;1856}1857parent = cgroup_cs(cont->parent);1858cs = kmalloc(sizeof(*cs), GFP_KERNEL);1859if (!cs)1860return ERR_PTR(-ENOMEM);1861if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {1862kfree(cs);1863return ERR_PTR(-ENOMEM);1864}18651866cs->flags = 0;1867if (is_spread_page(parent))1868set_bit(CS_SPREAD_PAGE, &cs->flags);1869if (is_spread_slab(parent))1870set_bit(CS_SPREAD_SLAB, &cs->flags);1871set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);1872cpumask_clear(cs->cpus_allowed);1873nodes_clear(cs->mems_allowed);1874fmeter_init(&cs->fmeter);1875cs->relax_domain_level = -1;18761877cs->parent = parent;1878number_of_cpusets++;1879return &cs->css ;1880}18811882/*1883* If the cpuset being removed has its flag 'sched_load_balance'1884* enabled, then simulate turning sched_load_balance off, which1885* will call async_rebuild_sched_domains().1886*/18871888static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)1889{1890struct cpuset *cs = cgroup_cs(cont);18911892if (is_sched_load_balance(cs))1893update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);18941895number_of_cpusets--;1896free_cpumask_var(cs->cpus_allowed);1897kfree(cs);1898}18991900struct cgroup_subsys cpuset_subsys = {1901.name = "cpuset",1902.create = cpuset_create,1903.destroy = cpuset_destroy,1904.can_attach = cpuset_can_attach,1905.can_attach_task = cpuset_can_attach_task,1906.pre_attach = cpuset_pre_attach,1907.attach_task = cpuset_attach_task,1908.attach = cpuset_attach,1909.populate = cpuset_populate,1910.post_clone = cpuset_post_clone,1911.subsys_id = cpuset_subsys_id,1912.early_init = 1,1913};19141915/**1916* cpuset_init - initialize cpusets at system boot1917*1918* Description: Initialize top_cpuset and the cpuset internal file system,1919**/19201921int __init cpuset_init(void)1922{1923int err = 0;19241925if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))1926BUG();19271928cpumask_setall(top_cpuset.cpus_allowed);1929nodes_setall(top_cpuset.mems_allowed);19301931fmeter_init(&top_cpuset.fmeter);1932set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);1933top_cpuset.relax_domain_level = -1;19341935err = register_filesystem(&cpuset_fs_type);1936if (err < 0)1937return err;19381939if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))1940BUG();19411942number_of_cpusets = 1;1943return 0;1944}19451946/**1947* cpuset_do_move_task - move a given task to another cpuset1948* @tsk: pointer to task_struct the task to move1949* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner1950*1951* Called by cgroup_scan_tasks() for each task 
in a cgroup.1952* Return nonzero to stop the walk through the tasks.1953*/1954static void cpuset_do_move_task(struct task_struct *tsk,1955struct cgroup_scanner *scan)1956{1957struct cgroup *new_cgroup = scan->data;19581959cgroup_attach_task(new_cgroup, tsk);1960}19611962/**1963* move_member_tasks_to_cpuset - move tasks from one cpuset to another1964* @from: cpuset in which the tasks currently reside1965* @to: cpuset to which the tasks will be moved1966*1967* Called with cgroup_mutex held1968* callback_mutex must not be held, as cpuset_attach() will take it.1969*1970* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,1971* calling callback functions for each.1972*/1973static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)1974{1975struct cgroup_scanner scan;19761977scan.cg = from->css.cgroup;1978scan.test_task = NULL; /* select all tasks in cgroup */1979scan.process_task = cpuset_do_move_task;1980scan.heap = NULL;1981scan.data = to->css.cgroup;19821983if (cgroup_scan_tasks(&scan))1984printk(KERN_ERR "move_member_tasks_to_cpuset: "1985"cgroup_scan_tasks failed\n");1986}19871988/*1989* If CPU and/or memory hotplug handlers, below, unplug any CPUs1990* or memory nodes, we need to walk over the cpuset hierarchy,1991* removing that CPU or node from all cpusets. If this removes the1992* last CPU or node from a cpuset, then move the tasks in the empty1993* cpuset to its next-highest non-empty parent.1994*1995* Called with cgroup_mutex held1996* callback_mutex must not be held, as cpuset_attach() will take it.1997*/1998static void remove_tasks_in_empty_cpuset(struct cpuset *cs)1999{2000struct cpuset *parent;20012002/*2003* The cgroup's css_sets list is in use if there are tasks2004* in the cpuset; the list is empty if there are none;2005* the cs->css.refcnt seems always 0.2006*/2007if (list_empty(&cs->css.cgroup->css_sets))2008return;20092010/*2011* Find its next-highest non-empty parent, (top cpuset2012* has online cpus, so can't be empty).2013*/2014parent = cs->parent;2015while (cpumask_empty(parent->cpus_allowed) ||2016nodes_empty(parent->mems_allowed))2017parent = parent->parent;20182019move_member_tasks_to_cpuset(cs, parent);2020}20212022/*2023* Walk the specified cpuset subtree and look for empty cpusets.2024* The tasks of such cpuset must be moved to a parent cpuset.2025*2026* Called with cgroup_mutex held. We take callback_mutex to modify2027* cpus_allowed and mems_allowed.2028*2029* This walk processes the tree from top to bottom, completing one layer2030* before dropping down to the next. It always processes a node before2031* any of its children.2032*2033* For now, since we lack memory hot unplug, we'll never see a cpuset2034* that has tasks along with an empty 'mems'. 
 * But if we did see such a cpuset, we'd handle it just like we do
 * if its 'cpus' was empty.
 */
static void scan_for_empty_cpusets(struct cpuset *root)
{
	LIST_HEAD(queue);
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset *child;	/* scans child cpusets of cp */
	struct cgroup *cont;
	static nodemask_t oldmems;	/* protected by cgroup_mutex */

	list_add_tail((struct list_head *)&root->stack_list, &queue);

	while (!list_empty(&queue)) {
		cp = list_first_entry(&queue, struct cpuset, stack_list);
		list_del(queue.next);
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &queue);
		}

		/* Continue past cpusets with all cpus, mems online */
		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
			continue;

		oldmems = cp->mems_allowed;

		/* Remove offline cpus and mems from this cpuset. */
		mutex_lock(&callback_mutex);
		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
			    cpu_active_mask);
		nodes_and(cp->mems_allowed, cp->mems_allowed,
			  node_states[N_HIGH_MEMORY]);
		mutex_unlock(&callback_mutex);

		/* Move tasks from the empty cpuset to a parent */
		if (cpumask_empty(cp->cpus_allowed) ||
		    nodes_empty(cp->mems_allowed))
			remove_tasks_in_empty_cpuset(cp);
		else {
			update_tasks_cpumask(cp, NULL);
			update_tasks_nodemask(cp, &oldmems, NULL);
		}
	}
}

/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period. This is necessary in order to make cpusets transparent
 * (of no effect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_active_mask on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().
 * Needs to call cgroup_lock() before calling generate_sched_domains().
 */
void cpuset_update_active_cpus(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	cgroup_lock();
	mutex_lock(&callback_mutex);
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	mutex_unlock(&callback_mutex);
	scan_for_empty_cpusets(&top_cpuset);
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
 * See also cpuset_update_active_cpus(), above, which does the same
 * for CPUs.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	static nodemask_t oldmems;	/* protected by cgroup_mutex */

	cgroup_lock();
	switch (action) {
	case MEM_ONLINE:
		oldmems = top_cpuset.mems_allowed;
		mutex_lock(&callback_mutex);
		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
		mutex_unlock(&callback_mutex);
		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
		break;
	case MEM_OFFLINE:
		/*
		 * needn't update top_cpuset.mems_allowed explicitly because
		 * scan_for_empty_cpusets() will update it.
		 */
		scan_for_empty_cpusets(&top_cpuset);
		break;
	default:
		break;
	}
	cgroup_unlock();

	return NOTIFY_OK;
}
#endif

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 **/

void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

	hotplug_memory_notifier(cpuset_track_online_nodes, 10);

	cpuset_wq = create_singlethread_workqueue("cpuset");
	BUG_ON(!cpuset_wq);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of cpu_online_map, even if this means going outside the
 * task's cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);
}

int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpuset *cs;
	int cpu;

	rcu_read_lock();
	cs = task_cs(tsk);
	if (cs)
		do_set_cpus_allowed(tsk, cs->cpus_allowed);
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed. However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in task_cs()->cpus_allowed.
	 * Otherwise we can temporarily set any mask even if it is not
	 * right from the task_cs() point of view; the pending
	 * set_cpus_allowed_ptr() will fix things.
	 */

	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
	if (cpu >= nr_cpu_ids) {
		/*
		 * Either tsk->cpus_allowed is wrong (see above) or it
		 * is actually empty. The latter case is only possible
		 * if we are racing with remove_tasks_in_empty_cpuset().
		 * Like above we can temporarily set any mask and rely on
		 * set_cpus_allowed_ptr() as the synchronization point.
		 */
		do_set_cpus_allowed(tsk, cpu_possible_mask);
		cpu = cpumask_any(cpu_active_mask);
	}

	return cpu;
}

void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
 * task's cpuset.
 **/

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_mems(task_cs(tsk), &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset. Call holding
 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
		cs = cs->parent;
	return cs;
}

/**
 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
 * set, yes, we can always allocate. If node is in our task's mems_allowed,
 * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
 * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
 * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
 * flag, yes.
 * Otherwise, no.
 *
 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
 * might sleep, and might allow a node from an enclosing cpuset.
 *
 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
 * cpusets, and never sleeps.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.
 * By the time any such calls get to this routine, we should just
 * shut up and say 'yes'.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current task's cpuset
 * unless the task has been OOM killed and is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex. The
 * __alloc_pages() routine only calls here with the __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current task's mems_allowed came up empty on the first pass over
 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking callback_mutex.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags. That logic and the checks below have the combined
 * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	TIF_MEMDIE   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current task's mems_allowed ok.
 *
 * Rule:
 *	Don't call cpuset_node_allowed_softwall() if you can't sleep,
 *	unless you pass in the __GFP_HARDWALL flag set in gfp_mask,
 *	which disables the code that might scan up ancestor cpusets
 *	and sleep.
 */
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
	const struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;			/* is allocation in zone z allowed? */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	mutex_lock(&callback_mutex);

	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}

/*
 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
 * set, yes, we can always allocate. If node is in our task's mems_allowed,
 * yes.
 * If the task has been OOM killed and has access to memory reserves as
 * specified by the TIF_MEMDIE flag, yes.
 * Otherwise, no.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first. By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * Unlike the cpuset_node_allowed_softwall() variant, above,
 * this variant requires that the node be in the current task's
 * mems_allowed or that we're in interrupt. It does not scan up the
 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
 * It never sleeps.
 */
int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	return 0;
}

/**
 * cpuset_unlock - release lock on cpuset changes
 *
 * Undo the lock taken in a previous cpuset_lock() call.
 */

void cpuset_unlock(void)
{
	mutex_unlock(&callback_mutex);
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as those used for
 * file system buffers and inode caches, then instead of starting
 * the search for a free page on the local node, spread the starting
 * node around the task's mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online. So it
 * should not be possible for the following code to return an
 * offline node. But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start. The zonelist passed to
 * __alloc_pages() will include all nodes.
 * If the slab allocator is passed an offline node, it will fall back
 * to the local node. See kmem_cache_alloc_node().
 */

static int cpuset_spread_node(int *rotor)
{
	int node;

	node = next_node(*rotor, current->mems_allowed);
	if (node == MAX_NUMNODES)
		node = first_node(current->mems_allowed);
	*rotor = node;
	return node;
}

int cpuset_mem_spread_node(void)
{
	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2. Used by the OOM killer to determine if
 * the memory usage of one task might impact the memory available
 * to the other.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
 * @tsk: pointer to task_struct of some task.
 *
 * Description: Prints @tsk's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log. Must hold task_lock(tsk) to allow
 * dereferencing task_cs(tsk).
 */
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
	struct dentry *dentry;

	dentry = task_cs(tsk)->css.cgroup->dentry;
	spin_lock(&cpuset_buffer_lock);
	snprintf(cpuset_name, CPUSET_NAME_LEN, "%s",
		 dentry ? (const char *)dentry->d_name.name : "/");
	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
			   tsk->mems_allowed);
	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
	       tsk->comm, cpuset_name, cpuset_nodelist);
	spin_unlock(&cpuset_buffer_lock);
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".
 * Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/

void __cpuset_memory_pressure_bump(void)
{
	task_lock(current);
	fmeter_markevent(&task_cs(current)->fmeter);
	task_unlock(current);
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cgroup_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = -EINVAL;
	cgroup_lock();
	css = task_subsys_state(tsk, cpuset_subsys_id);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	if (retval < 0)
		goto out_unlock;
	seq_puts(m, buf);
	seq_putc(m, '\n');
out_unlock:
	cgroup_unlock();
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}

static int cpuset_open(struct inode *inode, struct file *file)
{
	struct pid *pid = PROC_I(inode)->pid;
	return single_open(file, proc_cpuset_show, pid);
}

const struct file_operations proc_cpuset_operations = {
	.open		= cpuset_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t");
	seq_nodemask(m, &task->mems_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Mems_allowed_list:\t");
	seq_nodemask_list(m, &task->mems_allowed);
	seq_printf(m, "\n");
}
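
/*
 * Illustrative only (compiled out, not part of the kernel build): a
 * minimal userspace sketch of the interfaces implemented above --
 * reading a task's cpuset path from /proc/<pid>/cpuset, which
 * proc_cpuset_show() produces, and the Mems_allowed / Mems_allowed_list
 * lines that cpuset_task_status_allowed() emits into /proc/<pid>/status.
 * Assumes a kernel built with CONFIG_CPUSETS and CONFIG_PROC_PID_CPUSET.
 */
#if 0
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f;

	/* Path of the cpuset this task is attached to. */
	f = fopen("/proc/self/cpuset", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}

	/* Mems_allowed lines from /proc/self/status. */
	f = fopen("/proc/self/status", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "Mems_allowed", 12))
				fputs(line, stdout);
		fclose(f);
	}
	return 0;
}
#endif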