/*
 * linux/mm/oom_kill.c
 *
 * Copyright (C) 1998,2000 Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 * Copyright (C) 2010 Google, Inc.
 *	Rewritten by David Rientjes
 *
 * The routines in this file are used to kill a process when
 * we're seriously out of memory. This gets called from __alloc_pages()
 * in mm/page_alloc.c when we really run out of memory.
 *
 * Since we won't call these routines often (on a well-configured
 * machine) this file will double as a 'coding guide' and a signpost
 * for newbie kernel hackers. It features several pointers to major
 * kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>

int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);

/**
 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
 * @new_val: new oom_score_adj value
 *
 * Sets the oom_score_adj value for current to @new_val with proper
 * synchronization and returns the old value.  Usually used to temporarily
 * set a value, save the old value in the caller, and then reinstate it later.
 */
int test_set_oom_score_adj(int new_val)
{
	struct sighand_struct *sighand = current->sighand;
	int old_val;

	spin_lock_irq(&sighand->siglock);
	old_val = current->signal->oom_score_adj;
	if (new_val != old_val) {
		if (new_val == OOM_SCORE_ADJ_MIN)
			atomic_inc(&current->mm->oom_disable_count);
		else if (old_val == OOM_SCORE_ADJ_MIN)
			atomic_dec(&current->mm->oom_disable_count);
		current->signal->oom_score_adj = new_val;
	}
	spin_unlock_irq(&sighand->siglock);

	return old_val;
}
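
/*
 * Illustrative sketch (hypothetical, not a caller taken from this file) of
 * the save/restore pattern described above:
 *
 *	int old_val = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
 *	... allocate or touch memory that should be preferred for killing ...
 *	test_set_oom_score_adj(old_val);
 */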

#ifdef CONFIG_NUMA
/**
 * has_intersects_mems_allowed() - check task eligibility for kill
 * @tsk: task struct of which task to consider
 * @mask: nodemask passed to page allocator for mempolicy ooms
 *
 * Task eligibility is determined by whether or not a candidate task, @tsk,
 * shares the same mempolicy nodes as current if it is bound by such a policy
 * and whether or not it has the same set of allowed cpuset nodes.
 */
static bool has_intersects_mems_allowed(struct task_struct *tsk,
					const nodemask_t *mask)
{
	struct task_struct *start = tsk;

	do {
		if (mask) {
			/*
			 * If this is a mempolicy constrained oom, tsk's
			 * cpuset is irrelevant.  Only return true if its
			 * mempolicy intersects current, otherwise it may be
			 * needlessly killed.
			 */
			if (mempolicy_nodemask_intersects(tsk, mask))
				return true;
		} else {
			/*
			 * This is not a mempolicy constrained oom, so only
			 * check the mems of tsk's cpuset.
			 */
			if (cpuset_mems_allowed_intersects(current, tsk))
				return true;
		}
	} while_each_thread(start, tsk);

	return false;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
					const nodemask_t *mask)
{
	return true;
}
#endif /* CONFIG_NUMA */

/*
 * The process p may have detached its own ->mm while exiting or through
 * use_mm(), but one or more of its subthreads may still have a valid
 * pointer.  Return p, or any of its subthreads with a valid ->mm, with
 * task_lock() held.
 */
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
	struct task_struct *t = p;

	do {
		task_lock(t);
		if (likely(t->mm))
			return t;
		task_unlock(t);
	} while_each_thread(p, t);

	return NULL;
}

/* Return true if the task is not adequate as a candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
		const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
	if (is_global_init(p))
		return true;
	if (p->flags & PF_KTHREAD)
		return true;

	/* When in mem_cgroup_out_of_memory() and p is not a member of the group */
	if (mem && !task_in_mem_cgroup(p, mem))
		return true;

	/* p may not have freeable memory in nodemask */
	if (!has_intersects_mems_allowed(p, nodemask))
		return true;

	return false;
}

/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of the task whose badness we should calculate
 * @mem: current's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 * @totalpages: total present RAM allowed for page allocation
 *
 * The heuristic for determining which task to kill is made to be as simple and
 * predictable as possible.  The goal is to return the highest value for the
 * task consuming the most memory to avoid subsequent oom failures.
 */
unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
		      const nodemask_t *nodemask, unsigned long totalpages)
{
	int points;

	if (oom_unkillable_task(p, mem, nodemask))
		return 0;

	p = find_lock_task_mm(p);
	if (!p)
		return 0;

	/*
	 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
	 * so the entire heuristic doesn't need to be executed for something
	 * that cannot be killed.
	 */
	if (atomic_read(&p->mm->oom_disable_count)) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The memory controller may have a limit of 0 bytes, so avoid a divide
	 * by zero, if necessary.
	 */
	if (!totalpages)
		totalpages = 1;

	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	points = get_mm_rss(p->mm) + p->mm->nr_ptes;
	points += get_mm_counter(p->mm, MM_SWAPENTS);

	points *= 1000;
	points /= totalpages;
	task_unlock(p);

	/*
	 * Root processes get 3% bonus, just like the __vm_enough_memory()
	 * implementation used by LSMs.
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
		points -= 30;

	/*
	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
	 * either completely disable oom killing or always prefer a certain
	 * task.
	 */
	points += p->signal->oom_score_adj;

	/*
	 * Never return 0 for an eligible task that may be killed since it's
	 * possible that no single user task uses more than 0.1% of memory and
	 * no single admin task uses more than 3.0%.
	 */
	if (points <= 0)
		return 1;
	return (points < 1000) ? points : 1000;
}
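
/*
 * Worked example of the 0..1000 scale above, with illustrative numbers only:
 * assume totalpages = 1048576 (4GB of RAM plus swap in 4K pages) and a task
 * whose rss + page tables + swap entries total 262144 pages.  The baseline
 * is 262144 * 1000 / 1048576 = 250 (25.0% of memory).  A task with
 * CAP_SYS_ADMIN is credited 3%, giving 220, and an oom_score_adj of -300
 * would drive the total negative, which is then clamped to 1 so the task
 * remains eligible for killing.
 */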

/*
 * Determine the type of allocation constraint.
 */
#ifdef CONFIG_NUMA
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
				gfp_t gfp_mask, nodemask_t *nodemask,
				unsigned long *totalpages)
{
	struct zone *zone;
	struct zoneref *z;
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	bool cpuset_limited = false;
	int nid;

	/* Default to all available memory */
	*totalpages = totalram_pages + total_swap_pages;

	if (!zonelist)
		return CONSTRAINT_NONE;
	/*
	 * We reach here for a __GFP_THISNODE allocation only when
	 * __GFP_NOFAIL is also used.  Avoid killing current; a random
	 * task has to be killed in this case.  Ideally this would be
	 * CONSTRAINT_THISNODE, but there is no way to handle it yet.
	 */
	if (gfp_mask & __GFP_THISNODE)
		return CONSTRAINT_NONE;

	/*
	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
	 * the page allocator means a mempolicy is in effect.  Cpuset policy
	 * is enforced in get_page_from_freelist().
	 */
	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
		*totalpages = total_swap_pages;
		for_each_node_mask(nid, *nodemask)
			*totalpages += node_spanned_pages(nid);
		return CONSTRAINT_MEMORY_POLICY;
	}

	/* Check whether this allocation failure is caused by the cpuset's wall function */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
			high_zoneidx, nodemask)
		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
			cpuset_limited = true;

	if (cpuset_limited) {
		*totalpages = total_swap_pages;
		for_each_node_mask(nid, cpuset_current_mems_allowed)
			*totalpages += node_spanned_pages(nid);
		return CONSTRAINT_CPUSET;
	}
	return CONSTRAINT_NONE;
}
#else
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
				gfp_t gfp_mask, nodemask_t *nodemask,
				unsigned long *totalpages)
{
	*totalpages = totalram_pages + total_swap_pages;
	return CONSTRAINT_NONE;
}
#endif

/*
 * Simple selection loop. We choose the process with the highest
 * number of 'points'.  We expect the caller will lock the tasklist.
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
static struct task_struct *select_bad_process(unsigned int *ppoints,
		unsigned long totalpages, struct mem_cgroup *mem,
		const nodemask_t *nodemask)
{
	struct task_struct *g, *p;
	struct task_struct *chosen = NULL;
	*ppoints = 0;

	do_each_thread(g, p) {
		unsigned int points;

		if (!p->mm)
			continue;
		if (oom_unkillable_task(p, mem, nodemask))
			continue;

		/*
		 * This task already has access to memory reserves and is
		 * being killed. Don't allow any other task access to the
		 * memory reserve.
		 *
		 * Note: this may have a chance of deadlock if it gets
		 * blocked waiting for another task which itself is waiting
		 * for memory. Is there a better alternative?
		 */
		if (test_tsk_thread_flag(p, TIF_MEMDIE))
			return ERR_PTR(-1UL);

		if (p->flags & PF_EXITING) {
			/*
			 * If p is the current task and is in the process of
			 * releasing memory, we allow the "kill" to set
			 * TIF_MEMDIE, which will allow it to gain access to
			 * memory reserves.  Otherwise, it may stall forever.
			 *
			 * The loop isn't broken here, however, in case other
			 * threads are found to have already been oom killed.
			 */
			if (p == current) {
				chosen = p;
				*ppoints = 1000;
			} else {
				/*
				 * If this task is not being ptraced on exit,
				 * then wait for it to finish before killing
				 * some other task unnecessarily.
				 */
				if (!(task_ptrace(p->group_leader) &
							PT_TRACE_EXIT))
					return ERR_PTR(-1UL);
			}
		}

		points = oom_badness(p, mem, nodemask, totalpages);
		if (points > *ppoints) {
			chosen = p;
			*ppoints = points;
		}
	} while_each_thread(g, p);

	return chosen;
}
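
/*
 * Return contract of select_bad_process(), as relied on by its callers in
 * this file: ERR_PTR(-1UL) tells the caller to abort because a task is
 * already dying (it has TIF_MEMDIE set, or it is PF_EXITING and not being
 * ptraced on exit); NULL means no eligible candidate was found; any other
 * value is the chosen victim, with *ppoints set to its badness score.
 */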

/**
 * dump_tasks - dump current memory state of all system tasks
 * @mem: current's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * Dumps the current memory state of all eligible tasks.  Tasks not in the same
 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 * are not shown.
 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
 * value, oom_score_adj value, and name.
 *
 * Call with tasklist_lock read-locked.
 */
static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
	struct task_struct *p;
	struct task_struct *task;

	pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
	for_each_process(p) {
		if (oom_unkillable_task(p, mem, nodemask))
			continue;

		task = find_lock_task_mm(p);
		if (!task) {
			/*
			 * This is a kthread or all of p's threads have already
			 * detached their mm's.  There's no need to report
			 * them; they can't be oom killed anyway.
			 */
			continue;
		}

		pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
			task->pid, task_uid(task), task->tgid,
			task->mm->total_vm, get_mm_rss(task->mm),
			task_cpu(task), task->signal->oom_adj,
			task->signal->oom_score_adj, task->comm);
		task_unlock(task);
	}
}

static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
			struct mem_cgroup *mem, const nodemask_t *nodemask)
{
	task_lock(current);
	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
		"oom_adj=%d, oom_score_adj=%d\n",
		current->comm, gfp_mask, order, current->signal->oom_adj,
		current->signal->oom_score_adj);
	cpuset_print_task_mems_allowed(current);
	task_unlock(current);
	dump_stack();
	mem_cgroup_print_oom_info(mem, p);
	show_mem(SHOW_MEM_FILTER_NODES);
	if (sysctl_oom_dump_tasks)
		dump_tasks(mem, nodemask);
}
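
/*
 * The task dump above is gated by sysctl_oom_dump_tasks (normally exposed as
 * /proc/sys/vm/oom_dump_tasks, enabled by default here); it can be worth
 * disabling on systems with a very large number of tasks, where walking and
 * printing the whole task list is itself expensive.
 */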

#define K(x) ((x) << (PAGE_SHIFT-10))
static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
{
	struct task_struct *q;
	struct mm_struct *mm;

	p = find_lock_task_mm(p);
	if (!p)
		return 1;

	/* mm cannot be safely dereferenced after task_unlock(p) */
	mm = p->mm;

	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
		task_pid_nr(p), p->comm, K(p->mm->total_vm),
		K(get_mm_counter(p->mm, MM_ANONPAGES)),
		K(get_mm_counter(p->mm, MM_FILEPAGES)));
	task_unlock(p);

	/*
	 * Kill all processes sharing p->mm in other thread groups, if any.
	 * They don't get access to memory reserves or a higher scheduler
	 * priority, though, to avoid depletion of all memory or task
	 * starvation.  This prevents mm->mmap_sem livelock when an oom killed
	 * task cannot exit because it requires the semaphore and it's contended
	 * by another thread trying to allocate memory itself.  That thread will
	 * now get access to memory reserves since it has a pending fatal
	 * signal.
	 */
	for_each_process(q)
		if (q->mm == mm && !same_thread_group(q, p)) {
			task_lock(q);	/* Protect ->comm from prctl() */
			pr_err("Kill process %d (%s) sharing same memory\n",
				task_pid_nr(q), q->comm);
			task_unlock(q);
			force_sig(SIGKILL, q);
		}

	set_tsk_thread_flag(p, TIF_MEMDIE);
	force_sig(SIGKILL, p);

	return 0;
}
#undef K

static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
			    unsigned int points, unsigned long totalpages,
			    struct mem_cgroup *mem, nodemask_t *nodemask,
			    const char *message)
{
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t = p;
	unsigned int victim_points = 0;

	if (printk_ratelimit())
		dump_header(p, gfp_mask, order, mem, nodemask);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just set TIF_MEMDIE so it can die quickly
	 */
	if (p->flags & PF_EXITING) {
		set_tsk_thread_flag(p, TIF_MEMDIE);
		return 0;
	}

	task_lock(p);
	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);
	task_unlock(p);

	/*
	 * If any of p's children has a different mm and is eligible for kill,
	 * the one with the highest badness() score is sacrificed for its
	 * parent.  This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
	do {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			if (child->mm == p->mm)
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			child_points = oom_badness(child, mem, nodemask,
								totalpages);
			if (child_points > victim_points) {
				victim = child;
				victim_points = child_points;
			}
		}
	} while_each_thread(p, t);

	return oom_kill_task(victim, mem);
}

/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
				int order, const nodemask_t *nodemask)
{
	if (likely(!sysctl_panic_on_oom))
		return;
	if (sysctl_panic_on_oom != 2) {
		/*
		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
		 * does not panic for cpuset, mempolicy, or memcg allocation
		 * failures.
		 */
		if (constraint != CONSTRAINT_NONE)
			return;
	}
	read_lock(&tasklist_lock);
	dump_header(NULL, gfp_mask, order, NULL, nodemask);
	read_unlock(&tasklist_lock);
	panic("Out of memory: %s panic_on_oom is enabled\n",
		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
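
/*
 * Sketch of the sysctl semantics implemented above (normally exposed as
 * /proc/sys/vm/panic_on_oom):
 *
 *	0 - never panic, try to kill a task instead (the default);
 *	1 - panic only for system-wide (CONSTRAINT_NONE) ooms, not for
 *	    cpuset, mempolicy or memcg constrained failures;
 *	2 - always panic, even for constrained ooms.
 */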

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
	unsigned long limit;
	unsigned int points = 0;
	struct task_struct *p;

	/*
	 * If current has a pending SIGKILL, then automatically select it.  The
	 * goal is to allow it to allocate so that it may quickly exit and free
	 * its memory.
	 */
	if (fatal_signal_pending(current)) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
	limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
	read_lock(&tasklist_lock);
retry:
	p = select_bad_process(&points, limit, mem, NULL);
	if (!p || PTR_ERR(p) == -1UL)
		goto out;

	if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
				"Memory cgroup out of memory"))
		goto retry;
out:
	read_unlock(&tasklist_lock);
}
#endif

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

int unregister_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
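
/*
 * Hypothetical notifier sketch showing the calling convention used by
 * blocking_notifier_call_chain() in out_of_memory() below: the callback is
 * handed a pointer to the "freed" counter and should add any pages it was
 * able to release.  The names below are illustrative, not existing kernel
 * symbols:
 *
 *	static int example_oom_notify(struct notifier_block *nb,
 *				      unsigned long dummy, void *parm)
 *	{
 *		unsigned long *freed = parm;
 *
 *		*freed += example_release_some_pages();
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_oom_nb = {
 *		.notifier_call = example_oom_notify,
 *	};
 *
 *	register_oom_notifier(&example_oom_nb);
 */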

/*
 * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
 * if a parallel OOM killing is already taking place that includes a zone in
 * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
 */
int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
	struct zoneref *z;
	struct zone *zone;
	int ret = 1;

	spin_lock(&zone_scan_lock);
	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		if (zone_is_oom_locked(zone)) {
			ret = 0;
			goto out;
		}
	}

	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		/*
		 * Lock each zone in the zonelist under zone_scan_lock so a
		 * parallel invocation of try_set_zonelist_oom() doesn't succeed
		 * when it shouldn't.
		 */
		zone_set_flag(zone, ZONE_OOM_LOCKED);
	}

out:
	spin_unlock(&zone_scan_lock);
	return ret;
}

/*
 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
 * allocation attempts with zonelists containing them may now recall the OOM
 * killer, if necessary.
 */
void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
	struct zoneref *z;
	struct zone *zone;

	spin_lock(&zone_scan_lock);
	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		zone_clear_flag(zone, ZONE_OOM_LOCKED);
	}
	spin_unlock(&zone_scan_lock);
}

/*
 * Try to acquire the oom killer lock for all system zones.  Returns zero if a
 * parallel oom killing is taking place, otherwise locks all zones and returns
 * non-zero.
 */
static int try_set_system_oom(void)
{
	struct zone *zone;
	int ret = 1;

	spin_lock(&zone_scan_lock);
	for_each_populated_zone(zone)
		if (zone_is_oom_locked(zone)) {
			ret = 0;
			goto out;
		}
	for_each_populated_zone(zone)
		zone_set_flag(zone, ZONE_OOM_LOCKED);
out:
	spin_unlock(&zone_scan_lock);
	return ret;
}

/*
 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
 * attempts or page faults may now recall the oom killer, if necessary.
 */
static void clear_system_oom(void)
{
	struct zone *zone;

	spin_lock(&zone_scan_lock);
	for_each_populated_zone(zone)
		zone_clear_flag(zone, ZONE_OOM_LOCKED);
	spin_unlock(&zone_scan_lock);
}
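
/*
 * The two try_set/clear pairs above implement a simple "one oom kill at a
 * time" lock: the page allocator is expected to bracket its call to
 * out_of_memory() with try_set_zonelist_oom()/clear_zonelist_oom() for its
 * zonelist, while pagefault_out_of_memory() below brackets it with the
 * system-wide try_set_system_oom()/clear_system_oom() variant.
 */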

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @zonelist: zonelist pointer
 * @gfp_mask: memory allocation flags
 * @order: amount of memory being requested as a power of 2
 * @nodemask: nodemask passed to page allocator
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
		int order, nodemask_t *nodemask)
{
	const nodemask_t *mpol_mask;
	struct task_struct *p;
	unsigned long totalpages;
	unsigned long freed = 0;
	unsigned int points;
	enum oom_constraint constraint = CONSTRAINT_NONE;
	int killed = 0;

	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	if (freed > 0)
		/* Got some memory back in the last second. */
		return;

	/*
	 * If current has a pending SIGKILL, then automatically select it.  The
	 * goal is to allow it to allocate so that it may quickly exit and free
	 * its memory.
	 */
	if (fatal_signal_pending(current)) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA) that may require different handling.
	 */
	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
						&totalpages);
	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);

	read_lock(&tasklist_lock);
	if (sysctl_oom_kill_allocating_task &&
	    !oom_unkillable_task(current, NULL, nodemask) &&
	    current->mm && !atomic_read(&current->mm->oom_disable_count)) {
		/*
		 * oom_kill_process() needs tasklist_lock held.  If it returns
		 * non-zero, current could not be killed so we must fall back
		 * to the tasklist scan.
		 */
		if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
				NULL, nodemask,
				"Out of memory (oom_kill_allocating_task)"))
			goto out;
	}

retry:
	p = select_bad_process(&points, totalpages, NULL, mpol_mask);
	if (PTR_ERR(p) == -1UL)
		goto out;

	/* Found nothing?!?! Either we hang forever, or we panic. */
	if (!p) {
		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
		read_unlock(&tasklist_lock);
		panic("Out of memory and no killable processes...\n");
	}

	if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
				nodemask, "Out of memory"))
		goto retry;
	killed = 1;
out:
	read_unlock(&tasklist_lock);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry to allocate memory unless "p" is current
	 */
	if (killed && !test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}

/*
 * The pagefault handler calls here because it is out of memory, so kill a
 * memory-hogging task.  If a populated zone has ZONE_OOM_LOCKED set, a parallel
 * oom killing is already in progress so do nothing.  If a task is found with
 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
 */
void pagefault_out_of_memory(void)
{
	if (try_set_system_oom()) {
		out_of_memory(NULL, 0, 0, NULL);
		clear_system_oom();
	}
	if (!test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}