/*1* Copyright (C) 2008, 2009 Intel Corporation2* Authors: Andi Kleen, Fengguang Wu3*4* This software may be redistributed and/or modified under the terms of5* the GNU General Public License ("GPL") version 2 only as published by the6* Free Software Foundation.7*8* High level machine check handler. Handles pages reported by the9* hardware as being corrupted usually due to a multi-bit ECC memory or cache10* failure.11*12* In addition there is a "soft offline" entry point that allows stop using13* not-yet-corrupted-by-suspicious pages without killing anything.14*15* Handles page cache pages in various states. The tricky part16* here is that we can access any page asynchronously in respect to17* other VM users, because memory failures could happen anytime and18* anywhere. This could violate some of their assumptions. This is why19* this code has to be extremely careful. Generally it tries to use20* normal locking rules, as in get the standard locks, even if that means21* the error handling takes potentially a long time.22*23* There are several operations here with exponential complexity because24* of unsuitable VM data structures. For example the operation to map back25* from RMAP chains to processes has to walk the complete process list and26* has non linear complexity with the number. But since memory corruptions27* are rare we hope to get away with this. This avoids impacting the core28* VM.29*/3031/*32* Notebook:33* - hugetlb needs more code34* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages35* - pass bad pages to kdump next kernel36*/37#include <linux/kernel.h>38#include <linux/mm.h>39#include <linux/page-flags.h>40#include <linux/kernel-page-flags.h>41#include <linux/sched.h>42#include <linux/ksm.h>43#include <linux/rmap.h>44#include <linux/pagemap.h>45#include <linux/swap.h>46#include <linux/backing-dev.h>47#include <linux/migrate.h>48#include <linux/page-isolation.h>49#include <linux/suspend.h>50#include <linux/slab.h>51#include <linux/swapops.h>52#include <linux/hugetlb.h>53#include <linux/memory_hotplug.h>54#include <linux/mm_inline.h>55#include "internal.h"5657int sysctl_memory_failure_early_kill __read_mostly = 0;5859int sysctl_memory_failure_recovery __read_mostly = 1;6061atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);6263#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)6465u32 hwpoison_filter_enable = 0;66u32 hwpoison_filter_dev_major = ~0U;67u32 hwpoison_filter_dev_minor = ~0U;68u64 hwpoison_filter_flags_mask;69u64 hwpoison_filter_flags_value;70EXPORT_SYMBOL_GPL(hwpoison_filter_enable);71EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);72EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);73EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);74EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);7576static int hwpoison_filter_dev(struct page *p)77{78struct address_space *mapping;79dev_t dev;8081if (hwpoison_filter_dev_major == ~0U &&82hwpoison_filter_dev_minor == ~0U)83return 0;8485/*86* page_mapping() does not accept slab pages.87*/88if (PageSlab(p))89return -EINVAL;9091mapping = page_mapping(p);92if (mapping == NULL || mapping->host == NULL)93return -EINVAL;9495dev = mapping->host->i_sb->s_dev;96if (hwpoison_filter_dev_major != ~0U &&97hwpoison_filter_dev_major != MAJOR(dev))98return -EINVAL;99if (hwpoison_filter_dev_minor != ~0U &&100hwpoison_filter_dev_minor != MINOR(dev))101return -EINVAL;102103return 0;104}105106static int hwpoison_filter_flags(struct page *p)107{108if (!hwpoison_filter_flags_mask)109return 0;110111if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==112hwpoison_filter_flags_value)113return 0;114else115return -EINVAL;116}117118/*119* This allows stress tests to limit test scope to a collection of tasks120* by putting them under some memcg. This prevents killing unrelated/important121* processes such as /sbin/init. Note that the target task may share clean122* pages with init (eg. libc text), which is harmless. If the target task123* share _dirty_ pages with another task B, the test scheme must make sure B124* is also included in the memcg. At last, due to race conditions this filter125* can only guarantee that the page either belongs to the memcg tasks, or is126* a freed page.127*/128#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP129u64 hwpoison_filter_memcg;130EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);131static int hwpoison_filter_task(struct page *p)132{133struct mem_cgroup *mem;134struct cgroup_subsys_state *css;135unsigned long ino;136137if (!hwpoison_filter_memcg)138return 0;139140mem = try_get_mem_cgroup_from_page(p);141if (!mem)142return -EINVAL;143144css = mem_cgroup_css(mem);145/* root_mem_cgroup has NULL dentries */146if (!css->cgroup->dentry)147return -EINVAL;148149ino = css->cgroup->dentry->d_inode->i_ino;150css_put(css);151152if (ino != hwpoison_filter_memcg)153return -EINVAL;154155return 0;156}157#else158static int hwpoison_filter_task(struct page *p) { return 0; }159#endif160161int hwpoison_filter(struct page *p)162{163if (!hwpoison_filter_enable)164return 0;165166if (hwpoison_filter_dev(p))167return -EINVAL;168169if (hwpoison_filter_flags(p))170return -EINVAL;171172if (hwpoison_filter_task(p))173return -EINVAL;174175return 0;176}177#else178int hwpoison_filter(struct page *p)179{180return 0;181}182#endif183184EXPORT_SYMBOL_GPL(hwpoison_filter);185186/*187* Send all the processes who have the page mapped an ``action optional''188* signal.189*/190static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,191unsigned long pfn, struct page *page)192{193struct siginfo si;194int ret;195196printk(KERN_ERR197"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",198pfn, t->comm, t->pid);199si.si_signo = SIGBUS;200si.si_errno = 0;201si.si_code = BUS_MCEERR_AO;202si.si_addr = (void *)addr;203#ifdef __ARCH_SI_TRAPNO204si.si_trapno = trapno;205#endif206si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;207/*208* Don't use force here, it's convenient if the signal209* can be temporarily blocked.210* This could cause a loop when the user sets SIGBUS211* to SIG_IGN, but hopefully no one will do that?212*/213ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */214if (ret < 0)215printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",216t->comm, t->pid, ret);217return ret;218}219220/*221* When a unknown page type is encountered drain as many buffers as possible222* in the hope to turn the page into a LRU or free page, which we can handle.223*/224void shake_page(struct page *p, int access)225{226if (!PageSlab(p)) {227lru_add_drain_all();228if (PageLRU(p))229return;230drain_all_pages();231if (PageLRU(p) || is_free_buddy_page(p))232return;233}234235/*236* Only call shrink_slab here (which would also shrink other caches) if237* access is not potentially fatal.238*/239if (access) {240int nr;241do {242struct shrink_control shrink = {243.gfp_mask = GFP_KERNEL,244};245246nr = shrink_slab(&shrink, 1000, 1000);247if (page_count(p) == 1)248break;249} while (nr > 10);250}251}252EXPORT_SYMBOL_GPL(shake_page);253254/*255* Kill all processes that have a poisoned page mapped and then isolate256* the page.257*258* General strategy:259* Find all processes having the page mapped and kill them.260* But we keep a page reference around so that the page is not261* actually freed yet.262* Then stash the page away263*264* There's no convenient way to get back to mapped processes265* from the VMAs. So do a brute-force search over all266* running processes.267*268* Remember that machine checks are not common (or rather269* if they are common you have other problems), so this shouldn't270* be a performance issue.271*272* Also there are some races possible while we get from the273* error detection to actually handle it.274*/275276struct to_kill {277struct list_head nd;278struct task_struct *tsk;279unsigned long addr;280char addr_valid;281};282283/*284* Failure handling: if we can't find or can't kill a process there's285* not much we can do. We just print a message and ignore otherwise.286*/287288/*289* Schedule a process for later kill.290* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.291* TBD would GFP_NOIO be enough?292*/293static void add_to_kill(struct task_struct *tsk, struct page *p,294struct vm_area_struct *vma,295struct list_head *to_kill,296struct to_kill **tkc)297{298struct to_kill *tk;299300if (*tkc) {301tk = *tkc;302*tkc = NULL;303} else {304tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);305if (!tk) {306printk(KERN_ERR307"MCE: Out of memory while machine check handling\n");308return;309}310}311tk->addr = page_address_in_vma(p, vma);312tk->addr_valid = 1;313314/*315* In theory we don't have to kill when the page was316* munmaped. But it could be also a mremap. Since that's317* likely very rare kill anyways just out of paranoia, but use318* a SIGKILL because the error is not contained anymore.319*/320if (tk->addr == -EFAULT) {321pr_info("MCE: Unable to find user space address %lx in %s\n",322page_to_pfn(p), tsk->comm);323tk->addr_valid = 0;324}325get_task_struct(tsk);326tk->tsk = tsk;327list_add_tail(&tk->nd, to_kill);328}329330/*331* Kill the processes that have been collected earlier.332*333* Only do anything when DOIT is set, otherwise just free the list334* (this is used for clean pages which do not need killing)335* Also when FAIL is set do a force kill because something went336* wrong earlier.337*/338static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,339int fail, struct page *page, unsigned long pfn)340{341struct to_kill *tk, *next;342343list_for_each_entry_safe (tk, next, to_kill, nd) {344if (doit) {345/*346* In case something went wrong with munmapping347* make sure the process doesn't catch the348* signal and then access the memory. Just kill it.349*/350if (fail || tk->addr_valid == 0) {351printk(KERN_ERR352"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",353pfn, tk->tsk->comm, tk->tsk->pid);354force_sig(SIGKILL, tk->tsk);355}356357/*358* In theory the process could have mapped359* something else on the address in-between. We could360* check for that, but we need to tell the361* process anyways.362*/363else if (kill_proc_ao(tk->tsk, tk->addr, trapno,364pfn, page) < 0)365printk(KERN_ERR366"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",367pfn, tk->tsk->comm, tk->tsk->pid);368}369put_task_struct(tk->tsk);370kfree(tk);371}372}373374static int task_early_kill(struct task_struct *tsk)375{376if (!tsk->mm)377return 0;378if (tsk->flags & PF_MCE_PROCESS)379return !!(tsk->flags & PF_MCE_EARLY);380return sysctl_memory_failure_early_kill;381}382383/*384* Collect processes when the error hit an anonymous page.385*/386static void collect_procs_anon(struct page *page, struct list_head *to_kill,387struct to_kill **tkc)388{389struct vm_area_struct *vma;390struct task_struct *tsk;391struct anon_vma *av;392393av = page_lock_anon_vma(page);394if (av == NULL) /* Not actually mapped anymore */395return;396397read_lock(&tasklist_lock);398for_each_process (tsk) {399struct anon_vma_chain *vmac;400401if (!task_early_kill(tsk))402continue;403list_for_each_entry(vmac, &av->head, same_anon_vma) {404vma = vmac->vma;405if (!page_mapped_in_vma(page, vma))406continue;407if (vma->vm_mm == tsk->mm)408add_to_kill(tsk, page, vma, to_kill, tkc);409}410}411read_unlock(&tasklist_lock);412page_unlock_anon_vma(av);413}414415/*416* Collect processes when the error hit a file mapped page.417*/418static void collect_procs_file(struct page *page, struct list_head *to_kill,419struct to_kill **tkc)420{421struct vm_area_struct *vma;422struct task_struct *tsk;423struct prio_tree_iter iter;424struct address_space *mapping = page->mapping;425426mutex_lock(&mapping->i_mmap_mutex);427read_lock(&tasklist_lock);428for_each_process(tsk) {429pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);430431if (!task_early_kill(tsk))432continue;433434vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,435pgoff) {436/*437* Send early kill signal to tasks where a vma covers438* the page but the corrupted page is not necessarily439* mapped it in its pte.440* Assume applications who requested early kill want441* to be informed of all such data corruptions.442*/443if (vma->vm_mm == tsk->mm)444add_to_kill(tsk, page, vma, to_kill, tkc);445}446}447read_unlock(&tasklist_lock);448mutex_unlock(&mapping->i_mmap_mutex);449}450451/*452* Collect the processes who have the corrupted page mapped to kill.453* This is done in two steps for locking reasons.454* First preallocate one tokill structure outside the spin locks,455* so that we can kill at least one process reasonably reliable.456*/457static void collect_procs(struct page *page, struct list_head *tokill)458{459struct to_kill *tk;460461if (!page->mapping)462return;463464tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);465if (!tk)466return;467if (PageAnon(page))468collect_procs_anon(page, tokill, &tk);469else470collect_procs_file(page, tokill, &tk);471kfree(tk);472}473474/*475* Error handlers for various types of pages.476*/477478enum outcome {479IGNORED, /* Error: cannot be handled */480FAILED, /* Error: handling failed */481DELAYED, /* Will be handled later */482RECOVERED, /* Successfully recovered */483};484485static const char *action_name[] = {486[IGNORED] = "Ignored",487[FAILED] = "Failed",488[DELAYED] = "Delayed",489[RECOVERED] = "Recovered",490};491492/*493* XXX: It is possible that a page is isolated from LRU cache,494* and then kept in swap cache or failed to remove from page cache.495* The page count will stop it from being freed by unpoison.496* Stress tests should be aware of this memory leak problem.497*/498static int delete_from_lru_cache(struct page *p)499{500if (!isolate_lru_page(p)) {501/*502* Clear sensible page flags, so that the buddy system won't503* complain when the page is unpoison-and-freed.504*/505ClearPageActive(p);506ClearPageUnevictable(p);507/*508* drop the page count elevated by isolate_lru_page()509*/510page_cache_release(p);511return 0;512}513return -EIO;514}515516/*517* Error hit kernel page.518* Do nothing, try to be lucky and not touch this instead. For a few cases we519* could be more sophisticated.520*/521static int me_kernel(struct page *p, unsigned long pfn)522{523return IGNORED;524}525526/*527* Page in unknown state. Do nothing.528*/529static int me_unknown(struct page *p, unsigned long pfn)530{531printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);532return FAILED;533}534535/*536* Clean (or cleaned) page cache page.537*/538static int me_pagecache_clean(struct page *p, unsigned long pfn)539{540int err;541int ret = FAILED;542struct address_space *mapping;543544delete_from_lru_cache(p);545546/*547* For anonymous pages we're done the only reference left548* should be the one m_f() holds.549*/550if (PageAnon(p))551return RECOVERED;552553/*554* Now truncate the page in the page cache. This is really555* more like a "temporary hole punch"556* Don't do this for block devices when someone else557* has a reference, because it could be file system metadata558* and that's not safe to truncate.559*/560mapping = page_mapping(p);561if (!mapping) {562/*563* Page has been teared down in the meanwhile564*/565return FAILED;566}567568/*569* Truncation is a bit tricky. Enable it per file system for now.570*571* Open: to take i_mutex or not for this? Right now we don't.572*/573if (mapping->a_ops->error_remove_page) {574err = mapping->a_ops->error_remove_page(mapping, p);575if (err != 0) {576printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",577pfn, err);578} else if (page_has_private(p) &&579!try_to_release_page(p, GFP_NOIO)) {580pr_info("MCE %#lx: failed to release buffers\n", pfn);581} else {582ret = RECOVERED;583}584} else {585/*586* If the file system doesn't support it just invalidate587* This fails on dirty or anything with private pages588*/589if (invalidate_inode_page(p))590ret = RECOVERED;591else592printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",593pfn);594}595return ret;596}597598/*599* Dirty cache page page600* Issues: when the error hit a hole page the error is not properly601* propagated.602*/603static int me_pagecache_dirty(struct page *p, unsigned long pfn)604{605struct address_space *mapping = page_mapping(p);606607SetPageError(p);608/* TBD: print more information about the file. */609if (mapping) {610/*611* IO error will be reported by write(), fsync(), etc.612* who check the mapping.613* This way the application knows that something went614* wrong with its dirty file data.615*616* There's one open issue:617*618* The EIO will be only reported on the next IO619* operation and then cleared through the IO map.620* Normally Linux has two mechanisms to pass IO error621* first through the AS_EIO flag in the address space622* and then through the PageError flag in the page.623* Since we drop pages on memory failure handling the624* only mechanism open to use is through AS_AIO.625*626* This has the disadvantage that it gets cleared on627* the first operation that returns an error, while628* the PageError bit is more sticky and only cleared629* when the page is reread or dropped. If an630* application assumes it will always get error on631* fsync, but does other operations on the fd before632* and the page is dropped between then the error633* will not be properly reported.634*635* This can already happen even without hwpoisoned636* pages: first on metadata IO errors (which only637* report through AS_EIO) or when the page is dropped638* at the wrong time.639*640* So right now we assume that the application DTRT on641* the first EIO, but we're not worse than other parts642* of the kernel.643*/644mapping_set_error(mapping, EIO);645}646647return me_pagecache_clean(p, pfn);648}649650/*651* Clean and dirty swap cache.652*653* Dirty swap cache page is tricky to handle. The page could live both in page654* cache and swap cache(ie. page is freshly swapped in). So it could be655* referenced concurrently by 2 types of PTEs:656* normal PTEs and swap PTEs. We try to handle them consistently by calling657* try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,658* and then659* - clear dirty bit to prevent IO660* - remove from LRU661* - but keep in the swap cache, so that when we return to it on662* a later page fault, we know the application is accessing663* corrupted data and shall be killed (we installed simple664* interception code in do_swap_page to catch it).665*666* Clean swap cache pages can be directly isolated. A later page fault will667* bring in the known good data from disk.668*/669static int me_swapcache_dirty(struct page *p, unsigned long pfn)670{671ClearPageDirty(p);672/* Trigger EIO in shmem: */673ClearPageUptodate(p);674675if (!delete_from_lru_cache(p))676return DELAYED;677else678return FAILED;679}680681static int me_swapcache_clean(struct page *p, unsigned long pfn)682{683delete_from_swap_cache(p);684685if (!delete_from_lru_cache(p))686return RECOVERED;687else688return FAILED;689}690691/*692* Huge pages. Needs work.693* Issues:694* - Error on hugepage is contained in hugepage unit (not in raw page unit.)695* To narrow down kill region to one page, we need to break up pmd.696*/697static int me_huge_page(struct page *p, unsigned long pfn)698{699int res = 0;700struct page *hpage = compound_head(p);701/*702* We can safely recover from error on free or reserved (i.e.703* not in-use) hugepage by dequeuing it from freelist.704* To check whether a hugepage is in-use or not, we can't use705* page->lru because it can be used in other hugepage operations,706* such as __unmap_hugepage_range() and gather_surplus_pages().707* So instead we use page_mapping() and PageAnon().708* We assume that this function is called with page lock held,709* so there is no race between isolation and mapping/unmapping.710*/711if (!(page_mapping(hpage) || PageAnon(hpage))) {712res = dequeue_hwpoisoned_huge_page(hpage);713if (!res)714return RECOVERED;715}716return DELAYED;717}718719/*720* Various page states we can handle.721*722* A page state is defined by its current page->flags bits.723* The table matches them in order and calls the right handler.724*725* This is quite tricky because we can access page at any time726* in its live cycle, so all accesses have to be extremely careful.727*728* This is not complete. More states could be added.729* For any missing state don't attempt recovery.730*/731732#define dirty (1UL << PG_dirty)733#define sc (1UL << PG_swapcache)734#define unevict (1UL << PG_unevictable)735#define mlock (1UL << PG_mlocked)736#define writeback (1UL << PG_writeback)737#define lru (1UL << PG_lru)738#define swapbacked (1UL << PG_swapbacked)739#define head (1UL << PG_head)740#define tail (1UL << PG_tail)741#define compound (1UL << PG_compound)742#define slab (1UL << PG_slab)743#define reserved (1UL << PG_reserved)744745static struct page_state {746unsigned long mask;747unsigned long res;748char *msg;749int (*action)(struct page *p, unsigned long pfn);750} error_states[] = {751{ reserved, reserved, "reserved kernel", me_kernel },752/*753* free pages are specially detected outside this table:754* PG_buddy pages only make a small fraction of all free pages.755*/756757/*758* Could in theory check if slab page is free or if we can drop759* currently unused objects without touching them. But just760* treat it as standard kernel for now.761*/762{ slab, slab, "kernel slab", me_kernel },763764#ifdef CONFIG_PAGEFLAGS_EXTENDED765{ head, head, "huge", me_huge_page },766{ tail, tail, "huge", me_huge_page },767#else768{ compound, compound, "huge", me_huge_page },769#endif770771{ sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },772{ sc|dirty, sc, "swapcache", me_swapcache_clean },773774{ unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},775{ unevict, unevict, "unevictable LRU", me_pagecache_clean},776777{ mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },778{ mlock, mlock, "mlocked LRU", me_pagecache_clean },779780{ lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },781{ lru|dirty, lru, "clean LRU", me_pagecache_clean },782783/*784* Catchall entry: must be at end.785*/786{ 0, 0, "unknown page state", me_unknown },787};788789#undef dirty790#undef sc791#undef unevict792#undef mlock793#undef writeback794#undef lru795#undef swapbacked796#undef head797#undef tail798#undef compound799#undef slab800#undef reserved801802static void action_result(unsigned long pfn, char *msg, int result)803{804struct page *page = pfn_to_page(pfn);805806printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",807pfn,808PageDirty(page) ? "dirty " : "",809msg, action_name[result]);810}811812static int page_action(struct page_state *ps, struct page *p,813unsigned long pfn)814{815int result;816int count;817818result = ps->action(p, pfn);819action_result(pfn, ps->msg, result);820821count = page_count(p) - 1;822if (ps->action == me_swapcache_dirty && result == DELAYED)823count--;824if (count != 0) {825printk(KERN_ERR826"MCE %#lx: %s page still referenced by %d users\n",827pfn, ps->msg, count);828result = FAILED;829}830831/* Could do more checks here if page looks ok */832/*833* Could adjust zone counters here to correct for the missing page.834*/835836return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;837}838839/*840* Do all that is necessary to remove user space mappings. Unmap841* the pages and send SIGBUS to the processes if the data was dirty.842*/843static int hwpoison_user_mappings(struct page *p, unsigned long pfn,844int trapno)845{846enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;847struct address_space *mapping;848LIST_HEAD(tokill);849int ret;850int kill = 1;851struct page *hpage = compound_head(p);852struct page *ppage;853854if (PageReserved(p) || PageSlab(p))855return SWAP_SUCCESS;856857/*858* This check implies we don't kill processes if their pages859* are in the swap cache early. Those are always late kills.860*/861if (!page_mapped(hpage))862return SWAP_SUCCESS;863864if (PageKsm(p))865return SWAP_FAIL;866867if (PageSwapCache(p)) {868printk(KERN_ERR869"MCE %#lx: keeping poisoned page in swap cache\n", pfn);870ttu |= TTU_IGNORE_HWPOISON;871}872873/*874* Propagate the dirty bit from PTEs to struct page first, because we875* need this to decide if we should kill or just drop the page.876* XXX: the dirty test could be racy: set_page_dirty() may not always877* be called inside page lock (it's recommended but not enforced).878*/879mapping = page_mapping(hpage);880if (!PageDirty(hpage) && mapping &&881mapping_cap_writeback_dirty(mapping)) {882if (page_mkclean(hpage)) {883SetPageDirty(hpage);884} else {885kill = 0;886ttu |= TTU_IGNORE_HWPOISON;887printk(KERN_INFO888"MCE %#lx: corrupted page was clean: dropped without side effects\n",889pfn);890}891}892893/*894* ppage: poisoned page895* if p is regular page(4k page)896* ppage == real poisoned page;897* else p is hugetlb or THP, ppage == head page.898*/899ppage = hpage;900901if (PageTransHuge(hpage)) {902/*903* Verify that this isn't a hugetlbfs head page, the check for904* PageAnon is just for avoid tripping a split_huge_page905* internal debug check, as split_huge_page refuses to deal with906* anything that isn't an anon page. PageAnon can't go away fro907* under us because we hold a refcount on the hpage, without a908* refcount on the hpage. split_huge_page can't be safely called909* in the first place, having a refcount on the tail isn't910* enough * to be safe.911*/912if (!PageHuge(hpage) && PageAnon(hpage)) {913if (unlikely(split_huge_page(hpage))) {914/*915* FIXME: if splitting THP is failed, it is916* better to stop the following operation rather917* than causing panic by unmapping. System might918* survive if the page is freed later.919*/920printk(KERN_INFO921"MCE %#lx: failed to split THP\n", pfn);922923BUG_ON(!PageHWPoison(p));924return SWAP_FAIL;925}926/* THP is split, so ppage should be the real poisoned page. */927ppage = p;928}929}930931/*932* First collect all the processes that have the page933* mapped in dirty form. This has to be done before try_to_unmap,934* because ttu takes the rmap data structures down.935*936* Error handling: We ignore errors here because937* there's nothing that can be done.938*/939if (kill)940collect_procs(ppage, &tokill);941942if (hpage != ppage)943lock_page(ppage);944945ret = try_to_unmap(ppage, ttu);946if (ret != SWAP_SUCCESS)947printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",948pfn, page_mapcount(ppage));949950if (hpage != ppage)951unlock_page(ppage);952953/*954* Now that the dirty bit has been propagated to the955* struct page and all unmaps done we can decide if956* killing is needed or not. Only kill when the page957* was dirty, otherwise the tokill list is merely958* freed. When there was a problem unmapping earlier959* use a more force-full uncatchable kill to prevent960* any accesses to the poisoned memory.961*/962kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,963ret != SWAP_SUCCESS, p, pfn);964965return ret;966}967968static void set_page_hwpoison_huge_page(struct page *hpage)969{970int i;971int nr_pages = 1 << compound_trans_order(hpage);972for (i = 0; i < nr_pages; i++)973SetPageHWPoison(hpage + i);974}975976static void clear_page_hwpoison_huge_page(struct page *hpage)977{978int i;979int nr_pages = 1 << compound_trans_order(hpage);980for (i = 0; i < nr_pages; i++)981ClearPageHWPoison(hpage + i);982}983984int __memory_failure(unsigned long pfn, int trapno, int flags)985{986struct page_state *ps;987struct page *p;988struct page *hpage;989int res;990unsigned int nr_pages;991992if (!sysctl_memory_failure_recovery)993panic("Memory failure from trap %d on page %lx", trapno, pfn);994995if (!pfn_valid(pfn)) {996printk(KERN_ERR997"MCE %#lx: memory outside kernel control\n",998pfn);999return -ENXIO;1000}10011002p = pfn_to_page(pfn);1003hpage = compound_head(p);1004if (TestSetPageHWPoison(p)) {1005printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);1006return 0;1007}10081009nr_pages = 1 << compound_trans_order(hpage);1010atomic_long_add(nr_pages, &mce_bad_pages);10111012/*1013* We need/can do nothing about count=0 pages.1014* 1) it's a free page, and therefore in safe hand:1015* prep_new_page() will be the gate keeper.1016* 2) it's a free hugepage, which is also safe:1017* an affected hugepage will be dequeued from hugepage freelist,1018* so there's no concern about reusing it ever after.1019* 3) it's part of a non-compound high order page.1020* Implies some kernel user: cannot stop them from1021* R/W the page; let's pray that the page has been1022* used and will be freed some time later.1023* In fact it's dangerous to directly bump up page count from 0,1024* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.1025*/1026if (!(flags & MF_COUNT_INCREASED) &&1027!get_page_unless_zero(hpage)) {1028if (is_free_buddy_page(p)) {1029action_result(pfn, "free buddy", DELAYED);1030return 0;1031} else if (PageHuge(hpage)) {1032/*1033* Check "just unpoisoned", "filter hit", and1034* "race with other subpage."1035*/1036lock_page(hpage);1037if (!PageHWPoison(hpage)1038|| (hwpoison_filter(p) && TestClearPageHWPoison(p))1039|| (p != hpage && TestSetPageHWPoison(hpage))) {1040atomic_long_sub(nr_pages, &mce_bad_pages);1041return 0;1042}1043set_page_hwpoison_huge_page(hpage);1044res = dequeue_hwpoisoned_huge_page(hpage);1045action_result(pfn, "free huge",1046res ? IGNORED : DELAYED);1047unlock_page(hpage);1048return res;1049} else {1050action_result(pfn, "high order kernel", IGNORED);1051return -EBUSY;1052}1053}10541055/*1056* We ignore non-LRU pages for good reasons.1057* - PG_locked is only well defined for LRU pages and a few others1058* - to avoid races with __set_page_locked()1059* - to avoid races with __SetPageSlab*() (and more non-atomic ops)1060* The check (unnecessarily) ignores LRU pages being isolated and1061* walked by the page reclaim code, however that's not a big loss.1062*/1063if (!PageHuge(p) && !PageTransCompound(p)) {1064if (!PageLRU(p))1065shake_page(p, 0);1066if (!PageLRU(p)) {1067/*1068* shake_page could have turned it free.1069*/1070if (is_free_buddy_page(p)) {1071action_result(pfn, "free buddy, 2nd try",1072DELAYED);1073return 0;1074}1075action_result(pfn, "non LRU", IGNORED);1076put_page(p);1077return -EBUSY;1078}1079}10801081/*1082* Lock the page and wait for writeback to finish.1083* It's very difficult to mess with pages currently under IO1084* and in many cases impossible, so we just avoid it here.1085*/1086lock_page(hpage);10871088/*1089* unpoison always clear PG_hwpoison inside page lock1090*/1091if (!PageHWPoison(p)) {1092printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);1093res = 0;1094goto out;1095}1096if (hwpoison_filter(p)) {1097if (TestClearPageHWPoison(p))1098atomic_long_sub(nr_pages, &mce_bad_pages);1099unlock_page(hpage);1100put_page(hpage);1101return 0;1102}11031104/*1105* For error on the tail page, we should set PG_hwpoison1106* on the head page to show that the hugepage is hwpoisoned1107*/1108if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {1109action_result(pfn, "hugepage already hardware poisoned",1110IGNORED);1111unlock_page(hpage);1112put_page(hpage);1113return 0;1114}1115/*1116* Set PG_hwpoison on all pages in an error hugepage,1117* because containment is done in hugepage unit for now.1118* Since we have done TestSetPageHWPoison() for the head page with1119* page lock held, we can safely set PG_hwpoison bits on tail pages.1120*/1121if (PageHuge(p))1122set_page_hwpoison_huge_page(hpage);11231124wait_on_page_writeback(p);11251126/*1127* Now take care of user space mappings.1128* Abort on fail: __delete_from_page_cache() assumes unmapped page.1129*/1130if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {1131printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);1132res = -EBUSY;1133goto out;1134}11351136/*1137* Torn down by someone else?1138*/1139if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {1140action_result(pfn, "already truncated LRU", IGNORED);1141res = -EBUSY;1142goto out;1143}11441145res = -EBUSY;1146for (ps = error_states;; ps++) {1147if ((p->flags & ps->mask) == ps->res) {1148res = page_action(ps, p, pfn);1149break;1150}1151}1152out:1153unlock_page(hpage);1154return res;1155}1156EXPORT_SYMBOL_GPL(__memory_failure);11571158/**1159* memory_failure - Handle memory failure of a page.1160* @pfn: Page Number of the corrupted page1161* @trapno: Trap number reported in the signal to user space.1162*1163* This function is called by the low level machine check code1164* of an architecture when it detects hardware memory corruption1165* of a page. It tries its best to recover, which includes1166* dropping pages, killing processes etc.1167*1168* The function is primarily of use for corruptions that1169* happen outside the current execution context (e.g. when1170* detected by a background scrubber)1171*1172* Must run in process context (e.g. a work queue) with interrupts1173* enabled and no spinlocks hold.1174*/1175void memory_failure(unsigned long pfn, int trapno)1176{1177__memory_failure(pfn, trapno, 0);1178}11791180/**1181* unpoison_memory - Unpoison a previously poisoned page1182* @pfn: Page number of the to be unpoisoned page1183*1184* Software-unpoison a page that has been poisoned by1185* memory_failure() earlier.1186*1187* This is only done on the software-level, so it only works1188* for linux injected failures, not real hardware failures1189*1190* Returns 0 for success, otherwise -errno.1191*/1192int unpoison_memory(unsigned long pfn)1193{1194struct page *page;1195struct page *p;1196int freeit = 0;1197unsigned int nr_pages;11981199if (!pfn_valid(pfn))1200return -ENXIO;12011202p = pfn_to_page(pfn);1203page = compound_head(p);12041205if (!PageHWPoison(p)) {1206pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);1207return 0;1208}12091210nr_pages = 1 << compound_trans_order(page);12111212if (!get_page_unless_zero(page)) {1213/*1214* Since HWPoisoned hugepage should have non-zero refcount,1215* race between memory failure and unpoison seems to happen.1216* In such case unpoison fails and memory failure runs1217* to the end.1218*/1219if (PageHuge(page)) {1220pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);1221return 0;1222}1223if (TestClearPageHWPoison(p))1224atomic_long_sub(nr_pages, &mce_bad_pages);1225pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);1226return 0;1227}12281229lock_page(page);1230/*1231* This test is racy because PG_hwpoison is set outside of page lock.1232* That's acceptable because that won't trigger kernel panic. Instead,1233* the PG_hwpoison page will be caught and isolated on the entrance to1234* the free buddy page pool.1235*/1236if (TestClearPageHWPoison(page)) {1237pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);1238atomic_long_sub(nr_pages, &mce_bad_pages);1239freeit = 1;1240if (PageHuge(page))1241clear_page_hwpoison_huge_page(page);1242}1243unlock_page(page);12441245put_page(page);1246if (freeit)1247put_page(page);12481249return 0;1250}1251EXPORT_SYMBOL(unpoison_memory);12521253static struct page *new_page(struct page *p, unsigned long private, int **x)1254{1255int nid = page_to_nid(p);1256if (PageHuge(p))1257return alloc_huge_page_node(page_hstate(compound_head(p)),1258nid);1259else1260return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);1261}12621263/*1264* Safely get reference count of an arbitrary page.1265* Returns 0 for a free page, -EIO for a zero refcount page1266* that is not free, and 1 for any other page type.1267* For 1 the page is returned with increased page count, otherwise not.1268*/1269static int get_any_page(struct page *p, unsigned long pfn, int flags)1270{1271int ret;12721273if (flags & MF_COUNT_INCREASED)1274return 1;12751276/*1277* The lock_memory_hotplug prevents a race with memory hotplug.1278* This is a big hammer, a better would be nicer.1279*/1280lock_memory_hotplug();12811282/*1283* Isolate the page, so that it doesn't get reallocated if it1284* was free.1285*/1286set_migratetype_isolate(p);1287/*1288* When the target page is a free hugepage, just remove it1289* from free hugepage list.1290*/1291if (!get_page_unless_zero(compound_head(p))) {1292if (PageHuge(p)) {1293pr_info("get_any_page: %#lx free huge page\n", pfn);1294ret = dequeue_hwpoisoned_huge_page(compound_head(p));1295} else if (is_free_buddy_page(p)) {1296pr_info("get_any_page: %#lx free buddy page\n", pfn);1297/* Set hwpoison bit while page is still isolated */1298SetPageHWPoison(p);1299ret = 0;1300} else {1301pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",1302pfn, p->flags);1303ret = -EIO;1304}1305} else {1306/* Not a free page */1307ret = 1;1308}1309unset_migratetype_isolate(p);1310unlock_memory_hotplug();1311return ret;1312}13131314static int soft_offline_huge_page(struct page *page, int flags)1315{1316int ret;1317unsigned long pfn = page_to_pfn(page);1318struct page *hpage = compound_head(page);1319LIST_HEAD(pagelist);13201321ret = get_any_page(page, pfn, flags);1322if (ret < 0)1323return ret;1324if (ret == 0)1325goto done;13261327if (PageHWPoison(hpage)) {1328put_page(hpage);1329pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);1330return -EBUSY;1331}13321333/* Keep page count to indicate a given hugepage is isolated. */13341335list_add(&hpage->lru, &pagelist);1336ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,1337true);1338if (ret) {1339struct page *page1, *page2;1340list_for_each_entry_safe(page1, page2, &pagelist, lru)1341put_page(page1);13421343pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",1344pfn, ret, page->flags);1345if (ret > 0)1346ret = -EIO;1347return ret;1348}1349done:1350if (!PageHWPoison(hpage))1351atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);1352set_page_hwpoison_huge_page(hpage);1353dequeue_hwpoisoned_huge_page(hpage);1354/* keep elevated page count for bad page */1355return ret;1356}13571358/**1359* soft_offline_page - Soft offline a page.1360* @page: page to offline1361* @flags: flags. Same as memory_failure().1362*1363* Returns 0 on success, otherwise negated errno.1364*1365* Soft offline a page, by migration or invalidation,1366* without killing anything. This is for the case when1367* a page is not corrupted yet (so it's still valid to access),1368* but has had a number of corrected errors and is better taken1369* out.1370*1371* The actual policy on when to do that is maintained by1372* user space.1373*1374* This should never impact any application or cause data loss,1375* however it might take some time.1376*1377* This is not a 100% solution for all memory, but tries to be1378* ``good enough'' for the majority of memory.1379*/1380int soft_offline_page(struct page *page, int flags)1381{1382int ret;1383unsigned long pfn = page_to_pfn(page);13841385if (PageHuge(page))1386return soft_offline_huge_page(page, flags);13871388ret = get_any_page(page, pfn, flags);1389if (ret < 0)1390return ret;1391if (ret == 0)1392goto done;13931394/*1395* Page cache page we can handle?1396*/1397if (!PageLRU(page)) {1398/*1399* Try to free it.1400*/1401put_page(page);1402shake_page(page, 1);14031404/*1405* Did it turn free?1406*/1407ret = get_any_page(page, pfn, 0);1408if (ret < 0)1409return ret;1410if (ret == 0)1411goto done;1412}1413if (!PageLRU(page)) {1414pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",1415pfn, page->flags);1416return -EIO;1417}14181419lock_page(page);1420wait_on_page_writeback(page);14211422/*1423* Synchronized using the page lock with memory_failure()1424*/1425if (PageHWPoison(page)) {1426unlock_page(page);1427put_page(page);1428pr_info("soft offline: %#lx page already poisoned\n", pfn);1429return -EBUSY;1430}14311432/*1433* Try to invalidate first. This should work for1434* non dirty unmapped page cache pages.1435*/1436ret = invalidate_inode_page(page);1437unlock_page(page);1438/*1439* RED-PEN would be better to keep it isolated here, but we1440* would need to fix isolation locking first.1441*/1442if (ret == 1) {1443put_page(page);1444ret = 0;1445pr_info("soft_offline: %#lx: invalidated\n", pfn);1446goto done;1447}14481449/*1450* Simple invalidation didn't work.1451* Try to migrate to a new page instead. migrate.c1452* handles a large number of cases for us.1453*/1454ret = isolate_lru_page(page);1455/*1456* Drop page reference which is came from get_any_page()1457* successful isolate_lru_page() already took another one.1458*/1459put_page(page);1460if (!ret) {1461LIST_HEAD(pagelist);1462inc_zone_page_state(page, NR_ISOLATED_ANON +1463page_is_file_cache(page));1464list_add(&page->lru, &pagelist);1465ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,14660, true);1467if (ret) {1468putback_lru_pages(&pagelist);1469pr_info("soft offline: %#lx: migration failed %d, type %lx\n",1470pfn, ret, page->flags);1471if (ret > 0)1472ret = -EIO;1473}1474} else {1475pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",1476pfn, ret, page_count(page), page->flags);1477}1478if (ret)1479return ret;14801481done:1482atomic_long_add(1, &mce_bad_pages);1483SetPageHWPoison(page);1484/* keep elevated page count for bad page */1485return ret;1486}148714881489