CoCalc -- memory-failure.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/mm/memory-failure.c
¹⁰⁸¹⁴ views
1
/*
2
 * Copyright (C) 2008, 2009 Intel Corporation
3
 * Authors: Andi Kleen, Fengguang Wu
4
 *
5
 * This software may be redistributed and/or modified under the terms of
6
 * the GNU General Public License ("GPL") version 2 only as published by the
7
 * Free Software Foundation.
8
 *
9
 * High level machine check handler. Handles pages reported by the
10
 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11
 * failure.
12
 * 
13
 * In addition there is a "soft offline" entry point that allows stop using
14
 * not-yet-corrupted-by-suspicious pages without killing anything.
15
 *
16
 * Handles page cache pages in various states.	The tricky part
17
 * here is that we can access any page asynchronously in respect to 
18
 * other VM users, because memory failures could happen anytime and 
19
 * anywhere. This could violate some of their assumptions. This is why 
20
 * this code has to be extremely careful. Generally it tries to use 
21
 * normal locking rules, as in get the standard locks, even if that means 
22
 * the error handling takes potentially a long time.
23
 * 
24
 * There are several operations here with exponential complexity because
25
 * of unsuitable VM data structures. For example the operation to map back 
26
 * from RMAP chains to processes has to walk the complete process list and 
27
 * has non linear complexity with the number. But since memory corruptions
28
 * are rare we hope to get away with this. This avoids impacting the core 
29
 * VM.
30
 */
31

32
/*
33
 * Notebook:
34
 * - hugetlb needs more code
35
 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
36
 * - pass bad pages to kdump next kernel
37
 */
38
#include <linux/kernel.h>
39
#include <linux/mm.h>
40
#include <linux/page-flags.h>
41
#include <linux/kernel-page-flags.h>
42
#include <linux/sched.h>
43
#include <linux/ksm.h>
44
#include <linux/rmap.h>
45
#include <linux/pagemap.h>
46
#include <linux/swap.h>
47
#include <linux/backing-dev.h>
48
#include <linux/migrate.h>
49
#include <linux/page-isolation.h>
50
#include <linux/suspend.h>
51
#include <linux/slab.h>
52
#include <linux/swapops.h>
53
#include <linux/hugetlb.h>
54
#include <linux/memory_hotplug.h>
55
#include <linux/mm_inline.h>
56
#include "internal.h"
57

58
int sysctl_memory_failure_early_kill __read_mostly = 0;
59

60
int sysctl_memory_failure_recovery __read_mostly = 1;
61

62
atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
63

64
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
65

66
u32 hwpoison_filter_enable = 0;
67
u32 hwpoison_filter_dev_major = ~0U;
68
u32 hwpoison_filter_dev_minor = ~0U;
69
u64 hwpoison_filter_flags_mask;
70
u64 hwpoison_filter_flags_value;
71
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
72
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
73
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
74
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
75
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
76

77
static int hwpoison_filter_dev(struct page *p)
78
{
79
	struct address_space *mapping;
80
	dev_t dev;
81

82
	if (hwpoison_filter_dev_major == ~0U &&
83
	    hwpoison_filter_dev_minor == ~0U)
84
		return 0;
85

86
	/*
87
	 * page_mapping() does not accept slab pages.
88
	 */
89
	if (PageSlab(p))
90
		return -EINVAL;
91

92
	mapping = page_mapping(p);
93
	if (mapping == NULL || mapping->host == NULL)
94
		return -EINVAL;
95

96
	dev = mapping->host->i_sb->s_dev;
97
	if (hwpoison_filter_dev_major != ~0U &&
98
	    hwpoison_filter_dev_major != MAJOR(dev))
99
		return -EINVAL;
100
	if (hwpoison_filter_dev_minor != ~0U &&
101
	    hwpoison_filter_dev_minor != MINOR(dev))
102
		return -EINVAL;
103

104
	return 0;
105
}
106

107
static int hwpoison_filter_flags(struct page *p)
108
{
109
	if (!hwpoison_filter_flags_mask)
110
		return 0;
111

112
	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
113
				    hwpoison_filter_flags_value)
114
		return 0;
115
	else
116
		return -EINVAL;
117
}
118

119
/*
120
 * This allows stress tests to limit test scope to a collection of tasks
121
 * by putting them under some memcg. This prevents killing unrelated/important
122
 * processes such as /sbin/init. Note that the target task may share clean
123
 * pages with init (eg. libc text), which is harmless. If the target task
124
 * share _dirty_ pages with another task B, the test scheme must make sure B
125
 * is also included in the memcg. At last, due to race conditions this filter
126
 * can only guarantee that the page either belongs to the memcg tasks, or is
127
 * a freed page.
128
 */
129
#ifdef	CONFIG_CGROUP_MEM_RES_CTLR_SWAP
130
u64 hwpoison_filter_memcg;
131
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
132
static int hwpoison_filter_task(struct page *p)
133
{
134
	struct mem_cgroup *mem;
135
	struct cgroup_subsys_state *css;
136
	unsigned long ino;
137

138
	if (!hwpoison_filter_memcg)
139
		return 0;
140

141
	mem = try_get_mem_cgroup_from_page(p);
142
	if (!mem)
143
		return -EINVAL;
144

145
	css = mem_cgroup_css(mem);
146
	/* root_mem_cgroup has NULL dentries */
147
	if (!css->cgroup->dentry)
148
		return -EINVAL;
149

150
	ino = css->cgroup->dentry->d_inode->i_ino;
151
	css_put(css);
152

153
	if (ino != hwpoison_filter_memcg)
154
		return -EINVAL;
155

156
	return 0;
157
}
158
#else
159
static int hwpoison_filter_task(struct page *p) { return 0; }
160
#endif
161

162
int hwpoison_filter(struct page *p)
163
{
164
	if (!hwpoison_filter_enable)
165
		return 0;
166

167
	if (hwpoison_filter_dev(p))
168
		return -EINVAL;
169

170
	if (hwpoison_filter_flags(p))
171
		return -EINVAL;
172

173
	if (hwpoison_filter_task(p))
174
		return -EINVAL;
175

176
	return 0;
177
}
178
#else
179
int hwpoison_filter(struct page *p)
180
{
181
	return 0;
182
}
183
#endif
184

185
EXPORT_SYMBOL_GPL(hwpoison_filter);
186

187
/*
188
 * Send all the processes who have the page mapped an ``action optional''
189
 * signal.
190
 */
191
static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
192
			unsigned long pfn, struct page *page)
193
{
194
	struct siginfo si;
195
	int ret;
196

197
	printk(KERN_ERR
198
		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
199
		pfn, t->comm, t->pid);
200
	si.si_signo = SIGBUS;
201
	si.si_errno = 0;
202
	si.si_code = BUS_MCEERR_AO;
203
	si.si_addr = (void *)addr;
204
#ifdef __ARCH_SI_TRAPNO
205
	si.si_trapno = trapno;
206
#endif
207
	si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
208
	/*
209
	 * Don't use force here, it's convenient if the signal
210
	 * can be temporarily blocked.
211
	 * This could cause a loop when the user sets SIGBUS
212
	 * to SIG_IGN, but hopefully no one will do that?
213
	 */
214
	ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
215
	if (ret < 0)
216
		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
217
		       t->comm, t->pid, ret);
218
	return ret;
219
}
220

221
/*
222
 * When a unknown page type is encountered drain as many buffers as possible
223
 * in the hope to turn the page into a LRU or free page, which we can handle.
224
 */
225
void shake_page(struct page *p, int access)
226
{
227
	if (!PageSlab(p)) {
228
		lru_add_drain_all();
229
		if (PageLRU(p))
230
			return;
231
		drain_all_pages();
232
		if (PageLRU(p) || is_free_buddy_page(p))
233
			return;
234
	}
235

236
	/*
237
	 * Only call shrink_slab here (which would also shrink other caches) if
238
	 * access is not potentially fatal.
239
	 */
240
	if (access) {
241
		int nr;
242
		do {
243
			struct shrink_control shrink = {
244
				.gfp_mask = GFP_KERNEL,
245
			};
246

247
			nr = shrink_slab(&shrink, 1000, 1000);
248
			if (page_count(p) == 1)
249
				break;
250
		} while (nr > 10);
251
	}
252
}
253
EXPORT_SYMBOL_GPL(shake_page);
254

255
/*
256
 * Kill all processes that have a poisoned page mapped and then isolate
257
 * the page.
258
 *
259
 * General strategy:
260
 * Find all processes having the page mapped and kill them.
261
 * But we keep a page reference around so that the page is not
262
 * actually freed yet.
263
 * Then stash the page away
264
 *
265
 * There's no convenient way to get back to mapped processes
266
 * from the VMAs. So do a brute-force search over all
267
 * running processes.
268
 *
269
 * Remember that machine checks are not common (or rather
270
 * if they are common you have other problems), so this shouldn't
271
 * be a performance issue.
272
 *
273
 * Also there are some races possible while we get from the
274
 * error detection to actually handle it.
275
 */
276

277
struct to_kill {
278
	struct list_head nd;
279
	struct task_struct *tsk;
280
	unsigned long addr;
281
	char addr_valid;
282
};
283

284
/*
285
 * Failure handling: if we can't find or can't kill a process there's
286
 * not much we can do.	We just print a message and ignore otherwise.
287
 */
288

289
/*
290
 * Schedule a process for later kill.
291
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
292
 * TBD would GFP_NOIO be enough?
293
 */
294
static void add_to_kill(struct task_struct *tsk, struct page *p,
295
		       struct vm_area_struct *vma,
296
		       struct list_head *to_kill,
297
		       struct to_kill **tkc)
298
{
299
	struct to_kill *tk;
300

301
	if (*tkc) {
302
		tk = *tkc;
303
		*tkc = NULL;
304
	} else {
305
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
306
		if (!tk) {
307
			printk(KERN_ERR
308
		"MCE: Out of memory while machine check handling\n");
309
			return;
310
		}
311
	}
312
	tk->addr = page_address_in_vma(p, vma);
313
	tk->addr_valid = 1;
314

315
	/*
316
	 * In theory we don't have to kill when the page was
317
	 * munmaped. But it could be also a mremap. Since that's
318
	 * likely very rare kill anyways just out of paranoia, but use
319
	 * a SIGKILL because the error is not contained anymore.
320
	 */
321
	if (tk->addr == -EFAULT) {
322
		pr_info("MCE: Unable to find user space address %lx in %s\n",
323
			page_to_pfn(p), tsk->comm);
324
		tk->addr_valid = 0;
325
	}
326
	get_task_struct(tsk);
327
	tk->tsk = tsk;
328
	list_add_tail(&tk->nd, to_kill);
329
}
330

331
/*
332
 * Kill the processes that have been collected earlier.
333
 *
334
 * Only do anything when DOIT is set, otherwise just free the list
335
 * (this is used for clean pages which do not need killing)
336
 * Also when FAIL is set do a force kill because something went
337
 * wrong earlier.
338
 */
339
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
340
			  int fail, struct page *page, unsigned long pfn)
341
{
342
	struct to_kill *tk, *next;
343

344
	list_for_each_entry_safe (tk, next, to_kill, nd) {
345
		if (doit) {
346
			/*
347
			 * In case something went wrong with munmapping
348
			 * make sure the process doesn't catch the
349
			 * signal and then access the memory. Just kill it.
350
			 */
351
			if (fail || tk->addr_valid == 0) {
352
				printk(KERN_ERR
353
		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
354
					pfn, tk->tsk->comm, tk->tsk->pid);
355
				force_sig(SIGKILL, tk->tsk);
356
			}
357

358
			/*
359
			 * In theory the process could have mapped
360
			 * something else on the address in-between. We could
361
			 * check for that, but we need to tell the
362
			 * process anyways.
363
			 */
364
			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
365
					      pfn, page) < 0)
366
				printk(KERN_ERR
367
		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
368
					pfn, tk->tsk->comm, tk->tsk->pid);
369
		}
370
		put_task_struct(tk->tsk);
371
		kfree(tk);
372
	}
373
}
374

375
static int task_early_kill(struct task_struct *tsk)
376
{
377
	if (!tsk->mm)
378
		return 0;
379
	if (tsk->flags & PF_MCE_PROCESS)
380
		return !!(tsk->flags & PF_MCE_EARLY);
381
	return sysctl_memory_failure_early_kill;
382
}
383

384
/*
385
 * Collect processes when the error hit an anonymous page.
386
 */
387
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
388
			      struct to_kill **tkc)
389
{
390
	struct vm_area_struct *vma;
391
	struct task_struct *tsk;
392
	struct anon_vma *av;
393

394
	av = page_lock_anon_vma(page);
395
	if (av == NULL)	/* Not actually mapped anymore */
396
		return;
397

398
	read_lock(&tasklist_lock);
399
	for_each_process (tsk) {
400
		struct anon_vma_chain *vmac;
401

402
		if (!task_early_kill(tsk))
403
			continue;
404
		list_for_each_entry(vmac, &av->head, same_anon_vma) {
405
			vma = vmac->vma;
406
			if (!page_mapped_in_vma(page, vma))
407
				continue;
408
			if (vma->vm_mm == tsk->mm)
409
				add_to_kill(tsk, page, vma, to_kill, tkc);
410
		}
411
	}
412
	read_unlock(&tasklist_lock);
413
	page_unlock_anon_vma(av);
414
}
415

416
/*
417
 * Collect processes when the error hit a file mapped page.
418
 */
419
static void collect_procs_file(struct page *page, struct list_head *to_kill,
420
			      struct to_kill **tkc)
421
{
422
	struct vm_area_struct *vma;
423
	struct task_struct *tsk;
424
	struct prio_tree_iter iter;
425
	struct address_space *mapping = page->mapping;
426

427
	mutex_lock(&mapping->i_mmap_mutex);
428
	read_lock(&tasklist_lock);
429
	for_each_process(tsk) {
430
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
431

432
		if (!task_early_kill(tsk))
433
			continue;
434

435
		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
436
				      pgoff) {
437
			/*
438
			 * Send early kill signal to tasks where a vma covers
439
			 * the page but the corrupted page is not necessarily
440
			 * mapped it in its pte.
441
			 * Assume applications who requested early kill want
442
			 * to be informed of all such data corruptions.
443
			 */
444
			if (vma->vm_mm == tsk->mm)
445
				add_to_kill(tsk, page, vma, to_kill, tkc);
446
		}
447
	}
448
	read_unlock(&tasklist_lock);
449
	mutex_unlock(&mapping->i_mmap_mutex);
450
}
451

452
/*
453
 * Collect the processes who have the corrupted page mapped to kill.
454
 * This is done in two steps for locking reasons.
455
 * First preallocate one tokill structure outside the spin locks,
456
 * so that we can kill at least one process reasonably reliable.
457
 */
458
static void collect_procs(struct page *page, struct list_head *tokill)
459
{
460
	struct to_kill *tk;
461

462
	if (!page->mapping)
463
		return;
464

465
	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
466
	if (!tk)
467
		return;
468
	if (PageAnon(page))
469
		collect_procs_anon(page, tokill, &tk);
470
	else
471
		collect_procs_file(page, tokill, &tk);
472
	kfree(tk);
473
}
474

475
/*
476
 * Error handlers for various types of pages.
477
 */
478

479
enum outcome {
480
	IGNORED,	/* Error: cannot be handled */
481
	FAILED,		/* Error: handling failed */
482
	DELAYED,	/* Will be handled later */
483
	RECOVERED,	/* Successfully recovered */
484
};
485

486
static const char *action_name[] = {
487
	[IGNORED] = "Ignored",
488
	[FAILED] = "Failed",
489
	[DELAYED] = "Delayed",
490
	[RECOVERED] = "Recovered",
491
};
492

493
/*
494
 * XXX: It is possible that a page is isolated from LRU cache,
495
 * and then kept in swap cache or failed to remove from page cache.
496
 * The page count will stop it from being freed by unpoison.
497
 * Stress tests should be aware of this memory leak problem.
498
 */
499
static int delete_from_lru_cache(struct page *p)
500
{
501
	if (!isolate_lru_page(p)) {
502
		/*
503
		 * Clear sensible page flags, so that the buddy system won't
504
		 * complain when the page is unpoison-and-freed.
505
		 */
506
		ClearPageActive(p);
507
		ClearPageUnevictable(p);
508
		/*
509
		 * drop the page count elevated by isolate_lru_page()
510
		 */
511
		page_cache_release(p);
512
		return 0;
513
	}
514
	return -EIO;
515
}
516

517
/*
518
 * Error hit kernel page.
519
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
520
 * could be more sophisticated.
521
 */
522
static int me_kernel(struct page *p, unsigned long pfn)
523
{
524
	return IGNORED;
525
}
526

527
/*
528
 * Page in unknown state. Do nothing.
529
 */
530
static int me_unknown(struct page *p, unsigned long pfn)
531
{
532
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
533
	return FAILED;
534
}
535

536
/*
537
 * Clean (or cleaned) page cache page.
538
 */
539
static int me_pagecache_clean(struct page *p, unsigned long pfn)
540
{
541
	int err;
542
	int ret = FAILED;
543
	struct address_space *mapping;
544

545
	delete_from_lru_cache(p);
546

547
	/*
548
	 * For anonymous pages we're done the only reference left
549
	 * should be the one m_f() holds.
550
	 */
551
	if (PageAnon(p))
552
		return RECOVERED;
553

554
	/*
555
	 * Now truncate the page in the page cache. This is really
556
	 * more like a "temporary hole punch"
557
	 * Don't do this for block devices when someone else
558
	 * has a reference, because it could be file system metadata
559
	 * and that's not safe to truncate.
560
	 */
561
	mapping = page_mapping(p);
562
	if (!mapping) {
563
		/*
564
		 * Page has been teared down in the meanwhile
565
		 */
566
		return FAILED;
567
	}
568

569
	/*
570
	 * Truncation is a bit tricky. Enable it per file system for now.
571
	 *
572
	 * Open: to take i_mutex or not for this? Right now we don't.
573
	 */
574
	if (mapping->a_ops->error_remove_page) {
575
		err = mapping->a_ops->error_remove_page(mapping, p);
576
		if (err != 0) {
577
			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
578
					pfn, err);
579
		} else if (page_has_private(p) &&
580
				!try_to_release_page(p, GFP_NOIO)) {
581
			pr_info("MCE %#lx: failed to release buffers\n", pfn);
582
		} else {
583
			ret = RECOVERED;
584
		}
585
	} else {
586
		/*
587
		 * If the file system doesn't support it just invalidate
588
		 * This fails on dirty or anything with private pages
589
		 */
590
		if (invalidate_inode_page(p))
591
			ret = RECOVERED;
592
		else
593
			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
594
				pfn);
595
	}
596
	return ret;
597
}
598

599
/*
600
 * Dirty cache page page
601
 * Issues: when the error hit a hole page the error is not properly
602
 * propagated.
603
 */
604
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
605
{
606
	struct address_space *mapping = page_mapping(p);
607

608
	SetPageError(p);
609
	/* TBD: print more information about the file. */
610
	if (mapping) {
611
		/*
612
		 * IO error will be reported by write(), fsync(), etc.
613
		 * who check the mapping.
614
		 * This way the application knows that something went
615
		 * wrong with its dirty file data.
616
		 *
617
		 * There's one open issue:
618
		 *
619
		 * The EIO will be only reported on the next IO
620
		 * operation and then cleared through the IO map.
621
		 * Normally Linux has two mechanisms to pass IO error
622
		 * first through the AS_EIO flag in the address space
623
		 * and then through the PageError flag in the page.
624
		 * Since we drop pages on memory failure handling the
625
		 * only mechanism open to use is through AS_AIO.
626
		 *
627
		 * This has the disadvantage that it gets cleared on
628
		 * the first operation that returns an error, while
629
		 * the PageError bit is more sticky and only cleared
630
		 * when the page is reread or dropped.  If an
631
		 * application assumes it will always get error on
632
		 * fsync, but does other operations on the fd before
633
		 * and the page is dropped between then the error
634
		 * will not be properly reported.
635
		 *
636
		 * This can already happen even without hwpoisoned
637
		 * pages: first on metadata IO errors (which only
638
		 * report through AS_EIO) or when the page is dropped
639
		 * at the wrong time.
640
		 *
641
		 * So right now we assume that the application DTRT on
642
		 * the first EIO, but we're not worse than other parts
643
		 * of the kernel.
644
		 */
645
		mapping_set_error(mapping, EIO);
646
	}
647

648
	return me_pagecache_clean(p, pfn);
649
}
650

651
/*
652
 * Clean and dirty swap cache.
653
 *
654
 * Dirty swap cache page is tricky to handle. The page could live both in page
655
 * cache and swap cache(ie. page is freshly swapped in). So it could be
656
 * referenced concurrently by 2 types of PTEs:
657
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
658
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
659
 * and then
660
 *      - clear dirty bit to prevent IO
661
 *      - remove from LRU
662
 *      - but keep in the swap cache, so that when we return to it on
663
 *        a later page fault, we know the application is accessing
664
 *        corrupted data and shall be killed (we installed simple
665
 *        interception code in do_swap_page to catch it).
666
 *
667
 * Clean swap cache pages can be directly isolated. A later page fault will
668
 * bring in the known good data from disk.
669
 */
670
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
671
{
672
	ClearPageDirty(p);
673
	/* Trigger EIO in shmem: */
674
	ClearPageUptodate(p);
675

676
	if (!delete_from_lru_cache(p))
677
		return DELAYED;
678
	else
679
		return FAILED;
680
}
681

682
static int me_swapcache_clean(struct page *p, unsigned long pfn)
683
{
684
	delete_from_swap_cache(p);
685

686
	if (!delete_from_lru_cache(p))
687
		return RECOVERED;
688
	else
689
		return FAILED;
690
}
691

692
/*
693
 * Huge pages. Needs work.
694
 * Issues:
695
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
696
 *   To narrow down kill region to one page, we need to break up pmd.
697
 */
698
static int me_huge_page(struct page *p, unsigned long pfn)
699
{
700
	int res = 0;
701
	struct page *hpage = compound_head(p);
702
	/*
703
	 * We can safely recover from error on free or reserved (i.e.
704
	 * not in-use) hugepage by dequeuing it from freelist.
705
	 * To check whether a hugepage is in-use or not, we can't use
706
	 * page->lru because it can be used in other hugepage operations,
707
	 * such as __unmap_hugepage_range() and gather_surplus_pages().
708
	 * So instead we use page_mapping() and PageAnon().
709
	 * We assume that this function is called with page lock held,
710
	 * so there is no race between isolation and mapping/unmapping.
711
	 */
712
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
713
		res = dequeue_hwpoisoned_huge_page(hpage);
714
		if (!res)
715
			return RECOVERED;
716
	}
717
	return DELAYED;
718
}
719

720
/*
721
 * Various page states we can handle.
722
 *
723
 * A page state is defined by its current page->flags bits.
724
 * The table matches them in order and calls the right handler.
725
 *
726
 * This is quite tricky because we can access page at any time
727
 * in its live cycle, so all accesses have to be extremely careful.
728
 *
729
 * This is not complete. More states could be added.
730
 * For any missing state don't attempt recovery.
731
 */
732

733
#define dirty		(1UL << PG_dirty)
734
#define sc		(1UL << PG_swapcache)
735
#define unevict		(1UL << PG_unevictable)
736
#define mlock		(1UL << PG_mlocked)
737
#define writeback	(1UL << PG_writeback)
738
#define lru		(1UL << PG_lru)
739
#define swapbacked	(1UL << PG_swapbacked)
740
#define head		(1UL << PG_head)
741
#define tail		(1UL << PG_tail)
742
#define compound	(1UL << PG_compound)
743
#define slab		(1UL << PG_slab)
744
#define reserved	(1UL << PG_reserved)
745

746
static struct page_state {
747
	unsigned long mask;
748
	unsigned long res;
749
	char *msg;
750
	int (*action)(struct page *p, unsigned long pfn);
751
} error_states[] = {
752
	{ reserved,	reserved,	"reserved kernel",	me_kernel },
753
	/*
754
	 * free pages are specially detected outside this table:
755
	 * PG_buddy pages only make a small fraction of all free pages.
756
	 */
757

758
	/*
759
	 * Could in theory check if slab page is free or if we can drop
760
	 * currently unused objects without touching them. But just
761
	 * treat it as standard kernel for now.
762
	 */
763
	{ slab,		slab,		"kernel slab",	me_kernel },
764

765
#ifdef CONFIG_PAGEFLAGS_EXTENDED
766
	{ head,		head,		"huge",		me_huge_page },
767
	{ tail,		tail,		"huge",		me_huge_page },
768
#else
769
	{ compound,	compound,	"huge",		me_huge_page },
770
#endif
771

772
	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty },
773
	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean },
774

775
	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty},
776
	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean},
777

778
	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty },
779
	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean },
780

781
	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
782
	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
783

784
	/*
785
	 * Catchall entry: must be at end.
786
	 */
787
	{ 0,		0,		"unknown page state",	me_unknown },
788
};
789

790
#undef dirty
791
#undef sc
792
#undef unevict
793
#undef mlock
794
#undef writeback
795
#undef lru
796
#undef swapbacked
797
#undef head
798
#undef tail
799
#undef compound
800
#undef slab
801
#undef reserved
802

803
static void action_result(unsigned long pfn, char *msg, int result)
804
{
805
	struct page *page = pfn_to_page(pfn);
806

807
	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
808
		pfn,
809
		PageDirty(page) ? "dirty " : "",
810
		msg, action_name[result]);
811
}
812

813
static int page_action(struct page_state *ps, struct page *p,
814
			unsigned long pfn)
815
{
816
	int result;
817
	int count;
818

819
	result = ps->action(p, pfn);
820
	action_result(pfn, ps->msg, result);
821

822
	count = page_count(p) - 1;
823
	if (ps->action == me_swapcache_dirty && result == DELAYED)
824
		count--;
825
	if (count != 0) {
826
		printk(KERN_ERR
827
		       "MCE %#lx: %s page still referenced by %d users\n",
828
		       pfn, ps->msg, count);
829
		result = FAILED;
830
	}
831

832
	/* Could do more checks here if page looks ok */
833
	/*
834
	 * Could adjust zone counters here to correct for the missing page.
835
	 */
836

837
	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
838
}
839

840
/*
841
 * Do all that is necessary to remove user space mappings. Unmap
842
 * the pages and send SIGBUS to the processes if the data was dirty.
843
 */
844
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
845
				  int trapno)
846
{
847
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
848
	struct address_space *mapping;
849
	LIST_HEAD(tokill);
850
	int ret;
851
	int kill = 1;
852
	struct page *hpage = compound_head(p);
853
	struct page *ppage;
854

855
	if (PageReserved(p) || PageSlab(p))
856
		return SWAP_SUCCESS;
857

858
	/*
859
	 * This check implies we don't kill processes if their pages
860
	 * are in the swap cache early. Those are always late kills.
861
	 */
862
	if (!page_mapped(hpage))
863
		return SWAP_SUCCESS;
864

865
	if (PageKsm(p))
866
		return SWAP_FAIL;
867

868
	if (PageSwapCache(p)) {
869
		printk(KERN_ERR
870
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
871
		ttu |= TTU_IGNORE_HWPOISON;
872
	}
873

874
	/*
875
	 * Propagate the dirty bit from PTEs to struct page first, because we
876
	 * need this to decide if we should kill or just drop the page.
877
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
878
	 * be called inside page lock (it's recommended but not enforced).
879
	 */
880
	mapping = page_mapping(hpage);
881
	if (!PageDirty(hpage) && mapping &&
882
	    mapping_cap_writeback_dirty(mapping)) {
883
		if (page_mkclean(hpage)) {
884
			SetPageDirty(hpage);
885
		} else {
886
			kill = 0;
887
			ttu |= TTU_IGNORE_HWPOISON;
888
			printk(KERN_INFO
889
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
890
				pfn);
891
		}
892
	}
893

894
	/*
895
	 * ppage: poisoned page
896
	 *   if p is regular page(4k page)
897
	 *        ppage == real poisoned page;
898
	 *   else p is hugetlb or THP, ppage == head page.
899
	 */
900
	ppage = hpage;
901

902
	if (PageTransHuge(hpage)) {
903
		/*
904
		 * Verify that this isn't a hugetlbfs head page, the check for
905
		 * PageAnon is just for avoid tripping a split_huge_page
906
		 * internal debug check, as split_huge_page refuses to deal with
907
		 * anything that isn't an anon page. PageAnon can't go away fro
908
		 * under us because we hold a refcount on the hpage, without a
909
		 * refcount on the hpage. split_huge_page can't be safely called
910
		 * in the first place, having a refcount on the tail isn't
911
		 * enough * to be safe.
912
		 */
913
		if (!PageHuge(hpage) && PageAnon(hpage)) {
914
			if (unlikely(split_huge_page(hpage))) {
915
				/*
916
				 * FIXME: if splitting THP is failed, it is
917
				 * better to stop the following operation rather
918
				 * than causing panic by unmapping. System might
919
				 * survive if the page is freed later.
920
				 */
921
				printk(KERN_INFO
922
					"MCE %#lx: failed to split THP\n", pfn);
923

924
				BUG_ON(!PageHWPoison(p));
925
				return SWAP_FAIL;
926
			}
927
			/* THP is split, so ppage should be the real poisoned page. */
928
			ppage = p;
929
		}
930
	}
931

932
	/*
933
	 * First collect all the processes that have the page
934
	 * mapped in dirty form.  This has to be done before try_to_unmap,
935
	 * because ttu takes the rmap data structures down.
936
	 *
937
	 * Error handling: We ignore errors here because
938
	 * there's nothing that can be done.
939
	 */
940
	if (kill)
941
		collect_procs(ppage, &tokill);
942

943
	if (hpage != ppage)
944
		lock_page(ppage);
945

946
	ret = try_to_unmap(ppage, ttu);
947
	if (ret != SWAP_SUCCESS)
948
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
949
				pfn, page_mapcount(ppage));
950

951
	if (hpage != ppage)
952
		unlock_page(ppage);
953

954
	/*
955
	 * Now that the dirty bit has been propagated to the
956
	 * struct page and all unmaps done we can decide if
957
	 * killing is needed or not.  Only kill when the page
958
	 * was dirty, otherwise the tokill list is merely
959
	 * freed.  When there was a problem unmapping earlier
960
	 * use a more force-full uncatchable kill to prevent
961
	 * any accesses to the poisoned memory.
962
	 */
963
	kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
964
		      ret != SWAP_SUCCESS, p, pfn);
965

966
	return ret;
967
}
968

969
static void set_page_hwpoison_huge_page(struct page *hpage)
970
{
971
	int i;
972
	int nr_pages = 1 << compound_trans_order(hpage);
973
	for (i = 0; i < nr_pages; i++)
974
		SetPageHWPoison(hpage + i);
975
}
976

977
static void clear_page_hwpoison_huge_page(struct page *hpage)
978
{
979
	int i;
980
	int nr_pages = 1 << compound_trans_order(hpage);
981
	for (i = 0; i < nr_pages; i++)
982
		ClearPageHWPoison(hpage + i);
983
}
984

985
int __memory_failure(unsigned long pfn, int trapno, int flags)
986
{
987
	struct page_state *ps;
988
	struct page *p;
989
	struct page *hpage;
990
	int res;
991
	unsigned int nr_pages;
992

993
	if (!sysctl_memory_failure_recovery)
994
		panic("Memory failure from trap %d on page %lx", trapno, pfn);
995

996
	if (!pfn_valid(pfn)) {
997
		printk(KERN_ERR
998
		       "MCE %#lx: memory outside kernel control\n",
999
		       pfn);
1000
		return -ENXIO;
1001
	}
1002

1003
	p = pfn_to_page(pfn);
1004
	hpage = compound_head(p);
1005
	if (TestSetPageHWPoison(p)) {
1006
		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1007
		return 0;
1008
	}
1009

1010
	nr_pages = 1 << compound_trans_order(hpage);
1011
	atomic_long_add(nr_pages, &mce_bad_pages);
1012

1013
	/*
1014
	 * We need/can do nothing about count=0 pages.
1015
	 * 1) it's a free page, and therefore in safe hand:
1016
	 *    prep_new_page() will be the gate keeper.
1017
	 * 2) it's a free hugepage, which is also safe:
1018
	 *    an affected hugepage will be dequeued from hugepage freelist,
1019
	 *    so there's no concern about reusing it ever after.
1020
	 * 3) it's part of a non-compound high order page.
1021
	 *    Implies some kernel user: cannot stop them from
1022
	 *    R/W the page; let's pray that the page has been
1023
	 *    used and will be freed some time later.
1024
	 * In fact it's dangerous to directly bump up page count from 0,
1025
	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
1026
	 */
1027
	if (!(flags & MF_COUNT_INCREASED) &&
1028
		!get_page_unless_zero(hpage)) {
1029
		if (is_free_buddy_page(p)) {
1030
			action_result(pfn, "free buddy", DELAYED);
1031
			return 0;
1032
		} else if (PageHuge(hpage)) {
1033
			/*
1034
			 * Check "just unpoisoned", "filter hit", and
1035
			 * "race with other subpage."
1036
			 */
1037
			lock_page(hpage);
1038
			if (!PageHWPoison(hpage)
1039
			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1040
			    || (p != hpage && TestSetPageHWPoison(hpage))) {
1041
				atomic_long_sub(nr_pages, &mce_bad_pages);
1042
				return 0;
1043
			}
1044
			set_page_hwpoison_huge_page(hpage);
1045
			res = dequeue_hwpoisoned_huge_page(hpage);
1046
			action_result(pfn, "free huge",
1047
				      res ? IGNORED : DELAYED);
1048
			unlock_page(hpage);
1049
			return res;
1050
		} else {
1051
			action_result(pfn, "high order kernel", IGNORED);
1052
			return -EBUSY;
1053
		}
1054
	}
1055

1056
	/*
1057
	 * We ignore non-LRU pages for good reasons.
1058
	 * - PG_locked is only well defined for LRU pages and a few others
1059
	 * - to avoid races with __set_page_locked()
1060
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
1061
	 * The check (unnecessarily) ignores LRU pages being isolated and
1062
	 * walked by the page reclaim code, however that's not a big loss.
1063
	 */
1064
	if (!PageHuge(p) && !PageTransCompound(p)) {
1065
		if (!PageLRU(p))
1066
			shake_page(p, 0);
1067
		if (!PageLRU(p)) {
1068
			/*
1069
			 * shake_page could have turned it free.
1070
			 */
1071
			if (is_free_buddy_page(p)) {
1072
				action_result(pfn, "free buddy, 2nd try",
1073
						DELAYED);
1074
				return 0;
1075
			}
1076
			action_result(pfn, "non LRU", IGNORED);
1077
			put_page(p);
1078
			return -EBUSY;
1079
		}
1080
	}
1081

1082
	/*
1083
	 * Lock the page and wait for writeback to finish.
1084
	 * It's very difficult to mess with pages currently under IO
1085
	 * and in many cases impossible, so we just avoid it here.
1086
	 */
1087
	lock_page(hpage);
1088

1089
	/*
1090
	 * unpoison always clear PG_hwpoison inside page lock
1091
	 */
1092
	if (!PageHWPoison(p)) {
1093
		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1094
		res = 0;
1095
		goto out;
1096
	}
1097
	if (hwpoison_filter(p)) {
1098
		if (TestClearPageHWPoison(p))
1099
			atomic_long_sub(nr_pages, &mce_bad_pages);
1100
		unlock_page(hpage);
1101
		put_page(hpage);
1102
		return 0;
1103
	}
1104

1105
	/*
1106
	 * For error on the tail page, we should set PG_hwpoison
1107
	 * on the head page to show that the hugepage is hwpoisoned
1108
	 */
1109
	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1110
		action_result(pfn, "hugepage already hardware poisoned",
1111
				IGNORED);
1112
		unlock_page(hpage);
1113
		put_page(hpage);
1114
		return 0;
1115
	}
1116
	/*
1117
	 * Set PG_hwpoison on all pages in an error hugepage,
1118
	 * because containment is done in hugepage unit for now.
1119
	 * Since we have done TestSetPageHWPoison() for the head page with
1120
	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1121
	 */
1122
	if (PageHuge(p))
1123
		set_page_hwpoison_huge_page(hpage);
1124

1125
	wait_on_page_writeback(p);
1126

1127
	/*
1128
	 * Now take care of user space mappings.
1129
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1130
	 */
1131
	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1132
		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1133
		res = -EBUSY;
1134
		goto out;
1135
	}
1136

1137
	/*
1138
	 * Torn down by someone else?
1139
	 */
1140
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1141
		action_result(pfn, "already truncated LRU", IGNORED);
1142
		res = -EBUSY;
1143
		goto out;
1144
	}
1145

1146
	res = -EBUSY;
1147
	for (ps = error_states;; ps++) {
1148
		if ((p->flags & ps->mask) == ps->res) {
1149
			res = page_action(ps, p, pfn);
1150
			break;
1151
		}
1152
	}
1153
out:
1154
	unlock_page(hpage);
1155
	return res;
1156
}
1157
EXPORT_SYMBOL_GPL(__memory_failure);
1158

1159
/**
1160
 * memory_failure - Handle memory failure of a page.
1161
 * @pfn: Page Number of the corrupted page
1162
 * @trapno: Trap number reported in the signal to user space.
1163
 *
1164
 * This function is called by the low level machine check code
1165
 * of an architecture when it detects hardware memory corruption
1166
 * of a page. It tries its best to recover, which includes
1167
 * dropping pages, killing processes etc.
1168
 *
1169
 * The function is primarily of use for corruptions that
1170
 * happen outside the current execution context (e.g. when
1171
 * detected by a background scrubber)
1172
 *
1173
 * Must run in process context (e.g. a work queue) with interrupts
1174
 * enabled and no spinlocks hold.
1175
 */
1176
void memory_failure(unsigned long pfn, int trapno)
1177
{
1178
	__memory_failure(pfn, trapno, 0);
1179
}
1180

1181
/**
1182
 * unpoison_memory - Unpoison a previously poisoned page
1183
 * @pfn: Page number of the to be unpoisoned page
1184
 *
1185
 * Software-unpoison a page that has been poisoned by
1186
 * memory_failure() earlier.
1187
 *
1188
 * This is only done on the software-level, so it only works
1189
 * for linux injected failures, not real hardware failures
1190
 *
1191
 * Returns 0 for success, otherwise -errno.
1192
 */
1193
int unpoison_memory(unsigned long pfn)
1194
{
1195
	struct page *page;
1196
	struct page *p;
1197
	int freeit = 0;
1198
	unsigned int nr_pages;
1199

1200
	if (!pfn_valid(pfn))
1201
		return -ENXIO;
1202

1203
	p = pfn_to_page(pfn);
1204
	page = compound_head(p);
1205

1206
	if (!PageHWPoison(p)) {
1207
		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1208
		return 0;
1209
	}
1210

1211
	nr_pages = 1 << compound_trans_order(page);
1212

1213
	if (!get_page_unless_zero(page)) {
1214
		/*
1215
		 * Since HWPoisoned hugepage should have non-zero refcount,
1216
		 * race between memory failure and unpoison seems to happen.
1217
		 * In such case unpoison fails and memory failure runs
1218
		 * to the end.
1219
		 */
1220
		if (PageHuge(page)) {
1221
			pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1222
			return 0;
1223
		}
1224
		if (TestClearPageHWPoison(p))
1225
			atomic_long_sub(nr_pages, &mce_bad_pages);
1226
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1227
		return 0;
1228
	}
1229

1230
	lock_page(page);
1231
	/*
1232
	 * This test is racy because PG_hwpoison is set outside of page lock.
1233
	 * That's acceptable because that won't trigger kernel panic. Instead,
1234
	 * the PG_hwpoison page will be caught and isolated on the entrance to
1235
	 * the free buddy page pool.
1236
	 */
1237
	if (TestClearPageHWPoison(page)) {
1238
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1239
		atomic_long_sub(nr_pages, &mce_bad_pages);
1240
		freeit = 1;
1241
		if (PageHuge(page))
1242
			clear_page_hwpoison_huge_page(page);
1243
	}
1244
	unlock_page(page);
1245

1246
	put_page(page);
1247
	if (freeit)
1248
		put_page(page);
1249

1250
	return 0;
1251
}
1252
EXPORT_SYMBOL(unpoison_memory);
1253

1254
static struct page *new_page(struct page *p, unsigned long private, int **x)
1255
{
1256
	int nid = page_to_nid(p);
1257
	if (PageHuge(p))
1258
		return alloc_huge_page_node(page_hstate(compound_head(p)),
1259
						   nid);
1260
	else
1261
		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1262
}
1263

1264
/*
1265
 * Safely get reference count of an arbitrary page.
1266
 * Returns 0 for a free page, -EIO for a zero refcount page
1267
 * that is not free, and 1 for any other page type.
1268
 * For 1 the page is returned with increased page count, otherwise not.
1269
 */
1270
static int get_any_page(struct page *p, unsigned long pfn, int flags)
1271
{
1272
	int ret;
1273

1274
	if (flags & MF_COUNT_INCREASED)
1275
		return 1;
1276

1277
	/*
1278
	 * The lock_memory_hotplug prevents a race with memory hotplug.
1279
	 * This is a big hammer, a better would be nicer.
1280
	 */
1281
	lock_memory_hotplug();
1282

1283
	/*
1284
	 * Isolate the page, so that it doesn't get reallocated if it
1285
	 * was free.
1286
	 */
1287
	set_migratetype_isolate(p);
1288
	/*
1289
	 * When the target page is a free hugepage, just remove it
1290
	 * from free hugepage list.
1291
	 */
1292
	if (!get_page_unless_zero(compound_head(p))) {
1293
		if (PageHuge(p)) {
1294
			pr_info("get_any_page: %#lx free huge page\n", pfn);
1295
			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1296
		} else if (is_free_buddy_page(p)) {
1297
			pr_info("get_any_page: %#lx free buddy page\n", pfn);
1298
			/* Set hwpoison bit while page is still isolated */
1299
			SetPageHWPoison(p);
1300
			ret = 0;
1301
		} else {
1302
			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1303
				pfn, p->flags);
1304
			ret = -EIO;
1305
		}
1306
	} else {
1307
		/* Not a free page */
1308
		ret = 1;
1309
	}
1310
	unset_migratetype_isolate(p);
1311
	unlock_memory_hotplug();
1312
	return ret;
1313
}
1314

1315
static int soft_offline_huge_page(struct page *page, int flags)
1316
{
1317
	int ret;
1318
	unsigned long pfn = page_to_pfn(page);
1319
	struct page *hpage = compound_head(page);
1320
	LIST_HEAD(pagelist);
1321

1322
	ret = get_any_page(page, pfn, flags);
1323
	if (ret < 0)
1324
		return ret;
1325
	if (ret == 0)
1326
		goto done;
1327

1328
	if (PageHWPoison(hpage)) {
1329
		put_page(hpage);
1330
		pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1331
		return -EBUSY;
1332
	}
1333

1334
	/* Keep page count to indicate a given hugepage is isolated. */
1335

1336
	list_add(&hpage->lru, &pagelist);
1337
	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1338
				true);
1339
	if (ret) {
1340
		struct page *page1, *page2;
1341
		list_for_each_entry_safe(page1, page2, &pagelist, lru)
1342
			put_page(page1);
1343

1344
		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1345
			 pfn, ret, page->flags);
1346
		if (ret > 0)
1347
			ret = -EIO;
1348
		return ret;
1349
	}
1350
done:
1351
	if (!PageHWPoison(hpage))
1352
		atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1353
	set_page_hwpoison_huge_page(hpage);
1354
	dequeue_hwpoisoned_huge_page(hpage);
1355
	/* keep elevated page count for bad page */
1356
	return ret;
1357
}
1358

1359
/**
1360
 * soft_offline_page - Soft offline a page.
1361
 * @page: page to offline
1362
 * @flags: flags. Same as memory_failure().
1363
 *
1364
 * Returns 0 on success, otherwise negated errno.
1365
 *
1366
 * Soft offline a page, by migration or invalidation,
1367
 * without killing anything. This is for the case when
1368
 * a page is not corrupted yet (so it's still valid to access),
1369
 * but has had a number of corrected errors and is better taken
1370
 * out.
1371
 *
1372
 * The actual policy on when to do that is maintained by
1373
 * user space.
1374
 *
1375
 * This should never impact any application or cause data loss,
1376
 * however it might take some time.
1377
 *
1378
 * This is not a 100% solution for all memory, but tries to be
1379
 * ``good enough'' for the majority of memory.
1380
 */
1381
int soft_offline_page(struct page *page, int flags)
1382
{
1383
	int ret;
1384
	unsigned long pfn = page_to_pfn(page);
1385

1386
	if (PageHuge(page))
1387
		return soft_offline_huge_page(page, flags);
1388

1389
	ret = get_any_page(page, pfn, flags);
1390
	if (ret < 0)
1391
		return ret;
1392
	if (ret == 0)
1393
		goto done;
1394

1395
	/*
1396
	 * Page cache page we can handle?
1397
	 */
1398
	if (!PageLRU(page)) {
1399
		/*
1400
		 * Try to free it.
1401
		 */
1402
		put_page(page);
1403
		shake_page(page, 1);
1404

1405
		/*
1406
		 * Did it turn free?
1407
		 */
1408
		ret = get_any_page(page, pfn, 0);
1409
		if (ret < 0)
1410
			return ret;
1411
		if (ret == 0)
1412
			goto done;
1413
	}
1414
	if (!PageLRU(page)) {
1415
		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1416
				pfn, page->flags);
1417
		return -EIO;
1418
	}
1419

1420
	lock_page(page);
1421
	wait_on_page_writeback(page);
1422

1423
	/*
1424
	 * Synchronized using the page lock with memory_failure()
1425
	 */
1426
	if (PageHWPoison(page)) {
1427
		unlock_page(page);
1428
		put_page(page);
1429
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
1430
		return -EBUSY;
1431
	}
1432

1433
	/*
1434
	 * Try to invalidate first. This should work for
1435
	 * non dirty unmapped page cache pages.
1436
	 */
1437
	ret = invalidate_inode_page(page);
1438
	unlock_page(page);
1439
	/*
1440
	 * RED-PEN would be better to keep it isolated here, but we
1441
	 * would need to fix isolation locking first.
1442
	 */
1443
	if (ret == 1) {
1444
		put_page(page);
1445
		ret = 0;
1446
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
1447
		goto done;
1448
	}
1449

1450
	/*
1451
	 * Simple invalidation didn't work.
1452
	 * Try to migrate to a new page instead. migrate.c
1453
	 * handles a large number of cases for us.
1454
	 */
1455
	ret = isolate_lru_page(page);
1456
	/*
1457
	 * Drop page reference which is came from get_any_page()
1458
	 * successful isolate_lru_page() already took another one.
1459
	 */
1460
	put_page(page);
1461
	if (!ret) {
1462
		LIST_HEAD(pagelist);
1463
		inc_zone_page_state(page, NR_ISOLATED_ANON +
1464
					    page_is_file_cache(page));
1465
		list_add(&page->lru, &pagelist);
1466
		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1467
								0, true);
1468
		if (ret) {
1469
			putback_lru_pages(&pagelist);
1470
			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1471
				pfn, ret, page->flags);
1472
			if (ret > 0)
1473
				ret = -EIO;
1474
		}
1475
	} else {
1476
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1477
				pfn, ret, page_count(page), page->flags);
1478
	}
1479
	if (ret)
1480
		return ret;
1481

1482
done:
1483
	atomic_long_add(1, &mce_bad_pages);
1484
	SetPageHWPoison(page);
1485
	/* keep elevated page count for bad page */
1486
	return ret;
1487
}
1488

1489
Product

Resources

Company