GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/mm/fault.c
/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/magic.h>	/* STACK_END_MAGIC */
#include <linux/sched.h>	/* test_thread_flag(), ... */
#include <linux/kdebug.h>	/* oops_begin/end, ... */
#include <linux/module.h>	/* search_exception_table */
#include <linux/bootmem.h>	/* max_low_pfn */
#include <linux/kprobes.h>	/* __kprobes, ... */
#include <linux/mmiotrace.h>	/* kmmio_handler, ... */
#include <linux/perf_event.h>	/* perf_sw_event */
#include <linux/hugetlb.h>	/* hstate_index_to_shift */
#include <linux/prefetch.h>	/* prefetchw */

#include <asm/traps.h>		/* dotraplinkage, ... */
#include <asm/pgalloc.h>	/* pgd_*(), ... */
#include <asm/kmemcheck.h>	/* kmemcheck_*(), ... */

/*
 * Page fault error code bits:
 *
 *   bit 0 ==	0: no page found	1: protection fault
 *   bit 1 ==	0: read access		1: write access
 *   bit 2 ==	0: kernel-mode access	1: user-mode access
 *   bit 3 ==				1: use of reserved bit detected
 *   bit 4 ==				1: fault was an instruction fetch
 */
enum x86_pf_error_code {

	PF_PROT		= 1 << 0,
	PF_WRITE	= 1 << 1,
	PF_USER		= 1 << 2,
	PF_RSVD		= 1 << 3,
	PF_INSTR	= 1 << 4,
};

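/*
 * Worked example (illustration only, not part of the original file):
 * the bits combine, so a user-mode write to an unmapped address is
 * reported with error_code == (PF_USER | PF_WRITE) == 0x6, a user-mode
 * write to a present but read-only page with
 * (PF_PROT | PF_WRITE | PF_USER) == 0x7, and a kernel-mode instruction
 * fetch from an NX-protected page with (PF_PROT | PF_INSTR) == 0x11.
 */
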
/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static inline int __kprobes
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

static inline int __kprobes notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (kprobes_built_in() && !user_mode_vm(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
		 * Need to figure out under what instruction mode the
		 * instruction was issued. Could check the LDT for lm,
		 * but for now it's good enough to assume that long
		 * mode only uses well known segments or kernel.
		 */
		return (!user_mode(regs)) || (regs->cs == __USER_CS);
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/*
	 * If it was an exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (instr < max_instr) {
		unsigned char opcode;

		if (probe_kernel_address(instr, opcode))
			break;

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}
	return prefetch;
}

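/*
 * Illustration (hypothetical faulting instruction, not from this file):
 * the prefetch opcodes are 0x0F 0x0D and 0x0F 0x18, possibly preceded
 * by segment, operand-size or REX prefixes. For a faulting
 * "prefetchnta (%rax)" encoded as 0x0F 0x18 0x00, the loop above reads
 * 0x0F (instr_hi == 0x00, instr_lo == 0xF), check_prefetch_opcode()
 * then probes the following byte, finds 0x18 and sets *prefetch, so
 * the bogus fault is silently ignored.
 */
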
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
		     struct task_struct *tsk, int fault)
{
	unsigned lsb = 0;
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	if (fault & VM_FAULT_HWPOISON_LARGE)
		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
	if (fault & VM_FAULT_HWPOISON)
		lsb = PAGE_SHIFT;
	info.si_addr_lsb = lsb;

	force_sig_info(si_signo, &info, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */
	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;

	if (!pmd_present(*pmd))
		set_pmd(pmd, *pmd_k);
	else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

	return pmd_k;
}

void vmalloc_sync_all(void)
{
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE && address < FIXADDR_TOP;
	     address += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;
			pmd_t *ret;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			ret = vmalloc_sync_one(page_address(page), address);
			spin_unlock(pgt_lock);

			if (!ret)
				break;
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * 32-bit:
 *
 *   Handle a fault on the vmalloc or module mapping area
 */
static noinline __kprobes int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	WARN_ON_ONCE(in_nmi());

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}

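/*
 * Sketch of the scenario handled above (assumed flow, for illustration):
 * vmalloc() installs new mappings only in the reference page table,
 * init_mm.pgd. A task whose private PGD predates that allocation faults
 * on its first access to the new vmalloc address, and vmalloc_fault()
 * copies the missing PMD entry over via vmalloc_sync_one() instead of
 * treating the access as an error.
 */
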
/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
	unsigned long bit;

	if (!v8086_mode(regs))
		return;

	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		tsk->thread.screen_bitmap |= 1 << bit;
}

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3());
	pgd_t *pgd = &base[pgd_index(address)];
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#endif
	pmd = pmd_offset(pud_offset(pgd, address), address);
	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	printk("\n");
}

#else /* CONFIG_X86_64: */

void vmalloc_sync_all(void)
{
	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}

/*
 * 64-bit:
 *
 *   Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline __kprobes int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	WARN_ON_ONCE(in_nmi());

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the latter
	 * case just flush:
	 */
	pgd = pgd_offset(current->active_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;

	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared:
	 */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;

	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();

	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;

	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();

	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;

	pte = pte_offset_kernel(pmd, address);

	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that:
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();

	return 0;
}

static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd_t *pgd = base + pgd_index(address);
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	printk("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (bad_address(pud))
		goto bad;

	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	printk("PTE %lx", pte_val(*pte));
out:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOSes that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

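/*
 * Illustration with a made-up address (not from this file): under
 * erratum #93 a branch to a kernel-text address such as
 * 0xffffffff81234567 can fault with the upper half of RIP dropped, so
 * regs->ip == address == 0x81234567. is_errata93() ORs
 * 0xffffffff00000000 back in, confirms the result lies in kernel text
 * or module space, patches regs->ip and resumes instead of oopsing.
 */
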
/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;

	/*
	 * Pentium F0 0F C7 C8 bug workaround:
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & PF_INSTR) {
		unsigned int level;

		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(nx_warning, current_uid());
	}

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");

	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);

	dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address)
{
	struct task_struct *tsk = current;
	unsigned long *stackend;
	unsigned long flags;
	int sig;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * 32-bit:
	 *
	 *   Valid to do another page fault here, because if this fault
	 *   had been triggered by is_prefetch fixup_exception would have
	 *   handled it.
	 *
	 * 64-bit:
	 *
	 *   Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	stackend = end_of_stack(tsk);
	if (tsk != &init_task && *stackend != STACK_END_MAGIC)
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
		tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");
}

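/*
 * Example of the resulting log line (all values invented for
 * illustration):
 *
 *   myprog[1234]: segfault at 0 ip 00000000004004f6 sp 00007fffc0d0e2a0 error 6 in myprog[400000+1000]
 *
 * where "error 6" is the PF_* encoding from the top of this file
 * (PF_USER | PF_WRITE) and the trailing "in myprog[...]" part comes
 * from print_vma_addr().
 */
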
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here:
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space:
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

		if (unlikely(show_unhandled_signals))
			show_signal_msg(regs, error_code, address, tsk);

		/* Kernel addresses are always protection faults: */
		tsk->thread.cr2 = address;
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);

		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void
out_of_memory(struct pt_regs *regs, unsigned long error_code,
	      unsigned long address)
{
	/*
	 * We ran out of memory, call the OOM killer, and return to userspace
	 * (which will retry the fault, or kill us if we got oom-killed):
	 */
	up_read(&current->mm->mmap_sem);

	pagefault_out_of_memory();
}

static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  unsigned int fault)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int code = BUS_ADRERR;

	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die: */
	if (!(error_code & PF_USER)) {
		no_context(regs, error_code, address);
		return;
	}

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		printk(KERN_ERR
		       "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
		       tsk->comm, tsk->pid, address);
		code = BUS_MCEERR_AR;
	}
#endif
	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}

static noinline int
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
	       unsigned long address, unsigned int fault)
{
	/*
	 * Pagefault was interrupted by SIGKILL. We have no reason to
	 * continue pagefault.
	 */
	if (fatal_signal_pending(current)) {
		if (!(fault & VM_FAULT_RETRY))
			up_read(&current->mm->mmap_sem);
		if (!(error_code & PF_USER))
			no_context(regs, error_code, address);
		return 1;
	}
	if (!(fault & VM_FAULT_ERROR))
		return 0;

	if (fault & VM_FAULT_OOM) {
		/* Kernel mode? Handle exceptions or die: */
		if (!(error_code & PF_USER)) {
			up_read(&current->mm->mmap_sem);
			no_context(regs, error_code, address);
			return 1;
		}

		out_of_memory(regs, error_code, address);
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
			     VM_FAULT_HWPOISON_LARGE))
			do_sigbus(regs, error_code, address, fault);
		else
			BUG();
	}
	return 1;
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline __kprobes int
spurious_fault(unsigned long error_code, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	/*
	 * Note: don't use pte_present() here, since it returns true
	 * if the _PAGE_PROTNONE bit is set. However, this aliases the
	 * _PAGE_GLOBAL bit, which for kernel pages gives false positives
	 * when CONFIG_DEBUG_PAGEALLOC is used.
	 */
	pte = pte_offset_kernel(pmd, address);
	if (!(pte_flags(*pte) & _PAGE_PRESENT))
		return 0;

	ret = spurious_fault_check(error_code, pte);
	if (!ret)
		return 0;

	/*
	 * Make sure we have permissions in PMD.
	 * If not, then there's a bug in the page tables:
	 */
	ret = spurious_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

	return ret;
}

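/*
 * Example sequence (assumed, for illustration): CPU 0 makes a read-only
 * kernel page writable and updates the PTE, but sends no global TLB
 * flush. CPU 1 still holds the stale read-only translation and faults
 * on its next write; spurious_fault() walks init_mm's tables, sees that
 * the current PTE already permits the write and returns 1, so the
 * access is simply retried and picks up the updated entry.
 */
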
int show_unhandled_signals = 1;

static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
	if (error_code & PF_WRITE) {
		/* write, present and write, not present: */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	/* read, present: */
	if (unlikely(error_code & PF_PROT))
		return 1;

	/* read, not present: */
	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
		return 1;

	return 0;
}

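/*
 * Decision table implemented by access_error() above (restated for
 * illustration):
 *
 *   fault type            vma->vm_flags                result
 *   write                 VM_WRITE clear               access error
 *   write                 VM_WRITE set                 ok
 *   read, PF_PROT set     (any)                        access error
 *   read, PF_PROT clear   no READ/EXEC/WRITE           access error
 *   read, PF_PROT clear   READ, EXEC or WRITE set      ok
 */
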
static int fault_in_kernel_space(unsigned long address)
{
	return address >= TASK_SIZE_MAX;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	unsigned long address;
	struct mm_struct *mm;
	int fault;
	int write = error_code & PF_WRITE;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
				(write ? FAULT_FLAG_WRITE : 0);

	tsk = current;
	mm = tsk->mm;

	/* Get the faulting address: */
	address = read_cr2();

	/*
	 * Detect and handle instructions that would cause a page fault for
	 * both a tracked kernel page and a userspace page.
	 */
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);
	prefetchw(&mm->mmap_sem);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(fault_in_kernel_space(address))) {
		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
			if (vmalloc_fault(address) >= 0)
				return;

			if (kmemcheck_fault(regs, address, error_code))
				return;
		}

		/* Can handle a stale RO->RW TLB: */
		if (spurious_fault(error_code, address))
			return;

		/* kprobes don't want to hook the spurious faults: */
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock:
		 */
		bad_area_nosemaphore(regs, error_code, address);

		return;
	}

	/* kprobes don't want to hook the spurious faults: */
	if (unlikely(notify_page_fault(regs)))
		return;
	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode_vm(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in an atomic region then we must not take the fault:
	 */
	if (unlikely(in_atomic() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in
	 * the kernel and should generate an OOPS. Unfortunately, in the
	 * case of an erroneous fault occurring in a code path which already
	 * holds mmap_sem we will deadlock attempting to validate the fault
	 * against the address space. Luckily the kernel only validly
	 * references user space from well defined areas of code, which are
	 * listed in the exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a
	 * deadlock. Attempt to lock the address space, if we cannot we then
	 * validate the source. If this is invalid we can skip the address
	 * space check, thus avoiding the deadlock:
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
retry:
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535, $31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	if (unlikely(access_error(error_code, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault:
	 */
	fault = handle_mm_fault(mm, vma, address, flags);

	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
		if (mm_fault_error(regs, error_code, address, fault))
			return;
	}

	/*
	 * Major/minor page fault accounting is only done on the
	 * initial attempt. If we go through a retry, it is extremely
	 * likely that the page will be found in page cache at that point.
	 */
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR) {
			tsk->maj_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
				      regs, address);
		} else {
			tsk->min_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
				      regs, address);
		}
		if (fault & VM_FAULT_RETRY) {
			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
			 * of starvation. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			goto retry;
		}
	}

	check_v8086_mode(regs, address, tsk);

	up_read(&mm->mmap_sem);
}