GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/powerpc/mm/fault.c
/*
 *  PowerPC version
 *    Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Modified by Cort Dougan and Paul Mackerras.
 *
 *  Modified for PPC64 by Dave Engebretsen ([email protected])
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/perf_event.h>
#include <linux/magic.h>
#include <linux/ratelimit.h>

#include <asm/firmware.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/siginfo.h>
#include <mm/mmu_decl.h>

#ifdef CONFIG_KPROBES
static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 11))
			ret = 1;
		preempt_enable();
	}

	return ret;
}
#else
static inline int notify_page_fault(struct pt_regs *regs)
{
	return 0;
}
#endif

/*
 * Check whether the instruction at regs->nip is a store using
 * an update addressing form which will update r1.
 */
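/*
 * N.B. In a 32-bit PowerPC instruction word the primary opcode sits
 * in the top six bits (inst >> 26) and the rA field in bits 16-20
 * ((inst >> 16) & 0x1f).  For example, the common prologue store
 * "stwu r1,-16(r1)" encodes as 0x9421fff0: opcode 37, rA == 1.
 */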
static int store_updates_sp(struct pt_regs *regs)
{
	unsigned int inst;

	if (get_user(inst, (unsigned int __user *)regs->nip))
		return 0;
	/* check for 1 in the rA field */
	if (((inst >> 16) & 0x1f) != 1)
		return 0;
	/* check major opcode */
	switch (inst >> 26) {
	case 37:	/* stwu */
	case 39:	/* stbu */
	case 45:	/* sthu */
	case 53:	/* stfsu */
	case 55:	/* stfdu */
		return 1;
	case 62:	/* std or stdu */
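		/* DS-form: the low two bits of the word select the
		 * variant, and 01 is the update form (stdu). */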
		return (inst & 3) == 1;
	case 31:
		/* check minor opcode */
		switch ((inst >> 1) & 0x3ff) {
		case 181:	/* stdux */
		case 183:	/* stwux */
		case 247:	/* stbux */
		case 439:	/* sthux */
		case 695:	/* stfsux */
		case 759:	/* stfdux */
			return 1;
		}
	}
	return 0;
}

/*
 * For 600- and 800-family processors, the error_code parameter is DSISR
 * for a data fault, SRR1 for an instruction fault.  For 400-family
 * processors the error_code parameter is ESR for a data fault,
 * 0 for an instruction fault.
 * For 64-bit processors, the error_code parameter is
 *  - DSISR for a non-SLB data access fault,
 *  - SRR1 & 0x08000000 for a non-SLB instruction access fault,
 *  - 0 for any SLB fault.
 *
 * The return value is 0 if the fault was handled, or the signal
 * number if this is a kernel fault that can't be handled here.
 */
int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
			    unsigned long error_code)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	siginfo_t info;
	int code = SEGV_MAPERR;
	int is_write = 0, ret;
	int trap = TRAP(regs);
	int is_exec = trap == 0x400;

#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
	/*
	 * Fortunately the bit assignments in SRR1 for an instruction
	 * fault and DSISR for a data fault are mostly the same for the
	 * bits we are interested in.  But there are some bits which
	 * indicate errors in DSISR but can validly be set in SRR1.
	 */
	if (trap == 0x400)
		error_code &= 0x48200000;
	else
		is_write = error_code & DSISR_ISSTORE;
#else
	is_write = error_code & ESR_DST;
#endif /* CONFIG_4xx || CONFIG_BOOKE */

	if (notify_page_fault(regs))
		return 0;

	if (unlikely(debugger_fault_handler(regs)))
		return 0;

	/* On a kernel SLB miss we can only check for a valid exception entry */
	if (!user_mode(regs) && (address >= TASK_SIZE))
		return SIGSEGV;

#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
      defined(CONFIG_PPC_BOOK3S_64))
	if (error_code & DSISR_DABRMATCH) {
		/* DABR match */
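		/* The Data Address Breakpoint Register compared equal:
		 * a hardware watchpoint fired rather than a translation
		 * fault, so do_dabr() is expected to report it to the
		 * debugger and we are done here.
		 */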
		do_dabr(regs, address, error_code);
		return 0;
	}
#endif
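
	/*
	 * A fault from atomic context, or with no mm (e.g. from a kernel
	 * thread), cannot take mmap_sem or sleep.  A kernel-mode fault
	 * here has to be fixed up via the exception tables, which the
	 * SIGSEGV return path below takes care of; hitting this in user
	 * mode means the kernel itself is broken.
	 */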
	if (in_atomic() || mm == NULL) {
		if (!user_mode(regs))
			return SIGSEGV;
		/* in_atomic() in user mode is really bad,
		   as is current->mm == NULL. */
		printk(KERN_EMERG "Page fault in user mode with "
		       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
		printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
		       regs->nip, regs->msr);
		die("Weird page fault", regs, SIGSEGV);
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!user_mode(regs) && !search_exception_tables(regs->nip))
			goto bad_area_nosemaphore;

		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;

	/*
	 * N.B. The POWER/Open ABI allows programs to access up to
	 * 288 bytes below the stack pointer.
	 * The kernel signal delivery code writes up to about 1.5kB
	 * below the stack pointer (r1) before decrementing it.
	 * The exec code can write slightly over 640kB to the stack
	 * before setting the user r1.  Thus we allow the stack to
	 * expand to 1MB without further checks.
	 */
	if (address + 0x100000 < vma->vm_end) {
		/* get user regs even if this fault is in kernel mode */
		struct pt_regs *uregs = current->thread.regs;
		if (uregs == NULL)
			goto bad_area;

		/*
		 * A user-mode access to an address a long way below
		 * the stack pointer is only valid if the instruction
		 * is one which would update the stack pointer to the
		 * address accessed if the instruction completed,
		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
		 * (or the byte, halfword, float or double forms).
		 *
		 * If we don't check this then any write to the area
		 * between the last mapped region and the stack will
		 * expand the stack rather than segfaulting.
		 */
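		/* gpr[1] holds the user stack pointer (r1); accesses
		 * within 2048 bytes below it are always allowed, which
		 * comfortably covers the 288-byte red zone and the
		 * signal-frame writes described above.
		 */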
		if (address + 2048 < uregs->gpr[1]
		    && (!user_mode(regs) || !store_updates_sp(regs)))
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;

good_area:
	code = SEGV_ACCERR;
#if defined(CONFIG_6xx)
	if (error_code & 0x95700000)
		/* an error such as lwarx to I/O controller space,
		   address matching DABR, eciwx, etc. */
		goto bad_area;
#endif /* CONFIG_6xx */
#if defined(CONFIG_8xx)
	/* The 8xx sometimes needs to load invalid/non-present TLB entries.
	 * These must be invalidated separately, as the Linux mm doesn't do it.
	 */
	if (error_code & 0x40000000)		/* no translation? */
		_tlbil_va(address, 0, 0, 0);

	/* The MPC8xx seems to always set 0x80000000, which is
	 * "undefined".  Of those that can be set, this is the only
	 * one which seems bad.
	 */
	if (error_code & 0x10000000)
		/* Guarded storage error. */
		goto bad_area;
#endif /* CONFIG_8xx */

	if (is_exec) {
#ifdef CONFIG_PPC_STD_MMU
		/* A protection fault on exec goes straight to failure on
		 * hash-based MMUs, as they either don't support per-page
		 * execute permission or, if they do, it's handled already
		 * at the hash level.  This test would probably have to
		 * be removed if we changed the way this works to make hash
		 * processors use the same I/D cache coherency mechanism
		 * as embedded.
		 */
		if (error_code & DSISR_PROTFAULT)
			goto bad_area;
#endif /* CONFIG_PPC_STD_MMU */

		/*
		 * Allow execution from readable areas if the MMU does not
		 * provide separate controls over reading and executing.
		 *
		 * Note: this code used not to be enabled for 4xx/BookE.
		 * It is now, as I/D cache coherency for these is done at
		 * set_pte_at() time and I see no reason why the test
		 * below wouldn't be valid on those processors.  This -may-
		 * break programs compiled with a really old ABI though.
		 */
		if (!(vma->vm_flags & VM_EXEC) &&
		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
			goto bad_area;
	/* a write */
	} else if (is_write) {
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
	/* a read */
	} else {
		/* protection fault */
		if (error_code & 0x08000000)
			goto bad_area;
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
	if (unlikely(ret & VM_FAULT_ERROR)) {
		if (ret & VM_FAULT_OOM)
			goto out_of_memory;
		else if (ret & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (ret & VM_FAULT_MAJOR) {
		current->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
			      regs, address);
#ifdef CONFIG_PPC_SMLPAR
		if (firmware_has_feature(FW_FEATURE_CMO)) {
			preempt_disable();
			get_lppaca()->page_ins += (1 << PAGE_FACTOR);
			preempt_enable();
		}
#endif
	} else {
		current->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
			      regs, address);
	}
	up_read(&mm->mmap_sem);
	return 0;

bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses cause a SIGSEGV */
	if (user_mode(regs)) {
		_exception(SIGSEGV, regs, code, address);
		return 0;
	}

	if (is_exec && (error_code & DSISR_PROTFAULT))
		printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
				   " page (%lx) - exploit attempt? (uid: %d)\n",
				   address, current_uid());

	return SIGSEGV;

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (!user_mode(regs))
		return SIGKILL;
	pagefault_out_of_memory();
	return 0;

do_sigbus:
	up_read(&mm->mmap_sem);
	if (user_mode(regs)) {
		info.si_signo = SIGBUS;
		info.si_errno = 0;
		info.si_code = BUS_ADRERR;
		info.si_addr = (void __user *)address;
		force_sig_info(SIGBUS, &info, current);
		return 0;
	}
	return SIGBUS;
}

/*
 * bad_page_fault is called when we have a bad access from the kernel.
 * It is called from the DSI and ISI handlers in head.S and from some
 * of the procedures in traps.c.
 */
void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
{
	const struct exception_table_entry *entry;
	unsigned long *stackend;

	/* Are we prepared to handle this fault? */
	if ((entry = search_exception_tables(regs->nip)) != NULL) {
		regs->nip = entry->fixup;
		return;
	}

	/* kernel has accessed a bad area */
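
	/* regs->trap holds the exception vector: 0x300 is a data
	 * storage interrupt, 0x380 a data segment (SLB) miss, 0x400
	 * an instruction storage interrupt and 0x480 an instruction
	 * segment (SLB) miss.
	 */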
	switch (regs->trap) {
	case 0x300:
	case 0x380:
		printk(KERN_ALERT "Unable to handle kernel paging request for "
			"data at address 0x%08lx\n", regs->dar);
		break;
	case 0x400:
	case 0x480:
		printk(KERN_ALERT "Unable to handle kernel paging request for "
			"instruction fetch\n");
		break;
	default:
		printk(KERN_ALERT "Unable to handle kernel paging request for "
			"unknown fault\n");
		break;
	}
	printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
		regs->nip);
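
	/* STACK_END_MAGIC is planted at the base of the kernel stack
	 * when the task is created; if it has been overwritten, the
	 * thread almost certainly ran off the end of its stack.
	 */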
	stackend = end_of_stack(current);
	if (current != &init_task && *stackend != STACK_END_MAGIC)
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	die("Kernel access of bad area", regs, sig);
}