CoCalc -- fault.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/powerpc/mm/fault.c
¹⁰⁸¹⁷ views
1
/*
2
 *  PowerPC version
3
 *    Copyright (C) 1995-1996 Gary Thomas ([email protected])
4
 *
5
 *  Derived from "arch/i386/mm/fault.c"
6
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
7
 *
8
 *  Modified by Cort Dougan and Paul Mackerras.
9
 *
10
 *  Modified for PPC64 by Dave Engebretsen ([email protected])
11
 *
12
 *  This program is free software; you can redistribute it and/or
13
 *  modify it under the terms of the GNU General Public License
14
 *  as published by the Free Software Foundation; either version
15
 *  2 of the License, or (at your option) any later version.
16
 */
17

18
#include <linux/signal.h>
19
#include <linux/sched.h>
20
#include <linux/kernel.h>
21
#include <linux/errno.h>
22
#include <linux/string.h>
23
#include <linux/types.h>
24
#include <linux/ptrace.h>
25
#include <linux/mman.h>
26
#include <linux/mm.h>
27
#include <linux/interrupt.h>
28
#include <linux/highmem.h>
29
#include <linux/module.h>
30
#include <linux/kprobes.h>
31
#include <linux/kdebug.h>
32
#include <linux/perf_event.h>
33
#include <linux/magic.h>
34
#include <linux/ratelimit.h>
35

36
#include <asm/firmware.h>
37
#include <asm/page.h>
38
#include <asm/pgtable.h>
39
#include <asm/mmu.h>
40
#include <asm/mmu_context.h>
41
#include <asm/system.h>
42
#include <asm/uaccess.h>
43
#include <asm/tlbflush.h>
44
#include <asm/siginfo.h>
45
#include <mm/mmu_decl.h>
46

47
#ifdef CONFIG_KPROBES
/*
 * Offer a kernel-mode fault to a running kprobe before the normal page
 * fault handling takes place.
 *
 * Returns 1 when a kprobe is running on this CPU and its fault handler
 * reports the fault as dealt with; returns 0 otherwise.  Faults taken in
 * user mode are never offered and always yield 0.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		/*
		 * NOTE(review): 11 is presumably the trap number reported to
		 * the kprobe fault handler -- confirm against the kprobes code.
		 */
		if (kprobe_running() && kprobe_fault_handler(regs, 11))
			ret = 1;
		preempt_enable();
	}

	return ret;
}
#else
/* Without CONFIG_KPROBES nobody can claim the fault: always returns 0. */
static inline int notify_page_fault(struct pt_regs *regs)
{
	return 0;
}
#endif
68

69
/*
70
 * Check whether the instruction at regs->nip is a store using
71
 * an update addressing form which will update r1.
72
 */
73
static int store_updates_sp(struct pt_regs *regs)
74
{
75
	unsigned int inst;
76

77
	if (get_user(inst, (unsigned int __user *)regs->nip))
78
		return 0;
79
	/* check for 1 in the rA field */
80
	if (((inst >> 16) & 0x1f) != 1)
81
		return 0;
82
	/* check major opcode */
83
	switch (inst >> 26) {
84
	case 37:	/* stwu */
85
	case 39:	/* stbu */
86
	case 45:	/* sthu */
87
	case 53:	/* stfsu */
88
	case 55:	/* stfdu */
89
		return 1;
90
	case 62:	/* std or stdu */
91
		return (inst & 3) == 1;
92
	case 31:
93
		/* check minor opcode */
94
		switch ((inst >> 1) & 0x3ff) {
95
		case 181:	/* stdux */
96
		case 183:	/* stwux */
97
		case 247:	/* stbux */
98
		case 439:	/* sthux */
99
		case 695:	/* stfsux */
100
		case 759:	/* stfdux */
101
			return 1;
102
		}
103
	}
104
	return 0;
105
}
106

107
/*
108
 * For 600- and 800-family processors, the error_code parameter is DSISR
109
 * for a data fault, SRR1 for an instruction fault. For 400-family processors
110
 * the error_code parameter is ESR for a data fault, 0 for an instruction
111
 * fault.
112
 * For 64-bit processors, the error_code parameter is
113
 *  - DSISR for a non-SLB data access fault,
114
 *  - SRR1 & 0x08000000 for a non-SLB instruction access fault
115
 *  - 0 any SLB fault.
116
 *
117
 * The return value is 0 if the fault was handled, or the signal
118
 * number if this is a kernel fault that can't be handled here.
119
 */
120
int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
121
			    unsigned long error_code)
122
{
123
	struct vm_area_struct * vma;
124
	struct mm_struct *mm = current->mm;
125
	siginfo_t info;
126
	int code = SEGV_MAPERR;
127
	int is_write = 0, ret;
128
	int trap = TRAP(regs);
129
 	int is_exec = trap == 0x400;
130

131
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
132
	/*
133
	 * Fortunately the bit assignments in SRR1 for an instruction
134
	 * fault and DSISR for a data fault are mostly the same for the
135
	 * bits we are interested in.  But there are some bits which
136
	 * indicate errors in DSISR but can validly be set in SRR1.
137
	 */
138
	if (trap == 0x400)
139
		error_code &= 0x48200000;
140
	else
141
		is_write = error_code & DSISR_ISSTORE;
142
#else
143
	is_write = error_code & ESR_DST;
144
#endif /* CONFIG_4xx || CONFIG_BOOKE */
145

146
	if (notify_page_fault(regs))
147
		return 0;
148

149
	if (unlikely(debugger_fault_handler(regs)))
150
		return 0;
151

152
	/* On a kernel SLB miss we can only check for a valid exception entry */
153
	if (!user_mode(regs) && (address >= TASK_SIZE))
154
		return SIGSEGV;
155

156
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
157
			     defined(CONFIG_PPC_BOOK3S_64))
158
  	if (error_code & DSISR_DABRMATCH) {
159
		/* DABR match */
160
		do_dabr(regs, address, error_code);
161
		return 0;
162
	}
163
#endif
164

165
	if (in_atomic() || mm == NULL) {
166
		if (!user_mode(regs))
167
			return SIGSEGV;
168
		/* in_atomic() in user mode is really bad,
169
		   as is current->mm == NULL. */
170
		printk(KERN_EMERG "Page fault in user mode with "
171
		       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
172
		printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
173
		       regs->nip, regs->msr);
174
		die("Weird page fault", regs, SIGSEGV);
175
	}
176

177
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
178

179
	/* When running in the kernel we expect faults to occur only to
180
	 * addresses in user space.  All other faults represent errors in the
181
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
182
	 * erroneous fault occurring in a code path which already holds mmap_sem
183
	 * we will deadlock attempting to validate the fault against the
184
	 * address space.  Luckily the kernel only validly references user
185
	 * space from well defined areas of code, which are listed in the
186
	 * exceptions table.
187
	 *
188
	 * As the vast majority of faults will be valid we will only perform
189
	 * the source reference check when there is a possibility of a deadlock.
190
	 * Attempt to lock the address space, if we cannot we then validate the
191
	 * source.  If this is invalid we can skip the address space check,
192
	 * thus avoiding the deadlock.
193
	 */
194
	if (!down_read_trylock(&mm->mmap_sem)) {
195
		if (!user_mode(regs) && !search_exception_tables(regs->nip))
196
			goto bad_area_nosemaphore;
197

198
		down_read(&mm->mmap_sem);
199
	}
200

201
	vma = find_vma(mm, address);
202
	if (!vma)
203
		goto bad_area;
204
	if (vma->vm_start <= address)
205
		goto good_area;
206
	if (!(vma->vm_flags & VM_GROWSDOWN))
207
		goto bad_area;
208

209
	/*
210
	 * N.B. The POWER/Open ABI allows programs to access up to
211
	 * 288 bytes below the stack pointer.
212
	 * The kernel signal delivery code writes up to about 1.5kB
213
	 * below the stack pointer (r1) before decrementing it.
214
	 * The exec code can write slightly over 640kB to the stack
215
	 * before setting the user r1.  Thus we allow the stack to
216
	 * expand to 1MB without further checks.
217
	 */
218
	if (address + 0x100000 < vma->vm_end) {
219
		/* get user regs even if this fault is in kernel mode */
220
		struct pt_regs *uregs = current->thread.regs;
221
		if (uregs == NULL)
222
			goto bad_area;
223

224
		/*
225
		 * A user-mode access to an address a long way below
226
		 * the stack pointer is only valid if the instruction
227
		 * is one which would update the stack pointer to the
228
		 * address accessed if the instruction completed,
229
		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
230
		 * (or the byte, halfword, float or double forms).
231
		 *
232
		 * If we don't check this then any write to the area
233
		 * between the last mapped region and the stack will
234
		 * expand the stack rather than segfaulting.
235
		 */
236
		if (address + 2048 < uregs->gpr[1]
237
		    && (!user_mode(regs) || !store_updates_sp(regs)))
238
			goto bad_area;
239
	}
240
	if (expand_stack(vma, address))
241
		goto bad_area;
242

243
good_area:
244
	code = SEGV_ACCERR;
245
#if defined(CONFIG_6xx)
246
	if (error_code & 0x95700000)
247
		/* an error such as lwarx to I/O controller space,
248
		   address matching DABR, eciwx, etc. */
249
		goto bad_area;
250
#endif /* CONFIG_6xx */
251
#if defined(CONFIG_8xx)
252
	/* 8xx sometimes need to load a invalid/non-present TLBs.
253
	 * These must be invalidated separately as linux mm don't.
254
	 */
255
	if (error_code & 0x40000000) /* no translation? */
256
		_tlbil_va(address, 0, 0, 0);
257

258
        /* The MPC8xx seems to always set 0x80000000, which is
259
         * "undefined".  Of those that can be set, this is the only
260
         * one which seems bad.
261
         */
262
	if (error_code & 0x10000000)
263
                /* Guarded storage error. */
264
		goto bad_area;
265
#endif /* CONFIG_8xx */
266

267
	if (is_exec) {
268
#ifdef CONFIG_PPC_STD_MMU
269
		/* Protection fault on exec go straight to failure on
270
		 * Hash based MMUs as they either don't support per-page
271
		 * execute permission, or if they do, it's handled already
272
		 * at the hash level. This test would probably have to
273
		 * be removed if we change the way this works to make hash
274
		 * processors use the same I/D cache coherency mechanism
275
		 * as embedded.
276
		 */
277
		if (error_code & DSISR_PROTFAULT)
278
			goto bad_area;
279
#endif /* CONFIG_PPC_STD_MMU */
280

281
		/*
282
		 * Allow execution from readable areas if the MMU does not
283
		 * provide separate controls over reading and executing.
284
		 *
285
		 * Note: That code used to not be enabled for 4xx/BookE.
286
		 * It is now as I/D cache coherency for these is done at
287
		 * set_pte_at() time and I see no reason why the test
288
		 * below wouldn't be valid on those processors. This -may-
289
		 * break programs compiled with a really old ABI though.
290
		 */
291
		if (!(vma->vm_flags & VM_EXEC) &&
292
		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
293
		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
294
			goto bad_area;
295
	/* a write */
296
	} else if (is_write) {
297
		if (!(vma->vm_flags & VM_WRITE))
298
			goto bad_area;
299
	/* a read */
300
	} else {
301
		/* protection fault */
302
		if (error_code & 0x08000000)
303
			goto bad_area;
304
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
305
			goto bad_area;
306
	}
307

308
	/*
309
	 * If for any reason at all we couldn't handle the fault,
310
	 * make sure we exit gracefully rather than endlessly redo
311
	 * the fault.
312
	 */
313
	ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
314
	if (unlikely(ret & VM_FAULT_ERROR)) {
315
		if (ret & VM_FAULT_OOM)
316
			goto out_of_memory;
317
		else if (ret & VM_FAULT_SIGBUS)
318
			goto do_sigbus;
319
		BUG();
320
	}
321
	if (ret & VM_FAULT_MAJOR) {
322
		current->maj_flt++;
323
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
324
				     regs, address);
325
#ifdef CONFIG_PPC_SMLPAR
326
		if (firmware_has_feature(FW_FEATURE_CMO)) {
327
			preempt_disable();
328
			get_lppaca()->page_ins += (1 << PAGE_FACTOR);
329
			preempt_enable();
330
		}
331
#endif
332
	} else {
333
		current->min_flt++;
334
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
335
				     regs, address);
336
	}
337
	up_read(&mm->mmap_sem);
338
	return 0;
339

340
bad_area:
341
	up_read(&mm->mmap_sem);
342

343
bad_area_nosemaphore:
344
	/* User mode accesses cause a SIGSEGV */
345
	if (user_mode(regs)) {
346
		_exception(SIGSEGV, regs, code, address);
347
		return 0;
348
	}
349

350
	if (is_exec && (error_code & DSISR_PROTFAULT))
351
		printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
352
				   " page (%lx) - exploit attempt? (uid: %d)\n",
353
				   address, current_uid());
354

355
	return SIGSEGV;
356

357
/*
358
 * We ran out of memory, or some other thing happened to us that made
359
 * us unable to handle the page fault gracefully.
360
 */
361
out_of_memory:
362
	up_read(&mm->mmap_sem);
363
	if (!user_mode(regs))
364
		return SIGKILL;
365
	pagefault_out_of_memory();
366
	return 0;
367

368
do_sigbus:
369
	up_read(&mm->mmap_sem);
370
	if (user_mode(regs)) {
371
		info.si_signo = SIGBUS;
372
		info.si_errno = 0;
373
		info.si_code = BUS_ADRERR;
374
		info.si_addr = (void __user *)address;
375
		force_sig_info(SIGBUS, &info, current);
376
		return 0;
377
	}
378
	return SIGBUS;
379
}
380

381
/*
382
 * bad_page_fault is called when we have a bad access from the kernel.
383
 * It is called from the DSI and ISI handlers in head.S and from some
384
 * of the procedures in traps.c.
385
 */
386
void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
387
{
388
	const struct exception_table_entry *entry;
389
	unsigned long *stackend;
390

391
	/* Are we prepared to handle this fault?  */
392
	if ((entry = search_exception_tables(regs->nip)) != NULL) {
393
		regs->nip = entry->fixup;
394
		return;
395
	}
396

397
	/* kernel has accessed a bad area */
398

399
	switch (regs->trap) {
400
	case 0x300:
401
	case 0x380:
402
		printk(KERN_ALERT "Unable to handle kernel paging request for "
403
			"data at address 0x%08lx\n", regs->dar);
404
		break;
405
	case 0x400:
406
	case 0x480:
407
		printk(KERN_ALERT "Unable to handle kernel paging request for "
408
			"instruction fetch\n");
409
		break;
410
	default:
411
		printk(KERN_ALERT "Unable to handle kernel paging request for "
412
			"unknown fault\n");
413
		break;
414
	}
415
	printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
416
		regs->nip);
417

418
	stackend = end_of_stack(current);
419
	if (current != &init_task && *stackend != STACK_END_MAGIC)
420
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
421

422
	die("Kernel access of bad area", regs, sig);
423
}
424

425
Product

Resources

Company