GitHub Repository: torvalds/linux
Path: blob/master/arch/um/kernel/trap.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <asm/current.h>
#include <asm/tlbflush.h>
#include <arch.h>
#include <as-layout.h>
#include <kern_util.h>
#include <os.h>
#include <skas.h>

/*
 * NOTE: UML does not have exception tables. As such, this is almost a copy
 * of the code in mm/memory.c, only adjusting the logic to simply check whether
 * we are coming from the kernel instead of doing an additional lookup in the
 * exception table.
 * We can do this simplification because we never get here if the exception was
 * fixable.
 */
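/*
 * Try to take the mmap lock without blocking. On a kernel-mode fault we
 * never sleep for it (a buggy kernel path might already hold it); a
 * userspace fault may block, but remains killable.
 */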
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (!is_user)
		return false;

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 * atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
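	/*
	 * Hypothetical sketch of that upgrade (not built; the RWSEM_*
	 * counter values are private to kernel/locking/rwsem.c and the
	 * required lockdep updates are omitted):
	 *
	 *	long expected = RWSEM_READER_BIAS;
	 *
	 *	return atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count,
	 *					       &expected,
	 *					       RWSEM_WRITER_LOCKED);
	 */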
	return false;
}

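/*
 * Drop the read lock and try to take the mmap lock for writing instead.
 * As above, a kernel-mode fault never sleeps for the lock; a userspace
 * fault may block, but remains killable.
 */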
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user)
{
	mmap_read_unlock(mm);
	if (!is_user)
		return false;

	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So on kernel faults this simply refuses to block on the lock
 * (UML has no exception tables, see the note above), and only
 * waits for it when the fault comes from userspace, where faulting
 * is actually expected.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
static struct vm_area_struct *
um_lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, bool is_user)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, is_user))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, is_user))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}

/*
 * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by
 * segv().
 */
int handle_page_fault(unsigned long address, unsigned long ip,
		      int is_write, int is_user, int *code_out)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	pmd_t *pmd;
	pte_t *pte;
	int err = -EFAULT;
	unsigned int flags = FAULT_FLAG_DEFAULT;

	*code_out = SEGV_MAPERR;

	/*
	 * If the fault was with pagefaults disabled, don't take the fault, just
	 * fail.
	 */
	if (faulthandler_disabled())
		goto out_nosemaphore;

	if (is_user)
		flags |= FAULT_FLAG_USER;
retry:
	vma = um_lock_mm_and_find_vma(mm, address, is_user);
	if (!vma)
		goto out_nosemaphore;

	*code_out = SEGV_ACCERR;
	if (is_write) {
		if (!(vma->vm_flags & VM_WRITE))
			goto out;
		flags |= FAULT_FLAG_WRITE;
	} else {
		/* Don't require VM_READ|VM_EXEC for write faults! */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto out;
	}

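	/*
	 * Fault the address in repeatedly until the PTE for it is actually
	 * present in the page tables.
	 */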
	do {
		vm_fault_t fault;

		fault = handle_mm_fault(vma, address, flags, NULL);

		if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
			goto out_nosemaphore;

		/* The fault is fully completed (including releasing mmap lock) */
		if (fault & VM_FAULT_COMPLETED)
			return 0;

		if (unlikely(fault & VM_FAULT_ERROR)) {
			if (fault & VM_FAULT_OOM) {
				goto out_of_memory;
			} else if (fault & VM_FAULT_SIGSEGV) {
				goto out;
			} else if (fault & VM_FAULT_SIGBUS) {
				err = -EACCES;
				goto out;
			}
			BUG();
		}
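		/*
		 * handle_mm_fault() dropped the mmap lock before returning
		 * VM_FAULT_RETRY; go back to retry: and take it again.
		 */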
		if (fault & VM_FAULT_RETRY) {
			flags |= FAULT_FLAG_TRIED;

			goto retry;
		}

		pmd = pmd_off(mm, address);
		pte = pte_offset_kernel(pmd, address);
	} while (!pte_present(*pte));
	err = 0;
	/*
	 * The below warning was added in place of
	 *	pte_mkyoung(); if (is_write) pte_mkdirty();
	 * If it's triggered, we'd normally see a hang here (a clean pte is
	 * marked read-only to emulate the dirty bit).
	 * However, the generic code can mark a PTE writable but clean on a
	 * concurrent read fault, triggering this harmlessly. So comment it out.
	 */
#if 0
	WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte)));
#endif

out:
	mmap_read_unlock(mm);
out_nosemaphore:
	return err;

out_of_memory:
	/*
	 * We ran out of memory, call the OOM killer, and return to userspace
	 * (which will retry the fault, or kill us if we got oom-killed).
	 */
	mmap_read_unlock(mm);
	if (!is_user)
		goto out_nosemaphore;
	pagefault_out_of_memory();
	return 0;
}
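/*
 * Print a rate-limited diagnostic for a fault that is about to be delivered
 * to userspace as an unhandled SIGSEGV.
 */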
static void show_segv_info(struct uml_pt_regs *regs)
{
	struct task_struct *tsk = current;
	struct faultinfo *fi = UPT_FAULTINFO(regs);

	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
		tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
		(void *)UPT_IP(regs), (void *)UPT_SP(regs),
		fi->error_code);

	print_vma_addr(KERN_CONT " in ", UPT_IP(regs));
	printk(KERN_CONT "\n");
}

static void bad_segv(struct faultinfo fi, unsigned long ip)
{
	current->thread.arch.faultinfo = fi;
	force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi));
}

void fatal_sigsegv(void)
{
	force_fatal_sig(SIGSEGV);
	do_signal(&current->thread.regs);
	/*
	 * This is to tell gcc that we're not returning - do_signal
	 * can, in general, return, but in this case it won't, since
	 * we just got a fatal SIGSEGV queued.
	 */
	os_dump_core();
}

/**
 * segv_handler() - the SIGSEGV handler
 * @sig: the signal number
 * @unused_si: the signal info struct; unused in this handler
 * @regs: the ptrace register information
 * @mc: the mcontext of the signal
 *
 * The handler first extracts the faultinfo from the UML ptrace regs struct.
 * If the fault came from a userspace process but is not fixable, bad_segv()
 * is called. Otherwise the fault is passed on to segv() for handling.
 */
void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
		  void *mc)
{
	struct faultinfo * fi = UPT_FAULTINFO(regs);

	if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) {
		show_segv_info(regs);
		bad_segv(*fi, UPT_IP(regs));
		return;
	}
	segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc);
}

/*
 * We give a *copy* of the faultinfo in the regs to segv.
 * This must be done, since nesting SEGVs could overwrite
 * the info in the regs. A pointer to the info then would
 * give us bad data!
 */
unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
		   struct uml_pt_regs *regs, void *mc)
{
	int si_code;
	int err;
	int is_write = FAULT_WRITE(fi);
	unsigned long address = FAULT_ADDRESS(fi);

	if (!is_user && regs)
		current->thread.segv_regs = container_of(regs, struct pt_regs, regs);

	if (!is_user && init_mm.context.sync_tlb_range_to) {
		/*
		 * Kernel has pending updates from set_ptes that were not
		 * flushed yet. Syncing them should fix the pagefault (if not
		 * we'll get here again and panic).
		 */
		err = um_tlb_sync(&init_mm);
		if (err == -ENOMEM)
			report_enomem();
		if (err)
			panic("Failed to sync kernel TLBs: %d", err);
		goto out;
	}
	else if (current->pagefault_disabled) {
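		/*
		 * Page faults are disabled for this context, so the fault
		 * cannot be handled here; resume execution at the recorded
		 * continuation address (thread.segv_continue) by patching
		 * the mcontext instead.
		 */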
		if (!mc) {
			show_regs(container_of(regs, struct pt_regs, regs));
			panic("Segfault with pagefaults disabled but no mcontext");
		}
		if (!current->thread.segv_continue) {
			show_regs(container_of(regs, struct pt_regs, regs));
			panic("Segfault without recovery target");
		}
		mc_set_rip(mc, current->thread.segv_continue);
		current->thread.segv_continue = NULL;
		goto out;
	}
	else if (current->mm == NULL) {
		show_regs(container_of(regs, struct pt_regs, regs));
		panic("Segfault with no mm");
	}
	else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) {
		show_regs(container_of(regs, struct pt_regs, regs));
		panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx",
		       address, ip);
	}

	if (SEGV_IS_FIXABLE(&fi))
		err = handle_page_fault(address, ip, is_write, is_user,
					&si_code);
	else {
		err = -EFAULT;
		/*
		 * A thread accessed NULL, we get a fault, but CR2 is invalid.
		 * This code is used in __do_copy_from_user() of TT mode.
		 * XXX tt mode is gone, so maybe this isn't needed any more
		 */
		address = 0;
	}

	if (!err)
		goto out;
	else if (!is_user && arch_fixup(ip, regs))
		goto out;

	if (!is_user) {
		show_regs(container_of(regs, struct pt_regs, regs));
		panic("Kernel mode fault at addr 0x%lx, ip 0x%lx",
		      address, ip);
	}

	show_segv_info(regs);

	if (err == -EACCES) {
		current->thread.arch.faultinfo = fi;
		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
	} else {
		BUG_ON(err != -EFAULT);
		current->thread.arch.faultinfo = fi;
		force_sig_fault(SIGSEGV, si_code, (void __user *) address);
	}

out:
	if (regs)
		current->thread.segv_regs = NULL;

	return 0;
}

void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs,
		  void *mc)
{
	int code, err;
	if (!UPT_IS_USER(regs)) {
		if (sig == SIGBUS)
			printk(KERN_ERR "Bus error - the host /dev/shm or /tmp "
			       "mount likely just ran out of space\n");
		panic("Kernel mode signal %d", sig);
	}

	arch_examine_signal(sig, regs);

	/*
	 * Is the signal layout for the signal known?
	 * Signal data must be scrubbed to prevent information leaks.
	 */
	code = si->si_code;
	err = si->si_errno;
	if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
		struct faultinfo *fi = UPT_FAULTINFO(regs);
		current->thread.arch.faultinfo = *fi;
		force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi));
	} else {
		printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
		       sig, code, err);
		force_sig(sig);
	}
}
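/* Relay a host SIGWINCH (window size change) as a UML interrupt via WINCH_IRQ. */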
void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
	   void *mc)
{
	do_IRQ(WINCH_IRQ, regs);
}