Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/um/os-Linux/start_up.c
26444 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2021 Benjamin Berg <[email protected]>
4
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
5
*/
6
7
#include <stdio.h>
8
#include <stdlib.h>
9
#include <stdarg.h>
10
#include <unistd.h>
11
#include <errno.h>
12
#include <fcntl.h>
13
#include <sched.h>
14
#include <signal.h>
15
#include <string.h>
16
#include <sys/mman.h>
17
#include <sys/stat.h>
18
#include <sys/wait.h>
19
#include <sys/time.h>
20
#include <sys/resource.h>
21
#include <asm/ldt.h>
22
#include <asm/unistd.h>
23
#include <init.h>
24
#include <os.h>
25
#include <kern_util.h>
26
#include <mem_user.h>
27
#include <ptrace_user.h>
28
#include <stdbool.h>
29
#include <stub-data.h>
30
#include <sys/prctl.h>
31
#include <linux/seccomp.h>
32
#include <linux/filter.h>
33
#include <sysdep/mcontext.h>
34
#include <sysdep/stub.h>
35
#include <registers.h>
36
#include <skas.h>
37
#include "internal.h"
38
39
/*
 * Body of the test child forked by start_ptraced_child(). Never returns.
 *
 * The child asks to be traced, stops itself with SIGSTOP so the parent can
 * take over, and then issues exactly one os_getpid() call whose result
 * reveals what the tracer did to it. The outcome is reported through the
 * exit status:
 *   1 - the call returned our own pid, i.e. the parent modified nothing
 *   0 - the call returned the parent's pid (expected in check_ptrace and
 *       check_sysemu when they succeed in modifying the stack frame)
 *   2 - anything else
 */
static void ptrace_child(void)
{
	/* Calling os_getpid because some libcs cached getpid incorrectly */
	const int self = os_getpid();
	const int parent = getppid();
	int observed;

	if (change_sig(SIGWINCH, 0) < 0 ||
	    ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
		perror("ptrace");
		kill(self, SIGKILL);
	}
	kill(self, SIGSTOP);

	/*
	 * This syscall will be intercepted by the parent. Don't call more than
	 * once, please.
	 */
	observed = os_getpid();

	/* Nothing modified by the parent, we are running normally. */
	if (observed == self)
		exit(1);

	/* The parent successfully redirected or emulated the call. */
	if (observed == parent)
		exit(0);

	/*
	 * Serious trouble! This could be caused by a bug in host 2.6
	 * SKAS3/2.6 patch before release -V6, together with a bug in
	 * the UML code itself.
	 */
	exit(2);
}
77
78
/*
 * Print @str followed by the description of the current errno (via
 * perror()) and terminate the process with exit status 1.
 */
static void fatal_perror(const char *str)
{
	perror(str);
	exit(1);
}
83
84
/*
 * Print a printf-style message to stderr and terminate the process with
 * exit status 1.
 *
 * @fmt: printf-style format string; it is only read, so it is taken as a
 *       pointer to const (string literals and const buffers can be passed
 *       without a cast).
 * @...: arguments consumed by @fmt.
 */
static void fatal(const char *fmt, ...)
{
	va_list list;

	va_start(list, fmt);
	vfprintf(stderr, fmt, list);
	va_end(list);

	exit(1);
}
94
95
/*
 * Print a printf-style message to stderr and return to the caller.
 *
 * @fmt: printf-style format string; it is only read, so it is taken as a
 *       pointer to const.
 * @...: arguments consumed by @fmt.
 */
static void non_fatal(const char *fmt, ...)
{
	va_list list;

	va_start(list, fmt);
	vfprintf(stderr, fmt, list);
	va_end(list);
}
103
104
/*
 * Fork a child running ptrace_child() and wait until it has stopped itself
 * with SIGSTOP.
 *
 * Returns the pid of the stopped child. Any failure (fork, waitpid, or an
 * unexpected wait status) terminates the process through fatal_perror() or
 * fatal().
 */
static int start_ptraced_child(void)
{
	int child, waited, status;

	/* Flush before forking so the child cannot re-emit buffered output. */
	fflush(stdout);

	child = fork();
	if (child < 0)
		fatal_perror("start_ptraced_child : fork failed");
	if (child == 0)
		ptrace_child();

	CATCH_EINTR(waited = waitpid(child, &status, WUNTRACED));
	if (waited < 0)
		fatal_perror("check_ptrace : waitpid failed");

	if (!(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP))
		fatal("check_ptrace : expected SIGSTOP, got status = %d",
		      status);

	return child;
}
125
126
/*
 * Let a stopped, traced child run to completion and verify how it exited.
 *
 * @pid:      pid of the traced child, as returned by start_ptraced_child().
 * @exitcode: exit status the child is expected to terminate with.
 *
 * The child is resumed with PTRACE_CONT and reaped. If resuming or waiting
 * fails, or the child did not exit normally with @exitcode, the process is
 * terminated through fatal_perror() / fatal().
 */
static void stop_ptraced_child(int pid, int exitcode)
{
	int status, n;

	if (ptrace(PTRACE_CONT, pid, 0, 0) < 0)
		fatal_perror("stop_ptraced_child : ptrace failed");

	CATCH_EINTR(n = waitpid(pid, &status, 0));
	/* status is only meaningful when waitpid() actually succeeded */
	if (n < 0)
		fatal_perror("stop_ptraced_child : waitpid failed");

	if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) {
		int exit_with = WEXITSTATUS(status);

		fatal("stop_ptraced_child : child exited with exitcode %d, "
		      "while expecting %d; status 0x%x\n", exit_with,
		      exitcode, status);
	}
}
141
142
/*
 * Verify that the host supports PTRACE_SYSEMU_SINGLESTEP.
 *
 * A test child is started and stepped with PTRACE_SYSEMU_SINGLESTEP until
 * it reports a syscall stop (SIGTRAP | 0x80). At that point our own pid is
 * written into the child's syscall return slot, so the child's os_getpid()
 * appears to return its parent's pid and it exits with status 0.
 *
 * On failure the child is resumed (it is then expected to exit with status
 * 1) and the process terminates via fatal("missing\n").
 */
static void __init check_sysemu(void)
{
	int pid, n, status;
	int steps = 0;

	os_info("Checking syscall emulation for ptrace...");
	pid = start_ptraced_child();

	if (ptrace(PTRACE_SETOPTIONS, pid, 0,
		   (void *) PTRACE_O_TRACESYSGOOD) < 0)
		fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed");

	for (;;) {
		int sig;

		steps++;
		if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0)
			goto fail;
		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
		if (n < 0)
			fatal_perror("check_sysemu: wait failed");

		/* -1 can never be a stop signal, so it marks "not stopped" */
		sig = WIFSTOPPED(status) ? WSTOPSIG(status) : -1;

		/* Plain single-step trap: keep stepping. */
		if (sig == SIGTRAP) {
			steps++;
			continue;
		}

		if (sig != (SIGTRAP | 0x80)) {
			non_fatal("check_sysemu: expected SIGTRAP or "
				  "(SIGTRAP | 0x80), got status = %d\n",
				  status);
			goto fail;
		}

		/*
		 * NOTE(review): the counter is incremented at the top of every
		 * iteration, so it is at least 1 here and this branch looks
		 * unreachable - confirm whether the check was meant to count
		 * only the plain SIGTRAP stops.
		 */
		if (!steps) {
			non_fatal("check_sysemu: SYSEMU_SINGLESTEP "
				  "doesn't singlestep");
			goto fail;
		}

		/* Syscall stop: fake the return value of the child's call. */
		n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET,
			   os_getpid());
		if (n < 0)
			fatal_perror("check_sysemu : failed to modify "
				     "system call return");
		break;
	}

	stop_ptraced_child(pid, 0);

	os_info("OK\n");

	return;

fail:
	stop_ptraced_child(pid, 1);
	fatal("missing\n");
}
194
195
/*
 * Verify that ptrace on the host can rewrite a system call number.
 *
 * A test child is started and run from syscall stop to syscall stop with
 * PTRACE_SYSCALL until the stop for __NR_getpid is seen. The syscall number
 * is then replaced by __NR_getppid, so the child observes its parent's pid
 * and exits with status 0. Every failure is fatal. On success the SYSEMU
 * check is run as well.
 */
static void __init check_ptrace(void)
{
	int pid, nr, n, status;

	os_info("Checking that ptrace can change system call numbers...");
	pid = start_ptraced_child();

	if (ptrace(PTRACE_SETOPTIONS, pid, 0,
		   (void *) PTRACE_O_TRACESYSGOOD) < 0)
		fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed");

	/* Advance stop by stop until the child is about to do getpid. */
	do {
		if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
			fatal_perror("check_ptrace : ptrace failed");

		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
		if (n < 0)
			fatal_perror("check_ptrace : wait failed");

		if (!(WIFSTOPPED(status) &&
		      WSTOPSIG(status) == (SIGTRAP | 0x80)))
			fatal("check_ptrace : expected (SIGTRAP|0x80), "
			      "got status = %d", status);

		nr = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET,
			    0);
	} while (nr != __NR_getpid);

	/* Swap the pending getpid for getppid. */
	n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
		   __NR_getppid);
	if (n < 0)
		fatal_perror("check_ptrace : failed to modify "
			     "system call");

	stop_ptraced_child(pid, 0);
	os_info("OK\n");
	check_sysemu();
}
234
235
extern unsigned long host_fp_size;
236
extern unsigned long exec_regs[MAX_REG_NR];
237
extern unsigned long *exec_fp_regs;
238
239
__initdata static struct stub_data *seccomp_test_stub_data;
240
241
/*
 * SIGSYS handler installed by seccomp_helper() for the seccomp probe.
 *
 * @sig:  signal number (unused).
 * @info: signal information (unused).
 * @p:    the ucontext_t passed to an SA_SIGINFO handler.
 *
 * Records where the machine context sits relative to the start of the
 * probe's signal stack and then leaves through a raw exit syscall with
 * status 0. Does not return.
 */
static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
{
	ucontext_t *uc = p;
	unsigned long mctx_addr = (unsigned long)&uc->uc_mcontext;
	unsigned long stack_base =
		(unsigned long)&seccomp_test_stub_data->sigstack[0];

	/* Stow away the location of the mcontext in the stack */
	seccomp_test_stub_data->mctx_offset = mctx_addr - stack_base;

	/* Prevent libc from clearing memory (mctx_offset in particular) */
	syscall(__NR_exit, 0);
}
252
253
/*
 * Entry point of the clone()d child used by init_seccomp() to probe whether
 * a seccomp filter can be installed.
 *
 * @data: clone() argument (unused).
 *
 * The child installs sigsys_handler() for SIGSYS on the probe's signal
 * stack, installs a filter that traps clock_nanosleep and allows every
 * other syscall, and then calls sleep(0), which is expected to hit the
 * filter so that the handler exits the child with status 0.
 *
 * Exit statuses produced here:
 *   1 - close_range failed
 *   2 - sigaction(SIGSYS) failed
 *   3 - installing the seccomp filter failed
 *   4 - sleep(0) returned, i.e. nothing was trapped
 */
static int __init seccomp_helper(void *data)
{
	static struct sock_filter filter[] = {
		/* Load the syscall number ... */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* ... skip the ALLOW below when it is clock_nanosleep */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
	};
	static struct sock_fprog prog = {
		.len = ARRAY_SIZE(filter),
		.filter = filter,
	};
	struct sigaction sa;

	/* close_range is needed for the stub */
	if (stub_syscall3(__NR_close_range, 1, ~0U, 0))
		exit(1);

	set_sigstack(seccomp_test_stub_data->sigstack,
		     sizeof(seccomp_test_stub_data->sigstack));

	sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
	sa.sa_sigaction = (void *) sigsys_handler;
	sa.sa_restorer = NULL;
	/* Do not hand an indeterminate signal mask to sigaction() */
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGSYS, &sa, NULL) < 0)
		exit(2);

	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
		    SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
		exit(3);

	sleep(0);

	/* Never reached. */
	_exit(4);
}
291
292
/*
 * Probe whether seccomp mode can be used and, if so, capture the default
 * startup registers.
 *
 * A shared anonymous mapping is created and seccomp_helper() is run on it
 * in a clone()d child. When the child exits with status 0, the register
 * state left behind in the mapping is read with get_stub_state():
 * host_fp_size, exec_regs and exec_fp_regs are filled in.
 *
 * Returns true when the probe succeeded and the registers were fetched,
 * false otherwise. Failure to map memory, clone, wait or allocate is fatal.
 */
static bool __init init_seccomp(void)
{
	int pid;
	int status;
	int n;
	unsigned long sp;

	/*
	 * We check that we can install a seccomp filter and then exit(0)
	 * from a trapped syscall.
	 *
	 * Note that we cannot verify that no seccomp filter already exists
	 * for a syscall that results in the process/thread to be killed.
	 */

	os_info("Checking that seccomp filters can be installed...");

	seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data),
				      PROT_READ | PROT_WRITE,
				      MAP_SHARED | MAP_ANON, 0, 0);
	/* mmap() reports failure with MAP_FAILED, not NULL */
	if (seccomp_test_stub_data == MAP_FAILED)
		fatal_perror("check_seccomp : mmap failed");

	/* Use the syscall data area as stack, we just need something */
	sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
	     sizeof(seccomp_test_stub_data->syscall_data) -
	     sizeof(void *);
	pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);

	if (pid < 0)
		fatal_perror("check_seccomp : clone failed");

	CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
	if (n < 0)
		fatal_perror("check_seccomp : waitpid failed");

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
		struct uml_pt_regs *regs;
		unsigned long fp_size;
		int r;

		/* Fill in the host_fp_size from the mcontext. */
		regs = calloc(1, sizeof(struct uml_pt_regs));
		if (regs == NULL)
			fatal_perror("check_seccomp : calloc failed");
		get_stub_state(regs, seccomp_test_stub_data, &fp_size);
		host_fp_size = fp_size;
		free(regs);

		/* Repeat with the correct size */
		regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size);
		if (regs == NULL)
			fatal_perror("check_seccomp : calloc failed");
		r = get_stub_state(regs, seccomp_test_stub_data, NULL);

		/* Store as the default startup registers */
		exec_fp_regs = malloc(host_fp_size);
		if (exec_fp_regs == NULL)
			fatal_perror("check_seccomp : malloc failed");
		memcpy(exec_regs, regs->gp, sizeof(exec_regs));
		memcpy(exec_fp_regs, regs->fp, host_fp_size);

		munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));

		free(regs);

		if (r) {
			os_info("failed to fetch registers: %d\n", r);
			return false;
		}

		os_info("OK\n");
		return true;
	}

	/*
	 * NOTE(review): seccomp_helper() exits with 2 when sigaction() fails
	 * and with 3 when installing the filter fails, so "missing" is
	 * reported for the former - confirm which status was intended.
	 */
	if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
		os_info("missing\n");
	else
		os_info("error\n");

	munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
	return false;
}
367
368
369
/*
 * Print the soft and hard RLIMIT_CORE limits of the process through
 * os_info(). An unlimited value is shown as "NONE". If the limits cannot
 * be read, the error is reported with perror() and nothing else is printed.
 */
static void __init check_coredump_limit(void)
{
	struct rlimit lim;

	if (getrlimit(RLIMIT_CORE, &lim)) {
		perror("Getting core dump limit");
		return;
	}

	os_info("Core dump limits :\n\tsoft - ");
	if (lim.rlim_cur != RLIM_INFINITY)
		os_info("%llu\n", (unsigned long long)lim.rlim_cur);
	else
		os_info("NONE\n");

	os_info("\thard - ");
	if (lim.rlim_max != RLIM_INFINITY)
		os_info("%llu\n", (unsigned long long)lim.rlim_max);
	else
		os_info("NONE\n");
}
391
392
/*
 * Scan /proc/cpuinfo and hand the interesting lines to the callbacks.
 *
 * @flags_helper_func: called with every line containing "flags".
 * @cache_helper_func: called with every line containing "cache_alignment".
 *
 * Scanning stops once more than one callback invocation has happened or
 * the file is exhausted. The line passed to a callback is only valid for
 * the duration of that call. If the file cannot be opened a message is
 * printed and no callback is invoked.
 */
void __init get_host_cpu_features(
		void (*flags_helper_func)(char *line),
		void (*cache_helper_func)(char *line))
{
	FILE *cpuinfo;
	char *line = NULL;
	size_t len = 0;
	int done_parsing = 0;

	cpuinfo = fopen("/proc/cpuinfo", "r");
	if (cpuinfo == NULL) {
		os_info("Failed to get host CPU features\n");
		return;
	}

	/* getline() reuses and grows the same buffer across iterations */
	while (done_parsing <= 1 && getline(&line, &len, cpuinfo) != -1) {
		if (strstr(line, "flags")) {
			flags_helper_func(line);
			done_parsing++;
		}
		if (strstr(line, "cache_alignment")) {
			cache_helper_func(line);
			done_parsing++;
		}
	}

	/*
	 * The buffer must be released on every exit path, including the one
	 * where the final getline() call failed at end of file.
	 */
	free(line);
	fclose(cpuinfo);
}
422
423
static int seccomp_config __initdata;
424
425
static int __init uml_seccomp_config(char *line, int *add)
426
{
427
*add = 0;
428
429
if (strcmp(line, "off") == 0)
430
seccomp_config = 0;
431
else if (strcmp(line, "auto") == 0)
432
seccomp_config = 1;
433
else if (strcmp(line, "on") == 0)
434
seccomp_config = 2;
435
else
436
fatal("Invalid seccomp option '%s', expected on/auto/off\n",
437
line);
438
439
return 0;
440
}
441
442
/*
 * Register uml_seccomp_config() as the handler for the "seccomp=" option,
 * together with the help text shown for it.
 */
__uml_setup("seccomp=", uml_seccomp_config,
"seccomp=<on/auto/off>\n"
" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
" processes work collaboratively with the kernel instead of being\n"
" traced using ptrace. All syscalls from the application are caught and\n"
" redirected using a signal. This signal handler in turn is permitted to\n"
" do the selected set of syscalls to communicate with the UML kernel and\n"
" do the required memory management.\n"
"\n"
" This method is overall faster than the ptrace based userspace, primarily\n"
" because it reduces the number of context switches for (minor) page faults.\n"
"\n"
" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
" userspace from reading and writing all physical memory. Userspace\n"
" processes could also trick the stub into disabling SIGALRM which\n"
" prevents it from being interrupted for scheduling purposes.\n"
"\n"
" This is insecure and should only be used with a trusted userspace\n\n"
);
461
462
/*
 * Early host capability checks.
 *
 * Prints the core dump limits, runs check_tmpexec(), and then decides how
 * userspace will be run: when seccomp was requested and init_seccomp()
 * succeeds, using_seccomp is set to 1 and nothing else is done. If seccomp
 * was explicitly switched on (seccomp_config == 2) but the probe failed,
 * this is fatal. Otherwise using_seccomp is cleared, the ptrace checks are
 * run and the default registers are initialised from a freshly started
 * traced child.
 */
void __init os_early_checks(void)
{
	int pid;

	/* Print out the core dump limits early */
	check_coredump_limit();

	/*
	 * Need to check this early because mmapping happens before the
	 * kernel is running.
	 */
	check_tmpexec();

	if (seccomp_config != 0 && init_seccomp()) {
		using_seccomp = 1;
		return;
	}

	/* Only reachable with seccomp "on" when the probe above failed. */
	if (seccomp_config == 2)
		fatal("SECCOMP userspace requested but not functional!\n");

	using_seccomp = 0;
	check_ptrace();

	pid = start_ptraced_child();
	if (init_pid_registers(pid))
		fatal("Failed to initialize default registers");
	/* The child was not tampered with, so it exits with status 1. */
	stop_ptraced_child(pid, 1);
}
492
493
/*
 * Parse an iomem specification of the form "<driver>,<file>" and register
 * the file as a new iomem region.
 *
 * @str: the specification; it is modified in place (the comma is replaced
 *       by a NUL so that @str becomes the driver name, which the new
 *       region keeps pointing at).
 * @add: not used here.
 *
 * On success the file is left open, a region describing it is pushed onto
 * the head of iomem_regions, iomem_size is increased by the region size
 * plus one page, and 0 is returned. On any failure 1 is returned and a
 * file descriptor opened here is closed again.
 */
int __init parse_iomem(char *str, int *add)
{
	struct iomem_region *region;
	struct stat64 st;
	char *comma, *path;
	int fd, size;

	comma = strchr(str,',');
	if (comma == NULL) {
		os_warn("parse_iomem : failed to parse iomem\n");
		return 1;
	}

	/* Split "<driver>,<file>" in place. */
	*comma = '\0';
	path = comma + 1;

	fd = open(path, O_RDWR, 0);
	if (fd < 0) {
		perror("parse_iomem - Couldn't open io file");
		return 1;
	}

	if (fstat64(fd, &st) < 0) {
		perror("parse_iomem - cannot stat_fd file");
		goto out_close;
	}

	region = malloc(sizeof(*region));
	if (region == NULL) {
		perror("Couldn't allocate iomem_region struct");
		goto out_close;
	}

	/*
	 * NOTE(review): a whole page is added before masking, so a file whose
	 * size is already page aligned is given one extra page - confirm that
	 * this is intended.
	 */
	size = (st.st_size + UM_KERN_PAGE_SIZE) & ~(UM_KERN_PAGE_SIZE - 1);

	*region = ((struct iomem_region) { .next	= iomem_regions,
					   .driver	= str,
					   .fd		= fd,
					   .size	= size,
					   .phys	= 0,
					   .virt	= 0 });
	iomem_regions = region;
	iomem_size += region->size + UM_KERN_PAGE_SIZE;

	return 0;

out_close:
	close(fd);
	return 1;
}
542
543