Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/amd64/linux/linux_sysvec.c
39492 views
1
/*-
2
* Copyright (c) 2004 Tim J. Robbins
3
* Copyright (c) 2003 Peter Wemm
4
* Copyright (c) 2002 Doug Rabson
5
* Copyright (c) 1998-1999 Andrew Gallatin
6
* Copyright (c) 1994-1996 Søren Schmidt
7
* All rights reserved.
8
* Copyright (c) 2013, 2021 Dmitry Chagin <[email protected]>
9
*
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
12
* are met:
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer
15
* in this position and unchanged.
16
* 2. Redistributions in binary form must reproduce the above copyright
17
* notice, this list of conditions and the following disclaimer in the
18
* documentation and/or other materials provided with the distribution.
19
* 3. The name of the author may not be used to endorse or promote products
20
* derived from this software without specific prior written permission
21
*
22
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
*/
33
34
#define __ELF_WORD_SIZE 64
35
36
#include <sys/param.h>
37
#include <sys/exec.h>
38
#include <sys/imgact.h>
39
#include <sys/imgact_elf.h>
40
#include <sys/kernel.h>
41
#include <sys/ktr.h>
42
#include <sys/lock.h>
43
#include <sys/module.h>
44
#include <sys/mutex.h>
45
#include <sys/proc.h>
46
#include <sys/stddef.h>
47
#include <sys/syscallsubr.h>
48
#include <sys/sysctl.h>
49
#include <sys/sysent.h>
50
51
#include <vm/pmap.h>
52
#include <vm/vm.h>
53
#include <vm/vm_param.h>
54
55
#include <machine/md_var.h>
56
#include <machine/trap.h>
57
58
#include <x86/linux/linux_x86.h>
59
#include <amd64/linux/linux.h>
60
#include <amd64/linux/linux_proto.h>
61
#include <compat/linux/linux_elf.h>
62
#include <compat/linux/linux_emul.h>
63
#include <compat/linux/linux_fork.h>
64
#include <compat/linux/linux_ioctl.h>
65
#include <compat/linux/linux_mib.h>
66
#include <compat/linux/linux_misc.h>
67
#include <compat/linux/linux_signal.h>
68
#include <compat/linux/linux_util.h>
69
#include <compat/linux/linux_vdso.h>
70
71
#include <x86/linux/linux_x86_sigframe.h>
72
73
_Static_assert(sizeof(struct l_fpstate) ==
74
sizeof(__typeof(((mcontext_t *)0)->mc_fpstate)),
75
"fxsave area size incorrect");
76
77
MODULE_VERSION(linux64, 1);
78
79
/*
 * Layout of the top of the Linux process address space (LA48), from the
 * highest address downwards: the vDSO region, one page for the shared
 * page, then the user stack with ps_strings at its very top.
 *
 * Every replacement list is fully parenthesized so the macros can be
 * used safely inside larger expressions (division, modulo, casts).
 */
#define	LINUX_VDSOPAGE_SIZE	(PAGE_SIZE * 2)
#define	LINUX_VDSOPAGE_LA48	(VM_MAXUSER_ADDRESS_LA48 - \
				    LINUX_VDSOPAGE_SIZE)
/* PAGE_SIZE - the size of the native SHAREDPAGE */
#define	LINUX_SHAREDPAGE_LA48	(LINUX_VDSOPAGE_LA48 - PAGE_SIZE)
#define	LINUX_USRSTACK_LA48	LINUX_SHAREDPAGE_LA48
#define	LINUX_PS_STRINGS_LA48	(LINUX_USRSTACK_LA48 - \
				    sizeof(struct ps_strings))
90
91
static int linux_szsigcode;
92
static vm_object_t linux_vdso_obj;
93
static char *linux_vdso_mapping;
94
extern char _binary_linux_vdso_so_o_start;
95
extern char _binary_linux_vdso_so_o_end;
96
static vm_offset_t linux_vdso_base;
97
98
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
99
extern const char *linux_syscallnames[];
100
101
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
102
103
static void linux_vdso_install(const void *param);
104
static void linux_vdso_deinstall(const void *param);
105
static void linux_vdso_reloc(char *mapping, Elf_Addr offset);
106
static void linux_set_syscall_retval(struct thread *td, int error);
107
static int linux_fetch_syscall_args(struct thread *td);
108
static void linux_exec_setregs(struct thread *td, struct image_params *imgp,
109
uintptr_t stack);
110
static void linux_exec_sysvec_init(void *param);
111
static int linux_on_exec_vmspace(struct proc *p,
112
struct image_params *imgp);
113
static void linux_set_fork_retval(struct thread *td);
114
static int linux_vsyscall(struct thread *td);
115
116
LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode);
117
LINUX_VDSO_SYM_CHAR(linux_platform);
118
LINUX_VDSO_SYM_INTPTR(kern_timekeep_base);
119
LINUX_VDSO_SYM_INTPTR(kern_tsc_selector);
120
LINUX_VDSO_SYM_INTPTR(kern_cpu_selector);
121
122
/*
123
* According to the Intel x86 ISA 64-bit syscall
124
* saves %rip to %rcx and rflags to %r11. Registers on syscall entry:
125
* %rax system call number
126
* %rcx return address
127
* %r11 saved rflags
128
* %rdi arg1
129
* %rsi arg2
130
* %rdx arg3
131
* %r10 arg4
132
* %r8 arg5
133
* %r9 arg6
134
*
135
* Then FreeBSD fast_syscall() move registers:
136
* %rcx -> trapframe.tf_rip
137
* %r10 -> trapframe.tf_rcx
138
*/
139
static int
140
linux_fetch_syscall_args(struct thread *td)
141
{
142
struct proc *p;
143
struct trapframe *frame;
144
struct syscall_args *sa;
145
146
p = td->td_proc;
147
frame = td->td_frame;
148
sa = &td->td_sa;
149
150
sa->args[0] = frame->tf_rdi;
151
sa->args[1] = frame->tf_rsi;
152
sa->args[2] = frame->tf_rdx;
153
sa->args[3] = frame->tf_rcx;
154
sa->args[4] = frame->tf_r8;
155
sa->args[5] = frame->tf_r9;
156
sa->code = frame->tf_rax;
157
sa->original_code = sa->code;
158
159
if (sa->code >= p->p_sysent->sv_size)
160
/* nosys */
161
sa->callp = &nosys_sysent;
162
else
163
sa->callp = &p->p_sysent->sv_table[sa->code];
164
165
/* Restore r10 earlier to avoid doing this multiply times. */
166
frame->tf_r10 = frame->tf_rcx;
167
/* Restore %rcx for machine context. */
168
frame->tf_rcx = frame->tf_rip;
169
170
td->td_retval[0] = 0;
171
return (0);
172
}
173
174
static void
175
linux_set_syscall_retval(struct thread *td, int error)
176
{
177
struct trapframe *frame;
178
179
frame = td->td_frame;
180
181
switch (error) {
182
case 0:
183
frame->tf_rax = td->td_retval[0];
184
break;
185
186
case ERESTART:
187
/*
188
* Reconstruct pc, we know that 'syscall' is 2 bytes,
189
* lcall $X,y is 7 bytes, int 0x80 is 2 bytes.
190
* We saved this in tf_err.
191
*
192
*/
193
frame->tf_rip -= frame->tf_err;
194
break;
195
196
case EJUSTRETURN:
197
break;
198
199
default:
200
frame->tf_rax = bsd_to_linux_errno(error);
201
break;
202
}
203
204
/*
205
* Differently from FreeBSD native ABI, on Linux only %rcx
206
* and %r11 values are not preserved across the syscall.
207
* Require full context restore to get all registers except
208
* those two restored at return to usermode.
209
*/
210
set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
211
}
212
213
static void
214
linux_set_fork_retval(struct thread *td)
215
{
216
struct trapframe *frame = td->td_frame;
217
218
frame->tf_rax = 0;
219
}
220
221
void
222
linux64_arch_copyout_auxargs(struct image_params *imgp, Elf_Auxinfo **pos)
223
{
224
225
AUXARGS_ENTRY((*pos), LINUX_AT_SYSINFO_EHDR, linux_vdso_base);
226
AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP, cpu_feature);
227
AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP2, linux_x86_elf_hwcap2());
228
AUXARGS_ENTRY((*pos), LINUX_AT_PLATFORM, PTROUT(linux_platform));
229
}
230
231
/*
232
* Reset registers to default values on exec.
233
*/
234
static void
235
linux_exec_setregs(struct thread *td, struct image_params *imgp,
236
uintptr_t stack)
237
{
238
struct trapframe *regs;
239
struct pcb *pcb;
240
register_t saved_rflags;
241
242
regs = td->td_frame;
243
pcb = td->td_pcb;
244
245
if (td->td_proc->p_md.md_ldt != NULL)
246
user_ldt_free(td);
247
248
pcb->pcb_fsbase = 0;
249
pcb->pcb_gsbase = 0;
250
clear_pcb_flags(pcb, PCB_32BIT | PCB_TLSBASE);
251
pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
252
set_pcb_flags(pcb, PCB_FULL_IRET);
253
254
saved_rflags = regs->tf_rflags & PSL_T;
255
bzero((char *)regs, sizeof(struct trapframe));
256
regs->tf_rip = imgp->entry_addr;
257
regs->tf_rsp = stack;
258
regs->tf_rflags = PSL_USER | saved_rflags;
259
regs->tf_ss = _udatasel;
260
regs->tf_cs = _ucodesel;
261
regs->tf_ds = _udatasel;
262
regs->tf_es = _udatasel;
263
regs->tf_fs = _ufssel;
264
regs->tf_gs = _ugssel;
265
regs->tf_flags = TF_HASSEGS;
266
267
x86_clear_dbregs(pcb);
268
269
/*
270
* Drop the FP state if we hold it, so that the process gets a
271
* clean FP state if it uses the FPU again.
272
*/
273
fpstate_drop(td);
274
}
275
276
static int
277
linux_fxrstor(struct thread *td, mcontext_t *mcp, struct l_sigcontext *sc)
278
{
279
struct savefpu *fp = (struct savefpu *)&mcp->mc_fpstate[0];
280
int error;
281
282
error = copyin(PTRIN(sc->sc_fpstate), fp, sizeof(mcp->mc_fpstate));
283
if (error != 0)
284
return (error);
285
bzero(&fp->sv_pad[0], sizeof(fp->sv_pad));
286
return (set_fpcontext(td, mcp, NULL, 0));
287
}
288
289
static int
290
linux_xrstor(struct thread *td, mcontext_t *mcp, struct l_sigcontext *sc)
291
{
292
struct savefpu *fp = (struct savefpu *)&mcp->mc_fpstate[0];
293
char *xfpustate;
294
struct proc *p;
295
uint32_t magic2;
296
int error;
297
298
p = td->td_proc;
299
mcp->mc_xfpustate_len = cpu_max_ext_state_size - sizeof(struct savefpu);
300
301
/* Legacy region of an xsave area. */
302
error = copyin(PTRIN(sc->sc_fpstate), fp, sizeof(mcp->mc_fpstate));
303
if (error != 0)
304
return (error);
305
bzero(&fp->sv_pad[0], sizeof(fp->sv_pad));
306
307
/* Extended region of an xsave area. */
308
sc->sc_fpstate += sizeof(mcp->mc_fpstate);
309
xfpustate = (char *)fpu_save_area_alloc();
310
error = copyin(PTRIN(sc->sc_fpstate), xfpustate, mcp->mc_xfpustate_len);
311
if (error != 0) {
312
fpu_save_area_free((struct savefpu *)xfpustate);
313
uprintf("pid %d (%s): linux xrstor failed\n", p->p_pid,
314
td->td_name);
315
return (error);
316
}
317
318
/* Linux specific end of xsave area marker. */
319
sc->sc_fpstate += mcp->mc_xfpustate_len;
320
error = copyin(PTRIN(sc->sc_fpstate), &magic2, LINUX_FP_XSTATE_MAGIC2_SIZE);
321
if (error != 0 || magic2 != LINUX_FP_XSTATE_MAGIC2) {
322
fpu_save_area_free((struct savefpu *)xfpustate);
323
uprintf("pid %d (%s): sigreturn magic2 0x%x error %d\n",
324
p->p_pid, td->td_name, magic2, error);
325
return (error);
326
}
327
328
error = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
329
fpu_save_area_free((struct savefpu *)xfpustate);
330
if (error != 0) {
331
uprintf("pid %d (%s): sigreturn set_fpcontext error %d\n",
332
p->p_pid, td->td_name, error);
333
}
334
return (error);
335
}
336
337
static int
338
linux_copyin_fpstate(struct thread *td, struct l_ucontext *uc)
339
{
340
mcontext_t mc;
341
342
bzero(&mc, sizeof(mc));
343
mc.mc_ownedfp = _MC_FPOWNED_FPU;
344
mc.mc_fpformat = _MC_FPFMT_XMM;
345
346
if ((uc->uc_flags & LINUX_UC_FP_XSTATE) != 0)
347
return (linux_xrstor(td, &mc, &uc->uc_mcontext));
348
else
349
return (linux_fxrstor(td, &mc, &uc->uc_mcontext));
350
}
351
352
/*
353
* Copied from amd64/amd64/machdep.c
354
*/
355
int
356
linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
357
{
358
struct proc *p;
359
struct l_rt_sigframe sf;
360
struct l_sigcontext *context;
361
struct trapframe *regs;
362
unsigned long rflags;
363
sigset_t bmask;
364
int error;
365
ksiginfo_t ksi;
366
367
regs = td->td_frame;
368
error = copyin((void *)regs->tf_rbx, &sf, sizeof(sf));
369
if (error != 0)
370
return (error);
371
372
p = td->td_proc;
373
context = &sf.sf_uc.uc_mcontext;
374
rflags = context->sc_rflags;
375
376
/*
377
* Don't allow users to change privileged or reserved flags.
378
*/
379
/*
380
* XXX do allow users to change the privileged flag PSL_RF.
381
* The cpu sets PSL_RF in tf_rflags for faults. Debuggers
382
* should sometimes set it there too. tf_rflags is kept in
383
* the signal context during signal handling and there is no
384
* other place to remember it, so the PSL_RF bit may be
385
* corrupted by the signal handler without us knowing.
386
* Corruption of the PSL_RF bit at worst causes one more or
387
* one less debugger trap, so allowing it is fairly harmless.
388
*/
389
if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
390
uprintf("pid %d comm %s linux mangled rflags %#lx\n",
391
p->p_pid, p->p_comm, rflags);
392
return (EINVAL);
393
}
394
395
/*
396
* Don't allow users to load a valid privileged %cs. Let the
397
* hardware check for invalid selectors, excess privilege in
398
* other selectors, invalid %eip's and invalid %esp's.
399
*/
400
if (!CS_SECURE(context->sc_cs)) {
401
uprintf("pid %d comm %s linux mangled cs %#x\n",
402
p->p_pid, p->p_comm, context->sc_cs);
403
ksiginfo_init_trap(&ksi);
404
ksi.ksi_signo = SIGBUS;
405
ksi.ksi_code = BUS_OBJERR;
406
ksi.ksi_trapno = T_PROTFLT;
407
ksi.ksi_addr = (void *)regs->tf_rip;
408
trapsignal(td, &ksi);
409
return (EINVAL);
410
}
411
412
linux_to_bsd_sigset(&sf.sf_uc.uc_sigmask, &bmask);
413
kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
414
415
regs->tf_rdi = context->sc_rdi;
416
regs->tf_rsi = context->sc_rsi;
417
regs->tf_rdx = context->sc_rdx;
418
regs->tf_rbp = context->sc_rbp;
419
regs->tf_rbx = context->sc_rbx;
420
regs->tf_rcx = context->sc_rcx;
421
regs->tf_rax = context->sc_rax;
422
regs->tf_rip = context->sc_rip;
423
regs->tf_rsp = context->sc_rsp;
424
regs->tf_r8 = context->sc_r8;
425
regs->tf_r9 = context->sc_r9;
426
regs->tf_r10 = context->sc_r10;
427
regs->tf_r11 = context->sc_r11;
428
regs->tf_r12 = context->sc_r12;
429
regs->tf_r13 = context->sc_r13;
430
regs->tf_r14 = context->sc_r14;
431
regs->tf_r15 = context->sc_r15;
432
regs->tf_cs = context->sc_cs;
433
regs->tf_err = context->sc_err;
434
regs->tf_rflags = rflags;
435
436
error = linux_copyin_fpstate(td, &sf.sf_uc);
437
if (error != 0) {
438
uprintf("pid %d comm %s linux can't restore fpu state %d\n",
439
p->p_pid, p->p_comm, error);
440
return (error);
441
}
442
443
set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
444
return (EJUSTRETURN);
445
}
446
447
static int
448
linux_fxsave(mcontext_t *mcp, void *ufp)
449
{
450
struct l_fpstate *fx = (struct l_fpstate *)&mcp->mc_fpstate[0];
451
452
bzero(&fx->reserved2[0], sizeof(fx->reserved2));
453
return (copyout(fx, ufp, sizeof(*fx)));
454
}
455
456
static int
457
linux_xsave(mcontext_t *mcp, char *xfpusave, char *ufp)
458
{
459
struct l_fpstate *fx = (struct l_fpstate *)&mcp->mc_fpstate[0];
460
uint32_t magic2;
461
int error;
462
463
/* Legacy region of an xsave area. */
464
fx->sw_reserved.magic1 = LINUX_FP_XSTATE_MAGIC1;
465
fx->sw_reserved.xstate_size = mcp->mc_xfpustate_len + sizeof(*fx);
466
fx->sw_reserved.extended_size = fx->sw_reserved.xstate_size +
467
LINUX_FP_XSTATE_MAGIC2_SIZE;
468
fx->sw_reserved.xfeatures = xsave_mask;
469
470
error = copyout(fx, ufp, sizeof(*fx));
471
if (error != 0)
472
return (error);
473
ufp += sizeof(*fx);
474
475
/* Extended region of an xsave area. */
476
error = copyout(xfpusave, ufp, mcp->mc_xfpustate_len);
477
if (error != 0)
478
return (error);
479
480
/* Linux specific end of xsave area marker. */
481
ufp += mcp->mc_xfpustate_len;
482
magic2 = LINUX_FP_XSTATE_MAGIC2;
483
return (copyout(&magic2, ufp, LINUX_FP_XSTATE_MAGIC2_SIZE));
484
}
485
486
static int
487
linux_copyout_fpstate(struct thread *td, struct l_ucontext *uc, char **sp)
488
{
489
size_t xfpusave_len;
490
char *xfpusave;
491
mcontext_t mc;
492
char *ufp = *sp;
493
494
get_fpcontext(td, &mc, &xfpusave, &xfpusave_len);
495
KASSERT(mc.mc_fpformat != _MC_FPFMT_NODEV, ("fpu not present"));
496
497
/* Room for fxsave area. */
498
ufp -= sizeof(struct l_fpstate);
499
if (xfpusave != NULL) {
500
/* Room for xsave area. */
501
ufp -= (xfpusave_len + LINUX_FP_XSTATE_MAGIC2_SIZE);
502
uc->uc_flags |= LINUX_UC_FP_XSTATE;
503
}
504
*sp = ufp = (char *)((unsigned long)ufp & ~0x3Ful);
505
506
if (xfpusave != NULL)
507
return (linux_xsave(&mc, xfpusave, ufp));
508
else
509
return (linux_fxsave(&mc, ufp));
510
}
511
512
/*
513
* copied from amd64/amd64/machdep.c
514
*
515
* Send an interrupt to process.
516
*/
517
static void
518
linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
519
{
520
struct l_rt_sigframe sf, *sfp;
521
struct proc *p;
522
struct thread *td;
523
struct sigacts *psp;
524
char *sp;
525
struct trapframe *regs;
526
int sig, code;
527
int oonstack, issiginfo;
528
529
td = curthread;
530
p = td->td_proc;
531
PROC_LOCK_ASSERT(p, MA_OWNED);
532
sig = linux_translate_traps(ksi->ksi_signo, ksi->ksi_trapno);
533
psp = p->p_sigacts;
534
issiginfo = SIGISMEMBER(psp->ps_siginfo, sig);
535
code = ksi->ksi_code;
536
mtx_assert(&psp->ps_mtx, MA_OWNED);
537
regs = td->td_frame;
538
oonstack = sigonstack(regs->tf_rsp);
539
540
LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u",
541
catcher, sig, mask, code);
542
543
bzero(&sf, sizeof(sf));
544
sf.sf_uc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
545
sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size;
546
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
547
? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
548
549
/* Allocate space for the signal handler context. */
550
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
551
SIGISMEMBER(psp->ps_sigonstack, sig)) {
552
sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
553
} else
554
sp = (char *)regs->tf_rsp - 128;
555
556
mtx_unlock(&psp->ps_mtx);
557
PROC_UNLOCK(p);
558
559
if (linux_copyout_fpstate(td, &sf.sf_uc, &sp) != 0) {
560
uprintf("pid %d comm %s linux can't save fpu state, killing\n",
561
p->p_pid, p->p_comm);
562
PROC_LOCK(p);
563
sigexit(td, SIGILL);
564
}
565
sf.sf_uc.uc_mcontext.sc_fpstate = (register_t)sp;
566
567
/* Make room, keeping the stack aligned. */
568
sp -= sizeof(struct l_rt_sigframe);
569
sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul);
570
571
/* Save user context. */
572
bsd_to_linux_sigset(mask, &sf.sf_uc.uc_sigmask);
573
sf.sf_uc.uc_mcontext.sc_mask = sf.sf_uc.uc_sigmask;
574
sf.sf_uc.uc_mcontext.sc_rdi = regs->tf_rdi;
575
sf.sf_uc.uc_mcontext.sc_rsi = regs->tf_rsi;
576
sf.sf_uc.uc_mcontext.sc_rdx = regs->tf_rdx;
577
sf.sf_uc.uc_mcontext.sc_rbp = regs->tf_rbp;
578
sf.sf_uc.uc_mcontext.sc_rbx = regs->tf_rbx;
579
sf.sf_uc.uc_mcontext.sc_rcx = regs->tf_rcx;
580
sf.sf_uc.uc_mcontext.sc_rax = regs->tf_rax;
581
sf.sf_uc.uc_mcontext.sc_rip = regs->tf_rip;
582
sf.sf_uc.uc_mcontext.sc_rsp = regs->tf_rsp;
583
sf.sf_uc.uc_mcontext.sc_r8 = regs->tf_r8;
584
sf.sf_uc.uc_mcontext.sc_r9 = regs->tf_r9;
585
sf.sf_uc.uc_mcontext.sc_r10 = regs->tf_r10;
586
sf.sf_uc.uc_mcontext.sc_r11 = regs->tf_r11;
587
sf.sf_uc.uc_mcontext.sc_r12 = regs->tf_r12;
588
sf.sf_uc.uc_mcontext.sc_r13 = regs->tf_r13;
589
sf.sf_uc.uc_mcontext.sc_r14 = regs->tf_r14;
590
sf.sf_uc.uc_mcontext.sc_r15 = regs->tf_r15;
591
sf.sf_uc.uc_mcontext.sc_cs = regs->tf_cs;
592
sf.sf_uc.uc_mcontext.sc_rflags = regs->tf_rflags;
593
sf.sf_uc.uc_mcontext.sc_err = regs->tf_err;
594
sf.sf_uc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
595
sf.sf_uc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr;
596
597
/* Translate the signal. */
598
sig = bsd_to_linux_signal(sig);
599
/* Fill in POSIX parts. */
600
siginfo_to_lsiginfo(&ksi->ksi_info, &sf.sf_si, sig);
601
602
/* Copy the sigframe out to the user's stack. */
603
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
604
uprintf("pid %d comm %s has trashed its stack, killing\n",
605
p->p_pid, p->p_comm);
606
PROC_LOCK(p);
607
sigexit(td, SIGILL);
608
}
609
610
fpstate_drop(td);
611
/* Build the argument list for the signal handler. */
612
regs->tf_rdi = sig; /* arg 1 in %rdi */
613
regs->tf_rax = 0;
614
if (issiginfo) {
615
regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
616
regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
617
} else {
618
regs->tf_rsi = 0;
619
regs->tf_rdx = 0;
620
}
621
regs->tf_rcx = (register_t)catcher;
622
regs->tf_rsp = (long)sfp;
623
regs->tf_rip = linux_rt_sigcode;
624
regs->tf_rflags &= ~(PSL_T | PSL_D);
625
regs->tf_cs = _ucodesel;
626
set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
627
PROC_LOCK(p);
628
mtx_lock(&psp->ps_mtx);
629
}
630
631
#define LINUX_VSYSCALL_START (-10UL << 20)
632
#define LINUX_VSYSCALL_SZ 1024
633
634
const unsigned long linux_vsyscall_vector[] = {
635
LINUX_SYS_gettimeofday,
636
LINUX_SYS_linux_time,
637
LINUX_SYS_linux_getcpu,
638
};
639
640
static int
641
linux_vsyscall(struct thread *td)
642
{
643
struct trapframe *frame;
644
uint64_t retqaddr;
645
int code, traced;
646
int error;
647
648
frame = td->td_frame;
649
650
/* Check %rip for vsyscall area. */
651
if (__predict_true(frame->tf_rip < LINUX_VSYSCALL_START))
652
return (EINVAL);
653
if ((frame->tf_rip & (LINUX_VSYSCALL_SZ - 1)) != 0)
654
return (EINVAL);
655
code = (frame->tf_rip - LINUX_VSYSCALL_START) / LINUX_VSYSCALL_SZ;
656
if (code >= nitems(linux_vsyscall_vector))
657
return (EINVAL);
658
659
/*
660
* vsyscall called as callq *(%rax), so we must
661
* use return address from %rsp and also fixup %rsp.
662
*/
663
error = copyin((void *)frame->tf_rsp, &retqaddr, sizeof(retqaddr));
664
if (error)
665
return (error);
666
667
frame->tf_rip = retqaddr;
668
frame->tf_rax = linux_vsyscall_vector[code];
669
frame->tf_rsp += 8;
670
671
traced = (frame->tf_flags & PSL_T);
672
673
amd64_syscall(td, traced);
674
675
return (0);
676
}
677
678
struct sysentvec elf_linux_sysvec = {
679
.sv_size = LINUX_SYS_MAXSYSCALL,
680
.sv_table = linux_sysent,
681
.sv_fixup = __elfN(freebsd_fixup),
682
.sv_sendsig = linux_rt_sendsig,
683
.sv_sigcode = &_binary_linux_vdso_so_o_start,
684
.sv_szsigcode = &linux_szsigcode,
685
.sv_name = "Linux ELF64",
686
.sv_coredump = elf64_coredump,
687
.sv_elf_core_osabi = ELFOSABI_NONE,
688
.sv_elf_core_abi_vendor = LINUX_ABI_VENDOR,
689
.sv_elf_core_prepare_notes = linux64_prepare_notes,
690
.sv_minsigstksz = LINUX_MINSIGSTKSZ,
691
.sv_minuser = VM_MIN_ADDRESS,
692
.sv_maxuser = VM_MAXUSER_ADDRESS_LA48,
693
.sv_usrstack = LINUX_USRSTACK_LA48,
694
.sv_psstrings = LINUX_PS_STRINGS_LA48,
695
.sv_psstringssz = sizeof(struct ps_strings),
696
.sv_stackprot = VM_PROT_ALL,
697
.sv_copyout_auxargs = __linuxN(copyout_auxargs),
698
.sv_copyout_strings = __linuxN(copyout_strings),
699
.sv_setregs = linux_exec_setregs,
700
.sv_fixlimit = NULL,
701
.sv_maxssiz = NULL,
702
.sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN |
703
SV_SIG_WAITNDQ | SV_TIMEKEEP,
704
.sv_set_syscall_retval = linux_set_syscall_retval,
705
.sv_fetch_syscall_args = linux_fetch_syscall_args,
706
.sv_syscallnames = linux_syscallnames,
707
.sv_shared_page_base = LINUX_SHAREDPAGE_LA48,
708
.sv_shared_page_len = PAGE_SIZE,
709
.sv_schedtail = linux_schedtail,
710
.sv_thread_detach = linux_thread_detach,
711
.sv_trap = linux_vsyscall,
712
.sv_hwcap = NULL,
713
.sv_hwcap2 = NULL,
714
.sv_hwcap3 = NULL,
715
.sv_hwcap4 = NULL,
716
.sv_onexec = linux_on_exec_vmspace,
717
.sv_onexit = linux_on_exit,
718
.sv_ontdexit = linux_thread_dtor,
719
.sv_setid_allowed = &linux_setid_allowed_query,
720
.sv_set_fork_retval = linux_set_fork_retval,
721
};
722
723
static int
724
linux_on_exec_vmspace(struct proc *p, struct image_params *imgp)
725
{
726
int error;
727
728
error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base,
729
LINUX_VDSOPAGE_SIZE, imgp);
730
if (error == 0)
731
error = linux_on_exec(p, imgp);
732
return (error);
733
}
734
735
/*
736
* linux_vdso_install() and linux_exec_sysvec_init() must be called
737
* after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY).
738
*/
739
static void
740
linux_exec_sysvec_init(void *param)
741
{
742
l_uintptr_t *ktimekeep_base, *ktsc_selector;
743
struct sysentvec *sv;
744
ptrdiff_t tkoff;
745
746
sv = param;
747
amd64_lower_shared_page(sv);
748
/* Fill timekeep_base */
749
exec_sysvec_init(sv);
750
751
tkoff = kern_timekeep_base - linux_vdso_base;
752
ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
753
*ktimekeep_base = sv->sv_shared_page_base + sv->sv_timekeep_offset;
754
755
tkoff = kern_tsc_selector - linux_vdso_base;
756
ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
757
*ktsc_selector = linux_vdso_tsc_selector_idx();
758
if (bootverbose)
759
printf("Linux x86-64 vDSO tsc_selector: %lu\n", *ktsc_selector);
760
761
tkoff = kern_cpu_selector - linux_vdso_base;
762
ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
763
*ktsc_selector = linux_vdso_cpu_selector_idx();
764
if (bootverbose)
765
printf("Linux x86-64 vDSO cpu_selector: %lu\n", *ktsc_selector);
766
}
767
SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY,
768
linux_exec_sysvec_init, &elf_linux_sysvec);
769
770
static void
771
linux_vdso_install(const void *param)
772
{
773
char *vdso_start = &_binary_linux_vdso_so_o_start;
774
char *vdso_end = &_binary_linux_vdso_so_o_end;
775
776
linux_szsigcode = vdso_end - vdso_start;
777
MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE);
778
779
linux_vdso_base = LINUX_VDSOPAGE_LA48;
780
if (hw_lower_amd64_sharedpage != 0)
781
linux_vdso_base -= PAGE_SIZE;
782
783
__elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base);
784
785
linux_vdso_obj = __elfN(linux_shared_page_init)
786
(&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
787
bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode);
788
789
linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base);
790
}
791
SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST,
792
linux_vdso_install, NULL);
793
794
static void
795
linux_vdso_deinstall(const void *param)
796
{
797
798
__elfN(linux_shared_page_fini)(linux_vdso_obj,
799
linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
800
}
801
SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
802
linux_vdso_deinstall, NULL);
803
804
static void
805
linux_vdso_reloc(char *mapping, Elf_Addr offset)
806
{
807
const Elf_Ehdr *ehdr;
808
const Elf_Shdr *shdr;
809
Elf64_Addr *where, val;
810
Elf_Size rtype, symidx;
811
const Elf_Rela *rela;
812
Elf_Addr addr, addend;
813
int relacnt;
814
int i, j;
815
816
MPASS(offset != 0);
817
818
relacnt = 0;
819
ehdr = (const Elf_Ehdr *)mapping;
820
shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff);
821
for (i = 0; i < ehdr->e_shnum; i++)
822
{
823
switch (shdr[i].sh_type) {
824
case SHT_REL:
825
printf("Linux x86_64 vDSO: unexpected Rel section\n");
826
break;
827
case SHT_RELA:
828
rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset);
829
relacnt = shdr[i].sh_size / sizeof(*rela);
830
}
831
}
832
833
for (j = 0; j < relacnt; j++, rela++) {
834
where = (Elf_Addr *)(mapping + rela->r_offset);
835
addend = rela->r_addend;
836
rtype = ELF_R_TYPE(rela->r_info);
837
symidx = ELF_R_SYM(rela->r_info);
838
839
switch (rtype) {
840
case R_X86_64_NONE: /* none */
841
break;
842
843
case R_X86_64_RELATIVE: /* B + A */
844
addr = (Elf_Addr)(offset + addend);
845
val = addr;
846
if (*where != val)
847
*where = val;
848
break;
849
case R_X86_64_IRELATIVE:
850
printf("Linux x86_64 vDSO: unexpected ifunc relocation, "
851
"symbol index %ld\n", symidx);
852
break;
853
default:
854
printf("Linux x86_64 vDSO: unexpected relocation type %ld, "
855
"symbol index %ld\n", rtype, symidx);
856
}
857
}
858
}
859
860
static Elf_Brandnote linux64_brandnote = {
861
.hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
862
.hdr.n_descsz = 16,
863
.hdr.n_type = 1,
864
.vendor = GNU_ABI_VENDOR,
865
.flags = BN_TRANSLATE_OSREL,
866
.trans_osrel = linux_trans_osrel
867
};
868
869
static Elf64_Brandinfo linux_glibc2brand = {
870
.brand = ELFOSABI_LINUX,
871
.machine = EM_X86_64,
872
.compat_3_brand = "Linux",
873
.interp_path = "/lib64/ld-linux-x86-64.so.2",
874
.sysvec = &elf_linux_sysvec,
875
.interp_newpath = NULL,
876
.brand_note = &linux64_brandnote,
877
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
878
};
879
880
static Elf64_Brandinfo linux_glibc2brandshort = {
881
.brand = ELFOSABI_LINUX,
882
.machine = EM_X86_64,
883
.compat_3_brand = "Linux",
884
.interp_path = "/lib64/ld-linux.so.2",
885
.sysvec = &elf_linux_sysvec,
886
.interp_newpath = NULL,
887
.brand_note = &linux64_brandnote,
888
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
889
};
890
891
static Elf64_Brandinfo linux_muslbrand = {
892
.brand = ELFOSABI_LINUX,
893
.machine = EM_X86_64,
894
.compat_3_brand = "Linux",
895
.interp_path = "/lib/ld-musl-x86_64.so.1",
896
.sysvec = &elf_linux_sysvec,
897
.interp_newpath = NULL,
898
.brand_note = &linux64_brandnote,
899
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE |
900
LINUX_BI_FUTEX_REQUEUE
901
};
902
903
static Elf64_Brandinfo *linux_brandlist[] = {
904
&linux_glibc2brand,
905
&linux_glibc2brandshort,
906
&linux_muslbrand,
907
NULL
908
};
909
910
static int
911
linux64_elf_modevent(module_t mod, int type, void *data)
912
{
913
Elf64_Brandinfo **brandinfo;
914
int error;
915
struct linux_ioctl_handler **lihp;
916
917
error = 0;
918
919
switch(type) {
920
case MOD_LOAD:
921
for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
922
++brandinfo)
923
if (elf64_insert_brand_entry(*brandinfo) < 0)
924
error = EINVAL;
925
if (error == 0) {
926
SET_FOREACH(lihp, linux_ioctl_handler_set)
927
linux_ioctl_register_handler(*lihp);
928
stclohz = (stathz ? stathz : hz);
929
if (bootverbose)
930
printf("Linux x86-64 ELF exec handler installed\n");
931
} else
932
printf("cannot insert Linux x86-64 ELF brand handler\n");
933
break;
934
case MOD_UNLOAD:
935
for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
936
++brandinfo)
937
if (elf64_brand_inuse(*brandinfo))
938
error = EBUSY;
939
if (error == 0) {
940
for (brandinfo = &linux_brandlist[0];
941
*brandinfo != NULL; ++brandinfo)
942
if (elf64_remove_brand_entry(*brandinfo) < 0)
943
error = EINVAL;
944
}
945
if (error == 0) {
946
SET_FOREACH(lihp, linux_ioctl_handler_set)
947
linux_ioctl_unregister_handler(*lihp);
948
if (bootverbose)
949
printf("Linux x86_64 ELF exec handler removed\n");
950
} else
951
printf("Could not deinstall Linux x86_64 ELF interpreter entry\n");
952
break;
953
default:
954
return (EOPNOTSUPP);
955
}
956
return (error);
957
}
958
959
static moduledata_t linux64_elf_mod = {
960
"linux64elf",
961
linux64_elf_modevent,
962
0
963
};
964
965
DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
966
MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1);
967
FEATURE(linux64, "Linux 64bit support");
968
969