// arch/x86/entry/vsyscall/vsyscall_64.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2012-2014 Andy Lutomirski <[email protected]>
 *
 * Based on the original implementation which is:
 *	Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
 *	Copyright 2003 Andi Kleen, SuSE Labs.
 *
 * Parts of the original code have been moved to arch/x86/vdso/vma.c
 *
 * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
 * Userspace can request certain kernel services by calling fixed
 * addresses.  This concept is problematic:
 *
 * - It interferes with ASLR.
 * - It's awkward to write code that lives in kernel addresses but is
 *   callable by userspace at fixed addresses.
 * - The whole concept is impossible for 32-bit compat userspace.
 * - UML cannot easily virtualize a vsyscall.
 *
 * As of mid-2014, I believe that there is no new userspace code that
 * will use a vsyscall if the vDSO is present.  I hope that there will
 * soon be no new userspace code that will ever use a vsyscall.
 *
 * The code in this file emulates vsyscalls when notified of a page
 * fault to a vsyscall address.
 */

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

/*
 * Emulation mode, settable at boot via the "vsyscall=" parameter:
 * EMULATE maps a real (readable) vsyscall page, XONLY leaves it
 * execute-only (no PTE; faults are emulated), NONE disables it.
 * The compile-time default comes from CONFIG_LEGACY_VSYSCALL_*.
 */
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
	NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
	XONLY;
#else
	#error VSYSCALL config is broken
#endif

/* Parse the "vsyscall=" early boot parameter. */
static int __init vsyscall_setup(char *str)
{
	if (str) {
		if (!strcmp("emulate", str))
			vsyscall_mode = EMULATE;
		else if (!strcmp("xonly", str))
			vsyscall_mode = XONLY;
		else if (!strcmp("none", str))
			vsyscall_mode = NONE;
		else
			return -EINVAL;

		return 0;
	}

	return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);

/*
 * Rate-limited diagnostic for suspicious vsyscall activity; silent
 * unless show_unhandled_signals is set.
 */
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
			      const char *message)
{
	if (!show_unhandled_signals)
		return;

	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
			   level, current->comm, task_pid_nr(current),
			   message, regs->ip, regs->cs,
			   regs->sp, regs->ax, regs->si, regs->di);
}

/*
 * Map a faulting address to a vsyscall number (0-2).  The three legacy
 * vsyscall entry points sit at VSYSCALL_ADDR + nr * 1024, hence the
 * 0xC00 mask and the >> 10.  Returns -EINVAL for any other address.
 */
static int addr_to_vsyscall_nr(unsigned long addr)
{
	int nr;

	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
		return -EINVAL;

	nr = (addr & 0xC00UL) >> 10;
	if (nr >= 3)
		return -EINVAL;

	return nr;
}

/*
 * Check that userspace may write [ptr, ptr+size).  On failure, fill in
 * the fault state a real page fault would have left (so the SIGSEGV
 * looks like a genuine write fault) and queue the signal.
 */
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
	if (!access_ok((void __user *)ptr, size)) {
		struct thread_struct *thread = &current->thread;

		thread->error_code	= X86_PF_USER | X86_PF_WRITE;
		thread->cr2		= ptr;
		thread->trap_nr		= X86_TRAP_PF;

		force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
		return false;
	} else {
		return true;
	}
}

/*
 * Called from the page fault handler for a user fault in the vsyscall
 * page.  Emulates the vsyscall the faulting instruction fetch was
 * targeting and fixes up regs to "return" to the caller, or raises the
 * appropriate signal.  Returns true if the fault was handled here.
 */
bool emulate_vsyscall(unsigned long error_code,
		      struct pt_regs *regs, unsigned long address)
{
	unsigned long caller;
	int vsyscall_nr, syscall_nr, tmp;
	long ret;
	unsigned long orig_dx;

	/* Write faults or kernel-privilege faults never get fixed up. */
	if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
		return false;

	/*
	 * Assume that faults at regs->ip are because of an
	 * instruction fetch. Return early and avoid
	 * emulation for faults during data accesses:
	 */
	if (address != regs->ip) {
		/* Failed vsyscall read */
		if (vsyscall_mode == EMULATE)
			return false;

		/*
		 * User code tried and failed to read the vsyscall page.
		 */
		warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
		return false;
	}

	/*
	 * X86_PF_INSTR is only set when NX is supported.  When
	 * available, use it to double-check that the emulation code
	 * is only being used for instruction fetches:
	 */
	if (cpu_feature_enabled(X86_FEATURE_NX))
		WARN_ON_ONCE(!(error_code & X86_PF_INSTR));

	/*
	 * No point in checking CS -- the only way to get here is a user mode
	 * trap to a high address, which means that we're in 64-bit user code.
	 */

	if (vsyscall_mode == NONE) {
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall attempted with vsyscall=none");
		return false;
	}

	vsyscall_nr = addr_to_vsyscall_nr(address);

	trace_emulate_vsyscall(vsyscall_nr);

	if (vsyscall_nr < 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
		goto sigsegv;
	}

	/* The vsyscall was entered via CALL, so the return address is at *sp. */
	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "vsyscall with bad stack (exploit attempt?)");
		goto sigsegv;
	}

	/*
	 * Check for access_ok violations and find the syscall nr.
	 *
	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
	 * 64-bit, so we don't need to special-case it here.  For all the
	 * vsyscalls, NULL means "don't write anything" not "write it at
	 * address 0".
	 */
	switch (vsyscall_nr) {
	case 0:
		/* gettimeofday(tv, tz) */
		if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_gettimeofday;
		break;

	case 1:
		/* time(t) */
		if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_time;
		break;

	case 2:
		/* getcpu(cpu, node) */
		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_getcpu;
		break;
	}

	/*
	 * Handle seccomp.  regs->ip must be the original value.
	 * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
	 *
	 * We could optimize the seccomp disabled case, but performance
	 * here doesn't matter.
	 */
	regs->orig_ax = syscall_nr;
	regs->ax = -ENOSYS;
	tmp = secure_computing();
	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
		warn_bad_vsyscall(KERN_DEBUG, regs,
				  "seccomp tried to change syscall nr or ip");
		force_exit_sig(SIGSYS);
		return true;
	}
	regs->orig_ax = -1;
	if (tmp)
		goto do_ret; /* skip requested */

	/*
	 * With a real vsyscall, page faults cause SIGSEGV.
	 */
	ret = -EFAULT;
	switch (vsyscall_nr) {
	case 0:
		/* this decodes regs->di and regs->si on its own */
		ret = __x64_sys_gettimeofday(regs);
		break;

	case 1:
		/* this decodes regs->di on its own */
		ret = __x64_sys_time(regs);
		break;

	case 2:
		/* while we could clobber regs->dx, we didn't in the past... */
		orig_dx = regs->dx;
		regs->dx = 0;
		/* this decodes regs->di, regs->si and regs->dx on its own */
		ret = __x64_sys_getcpu(regs);
		regs->dx = orig_dx;
		break;
	}

check_fault:
	if (ret == -EFAULT) {
		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall fault (exploit attempt?)");
		goto sigsegv;
	}

	regs->ax = ret;

do_ret:
	/* Emulate a ret instruction. */
	regs->ip = caller;
	regs->sp += 8;
	return true;

sigsegv:
	force_sig(SIGSEGV);
	return true;
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now.
 * 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
	return "[vsyscall]";
}
static const struct vm_operations_struct gate_vma_ops = {
	.name = gate_vma_name,
};
static struct vm_area_struct gate_vma __ro_after_init = {
	.vm_start	= VSYSCALL_ADDR,
	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC,
	.vm_ops		= &gate_vma_ops,
};

/*
 * Return the pseudo VMA for mm's vsyscall page, or NULL if the mm has
 * no vsyscall mapping (no HAS_VSYSCALL flag under CONFIG_COMPAT, or
 * vsyscall=none).
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
	if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
		return NULL;
#endif
	if (vsyscall_mode == NONE)
		return NULL;
	return &gate_vma;
}

/* Nonzero if addr falls inside mm's gate (vsyscall) area. */
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}

/*
 * The VSYSCALL page is the only user-accessible page in the kernel address
 * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
 * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
 * are enabled.
 *
 * Some day we may create a "minimal" vsyscall mode in which we emulate
 * vsyscalls but leave the page not present.  If so, we skip calling
 * this.
 */
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	/* Set _PAGE_USER at every paging level covering VSYSCALL_ADDR. */
	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
	set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
	pud = pud_offset(p4d, VSYSCALL_ADDR);
	set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
	pmd = pmd_offset(pud, VSYSCALL_ADDR);
	set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}

/* Boot-time setup of the vsyscall fixmap according to vsyscall_mode. */
void __init map_vsyscall(void)
{
	extern char __vsyscall_page;
	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

	/*
	 * For full emulation, the page needs to exist for real.  In
	 * execute-only mode, there is no PTE at all backing the vsyscall
	 * page.
	 */
	if (vsyscall_mode == EMULATE) {
		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
			     PAGE_KERNEL_VVAR);
		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
	}

	/* In XONLY mode, drop VM_READ from the gate VMA: execute-only. */
	if (vsyscall_mode == XONLY)
		vm_flags_init(&gate_vma, VM_EXEC);

	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
		     (unsigned long)VSYSCALL_ADDR);
}