// SPDX-License-Identifier: GPL-2.0-only1/* 32-bit system call dispatch */23#include <linux/linkage.h>4#include <linux/sys.h>5#include <linux/cache.h>6#include <linux/syscalls.h>7#include <linux/entry-common.h>8#include <linux/nospec.h>9#include <linux/uaccess.h>10#include <asm/apic.h>11#include <asm/traps.h>12#include <asm/cpufeature.h>13#include <asm/syscall.h>1415#ifdef CONFIG_IA32_EMULATION16#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat)17#else18#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)19#endif2021#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);22#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *);23#include <asm/syscalls_32.h>24#undef __SYSCALL2526#undef __SYSCALL_NORETURN27#define __SYSCALL_NORETURN __SYSCALL2829/*30* The sys_call_table[] is no longer used for system calls, but31* kernel/trace/trace_syscalls.c still wants to know the system32* call address.33*/34#ifdef CONFIG_X86_3235#define __SYSCALL(nr, sym) __ia32_##sym,36const sys_call_ptr_t sys_call_table[] = {37#include <asm/syscalls_32.h>38};39#undef __SYSCALL40#endif4142#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);43long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)44{45switch (nr) {46#include <asm/syscalls_32.h>47default: return __ia32_sys_ni_syscall(regs);48}49}5051static __always_inline int syscall_32_enter(struct pt_regs *regs)52{53if (IS_ENABLED(CONFIG_IA32_EMULATION))54current_thread_info()->status |= TS_COMPAT;5556return (int)regs->orig_ax;57}5859#ifdef CONFIG_IA32_EMULATION60bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);6162static int __init ia32_emulation_override_cmdline(char *arg)63{64return kstrtobool(arg, &__ia32_enabled);65}66early_param("ia32_emulation", ia32_emulation_override_cmdline);67#endif6869/*70* Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.71*/72static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)73{74/*75* Convert negative numbers to very high and thus out of range76* numbers for comparisons.77*/78unsigned int unr = nr;7980if (likely(unr < IA32_NR_syscalls)) {81unr = array_index_nospec(unr, IA32_NR_syscalls);82regs->ax = ia32_sys_call(regs, unr);83} else if (nr != -1) {84regs->ax = __ia32_sys_ni_syscall(regs);85}86}8788#ifdef CONFIG_IA32_EMULATION89static __always_inline bool int80_is_external(void)90{91const unsigned int offs = (0x80 / 32) * 0x10;92const u32 bit = BIT(0x80 % 32);9394/* The local APIC on XENPV guests is fake */95if (cpu_feature_enabled(X86_FEATURE_XENPV))96return false;9798/*99* If vector 0x80 is set in the APIC ISR then this is an external100* interrupt. Either from broken hardware or injected by a VMM.101*102* Note: In guest mode this is only valid for secure guests where103* the secure module fully controls the vAPIC exposed to the guest.104*/105return apic_read(APIC_ISR + offs) & bit;106}107108/**109* do_int80_emulation - 32-bit legacy syscall C entry from asm110* @regs: syscall arguments in struct pt_args on the stack.111*112* This entry point can be used by 32-bit and 64-bit programs to perform113* 32-bit system calls. Instances of INT $0x80 can be found inline in114* various programs and libraries. It is also used by the vDSO's115* __kernel_vsyscall fallback for hardware that doesn't support a faster116* entry method. Restarted 32-bit system calls also fall back to INT117* $0x80 regardless of what instruction was originally used to do the118* system call.119*120* This is considered a slow path. It is not used by most libc121* implementations on modern hardware except during process startup.122*123* The arguments for the INT $0x80 based syscall are on stack in the124* pt_regs structure:125* eax: system call number126* ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6127*/128__visible noinstr void do_int80_emulation(struct pt_regs *regs)129{130int nr;131132/* Kernel does not use INT $0x80! */133if (unlikely(!user_mode(regs))) {134irqentry_enter(regs);135instrumentation_begin();136panic("Unexpected external interrupt 0x80\n");137}138139/*140* Establish kernel context for instrumentation, including for141* int80_is_external() below which calls into the APIC driver.142* Identical for soft and external interrupts.143*/144enter_from_user_mode(regs);145146instrumentation_begin();147add_random_kstack_offset();148149/* Validate that this is a soft interrupt to the extent possible */150if (unlikely(int80_is_external()))151panic("Unexpected external interrupt 0x80\n");152153/*154* The low level idtentry code pushed -1 into regs::orig_ax155* and regs::ax contains the syscall number.156*157* User tracing code (ptrace or signal handlers) might assume158* that the regs::orig_ax contains a 32-bit number on invoking159* a 32-bit syscall.160*161* Establish the syscall convention by saving the 32bit truncated162* syscall number in regs::orig_ax and by invalidating regs::ax.163*/164regs->orig_ax = regs->ax & GENMASK(31, 0);165regs->ax = -ENOSYS;166167nr = syscall_32_enter(regs);168169local_irq_enable();170nr = syscall_enter_from_user_mode_work(regs, nr);171do_syscall_32_irqs_on(regs, nr);172173instrumentation_end();174syscall_exit_to_user_mode(regs);175}176177#ifdef CONFIG_X86_FRED178/*179* A FRED-specific INT80 handler is warranted for the follwing reasons:180*181* 1) As INT instructions and hardware interrupts are separate event182* types, FRED does not preclude the use of vector 0x80 for external183* interrupts. As a result, the FRED setup code does not reserve184* vector 0x80 and calling int80_is_external() is not merely185* suboptimal but actively incorrect: it could cause a system call186* to be incorrectly ignored.187*188* 2) It is called only for handling vector 0x80 of event type189* EVENT_TYPE_SWINT and will never be called to handle any external190* interrupt (event type EVENT_TYPE_EXTINT).191*192* 3) FRED has separate entry flows depending on if the event came from193* user space or kernel space, and because the kernel does not use194* INT insns, the FRED kernel entry handler fred_entry_from_kernel()195* falls through to fred_bad_type() if the event type is196* EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling197* an INT insn, it can only be from a user level.198*199* 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will200* likely take a different approach if it is ever needed: it201* probably belongs in either fred_intx()/ fred_other() or202* asm_fred_entrypoint_user(), depending on if this ought to be done203* for all entries from userspace or only system204* calls.205*206* 5) INT $0x80 is the fast path for 32-bit system calls under FRED.207*/208DEFINE_FREDENTRY_RAW(int80_emulation)209{210int nr;211212enter_from_user_mode(regs);213214instrumentation_begin();215add_random_kstack_offset();216217/*218* FRED pushed 0 into regs::orig_ax and regs::ax contains the219* syscall number.220*221* User tracing code (ptrace or signal handlers) might assume222* that the regs::orig_ax contains a 32-bit number on invoking223* a 32-bit syscall.224*225* Establish the syscall convention by saving the 32bit truncated226* syscall number in regs::orig_ax and by invalidating regs::ax.227*/228regs->orig_ax = regs->ax & GENMASK(31, 0);229regs->ax = -ENOSYS;230231nr = syscall_32_enter(regs);232233local_irq_enable();234nr = syscall_enter_from_user_mode_work(regs, nr);235do_syscall_32_irqs_on(regs, nr);236237instrumentation_end();238syscall_exit_to_user_mode(regs);239}240#endif /* CONFIG_X86_FRED */241242#else /* CONFIG_IA32_EMULATION */243244/* Handles int $0x80 on a 32bit kernel */245__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)246{247int nr = syscall_32_enter(regs);248249add_random_kstack_offset();250/*251* Subtlety here: if ptrace pokes something larger than 2^31-1 into252* orig_ax, the int return value truncates it. This matches253* the semantics of syscall_get_nr().254*/255nr = syscall_enter_from_user_mode(regs, nr);256instrumentation_begin();257258do_syscall_32_irqs_on(regs, nr);259260instrumentation_end();261syscall_exit_to_user_mode(regs);262}263#endif /* !CONFIG_IA32_EMULATION */264265static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)266{267int nr = syscall_32_enter(regs);268int res;269270add_random_kstack_offset();271/*272* This cannot use syscall_enter_from_user_mode() as it has to273* fetch EBP before invoking any of the syscall entry work274* functions.275*/276syscall_enter_from_user_mode_prepare(regs);277278instrumentation_begin();279/* Fetch EBP from where the vDSO stashed it. */280if (IS_ENABLED(CONFIG_X86_64)) {281/*282* Micro-optimization: the pointer we're following is283* explicitly 32 bits, so it can't be out of range.284*/285res = __get_user(*(u32 *)®s->bp,286(u32 __user __force *)(unsigned long)(u32)regs->sp);287} else {288res = get_user(*(u32 *)®s->bp,289(u32 __user __force *)(unsigned long)(u32)regs->sp);290}291292if (res) {293/* User code screwed up. */294regs->ax = -EFAULT;295296local_irq_disable();297instrumentation_end();298irqentry_exit_to_user_mode(regs);299return false;300}301302nr = syscall_enter_from_user_mode_work(regs, nr);303304/* Now this is just like a normal syscall. */305do_syscall_32_irqs_on(regs, nr);306307instrumentation_end();308syscall_exit_to_user_mode(regs);309return true;310}311312/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */313__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)314{315/*316* Called using the internal vDSO SYSENTER/SYSCALL32 calling317* convention. Adjust regs so it looks like we entered using int80.318*/319unsigned long landing_pad = (unsigned long)current->mm->context.vdso +320vdso_image_32.sym_int80_landing_pad;321322/*323* SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward324* so that 'regs->ip -= 2' lands back on an int $0x80 instruction.325* Fix it up.326*/327regs->ip = landing_pad;328329/* Invoke the syscall. If it failed, keep it simple: use IRET. */330if (!__do_fast_syscall_32(regs))331return false;332333/*334* Check that the register state is valid for using SYSRETL/SYSEXIT335* to exit to userspace. Otherwise use the slower but fully capable336* IRET exit path.337*/338339/* XEN PV guests always use the IRET path */340if (cpu_feature_enabled(X86_FEATURE_XENPV))341return false;342343/* EIP must point to the VDSO landing pad */344if (unlikely(regs->ip != landing_pad))345return false;346347/* CS and SS must match the values set in MSR_STAR */348if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))349return false;350351/* If the TF, RF, or VM flags are set, use IRET */352if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))353return false;354355/* Use SYSRETL/SYSEXIT to exit to userspace */356return true;357}358359/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */360__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)361{362/* SYSENTER loses RSP, but the vDSO saved it in RBP. */363regs->sp = regs->bp;364365/* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */366regs->flags |= X86_EFLAGS_IF;367368return do_fast_syscall_32(regs);369}370371372