/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  Copyright (C) 1991,1992  Linus Torvalds
 *
 * entry_32.S contains the system-call and low-level fault and trap handling routines.
 *
 * Stack layout while running C code (offsets are in hex):
 *	ptrace needs to have all registers on the stack.
 *	If the order here is changed, it needs to be
 *	updated in fork.c:copy_process(), signal.c:do_signal(),
 *	ptrace.c and ptrace.h
 *
 *	 0(%esp) - %ebx
 *	 4(%esp) - %ecx
 *	 8(%esp) - %edx
 *	 C(%esp) - %esi
 *	10(%esp) - %edi
 *	14(%esp) - %ebp
 *	18(%esp) - %eax
 *	1C(%esp) - %ds
 *	20(%esp) - %es
 *	24(%esp) - %fs
 *	28(%esp) - unused -- was %gs on old stackprotector kernels
 *	2C(%esp) - orig_eax
 *	30(%esp) - %eip
 *	34(%esp) - %cs
 *	38(%esp) - %eflags
 *	3C(%esp) - %oldesp
 *	40(%esp) - %oldss
 */

#include <linux/linkage.h>
#include <linux/err.h>
#include <asm/thread_info.h>
#include <asm/irqflags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>

#include "calling.h"

	.section .entry.text, "ax"

/*
 * CR3 bit that tells the user and kernel page tables apart in the macros
 * below: it is set while the user cr3 is loaded and clear for the kernel cr3.
 */
#define PTI_SWITCH_MASK		(1 << PAGE_SHIFT)

/*
 * Unconditionally switch to user cr3.
 *
 * The whole body is skipped unless X86_FEATURE_PTI is set.
 * Clobbers \scratch_reg and the flags.
 */
.macro SWITCH_TO_USER_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI

	movl	%cr3, \scratch_reg
	orl	$PTI_SWITCH_MASK, \scratch_reg
	movl	\scratch_reg, %cr3
.Lend_\@:
.endm

/*
 * Debug-only sanity check (CONFIG_DEBUG_ENTRY and X86_FEATURE_PTI):
 * execute ud2 if the frame at %esp was entered from user mode but the
 * user cr3 is not loaded. With no_user_check=1 the "came from user mode"
 * test on PT_CS(%esp) is skipped, for callers that have no pt_regs frame
 * at %esp. Clobbers %eax and the flags.
 */
.macro BUG_IF_WRONG_CR3 no_user_check=0
#ifdef CONFIG_DEBUG_ENTRY
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	.if \no_user_check == 0
	/* coming from usermode? */
	testl	$USER_SEGMENT_RPL_MASK, PT_CS(%esp)
	jz	.Lend_\@
	.endif
	/* On user-cr3? */
	movl	%cr3, %eax
	testl	$PTI_SWITCH_MASK, %eax
	jnz	.Lend_\@
	/* From userspace with kernel cr3 - BUG */
	ud2
.Lend_\@:
#endif
.endm

/*
 * Switch to kernel cr3 if not already loaded and return the cr3 value
 * that was active on entry in \scratch_reg.
 *
 * The whole body is skipped unless X86_FEATURE_PTI is set, in which case
 * \scratch_reg is left untouched.
 */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	movl	%cr3, \scratch_reg
	/* Test if we are already on kernel CR3 */
	testl	$PTI_SWITCH_MASK, \scratch_reg
	jz	.Lend_\@
	andl	$(~PTI_SWITCH_MASK), \scratch_reg
	movl	\scratch_reg, %cr3
	/* Return original CR3 in \scratch_reg */
	orl	$PTI_SWITCH_MASK, \scratch_reg
.Lend_\@:
.endm

/*
 * Software markers kept in the otherwise unused high 16 bits of the saved
 * CS dword in pt_regs:
 *
 *  CS_FROM_ENTRY_STACK - kernel-mode entry that arrived on the entry stack
 *			  (set in SWITCH_TO_KERNEL_STACK)
 *  CS_FROM_USER_CR3	- same path, and the user cr3 was loaded on entry
 *  CS_FROM_KERNEL	- kernel-mode entry whose frame was extended by
 *			  FIXUP_FRAME
 *  CS_FROM_ESPFIX	- NMI that arrived on the ESPFIX stack (set in
 *			  asm_exc_nmi)
 */
#define CS_FROM_ENTRY_STACK	(1 << 31)
#define CS_FROM_USER_CR3	(1 << 30)
#define CS_FROM_KERNEL		(1 << 29)
#define CS_FROM_ESPFIX		(1 << 28)

/*
 * Complete the hardware exception frame for entries from kernel mode.
 *
 * Expects %esp to point at the saved %fs slot (see the layout below).
 * Entries from user mode or VM86 mode are left as they are, apart from
 * clearing the high CS bits.
 */
.macro FIXUP_FRAME
	/*
	 * The high bits of the CS dword (__csh) are used for CS_FROM_*.
	 * Clear them in case hardware didn't do this for us.
	 */
	andl	$0x0000ffff, 4*4(%esp)

#ifdef CONFIG_VM86
	testl	$X86_EFLAGS_VM, 5*4(%esp)
	jnz	.Lfrom_usermode_no_fixup_\@
#endif
	testl	$USER_SEGMENT_RPL_MASK, 4*4(%esp)
	jnz	.Lfrom_usermode_no_fixup_\@

	orl	$CS_FROM_KERNEL, 4*4(%esp)

	/*
	 * When we're here from kernel mode; the (exception) stack looks like:
	 *
	 *  6*4(%esp) - <previous context>
	 *  5*4(%esp) - flags
	 *  4*4(%esp) - cs
	 *  3*4(%esp) - ip
	 *  2*4(%esp) - orig_eax
	 *  1*4(%esp) - gs / function
	 *  0*4(%esp) - fs
	 *
	 * Let's build a 5 entry IRET frame after that, such that struct pt_regs
	 * is complete and in particular regs->sp is correct. This gives us
	 * the original 6 entries as gap:
	 *
	 * 14*4(%esp) - <previous context>
	 * 13*4(%esp) - gap / flags
	 * 12*4(%esp) - gap / cs
	 * 11*4(%esp) - gap / ip
	 * 10*4(%esp) - gap / orig_eax
	 *  9*4(%esp) - gap / gs / function
	 *  8*4(%esp) - gap / fs
	 *  7*4(%esp) - ss
	 *  6*4(%esp) - sp
	 *  5*4(%esp) - flags
	 *  4*4(%esp) - cs
	 *  3*4(%esp) - ip
	 *  2*4(%esp) - orig_eax
	 *  1*4(%esp) - gs / function
	 *  0*4(%esp) - fs
	 */

	pushl	%ss		# ss
	pushl	%esp		# sp (points at ss)
	addl	$7*4, (%esp)	# point sp back at the previous context
	/*
	 * Each push lowers %esp by one slot, so the same 7*4 offset walks
	 * down the original frame one entry at a time.
	 */
	pushl	7*4(%esp)	# flags
	pushl	7*4(%esp)	# cs
	pushl	7*4(%esp)	# ip
	pushl	7*4(%esp)	# orig_eax
	pushl	7*4(%esp)	# gs / function
	pushl	7*4(%esp)	# fs
.Lfrom_usermode_no_fixup_\@:
.endm

/*
 * Counterpart of FIXUP_FRAME on the return path.
 *
 * Expects %esp to point at the ip slot of the frame, so that cs is at
 * 1*4(%esp), flags at 2*4(%esp) and regs->sp at 3*4(%esp). Does nothing
 * unless CS_FROM_KERNEL is set in the dword at 1*4(%esp).
 */
.macro IRET_FRAME
	/*
	 * We're called with %ds, %es, %fs, and %gs from the interrupted
	 * frame, so we shouldn't use them.  Also, we may be in ESPFIX
	 * mode and therefore have a nonzero SS base and an offset ESP,
	 * so any attempt to access the stack needs to use SS.  (except for
	 * accesses through %esp, which automatically use SS.)
	 */
	testl	$CS_FROM_KERNEL, 1*4(%esp)
	jz	.Lfinished_frame_\@

	/*
	 * Reconstruct the 3 entry IRET frame right after the (modified)
	 * regs->sp without lowering %esp in between, such that an NMI in the
	 * middle doesn't scribble our stack.
	 */
	pushl	%eax
	pushl	%ecx
	movl	5*4(%esp), %eax		# (modified) regs->sp

	movl	4*4(%esp), %ecx		# flags
	movl	%ecx, %ss:-1*4(%eax)

	movl	3*4(%esp), %ecx		# cs
	andl	$0x0000ffff, %ecx	# drop the CS_FROM_* markers
	movl	%ecx, %ss:-2*4(%eax)

	movl	2*4(%esp), %ecx		# ip
	movl	%ecx, %ss:-3*4(%eax)

	movl	1*4(%esp), %ecx		# eax
	movl	%ecx, %ss:-4*4(%eax)

	popl	%ecx
	lea	-4*4(%eax), %esp	# %esp now points at the copied eax
	popl	%eax
.Lfinished_frame_\@:
.endm

/*
 * Build the rest of pt_regs below what is already on the stack.
 *
 * @pt_regs_ax:	   operand pushed into the pt_regs->ax slot (default %eax)
 * @switch_stacks: if > 0, run SWITCH_TO_KERNEL_STACK once the frame is built
 * @skip_gs:	   if 0, push a zero for the unused "gs" slot; otherwise the
 *		   caller has already pushed something into that slot
 * @unwind_espfix: if > 0, run UNWIND_ESPFIX_STACK before the frame fixup
 *
 * On exit %fs holds __KERNEL_PERCPU and %ds/%es hold __USER_DS.
 * %edx is used to load the data segments and is therefore clobbered.
 */
.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
	cld
.if \skip_gs == 0
	pushl	$0
.endif
	pushl	%fs

	/* Load the kernel per-CPU segment, using %eax as a saved scratch reg */
	pushl	%eax
	movl	$(__KERNEL_PERCPU), %eax
	movl	%eax, %fs
.if \unwind_espfix > 0
	UNWIND_ESPFIX_STACK
.endif
	popl	%eax

	FIXUP_FRAME
	pushl	%es
	pushl	%ds
	pushl	\pt_regs_ax
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%edx
	pushl	%ecx
	pushl	%ebx
	movl	$(__USER_DS), %edx
	movl	%edx, %ds
	movl	%edx, %es
	/* Switch to kernel stack if necessary */
.if \switch_stacks > 0
	SWITCH_TO_KERNEL_STACK
.endif
.endm

/*
 * SAVE_ALL variant for the NMI path: saves the registers without
 * switching stacks, then switches to the kernel cr3.
 *
 * @cr3_reg:	   register that receives the cr3 value found on entry
 * @unwind_espfix: passed through to SAVE_ALL
 */
.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
	SAVE_ALL unwind_espfix=\unwind_espfix

	BUG_IF_WRONG_CR3

	/*
	 * Now switch the CR3 when PTI is enabled.
	 *
	 * We can enter with either user or kernel cr3, the code will
	 * store the old cr3 in \cr3_reg and switch to the kernel cr3
	 * if necessary.
	 */
	SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg

.Lend_\@:
.endm

/* Pop the seven integer registers saved at the bottom of pt_regs */
.macro RESTORE_INT_REGS
	popl	%ebx
	popl	%ecx
	popl	%edx
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%eax
.endm

/*
 * Restore the integer and segment registers from pt_regs and run
 * IRET_FRAME.
 *
 * @pop: number of extra bytes to drop after the unused "gs" slot
 *	 (callers pass 4 to skip orig_eax/error_code)
 *
 * The segment pops are covered by exception table entries of type
 * EX_TYPE_POP_ZERO that resume at the next label.
 */
.macro RESTORE_REGS pop=0
	RESTORE_INT_REGS
1:	popl	%ds
2:	popl	%es
3:	popl	%fs
4:	addl	$(4 + \pop), %esp	/* pop the unused "gs" slot */
	IRET_FRAME

	/*
	 * There is no _ASM_EXTABLE_TYPE_REG() for ASM, however since this is
	 * ASM the registers are known and we can trivially hard-code them.
	 */
	_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_POP_ZERO|EX_REG_DS)
	_ASM_EXTABLE_TYPE(2b, 3b, EX_TYPE_POP_ZERO|EX_REG_ES)
	_ASM_EXTABLE_TYPE(3b, 4b, EX_TYPE_POP_ZERO|EX_REG_FS)
.endm

/*
 * Return-path counterpart of SAVE_ALL_NMI.
 *
 * @cr3_reg: register holding the cr3 value saved by SAVE_ALL_NMI
 * @pop:     passed through to RESTORE_REGS
 */
.macro RESTORE_ALL_NMI cr3_reg:req pop=0
	/*
	 * Now switch the CR3 when PTI is enabled.
	 *
	 * We enter with kernel cr3 and switch the cr3 to the value
	 * stored in \cr3_reg, which is either a user or a kernel cr3.
	 */
	ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI

	testl	$PTI_SWITCH_MASK, \cr3_reg
	jz	.Lswitched_\@

	/* User cr3 in \cr3_reg - write it to hardware cr3 */
	movl	\cr3_reg, %cr3

.Lswitched_\@:

	BUG_IF_WRONG_CR3

	RESTORE_REGS pop=\pop
.endm

/*
 * If we are about to return to user space with an LDT stack segment,
 * switch to the ESPFIX stack segment (see the comment in the body).
 *
 * Expects %esp to point at pt_regs. Only active with CONFIG_X86_ESPFIX32
 * on CPUs flagged X86_BUG_ESPFIX. Clobbers %eax and %edx; on the espfix
 * path it also disables interrupts and changes %ss:%esp.
 */
.macro CHECK_AND_APPLY_ESPFIX
#ifdef CONFIG_X86_ESPFIX32
#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET)

	ALTERNATIVE	"jmp .Lend_\@", "", X86_BUG_ESPFIX

	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
	/*
	 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
	 * are returning to the kernel.
	 * See comments in process.c:copy_thread() for details.
	 */
	movb	PT_OLDSS(%esp), %ah
	movb	PT_CS(%esp), %al
	andl	$(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
	cmpl	$((SEGMENT_LDT << 8) | USER_RPL), %eax
	jne	.Lend_\@	# skip unless returning to user-space with LDT SS

	/*
	 * Setup and switch to ESPFIX stack
	 *
	 * We're returning to userspace with a 16 bit stack. The CPU will not
	 * restore the high word of ESP for us on executing iret... This is an
	 * "official" bug of all the x86-compatible CPUs, which we can work
	 * around to make dosemu and wine happy. We do this by preloading the
	 * high word of ESP with the high word of the userspace ESP while
	 * compensating for the offset by changing to the ESPFIX segment with
	 * a base address that matches for the difference.
	 */
	mov	%esp, %edx			/* load kernel esp */
	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
	mov	%dx, %ax			/* eax: new kernel esp */
	sub	%eax, %edx			/* offset (low word is 0) */
	shr	$16, %edx
	mov	%dl, GDT_ESPFIX_SS + 4		/* bits 16..23 */
	mov	%dh, GDT_ESPFIX_SS + 7		/* bits 24..31 */
	pushl	$__ESPFIX_SS
	pushl	%eax				/* new kernel esp */
	/*
	 * Disable interrupts, but do not irqtrace this section: we
	 * will soon execute iret and the tracer was already set to
	 * the irqstate after the IRET:
	 */
	cli
	lss	(%esp), %esp			/* switch to espfix segment */
.Lend_\@:
#endif /* CONFIG_X86_ESPFIX32 */
.endm

/*
 * Called with pt_regs fully populated and kernel segments loaded,
 * so we can access PER_CPU and use the integer registers.
 *
 * We need to be very careful here with the %esp switch, because an NMI
 * can happen anywhere. If the NMI handler finds itself on the
 * entry-stack, it will overwrite the task-stack and everything we
 * copied there. So allocate the stack-frame on the task-stack and
 * switch to it before we do any copying.
 *
 * Clobbers %eax, %ecx, %esi and %edi.
 */

.macro SWITCH_TO_KERNEL_STACK

	BUG_IF_WRONG_CR3

	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax

	/*
	 * %eax now contains the entry cr3 and we carry it forward in
	 * that register for the time this macro runs
	 */

	/* Are we on the entry stack? Bail out if not! */
	movl	PER_CPU_VAR(cpu_entry_area), %ecx
	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
	subl	%esp, %ecx	/* ecx = (end of entry_stack) - esp */
	cmpl	$SIZEOF_entry_stack, %ecx
	jae	.Lend_\@

	/* Load stack pointer into %esi and %edi */
	movl	%esp, %esi
	movl	%esi, %edi

	/* Move %edi to the top of the entry stack */
	andl	$(MASK_entry_stack), %edi
	addl	$(SIZEOF_entry_stack), %edi

	/* Load top of task-stack into %edi */
	movl	TSS_entry2task_stack(%edi), %edi

	/* Special case - entry from kernel mode via entry stack */
#ifdef CONFIG_VM86
	movl	PT_EFLAGS(%esp), %ecx		# mix EFLAGS and CS
	movb	PT_CS(%esp), %cl
	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
#else
	movl	PT_CS(%esp), %ecx
	andl	$SEGMENT_RPL_MASK, %ecx
#endif
	cmpl	$USER_RPL, %ecx
	jb	.Lentry_from_kernel_\@

	/* Bytes to copy */
	movl	$PTREGS_SIZE, %ecx

#ifdef CONFIG_VM86
	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esi)
	jz	.Lcopy_pt_regs_\@

	/*
	 * Stack-frame contains 4 additional segment registers when
	 * coming from VM86 mode
	 */
	addl	$(4 * 4), %ecx

#endif
.Lcopy_pt_regs_\@:

	/* Allocate frame on task-stack */
	subl	%ecx, %edi

	/* Switch to task-stack */
	movl	%edi, %esp

	/*
	 * We are now on the task-stack and can safely copy over the
	 * stack-frame
	 */
	shrl	$2, %ecx	/* byte count -> dword count for movsl */
	cld
	rep movsl

	jmp	.Lend_\@

.Lentry_from_kernel_\@:

	/*
	 * This handles the case when we enter the kernel from
	 * kernel-mode and %esp points to the entry-stack. When this
	 * happens we need to switch to the task-stack to run C code,
	 * but switch back to the entry-stack again when we approach
	 * iret and return to the interrupted code-path. This usually
	 * happens when we hit an exception while restoring user-space
	 * segment registers on the way back to user-space or when the
	 * sysenter handler runs with eflags.tf set.
	 *
	 * When we switch to the task-stack here, we can't trust the
	 * contents of the entry-stack anymore, as the exception handler
	 * might be scheduled out or moved to another CPU. Therefore we
	 * copy the complete entry-stack to the task-stack and set a
	 * marker in the iret-frame (bit 31 of the CS dword) to detect
	 * what we've done on the iret path.
	 *
	 * On the iret path we copy everything back and switch to the
	 * entry-stack, so that the interrupted kernel code-path
	 * continues on the same stack it was interrupted with.
	 *
	 * Be aware that an NMI can happen anytime in this code.
	 *
	 * %esi: Entry-Stack pointer (same as %esp)
	 * %edi: Top of the task stack
	 * %eax: CR3 on kernel entry
	 */

	/* Calculate number of bytes on the entry stack in %ecx */
	movl	%esi, %ecx

	/* %ecx to the top of entry-stack */
	andl	$(MASK_entry_stack), %ecx
	addl	$(SIZEOF_entry_stack), %ecx

	/* Number of bytes on the entry stack to %ecx */
	sub	%esi, %ecx

	/* Mark stackframe as coming from entry stack */
	orl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)

	/*
	 * Test the cr3 used to enter the kernel and add a marker
	 * so that we can switch back to it before iret.
	 */
	testl	$PTI_SWITCH_MASK, %eax
	jz	.Lcopy_pt_regs_\@
	orl	$CS_FROM_USER_CR3, PT_CS(%esp)

	/*
	 * %esi and %edi are unchanged, %ecx contains the number of
	 * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
	 * the stack-frame on task-stack and copy everything over
	 */
	jmp	.Lcopy_pt_regs_\@

.Lend_\@:
.endm

/*
 * Switch back from the kernel stack to the entry stack.
 *
 * The %esp register must point to pt_regs on the task stack. It will
 * first calculate the size of the stack-frame to copy, depending on
 * whether we return to VM86 mode or not.
With that it uses 'rep movsl'
 * to copy the contents of the stack over to the entry stack.
 *
 * We must be very careful here, as we can't trust the contents of the
 * task-stack once we switched to the entry-stack. When an NMI happens
 * while on the entry-stack, the NMI handler will switch back to the top
 * of the task stack, overwriting our stack-frame we are about to copy.
 * Therefore we switch the stack only after everything is copied over.
 *
 * Clobbers %ebx, %ecx, %esi and %edi.
 */
.macro SWITCH_TO_ENTRY_STACK

	/* Bytes to copy */
	movl	$PTREGS_SIZE, %ecx

#ifdef CONFIG_VM86
	testl	$(X86_EFLAGS_VM), PT_EFLAGS(%esp)
	jz	.Lcopy_pt_regs_\@

	/* Additional 4 registers to copy when returning to VM86 mode */
	addl	$(4 * 4), %ecx

.Lcopy_pt_regs_\@:
#endif

	/* Initialize source and destination for movsl */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
	subl	%ecx, %edi
	movl	%esp, %esi

	/* Save future stack pointer in %ebx */
	movl	%edi, %ebx

	/* Copy over the stack-frame */
	shrl	$2, %ecx	/* byte count -> dword count for movsl */
	cld
	rep movsl

	/*
	 * Switch to entry-stack - needs to happen after everything is
	 * copied because the NMI handler will overwrite the task-stack
	 * when on entry-stack
	 */
	movl	%ebx, %esp

.Lend_\@:
.endm

/*
 * This macro handles the case when we return to kernel-mode on the iret
 * path and have to switch back to the entry stack and/or user-cr3
 *
 * See the comments below the .Lentry_from_kernel_\@ label in the
 * SWITCH_TO_KERNEL_STACK macro for more details.
 *
 * Expects %esp to point at pt_regs. On the slow path it clobbers %ebx,
 * %ecx, %esi and %edi, plus %eax when it switches to the user cr3.
 */
.macro PARANOID_EXIT_TO_KERNEL_MODE

	/*
	 * Test if we entered the kernel with the entry-stack. Most
	 * likely we did not, because this code only runs on the
	 * return-to-kernel path.
	 */
	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
	jz	.Lend_\@

	/* Unlikely slow-path */

	/* Clear marker from stack-frame */
	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)

	/* Copy the remaining task-stack contents to entry-stack */
	movl	%esp, %esi
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi

	/* Bytes on the task-stack to ecx */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
	subl	%esi, %ecx

	/* Allocate stack-frame on entry-stack */
	subl	%ecx, %edi

	/*
	 * Save future stack-pointer, we must not switch until the
	 * copy is done, otherwise the NMI handler could destroy the
	 * contents of the task-stack we are about to copy.
	 */
	movl	%edi, %ebx

	/* Do the copy */
	shrl	$2, %ecx	/* byte count -> dword count for movsl */
	cld
	rep movsl

	/* Safe to switch to entry-stack now */
	movl	%ebx, %esp

	/*
	 * We came from entry-stack and need to check if we also need to
	 * switch back to user cr3.
	 */
	testl	$CS_FROM_USER_CR3, PT_CS(%esp)
	jz	.Lend_\@

	/* Clear marker from stack-frame */
	andl	$(~CS_FROM_USER_CR3), PT_CS(%esp)

	SWITCH_TO_USER_CR3 scratch_reg=%eax

.Lend_\@:
.endm

/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * The stub leaves the error code (or a zero) in the orig_eax slot and the
 * address of @cfunc in the GS slot, then jumps to handle_exception.
 * @vector is not referenced by the macro body.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
	ASM_CLAC
	cld

	.if \has_error_code == 0
		pushl	$0		/* Clear the error code */
	.endif

	/* Push the C-function address into the GS slot */
	pushl	$\cfunc
	/* Invoke the common exception entry */
	jmp	handle_exception
SYM_CODE_END(\asmsym)
.endm

/*
 * idtentry_irq - generate the local entry stub asm_<cfunc>
 *
 * The stub builds pt_regs, switches to the kernel stack and calls @cfunc
 * with the pt_regs pointer in %eax and the value found in the orig_eax
 * slot (the vector) in %edx, then returns through
 * handle_exception_return. @vector is not referenced by the macro body.
 */
.macro idtentry_irq vector cfunc
	.p2align CONFIG_X86_L1_CACHE_SHIFT
SYM_CODE_START_LOCAL(asm_\cfunc)
	ASM_CLAC
	SAVE_ALL switch_stacks=1
	ENCODE_FRAME_POINTER
	movl	%esp, %eax
	movl	PT_ORIG_EAX(%esp), %edx		/* get the vector from stack */
	movl	$-1, PT_ORIG_EAX(%esp)		/* no syscall to restart */
	call	\cfunc
	jmp	handle_exception_return
SYM_CODE_END(asm_\cfunc)
.endm

/*
 * Include the defines which emit the idt entries which are
 * shared between 32 and 64 bit and emit the __irqentry_text_* markers
 * so the stacktrace boundary checks work.
 */
	.align 16
	.globl __irqentry_text_start
__irqentry_text_start:

#include <asm/idtentry.h>

	.align 16
	.globl __irqentry_text_end
__irqentry_text_end:

/*
 * Switch the kernel stack from the prev task to the next task and tail-call
 * __switch_to.
 *
 * %eax: prev task
 * %edx: next task
 */
.pushsection .text, "ax"
SYM_CODE_START(__switch_to_asm)
	/*
	 * Save callee-saved registers
	 * This must match the order in struct inactive_task_frame
	 */
	pushl	%ebp
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	/*
	 * Flags are saved to prevent AC leakage. This could go
	 * away if objtool would have 32bit support to verify
	 * the STAC/CLAC correctness.
	 */
	pushfl

	/* switch stack */
	movl	%esp, TASK_threadsp(%eax)
	movl	TASK_threadsp(%edx), %esp

#ifdef CONFIG_STACKPROTECTOR
	movl	TASK_stack_canary(%edx), %ebx
	movl	%ebx, PER_CPU_VAR(__stack_chk_guard)
#endif

	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW

	/* Restore flags of the incoming task to restore AC state. */
	popfl
	/* restore callee-saved registers */
	popl	%esi
	popl	%edi
	popl	%ebx
	popl	%ebp

	jmp	__switch_to
SYM_CODE_END(__switch_to_asm)
.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * eax: prev task we switched from
 * ebx: kernel thread func (NULL for user thread)
 * edi: kernel thread arg
 *
 * Calls ret_from_fork with prev in %eax, the pt_regs pointer in %edx,
 * the thread function in %ecx and its argument on the stack.
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork_asm)
	movl	%esp, %edx	/* regs */

	/*
	 * return address for the stack unwinder; the final RET below
	 * also returns through it
	 */
	pushl	$.Lsyscall_32_done

	FRAME_BEGIN
	/* prev already in EAX */
	movl	%ebx, %ecx	/* fn */
	pushl	%edi		/* fn_arg */
	call	ret_from_fork
	addl	$4, %esp	/* drop fn_arg */
	FRAME_END

	RET
SYM_CODE_END(ret_from_fork_asm)
.popsection

SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
/*
 * All code from here through __end_SYSENTER_singlestep_region is subject
 * to being single-stepped if a user program sets TF and executes SYSENTER.
 * There is absolutely nothing that we can do to prevent this from happening
 * (thanks Intel!).  To keep our handling of this situation as simple as
 * possible, we handle TF just like AC and NT, except that our #DB handler
 * will ignore all of the single-step traps generated in this range.
 */

/*
 * 32-bit SYSENTER entry.
 *
 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 * if X86_FEATURE_SEP is available.  This is the preferred system call
 * entry on 32-bit systems.
 *
 * The SYSENTER instruction, in principle, should *only* occur in the
 * vDSO.  In practice, a small number of Android devices were shipped
 * with a copy of Bionic that inlined a SYSENTER instruction.
This
 * never happened in any of Google's Bionic versions -- it only happened
 * in a narrow range of Intel-provided versions.
 *
 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old EIP (!!!), ESP, or EFLAGS.
 *
 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
 * user and/or vm86 state), we explicitly disable the SYSENTER
 * instruction in vm86 mode by reprogramming the MSRs.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 */
SYM_FUNC_START(entry_SYSENTER_32)
	/*
	 * On entry-stack with all userspace-regs live - save and
	 * restore eflags and %eax to use it as scratch-reg for the cr3
	 * switch.
	 */
	pushfl
	pushl	%eax
	BUG_IF_WRONG_CR3 no_user_check=1
	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
	popl	%eax
	popfl

	/* Stack empty again, switch to task stack */
	movl	TSS_entry2task_stack(%esp), %esp

.Lsysenter_past_esp:
	pushl	$__USER_DS		/* pt_regs->ss */
	pushl	$0			/* pt_regs->sp (placeholder) */
	pushfl				/* pt_regs->flags (except IF = 0) */
	pushl	$__USER_CS		/* pt_regs->cs */
	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
	pushl	%eax			/* pt_regs->orig_ax */
	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest, stack already switched */

	/*
	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
	 * and TF ourselves.  To save a few cycles, we can check whether
	 * any of them was set instead of doing an unconditional popfl.
	 * This needs to happen before enabling interrupts so that
	 * we don't get preempted with NT set.
	 *
	 * If TF is set, we will single-step all the way to here -- do_debug
	 * will ignore all the traps.  (Yes, this is slow, but so is
	 * single-stepping in general.  This allows us to avoid having
	 * more complicated code to handle the case where a user program
	 * forces us to single-step through the SYSENTER entry code.)
	 *
	 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
	 * out-of-line as an optimization: NT is unlikely to be set in the
	 * majority of the cases and instead of polluting the I$ unnecessarily,
	 * we're keeping that code behind a branch which will predict as
	 * not-taken and therefore its instructions won't be fetched.
	 */
	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
	jnz	.Lsysenter_fix_flags
.Lsysenter_flags_fixed:

	movl	%esp, %eax		/* pt_regs pointer */
	call	do_SYSENTER_32
	/* A zero result in %al takes the IRET exit path instead of SYSEXIT */
	testb	%al, %al
	jz	.Lsyscall_32_done

	STACKLEAK_ERASE

	/* Opportunistic SYSEXIT */

	/*
	 * Setup entry stack - we keep the pointer in %eax and do the
	 * switch after almost all user-state is restored.
	 */

	/* Load entry stack pointer and allocate frame for eflags/eax */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
	subl	$(2*4), %eax

	/* Copy eflags and eax to entry stack */
	movl	PT_EFLAGS(%esp), %edi
	movl	PT_EAX(%esp), %esi
	movl	%edi, (%eax)
	movl	%esi, 4(%eax)

	/* Restore user registers and segments */
	movl	PT_EIP(%esp), %edx	/* pt_regs->ip */
	movl	PT_OLDESP(%esp), %ecx	/* pt_regs->sp */
1:	mov	PT_FS(%esp), %fs

	popl	%ebx			/* pt_regs->bx */
	addl	$2*4, %esp		/* skip pt_regs->cx and pt_regs->dx */
	popl	%esi			/* pt_regs->si */
	popl	%edi			/* pt_regs->di */
	popl	%ebp			/* pt_regs->bp */

	/* Switch to entry stack */
	movl	%eax, %esp

	/* Now ready to switch the cr3 */
	SWITCH_TO_USER_CR3 scratch_reg=%eax
	/* Clobbers ZF */
	CLEAR_CPU_BUFFERS

	/*
	 * Restore all flags except IF. (We restore IF separately because
	 * STI gives a one-instruction window in which we won't be interrupted,
	 * whereas POPF does not.)
	 */
	btrl	$X86_EFLAGS_IF_BIT, (%esp)
	BUG_IF_WRONG_CR3 no_user_check=1
	popfl
	popl	%eax

	/*
	 * Return back to the vDSO, which will pop ecx and edx.
	 * Don't bother with DS and ES (they already contain __USER_DS).
	 */
	sti
	sysexit

	/*
	 * Exception fixup for the %fs load at 1: above: zero the saved
	 * %fs value and retry the load.
	 */
2:	movl	$0, PT_FS(%esp)
	jmp	1b
	_ASM_EXTABLE(1b, 2b)

.Lsysenter_fix_flags:
	/* Reload EFLAGS with only the fixed bit set */
	pushl	$X86_EFLAGS_FIXED
	popfl
	jmp	.Lsysenter_flags_fixed
SYM_ENTRY(__end_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
SYM_FUNC_END(entry_SYSENTER_32)

/*
 * 32-bit legacy system call entry.
 *
 * 32-bit x86 Linux system calls traditionally used the INT $0x80
 * instruction.  INT $0x80 lands here.
 *
 * This entry point can be used by any 32-bit program to perform system
 * calls.  Instances of INT $0x80 can be found inline in various programs
 * and libraries.  It is also used by the vDSO's __kernel_vsyscall
 * fallback for hardware that doesn't support a faster entry method.
 * Restarted 32-bit system calls also fall back to INT $0x80
 * regardless of what instruction was originally used to do the system
 * call.  (64-bit programs can use INT $0x80 as well, but they can
 * only run on 64-bit kernels and therefore land in
 * entry_INT80_compat.)
 *
 * This is considered a slow path.
It is not used by most libc
 * implementations on modern hardware except during process startup.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6
 */
SYM_FUNC_START(entry_INT80_32)
	ASM_CLAC
	pushl	%eax			/* pt_regs->orig_ax */

	SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1	/* save rest */

	movl	%esp, %eax		/* pt_regs pointer */
	call	do_int80_syscall_32
.Lsyscall_32_done:
	STACKLEAK_ERASE

	/*
	 * Common return-to-user path; also reached from ret_to_user and
	 * from the SYSENTER path when SYSEXIT cannot be used.
	 */
restore_all_switch_stack:
	SWITCH_TO_ENTRY_STACK
	CHECK_AND_APPLY_ESPFIX

	/* Switch back to user CR3 */
	SWITCH_TO_USER_CR3 scratch_reg=%eax

	BUG_IF_WRONG_CR3

	/* Restore user state */
	RESTORE_REGS pop=4			# skip orig_eax/error_code
	CLEAR_CPU_BUFFERS
.Lirq_return:
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
	 * when returning from IPI handler and when returning from
	 * scheduler to user-space.
	 */
	iret

	/*
	 * Exception fixup target for the iret above: enter the common
	 * exception path with iret_error as the handler.
	 */
.Lasm_iret_error:
	pushl	$0				# no error code
	pushl	$iret_error

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * The stack-frame here is the one that iret faulted on, so it's a
	 * return-to-user frame. We are on kernel-cr3 because we come here from
	 * the fixup code. This confuses the CR3 checker, so switch to user-cr3
	 * as the checker expects it.
	 */
	pushl	%eax
	SWITCH_TO_USER_CR3 scratch_reg=%eax
	popl	%eax
#endif

	jmp	handle_exception

	_ASM_EXTABLE(.Lirq_return, .Lasm_iret_error)
SYM_FUNC_END(entry_INT80_32)

.macro FIXUP_ESPFIX_STACK
/*
 * Switch back from the ESPFIX stack to the normal zerobased stack
 *
 * We can't call C functions using the ESPFIX stack. This code reads
 * the high word of the segment base from the GDT and switches to the
 * normal stack and adjusts ESP with the matching offset.
 *
 * We might be on user CR3 here, so percpu data is not mapped and we can't
 * access the GDT through the percpu segment.  Instead, use SGDT to find
 * the cpu_entry_area alias of the GDT.
 *
 * Clobbers %eax; %ecx is saved and restored.
 */
#ifdef CONFIG_X86_ESPFIX32
	/* fixup the stack */
	pushl	%ecx
	subl	$2*4, %esp		/* scratch space for the SGDT result */
	sgdt	(%esp)
	movl	2(%esp), %ecx		/* GDT address */
	/*
	 * Careful: ECX is a linear pointer, so we need to force base
	 * zero.  %cs is the only known-linear segment we have right now.
	 */
	mov	%cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al	/* bits 16..23 */
	mov	%cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah	/* bits 24..31 */
	shl	$16, %eax
	addl	$2*4, %esp
	popl	%ecx
	addl	%esp, %eax			/* the adjusted stack pointer */
	pushl	$__KERNEL_DS
	pushl	%eax
	lss	(%esp), %esp			/* switch to the normal stack segment */
#endif
.endm

/*
 * If %ss is the ESPFIX segment, switch back to the normal stack via
 * FIXUP_ESPFIX_STACK; otherwise do nothing.
 */
.macro UNWIND_ESPFIX_STACK
	/* It's safe to clobber %eax, all other regs need to be preserved */
#ifdef CONFIG_X86_ESPFIX32
	movl	%ss, %eax
	/* see if on espfix stack */
	cmpw	$__ESPFIX_SS, %ax
	jne	.Lno_fixup_\@
	/* switch to normal stack */
	FIXUP_ESPFIX_STACK
.Lno_fixup_\@:
#endif
.endm

/*
 * Common exception entry, reached by a jump from the idtentry stubs with
 * the error code in the orig_eax slot and the handler address in the GS
 * slot. Calls the handler with the pt_regs pointer in %eax and the error
 * code in %edx, then returns to user or kernel mode as appropriate.
 */
SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
	/* the function address is in %gs's slot on the stack */
	SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
	ENCODE_FRAME_POINTER

	movl	PT_GS(%esp), %edi		# get the function address

	/* fixup orig %eax */
	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart

	movl	%esp, %eax			# pt_regs pointer
	CALL_NOSPEC edi

handle_exception_return:
#ifdef CONFIG_VM86
	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
	movb	PT_CS(%esp), %al
	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
#else
	/*
	 * We can be coming here from child spawned by kernel_thread().
	 */
	movl	PT_CS(%esp), %eax
	andl	$SEGMENT_RPL_MASK, %eax
#endif
	cmpl	$USER_RPL, %eax			# returning to v8086 or userspace ?
	jnb	ret_to_user

	/* Returning to kernel mode */
	PARANOID_EXIT_TO_KERNEL_MODE
	BUG_IF_WRONG_CR3
	RESTORE_REGS 4				# skip orig_eax/error_code
	jmp	.Lirq_return

ret_to_user:
	movl	%esp, %eax
	jmp	restore_all_switch_stack
SYM_CODE_END(handle_exception)

SYM_CODE_START(asm_exc_double_fault)
1:
	/*
	 * This is a task gate handler, not an interrupt gate handler.
	 * The error code is on the stack, but the stack is otherwise
	 * empty.  Interrupts are off.  Our state is sane with the following
	 * exceptions:
	 *
	 *  - CR0.TS is set.  "TS" literally means "task switched".
	 *  - EFLAGS.NT is set because we're a "nested task".
	 *  - The doublefault TSS has back_link set and has been marked busy.
	 *  - TR points to the doublefault TSS and the normal TSS is busy.
	 *  - CR3 is the normal kernel PGD.  This would be delightful, except
	 *    that the CPU didn't bother to save the old CR3 anywhere.  This
	 *    would make it very awkward to return back to the context we came
	 *    from.
	 *
	 * The rest of EFLAGS is sanitized for us, so we don't need to
	 * worry about AC or DF.
	 *
	 * Don't even bother popping the error code.  It's always zero,
	 * and ignoring it makes us a bit more robust against buggy
	 * hypervisor task gate implementations.
	 *
	 * We will manually undo the task switch instead of doing a
	 * task-switching IRET.
	 */

	clts				/* clear CR0.TS */
	pushl	$X86_EFLAGS_FIXED
	popfl				/* clear EFLAGS.NT */

	call	doublefault_shim

	/* We don't support returning, so we have no IRET here. */
1:
	hlt
	jmp 1b
SYM_CODE_END(asm_exc_double_fault)

/*
 * NMI is doubly nasty.  It can happen on the first instruction of
 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
 * switched stacks.
We handle both conditions by simply checking whether we
 * interrupted kernel code running on the SYSENTER stack.
 */
SYM_CODE_START(asm_exc_nmi)
	ASM_CLAC

#ifdef CONFIG_X86_ESPFIX32
	/*
	 * ESPFIX_SS is only ever set on the return to user path
	 * after we've switched to the entry stack.
	 */
	pushl	%eax
	movl	%ss, %eax
	cmpw	$__ESPFIX_SS, %ax
	popl	%eax			/* popl leaves the cmpw flags intact */
	je	.Lnmi_espfix_stack
#endif

	pushl	%eax				# pt_regs->orig_ax
	SAVE_ALL_NMI cr3_reg=%edi
	ENCODE_FRAME_POINTER
	xorl	%edx, %edx			# zero error code
	movl	%esp, %eax			# pt_regs pointer

	/* Are we currently on the SYSENTER stack? */
	movl	PER_CPU_VAR(cpu_entry_area), %ecx
	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
	cmpl	$SIZEOF_entry_stack, %ecx
	jb	.Lnmi_from_sysenter_stack

	/* Not on SYSENTER stack. */
	call	exc_nmi
	jmp	.Lnmi_return

.Lnmi_from_sysenter_stack:
	/*
	 * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
	 * is using the thread stack right now, so it's safe for us to use it.
	 *
	 * %ebx keeps the old stack pointer across the call.
	 */
	movl	%esp, %ebx
	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esp
	call	exc_nmi
	movl	%ebx, %esp

.Lnmi_return:
#ifdef CONFIG_X86_ESPFIX32
	testl	$CS_FROM_ESPFIX, PT_CS(%esp)
	jnz	.Lnmi_from_espfix
#endif

	CHECK_AND_APPLY_ESPFIX
	/* %edi still holds the entry cr3 saved by SAVE_ALL_NMI */
	RESTORE_ALL_NMI cr3_reg=%edi pop=4
	CLEAR_CPU_BUFFERS
	jmp	.Lirq_return

#ifdef CONFIG_X86_ESPFIX32
.Lnmi_espfix_stack:
	/*
	 * Create the pointer to LSS back
	 */
	pushl	%ss
	pushl	%esp
	addl	$4, (%esp)

	/* Copy the (short) IRET frame */
	pushl	4*4(%esp)	# flags
	pushl	4*4(%esp)	# cs
	pushl	4*4(%esp)	# ip

	pushl	%eax		# orig_ax

	SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
	ENCODE_FRAME_POINTER

	/* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
	xorl	$(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)

	xorl	%edx, %edx			# zero error code
	movl	%esp, %eax			# pt_regs pointer
	jmp	.Lnmi_from_sysenter_stack

.Lnmi_from_espfix:
	/*
	 * NOTE(review): RESTORE_ALL_NMI is used with the default pop=0 here,
	 * so orig_ax is still on the stack when IRET_FRAME runs and its
	 * CS_FROM_KERNEL test on 1*4(%esp) appears to read the ip slot
	 * rather than cs -- confirm that this is intended.
	 */
	RESTORE_ALL_NMI cr3_reg=%edi
	/*
	 * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
	 * fix up the gap and long frame:
	 *
	 *  3 - original frame	(exception)
	 *  2 - ESPFIX block	(above)
	 *  6 - gap		(FIXUP_FRAME)
	 *  5 - long frame	(FIXUP_FRAME)
	 *  1 - orig_ax
	 */
	lss	(1+5+6)*4(%esp), %esp			# back to espfix stack
	CLEAR_CPU_BUFFERS
	jmp	.Lirq_return
#endif
SYM_CODE_END(asm_exc_nmi)

/*
 * Reset %esp to the pt_regs area at the top of the current kernel stack
 * and call make_task_dead. The trailing loop is only reached if that call
 * returns.
 */
.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_and_make_dead)
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl	%ebp, %ebp

	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esi
	leal	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp

	call	make_task_dead
1:	jmp 1b
SYM_CODE_END(rewind_stack_and_make_dead)
.popsection