/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  linux/arch/x86_64/entry.S
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 * Copyright (C) 2000  Pavel Machek <[email protected]>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/arch/x86/entry_64.rst
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - SYM_FUNC_START/END:Define functions in the symbol table.
 * - idtentry:		Define exception entry points.
 */
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <asm/fsgsbase.h>
#include <linux/err.h>

#include "calling.h"

.code64
.section .entry.text, "ax"

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls.  The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries.  There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

SYM_CODE_START(entry_SYSCALL_64)
	UNWIND_HINT_ENTRY
	ENDBR

	swapgs
	/* tss.sp2 is scratch space. */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
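	/*
	 * do_syscall_64() takes the pt_regs pointer in %rdi and the
	 * sign-extended syscall number in %rsi; the value it returns in %al
	 * is tested below to decide whether the fast SYSRET exit may be used.
	 */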
	movq	%rsp, %rdi
	/* Sign extend the lower 32bit as syscall numbers are treated as int */
	movslq	%eax, %rsi

	/* clobbers %rax, make sure it is after saving the syscall nr */
	IBRS_ENTER
	UNTRAIN_RET
	CLEAR_BRANCH_HISTORY

	call	do_syscall_64		/* returns with IRQs disabled */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 * In the Xen PV case we must use iret anyway.
	 */

	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	IBRS_EXIT
	POP_REGS pop_rdi=0

	/*
	 * Now all regs are restored except RSP and RDI.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_END_OF_STACK

	pushq	RSP-RDI(%rdi)	/* RSP */
	pushq	(%rdi)		/* RDI */

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	swapgs
	CLEAR_CPU_BUFFERS
	sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	int3
SYM_CODE_END(entry_SYSCALL_64)

/*
 * %rdi: prev task
 * %rsi: next task
 */
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
	ANNOTATE_NOENDBR
	/*
	 * Save callee-saved registers
	 * This must match the order in inactive_task_frame
	 */
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* switch stack */
	movq	%rsp, TASK_threadsp(%rdi)
	movq	TASK_threadsp(%rsi), %rsp

#ifdef CONFIG_STACKPROTECTOR
	movq	TASK_stack_canary(%rsi), %rbx
	movq	%rbx, PER_CPU_VAR(__stack_chk_guard)
#endif

	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses.  On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW

	/* restore callee-saved registers */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp

	jmp	__switch_to
SYM_FUNC_END(__switch_to_asm)
.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rax: prev task we switched from
 * rbx: kernel thread func (NULL for user thread)
 * r12: kernel thread arg
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork_asm)
	/*
	 * This is the start of the kernel stack; even though there's a
	 * register set at the top, the regset isn't necessarily coherent
	 * (consider kthreads) and one cannot unwind further.
	 *
	 * This ensures stack unwinds of kernel threads terminate in a known
	 * good state.
	 */
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // copy_thread
	CALL_DEPTH_ACCOUNT

	movq	%rax, %rdi		/* prev */
	movq	%rsp, %rsi		/* regs */
	movq	%rbx, %rdx		/* fn */
	movq	%r12, %rcx		/* fn_arg */
	call	ret_from_fork

	/*
	 * Set the stack state to what is expected for the target function
	 * -- at this point the register set should be a valid user set
	 * and unwind should work normally.
	 */
	UNWIND_HINT_REGS

#ifdef CONFIG_X86_FRED
	ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
		    "jmp asm_fred_exit_user", X86_FEATURE_FRED
#else
	jmp	swapgs_restore_regs_and_return_to_usermode
#endif
SYM_CODE_END(ret_from_fork_asm)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
	pushq	%rax
	SAVE_FLAGS
	testl	$X86_EFLAGS_IF, %eax
	jz	.Lokay_\@
	ud2
.Lokay_\@:
	popq	%rax
#endif
.endm

SYM_CODE_START(xen_error_entry)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8
	UNTRAIN_RET_FROM_CALL
	RET
SYM_CODE_END(xen_error_entry)

/**
 * idtentry_body - Macro to emit code calling the C function
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 */
.macro idtentry_body cfunc has_error_code:req

	/*
	 * Call error_entry() and switch to the task stack if from userspace.
	 *
	 * When in XENPV, it is already in the task stack, and it can't fault
	 * for native_iret() nor native_load_gs_index() since XENPV uses its
	 * own pvops for IRET and load_gs_index().  And it doesn't need to
	 * switch the CR3.  So it can skip invoking error_entry().
	 */
	ALTERNATIVE "call error_entry; movq %rax, %rsp", \
		    "call xen_error_entry", X86_FEATURE_XENPV

	ENCODE_FRAME_POINTER
	UNWIND_HINT_REGS

	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/

	.if \has_error_code == 1
		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	.endif

	/* For some configurations \cfunc ends up being a noreturn. */
	ANNOTATE_REACHABLE
	call	\cfunc

	jmp	error_return
.endm

/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * The macro emits code to set up the kernel context for straight forward
 * and simple IDT entries.  No IST stack, no paranoid entry checks.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)

	.if \vector == X86_TRAP_BP
		/* #BP advances %rip to the next instruction */
		UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0
	.else
		UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8
	.endif

	ENDBR
	ASM_CLAC
	cld

	.if \has_error_code == 0
		pushq	$-1			/* ORIG_RAX: no syscall to restart */
	.endif

	.if \vector == X86_TRAP_BP
		/*
		 * If coming from kernel space, create a 6-word gap to allow the
		 * int3 handler to emulate a call instruction.
		 */
		testb	$3, CS-ORIG_RAX(%rsp)
		jnz	.Lfrom_usermode_no_gap_\@
		.rept	6
		pushq	5*8(%rsp)
		.endr
		UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_\@:
	.endif

	idtentry_body \cfunc \has_error_code

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
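/*
 * For reference, asm/idtentry.h instantiates the macro above once per
 * vector; e.g. the divide-error stub expands to roughly:
 *
 *	idtentry X86_TRAP_DE	asm_exc_divide_error	exc_divide_error	has_error_code=0
 */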
/*
 * Interrupt entry/exit.
 *
 * The interrupt stubs push (vector) onto the stack, which is the error_code
 * position of idtentry exceptions, and jump to one of the two idtentry points
 * (common/spurious).
 *
 * common_interrupt is a hotpath, align it to a cache line
 */
.macro idtentry_irq vector cfunc
	.p2align CONFIG_X86_L1_CACHE_SHIFT
	idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm

/**
 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 *
 * The macro emits code to set up the kernel context for #MC and #DB
 *
 * If the entry comes from user space it uses the normal entry path
 * including the return to user space work and preemption checks on
 * exit.
 *
 * If it hits in kernel mode then it needs to go through the paranoid
 * entry as the exception can hit any random state.  No preemption
 * check on exit to keep the paranoid path simple.
 */
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY
	ENDBR
	ASM_CLAC
	cld

	pushq	$-1			/* ORIG_RAX: no syscall to restart */

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry

	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	\cfunc

	jmp	paranoid_exit

	/* Switch to the regular task stack and use the noist entry point */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body noist_\cfunc, has_error_code=0

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

#ifdef CONFIG_AMD_MEM_ENCRYPT
/**
 * idtentry_vc - Macro to generate entry stub for #VC
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 *
 * The macro emits code to set up the kernel context for #VC. The #VC handler
 * runs on an IST stack and needs to be able to cause nested #VC exceptions.
 *
 * To make this work the #VC entry code tries its best to pretend it doesn't use
 * an IST stack by switching to the task stack if coming from user-space (which
 * includes early SYSCALL entry path) or back to the stack in the IRET frame if
 * entered from kernel-mode.
 *
 * If entered from kernel-mode the return stack is validated first, and if it is
 * not safe to use (e.g. because it points to the entry stack) the #VC handler
 * will switch to a fall-back stack (VC2) and call a special handler function.
 *
 * The macro is only used for one vector, but it is planned to be extended in
 * the future for the #HV exception.
 */
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY
	ENDBR
	ASM_CLAC
	cld

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/*
	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
	 */
	call	paranoid_entry

	UNWIND_HINT_REGS

	/*
	 * Switch off the IST stack to make it free for nested exceptions. The
	 * vc_switch_off_ist() function will switch back to the interrupted
	 * stack if it is safe to do so. If not it switches to the VC fall-back
	 * stack.
	 */
	movq	%rsp, %rdi		/* pt_regs pointer */
	call	vc_switch_off_ist
	movq	%rax, %rsp		/* Switch to new stack */

	ENCODE_FRAME_POINTER
	UNWIND_HINT_REGS

	/* Update pt_regs */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	kernel_\cfunc

	/*
	 * No need to switch back to the IST stack. The current stack is either
	 * identical to the stack in the IRET frame or the VC fall-back stack,
	 * so it is definitely mapped even with PTI enabled.
	 */
	jmp	paranoid_exit

	/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body user_\cfunc, has_error_code=1

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
#endif

/*
 * Double fault entry. Straight paranoid. No checks from which context
 * this comes because for the espfix induced #DF this would do the wrong
 * thing.
 */
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY offset=8
	ENDBR
	ASM_CLAC
	cld

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */

	/* For some configurations \cfunc ends up being a noreturn. */
	ANNOTATE_REACHABLE
	call	\cfunc

	jmp	paranoid_exit

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Include the defines which emit the idt entries which are shared
 * between 32 and 64 bit and emit the __irqentry_text_* markers
 * so the stacktrace boundary checks work.
 */
	__ALIGN
	.globl __irqentry_text_start
__irqentry_text_start:

#include <asm/idtentry.h>

	__ALIGN
	.globl __irqentry_text_end
__irqentry_text_end:
	ANNOTATE_NOENDBR

SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
	IBRS_EXIT
#ifdef CONFIG_XEN_PV
	ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
	ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
#endif

	STACKLEAK_ERASE
	POP_REGS
	add	$8, %rsp	/* orig_ax */
	UNWIND_HINT_IRET_REGS

.Lswapgs_and_iret:
	swapgs
	CLEAR_CPU_BUFFERS
	/* Assert that the IRET frame indicates user mode. */
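	/*
	 * After POP_REGS and dropping orig_ax above, the frame is
	 * RIP, CS, FLAGS, RSP, SS, so 8(%rsp) is the CS slot and its low
	 * two bits hold the CPL.
	 */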
	testb	$3, 8(%rsp)
	jnz	.Lnative_iret
	ud2

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
.Lpti_restore_regs_and_return_to_usermode:
	POP_REGS pop_rdi=0

	/*
	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_END_OF_STACK

	/* Copy the IRET frame to the trampoline stack. */
	pushq	6*8(%rdi)	/* SS */
	pushq	5*8(%rdi)	/* RSP */
	pushq	4*8(%rdi)	/* EFLAGS */
	pushq	3*8(%rdi)	/* CS */
	pushq	2*8(%rdi)	/* RIP */

	/* Push user RDI on the trampoline stack. */
	pushq	(%rdi)

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	push	%rax
	SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax
	pop	%rax

	/* Restore RDI. */
	popq	%rdi
	jmp	.Lswapgs_and_iret
#endif

SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates kernel mode. */
	testb	$3, CS(%rsp)
	jz	1f
	ud2
1:
#endif
	POP_REGS
	addq	$8, %rsp	/* skip regs->orig_ax */
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
	 * when returning from IPI handler.
	 */
#ifdef CONFIG_XEN_PV
SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	.byte 0xe9
	.long .Lnative_iret - (. + 4)
#endif

.Lnative_iret:
	UNWIND_HINT_IRET_REGS
	/*
	 * Are we returning to a stack segment from the LDT?  Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR // exc_double_fault
	/*
	 * This may fault.  Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in exc_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	/*
	 * We are running with user GSBASE.  All GPRs contain their user
	 * values.  We have a percpu ESPFIX stack that is eight slots
	 * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
	 * of the ESPFIX stack.
	 *
	 * We clobber RAX and RDI in this code.  We stash RDI on the
	 * normal stack and RAX on the ESPFIX stack.
	 *
	 * The ESPFIX stack layout we set up looks like this:
	 *
	 * --- top of ESPFIX stack ---
	 * SS
	 * RSP
	 * RFLAGS
	 * CS
	 * RIP  <-- RSP points here when we're done
	 * RAX  <-- espfix_waddr points here
	 * --- bottom of ESPFIX stack ---
	 */

	pushq	%rdi				/* Stash user RDI */
	swapgs					/* to kernel GS */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */

	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(2*8)(%rsp), %rax		/* user CS */
	movq	%rax, (2*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* user RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* user SS */
	movq	%rax, (5*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* user RSP */
	movq	%rax, (4*8)(%rdi)
	/* Now RAX == RSP. */

	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */

	/*
	 * espfix_stack[31:16] == 0.  The page tables are set up such that
	 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
	 * espfix_waddr for any X.  That is, there are 65536 RO aliases of
	 * the same page.  Set up RSP so that RSP[31:16] contains the
	 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
	 * still points to an RO alias of the ESPFIX stack.
	 */
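	/*
	 * Worked example with made-up numbers: if the user RSP was
	 * 0x00007f1234567890, the AND above leaves RAX = 0x34560000, and the
	 * OR below produces an ESPFIX alias address whose bits [31:16] are
	 * 0x3456, matching the user RSP.
	 */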
	orq	PER_CPU_VAR(espfix_stack), %rax

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	swapgs					/* to user GS */
	popq	%rdi				/* Restore user RDI */

	movq	%rax, %rsp
	UNWIND_HINT_IRET_REGS offset=8

	/*
	 * At this point, we cannot write to the stack any more, but we can
	 * still read.
	 */
	popq	%rax				/* Restore user RAX */

	CLEAR_CPU_BUFFERS

	/*
	 * RSP now points to an ordinary IRET frame, except that the page
	 * is read-only and RSP[31:16] are preloaded with the userspace
	 * values.  We can now IRET back to userspace.
	 */
	jmp	native_irq_return_iret
#endif
SYM_CODE_END(common_interrupt_return)
_ASM_NOKPROBE(common_interrupt_return)

/*
 * Reload gs selector with exception handling
 *  di:  new selector
 *
 * Is in entry.text as it shouldn't be instrumented.
 */
SYM_FUNC_START(asm_load_gs_index)
	ANNOTATE_NOENDBR
	FRAME_BEGIN
	swapgs
.Lgs_change:
	ANNOTATE_NOENDBR // error_entry
	movl	%edi, %gs
2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
	swapgs
	FRAME_END
	RET

	/* running with kernelgs */
.Lbad_gs:
	swapgs					/* switch back to user gs */
.macro ZAP_GS
	/* This can't be a string because the preprocessor needs to see it. */
	movl $__USER_DS, %eax
	movl %eax, %gs
.endm
	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b

	_ASM_EXTABLE(.Lgs_change, .Lbad_gs)

SYM_FUNC_END(asm_load_gs_index)
EXPORT_SYMBOL(asm_load_gs_index)

#ifdef CONFIG_XEN_PV
/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 *
 * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
 */
__FUNC_ALIGN
SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback)

	/*
	 * Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
	 * see the correct pointer to the pt_regs
	 */
	UNWIND_HINT_FUNC
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
	UNWIND_HINT_REGS

	call	xen_pv_evtchn_do_upcall

	jmp	error_return
SYM_CODE_END(exc_xen_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process.
 * We cannot use the normal Linux return path in this case because if we use
 * the IRET hypercall to pop the stack frame we end up in an infinite loop of
 * failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
__FUNC_ALIGN
SYM_CODE_START_NOALIGN(xen_failsafe_callback)
	UNWIND_HINT_UNDEFINED
	ENDBR
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	UNWIND_HINT_IRET_REGS offset=8
	jmp	asm_exc_general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	UNWIND_HINT_IRET_REGS
	pushq	$-1 /* orig_ax = -1 => not a system call */
	PUSH_AND_CLEAR_REGS
	ENCODE_FRAME_POINTER
	jmp	error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */

/*
 * Save all registers in pt_regs. Return GSBASE related information
 * in EBX depending on the availability of the FSGSBASE instructions:
 *
 * FSGSBASE	R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        GSBASE value at entry, must be restored in paranoid_exit
 *
 * R14 - old CR3
 * R15 - old SPEC_CTRL
 */
SYM_CODE_START(paranoid_entry)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8

	/*
	 * Always stash CR3 in %r14.  This value will be restored,
	 * verbatim, at exit.  Needed if paranoid_entry interrupted
	 * another entry that already switched to the user CR3 value
	 * but has not yet returned to userspace.
	 *
	 * This is also why CS (stashed in the "iret frame" by the
	 * hardware at entry) can not be used: this may be a return
	 * to kernel code, but with a user CR3 value.
	 *
	 * Switching CR3 does not depend on kernel GSBASE so it can
	 * be done before switching to the kernel GSBASE. This is
	 * required for FSGSBASE because the kernel GSBASE has to
	 * be retrieved from a kernel internal table.
	 */
	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

	/*
	 * Handling GSBASE depends on the availability of FSGSBASE.
	 *
	 * Without FSGSBASE the kernel enforces that negative GSBASE
	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
	 * can be made about the GSBASE value when entering from user
	 * space.
	 */
	ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE

	/*
	 * Read the current GSBASE and store it in %rbx unconditionally,
	 * retrieve and set the current CPU's kernel GSBASE. The stored value
	 * has to be restored in paranoid_exit unconditionally.
	 *
	 * The unconditional write to GS base below ensures that no subsequent
	 * loads based on a mispredicted GS base can happen, therefore no LFENCE
	 * is needed here.
	 */
	SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
	jmp	.Lparanoid_gsbase_done

.Lparanoid_entry_checkgs:
	/* EBX = 1 -> kernel GSBASE active, no restore required */
	movl	$1, %ebx

	/*
	 * The kernel-enforced convention is a negative GSBASE indicates
	 * a kernel value.  No SWAPGS needed on entry and exit.
	 */
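	/*
	 * RDMSR returns MSR_GS_BASE in EDX:EAX, so the sign bit of %edx
	 * below is bit 63 of GSBASE; "js" therefore means GSBASE is already
	 * a kernel value.
	 */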
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	.Lparanoid_kernel_gsbase

	/* EBX = 0 -> SWAPGS required on exit */
	xorl	%ebx, %ebx
	swapgs
.Lparanoid_kernel_gsbase:
	FENCE_SWAPGS_KERNEL_ENTRY
.Lparanoid_gsbase_done:

	/*
	 * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
	 * CR3 above, keep the old value in a callee saved register.
	 */
	IBRS_ENTER save_reg=%r15
	UNTRAIN_RET_FROM_CALL

	RET
SYM_CODE_END(paranoid_entry)

/*
 * "Paranoid" exit path from exception stack.  This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated.  Fortunately, there's no good reason to try
 * to handle preemption here.
 *
 * R/EBX contains the GSBASE related information depending on the
 * availability of the FSGSBASE instructions:
 *
 * FSGSBASE	R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        User space GSBASE, must be restored unconditionally
 *
 * R14 - old CR3
 * R15 - old SPEC_CTRL
 */
SYM_CODE_START_LOCAL(paranoid_exit)
	UNWIND_HINT_REGS

	/*
	 * Must restore IBRS state before both CR3 and %GS since we need access
	 * to the per-CPU x86_spec_ctrl_shadow variable.
	 */
	IBRS_EXIT save_reg=%r15

	/*
	 * The order of operations is important. PARANOID_RESTORE_CR3 requires
	 * kernel GSBASE.
	 *
	 * NB to anyone to try to optimize this code: this code does
	 * not execute at all for exceptions from user mode. Those
	 * exceptions go through error_return instead.
	 */
	PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14

	/* Handle the three GSBASE cases */
	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE

	/* With FSGSBASE enabled, unconditionally restore GSBASE */
	wrgsbase	%rbx
	jmp	restore_regs_and_return_to_kernel

.Lparanoid_exit_checkgs:
	/* On non-FSGSBASE systems, conditionally do SWAPGS */
	testl	%ebx, %ebx
	jnz	restore_regs_and_return_to_kernel

	/* We are returning to a context with user GSBASE */
	swapgs
	jmp	restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)

/*
 * Switch GS and CR3 if needed.
 */
SYM_CODE_START(error_entry)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC

	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8

	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	swapgs
	FENCE_SWAPGS_USER_ENTRY
	/* We have user CR3.  Change to kernel CR3. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	IBRS_ENTER
	UNTRAIN_RET_FROM_CALL

	leaq	8(%rsp), %rdi			/* arg0 = pt_regs pointer */
	/* Put us onto the real thread stack. */
	jmp	sync_regs

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here.  B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)
	je	.Lerror_bad_iret
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	.Lbstep_iret
	cmpq	$.Lgs_change, RIP+8(%rsp)
	jne	.Lerror_entry_done_lfence

	/*
	 * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
	 * gsbase and proceed.
	 * We'll fix up the exception and land in .Lgs_change's error handler
	 * with kernel gsbase.
	 */
	swapgs

	/*
	 * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
	 * kernel or user gsbase.
	 */
.Lerror_entry_done_lfence:
	FENCE_SWAPGS_KERNEL_ENTRY
	CALL_DEPTH_ACCOUNT
	leaq	8(%rsp), %rax			/* return pt_regs pointer */
	VALIDATE_UNRET_END
	RET

.Lbstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user
	 * gsbase and CR3. Switch to kernel gsbase and CR3:
	 */
	swapgs
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	IBRS_ENTER
	UNTRAIN_RET_FROM_CALL

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET.
	 */
	leaq	8(%rsp), %rdi			/* arg0 = pt_regs pointer */
	call	fixup_bad_iret
	mov	%rax, %rdi
	jmp	sync_regs
SYM_CODE_END(error_entry)

SYM_CODE_START_LOCAL(error_return)
	UNWIND_HINT_REGS
	DEBUG_ENTRY_ASSERT_IRQS_OFF
	testb	$3, CS(%rsp)
	jz	restore_regs_and_return_to_kernel
	jmp	swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(error_return)

/*
 * Runs on exception stack.  Xen PV does not go through this path at all,
 * so we can use real assembly here.
 *
 * Registers:
 *	%r14: Used to save/restore the CR3 of the interrupted context
 *	      when MITIGATION_PAGE_TABLE_ISOLATION is in use.  Do not clobber.
 */
SYM_CODE_START(asm_exc_nmi)
	UNWIND_HINT_IRET_ENTRY
	ENDBR

	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains a
	 *  variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into an "outermost" location on the
	 *      stack
	 *    o Copy the interrupt frame into an "iret" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "iret" location to jump to the repeat_nmi
	 *    o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction.  Similarly, IRET to user mode
	 * can fault.  We therefore handle NMIs from user space like
	 * other IST entries.
	 */

	ASM_CLAC
	cld

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	testb	$3, CS-RIP+8(%rsp)
	jz	.Lnmi_from_kernel

	/*
	 * NMI from user mode.  We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 *
	 * We also must not push anything to the stack before switching
	 * stacks lest we corrupt the "NMI executing" variable.
	 */
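	/*
	 * In the sequence below, %rdx ends up pointing at the saved %rdx
	 * slot on the NMI stack, so the hardware IRET frame sits at
	 * 1*8(%rdx)..5*8(%rdx); that is what the five pushes copy onto the
	 * thread stack.
	 */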
	swapgs
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	UNWIND_HINT_IRET_REGS base=%rdx offset=8
	pushq	5*8(%rdx)	/* pt_regs->ss */
	pushq	4*8(%rdx)	/* pt_regs->rsp */
	pushq	3*8(%rdx)	/* pt_regs->flags */
	pushq	2*8(%rdx)	/* pt_regs->cs */
	pushq	1*8(%rdx)	/* pt_regs->rip */
	UNWIND_HINT_IRET_REGS
	pushq	$-1		/* pt_regs->orig_ax */
	PUSH_AND_CLEAR_REGS rdx=(%rdx)
	ENCODE_FRAME_POINTER

	IBRS_ENTER
	UNTRAIN_RET

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq	%rsp, %rdi
	call	exc_nmi

	/*
	 * Return back to user mode.  We must *not* do the normal exit
	 * work, because we don't want to enable interrupts.
	 */
	jmp	swapgs_restore_regs_and_return_to_usermode

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                              |
	 * | original Return RSP                                      |
	 * | original RFLAGS                                          |
	 * | original CS                                              |
	 * | original RIP                                             |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                     |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                 |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame         |
	 * | iret Return RSP  } on each loop iteration; overwritten   |
	 * | iret RFLAGS      } by a nested NMI to force another      |
	 * | iret CS          } iteration if needed.                  |
	 * | iret RIP         }                                       |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;        |
	 * | outermost Return RSP  } will not be changed before       |
	 * | outermost RFLAGS      } NMI processing is done.          |
	 * | outermost CS          } Copied to "iret" frame on each   |
	 * | outermost RIP         } iteration.                       |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                  |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware.  Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 */

	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI.  We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI.  That's okay; the outer NMI handler is
	 * about to call exc_nmi() anyway, so we can just resume
	 * the outer NMI.
	 */
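	/*
	 * Only %rdx has been pushed on top of the hardware frame, so
	 * 8(%rsp) is the interrupted RIP; the two compares below test
	 * repeat_nmi <= RIP < end_repeat_nmi.
	 */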
	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out
1:

	/*
	 * Now check "NMI executing".  If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.  This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET.  We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets.  We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP.  We set DF before we clear
	 * "NMI executing".
	 */
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi

	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi

	/* Ah, it is within the NMI stack. */

	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz	first_nmi	/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	subq	$8, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	iretq

first_nmi:
	/* Restore rdx. */
	movq	(%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq	$0

	/* Leave room for the "iret" frame */
	subq	$(5*8), %rsp

	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr
	UNWIND_HINT_IRET_REGS

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away.  Disabled by
	 * default because IRET is very expensive.
	 */
	pushq	$0		/* SS */
	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
	addq	$8, (%rsp)	/* Fix up RSP */
	pushfq			/* RFLAGS */
	pushq	$__KERNEL_CS	/* CS */
	pushq	$1f		/* RIP */
	iretq			/* continues at repeat_nmi below */
	UNWIND_HINT_IRET_REGS
1:
#endif

repeat_nmi:
	ANNOTATE_NOENDBR // this code
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here.  But NMIs are still enabled and we can take another
	 * nested NMI.  The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 *
	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration.  paranoid_entry will load the kernel
	 * gsbase if needed before we call exc_nmi().  "NMI executing"
	 * is zero.
	 */
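	/*
	 * With RSP at "outermost RIP", the "NMI executing" slot is ten
	 * quadwords up: five for the "outermost" frame plus five for the
	 * "iret" frame, hence the 10*8 offset below.
	 */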
	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */

	/*
	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
	 */
	addq	$(10*8), %rsp
	.rept 5
	pushq	-6*8(%rsp)
	.endr
	subq	$(5*8), %rsp
end_repeat_nmi:
	ANNOTATE_NOENDBR // this code

	/*
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
	 */
	pushq	$-1				/* ORIG_RAX: no syscall to restart */

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context, even with
	 * normal interrupts enabled.  An NMI should not be setting NEED_RESCHED
	 * or anything that normal interrupts and exceptions might do.
	 */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi
	call	exc_nmi

	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
	IBRS_EXIT save_reg=%r15

	PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

	/*
	 * The above invocation of paranoid_entry stored the GSBASE
	 * related information in R/EBX depending on the availability
	 * of FSGSBASE.
	 *
	 * If FSGSBASE is enabled, restore the saved GSBASE value
	 * unconditionally, otherwise take the conditional SWAPGS path.
	 */
	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE

	wrgsbase	%rbx
	jmp	nmi_restore

nmi_no_fsgsbase:
	/* EBX == 0 -> invoke SWAPGS */
	testl	%ebx, %ebx
	jnz	nmi_restore

nmi_swapgs:
	swapgs

nmi_restore:
	POP_REGS

	/*
	 * Skip orig_ax and the "outermost" frame to point RSP at the
	 * "iret" frame.
	 */
	addq	$6*8, %rsp

	/*
	 * Clear "NMI executing".  Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths.
	 *
	 * We arguably should just inspect RIP instead, but I (Andy) wrote
	 * this code when I had the misapprehension that Xen PV supported
	 * NMIs, and Xen PV would break that approach.
	 */
	std
	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */

	/*
	 * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like
	 * NMI in kernel after user state is restored. For an unprivileged user
	 * these conditions are hard to meet.
	 */

	/*
	 * iretq reads the "iret" frame and exits the NMI stack in a
	 * single instruction.  We are returning to kernel mode, so this
	 * cannot result in a fault.  Similarly, we don't need to worry
	 * about espfix64 on the way back to kernel mode.
	 */
	iretq
SYM_CODE_END(asm_exc_nmi)

/*
 * This handles SYSCALL from 32-bit code.  There is no way to program
 * MSRs to fully disable 32-bit SYSCALL.
 */
SYM_CODE_START(entry_SYSCALL32_ignore)
	UNWIND_HINT_END_OF_STACK
	ENDBR
	mov	$-ENOSYS, %eax
	CLEAR_CPU_BUFFERS
	sysretl
SYM_CODE_END(entry_SYSCALL32_ignore)

.pushsection .text, "ax"
	__FUNC_ALIGN
SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
	UNWIND_HINT_FUNC
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl	%ebp, %ebp

	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
	leaq	-PTREGS_SIZE(%rax), %rsp
	UNWIND_HINT_REGS

	call	make_task_dead
SYM_CODE_END(rewind_stack_and_make_dead)
.popsection

/*
 * This sequence executes branches in order to remove user branch information
 * from the branch history tracker in the Branch Predictor, therefore removing
 * user influence on subsequent BTB lookups.
 *
 * It should be used on parts prior to Alder Lake. Newer parts should use the
 * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being
 * virtualized on newer hardware the VMM should protect against BHI attacks by
 * setting BHI_DIS_S for the guests.
 *
 * CALLs/RETs are necessary to prevent the Loop Stream Detector (LSD) from
 * engaging and not clearing the branch history. The call tree looks like:
 *
 * call 1
 *   call 2
 *     call 2
 *       call 2
 *         call 2
 *           call 2
 *           ret
 *         ret
 *       ret
 *     ret
 *   ret
 * ret
 *
 * This means that the stack is non-constant and ORC can't unwind it with %rsp
 * alone.  Therefore we unconditionally set up the frame pointer, which allows
 * ORC to unwind properly.
 *
 * The alignment is for performance and not for safety, and may be safely
 * refactored in the future if needed. The .skips are for safety, to ensure
 * that all RETs are in the second half of a cacheline to mitigate Indirect
 * Target Selection, rather than taking the slowpath via its_return_thunk.
 */
SYM_FUNC_START(clear_bhb_loop)
	ANNOTATE_NOENDBR
	push	%rbp
	mov	%rsp, %rbp
	movl	$5, %ecx
	ANNOTATE_INTRA_FUNCTION_CALL
	call	1f
	jmp	5f
	.align 64, 0xcc
	/*
	 * Shift instructions so that the RET is in the upper half of the
	 * cacheline and don't take the slowpath to its_return_thunk.
	 */
	.skip 32 - (.Lret1 - 1f), 0xcc
	ANNOTATE_INTRA_FUNCTION_CALL
1:	call	2f
.Lret1:	RET
	.align 64, 0xcc
	/*
	 * As above, shift instructions so that the RET at .Lret2 also lands
	 * in the upper half of the cacheline.
	 *
	 * This should ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
	 * but some Clang versions (e.g. 18) don't like this.
	 */
	.skip 32 - 18, 0xcc
2:	movl	$5, %eax
3:	jmp	4f
	nop
4:	sub	$1, %eax
	jnz	3b
	sub	$1, %ecx
	jnz	1b
.Lret2:	RET
5:	lfence
	pop	%rbp
	RET
SYM_FUNC_END(clear_bhb_loop)
EXPORT_SYMBOL_GPL(clear_bhb_loop)
STACK_FRAME_NON_STANDARD(clear_bhb_loop)