CoCalc -- entry

GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/entry/entry_64.S
²⁶⁴²⁴ views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
3
 *  linux/arch/x86_64/entry.S
4
 *
5
 *  Copyright (C) 1991, 1992  Linus Torvalds
6
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
7
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
8
 *
9
 * entry.S contains the system-call and fault low-level handling routines.
10
 *
11
 * Some of this is documented in Documentation/arch/x86/entry_64.rst
12
 *
13
 * A note on terminology:
14
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
15
 *			at the top of the kernel process stack.
16
 *
17
 * Some macro usage:
18
 * - SYM_FUNC_START/END:Define functions in the symbol table.
19
 * - idtentry:		Define exception entry points.
20
 */
21
#include <linux/export.h>
22
#include <linux/linkage.h>
23
#include <asm/segment.h>
24
#include <asm/cache.h>
25
#include <asm/errno.h>
26
#include <asm/asm-offsets.h>
27
#include <asm/msr.h>
28
#include <asm/unistd.h>
29
#include <asm/thread_info.h>
30
#include <asm/hw_irq.h>
31
#include <asm/page_types.h>
32
#include <asm/irqflags.h>
33
#include <asm/paravirt.h>
34
#include <asm/percpu.h>
35
#include <asm/asm.h>
36
#include <asm/smap.h>
37
#include <asm/pgtable_types.h>
38
#include <asm/frame.h>
39
#include <asm/trapnr.h>
40
#include <asm/nospec-branch.h>
41
#include <asm/fsgsbase.h>
42
#include <linux/err.h>
43

44
#include "calling.h"
45

46
/* Assemble as 64-bit code; entry code below is placed in .entry.text. */
.code64
.section .entry.text, "ax"

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls.  The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries.  There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is a callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with non-canonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

SYM_CODE_START(entry_SYSCALL_64)
	UNWIND_HINT_ENTRY
	ENDBR

	swapgs
	/*
	 * tss.sp2 is scratch space.  Stash the user RSP there so that %rsp
	 * itself can be handed to SWITCH_TO_KERNEL_CR3 as its scratch register
	 * before the kernel stack is loaded.
	 */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
	movq	%rsp, %rdi		/* pt_regs pointer into 1st argument */
	/* Sign extend the lower 32bit as syscall numbers are treated as int */
	movslq	%eax, %rsi		/* syscall nr into 2nd argument */

	/* clobbers %rax, make sure it is after saving the syscall nr */
	IBRS_ENTER
	UNTRAIN_RET
	CLEAR_BRANCH_HISTORY

	call	do_syscall_64		/* returns with IRQs disabled */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 * In the Xen PV case we must use iret anyway.
	 *
	 * The decision is taken on %al as left by do_syscall_64: zero
	 * selects the IRET path, non-zero falls through to SYSRET.
	 */

	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	IBRS_EXIT
	POP_REGS pop_rdi=0

	/*
	 * Now all regs are restored except RSP and RDI.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_END_OF_STACK

	pushq	RSP-RDI(%rdi)	/* RSP */
	pushq	(%rdi)		/* RDI */

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Reload the two values pushed on the trampoline stack above. */
	popq	%rdi
	popq	%rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	swapgs
	CLEAR_CPU_BUFFERS
	sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	int3
SYM_CODE_END(entry_SYSCALL_64)

/*
 * Low-level stack switch between two tasks; tail-jumps to __switch_to.
 *
 * %rdi: prev task
 * %rsi: next task
 */
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
	ANNOTATE_NOENDBR
	/*
	 * Save callee-saved registers
	 * This must match the order in inactive_task_frame
	 */
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* switch stack: park prev's RSP, load next's RSP */
	movq	%rsp, TASK_threadsp(%rdi)
	movq	TASK_threadsp(%rsi), %rsp

#ifdef CONFIG_STACKPROTECTOR
	/* Publish next's stack canary in the per-CPU guard slot. */
	movq	TASK_stack_canary(%rsi), %rbx
	movq	%rbx, PER_CPU_VAR(__stack_chk_guard)
#endif

	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW

	/*
	 * restore callee-saved registers; these pops read from the stack
	 * that was loaded from next's TASK_threadsp above
	 */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp

	jmp	__switch_to
SYM_FUNC_END(__switch_to_asm)
.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rax: prev task we switched from
 * rbx: kernel thread func (NULL for user thread)
 * r12: kernel thread arg
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork_asm)
	/*
	 * This is the start of the kernel stack; even though there's a
	 * register set at the top, the regset isn't necessarily coherent
	 * (consider kthreads) and one cannot unwind further.
	 *
	 * This ensures stack unwinds of kernel threads terminate in a known
	 * good state.
	 */
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // copy_thread
	CALL_DEPTH_ACCOUNT

	/* Marshal the four arguments for ret_from_fork(). */
	movq	%rax, %rdi		/* prev */
	movq	%rsp, %rsi		/* regs */
	movq	%rbx, %rdx		/* fn */
	movq	%r12, %rcx		/* fn_arg */
	call	ret_from_fork

	/*
	 * Set the stack state to what is expected for the target function
	 * -- at this point the register set should be a valid user set
	 * and unwind should work normally.
	 */
	UNWIND_HINT_REGS

#ifdef CONFIG_X86_FRED
	ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
		    "jmp asm_fred_exit_user", X86_FEATURE_FRED
#else
	jmp	swapgs_restore_regs_and_return_to_usermode
#endif
SYM_CODE_END(ret_from_fork_asm)
.popsection

/*
 * Debug-only assertion that interrupts are disabled.
 *
 * With CONFIG_DEBUG_ENTRY the macro tests X86_EFLAGS_IF in %eax after
 * SAVE_FLAGS (which is expected to leave the flags in %rax) and executes
 * ud2 if the bit is set.  %rax is saved and restored around the check.
 * Without CONFIG_DEBUG_ENTRY the macro expands to nothing.
 */
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
	pushq %rax
	SAVE_FLAGS
	testl $X86_EFLAGS_IF, %eax
	jz .Lokay_\@
	ud2
.Lokay_\@:
	popq %rax
#endif
.endm

/*
 * Register-saving stub that idtentry_body calls instead of error_entry
 * when X86_FEATURE_XENPV is set.  It is entered via CALL, so the return
 * address is on the stack (hence save_ret=1 and the frame pointer offset
 * of 8).
 */
SYM_CODE_START(xen_error_entry)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8
	UNTRAIN_RET_FROM_CALL
	RET
SYM_CODE_END(xen_error_entry)

/**
 * idtentry_body - Macro to emit code calling the C function
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * Passes the pt_regs pointer in %rdi and, when @has_error_code is 1, the
 * error code in %rsi.  Leaves through error_return after @cfunc returns.
 */
.macro idtentry_body cfunc has_error_code:req

	/*
	 * Call error_entry() and switch to the task stack if from userspace.
	 *
	 * When in XENPV, it is already in the task stack, and it can't fault
	 * for native_iret() nor native_load_gs_index() since XENPV uses its
	 * own pvops for IRET and load_gs_index().  And it doesn't need to
	 * switch the CR3.  So it can skip invoking error_entry().
	 */
	ALTERNATIVE "call error_entry; movq %rax, %rsp", \
		    "call xen_error_entry", X86_FEATURE_XENPV

	ENCODE_FRAME_POINTER
	UNWIND_HINT_REGS

	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/

	.if \has_error_code == 1
		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	.endif

	/* For some configurations \cfunc ends up being a noreturn. */
	ANNOTATE_REACHABLE
	call	\cfunc

	jmp	error_return
.endm

/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * The macro emits code to set up the kernel context for straight forward
 * and simple IDT entries. No IST stack, no paranoid entry checks.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)

	.if \vector == X86_TRAP_BP
		/* #BP advances %rip to the next instruction */
		UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0
	.else
		UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8
	.endif

	ENDBR
	ASM_CLAC
	cld

	/* Fill the ORIG_RAX slot ourselves when no error code occupies it. */
	.if \has_error_code == 0
		pushq	$-1			/* ORIG_RAX: no syscall to restart */
	.endif

	.if \vector == X86_TRAP_BP
		/*
		 * If coming from kernel space, create a 6-word gap to allow the
		 * int3 handler to emulate a call instruction.
		 */
		testb	$3, CS-ORIG_RAX(%rsp)
		jnz	.Lfrom_usermode_no_gap_\@
		.rept	6
		pushq	5*8(%rsp)
		.endr
		UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_\@:
	.endif

	idtentry_body \cfunc \has_error_code

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Interrupt entry/exit.
 *
 * The interrupt stubs push (vector) onto the stack, which is the error_code
 * position of idtentry exceptions, and jump to one of the two idtentry points
 * (common/spurious).
 *
 * common_interrupt is a hotpath, align it to a cache line
 */
.macro idtentry_irq vector cfunc
	.p2align CONFIG_X86_L1_CACHE_SHIFT
	idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm

/**
 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 *
 * The macro emits code to set up the kernel context for #MC and #DB
 *
 * If the entry comes from user space it uses the normal entry path
 * including the return to user space work and preemption checks on
 * exit.
 *
 * If it hits in kernel mode then it needs to go through the paranoid
 * entry as the exception can hit any random state. No preemption
 * check on exit to keep the paranoid path simple.
 */
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY
	ENDBR
	ASM_CLAC
	cld

	pushq	$-1			/* ORIG_RAX: no syscall to restart */

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry

	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	\cfunc

	jmp	paranoid_exit

	/* Switch to the regular task stack and use the noist entry point */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body noist_\cfunc, has_error_code=0

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

#ifdef CONFIG_AMD_MEM_ENCRYPT
/**
 * idtentry_vc - Macro to generate entry stub for #VC
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 *
 * The macro emits code to set up the kernel context for #VC. The #VC handler
 * runs on an IST stack and needs to be able to cause nested #VC exceptions.
 *
 * To make this work the #VC entry code tries its best to pretend it doesn't use
 * an IST stack by switching to the task stack if coming from user-space (which
 * includes early SYSCALL entry path) or back to the stack in the IRET frame if
 * entered from kernel-mode.
 *
 * If entered from kernel-mode the return stack is validated first, and if it is
 * not safe to use (e.g. because it points to the entry stack) the #VC handler
 * will switch to a fall-back stack (VC2) and call a special handler function.
 *
 * The macro is only used for one vector, but it is planned to be extended in
 * the future for the #HV exception.
 */
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY
	ENDBR
	ASM_CLAC
	cld

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/*
	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
	 */
	call	paranoid_entry

	UNWIND_HINT_REGS

	/*
	 * Switch off the IST stack to make it free for nested exceptions. The
	 * vc_switch_off_ist() function will switch back to the interrupted
	 * stack if it is safe to do so. If not it switches to the VC fall-back
	 * stack.
	 */
	movq	%rsp, %rdi		/* pt_regs pointer */
	call	vc_switch_off_ist
	movq	%rax, %rsp		/* Switch to new stack */

	ENCODE_FRAME_POINTER
	UNWIND_HINT_REGS

	/* Update pt_regs */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	kernel_\cfunc

	/*
	 * No need to switch back to the IST stack. The current stack is either
	 * identical to the stack in the IRET frame or the VC fall-back stack,
	 * so it is definitely mapped even with PTI enabled.
	 */
	jmp	paranoid_exit

	/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body user_\cfunc, has_error_code=1

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
#endif

/*
 * Double fault entry. Straight paranoid. No checks from which context
 * this comes because for the espfix induced #DF this would do the wrong
 * thing.
 *
 * Macro arguments:
 *   vector - vector number
 *   asmsym - ASM symbol for the entry point
 *   cfunc  - C function to be called with (pt_regs pointer, error code)
 */
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY offset=8
	ENDBR
	ASM_CLAC
	cld

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */

	/* For some configurations \cfunc ends up being a noreturn. */
	ANNOTATE_REACHABLE
	call	\cfunc

	jmp	paranoid_exit

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Include the defines which emit the idt entries which are
 * shared between 32 and 64 bit and emit the __irqentry_text_* markers
 * so the stacktrace boundary checks work.
 */
	__ALIGN
	.globl __irqentry_text_start
__irqentry_text_start:

#include <asm/idtentry.h>

	__ALIGN
	.globl __irqentry_text_end
__irqentry_text_end:
	ANNOTATE_NOENDBR

/*
 * Common exit paths back to the interrupted context.
 *
 * swapgs_restore_regs_and_return_to_usermode expects a full pt_regs at
 * %rsp and returns to user mode via IRET (optionally going through the
 * PTI trampoline stack).  restore_regs_and_return_to_kernel does the same
 * for a kernel-mode pt_regs without SWAPGS.
 */
SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
	IBRS_EXIT
#ifdef CONFIG_XEN_PV
	ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
	ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
#endif

	STACKLEAK_ERASE
	POP_REGS
	addq	$8, %rsp	/* orig_ax */
	UNWIND_HINT_IRET_REGS

.Lswapgs_and_iret:
	swapgs
	CLEAR_CPU_BUFFERS
	/* Assert that the IRET frame indicates user mode. */
	testb	$3, (CS-RIP)(%rsp)
	jnz	.Lnative_iret
	ud2

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
.Lpti_restore_regs_and_return_to_usermode:
	POP_REGS pop_rdi=0

	/*
	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_END_OF_STACK

	/* Copy the IRET frame to the trampoline stack. */
	pushq	6*8(%rdi)	/* SS */
	pushq	5*8(%rdi)	/* RSP */
	pushq	4*8(%rdi)	/* EFLAGS */
	pushq	3*8(%rdi)	/* CS */
	pushq	2*8(%rdi)	/* RIP */

	/* Push user RDI on the trampoline stack. */
	pushq	(%rdi)

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	/* %rax is the second scratch register; preserve it around the switch. */
	pushq	%rax
	SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax
	popq	%rax

	/* Restore RDI. */
	popq	%rdi
	jmp	.Lswapgs_and_iret
#endif

SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates kernel mode. */
	testb	$3, CS(%rsp)
	jz	1f
	ud2
1:
#endif
	POP_REGS
	addq	$8, %rsp	/* skip regs->orig_ax */
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
	 * when returning from IPI handler.
	 */
#ifdef CONFIG_XEN_PV
SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	/* Hand-encoded "jmp rel32" (opcode 0xe9) to .Lnative_iret. */
	.byte 0xe9
	.long .Lnative_iret - (. + 4)
#endif

.Lnative_iret:
	UNWIND_HINT_IRET_REGS
	/*
	 * Are we returning to a stack segment from the LDT?  Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR // exc_double_fault
	/*
	 * This may fault.  Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in exc_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	/*
	 * We are running with user GSBASE.  All GPRs contain their user
	 * values.  We have a percpu ESPFIX stack that is eight slots
	 * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
	 * of the ESPFIX stack.
	 *
	 * We clobber RAX and RDI in this code.  We stash RDI on the
	 * normal stack and RAX on the ESPFIX stack.
	 *
	 * The ESPFIX stack layout we set up looks like this:
	 *
	 * --- top of ESPFIX stack ---
	 * SS
	 * RSP
	 * RFLAGS
	 * CS
	 * RIP  <-- RSP points here when we're done
	 * RAX  <-- espfix_waddr points here
	 * --- bottom of ESPFIX stack ---
	 */

	pushq	%rdi				/* Stash user RDI */
	swapgs					/* to kernel GS */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */

	/*
	 * Source offsets below are relative to %rsp after the pushq above,
	 * so slot 0 holds the stashed RDI and the IRET frame starts at 1*8.
	 * RSP is deliberately copied last so that %rax still holds it.
	 */
	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(2*8)(%rsp), %rax		/* user CS */
	movq	%rax, (2*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* user RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* user SS */
	movq	%rax, (5*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* user RSP */
	movq	%rax, (4*8)(%rdi)
	/* Now RAX == RSP. */

	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */

	/*
	 * espfix_stack[31:16] == 0.  The page tables are set up such that
	 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
	 * espfix_waddr for any X.  That is, there are 65536 RO aliases of
	 * the same page.  Set up RSP so that RSP[31:16] contains the
	 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
	 * still points to an RO alias of the ESPFIX stack.
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	swapgs					/* to user GS */
	popq	%rdi				/* Restore user RDI */

	movq	%rax, %rsp
	UNWIND_HINT_IRET_REGS offset=8

	/*
	 * At this point, we cannot write to the stack any more, but we can
	 * still read.
	 */
	popq	%rax				/* Restore user RAX */

	CLEAR_CPU_BUFFERS

	/*
	 * RSP now points to an ordinary IRET frame, except that the page
	 * is read-only and RSP[31:16] are preloaded with the userspace
	 * values.  We can now IRET back to userspace.
	 */
	jmp	native_irq_return_iret
#endif
SYM_CODE_END(common_interrupt_return)
_ASM_NOKPROBE(common_interrupt_return)

/*
 * Reload gs selector with exception handling
 *  di:  new selector
 *
 * Is in entry.text as it shouldn't be instrumented.
 *
 * If the %gs load at .Lgs_change faults, the exception table entry at the
 * end of the function redirects execution to .Lbad_gs, which loads a null
 * selector instead and resumes at local label 2.
 */
SYM_FUNC_START(asm_load_gs_index)
	ANNOTATE_NOENDBR
	FRAME_BEGIN
	swapgs
.Lgs_change:
	ANNOTATE_NOENDBR // error_entry
	movl	%edi, %gs
2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
	swapgs
	FRAME_END
	RET

	/* running with kernelgs */
.Lbad_gs:
	swapgs					/* switch back to user gs */
.macro ZAP_GS
	/* This can't be a string because the preprocessor needs to see it. */
	movl $__USER_DS, %eax
	movl %eax, %gs
.endm
	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b

	_ASM_EXTABLE(.Lgs_change, .Lbad_gs)

SYM_FUNC_END(asm_load_gs_index)
EXPORT_SYMBOL(asm_load_gs_index)

#ifdef CONFIG_XEN_PV
/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 *
 * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
 */
	__FUNC_ALIGN
SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback)

/*
 * Since we don't modify %rdi, xen_pv_evtchn_do_upcall(struct pt_regs *) will
 * see the correct pointer to the pt_regs
 */
	UNWIND_HINT_FUNC
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
	UNWIND_HINT_REGS

	call	xen_pv_evtchn_do_upcall

	jmp	error_return
SYM_CODE_END(exc_xen_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 *
 * As read by the code below, the frame at %rsp holds RCX, R11, DS, ES, FS
 * and GS at offsets 0x00..0x28; the 0x30 bytes are dropped before the
 * IRET frame that follows is used.
 */
	__FUNC_ALIGN
SYM_CODE_START_NOALIGN(xen_failsafe_callback)
	UNWIND_HINT_UNDEFINED
	ENDBR
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	UNWIND_HINT_IRET_REGS offset=8
	jmp	asm_exc_general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	UNWIND_HINT_IRET_REGS
	pushq	$-1 /* orig_ax = -1 => not a system call */
	PUSH_AND_CLEAR_REGS
	ENCODE_FRAME_POINTER
	jmp	error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */

/*
856
 * Save all registers in pt_regs. Return GSBASE related information
857
 * in EBX depending on the availability of the FSGSBASE instructions:
858
 *
859
 * FSGSBASE	R/EBX
860
 *     N        0 -> SWAPGS on exit
861
 *              1 -> no SWAPGS on exit
862
 *
863
 *     Y        GSBASE value at entry, must be restored in paranoid_exit
864
 *
865
 * R14 - old CR3
866
 * R15 - old SPEC_CTRL
867
 */
868
/*
 * Entered with CALL (asm_exc_nmi does "call paranoid_entry") and left with
 * RET; results are handed back in %rbx, %r14 and %r15 as described above.
 */
SYM_CODE_START(paranoid_entry)
869
	ANNOTATE_NOENDBR
870
	UNWIND_HINT_FUNC
871
	PUSH_AND_CLEAR_REGS save_ret=1
872
	ENCODE_FRAME_POINTER 8
873

874
	/*
875
	 * Always stash CR3 in %r14.  This value will be restored,
876
	 * verbatim, at exit.  Needed if paranoid_entry interrupted
877
	 * another entry that already switched to the user CR3 value
878
	 * but has not yet returned to userspace.
879
	 *
880
	 * This is also why CS (stashed in the "iret frame" by the
881
	 * hardware at entry) can not be used: this may be a return
882
	 * to kernel code, but with a user CR3 value.
883
	 *
884
	 * Switching CR3 does not depend on kernel GSBASE so it can
885
	 * be done before switching to the kernel GSBASE. This is
886
	 * required for FSGSBASE because the kernel GSBASE has to
887
	 * be retrieved from a kernel internal table.
888
	 */
889
	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
890

891
	/*
892
	 * Handling GSBASE depends on the availability of FSGSBASE.
893
	 *
894
	 * Without FSGSBASE the kernel enforces that negative GSBASE
895
	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
896
	 * can be made about the GSBASE value when entering from user
897
	 * space.
898
	 */
899
	ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
900

901
	/*
902
	 * Read the current GSBASE and store it in %rbx unconditionally,
903
	 * retrieve and set the current CPUs kernel GSBASE. The stored value
904
	 * has to be restored in paranoid_exit unconditionally.
905
	 *
906
	 * The unconditional write to GS base below ensures that no subsequent
907
	 * loads based on a mispredicted GS base can happen, therefore no LFENCE
908
	 * is needed here.
909
	 */
910
	SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
911
	jmp .Lparanoid_gsbase_done
912

913
.Lparanoid_entry_checkgs:
914
	/* EBX = 1 -> kernel GSBASE active, no restore required */
915
	movl	$1, %ebx
916

917
	/*
918
	 * The kernel-enforced convention is a negative GSBASE indicates
919
	 * a kernel value. No SWAPGS needed on entry and exit.
920
	 */
921
	movl	$MSR_GS_BASE, %ecx
922
	/*
	 * RDMSR returns the MSR selected by %ecx in %edx:%eax, so the sign
	 * of %edx is the sign of the 64-bit GSBASE value.
	 */
	rdmsr
923
	testl	%edx, %edx
924
	js	.Lparanoid_kernel_gsbase
925

926
	/* EBX = 0 -> SWAPGS required on exit */
927
	xorl	%ebx, %ebx
928
	swapgs
929
.Lparanoid_kernel_gsbase:
930
	FENCE_SWAPGS_KERNEL_ENTRY
931
.Lparanoid_gsbase_done:
932

933
	/*
934
	 * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
935
	 * CR3 above, keep the old value in a callee saved register.
936
	 */
937
	IBRS_ENTER save_reg=%r15
938
	UNTRAIN_RET_FROM_CALL
939

940
	RET
941
SYM_CODE_END(paranoid_entry)
942

943
/*
944
 * "Paranoid" exit path from exception stack.  This is invoked
945
 * only on return from non-NMI IST interrupts that came
946
 * from kernel space.
947
 *
948
 * We may be returning to very strange contexts (e.g. very early
949
 * in syscall entry), so checking for preemption here would
950
 * be complicated.  Fortunately, there's no good reason to try
951
 * to handle preemption here.
952
 *
953
 * R/EBX contains the GSBASE related information depending on the
954
 * availability of the FSGSBASE instructions:
955
 *
956
 * FSGSBASE	R/EBX
957
 *     N        0 -> SWAPGS on exit
958
 *              1 -> no SWAPGS on exit
959
 *
960
 *     Y        User space GSBASE, must be restored unconditionally
961
 *
962
 * R14 - old CR3
963
 * R15 - old SPEC_CTRL
964
 */
965
/*
 * Undo, in order, the SPEC_CTRL (%r15), CR3 (%r14) and GSBASE (%rbx) state
 * recorded at entry, then leave via restore_regs_and_return_to_kernel on
 * every path.
 */
SYM_CODE_START_LOCAL(paranoid_exit)
966
	UNWIND_HINT_REGS
967

968
	/*
969
	 * Must restore IBRS state before both CR3 and %GS since we need access
970
	 * to the per-CPU x86_spec_ctrl_shadow variable.
971
	 */
972
	IBRS_EXIT save_reg=%r15
973

974
	/*
975
	 * The order of operations is important. PARANOID_RESTORE_CR3 requires
976
	 * kernel GSBASE.
977
	 *
978
	 * NB to anyone trying to optimize this code: this code does
979
	 * not execute at all for exceptions from user mode. Those
980
	 * exceptions go through error_return instead.
981
	 */
982
	PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14
983

984
	/* Handle the three GSBASE cases */
985
	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
986

987
	/* With FSGSBASE enabled, unconditionally restore GSBASE */
988
	wrgsbase	%rbx
989
	jmp		restore_regs_and_return_to_kernel
990

991
.Lparanoid_exit_checkgs:
992
	/* On non-FSGSBASE systems, conditionally do SWAPGS */
993
	/* EBX != 0 -> no SWAPGS on exit (see the table above) */
	testl		%ebx, %ebx
994
	jnz		restore_regs_and_return_to_kernel
995

996
	/* We are returning to a context with user GSBASE */
997
	swapgs
998
	jmp		restore_regs_and_return_to_kernel
999
SYM_CODE_END(paranoid_exit)
1000

1001
/*
1002
 * Switch GS and CR3 if needed.
1003
 */
1004
/*
 * Throughout this function pt_regs starts 8 bytes above %rsp (see the
 * "leaq 8(%rsp), ..." pt_regs pointers below), hence the "+8" in the
 * CS+8(%rsp) and RIP+8(%rsp) accesses.
 */
SYM_CODE_START(error_entry)
1005
	ANNOTATE_NOENDBR
1006
	UNWIND_HINT_FUNC
1007

1008
	PUSH_AND_CLEAR_REGS save_ret=1
1009
	ENCODE_FRAME_POINTER 8
1010

1011
	/* Low two bits of the saved CS clear -> entered from kernel mode */
	testb	$3, CS+8(%rsp)
1012
	jz	.Lerror_kernelspace
1013

1014
	/*
1015
	 * We entered from user mode or we're pretending to have entered
1016
	 * from user mode due to an IRET fault.
1017
	 */
1018
	swapgs
1019
	FENCE_SWAPGS_USER_ENTRY
1020
	/* We have user CR3.  Change to kernel CR3. */
1021
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1022
	IBRS_ENTER
1023
	UNTRAIN_RET_FROM_CALL
1024

1025
	leaq	8(%rsp), %rdi			/* arg0 = pt_regs pointer */
1026
	/* Put us onto the real thread stack. */
1027
	/*
	 * NOTE(review): sync_regs is entered with JMP rather than CALL, so
	 * presumably its return goes straight back to error_entry's caller
	 * -- confirm against sync_regs.
	 */
	jmp	sync_regs
1028

1029
	/*
1030
	 * There are two places in the kernel that can potentially fault with
1031
	 * usergs. Handle them here.  B stepping K8s sometimes report a
1032
	 * truncated RIP for IRET exceptions returning to compat mode. Check
1033
	 * for these here too.
1034
	 */
1035
.Lerror_kernelspace:
1036
	leaq	native_irq_return_iret(%rip), %rcx
1037
	cmpq	%rcx, RIP+8(%rsp)
1038
	je	.Lerror_bad_iret
1039
	movl	%ecx, %eax			/* zero extend */
1040
	/* %rax = low 32 bits of native_irq_return_iret: the truncated-RIP case */
	cmpq	%rax, RIP+8(%rsp)
1041
	je	.Lbstep_iret
1042
	cmpq	$.Lgs_change, RIP+8(%rsp)
1043
	jne	.Lerror_entry_done_lfence
1044

1045
	/*
1046
	 * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
1047
	 * gsbase and proceed.  We'll fix up the exception and land in
1048
	 * .Lgs_change's error handler with kernel gsbase.
1049
	 */
1050
	swapgs
1051

1052
	/*
1053
	 * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
1054
	 * kernel or user gsbase.
1055
	 */
1056
.Lerror_entry_done_lfence:
1057
	FENCE_SWAPGS_KERNEL_ENTRY
1058
	CALL_DEPTH_ACCOUNT
1059
	leaq	8(%rsp), %rax			/* return pt_regs pointer */
1060
	VALIDATE_UNRET_END
1061
	RET
1062

1063
.Lbstep_iret:
1064
	/* Fix truncated RIP */
1065
	movq	%rcx, RIP+8(%rsp)
1066
	/* fall through */
1067

1068
.Lerror_bad_iret:
1069
	/*
1070
	 * We came from an IRET to user mode, so we have user
1071
	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
1072
	 */
1073
	swapgs
1074
	FENCE_SWAPGS_USER_ENTRY
1075
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1076
	IBRS_ENTER
1077
	UNTRAIN_RET_FROM_CALL
1078

1079
	/*
1080
	 * Pretend that the exception came from user mode: set up pt_regs
1081
	 * as if we faulted immediately after IRET.
1082
	 */
1083
	leaq	8(%rsp), %rdi			/* arg0 = pt_regs pointer */
1084
	call	fixup_bad_iret
1085
	/* fixup_bad_iret's return value (%rax) becomes arg0 of sync_regs */
	mov	%rax, %rdi
1086
	jmp	sync_regs
1087
SYM_CODE_END(error_entry)
1088

1089
/*
 * Common exception exit.  Dispatches on the low two bits of the saved CS
 * in pt_regs: zero -> restore_regs_and_return_to_kernel, non-zero ->
 * swapgs_restore_regs_and_return_to_usermode.
 */
SYM_CODE_START_LOCAL(error_return)
1090
	UNWIND_HINT_REGS
1091
	DEBUG_ENTRY_ASSERT_IRQS_OFF
1092
	testb	$3, CS(%rsp)
1093
	jz	restore_regs_and_return_to_kernel
1094
	jmp	swapgs_restore_regs_and_return_to_usermode
1095
SYM_CODE_END(error_return)
1096

1097
/*
1098
 * Runs on exception stack.  Xen PV does not go through this path at all,
1099
 * so we can use real assembly here.
1100
 *
1101
 * Registers:
1102
 *	%r14: Used to save/restore the CR3 of the interrupted context
1103
 *	      when MITIGATION_PAGE_TABLE_ISOLATION is in use.  Do not clobber.
1104
 */
1105
SYM_CODE_START(asm_exc_nmi)
1106
	UNWIND_HINT_IRET_ENTRY
1107
	ENDBR
1108

1109
	/*
1110
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1111
	 * the iretq it performs will take us out of NMI context.
1112
	 * This means that we can have nested NMIs where the next
1113
	 * NMI is using the top of the stack of the previous NMI. We
1114
	 * can't let it execute because the nested NMI will corrupt the
1115
	 * stack of the previous NMI. NMI handlers are not re-entrant
1116
	 * anyway.
1117
	 *
1118
	 * To handle this case we do the following:
1119
	 *  Check a special location on the stack that contains a
1120
	 *  variable that is set when NMIs are executing.
1121
	 *  The interrupted task's stack is also checked to see if it
1122
	 *  is an NMI stack.
1123
	 *  If the variable is not set and the stack is not the NMI
1124
	 *  stack then:
1125
	 *    o Set the special variable on the stack
1126
	 *    o Copy the interrupt frame into an "outermost" location on the
1127
	 *      stack
1128
	 *    o Copy the interrupt frame into an "iret" location on the stack
1129
	 *    o Continue processing the NMI
1130
	 *  If the variable is set or the previous stack is the NMI stack:
1131
	 *    o Modify the "iret" location to jump to the repeat_nmi
1132
	 *    o return back to the first NMI
1133
	 *
1134
	 * Now on exit of the first NMI, we first clear the stack variable.
1135
	 * The NMI stack will tell any nested NMIs at that point that it is
1136
	 * nested. Then we pop the stack normally with iret, and if there was
1137
	 * a nested NMI that updated the copy interrupt stack frame, a
1138
	 * jump will be made to the repeat_nmi code that will handle the second
1139
	 * NMI.
1140
	 *
1141
	 * However, espfix prevents us from directly returning to userspace
1142
	 * with a single IRET instruction.  Similarly, IRET to user mode
1143
	 * can fault.  We therefore handle NMIs from user space like
1144
	 * other IST entries.
1145
	 */
1146

1147
	ASM_CLAC
1148
	cld
1149

1150
	/* Use %rdx as our temp variable throughout */
1151
	pushq	%rdx
1152

1153
	/* "+8" skips the %rdx just pushed; low two CS bits clear -> kernel */
	testb	$3, CS-RIP+8(%rsp)
1154
	jz	.Lnmi_from_kernel
1155

1156
	/*
1157
	 * NMI from user mode.  We need to run on the thread stack, but we
1158
	 * can't go through the normal entry paths: NMIs are masked, and
1159
	 * we don't want to enable interrupts, because then we'll end
1160
	 * up in an awkward situation in which IRQs are on but NMIs
1161
	 * are off.
1162
	 *
1163
	 * We also must not push anything to the stack before switching
1164
	 * stacks lest we corrupt the "NMI executing" variable.
1165
	 */
1166

1167
	swapgs
1168
	FENCE_SWAPGS_USER_ENTRY
1169
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
1170
	/* %rdx = old stack: saved %rdx at (%rdx), hardware frame from 1*8(%rdx) */
	movq	%rsp, %rdx
1171
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
1172
	UNWIND_HINT_IRET_REGS base=%rdx offset=8
1173
	pushq	5*8(%rdx)	/* pt_regs->ss */
1174
	pushq	4*8(%rdx)	/* pt_regs->rsp */
1175
	pushq	3*8(%rdx)	/* pt_regs->flags */
1176
	pushq	2*8(%rdx)	/* pt_regs->cs */
1177
	pushq	1*8(%rdx)	/* pt_regs->rip */
1178
	UNWIND_HINT_IRET_REGS
1179
	pushq   $-1		/* pt_regs->orig_ax */
1180
	PUSH_AND_CLEAR_REGS rdx=(%rdx)
1181
	ENCODE_FRAME_POINTER
1182

1183
	IBRS_ENTER
1184
	UNTRAIN_RET
1185

1186
	/*
1187
	 * At this point we no longer need to worry about stack damage
1188
	 * due to nesting -- we're on the normal thread stack and we're
1189
	 * done with the NMI stack.
1190
	 */
1191

1192
	movq	%rsp, %rdi
1193
	call	exc_nmi
1194

1195
	/*
1196
	 * Return back to user mode.  We must *not* do the normal exit
1197
	 * work, because we don't want to enable interrupts.
1198
	 */
1199
	jmp	swapgs_restore_regs_and_return_to_usermode
1200

1201
.Lnmi_from_kernel:
1202
	/*
1203
	 * Here's what our stack frame will look like:
1204
	 * +---------------------------------------------------------+
1205
	 * | original SS                                             |
1206
	 * | original Return RSP                                     |
1207
	 * | original RFLAGS                                         |
1208
	 * | original CS                                             |
1209
	 * | original RIP                                            |
1210
	 * +---------------------------------------------------------+
1211
	 * | temp storage for rdx                                    |
1212
	 * +---------------------------------------------------------+
1213
	 * | "NMI executing" variable                                |
1214
	 * +---------------------------------------------------------+
1215
	 * | iret SS          } Copied from "outermost" frame        |
1216
	 * | iret Return RSP  } on each loop iteration; overwritten  |
1217
	 * | iret RFLAGS      } by a nested NMI to force another     |
1218
	 * | iret CS          } iteration if needed.                 |
1219
	 * | iret RIP         }                                      |
1220
	 * +---------------------------------------------------------+
1221
	 * | outermost SS          } initialized in first_nmi;       |
1222
	 * | outermost Return RSP  } will not be changed before      |
1223
	 * | outermost RFLAGS      } NMI processing is done.         |
1224
	 * | outermost CS          } Copied to "iret" frame on each  |
1225
	 * | outermost RIP         } iteration.                      |
1226
	 * +---------------------------------------------------------+
1227
	 * | pt_regs                                                 |
1228
	 * +---------------------------------------------------------+
1229
	 *
1230
	 * The "original" frame is used by hardware.  Before re-enabling
1231
	 * NMIs, we need to be done with it, and we need to leave enough
1232
	 * space for the asm code here.
1233
	 *
1234
	 * We return by executing IRET while RSP points to the "iret" frame.
1235
	 * That will either return for real or it will loop back into NMI
1236
	 * processing.
1237
	 *
1238
	 * The "outermost" frame is copied to the "iret" frame on each
1239
	 * iteration of the loop, so each iteration starts with the "iret"
1240
	 * frame pointing to the final return target.
1241
	 */
1242

1243
	/*
1244
	 * Determine whether we're a nested NMI.
1245
	 *
1246
	 * If we interrupted kernel code between repeat_nmi and
1247
	 * end_repeat_nmi, then we are a nested NMI.  We must not
1248
	 * modify the "iret" frame because it's being written by
1249
	 * the outer NMI.  That's okay; the outer NMI handler is
1250
	 * about to call exc_nmi() anyway, so we can just resume
1251
	 * the outer NMI.
1252
	 */
1253

1254
	/* 8(%rsp) is the interrupted RIP ((%rsp) holds the saved %rdx) */
	movq	$repeat_nmi, %rdx
1255
	cmpq	8(%rsp), %rdx
1256
	ja	1f
1257
	movq	$end_repeat_nmi, %rdx
1258
	cmpq	8(%rsp), %rdx
1259
	ja	nested_nmi_out
1260
1:
1261

1262
	/*
1263
	 * Now check "NMI executing".  If it's set, then we're nested.
1264
	 * This will not detect if we interrupted an outer NMI just
1265
	 * before IRET.
1266
	 */
1267
	cmpl	$1, -8(%rsp)
1268
	je	nested_nmi
1269

1270
	/*
1271
	 * Now test if the previous stack was an NMI stack.  This covers
1272
	 * the case where we interrupt an outer NMI after it clears
1273
	 * "NMI executing" but before IRET.  We need to be careful, though:
1274
	 * there is one case in which RSP could point to the NMI stack
1275
	 * despite there being no NMI active: naughty userspace controls
1276
	 * RSP at the very beginning of the SYSCALL targets.  We can
1277
	 * pull a fast one on naughty userspace, though: we program
1278
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
1279
	 * if it controls the kernel's RSP.  We set DF before we clear
1280
	 * "NMI executing".
1281
	 */
1282
	lea	6*8(%rsp), %rdx
1283
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
1284
	cmpq	%rdx, 4*8(%rsp)
1285
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
1286
	ja	first_nmi
1287

1288
	subq	$EXCEPTION_STKSZ, %rdx
1289
	cmpq	%rdx, 4*8(%rsp)
1290
	/* If it is below the NMI stack, it is a normal NMI */
1291
	jb	first_nmi
1292

1293
	/* Ah, it is within the NMI stack. */
1294

1295
	/* Byte 1 of the saved RFLAGS at 3*8(%rsp), hence the ">> 8" on the mask */
	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
1296
	jz	first_nmi	/* RSP was user controlled. */
1297

1298
	/* This is a nested NMI. */
1299

1300
nested_nmi:
1301
	/*
1302
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
1303
	 * iteration of NMI handling.
1304
	 */
1305
	/*
	 * Stack arithmetic, per the layout above: the subq moves %rsp from
	 * the saved-%rdx slot to "NMI executing", so the five pushes land in
	 * the "iret" frame just below it.  %rdx (10 slots below) is the
	 * address of "outermost RIP", which is where repeat_nmi expects RSP.
	 */
	subq	$8, %rsp
1306
	leaq	-10*8(%rsp), %rdx
1307
	pushq	$__KERNEL_DS
1308
	pushq	%rdx
1309
	pushfq
1310
	pushq	$__KERNEL_CS
1311
	pushq	$repeat_nmi
1312

1313
	/* Put stack back */
1314
	addq	$(6*8), %rsp
1315

1316
nested_nmi_out:
1317
	popq	%rdx
1318

1319
	/* We are returning to kernel mode, so this cannot result in a fault. */
1320
	iretq
1321

1322
first_nmi:
1323
	/* Restore rdx. */
1324
	movq	(%rsp), %rdx
1325

1326
	/* Make room for "NMI executing". */
1327
	pushq	$0
1328

1329
	/* Leave room for the "iret" frame */
1330
	subq	$(5*8), %rsp
1331

1332
	/* Copy the "original" frame to the "outermost" frame */
1333
	/*
	 * Each push lowers %rsp by 8, so the fixed 11*8 offset walks down the
	 * "original" frame one slot per iteration, from SS to RIP.
	 */
	.rept 5
1334
	pushq	11*8(%rsp)
1335
	.endr
1336
	UNWIND_HINT_IRET_REGS
1337

1338
	/* Everything up to here is safe from nested NMIs */
1339

1340
#ifdef CONFIG_DEBUG_ENTRY
1341
	/*
1342
	 * For ease of testing, unmask NMIs right away.  Disabled by
1343
	 * default because IRET is very expensive.
1344
	 */
1345
	pushq	$0		/* SS */
1346
	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
1347
	addq	$8, (%rsp)	/* Fix up RSP */
1348
	pushfq			/* RFLAGS */
1349
	pushq	$__KERNEL_CS	/* CS */
1350
	pushq	$1f		/* RIP */
1351
	iretq			/* continues at repeat_nmi below */
1352
	UNWIND_HINT_IRET_REGS
1353
1:
1354
#endif
1355

1356
repeat_nmi:
1357
	ANNOTATE_NOENDBR // this code
1358
	/*
1359
	 * If there was a nested NMI, the first NMI's iret will return
1360
	 * here. But NMIs are still enabled and we can take another
1361
	 * nested NMI. The nested NMI checks the interrupted RIP to see
1362
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
1363
	 * it will just return, as we are about to repeat an NMI anyway.
1364
	 * This makes it safe to copy to the stack frame that a nested
1365
	 * NMI will update.
1366
	 *
1367
	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
1368
	 * we're repeating an NMI, gsbase has the same value that it had on
1369
	 * the first iteration.  paranoid_entry will load the kernel
1370
	 * gsbase if needed before we call exc_nmi().  "NMI executing"
1371
	 * is zero.
1372
	 */
1373
	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */
1374

1375
	/*
1376
	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
1377
	 * here must not modify the "iret" frame while we're writing to
1378
	 * it or it will end up containing garbage.
1379
	 */
1380
	/*
	 * Point %rsp at "NMI executing" so the five pushes fill the "iret"
	 * frame directly below it; -6*8(%rsp) starts at "outermost SS" and
	 * follows %rsp down.  The final subq puts %rsp back on
	 * "outermost RIP".
	 */
	addq	$(10*8), %rsp
1381
	.rept 5
1382
	pushq	-6*8(%rsp)
1383
	.endr
1384
	subq	$(5*8), %rsp
1385
end_repeat_nmi:
1386
	ANNOTATE_NOENDBR // this code
1387

1388
	/*
1389
	 * Everything below this point can be preempted by a nested NMI.
1390
	 * If this happens, then the inner NMI will change the "iret"
1391
	 * frame to point back to repeat_nmi.
1392
	 */
1393
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
1394

1395
	/*
1396
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1397
	 * as we should not be calling schedule in NMI context.
1398
	 * Even with normal interrupts enabled. An NMI should not be
1399
	 * setting NEED_RESCHED or anything that normal interrupts and
1400
	 * exceptions might do.
1401
	 */
1402
	call	paranoid_entry
1403
	UNWIND_HINT_REGS
1404

1405
	movq	%rsp, %rdi
1406
	call	exc_nmi
1407

1408
	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
1409
	IBRS_EXIT save_reg=%r15
1410

1411
	PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
1412

1413
	/*
1414
	 * The above invocation of paranoid_entry stored the GSBASE
1415
	 * related information in R/EBX depending on the availability
1416
	 * of FSGSBASE.
1417
	 *
1418
	 * If FSGSBASE is enabled, restore the saved GSBASE value
1419
	 * unconditionally, otherwise take the conditional SWAPGS path.
1420
	 */
1421
	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
1422

1423
	wrgsbase	%rbx
1424
	jmp	nmi_restore
1425

1426
nmi_no_fsgsbase:
1427
	/* EBX == 0 -> invoke SWAPGS */
1428
	testl	%ebx, %ebx
1429
	jnz	nmi_restore
1430

1431
nmi_swapgs:
1432
	swapgs
1433

1434
nmi_restore:
1435
	POP_REGS
1436

1437
	/*
1438
	 * Skip orig_ax and the "outermost" frame to point RSP at the
1439
	 * "iret" frame.
1440
	 */
1441
	addq	$6*8, %rsp
1442

1443
	/*
1444
	 * Clear "NMI executing".  Set DF first so that we can easily
1445
	 * distinguish the remaining code between here and IRET from
1446
	 * the SYSCALL entry and exit paths.
1447
	 *
1448
	 * We arguably should just inspect RIP instead, but I (Andy) wrote
1449
	 * this code when I had the misapprehension that Xen PV supported
1450
	 * NMIs, and Xen PV would break that approach.
1451
	 */
1452
	std
1453
	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
1454

1455
	/*
1456
	 * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like
1457
	 * NMI in kernel after user state is restored. For an unprivileged user
1458
	 * these conditions are hard to meet.
1459
	 */
1460

1461
	/*
1462
	 * iretq reads the "iret" frame and exits the NMI stack in a
1463
	 * single instruction.  We are returning to kernel mode, so this
1464
	 * cannot result in a fault.  Similarly, we don't need to worry
1465
	 * about espfix64 on the way back to kernel mode.
1466
	 */
1467
	iretq
1468
SYM_CODE_END(asm_exc_nmi)
1469

1470
/*
1471
 * This handles SYSCALL from 32-bit code.  There is no way to program
1472
 * MSRs to fully disable 32-bit SYSCALL.
1473
 */
1474
/* Load -ENOSYS into %eax and go straight back with SYSRETL. */
SYM_CODE_START(entry_SYSCALL32_ignore)
1475
	UNWIND_HINT_END_OF_STACK
1476
	ENDBR
1477
	mov	$-ENOSYS, %eax
1478
	CLEAR_CPU_BUFFERS
1479
	sysretl
1480
SYM_CODE_END(entry_SYSCALL32_ignore)
1481

1482
/*
 * rewind_stack_and_make_dead: clear %rbp, reset %rsp to PTREGS_SIZE below
 * the per-CPU cpu_current_top_of_stack, and call make_task_dead.  Assembled
 * into .text rather than the surrounding section.
 *
 * NOTE(review): nothing follows the call, so make_task_dead presumably
 * never returns, and %rdi is not written here, so it presumably still
 * carries the caller's argument -- confirm both.
 */
.pushsection .text, "ax"
1483
	__FUNC_ALIGN
1484
SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
1485
	UNWIND_HINT_FUNC
1486
	/* Prevent any naive code from trying to unwind to our caller. */
1487
	xorl	%ebp, %ebp
1488

1489
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
1490
	/* Leave room for one pt_regs-sized area at the top of the stack */
	leaq	-PTREGS_SIZE(%rax), %rsp
1491
	UNWIND_HINT_REGS
1492

1493
	call	make_task_dead
1494
SYM_CODE_END(rewind_stack_and_make_dead)
1495
.popsection
1496

1497
/*
1498
 * This sequence executes branches in order to remove user branch information
1499
 * from the branch history tracker in the Branch Predictor, therefore removing
1500
 * user influence on subsequent BTB lookups.
1501
 *
1502
 * It should be used on parts prior to Alder Lake. Newer parts should use the
1503
 * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being
1504
 * virtualized on newer hardware the VMM should protect against BHI attacks by
1505
 * setting BHI_DIS_S for the guests.
1506
 *
1507
 * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging
1508
 * and not clearing the branch history. The call tree looks like:
1509
 *
1510
 * call 1
1511
 *    call 2
1512
 *      call 2
1513
 *        call 2
1514
 *          call 2
1515
 * 	      call 2
1516
 * 	      ret
1517
 * 	    ret
1518
 *        ret
1519
 *      ret
1520
 *    ret
1521
 * ret
1522
 *
1523
 * This means that the stack is non-constant and ORC can't unwind it with %rsp
1524
 * alone.  Therefore we unconditionally set up the frame pointer, which allows
1525
 * ORC to unwind properly.
1526
 *
1527
 * The alignment is for performance and not for safety, and may be safely
1528
 * refactored in the future if needed. The .skips are for safety, to ensure
1529
 * that all RETs are in the second half of a cacheline to mitigate Indirect
1530
 * Target Selection, rather than taking the slowpath via its_return_thunk.
1531
 */
1532
/*
 * Register use: %ecx counts the outer iterations (5) and %eax the inner
 * iterations (5 per outer iteration).  Clobbers %eax, %ecx and the flags;
 * %rbp is saved on entry and restored before the final RET.
 */
SYM_FUNC_START(clear_bhb_loop)
1533
	ANNOTATE_NOENDBR
1534
	push	%rbp
1535
	mov	%rsp, %rbp
1536
	movl	$5, %ecx
1537
	ANNOTATE_INTRA_FUNCTION_CALL
1538
	call	1f
1539
	/* Reached once the outermost call above has returned */
	jmp	5f
1540
	.align 64, 0xcc
1541
	/*
1542
	 * Shift instructions so that the RET is in the upper half of the
1543
	 * cacheline and don't take the slowpath to its_return_thunk.
1544
	 */
1545
	.skip 32 - (.Lret1 - 1f), 0xcc
1546
	ANNOTATE_INTRA_FUNCTION_CALL
1547
1:	call	2f
1548
.Lret1:	RET
1549
	.align 64, 0xcc
1550
	/*
1551
	 * As above shift instructions for RET at .Lret2 as well.
1552
	 *
1553
	 * This should ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
1554
	 * but some Clang versions (e.g. 18) don't like this.
1555
	 */
1556
	.skip 32 - 18, 0xcc
1557
2:	movl	$5, %eax
1558
	/* Inner loop: one taken jmp plus one jnz per iteration */
3:	jmp	4f
1559
	nop
1560
4:	sub	$1, %eax
1561
	jnz	3b
1562
	/* Outer loop: back to 1: for another nested call until %ecx hits 0 */
	sub	$1, %ecx
1563
	jnz	1b
1564
.Lret2:	RET
1565
5:	lfence
1566
	pop	%rbp
1567
	RET
1568
SYM_FUNC_END(clear_bhb_loop)
1569
EXPORT_SYMBOL_GPL(clear_bhb_loop)
1570
STACK_FRAME_NON_STANDARD(clear_bhb_loop)
1571

1572
Product

Resources

Company