GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/vmx/vmx.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Kernel-based Virtual Machine driver for Linux
4
*
5
* This module enables machines with Intel VT-x extensions to run virtual
6
* machines without emulation or binary translation.
7
*
8
* Copyright (C) 2006 Qumranet, Inc.
9
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
10
*
11
* Authors:
12
* Avi Kivity <[email protected]>
13
* Yaniv Kamay <[email protected]>
14
*/
15
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17
#include <linux/highmem.h>
18
#include <linux/hrtimer.h>
19
#include <linux/kernel.h>
20
#include <linux/kvm_host.h>
21
#include <linux/module.h>
22
#include <linux/moduleparam.h>
23
#include <linux/mod_devicetable.h>
24
#include <linux/mm.h>
25
#include <linux/objtool.h>
26
#include <linux/sched.h>
27
#include <linux/sched/smt.h>
28
#include <linux/slab.h>
29
#include <linux/tboot.h>
30
#include <linux/trace_events.h>
31
32
#include <asm/apic.h>
33
#include <asm/asm.h>
34
#include <asm/cpu.h>
35
#include <asm/cpu_device_id.h>
36
#include <asm/debugreg.h>
37
#include <asm/desc.h>
38
#include <asm/fpu/api.h>
39
#include <asm/fpu/xstate.h>
40
#include <asm/fred.h>
41
#include <asm/idtentry.h>
42
#include <asm/io.h>
43
#include <asm/irq_remapping.h>
44
#include <asm/reboot.h>
45
#include <asm/perf_event.h>
46
#include <asm/mmu_context.h>
47
#include <asm/mshyperv.h>
48
#include <asm/msr.h>
49
#include <asm/mwait.h>
50
#include <asm/spec-ctrl.h>
51
#include <asm/vmx.h>
52
53
#include <trace/events/ipi.h>
54
55
#include "capabilities.h"
56
#include "common.h"
57
#include "cpuid.h"
58
#include "hyperv.h"
59
#include "kvm_onhyperv.h"
60
#include "irq.h"
61
#include "kvm_cache_regs.h"
62
#include "lapic.h"
63
#include "mmu.h"
64
#include "nested.h"
65
#include "pmu.h"
66
#include "sgx.h"
67
#include "trace.h"
68
#include "vmcs.h"
69
#include "vmcs12.h"
70
#include "vmx.h"
71
#include "x86.h"
72
#include "x86_ops.h"
73
#include "smm.h"
74
#include "vmx_onhyperv.h"
75
#include "posted_intr.h"
76
77
#include "mmu/spte.h"
78
79
MODULE_AUTHOR("Qumranet");
80
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
81
MODULE_LICENSE("GPL");
82
83
#ifdef MODULE
84
static const struct x86_cpu_id vmx_cpu_id[] = {
85
X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
86
{}
87
};
88
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
89
#endif
90
91
bool __read_mostly enable_vpid = 1;
92
module_param_named(vpid, enable_vpid, bool, 0444);
93
94
static bool __read_mostly enable_vnmi = 1;
95
module_param_named(vnmi, enable_vnmi, bool, 0444);
96
97
bool __read_mostly flexpriority_enabled = 1;
98
module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
99
100
bool __read_mostly enable_ept = 1;
101
module_param_named(ept, enable_ept, bool, 0444);
102
103
bool __read_mostly enable_unrestricted_guest = 1;
104
module_param_named(unrestricted_guest,
105
enable_unrestricted_guest, bool, 0444);
106
107
bool __read_mostly enable_ept_ad_bits = 1;
108
module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
109
110
static bool __read_mostly emulate_invalid_guest_state = true;
111
module_param(emulate_invalid_guest_state, bool, 0444);
112
113
static bool __read_mostly fasteoi = 1;
114
module_param(fasteoi, bool, 0444);
115
116
module_param(enable_apicv, bool, 0444);
117
module_param(enable_ipiv, bool, 0444);
118
119
module_param(enable_device_posted_irqs, bool, 0444);
120
121
/*
122
* If nested=1, nested virtualization is supported, i.e., guests may use
123
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
124
* use VMX instructions.
125
*/
126
static bool __read_mostly nested = 1;
127
module_param(nested, bool, 0444);
128
129
bool __read_mostly enable_pml = 1;
130
module_param_named(pml, enable_pml, bool, 0444);
131
132
static bool __read_mostly error_on_inconsistent_vmcs_config = true;
133
module_param(error_on_inconsistent_vmcs_config, bool, 0444);
134
135
static bool __read_mostly dump_invalid_vmcs = 0;
136
module_param(dump_invalid_vmcs, bool, 0644);
137
138
#define MSR_BITMAP_MODE_X2APIC 1
139
#define MSR_BITMAP_MODE_X2APIC_APICV 2
140
141
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
142
143
/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
144
static int __read_mostly cpu_preemption_timer_multi;
145
static bool __read_mostly enable_preemption_timer = 1;
146
#ifdef CONFIG_X86_64
147
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
148
#endif
149
150
extern bool __read_mostly allow_smaller_maxphyaddr;
151
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
152
153
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
154
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
155
#define KVM_VM_CR0_ALWAYS_ON \
156
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
157
158
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
159
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
160
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
161
162
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
163
164
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
165
RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
166
RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
167
RTIT_STATUS_BYTECNT))
168
169
/*
170
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
171
* ple_gap: upper bound on the amount of time between two successive
172
* executions of PAUSE in a loop. Also indicates whether PLE is enabled.
173
* According to testing, this time is usually smaller than 128 cycles.
174
* ple_window: upper bound on the amount of time a guest is allowed to execute
175
* in a PAUSE loop. Tests indicate that most spinlocks are held for
176
* less than 2^12 cycles.
177
* Time is measured based on a counter that runs at the same rate as the TSC,
178
* refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
179
*/
180
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
181
module_param(ple_gap, uint, 0444);
182
183
static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
184
module_param(ple_window, uint, 0444);
185
186
/* Default doubles per-vcpu window every exit. */
187
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
188
module_param(ple_window_grow, uint, 0444);
189
190
/* Default resets per-vcpu window every exit to ple_window. */
191
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
192
module_param(ple_window_shrink, uint, 0444);
193
194
/* Default is to compute the maximum so we can never overflow. */
195
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
196
module_param(ple_window_max, uint, 0444);
197
198
/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
199
int __read_mostly pt_mode = PT_MODE_SYSTEM;
200
#ifdef CONFIG_BROKEN
201
module_param(pt_mode, int, S_IRUGO);
202
#endif
203
204
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
205
206
#ifdef CONFIG_CPU_MITIGATIONS
207
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
208
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
209
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
210
211
/* Storage for pre module init parameter parsing */
212
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
213
214
static const struct {
215
const char *option;
216
bool for_parse;
217
} vmentry_l1d_param[] = {
218
[VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
219
[VMENTER_L1D_FLUSH_NEVER] = {"never", true},
220
[VMENTER_L1D_FLUSH_COND] = {"cond", true},
221
[VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
222
[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
223
[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
224
};
225
226
#define L1D_CACHE_ORDER 4
227
static void *vmx_l1d_flush_pages;
228
229
static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
230
{
231
struct page *page;
232
unsigned int i;
233
234
if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
235
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
236
return 0;
237
}
238
239
if (!enable_ept) {
240
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
241
return 0;
242
}
243
244
if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
245
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
246
return 0;
247
}
248
249
/* If set to auto, use the default L1TF mitigation method */
250
if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
251
switch (l1tf_mitigation) {
252
case L1TF_MITIGATION_OFF:
253
l1tf = VMENTER_L1D_FLUSH_NEVER;
254
break;
255
case L1TF_MITIGATION_AUTO:
256
case L1TF_MITIGATION_FLUSH_NOWARN:
257
case L1TF_MITIGATION_FLUSH:
258
case L1TF_MITIGATION_FLUSH_NOSMT:
259
l1tf = VMENTER_L1D_FLUSH_COND;
260
break;
261
case L1TF_MITIGATION_FULL:
262
case L1TF_MITIGATION_FULL_FORCE:
263
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
264
break;
265
}
266
} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
267
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
268
}
269
270
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
271
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
272
/*
273
* This allocation for vmx_l1d_flush_pages is not tied to a VM
274
* lifetime and so should not be charged to a memcg.
275
*/
276
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
277
if (!page)
278
return -ENOMEM;
279
vmx_l1d_flush_pages = page_address(page);
280
281
/*
282
* Initialize each page with a different pattern in
283
* order to protect against KSM in the nested
284
* virtualization case.
285
*/
286
for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
287
memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
288
PAGE_SIZE);
289
}
290
}
291
292
l1tf_vmx_mitigation = l1tf;
293
294
if (l1tf != VMENTER_L1D_FLUSH_NEVER)
295
static_branch_enable(&vmx_l1d_should_flush);
296
else
297
static_branch_disable(&vmx_l1d_should_flush);
298
299
if (l1tf == VMENTER_L1D_FLUSH_COND)
300
static_branch_enable(&vmx_l1d_flush_cond);
301
else
302
static_branch_disable(&vmx_l1d_flush_cond);
303
return 0;
304
}
305
306
static int vmx_setup_l1d_flush(void)
307
{
308
/*
309
* Hand in the mitigation parameter value, which was stored by the pre
310
* module init parser. If no parameter was given, it will contain
311
* 'auto' which will be turned into the default 'cond' mitigation mode.
312
*/
313
return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
314
}
315
316
static void vmx_cleanup_l1d_flush(void)
317
{
318
if (vmx_l1d_flush_pages) {
319
free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
320
vmx_l1d_flush_pages = NULL;
321
}
322
/* Restore state so sysfs ignores VMX */
323
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
324
}
325
326
static int vmentry_l1d_flush_parse(const char *s)
327
{
328
unsigned int i;
329
330
if (s) {
331
for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
332
if (vmentry_l1d_param[i].for_parse &&
333
sysfs_streq(s, vmentry_l1d_param[i].option))
334
return i;
335
}
336
}
337
return -EINVAL;
338
}
339
340
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
341
{
342
int l1tf, ret;
343
344
l1tf = vmentry_l1d_flush_parse(s);
345
if (l1tf < 0)
346
return l1tf;
347
348
if (!boot_cpu_has(X86_BUG_L1TF))
349
return 0;
350
351
/*
352
* Has vmx_init() run already? If not then this is the pre init
353
* parameter parsing. In that case just store the value and let
354
* vmx_init() do the proper setup after enable_ept has been
355
* established.
356
*/
357
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
358
vmentry_l1d_flush_param = l1tf;
359
return 0;
360
}
361
362
mutex_lock(&vmx_l1d_flush_mutex);
363
ret = __vmx_setup_l1d_flush(l1tf);
364
mutex_unlock(&vmx_l1d_flush_mutex);
365
return ret;
366
}
367
368
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
369
{
370
if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
371
return sysfs_emit(s, "???\n");
372
373
return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
374
}
375
376
/*
377
* Software based L1D cache flush which is used when microcode providing
378
* the cache control MSR is not loaded.
379
*
380
* The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
381
* flush it is required to read in 64 KiB because the replacement algorithm
382
* is not exactly LRU. This could be sized at runtime via topology
383
* information but as all relevant affected CPUs have 32KiB L1D cache size
384
* there is no point in doing so.
385
*/
386
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
387
{
388
int size = PAGE_SIZE << L1D_CACHE_ORDER;
389
390
if (!static_branch_unlikely(&vmx_l1d_should_flush))
391
return;
392
393
/*
394
* This code is only executed when the flush mode is 'cond' or
395
* 'always'
396
*/
397
if (static_branch_likely(&vmx_l1d_flush_cond)) {
398
/*
399
* Clear the per-cpu flush bit, it gets set again if the vCPU
400
* is reloaded, i.e. if the vCPU is scheduled out or if KVM
401
* exits to userspace, or if KVM reaches one of the unsafe
402
* VMEXIT handlers, e.g. if KVM calls into the emulator,
403
* or from the interrupt handlers.
404
*/
405
if (!kvm_get_cpu_l1tf_flush_l1d())
406
return;
407
kvm_clear_cpu_l1tf_flush_l1d();
408
}
409
410
vcpu->stat.l1d_flush++;
411
412
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
413
native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
414
return;
415
}
416
417
asm volatile(
418
/* First ensure the pages are in the TLB */
419
"xorl %%eax, %%eax\n"
420
".Lpopulate_tlb:\n\t"
421
"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
422
"addl $4096, %%eax\n\t"
423
"cmpl %%eax, %[size]\n\t"
424
"jne .Lpopulate_tlb\n\t"
425
"xorl %%eax, %%eax\n\t"
426
"cpuid\n\t"
427
/* Now fill the cache */
428
"xorl %%eax, %%eax\n"
429
".Lfill_cache:\n"
430
"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
431
"addl $64, %%eax\n\t"
432
"cmpl %%eax, %[size]\n\t"
433
"jne .Lfill_cache\n\t"
434
"lfence\n"
435
:: [flush_pages] "r" (vmx_l1d_flush_pages),
436
[size] "r" (size)
437
: "eax", "ebx", "ecx", "edx");
438
}
439
440
#else /* CONFIG_CPU_MITIGATIONS */
441
static int vmx_setup_l1d_flush(void)
442
{
443
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
444
return 0;
445
}
446
static void vmx_cleanup_l1d_flush(void)
447
{
448
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
449
}
450
static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
451
{
452
453
}
454
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
455
{
456
pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
457
return 0;
458
}
459
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
460
{
461
return sysfs_emit(s, "never\n");
462
}
463
#endif
464
465
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
466
.set = vmentry_l1d_flush_set,
467
.get = vmentry_l1d_flush_get,
468
};
469
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
470
471
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
472
{
473
u64 msr;
474
475
if (!vmx->disable_fb_clear)
476
return;
477
478
msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
479
msr |= FB_CLEAR_DIS;
480
native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
481
/* Cache the MSR value to avoid reading it later */
482
vmx->msr_ia32_mcu_opt_ctrl = msr;
483
}
484
485
static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
486
{
487
if (!vmx->disable_fb_clear)
488
return;
489
490
vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
491
native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
492
}
493
494
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
495
{
496
/*
497
* Disable VERW's behavior of clearing CPU buffers for the guest if the
498
* CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
499
* the mitigation. Disabling the clearing behavior provides a
500
* performance boost for guests that aren't aware that manually clearing
501
* CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
502
* and VM-Exit.
503
*/
504
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
505
(kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
506
!boot_cpu_has_bug(X86_BUG_MDS) &&
507
!boot_cpu_has_bug(X86_BUG_TAA);
508
509
/*
510
* If the guest will not execute VERW, there is no need to set FB_CLEAR_DIS
511
* at VMEntry. Skip the MSR read/write when a guest has no use case to
512
* execute VERW.
513
*/
514
if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
515
((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
516
(vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
517
(vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
518
(vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
519
(vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
520
vmx->disable_fb_clear = false;
521
}
522
523
static u32 vmx_segment_access_rights(struct kvm_segment *var);
524
525
void vmx_vmexit(void);
526
527
#define vmx_insn_failed(fmt...) \
528
do { \
529
WARN_ONCE(1, fmt); \
530
pr_warn_ratelimited(fmt); \
531
} while (0)
532
533
noinline void vmread_error(unsigned long field)
534
{
535
vmx_insn_failed("vmread failed: field=%lx\n", field);
536
}
537
538
#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
539
noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
540
{
541
if (fault) {
542
kvm_spurious_fault();
543
} else {
544
instrumentation_begin();
545
vmread_error(field);
546
instrumentation_end();
547
}
548
}
549
#endif
550
551
noinline void vmwrite_error(unsigned long field, unsigned long value)
552
{
553
vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
554
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
555
}
556
557
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
558
{
559
vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
560
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
561
}
562
563
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
564
{
565
vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
566
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
567
}
568
569
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
570
{
571
vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
572
ext, vpid, gva);
573
}
574
575
noinline void invept_error(unsigned long ext, u64 eptp)
576
{
577
vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
578
}
579
580
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
581
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
582
/*
583
* We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is needed
584
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
585
*/
586
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
587
588
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
589
static DEFINE_SPINLOCK(vmx_vpid_lock);
590
591
struct vmcs_config vmcs_config __ro_after_init;
592
struct vmx_capability vmx_capability __ro_after_init;
593
594
#define VMX_SEGMENT_FIELD(seg) \
595
[VCPU_SREG_##seg] = { \
596
.selector = GUEST_##seg##_SELECTOR, \
597
.base = GUEST_##seg##_BASE, \
598
.limit = GUEST_##seg##_LIMIT, \
599
.ar_bytes = GUEST_##seg##_AR_BYTES, \
600
}
601
602
static const struct kvm_vmx_segment_field {
603
unsigned selector;
604
unsigned base;
605
unsigned limit;
606
unsigned ar_bytes;
607
} kvm_vmx_segment_fields[] = {
608
VMX_SEGMENT_FIELD(CS),
609
VMX_SEGMENT_FIELD(DS),
610
VMX_SEGMENT_FIELD(ES),
611
VMX_SEGMENT_FIELD(FS),
612
VMX_SEGMENT_FIELD(GS),
613
VMX_SEGMENT_FIELD(SS),
614
VMX_SEGMENT_FIELD(TR),
615
VMX_SEGMENT_FIELD(LDTR),
616
};
617
618
619
static unsigned long host_idt_base;
620
621
#if IS_ENABLED(CONFIG_HYPERV)
622
static bool __read_mostly enlightened_vmcs = true;
623
module_param(enlightened_vmcs, bool, 0444);
624
625
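/*
 * Wire up Hyper-V's direct TLB flush for L2: point the enlightened VMCS at
 * the partition assist page and opt in to the nested flush hypercall.
 */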
static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
626
{
627
struct hv_enlightened_vmcs *evmcs;
628
hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
629
630
if (partition_assist_page == INVALID_PAGE)
631
return -ENOMEM;
632
633
evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
634
635
evmcs->partition_assist_page = partition_assist_page;
636
evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
637
evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
638
639
return 0;
640
}
641
642
static __init void hv_init_evmcs(void)
643
{
644
int cpu;
645
646
if (!enlightened_vmcs)
647
return;
648
649
/*
650
* Enlightened VMCS usage should be recommended and the host needs
651
* to support eVMCS v1 or above.
652
*/
653
if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
654
(ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
655
KVM_EVMCS_VERSION) {
656
657
/* Check that we have assist pages on all online CPUs */
658
for_each_online_cpu(cpu) {
659
if (!hv_get_vp_assist_page(cpu)) {
660
enlightened_vmcs = false;
661
break;
662
}
663
}
664
665
if (enlightened_vmcs) {
666
pr_info("Using Hyper-V Enlightened VMCS\n");
667
static_branch_enable(&__kvm_is_using_evmcs);
668
}
669
670
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
671
vt_x86_ops.enable_l2_tlb_flush
672
= hv_enable_l2_tlb_flush;
673
} else {
674
enlightened_vmcs = false;
675
}
676
}
677
678
static void hv_reset_evmcs(void)
679
{
680
struct hv_vp_assist_page *vp_ap;
681
682
if (!kvm_is_using_evmcs())
683
return;
684
685
/*
686
* KVM should enable eVMCS if and only if all CPUs have a VP assist
687
* page, and should reject CPU onlining if eVMCS is enabled but the CPU
688
* doesn't have a VP assist page allocated.
689
*/
690
vp_ap = hv_get_vp_assist_page(smp_processor_id());
691
if (WARN_ON_ONCE(!vp_ap))
692
return;
693
694
/*
695
* Reset everything to support using non-enlightened VMCS access later
696
* (e.g. when we reload the module with enlightened_vmcs=0)
697
*/
698
vp_ap->nested_control.features.directhypercall = 0;
699
vp_ap->current_nested_vmcs = 0;
700
vp_ap->enlighten_vmentry = 0;
701
}
702
703
#else /* IS_ENABLED(CONFIG_HYPERV) */
704
static void hv_init_evmcs(void) {}
705
static void hv_reset_evmcs(void) {}
706
#endif /* IS_ENABLED(CONFIG_HYPERV) */
707
708
/*
709
* Comment's format: document - errata name - stepping - processor name.
710
* Taken from
711
* https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
712
*/
713
static u32 vmx_preemption_cpu_tfms[] = {
714
/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
715
0x000206E6,
716
/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
717
/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
718
/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
719
0x00020652,
720
/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
721
0x00020655,
722
/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
723
/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
724
/*
725
* 320767.pdf - AAP86 - B1 -
726
* i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
727
*/
728
0x000106E5,
729
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
730
0x000106A0,
731
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
732
0x000106A1,
733
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
734
0x000106A4,
735
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
736
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
737
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
738
0x000106A5,
739
/* Xeon E3-1220 V2 */
740
0x000306A8,
741
};
742
743
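/*
 * Check the boot CPU's family/model/stepping signature against the errata
 * list above to detect a broken VMX preemption timer.
 */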
static inline bool cpu_has_broken_vmx_preemption_timer(void)
744
{
745
u32 eax = cpuid_eax(0x00000001), i;
746
747
/* Clear the reserved bits */
748
eax &= ~(0x3U << 14 | 0xfU << 28);
749
for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
750
if (eax == vmx_preemption_cpu_tfms[i])
751
return true;
752
753
return false;
754
}
755
756
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
757
{
758
return flexpriority_enabled && lapic_in_kernel(vcpu);
759
}
760
761
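/*
 * Look up the guest user-return MSR slot for @msr; returns NULL if KVM
 * doesn't track the MSR via the user-return framework.
 */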
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
762
{
763
int i;
764
765
i = kvm_find_user_return_msr(msr);
766
if (i >= 0)
767
return &vmx->guest_uret_msrs[i];
768
return NULL;
769
}
770
771
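/*
 * Update the guest's value for a user-return MSR; if the MSR is currently
 * loaded into hardware, propagate the new value via the user-return
 * framework (with preemption disabled to protect the per-CPU state).
 */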
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
772
struct vmx_uret_msr *msr, u64 data)
773
{
774
unsigned int slot = msr - vmx->guest_uret_msrs;
775
int ret = 0;
776
777
if (msr->load_into_hardware) {
778
preempt_disable();
779
ret = kvm_set_user_return_msr(slot, data, msr->mask);
780
preempt_enable();
781
}
782
if (!ret)
783
msr->data = data;
784
return ret;
785
}
786
787
/*
788
* Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
789
*
790
* Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
791
* atomically track post-VMXON state, e.g. this may be called in NMI context.
792
* Eat all faults, as all other faults on VMXOFF are mode related, i.e.
793
* faults are guaranteed to be due to the !post-VMXON check unless the CPU is
794
* magically in RM, VM86, compat mode, or at CPL>0.
795
*/
796
static int kvm_cpu_vmxoff(void)
797
{
798
asm goto("1: vmxoff\n\t"
799
_ASM_EXTABLE(1b, %l[fault])
800
::: "cc", "memory" : fault);
801
802
cr4_clear_bits(X86_CR4_VMXE);
803
return 0;
804
805
fault:
806
cr4_clear_bits(X86_CR4_VMXE);
807
return -EIO;
808
}
809
810
void vmx_emergency_disable_virtualization_cpu(void)
811
{
812
int cpu = raw_smp_processor_id();
813
struct loaded_vmcs *v;
814
815
kvm_rebooting = true;
816
817
/*
818
* Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
819
* set in task context. If this races with VMX being disabled by an NMI,
820
* VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
821
* kvm_rebooting set.
822
*/
823
if (!(__read_cr4() & X86_CR4_VMXE))
824
return;
825
826
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
827
loaded_vmcss_on_cpu_link) {
828
vmcs_clear(v->vmcs);
829
if (v->shadow_vmcs)
830
vmcs_clear(v->shadow_vmcs);
831
}
832
833
kvm_cpu_vmxoff();
834
}
835
836
static void __loaded_vmcs_clear(void *arg)
837
{
838
struct loaded_vmcs *loaded_vmcs = arg;
839
int cpu = raw_smp_processor_id();
840
841
if (loaded_vmcs->cpu != cpu)
842
return; /* vcpu migration can race with cpu offline */
843
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
844
per_cpu(current_vmcs, cpu) = NULL;
845
846
vmcs_clear(loaded_vmcs->vmcs);
847
if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
848
vmcs_clear(loaded_vmcs->shadow_vmcs);
849
850
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
851
852
/*
853
* Ensure all writes to loaded_vmcs, including deleting it from its
854
* current percpu list, complete before setting loaded_vmcs->cpu to
855
* -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
856
* and add loaded_vmcs to its percpu list before it's deleted from this
857
* cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
858
*/
859
smp_wmb();
860
861
loaded_vmcs->cpu = -1;
862
loaded_vmcs->launched = 0;
863
}
864
865
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
866
{
867
int cpu = loaded_vmcs->cpu;
868
869
if (cpu != -1)
870
smp_call_function_single(cpu,
871
__loaded_vmcs_clear, loaded_vmcs, 1);
872
}
873
874
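/*
 * Returns true if the requested segment field is already cached, and marks
 * it as cached either way; callers read the field from the VMCS only on a
 * miss.  The cache is reset whenever VCPU_EXREG_SEGMENTS isn't marked
 * available.
 */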
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
875
unsigned field)
876
{
877
bool ret;
878
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
879
880
if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
881
kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
882
vmx->segment_cache.bitmask = 0;
883
}
884
ret = vmx->segment_cache.bitmask & mask;
885
vmx->segment_cache.bitmask |= mask;
886
return ret;
887
}
888
889
static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
890
{
891
u16 *p = &vmx->segment_cache.seg[seg].selector;
892
893
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
894
*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
895
return *p;
896
}
897
898
static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
899
{
900
ulong *p = &vmx->segment_cache.seg[seg].base;
901
902
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
903
*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
904
return *p;
905
}
906
907
static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
908
{
909
u32 *p = &vmx->segment_cache.seg[seg].limit;
910
911
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
912
*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
913
return *p;
914
}
915
916
static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
917
{
918
u32 *p = &vmx->segment_cache.seg[seg].ar;
919
920
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
921
*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
922
return *p;
923
}
924
925
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
926
{
927
u32 eb;
928
929
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
930
(1u << DB_VECTOR) | (1u << AC_VECTOR);
931
/*
932
* #VE isn't used for VMX. To test against unexpected changes
933
* related to #VE for VMX, intercept unexpected #VE and warn on it.
934
*/
935
if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
936
eb |= 1u << VE_VECTOR;
937
/*
938
* Guest access to VMware backdoor ports could legitimately
939
* trigger #GP because of TSS I/O permission bitmap.
940
* We intercept those #GP and allow access to them anyway
941
* as VMware does.
942
*/
943
if (enable_vmware_backdoor)
944
eb |= (1u << GP_VECTOR);
945
if ((vcpu->guest_debug &
946
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
947
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
948
eb |= 1u << BP_VECTOR;
949
if (to_vmx(vcpu)->rmode.vm86_active)
950
eb = ~0;
951
if (!vmx_need_pf_intercept(vcpu))
952
eb &= ~(1u << PF_VECTOR);
953
954
/* When we are running a nested L2 guest and L1 specified for it a
955
* certain exception bitmap, we must trap the same exceptions and pass
956
* them to L1. When running L2, we will only handle the exceptions
957
* specified above if L1 did not want them.
958
*/
959
if (is_guest_mode(vcpu))
960
eb |= get_vmcs12(vcpu)->exception_bitmap;
961
else {
962
int mask = 0, match = 0;
963
964
if (enable_ept && (eb & (1u << PF_VECTOR))) {
965
/*
966
* If EPT is enabled, #PF is currently only intercepted
967
* if MAXPHYADDR is smaller on the guest than on the
968
* host. In that case we only care about present,
969
* non-reserved faults. For vmcs02, however, PFEC_MASK
970
* and PFEC_MATCH are set in prepare_vmcs02_rare.
971
*/
972
mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
973
match = PFERR_PRESENT_MASK;
974
}
975
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
976
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
977
}
978
979
/*
980
* Disabling xfd interception indicates that dynamic xfeatures
981
* might be used in the guest. Always trap #NM in this case
982
* to save guest xfd_err timely.
983
*/
984
if (vcpu->arch.xfd_no_write_intercept)
985
eb |= (1u << NM_VECTOR);
986
987
vmcs_write32(EXCEPTION_BITMAP, eb);
988
}
989
990
/*
991
* Check if MSR is intercepted for currently loaded MSR bitmap.
992
*/
993
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
994
{
995
if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
996
return true;
997
998
return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
999
}
1000
1001
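/*
 * Compute the VMX_RUN_* flags for the upcoming VM-Enter, e.g. whether to
 * VMRESUME vs. VMLAUNCH and whether SPEC_CTRL needs to be read back after
 * VM-Exit.
 */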
unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
1002
{
1003
unsigned int flags = 0;
1004
1005
if (vmx->loaded_vmcs->launched)
1006
flags |= VMX_RUN_VMRESUME;
1007
1008
/*
1009
* If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
1010
* to change it directly without causing a vmexit. In that case read
1011
* it after vmexit and store it in vmx->spec_ctrl.
1012
*/
1013
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
1014
flags |= VMX_RUN_SAVE_SPEC_CTRL;
1015
1016
if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
1017
kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
1018
flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
1019
1020
return flags;
1021
}
1022
1023
static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1024
unsigned long entry, unsigned long exit)
1025
{
1026
vm_entry_controls_clearbit(vmx, entry);
1027
vm_exit_controls_clearbit(vmx, exit);
1028
}
1029
1030
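/*
 * Find the slot for @msr in a VM-Enter/VM-Exit MSR load/store list; returns
 * -ENOENT if the MSR isn't in the list.
 */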
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
1031
{
1032
unsigned int i;
1033
1034
for (i = 0; i < m->nr; ++i) {
1035
if (m->val[i].index == msr)
1036
return i;
1037
}
1038
return -ENOENT;
1039
}
1040
1041
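/*
 * Stop atomically switching @msr on VM-Enter/VM-Exit, either by clearing the
 * dedicated VMCS controls (EFER, PERF_GLOBAL_CTRL) or by removing the MSR
 * from the autoload lists.
 */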
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1042
{
1043
int i;
1044
struct msr_autoload *m = &vmx->msr_autoload;
1045
1046
switch (msr) {
1047
case MSR_EFER:
1048
if (cpu_has_load_ia32_efer()) {
1049
clear_atomic_switch_msr_special(vmx,
1050
VM_ENTRY_LOAD_IA32_EFER,
1051
VM_EXIT_LOAD_IA32_EFER);
1052
return;
1053
}
1054
break;
1055
case MSR_CORE_PERF_GLOBAL_CTRL:
1056
if (cpu_has_load_perf_global_ctrl()) {
1057
clear_atomic_switch_msr_special(vmx,
1058
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1059
VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1060
return;
1061
}
1062
break;
1063
}
1064
i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1065
if (i < 0)
1066
goto skip_guest;
1067
--m->guest.nr;
1068
m->guest.val[i] = m->guest.val[m->guest.nr];
1069
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1070
1071
skip_guest:
1072
i = vmx_find_loadstore_msr_slot(&m->host, msr);
1073
if (i < 0)
1074
return;
1075
1076
--m->host.nr;
1077
m->host.val[i] = m->host.val[m->host.nr];
1078
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1079
}
1080
1081
static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1082
unsigned long entry, unsigned long exit,
1083
unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1084
u64 guest_val, u64 host_val)
1085
{
1086
vmcs_write64(guest_val_vmcs, guest_val);
1087
if (host_val_vmcs != HOST_IA32_EFER)
1088
vmcs_write64(host_val_vmcs, host_val);
1089
vm_entry_controls_setbit(vmx, entry);
1090
vm_exit_controls_setbit(vmx, exit);
1091
}
1092
1093
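/*
 * Arrange for @msr to be switched atomically on VM-Enter (and VM-Exit unless
 * @entry_only), preferring the dedicated VMCS controls when available and
 * falling back to the autoload lists otherwise.
 */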
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1094
u64 guest_val, u64 host_val, bool entry_only)
1095
{
1096
int i, j = 0;
1097
struct msr_autoload *m = &vmx->msr_autoload;
1098
1099
switch (msr) {
1100
case MSR_EFER:
1101
if (cpu_has_load_ia32_efer()) {
1102
add_atomic_switch_msr_special(vmx,
1103
VM_ENTRY_LOAD_IA32_EFER,
1104
VM_EXIT_LOAD_IA32_EFER,
1105
GUEST_IA32_EFER,
1106
HOST_IA32_EFER,
1107
guest_val, host_val);
1108
return;
1109
}
1110
break;
1111
case MSR_CORE_PERF_GLOBAL_CTRL:
1112
if (cpu_has_load_perf_global_ctrl()) {
1113
add_atomic_switch_msr_special(vmx,
1114
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1115
VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1116
GUEST_IA32_PERF_GLOBAL_CTRL,
1117
HOST_IA32_PERF_GLOBAL_CTRL,
1118
guest_val, host_val);
1119
return;
1120
}
1121
break;
1122
case MSR_IA32_PEBS_ENABLE:
1123
/* PEBS needs a quiescent period after being disabled (to write
1124
* a record). Disabling PEBS through VMX MSR swapping doesn't
1125
* provide that period, so a CPU could write host's record into
1126
* guest's memory.
1127
*/
1128
wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
1129
}
1130
1131
i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1132
if (!entry_only)
1133
j = vmx_find_loadstore_msr_slot(&m->host, msr);
1134
1135
if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
1136
(j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
1137
printk_once(KERN_WARNING "Not enough msr switch entries. "
1138
"Can't add msr %x\n", msr);
1139
return;
1140
}
1141
if (i < 0) {
1142
i = m->guest.nr++;
1143
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1144
}
1145
m->guest.val[i].index = msr;
1146
m->guest.val[i].value = guest_val;
1147
1148
if (entry_only)
1149
return;
1150
1151
if (j < 0) {
1152
j = m->host.nr++;
1153
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1154
}
1155
m->host.val[j].index = msr;
1156
m->host.val[j].value = host_val;
1157
}
1158
1159
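/*
 * Decide how EFER is switched between guest and host.  Returns true if the
 * guest value should be loaded via the user-return MSR mechanism, false if
 * hardware switches it atomically (or no switch is needed at all).
 */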
static bool update_transition_efer(struct vcpu_vmx *vmx)
1160
{
1161
u64 guest_efer = vmx->vcpu.arch.efer;
1162
u64 ignore_bits = 0;
1163
int i;
1164
1165
/* Shadow paging assumes NX to be available. */
1166
if (!enable_ept)
1167
guest_efer |= EFER_NX;
1168
1169
/*
1170
* LMA and LME handled by hardware; SCE meaningless outside long mode.
1171
*/
1172
ignore_bits |= EFER_SCE;
1173
#ifdef CONFIG_X86_64
1174
ignore_bits |= EFER_LMA | EFER_LME;
1175
/* SCE is meaningful only in long mode on Intel */
1176
if (guest_efer & EFER_LMA)
1177
ignore_bits &= ~(u64)EFER_SCE;
1178
#endif
1179
1180
/*
1181
* On EPT, we can't emulate NX, so we must switch EFER atomically.
1182
* On CPUs that support "load IA32_EFER", always switch EFER
1183
* atomically, since it's faster than switching it manually.
1184
*/
1185
if (cpu_has_load_ia32_efer() ||
1186
(enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
1187
if (!(guest_efer & EFER_LMA))
1188
guest_efer &= ~EFER_LME;
1189
if (guest_efer != kvm_host.efer)
1190
add_atomic_switch_msr(vmx, MSR_EFER,
1191
guest_efer, kvm_host.efer, false);
1192
else
1193
clear_atomic_switch_msr(vmx, MSR_EFER);
1194
return false;
1195
}
1196
1197
i = kvm_find_user_return_msr(MSR_EFER);
1198
if (i < 0)
1199
return false;
1200
1201
clear_atomic_switch_msr(vmx, MSR_EFER);
1202
1203
guest_efer &= ~ignore_bits;
1204
guest_efer |= kvm_host.efer & ignore_bits;
1205
1206
vmx->guest_uret_msrs[i].data = guest_efer;
1207
vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1208
1209
return true;
1210
}
1211
1212
#ifdef CONFIG_X86_32
1213
/*
1214
* On 32-bit kernels, VM exits still load the FS and GS bases from the
1215
* VMCS rather than the segment table. KVM uses this helper to figure
1216
* out the current bases to poke them into the VMCS before entry.
1217
*/
1218
static unsigned long segment_base(u16 selector)
1219
{
1220
struct desc_struct *table;
1221
unsigned long v;
1222
1223
if (!(selector & ~SEGMENT_RPL_MASK))
1224
return 0;
1225
1226
table = get_current_gdt_ro();
1227
1228
if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1229
u16 ldt_selector = kvm_read_ldt();
1230
1231
if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1232
return 0;
1233
1234
table = (struct desc_struct *)segment_base(ldt_selector);
1235
}
1236
v = get_desc_base(&table[selector >> 3]);
1237
return v;
1238
}
1239
#endif
1240
1241
static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1242
{
1243
return vmx_pt_mode_is_host_guest() &&
1244
!(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1245
}
1246
1247
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1248
{
1249
/* The base must be 128-byte aligned and a legal physical address. */
1250
return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
1251
}
1252
1253
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1254
{
1255
u32 i;
1256
1257
wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
1258
wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1259
wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1260
wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1261
for (i = 0; i < addr_range; i++) {
1262
wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1263
wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1264
}
1265
}
1266
1267
static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1268
{
1269
u32 i;
1270
1271
rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
1272
rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1273
rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1274
rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1275
for (i = 0; i < addr_range; i++) {
1276
rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1277
rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1278
}
1279
}
1280
1281
static void pt_guest_enter(struct vcpu_vmx *vmx)
1282
{
1283
if (vmx_pt_mode_is_system())
1284
return;
1285
1286
/*
1287
* GUEST_IA32_RTIT_CTL is already set in the VMCS.
1288
* Save host state before VM entry.
1289
*/
1290
rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1291
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1292
wrmsrq(MSR_IA32_RTIT_CTL, 0);
1293
pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1294
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1295
}
1296
}
1297
1298
static void pt_guest_exit(struct vcpu_vmx *vmx)
1299
{
1300
if (vmx_pt_mode_is_system())
1301
return;
1302
1303
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1304
pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1305
pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1306
}
1307
1308
/*
1309
* KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1310
* i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
1311
*/
1312
if (vmx->pt_desc.host.ctl)
1313
wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1314
}
1315
1316
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1317
unsigned long fs_base, unsigned long gs_base)
1318
{
1319
if (unlikely(fs_sel != host->fs_sel)) {
1320
if (!(fs_sel & 7))
1321
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1322
else
1323
vmcs_write16(HOST_FS_SELECTOR, 0);
1324
host->fs_sel = fs_sel;
1325
}
1326
if (unlikely(gs_sel != host->gs_sel)) {
1327
if (!(gs_sel & 7))
1328
vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1329
else
1330
vmcs_write16(HOST_GS_SELECTOR, 0);
1331
host->gs_sel = gs_sel;
1332
}
1333
if (unlikely(fs_base != host->fs_base)) {
1334
vmcs_writel(HOST_FS_BASE, fs_base);
1335
host->fs_base = fs_base;
1336
}
1337
if (unlikely(gs_base != host->gs_base)) {
1338
vmcs_writel(HOST_GS_BASE, gs_base);
1339
host->gs_base = gs_base;
1340
}
1341
}
1342
1343
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1344
{
1345
struct vcpu_vmx *vmx = to_vmx(vcpu);
1346
struct vcpu_vt *vt = to_vt(vcpu);
1347
struct vmcs_host_state *host_state;
1348
#ifdef CONFIG_X86_64
1349
int cpu = raw_smp_processor_id();
1350
#endif
1351
unsigned long fs_base, gs_base;
1352
u16 fs_sel, gs_sel;
1353
int i;
1354
1355
/*
1356
* Note that guest MSRs to be saved/restored can also be changed
1357
* when guest state is loaded. This happens when guest transitions
1358
* to/from long-mode by setting MSR_EFER.LMA.
1359
*/
1360
if (!vmx->guest_uret_msrs_loaded) {
1361
vmx->guest_uret_msrs_loaded = true;
1362
for (i = 0; i < kvm_nr_uret_msrs; ++i) {
1363
if (!vmx->guest_uret_msrs[i].load_into_hardware)
1364
continue;
1365
1366
kvm_set_user_return_msr(i,
1367
vmx->guest_uret_msrs[i].data,
1368
vmx->guest_uret_msrs[i].mask);
1369
}
1370
}
1371
1372
if (vmx->nested.need_vmcs12_to_shadow_sync)
1373
nested_sync_vmcs12_to_shadow(vcpu);
1374
1375
if (vt->guest_state_loaded)
1376
return;
1377
1378
host_state = &vmx->loaded_vmcs->host_state;
1379
1380
/*
1381
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1382
* allow segment selectors with cpl > 0 or ti == 1.
1383
*/
1384
host_state->ldt_sel = kvm_read_ldt();
1385
1386
#ifdef CONFIG_X86_64
1387
savesegment(ds, host_state->ds_sel);
1388
savesegment(es, host_state->es_sel);
1389
1390
gs_base = cpu_kernelmode_gs_base(cpu);
1391
if (likely(is_64bit_mm(current->mm))) {
1392
current_save_fsgs();
1393
fs_sel = current->thread.fsindex;
1394
gs_sel = current->thread.gsindex;
1395
fs_base = current->thread.fsbase;
1396
vt->msr_host_kernel_gs_base = current->thread.gsbase;
1397
} else {
1398
savesegment(fs, fs_sel);
1399
savesegment(gs, gs_sel);
1400
fs_base = read_msr(MSR_FS_BASE);
1401
vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1402
}
1403
1404
wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1405
#else
1406
savesegment(fs, fs_sel);
1407
savesegment(gs, gs_sel);
1408
fs_base = segment_base(fs_sel);
1409
gs_base = segment_base(gs_sel);
1410
#endif
1411
1412
vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1413
vt->guest_state_loaded = true;
1414
}
1415
1416
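/*
 * Undo vmx_prepare_switch_to_guest(): save the guest's KERNEL_GS_BASE and
 * restore the host's segment registers, KERNEL_GS_BASE and GDT that were
 * deferred while guest state was loaded.
 */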
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1417
{
1418
struct vmcs_host_state *host_state;
1419
1420
if (!vmx->vt.guest_state_loaded)
1421
return;
1422
1423
host_state = &vmx->loaded_vmcs->host_state;
1424
1425
++vmx->vcpu.stat.host_state_reload;
1426
1427
#ifdef CONFIG_X86_64
1428
rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1429
#endif
1430
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1431
kvm_load_ldt(host_state->ldt_sel);
1432
#ifdef CONFIG_X86_64
1433
load_gs_index(host_state->gs_sel);
1434
#else
1435
loadsegment(gs, host_state->gs_sel);
1436
#endif
1437
}
1438
if (host_state->fs_sel & 7)
1439
loadsegment(fs, host_state->fs_sel);
1440
#ifdef CONFIG_X86_64
1441
if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1442
loadsegment(ds, host_state->ds_sel);
1443
loadsegment(es, host_state->es_sel);
1444
}
1445
#endif
1446
invalidate_tss_limit();
1447
#ifdef CONFIG_X86_64
1448
wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
1449
#endif
1450
load_fixmap_gdt(raw_smp_processor_id());
1451
vmx->vt.guest_state_loaded = false;
1452
vmx->guest_uret_msrs_loaded = false;
1453
}
1454
1455
#ifdef CONFIG_X86_64
1456
static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
1457
{
1458
preempt_disable();
1459
if (vmx->vt.guest_state_loaded)
1460
*cache = read_msr(msr);
1461
preempt_enable();
1462
return *cache;
1463
}
1464
1465
static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
1466
u64 *cache)
1467
{
1468
preempt_disable();
1469
if (vmx->vt.guest_state_loaded)
1470
wrmsrns(msr, data);
1471
preempt_enable();
1472
*cache = data;
1473
}
1474
1475
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1476
{
1477
return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
1478
&vmx->msr_guest_kernel_gs_base);
1479
}
1480
1481
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1482
{
1483
vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
1484
&vmx->msr_guest_kernel_gs_base);
1485
}
1486
#endif
1487
1488
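/*
 * Grow the per-vCPU PAUSE-Loop Exiting window (bounded by ple_window_max)
 * and mark it dirty so the updated value is propagated to the VMCS.
 */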
static void grow_ple_window(struct kvm_vcpu *vcpu)
1489
{
1490
struct vcpu_vmx *vmx = to_vmx(vcpu);
1491
unsigned int old = vmx->ple_window;
1492
1493
vmx->ple_window = __grow_ple_window(old, ple_window,
1494
ple_window_grow,
1495
ple_window_max);
1496
1497
if (vmx->ple_window != old) {
1498
vmx->ple_window_dirty = true;
1499
trace_kvm_ple_window_update(vcpu->vcpu_id,
1500
vmx->ple_window, old);
1501
}
1502
}
1503
1504
static void shrink_ple_window(struct kvm_vcpu *vcpu)
1505
{
1506
struct vcpu_vmx *vmx = to_vmx(vcpu);
1507
unsigned int old = vmx->ple_window;
1508
1509
vmx->ple_window = __shrink_ple_window(old, ple_window,
1510
ple_window_shrink,
1511
ple_window);
1512
1513
if (vmx->ple_window != old) {
1514
vmx->ple_window_dirty = true;
1515
trace_kvm_ple_window_update(vcpu->vcpu_id,
1516
vmx->ple_window, old);
1517
}
1518
}
1519
1520
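/*
 * Make the vCPU's VMCS current on @cpu: clear it on its previous pCPU if the
 * vCPU has migrated, add it to this CPU's loaded-VMCS list, VMPTRLD it if it
 * isn't already the current VMCS, and refresh host state (TSS, GDT) that
 * differs per CPU.
 */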
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
1521
{
1522
struct vcpu_vmx *vmx = to_vmx(vcpu);
1523
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1524
struct vmcs *prev;
1525
1526
if (!already_loaded) {
1527
loaded_vmcs_clear(vmx->loaded_vmcs);
1528
local_irq_disable();
1529
1530
/*
1531
* Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1532
* this cpu's percpu list, otherwise it may not yet be deleted
1533
* from its previous cpu's percpu list. Pairs with the
1534
* smp_wmb() in __loaded_vmcs_clear().
1535
*/
1536
smp_rmb();
1537
1538
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1539
&per_cpu(loaded_vmcss_on_cpu, cpu));
1540
local_irq_enable();
1541
}
1542
1543
prev = per_cpu(current_vmcs, cpu);
1544
if (prev != vmx->loaded_vmcs->vmcs) {
1545
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1546
vmcs_load(vmx->loaded_vmcs->vmcs);
1547
}
1548
1549
if (!already_loaded) {
1550
void *gdt = get_current_gdt_ro();
1551
1552
/*
1553
* Flush all EPTP/VPID contexts, the new pCPU may have stale
1554
* TLB entries from its previous association with the vCPU.
1555
*/
1556
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1557
1558
/*
1559
* Linux uses per-cpu TSS and GDT, so set these when switching
1560
* processors. See 22.2.4.
1561
*/
1562
vmcs_writel(HOST_TR_BASE,
1563
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1564
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
1565
1566
if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
1567
/* 22.2.3 */
1568
vmcs_writel(HOST_IA32_SYSENTER_ESP,
1569
(unsigned long)(cpu_entry_stack(cpu) + 1));
1570
}
1571
1572
vmx->loaded_vmcs->cpu = cpu;
1573
}
1574
}
1575
1576
/*
1577
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
1578
* vcpu mutex is already taken.
1579
*/
1580
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1581
{
1582
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
1583
shrink_ple_window(vcpu);
1584
1585
vmx_vcpu_load_vmcs(vcpu, cpu);
1586
1587
vmx_vcpu_pi_load(vcpu, cpu);
1588
}
1589
1590
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1591
{
1592
vmx_vcpu_pi_put(vcpu);
1593
1594
vmx_prepare_switch_to_host(to_vmx(vcpu));
1595
}
1596
1597
bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1598
{
1599
return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1600
}
1601
1602
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1603
{
1604
struct vcpu_vmx *vmx = to_vmx(vcpu);
1605
unsigned long rflags, save_rflags;
1606
1607
if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1608
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1609
rflags = vmcs_readl(GUEST_RFLAGS);
1610
if (vmx->rmode.vm86_active) {
1611
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1612
save_rflags = vmx->rmode.save_rflags;
1613
rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1614
}
1615
vmx->rflags = rflags;
1616
}
1617
return vmx->rflags;
1618
}
1619
1620
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1621
{
1622
struct vcpu_vmx *vmx = to_vmx(vcpu);
1623
unsigned long old_rflags;
1624
1625
/*
1626
* Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
1627
* is an unrestricted guest in order to mark L2 as needing emulation
1628
* if L1 runs L2 as a restricted guest.
1629
*/
1630
if (is_unrestricted_guest(vcpu)) {
1631
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1632
vmx->rflags = rflags;
1633
vmcs_writel(GUEST_RFLAGS, rflags);
1634
return;
1635
}
1636
1637
old_rflags = vmx_get_rflags(vcpu);
1638
vmx->rflags = rflags;
1639
if (vmx->rmode.vm86_active) {
1640
vmx->rmode.save_rflags = rflags;
1641
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1642
}
1643
vmcs_writel(GUEST_RFLAGS, rflags);
1644
1645
if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1646
vmx->vt.emulation_required = vmx_emulation_required(vcpu);
1647
}
1648
1649
bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1650
{
1651
return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1652
}
1653
1654
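/*
 * Translate the guest's STI/MOV-SS interruptibility state into the
 * KVM_X86_SHADOW_INT_* flags used by the rest of KVM.
 */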
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1655
{
1656
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1657
int ret = 0;
1658
1659
if (interruptibility & GUEST_INTR_STATE_STI)
1660
ret |= KVM_X86_SHADOW_INT_STI;
1661
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1662
ret |= KVM_X86_SHADOW_INT_MOV_SS;
1663
1664
return ret;
1665
}
1666
1667
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1668
{
1669
u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1670
u32 interruptibility = interruptibility_old;
1671
1672
interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1673
1674
if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1675
interruptibility |= GUEST_INTR_STATE_MOV_SS;
1676
else if (mask & KVM_X86_SHADOW_INT_STI)
1677
interruptibility |= GUEST_INTR_STATE_STI;
1678
1679
if ((interruptibility != interruptibility_old))
1680
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1681
}
1682
1683
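/*
 * Validate a guest WRMSR to IA32_RTIT_CTL against the virtualized Intel PT
 * capabilities; returns 1 if the write should #GP, 0 if it is legal.
 */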
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1684
{
1685
struct vcpu_vmx *vmx = to_vmx(vcpu);
1686
unsigned long value;
1687
1688
/*
1689
* Any MSR write that attempts to change bits marked reserved will
1690
* cause a #GP fault.
1691
*/
1692
if (data & vmx->pt_desc.ctl_bitmask)
1693
return 1;
1694
1695
/*
1696
* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1697
* result in a #GP unless the same write also clears TraceEn.
1698
*/
1699
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1700
(data & RTIT_CTL_TRACEEN) &&
1701
data != vmx->pt_desc.guest.ctl)
1702
return 1;
1703
1704
/*
1705
* A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA
1706
* and FabricEn will cause a #GP if
1707
* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1708
*/
1709
if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1710
!(data & RTIT_CTL_FABRIC_EN) &&
1711
!intel_pt_validate_cap(vmx->pt_desc.caps,
1712
PT_CAP_single_range_output))
1713
return 1;
1714
1715
/*
1716
* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
1717
* utilize encodings marked reserved will cause a #GP fault.
1718
*/
1719
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1720
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1721
!test_bit((data & RTIT_CTL_MTC_RANGE) >>
1722
RTIT_CTL_MTC_RANGE_OFFSET, &value))
1723
return 1;
1724
value = intel_pt_validate_cap(vmx->pt_desc.caps,
1725
PT_CAP_cycle_thresholds);
1726
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1727
!test_bit((data & RTIT_CTL_CYC_THRESH) >>
1728
RTIT_CTL_CYC_THRESH_OFFSET, &value))
1729
return 1;
1730
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1731
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1732
!test_bit((data & RTIT_CTL_PSB_FREQ) >>
1733
RTIT_CTL_PSB_FREQ_OFFSET, &value))
1734
return 1;
1735
1736
/*
1737
* If an ADDRx_CFG field is reserved or its encoding is greater than 2,
1738
* the write will cause a #GP fault.
1739
*/
1740
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1741
if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
1742
return 1;
1743
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1744
if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
1745
return 1;
1746
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1747
if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
1748
return 1;
1749
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1750
if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
1751
return 1;
1752
1753
return 0;
1754
}
1755
1756
int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1757
void *insn, int insn_len)
1758
{
1759
/*
1760
* Emulation of instructions in SGX enclaves is impossible as RIP does
1761
* not point at the failing instruction, and even if it did, the code
1762
* stream is inaccessible. Inject #UD instead of exiting to userspace
1763
* so that guest userspace can't DoS the guest simply by triggering
1764
* emulation (enclaves are CPL3 only).
1765
*/
1766
if (vmx_get_exit_reason(vcpu).enclave_mode) {
1767
kvm_queue_exception(vcpu, UD_VECTOR);
1768
return X86EMUL_PROPAGATE_FAULT;
1769
}
1770
1771
/* Check that emulation is possible during event vectoring */
1772
if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
1773
!kvm_can_emulate_event_vectoring(emul_type))
1774
return X86EMUL_UNHANDLEABLE_VECTORING;
1775
1776
return X86EMUL_CONTINUE;
1777
}
1778
1779
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1780
{
1781
union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
1782
unsigned long rip, orig_rip;
1783
u32 instr_len;
1784
1785
/*
1786
* Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1787
* undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1788
* set when EPT misconfig occurs. In practice, real hardware updates
1789
* VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1790
* (namely Hyper-V) don't set it due to it being undefined behavior,
1791
* i.e. we end up advancing IP with some random value.
1792
*/
1793
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1794
exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1795
instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1796
1797
/*
1798
* Emulating an enclave's instructions isn't supported as KVM
1799
* cannot access the enclave's memory or its true RIP, e.g. the
1800
* vmcs.GUEST_RIP points at the exit point of the enclave, not
1801
* the RIP that actually triggered the VM-Exit. But, because
1802
* most instructions that cause VM-Exit will #UD in an enclave,
1803
* most instruction-based VM-Exits simply do not occur.
1804
*
1805
* There are a few exceptions, notably the debug instructions
1806
* INT1ICEBRK and INT3, as they are allowed in debug enclaves
1807
* and generate #DB/#BP as expected, which KVM might intercept.
1808
* But again, the CPU does the dirty work and saves an instr
1809
* length of zero so VMMs don't shoot themselves in the foot.
1810
* WARN if KVM tries to skip a non-zero length instruction on
1811
* a VM-Exit from an enclave.
1812
*/
1813
if (!instr_len)
1814
goto rip_updated;
1815
1816
WARN_ONCE(exit_reason.enclave_mode,
1817
"skipping instruction after SGX enclave VM-Exit");
1818
1819
orig_rip = kvm_rip_read(vcpu);
1820
rip = orig_rip + instr_len;
1821
#ifdef CONFIG_X86_64
1822
/*
1823
* We need to mask out the high 32 bits of RIP if not in 64-bit
1824
* mode, but just finding out that we are in 64-bit mode is
1825
* quite expensive. Only do it if there was a carry.
1826
*/
1827
if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1828
rip = (u32)rip;
1829
#endif
1830
kvm_rip_write(vcpu, rip);
1831
} else {
1832
if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1833
return 0;
1834
}
1835
1836
rip_updated:
1837
/* skipping an emulated instruction also counts */
1838
vmx_set_interrupt_shadow(vcpu, 0);
1839
1840
return 1;
1841
}
1842
1843
/*
1844
* Recognizes a pending MTF VM-exit and records the nested state for later
1845
* delivery.
1846
*/
1847
void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1848
{
1849
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1850
struct vcpu_vmx *vmx = to_vmx(vcpu);
1851
1852
if (!is_guest_mode(vcpu))
1853
return;
1854
1855
/*
1856
* Per the SDM, MTF takes priority over debug-trap exceptions besides
1857
* TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
1858
* or ICEBP (in the emulator proper), and skipping of ICEBP after an
1859
* intercepted #DB deliberately avoids single-step #DB and MTF updates
1860
* as ICEBP is higher priority than both. As instruction emulation is
1861
* completed at this point (i.e. KVM is at the instruction boundary),
1862
* any #DB exception pending delivery must be a debug-trap of lower
1863
* priority than MTF. Record the pending MTF state to be delivered in
1864
* vmx_check_nested_events().
1865
*/
1866
if (nested_cpu_has_mtf(vmcs12) &&
1867
(!vcpu->arch.exception.pending ||
1868
vcpu->arch.exception.vector == DB_VECTOR) &&
1869
(!vcpu->arch.exception_vmexit.pending ||
1870
vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1871
vmx->nested.mtf_pending = true;
1872
kvm_make_request(KVM_REQ_EVENT, vcpu);
1873
} else {
1874
vmx->nested.mtf_pending = false;
1875
}
1876
}
1877
1878
int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1879
{
1880
vmx_update_emulated_instruction(vcpu);
1881
return skip_emulated_instruction(vcpu);
1882
}
1883
1884
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1885
{
1886
/*
1887
* Ensure that we clear the HLT state in the VMCS. We don't need to
1888
* explicitly skip the instruction because if the HLT state is set,
1889
* then the instruction is already executing and RIP has already been
1890
* advanced.
1891
*/
1892
if (kvm_hlt_in_guest(vcpu->kvm) &&
1893
vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1894
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1895
}
1896
1897
void vmx_inject_exception(struct kvm_vcpu *vcpu)
1898
{
1899
struct kvm_queued_exception *ex = &vcpu->arch.exception;
1900
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
1901
struct vcpu_vmx *vmx = to_vmx(vcpu);
1902
1903
kvm_deliver_exception_payload(vcpu, ex);
1904
1905
if (ex->has_error_code) {
1906
/*
1907
* Despite the error code being architecturally defined as 32
1908
* bits, and the VMCS field being 32 bits, Intel CPUs and thus
1909
* VMX don't actually support setting bits 31:16. Hardware
1910
* will (should) never provide a bogus error code, but AMD CPUs
1911
* do generate error codes with bits 31:16 set, and so KVM's
1912
* ABI lets userspace shove in arbitrary 32-bit values. Drop
1913
* the upper bits to avoid VM-Fail; losing information that
1914
* doesn't really exist is preferable to killing the VM.
1915
*/
1916
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
1917
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1918
}
1919
1920
if (vmx->rmode.vm86_active) {
1921
int inc_eip = 0;
1922
if (kvm_exception_is_soft(ex->vector))
1923
inc_eip = vcpu->arch.event_exit_inst_len;
1924
kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
1925
return;
1926
}
1927
1928
WARN_ON_ONCE(vmx->vt.emulation_required);
1929
1930
if (kvm_exception_is_soft(ex->vector)) {
1931
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1932
vmx->vcpu.arch.event_exit_inst_len);
1933
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1934
} else
1935
intr_info |= INTR_TYPE_HARD_EXCEPTION;
1936
1937
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1938
1939
vmx_clear_hlt(vcpu);
1940
}
1941
1942
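/*
 * Mark a single user-return MSR as needing (or not needing) to be loaded
 * into hardware for the guest; no-op if the MSR isn't in the uret MSR list.
 */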
static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1943
bool load_into_hardware)
1944
{
1945
struct vmx_uret_msr *uret_msr;
1946
1947
uret_msr = vmx_find_uret_msr(vmx, msr);
1948
if (!uret_msr)
1949
return;
1950
1951
uret_msr->load_into_hardware = load_into_hardware;
1952
}
1953
1954
/*
1955
* Configure user return MSRs to automatically save, load, and restore MSRs
1956
* that need to be shoved into hardware when running the guest. Note, omitting
1957
* an MSR here does _NOT_ mean it's not emulated, only that it will not be
1958
* loaded into hardware when running the guest.
1959
*/
1960
static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1961
{
1962
#ifdef CONFIG_X86_64
1963
bool load_syscall_msrs;
1964
1965
/*
1966
* The SYSCALL MSRs are only needed on long mode guests, and only
1967
* when EFER.SCE is set.
1968
*/
1969
load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1970
(vmx->vcpu.arch.efer & EFER_SCE);
1971
1972
vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1973
vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1974
vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
1975
#endif
1976
vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
1977
1978
vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1979
guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1980
guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
1981
1982
/*
1983
* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1984
* kernel and old userspace. If those guests run on a tsx=off host, do
1985
* allow guests to use TSX_CTRL, but don't change the value in hardware
1986
* so that TSX remains always disabled.
1987
*/
1988
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1989
1990
/*
1991
* The set of MSRs to load may have changed, reload MSRs before the
1992
* next VM-Enter.
1993
*/
1994
vmx->guest_uret_msrs_loaded = false;
1995
}
1996
1997
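/* Return the TSC offset L1 applies to L2, or 0 if TSC offsetting is unused. */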
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1998
{
1999
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2000
2001
if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
2002
return vmcs12->tsc_offset;
2003
2004
return 0;
2005
}
2006
2007
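/*
 * Return the TSC multiplier L1 applies to L2, or the default scaling ratio
 * if L1 doesn't use TSC scaling for L2.
 */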
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
2008
{
2009
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2010
2011
if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
2012
nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
2013
return vmcs12->tsc_multiplier;
2014
2015
return kvm_caps.default_tsc_scaling_ratio;
2016
}
2017
2018
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
2019
{
2020
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2021
}
2022
2023
void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
2024
{
2025
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2026
}
2027
2028
/*
2029
* Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
2030
* guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
2031
* backwards compatibility even though KVM doesn't support emulating SMX. And
2032
* because userspace can set "VMX in SMX", the guest must also be allowed to set it,
2033
* e.g. if the MSR is left unlocked and the guest does a RMW operation.
2034
*/
2035
#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
2036
FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
2037
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
2038
FEAT_CTL_SGX_LC_ENABLED | \
2039
FEAT_CTL_SGX_ENABLED | \
2040
FEAT_CTL_LMCE_ENABLED)
2041
2042
static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
2043
struct msr_data *msr)
2044
{
2045
uint64_t valid_bits;
2046
2047
/*
2048
* Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
2049
* exposed to the guest.
2050
*/
2051
WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
2052
~KVM_SUPPORTED_FEATURE_CONTROL);
2053
2054
if (!msr->host_initiated &&
2055
(vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
2056
return false;
2057
2058
if (msr->host_initiated)
2059
valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
2060
else
2061
valid_bits = vmx->msr_ia32_feature_control_valid_bits;
2062
2063
return !(msr->data & ~valid_bits);
2064
}
2065
2066
int vmx_get_feature_msr(u32 msr, u64 *data)
2067
{
2068
switch (msr) {
2069
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2070
if (!nested)
2071
return 1;
2072
return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
2073
default:
2074
return KVM_MSR_RET_UNSUPPORTED;
2075
}
2076
}
2077
2078
/*
2079
* Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
2080
* Returns 0 on success, non-0 otherwise.
2081
* Assumes vcpu_load() was already called.
2082
*/
2083
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2084
{
2085
struct vcpu_vmx *vmx = to_vmx(vcpu);
2086
struct vmx_uret_msr *msr;
2087
u32 index;
2088
2089
switch (msr_info->index) {
2090
#ifdef CONFIG_X86_64
2091
case MSR_FS_BASE:
2092
msr_info->data = vmcs_readl(GUEST_FS_BASE);
2093
break;
2094
case MSR_GS_BASE:
2095
msr_info->data = vmcs_readl(GUEST_GS_BASE);
2096
break;
2097
case MSR_KERNEL_GS_BASE:
2098
msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
2099
break;
2100
#endif
2101
case MSR_EFER:
2102
return kvm_get_msr_common(vcpu, msr_info);
2103
case MSR_IA32_TSX_CTRL:
2104
if (!msr_info->host_initiated &&
2105
!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2106
return 1;
2107
goto find_uret_msr;
2108
case MSR_IA32_UMWAIT_CONTROL:
2109
if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2110
return 1;
2111
2112
msr_info->data = vmx->msr_ia32_umwait_control;
2113
break;
2114
case MSR_IA32_SPEC_CTRL:
2115
if (!msr_info->host_initiated &&
2116
!guest_has_spec_ctrl_msr(vcpu))
2117
return 1;
2118
2119
msr_info->data = to_vmx(vcpu)->spec_ctrl;
2120
break;
2121
case MSR_IA32_SYSENTER_CS:
2122
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2123
break;
2124
case MSR_IA32_SYSENTER_EIP:
2125
msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2126
break;
2127
case MSR_IA32_SYSENTER_ESP:
2128
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2129
break;
2130
case MSR_IA32_BNDCFGS:
2131
if (!kvm_mpx_supported() ||
2132
(!msr_info->host_initiated &&
2133
!guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
2134
return 1;
2135
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2136
break;
2137
case MSR_IA32_MCG_EXT_CTL:
2138
if (!msr_info->host_initiated &&
2139
!(vmx->msr_ia32_feature_control &
2140
FEAT_CTL_LMCE_ENABLED))
2141
return 1;
2142
msr_info->data = vcpu->arch.mcg_ext_ctl;
2143
break;
2144
case MSR_IA32_FEAT_CTL:
2145
msr_info->data = vmx->msr_ia32_feature_control;
2146
break;
2147
case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2148
if (!msr_info->host_initiated &&
2149
!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
2150
return 1;
2151
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2152
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2153
break;
2154
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2155
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
2156
return 1;
2157
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2158
&msr_info->data))
2159
return 1;
2160
#ifdef CONFIG_KVM_HYPERV
2161
/*
2162
* Enlightened VMCS v1 doesn't have certain VMCS fields but
2163
* instead of just ignoring the features, different Hyper-V
2164
* versions are either trying to use them and fail or do some
2165
* sanity checking and refuse to boot. Filter all unsupported
2166
* features out.
2167
*/
2168
if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
2169
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
2170
&msr_info->data);
2171
#endif
2172
break;
2173
case MSR_IA32_RTIT_CTL:
2174
if (!vmx_pt_mode_is_host_guest())
2175
return 1;
2176
msr_info->data = vmx->pt_desc.guest.ctl;
2177
break;
2178
case MSR_IA32_RTIT_STATUS:
2179
if (!vmx_pt_mode_is_host_guest())
2180
return 1;
2181
msr_info->data = vmx->pt_desc.guest.status;
2182
break;
2183
case MSR_IA32_RTIT_CR3_MATCH:
2184
if (!vmx_pt_mode_is_host_guest() ||
2185
!intel_pt_validate_cap(vmx->pt_desc.caps,
2186
PT_CAP_cr3_filtering))
2187
return 1;
2188
msr_info->data = vmx->pt_desc.guest.cr3_match;
2189
break;
2190
case MSR_IA32_RTIT_OUTPUT_BASE:
2191
if (!vmx_pt_mode_is_host_guest() ||
2192
(!intel_pt_validate_cap(vmx->pt_desc.caps,
2193
PT_CAP_topa_output) &&
2194
!intel_pt_validate_cap(vmx->pt_desc.caps,
2195
PT_CAP_single_range_output)))
2196
return 1;
2197
msr_info->data = vmx->pt_desc.guest.output_base;
2198
break;
2199
case MSR_IA32_RTIT_OUTPUT_MASK:
2200
if (!vmx_pt_mode_is_host_guest() ||
2201
(!intel_pt_validate_cap(vmx->pt_desc.caps,
2202
PT_CAP_topa_output) &&
2203
!intel_pt_validate_cap(vmx->pt_desc.caps,
2204
PT_CAP_single_range_output)))
2205
return 1;
2206
msr_info->data = vmx->pt_desc.guest.output_mask;
2207
break;
2208
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2209
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2210
if (!vmx_pt_mode_is_host_guest() ||
2211
(index >= 2 * vmx->pt_desc.num_address_ranges))
2212
return 1;
2213
if (index % 2)
2214
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2215
else
2216
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2217
break;
2218
case MSR_IA32_S_CET:
2219
msr_info->data = vmcs_readl(GUEST_S_CET);
2220
break;
2221
case MSR_KVM_INTERNAL_GUEST_SSP:
2222
msr_info->data = vmcs_readl(GUEST_SSP);
2223
break;
2224
case MSR_IA32_INT_SSP_TAB:
2225
msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
2226
break;
2227
case MSR_IA32_DEBUGCTLMSR:
2228
msr_info->data = vmx_guest_debugctl_read();
2229
break;
2230
default:
2231
find_uret_msr:
2232
msr = vmx_find_uret_msr(vmx, msr_info->index);
2233
if (msr) {
2234
msr_info->data = msr->data;
2235
break;
2236
}
2237
return kvm_get_msr_common(vcpu, msr_info);
2238
}
2239
2240
return 0;
2241
}
2242
2243
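/*
 * Truncate a SYSENTER address MSR value to 32 bits if the vCPU doesn't
 * support long mode; otherwise keep the full (unsigned long) value.
 */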
static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2244
u64 data)
2245
{
2246
#ifdef CONFIG_X86_64
2247
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
2248
return (u32)data;
2249
#endif
2250
return (unsigned long)data;
2251
}
2252
2253
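/*
 * Build the mask of DEBUGCTL bits KVM supports for this vCPU. Host-initiated
 * accesses get everything the platform supports; guest accesses are further
 * gated on the relevant guest capabilities (BUS_LOCK_DETECT, LBRs, RTM).
 */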
u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2254
{
2255
u64 debugctl = 0;
2256
2257
if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2258
(host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2259
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
2260
2261
if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
2262
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
2263
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2264
2265
if (boot_cpu_has(X86_FEATURE_RTM) &&
2266
(host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
2267
debugctl |= DEBUGCTLMSR_RTM_DEBUG;
2268
2269
return debugctl;
2270
}
2271
2272
bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
2273
{
2274
u64 invalid;
2275
2276
invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
2277
if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
2278
kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
2279
invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
2280
}
2281
return !invalid;
2282
}
2283
2284
/*
2285
* Writes msr value into the appropriate "register".
2286
* Returns 0 on success, non-0 otherwise.
2287
* Assumes vcpu_load() was already called.
2288
*/
2289
int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2290
{
2291
struct vcpu_vmx *vmx = to_vmx(vcpu);
2292
struct vmx_uret_msr *msr;
2293
int ret = 0;
2294
u32 msr_index = msr_info->index;
2295
u64 data = msr_info->data;
2296
u32 index;
2297
2298
switch (msr_index) {
2299
case MSR_EFER:
2300
ret = kvm_set_msr_common(vcpu, msr_info);
2301
break;
2302
#ifdef CONFIG_X86_64
2303
case MSR_FS_BASE:
2304
vmx_segment_cache_clear(vmx);
2305
vmcs_writel(GUEST_FS_BASE, data);
2306
break;
2307
case MSR_GS_BASE:
2308
vmx_segment_cache_clear(vmx);
2309
vmcs_writel(GUEST_GS_BASE, data);
2310
break;
2311
case MSR_KERNEL_GS_BASE:
2312
vmx_write_guest_kernel_gs_base(vmx, data);
2313
break;
2314
case MSR_IA32_XFD:
2315
ret = kvm_set_msr_common(vcpu, msr_info);
2316
/*
2317
* Always intercepting WRMSR could incur non-negligible
2318
* overhead given xfd might be changed frequently in
2319
* guest context switch. Disable write interception
2320
* upon the first write with a non-zero value (indicating
2321
* potential usage on dynamic xfeatures). Also update
2322
* exception bitmap to trap #NM for proper virtualization
2323
* of guest xfd_err.
2324
*/
2325
if (!ret && data) {
2326
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2327
MSR_TYPE_RW);
2328
vcpu->arch.xfd_no_write_intercept = true;
2329
vmx_update_exception_bitmap(vcpu);
2330
}
2331
break;
2332
#endif
2333
case MSR_IA32_SYSENTER_CS:
2334
if (is_guest_mode(vcpu))
2335
get_vmcs12(vcpu)->guest_sysenter_cs = data;
2336
vmcs_write32(GUEST_SYSENTER_CS, data);
2337
break;
2338
case MSR_IA32_SYSENTER_EIP:
2339
if (is_guest_mode(vcpu)) {
2340
data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2341
get_vmcs12(vcpu)->guest_sysenter_eip = data;
2342
}
2343
vmcs_writel(GUEST_SYSENTER_EIP, data);
2344
break;
2345
case MSR_IA32_SYSENTER_ESP:
2346
if (is_guest_mode(vcpu)) {
2347
data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2348
get_vmcs12(vcpu)->guest_sysenter_esp = data;
2349
}
2350
vmcs_writel(GUEST_SYSENTER_ESP, data);
2351
break;
2352
case MSR_IA32_DEBUGCTLMSR:
2353
if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
2354
return 1;
2355
2356
data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
2357
2358
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2359
VM_EXIT_SAVE_DEBUG_CONTROLS)
2360
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2361
2362
vmx_guest_debugctl_write(vcpu, data);
2363
2364
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2365
(data & DEBUGCTLMSR_LBR))
2366
intel_pmu_create_guest_lbr_event(vcpu);
2367
return 0;
2368
case MSR_IA32_BNDCFGS:
2369
if (!kvm_mpx_supported() ||
2370
(!msr_info->host_initiated &&
2371
!guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
2372
return 1;
2373
if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
2374
(data & MSR_IA32_BNDCFGS_RSVD))
2375
return 1;
2376
2377
if (is_guest_mode(vcpu) &&
2378
((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2379
(vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2380
get_vmcs12(vcpu)->guest_bndcfgs = data;
2381
2382
vmcs_write64(GUEST_BNDCFGS, data);
2383
break;
2384
case MSR_IA32_UMWAIT_CONTROL:
2385
if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2386
return 1;
2387
2388
/* Reserved bit 1 and bits 63:32 must be zero. */
2389
if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2390
return 1;
2391
2392
vmx->msr_ia32_umwait_control = data;
2393
break;
2394
case MSR_IA32_SPEC_CTRL:
2395
if (!msr_info->host_initiated &&
2396
!guest_has_spec_ctrl_msr(vcpu))
2397
return 1;
2398
2399
if (kvm_spec_ctrl_test_value(data))
2400
return 1;
2401
2402
vmx->spec_ctrl = data;
2403
if (!data)
2404
break;
2405
2406
/*
2407
* For non-nested:
2408
* When it's written (to non-zero) for the first time, pass
2409
* it through.
2410
*
2411
* For nested:
2412
* The handling of the MSR bitmap for L2 guests is done in
2413
* nested_vmx_prepare_msr_bitmap. We should not touch the
2414
* vmcs02.msr_bitmap here since it gets completely overwritten
2415
* in the merging. We update the vmcs01 here for L1 as well
2416
* since it will end up touching the MSR anyway now.
2417
*/
2418
vmx_disable_intercept_for_msr(vcpu,
2419
MSR_IA32_SPEC_CTRL,
2420
MSR_TYPE_RW);
2421
break;
2422
case MSR_IA32_TSX_CTRL:
2423
if (!msr_info->host_initiated &&
2424
!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2425
return 1;
2426
if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2427
return 1;
2428
goto find_uret_msr;
2429
case MSR_IA32_CR_PAT:
2430
ret = kvm_set_msr_common(vcpu, msr_info);
2431
if (ret)
2432
break;
2433
2434
if (is_guest_mode(vcpu) &&
2435
get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2436
get_vmcs12(vcpu)->guest_ia32_pat = data;
2437
2438
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
2439
vmcs_write64(GUEST_IA32_PAT, data);
2440
break;
2441
case MSR_IA32_MCG_EXT_CTL:
2442
if ((!msr_info->host_initiated &&
2443
!(to_vmx(vcpu)->msr_ia32_feature_control &
2444
FEAT_CTL_LMCE_ENABLED)) ||
2445
(data & ~MCG_EXT_CTL_LMCE_EN))
2446
return 1;
2447
vcpu->arch.mcg_ext_ctl = data;
2448
break;
2449
case MSR_IA32_FEAT_CTL:
2450
if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
2451
return 1;
2452
2453
vmx->msr_ia32_feature_control = data;
2454
if (msr_info->host_initiated && data == 0)
2455
vmx_leave_nested(vcpu);
2456
2457
/* SGX may be enabled/disabled by guest's firmware */
2458
vmx_write_encls_bitmap(vcpu, NULL);
2459
break;
2460
case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2461
/*
2462
* On real hardware, the LE hash MSRs are writable before
2463
* the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2464
* at which point SGX related bits in IA32_FEATURE_CONTROL
2465
* become writable.
2466
*
2467
* KVM does not emulate SGX activation for simplicity, so
2468
* allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2469
* is unlocked. This is technically not architectural
2470
* behavior, but it's close enough.
2471
*/
2472
if (!msr_info->host_initiated &&
2473
(!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
2474
((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2475
!(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2476
return 1;
2477
vmx->msr_ia32_sgxlepubkeyhash
2478
[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2479
break;
2480
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2481
if (!msr_info->host_initiated)
2482
return 1; /* they are read-only */
2483
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
2484
return 1;
2485
return vmx_set_vmx_msr(vcpu, msr_index, data);
2486
case MSR_IA32_RTIT_CTL:
2487
if (!vmx_pt_mode_is_host_guest() ||
2488
vmx_rtit_ctl_check(vcpu, data) ||
2489
vmx->nested.vmxon)
2490
return 1;
2491
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2492
vmx->pt_desc.guest.ctl = data;
2493
pt_update_intercept_for_msr(vcpu);
2494
break;
2495
case MSR_IA32_RTIT_STATUS:
2496
if (!pt_can_write_msr(vmx))
2497
return 1;
2498
if (data & MSR_IA32_RTIT_STATUS_MASK)
2499
return 1;
2500
vmx->pt_desc.guest.status = data;
2501
break;
2502
case MSR_IA32_RTIT_CR3_MATCH:
2503
if (!pt_can_write_msr(vmx))
2504
return 1;
2505
if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2506
PT_CAP_cr3_filtering))
2507
return 1;
2508
vmx->pt_desc.guest.cr3_match = data;
2509
break;
2510
case MSR_IA32_RTIT_OUTPUT_BASE:
2511
if (!pt_can_write_msr(vmx))
2512
return 1;
2513
if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2514
PT_CAP_topa_output) &&
2515
!intel_pt_validate_cap(vmx->pt_desc.caps,
2516
PT_CAP_single_range_output))
2517
return 1;
2518
if (!pt_output_base_valid(vcpu, data))
2519
return 1;
2520
vmx->pt_desc.guest.output_base = data;
2521
break;
2522
case MSR_IA32_RTIT_OUTPUT_MASK:
2523
if (!pt_can_write_msr(vmx))
2524
return 1;
2525
if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2526
PT_CAP_topa_output) &&
2527
!intel_pt_validate_cap(vmx->pt_desc.caps,
2528
PT_CAP_single_range_output))
2529
return 1;
2530
vmx->pt_desc.guest.output_mask = data;
2531
break;
2532
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2533
if (!pt_can_write_msr(vmx))
2534
return 1;
2535
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2536
if (index >= 2 * vmx->pt_desc.num_address_ranges)
2537
return 1;
2538
if (is_noncanonical_msr_address(data, vcpu))
2539
return 1;
2540
if (index % 2)
2541
vmx->pt_desc.guest.addr_b[index / 2] = data;
2542
else
2543
vmx->pt_desc.guest.addr_a[index / 2] = data;
2544
break;
2545
case MSR_IA32_S_CET:
2546
vmcs_writel(GUEST_S_CET, data);
2547
break;
2548
case MSR_KVM_INTERNAL_GUEST_SSP:
2549
vmcs_writel(GUEST_SSP, data);
2550
break;
2551
case MSR_IA32_INT_SSP_TAB:
2552
vmcs_writel(GUEST_INTR_SSP_TABLE, data);
2553
break;
2554
case MSR_IA32_PERF_CAPABILITIES:
2555
if (data & PERF_CAP_LBR_FMT) {
2556
if ((data & PERF_CAP_LBR_FMT) !=
2557
(kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
2558
return 1;
2559
if (!cpuid_model_is_consistent(vcpu))
2560
return 1;
2561
}
2562
if (data & PERF_CAP_PEBS_FORMAT) {
2563
if ((data & PERF_CAP_PEBS_MASK) !=
2564
(kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
2565
return 1;
2566
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
2567
return 1;
2568
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
2569
return 1;
2570
if (!cpuid_model_is_consistent(vcpu))
2571
return 1;
2572
}
2573
ret = kvm_set_msr_common(vcpu, msr_info);
2574
break;
2575
2576
default:
2577
find_uret_msr:
2578
msr = vmx_find_uret_msr(vmx, msr_index);
2579
if (msr)
2580
ret = vmx_set_guest_uret_msr(vmx, msr, data);
2581
else
2582
ret = kvm_set_msr_common(vcpu, msr_info);
2583
}
2584
2585
/* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2586
if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2587
vmx_update_fb_clear_dis(vcpu, vmx);
2588
2589
return ret;
2590
}
2591
2592
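/*
 * Import a register or control register from the VMCS into KVM's register
 * cache; guest-owned CR0/CR4 bits are refreshed from hardware while
 * KVM-owned bits are preserved.
 */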
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2593
{
2594
unsigned long guest_owned_bits;
2595
2596
kvm_register_mark_available(vcpu, reg);
2597
2598
switch (reg) {
2599
case VCPU_REGS_RSP:
2600
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2601
break;
2602
case VCPU_REGS_RIP:
2603
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2604
break;
2605
case VCPU_EXREG_PDPTR:
2606
if (enable_ept)
2607
ept_save_pdptrs(vcpu);
2608
break;
2609
case VCPU_EXREG_CR0:
2610
guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2611
2612
vcpu->arch.cr0 &= ~guest_owned_bits;
2613
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2614
break;
2615
case VCPU_EXREG_CR3:
2616
/*
2617
* When intercepting CR3 loads, e.g. for shadowing paging, KVM's
2618
* CR3 is loaded into hardware, not the guest's CR3.
2619
*/
2620
if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2621
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2622
break;
2623
case VCPU_EXREG_CR4:
2624
guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2625
2626
vcpu->arch.cr4 &= ~guest_owned_bits;
2627
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2628
break;
2629
default:
2630
KVM_BUG_ON(1, vcpu->kvm);
2631
break;
2632
}
2633
}
2634
2635
/*
2636
* There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2637
* directly instead of going through cpu_has(), to ensure KVM is trapping
2638
* ENCLS whenever it's supported in hardware. It does not matter whether
2639
* the host OS supports or has enabled SGX.
2640
*/
2641
static bool cpu_has_sgx(void)
2642
{
2643
return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2644
}
2645
2646
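/*
 * Adjust the requested VMX controls against the allowed-0/allowed-1 settings
 * reported by the given capability MSR. Fails with -EIO if any required
 * (minimum) control can't be set.
 */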
static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
2647
{
2648
u32 vmx_msr_low, vmx_msr_high;
2649
u32 ctl = ctl_min | ctl_opt;
2650
2651
rdmsr(msr, vmx_msr_low, vmx_msr_high);
2652
2653
ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2654
ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2655
2656
/* Ensure minimum (required) set of control bits are supported. */
2657
if (ctl_min & ~ctl)
2658
return -EIO;
2659
2660
*result = ctl;
2661
return 0;
2662
}
2663
2664
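/*
 * 64-bit control fields (e.g. tertiary exec controls) have no required bits;
 * simply mask the optional controls against what the capability MSR allows.
 */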
static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
2665
{
2666
u64 allowed;
2667
2668
rdmsrq(msr, allowed);
2669
2670
return ctl_opt & allowed;
2671
}
2672
2673
#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \
2674
({ \
2675
int i, r = 0; \
2676
\
2677
BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
2678
BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \
2679
\
2680
for (i = 0; i < ARRAY_SIZE(pairs); i++) { \
2681
typeof(entry_controls) n_ctrl = pairs[i].entry_control; \
2682
typeof(exit_controls) x_ctrl = pairs[i].exit_control; \
2683
\
2684
if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \
2685
continue; \
2686
\
2687
pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \
2688
"entry = %llx (%llx), exit = %llx (%llx)\n", \
2689
(u64)(entry_controls & n_ctrl), (u64)n_ctrl, \
2690
(u64)(exit_controls & x_ctrl), (u64)x_ctrl); \
2691
\
2692
if (error_on_inconsistent_vmcs_config) \
2693
r = -EIO; \
2694
\
2695
entry_controls &= ~n_ctrl; \
2696
exit_controls &= ~x_ctrl; \
2697
} \
2698
r; \
2699
})
2700
2701
static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2702
struct vmx_capability *vmx_cap)
2703
{
2704
u32 _pin_based_exec_control = 0;
2705
u32 _cpu_based_exec_control = 0;
2706
u32 _cpu_based_2nd_exec_control = 0;
2707
u64 _cpu_based_3rd_exec_control = 0;
2708
u32 _vmexit_control = 0;
2709
u32 _vmentry_control = 0;
2710
u64 basic_msr;
2711
u64 misc_msr;
2712
2713
/*
2714
* LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2715
* SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2716
* intercepts writes to PAT and EFER, i.e. never enables those controls.
2717
*/
2718
struct {
2719
u32 entry_control;
2720
u32 exit_control;
2721
} const vmcs_entry_exit_pairs[] = {
2722
{ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2723
{ VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
2724
{ VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
2725
{ VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
2726
{ VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
2727
{ VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE },
2728
};
2729
2730
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2731
2732
if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2733
KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2734
MSR_IA32_VMX_PROCBASED_CTLS,
2735
&_cpu_based_exec_control))
2736
return -EIO;
2737
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2738
if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2739
KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2740
MSR_IA32_VMX_PROCBASED_CTLS2,
2741
&_cpu_based_2nd_exec_control))
2742
return -EIO;
2743
}
2744
if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
2745
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
2746
2747
#ifndef CONFIG_X86_64
2748
if (!(_cpu_based_2nd_exec_control &
2749
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2750
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2751
#endif
2752
2753
if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2754
_cpu_based_2nd_exec_control &= ~(
2755
SECONDARY_EXEC_APIC_REGISTER_VIRT |
2756
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2757
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2758
2759
rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2760
&vmx_cap->ept, &vmx_cap->vpid);
2761
2762
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2763
vmx_cap->ept) {
2764
pr_warn_once("EPT CAP should not exist if not support "
2765
"1-setting enable EPT VM-execution control\n");
2766
2767
if (error_on_inconsistent_vmcs_config)
2768
return -EIO;
2769
2770
vmx_cap->ept = 0;
2771
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
2772
}
2773
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2774
vmx_cap->vpid) {
2775
pr_warn_once("VPID CAP should not exist if not support "
2776
"1-setting enable VPID VM-execution control\n");
2777
2778
if (error_on_inconsistent_vmcs_config)
2779
return -EIO;
2780
2781
vmx_cap->vpid = 0;
2782
}
2783
2784
if (!cpu_has_sgx())
2785
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2786
2787
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2788
_cpu_based_3rd_exec_control =
2789
adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2790
MSR_IA32_VMX_PROCBASED_CTLS3);
2791
2792
if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2793
KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2794
MSR_IA32_VMX_EXIT_CTLS,
2795
&_vmexit_control))
2796
return -EIO;
2797
2798
if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2799
KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2800
MSR_IA32_VMX_PINBASED_CTLS,
2801
&_pin_based_exec_control))
2802
return -EIO;
2803
2804
if (cpu_has_broken_vmx_preemption_timer())
2805
_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2806
if (!(_cpu_based_2nd_exec_control &
2807
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2808
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2809
2810
if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2811
KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2812
MSR_IA32_VMX_ENTRY_CTLS,
2813
&_vmentry_control))
2814
return -EIO;
2815
2816
if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
2817
_vmentry_control, _vmexit_control))
2818
return -EIO;
2819
2820
/*
2821
* Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2822
* can't be used due to an errata where VM Exit may incorrectly clear
2823
* IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
2824
* MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2825
*/
2826
switch (boot_cpu_data.x86_vfm) {
2827
case INTEL_NEHALEM_EP: /* AAK155 */
2828
case INTEL_NEHALEM: /* AAP115 */
2829
case INTEL_WESTMERE: /* AAT100 */
2830
case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
2831
case INTEL_NEHALEM_EX: /* BA97 */
2832
_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2833
_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2834
pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2835
"does not work properly. Using workaround\n");
2836
break;
2837
default:
2838
break;
2839
}
2840
2841
rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
2842
2843
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2844
if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
2845
return -EIO;
2846
2847
#ifdef CONFIG_X86_64
2848
/*
2849
* KVM expects to be able to shove all legal physical addresses into
2850
* VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
2851
* 0 for processors that support Intel 64 architecture".
2852
*/
2853
if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
2854
return -EIO;
2855
#endif
2856
2857
/* Require Write-Back (WB) memory type for VMCS accesses. */
2858
if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
2859
return -EIO;
2860
2861
rdmsrq(MSR_IA32_VMX_MISC, misc_msr);
2862
2863
vmcs_conf->basic = basic_msr;
2864
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2865
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2866
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2867
vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2868
vmcs_conf->vmexit_ctrl = _vmexit_control;
2869
vmcs_conf->vmentry_ctrl = _vmentry_control;
2870
vmcs_conf->misc = misc_msr;
2871
2872
#if IS_ENABLED(CONFIG_HYPERV)
2873
if (enlightened_vmcs)
2874
evmcs_sanitize_exec_ctrls(vmcs_conf);
2875
#endif
2876
2877
return 0;
2878
}
2879
2880
static bool __kvm_is_vmx_supported(void)
2881
{
2882
int cpu = smp_processor_id();
2883
2884
if (!(cpuid_ecx(1) & feature_bit(VMX))) {
2885
pr_err("VMX not supported by CPU %d\n", cpu);
2886
return false;
2887
}
2888
2889
if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2890
!this_cpu_has(X86_FEATURE_VMX)) {
2891
pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2892
return false;
2893
}
2894
2895
return true;
2896
}
2897
2898
static bool kvm_is_vmx_supported(void)
2899
{
2900
bool supported;
2901
2902
migrate_disable();
2903
supported = __kvm_is_vmx_supported();
2904
migrate_enable();
2905
2906
return supported;
2907
}
2908
2909
int vmx_check_processor_compat(void)
2910
{
2911
int cpu = raw_smp_processor_id();
2912
struct vmcs_config vmcs_conf;
2913
struct vmx_capability vmx_cap;
2914
2915
if (!__kvm_is_vmx_supported())
2916
return -EIO;
2917
2918
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2919
pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
2920
return -EIO;
2921
}
2922
if (nested)
2923
nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
2924
if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2925
pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
2926
return -EIO;
2927
}
2928
return 0;
2929
}
2930
2931
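/*
 * Enter VMX operation on the current CPU: set CR4.VMXE and execute VMXON
 * with the given VMXON region. If VMXON faults, undo CR4.VMXE and dump
 * MSR_IA32_FEAT_CTL to help diagnose the failure.
 */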
static int kvm_cpu_vmxon(u64 vmxon_pointer)
2932
{
2933
u64 msr;
2934
2935
cr4_set_bits(X86_CR4_VMXE);
2936
2937
asm goto("1: vmxon %[vmxon_pointer]\n\t"
2938
_ASM_EXTABLE(1b, %l[fault])
2939
: : [vmxon_pointer] "m"(vmxon_pointer)
2940
: : fault);
2941
return 0;
2942
2943
fault:
2944
WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2945
rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2946
cr4_clear_bits(X86_CR4_VMXE);
2947
2948
return -EFAULT;
2949
}
2950
2951
int vmx_enable_virtualization_cpu(void)
2952
{
2953
int cpu = raw_smp_processor_id();
2954
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2955
int r;
2956
2957
if (cr4_read_shadow() & X86_CR4_VMXE)
2958
return -EBUSY;
2959
2960
/*
2961
* This can happen if we hot-added a CPU but failed to allocate
2962
* VP assist page for it.
2963
*/
2964
if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
2965
return -EFAULT;
2966
2967
intel_pt_handle_vmx(1);
2968
2969
r = kvm_cpu_vmxon(phys_addr);
2970
if (r) {
2971
intel_pt_handle_vmx(0);
2972
return r;
2973
}
2974
2975
return 0;
2976
}
2977
2978
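/* VMCLEAR all VMCSes that are currently loaded on this CPU. */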
static void vmclear_local_loaded_vmcss(void)
2979
{
2980
int cpu = raw_smp_processor_id();
2981
struct loaded_vmcs *v, *n;
2982
2983
list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2984
loaded_vmcss_on_cpu_link)
2985
__loaded_vmcs_clear(v);
2986
}
2987
2988
void vmx_disable_virtualization_cpu(void)
2989
{
2990
vmclear_local_loaded_vmcss();
2991
2992
if (kvm_cpu_vmxoff())
2993
kvm_spurious_fault();
2994
2995
hv_reset_evmcs();
2996
2997
intel_pt_handle_vmx(0);
2998
}
2999
3000
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
3001
{
3002
int node = cpu_to_node(cpu);
3003
struct page *pages;
3004
struct vmcs *vmcs;
3005
3006
pages = __alloc_pages_node(node, flags, 0);
3007
if (!pages)
3008
return NULL;
3009
vmcs = page_address(pages);
3010
memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
3011
3012
/* KVM supports Enlightened VMCS v1 only */
3013
if (kvm_is_using_evmcs())
3014
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
3015
else
3016
vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
3017
3018
if (shadow)
3019
vmcs->hdr.shadow_vmcs = 1;
3020
return vmcs;
3021
}
3022
3023
void free_vmcs(struct vmcs *vmcs)
3024
{
3025
free_page((unsigned long)vmcs);
3026
}
3027
3028
/*
3029
* Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3030
*/
3031
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3032
{
3033
if (!loaded_vmcs->vmcs)
3034
return;
3035
loaded_vmcs_clear(loaded_vmcs);
3036
free_vmcs(loaded_vmcs->vmcs);
3037
loaded_vmcs->vmcs = NULL;
3038
if (loaded_vmcs->msr_bitmap)
3039
free_page((unsigned long)loaded_vmcs->msr_bitmap);
3040
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
3041
}
3042
3043
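/*
 * Allocate the VMCS and, if MSR bitmaps are supported, an MSR bitmap that
 * starts out intercepting all MSRs (all bits set), for a loaded_vmcs.
 */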
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3044
{
3045
loaded_vmcs->vmcs = alloc_vmcs(false);
3046
if (!loaded_vmcs->vmcs)
3047
return -ENOMEM;
3048
3049
vmcs_clear(loaded_vmcs->vmcs);
3050
3051
loaded_vmcs->shadow_vmcs = NULL;
3052
loaded_vmcs->hv_timer_soft_disabled = false;
3053
loaded_vmcs->cpu = -1;
3054
loaded_vmcs->launched = 0;
3055
3056
if (cpu_has_vmx_msr_bitmap()) {
3057
loaded_vmcs->msr_bitmap = (unsigned long *)
3058
__get_free_page(GFP_KERNEL_ACCOUNT);
3059
if (!loaded_vmcs->msr_bitmap)
3060
goto out_vmcs;
3061
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
3062
}
3063
3064
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
3065
memset(&loaded_vmcs->controls_shadow, 0,
3066
sizeof(struct vmcs_controls_shadow));
3067
3068
return 0;
3069
3070
out_vmcs:
3071
free_loaded_vmcs(loaded_vmcs);
3072
return -ENOMEM;
3073
}
3074
3075
static void free_kvm_area(void)
3076
{
3077
int cpu;
3078
3079
for_each_possible_cpu(cpu) {
3080
free_vmcs(per_cpu(vmxarea, cpu));
3081
per_cpu(vmxarea, cpu) = NULL;
3082
}
3083
}
3084
3085
static __init int alloc_kvm_area(void)
3086
{
3087
int cpu;
3088
3089
for_each_possible_cpu(cpu) {
3090
struct vmcs *vmcs;
3091
3092
vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
3093
if (!vmcs) {
3094
free_kvm_area();
3095
return -ENOMEM;
3096
}
3097
3098
/*
3099
* When eVMCS is enabled, alloc_vmcs_cpu() sets
3100
* vmcs->revision_id to KVM_EVMCS_VERSION instead of
3101
* revision_id reported by MSR_IA32_VMX_BASIC.
3102
*
3103
* However, even though not explicitly documented by
3104
* TLFS, VMXArea passed as VMXON argument should
3105
* still be marked with revision_id reported by
3106
* physical CPU.
3107
*/
3108
if (kvm_is_using_evmcs())
3109
vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
3110
3111
per_cpu(vmxarea, cpu) = vmcs;
3112
}
3113
return 0;
3114
}
3115
3116
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3117
struct kvm_segment *save)
3118
{
3119
if (!emulate_invalid_guest_state) {
3120
/*
3121
* CS and SS RPL should be equal during guest entry according
3122
* to VMX spec, but in reality it is not always so. Since vcpu
3123
* is in the middle of the transition from real mode to
3124
* protected mode it is safe to assume that RPL 0 is a good
3125
* default value.
3126
*/
3127
if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3128
save->selector &= ~SEGMENT_RPL_MASK;
3129
save->dpl = save->selector & SEGMENT_RPL_MASK;
3130
save->s = 1;
3131
}
3132
__vmx_set_segment(vcpu, save, seg);
3133
}
3134
3135
static void enter_pmode(struct kvm_vcpu *vcpu)
3136
{
3137
unsigned long flags;
3138
struct vcpu_vmx *vmx = to_vmx(vcpu);
3139
3140
/*
3141
* Update the real mode segment cache. It may not be up-to-date if a
* segment register was written while the vcpu was in guest mode.
3143
*/
3144
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3145
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3146
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3147
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3148
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3149
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3150
3151
vmx->rmode.vm86_active = 0;
3152
3153
__vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3154
3155
flags = vmcs_readl(GUEST_RFLAGS);
3156
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3157
flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3158
vmcs_writel(GUEST_RFLAGS, flags);
3159
3160
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3161
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3162
3163
vmx_update_exception_bitmap(vcpu);
3164
3165
fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3166
fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3167
fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3168
fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3169
fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3170
fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3171
}
3172
3173
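/*
 * Massage a segment into the fixed layout virtual-8086 mode requires: DPL 3
 * and, unless invalid guest state is emulated, a selector derived from the
 * (paragraph-aligned) base, a 64KiB limit, and a read/write data type.
 */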
static void fix_rmode_seg(int seg, struct kvm_segment *save)
3174
{
3175
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3176
struct kvm_segment var = *save;
3177
3178
var.dpl = 0x3;
3179
if (seg == VCPU_SREG_CS)
3180
var.type = 0x3;
3181
3182
if (!emulate_invalid_guest_state) {
3183
var.selector = var.base >> 4;
3184
var.base = var.base & 0xffff0;
3185
var.limit = 0xffff;
3186
var.g = 0;
3187
var.db = 0;
3188
var.present = 1;
3189
var.s = 1;
3190
var.l = 0;
3191
var.unusable = 0;
3192
var.type = 0x3;
3193
var.avl = 0;
3194
if (save->base & 0xf)
3195
pr_warn_once("segment base is not paragraph aligned "
3196
"when entering protected mode (seg=%d)", seg);
3197
}
3198
3199
vmcs_write16(sf->selector, var.selector);
3200
vmcs_writel(sf->base, var.base);
3201
vmcs_write32(sf->limit, var.limit);
3202
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3203
}
3204
3205
static void enter_rmode(struct kvm_vcpu *vcpu)
3206
{
3207
unsigned long flags;
3208
struct vcpu_vmx *vmx = to_vmx(vcpu);
3209
struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3210
3211
/*
3212
* KVM should never use VM86 to virtualize Real Mode when L2 is active,
3213
* as using VM86 is unnecessary if unrestricted guest is enabled, and
3214
* if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
3215
* should VM-Fail and KVM should reject userspace attempts to stuff
3216
* CR0.PG=0 when L2 is active.
3217
*/
3218
WARN_ON_ONCE(is_guest_mode(vcpu));
3219
3220
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3221
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3222
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3223
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3224
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3225
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3226
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3227
3228
vmx->rmode.vm86_active = 1;
3229
3230
vmx_segment_cache_clear(vmx);
3231
3232
vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3233
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3234
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3235
3236
flags = vmcs_readl(GUEST_RFLAGS);
3237
vmx->rmode.save_rflags = flags;
3238
3239
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3240
3241
vmcs_writel(GUEST_RFLAGS, flags);
3242
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3243
vmx_update_exception_bitmap(vcpu);
3244
3245
fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3246
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3247
fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3248
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3249
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3250
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3251
}
3252
3253
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3254
{
3255
struct vcpu_vmx *vmx = to_vmx(vcpu);
3256
3257
/* Nothing to do if hardware doesn't support EFER. */
3258
if (!vmx_find_uret_msr(vmx, MSR_EFER))
3259
return 0;
3260
3261
vcpu->arch.efer = efer;
3262
#ifdef CONFIG_X86_64
3263
if (efer & EFER_LMA)
3264
vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3265
else
3266
vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
3267
#else
3268
if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3269
return 1;
3270
#endif
3271
3272
vmx_setup_uret_msrs(vmx);
3273
return 0;
3274
}
3275
3276
#ifdef CONFIG_X86_64
3277
3278
static void enter_lmode(struct kvm_vcpu *vcpu)
3279
{
3280
u32 guest_tr_ar;
3281
3282
vmx_segment_cache_clear(to_vmx(vcpu));
3283
3284
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3285
if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3286
pr_debug_ratelimited("%s: tss fixup for long mode.\n",
3287
__func__);
3288
vmcs_write32(GUEST_TR_AR_BYTES,
3289
(guest_tr_ar & ~VMX_AR_TYPE_MASK)
3290
| VMX_AR_TYPE_BUSY_64_TSS);
3291
}
3292
vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3293
}
3294
3295
static void exit_lmode(struct kvm_vcpu *vcpu)
3296
{
3297
vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3298
}
3299
3300
#endif
3301
3302
void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
3303
{
3304
struct vcpu_vmx *vmx = to_vmx(vcpu);
3305
3306
/*
3307
* INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3308
* the CPU is not required to invalidate guest-physical mappings on
3309
* VM-Entry, even if VPID is disabled. Guest-physical mappings are
3310
* associated with the root EPT structure and not any particular VPID
3311
* (INVVPID also isn't required to invalidate guest-physical mappings).
3312
*/
3313
if (enable_ept) {
3314
ept_sync_global();
3315
} else if (enable_vpid) {
3316
if (cpu_has_vmx_invvpid_global()) {
3317
vpid_sync_vcpu_global();
3318
} else {
3319
vpid_sync_vcpu_single(vmx->vpid);
3320
vpid_sync_vcpu_single(vmx->nested.vpid02);
3321
}
3322
}
3323
}
3324
3325
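/*
 * Return the VPID currently in use: L2's vpid02 when running a nested guest
 * that has VPID enabled, otherwise the vCPU's own (L1) VPID.
 */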
static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3326
{
3327
if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
3328
return nested_get_vpid02(vcpu);
3329
return to_vmx(vcpu)->vpid;
3330
}
3331
3332
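/*
 * Build the EPT pointer for a given root: WB memory type, a 4- or 5-level
 * walk based on the root's role, and A/D bits if enabled. The dummy root
 * gets a plain 4-level EPTP since it has no backing MMU page.
 */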
static u64 construct_eptp(hpa_t root_hpa)
3333
{
3334
u64 eptp = root_hpa | VMX_EPTP_MT_WB;
3335
struct kvm_mmu_page *root;
3336
3337
if (kvm_mmu_is_dummy_root(root_hpa))
3338
return eptp | VMX_EPTP_PWL_4;
3339
3340
/*
3341
* EPT roots should always have an associated MMU page. Return a "bad"
3342
* EPTP to induce VM-Fail instead of continuing on in an unknown state.
3343
*/
3344
root = root_to_sp(root_hpa);
3345
if (WARN_ON_ONCE(!root))
3346
return INVALID_PAGE;
3347
3348
eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3349
3350
if (enable_ept_ad_bits && !root->role.ad_disabled)
3351
eptp |= VMX_EPTP_AD_ENABLE_BIT;
3352
3353
return eptp;
3354
}
3355
3356
static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
3357
{
3358
u64 eptp = construct_eptp(root_hpa);
3359
3360
if (VALID_PAGE(eptp))
3361
ept_sync_context(eptp);
3362
else
3363
ept_sync_global();
3364
}
3365
3366
void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3367
{
3368
struct kvm_mmu *mmu = vcpu->arch.mmu;
3369
u64 root_hpa = mmu->root.hpa;
3370
3371
/* No flush required if the current context is invalid. */
3372
if (!VALID_PAGE(root_hpa))
3373
return;
3374
3375
if (enable_ept)
3376
vmx_flush_tlb_ept_root(root_hpa);
3377
else
3378
vpid_sync_context(vmx_get_current_vpid(vcpu));
3379
}
3380
3381
void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3382
{
3383
/*
3384
* vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
3385
* vmx_flush_tlb_guest() for an explanation of why this is ok.
3386
*/
3387
vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
3388
}
3389
3390
void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3391
{
3392
/*
3393
* vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3394
* vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
3395
* required to flush GVA->{G,H}PA mappings from the TLB if vpid is
3396
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3397
* i.e. no explicit INVVPID is necessary.
3398
*/
3399
vpid_sync_context(vmx_get_current_vpid(vcpu));
3400
}
3401
3402
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3403
{
3404
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3405
3406
if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3407
return;
3408
3409
if (is_pae_paging(vcpu)) {
3410
vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3411
vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3412
vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3413
vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3414
}
3415
}
3416
3417
void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3418
{
3419
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3420
3421
if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3422
return;
3423
3424
mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3425
mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3426
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3427
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3428
3429
kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
3430
}
3431
3432
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3433
CPU_BASED_CR3_STORE_EXITING)
3434
3435
bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3436
{
3437
if (is_guest_mode(vcpu))
3438
return nested_guest_cr0_valid(vcpu, cr0);
3439
3440
if (to_vmx(vcpu)->nested.vmxon)
3441
return nested_host_cr0_valid(vcpu, cr0);
3442
3443
return true;
3444
}
3445
3446
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3447
{
3448
struct vcpu_vmx *vmx = to_vmx(vcpu);
3449
unsigned long hw_cr0, old_cr0_pg;
3450
u32 tmp;
3451
3452
old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3453
3454
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3455
if (enable_unrestricted_guest)
3456
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3457
else {
3458
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3459
if (!enable_ept)
3460
hw_cr0 |= X86_CR0_WP;
3461
3462
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3463
enter_pmode(vcpu);
3464
3465
if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3466
enter_rmode(vcpu);
3467
}
3468
3469
vmcs_writel(CR0_READ_SHADOW, cr0);
3470
vmcs_writel(GUEST_CR0, hw_cr0);
3471
vcpu->arch.cr0 = cr0;
3472
kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3473
3474
#ifdef CONFIG_X86_64
3475
if (vcpu->arch.efer & EFER_LME) {
3476
if (!old_cr0_pg && (cr0 & X86_CR0_PG))
3477
enter_lmode(vcpu);
3478
else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
3479
exit_lmode(vcpu);
3480
}
3481
#endif
3482
3483
if (enable_ept && !enable_unrestricted_guest) {
3484
/*
3485
* Ensure KVM has an up-to-date snapshot of the guest's CR3. If
3486
* the below code _enables_ CR3 exiting, vmx_cache_reg() will
3487
* (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3488
* KVM's CR3 is installed.
3489
*/
3490
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3491
vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3492
3493
/*
3494
* When running with EPT but not unrestricted guest, KVM must
3495
* intercept CR3 accesses when paging is _disabled_. This is
3496
* necessary because restricted guests can't actually run with
3497
* paging disabled, and so KVM stuffs its own CR3 in order to
3498
* run the guest using identity mapped page tables.
3499
*
3500
* Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3501
* update, it may be stale with respect to CR3 interception,
3502
* e.g. after nested VM-Enter.
3503
*
3504
* Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3505
* stores to forward them to L1, even if KVM does not need to
3506
* intercept them to preserve its identity mapped page tables.
3507
*/
3508
if (!(cr0 & X86_CR0_PG)) {
3509
exec_controls_setbit(vmx, CR3_EXITING_BITS);
3510
} else if (!is_guest_mode(vcpu)) {
3511
exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3512
} else {
3513
tmp = exec_controls_get(vmx);
3514
tmp &= ~CR3_EXITING_BITS;
3515
tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3516
exec_controls_set(vmx, tmp);
3517
}
3518
3519
/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3520
if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
3521
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3522
3523
/*
3524
* When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3525
* GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3526
*/
3527
if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3528
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
3529
}
3530
3531
/* depends on vcpu->arch.cr0 to be set to a new value */
3532
vmx->vt.emulation_required = vmx_emulation_required(vcpu);
3533
}
3534
3535
static int vmx_get_max_ept_level(void)
3536
{
3537
if (cpu_has_vmx_ept_5levels())
3538
return 5;
3539
return 4;
3540
}
3541
3542
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3543
{
3544
struct kvm *kvm = vcpu->kvm;
3545
bool update_guest_cr3 = true;
3546
unsigned long guest_cr3;
3547
3548
if (enable_ept) {
3549
KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
3550
root_level != root_to_sp(root_hpa)->role.level);
3551
vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
3552
3553
hv_track_root_tdp(vcpu, root_hpa);
3554
3555
if (!enable_unrestricted_guest && !is_paging(vcpu))
3556
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3557
else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
3558
guest_cr3 = vcpu->arch.cr3;
3559
else /* vmcs.GUEST_CR3 is already up-to-date. */
3560
update_guest_cr3 = false;
3561
vmx_ept_load_pdptrs(vcpu);
3562
} else {
3563
guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
3564
kvm_get_active_cr3_lam_bits(vcpu);
3565
}
3566
3567
if (update_guest_cr3)
3568
vmcs_writel(GUEST_CR3, guest_cr3);
3569
}
3570
3571
bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3572
{
3573
/*
3574
* We operate under the default treatment of SMM, so VMX cannot be
3575
* enabled under SMM. Note, whether or not VMXE is allowed at all,
3576
* i.e. is a reserved bit, is handled by common x86 code.
3577
*/
3578
if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3579
return false;
3580
3581
if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3582
return false;
3583
3584
return true;
3585
}
3586
3587
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3588
{
3589
unsigned long old_cr4 = kvm_read_cr4(vcpu);
3590
struct vcpu_vmx *vmx = to_vmx(vcpu);
3591
unsigned long hw_cr4;
3592
3593
/*
3594
* Pass through host's Machine Check Enable value to hw_cr4, which
3595
* is in force while we are in guest mode. Do not let guests control
3596
* this bit, even if host CR4.MCE == 0.
3597
*/
3598
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3599
if (enable_unrestricted_guest)
3600
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3601
else if (vmx->rmode.vm86_active)
3602
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3603
else
3604
hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3605
3606
if (vmx_umip_emulated()) {
3607
if (cr4 & X86_CR4_UMIP) {
3608
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3609
hw_cr4 &= ~X86_CR4_UMIP;
3610
} else if (!is_guest_mode(vcpu) ||
3611
!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3612
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3613
}
3614
}
3615
3616
vcpu->arch.cr4 = cr4;
3617
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3618
3619
if (!enable_unrestricted_guest) {
3620
if (enable_ept) {
3621
if (!is_paging(vcpu)) {
3622
hw_cr4 &= ~X86_CR4_PAE;
3623
hw_cr4 |= X86_CR4_PSE;
3624
} else if (!(cr4 & X86_CR4_PAE)) {
3625
hw_cr4 &= ~X86_CR4_PAE;
3626
}
3627
}
3628
3629
/*
3630
* SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3631
* hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3632
* to be manually disabled when guest switches to non-paging
3633
* mode.
3634
*
3635
* If !enable_unrestricted_guest, the CPU is always running
3636
* with CR0.PG=1 and CR4 needs to be modified.
3637
* If enable_unrestricted_guest, the CPU automatically
3638
* disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3639
*/
3640
if (!is_paging(vcpu))
3641
hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3642
}
3643
3644
vmcs_writel(CR4_READ_SHADOW, cr4);
3645
vmcs_writel(GUEST_CR4, hw_cr4);
3646
3647
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3648
vcpu->arch.cpuid_dynamic_bits_dirty = true;
3649
}
3650
3651
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3652
{
3653
struct vcpu_vmx *vmx = to_vmx(vcpu);
3654
u32 ar;
3655
3656
if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3657
*var = vmx->rmode.segs[seg];
3658
if (seg == VCPU_SREG_TR
3659
|| var->selector == vmx_read_guest_seg_selector(vmx, seg))
3660
return;
3661
var->base = vmx_read_guest_seg_base(vmx, seg);
3662
var->selector = vmx_read_guest_seg_selector(vmx, seg);
3663
return;
3664
}
3665
var->base = vmx_read_guest_seg_base(vmx, seg);
3666
var->limit = vmx_read_guest_seg_limit(vmx, seg);
3667
var->selector = vmx_read_guest_seg_selector(vmx, seg);
3668
ar = vmx_read_guest_seg_ar(vmx, seg);
3669
var->unusable = (ar >> 16) & 1;
3670
var->type = ar & 15;
3671
var->s = (ar >> 4) & 1;
3672
var->dpl = (ar >> 5) & 3;
3673
/*
3674
* Some userspaces do not preserve the unusable property. Since a usable
3675
* segment has to be present according to the VMX spec, we can use the
3676
* present property to work around the userspace bug by making an unusable
3677
* segment always nonpresent. vmx_segment_access_rights() already marks a
3678
* nonpresent segment as unusable.
3679
*/
3680
var->present = !var->unusable;
3681
var->avl = (ar >> 12) & 1;
3682
var->l = (ar >> 13) & 1;
3683
var->db = (ar >> 14) & 1;
3684
var->g = (ar >> 15) & 1;
3685
}
3686
3687
u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3688
{
3689
struct kvm_segment s;
3690
3691
if (to_vmx(vcpu)->rmode.vm86_active) {
3692
vmx_get_segment(vcpu, &s, seg);
3693
return s.base;
3694
}
3695
return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3696
}
3697
3698
static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
3699
{
3700
struct vcpu_vmx *vmx = to_vmx(vcpu);
3701
int ar;
3702
3703
if (unlikely(vmx->rmode.vm86_active))
3704
return 0;
3705
3706
if (no_cache)
3707
ar = vmcs_read32(GUEST_SS_AR_BYTES);
3708
else
3709
ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3710
return VMX_AR_DPL(ar);
3711
}
3712
3713
int vmx_get_cpl(struct kvm_vcpu *vcpu)
3714
{
3715
return __vmx_get_cpl(vcpu, false);
3716
}
3717
3718
int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
3719
{
3720
return __vmx_get_cpl(vcpu, true);
3721
}
3722
3723
static u32 vmx_segment_access_rights(struct kvm_segment *var)
3724
{
3725
u32 ar;
3726
3727
ar = var->type & 15;
3728
ar |= (var->s & 1) << 4;
3729
ar |= (var->dpl & 3) << 5;
3730
ar |= (var->present & 1) << 7;
3731
ar |= (var->avl & 1) << 12;
3732
ar |= (var->l & 1) << 13;
3733
ar |= (var->db & 1) << 14;
3734
ar |= (var->g & 1) << 15;
3735
ar |= (var->unusable || !var->present) << 16;
3736
3737
return ar;
3738
}
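/*
 * Illustrative sketch (not part of the driver): how the access-rights
 * packing above and the unpacking in vmx_get_segment() round-trip. The
 * struct and helpers below are local stand-ins for kvm_segment and the KVM
 * helpers, written as a stand-alone user-space snippet with the same bit
 * layout (type 3:0, S 4, DPL 6:5, P 7, AVL 12, L 13, D/B 14, G 15,
 * unusable 16).
 */
#if 0	/* Guarded out of the build; extract to a user-space file to run. */
#include <assert.h>
#include <stdint.h>

struct seg_sketch {
	uint8_t type, s, dpl, present, avl, l, db, g, unusable;
};

static uint32_t pack_ar(const struct seg_sketch *v)
{
	return (v->type & 15) | ((v->s & 1) << 4) | ((v->dpl & 3) << 5) |
	       ((v->present & 1) << 7) | ((v->avl & 1) << 12) |
	       ((v->l & 1) << 13) | ((v->db & 1) << 14) | ((v->g & 1) << 15) |
	       ((v->unusable || !v->present) << 16);
}

static void unpack_ar(uint32_t ar, struct seg_sketch *v)
{
	v->unusable = (ar >> 16) & 1;
	v->type = ar & 15;
	v->s = (ar >> 4) & 1;
	v->dpl = (ar >> 5) & 3;
	v->present = !v->unusable;	/* mirrors vmx_get_segment() */
	v->avl = (ar >> 12) & 1;
	v->l = (ar >> 13) & 1;
	v->db = (ar >> 14) & 1;
	v->g = (ar >> 15) & 1;
}

int main(void)
{
	/* 64-bit code segment: type 0xb, S=1, DPL=0, P=1, L=1, G=1. */
	struct seg_sketch cs = { .type = 0xb, .s = 1, .present = 1, .l = 1, .g = 1 };
	struct seg_sketch out;

	assert(pack_ar(&cs) == 0xa09b);
	unpack_ar(pack_ar(&cs), &out);
	assert(out.type == 0xb && out.l && !out.unusable);
	return 0;
}
#endif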
3739
3740
void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3741
{
3742
struct vcpu_vmx *vmx = to_vmx(vcpu);
3743
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3744
3745
vmx_segment_cache_clear(vmx);
3746
3747
if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3748
vmx->rmode.segs[seg] = *var;
3749
if (seg == VCPU_SREG_TR)
3750
vmcs_write16(sf->selector, var->selector);
3751
else if (var->s)
3752
fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3753
return;
3754
}
3755
3756
vmcs_writel(sf->base, var->base);
3757
vmcs_write32(sf->limit, var->limit);
3758
vmcs_write16(sf->selector, var->selector);
3759
3760
/*
3761
* Fix the "Accessed" bit in AR field of segment registers for older
3762
* qemu binaries.
3763
* IA32 arch specifies that at the time of processor reset the
3764
* "Accessed" bit in the AR field of segment registers is 1. And qemu
3765
* is setting it to 0 in the userland code. This causes invalid guest
3766
* state vmexit when "unrestricted guest" mode is turned on.
3767
* Fix for this setup issue in cpu_reset is being pushed in the qemu
3768
* tree. Newer qemu binaries with that qemu fix would not need this
3769
* kvm hack.
3770
*/
3771
if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3772
var->type |= 0x1; /* Accessed */
3773
3774
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3775
}
3776
3777
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3778
{
3779
__vmx_set_segment(vcpu, var, seg);
3780
3781
to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
3782
}
3783
3784
void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3785
{
3786
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3787
3788
*db = (ar >> 14) & 1;
3789
*l = (ar >> 13) & 1;
3790
}
3791
3792
void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3793
{
3794
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3795
dt->address = vmcs_readl(GUEST_IDTR_BASE);
3796
}
3797
3798
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3799
{
3800
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3801
vmcs_writel(GUEST_IDTR_BASE, dt->address);
3802
}
3803
3804
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3805
{
3806
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3807
dt->address = vmcs_readl(GUEST_GDTR_BASE);
3808
}
3809
3810
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3811
{
3812
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3813
vmcs_writel(GUEST_GDTR_BASE, dt->address);
3814
}
3815
3816
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3817
{
3818
struct kvm_segment var;
3819
u32 ar;
3820
3821
vmx_get_segment(vcpu, &var, seg);
3822
var.dpl = 0x3;
3823
if (seg == VCPU_SREG_CS)
3824
var.type = 0x3;
3825
ar = vmx_segment_access_rights(&var);
3826
3827
if (var.base != (var.selector << 4))
3828
return false;
3829
if (var.limit != 0xffff)
3830
return false;
3831
if (ar != 0xf3)
3832
return false;
3833
3834
return true;
3835
}
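/*
 * Illustrative sketch (not part of the driver): the three conditions
 * rmode_segment_valid() checks above, on concrete numbers. A vm86-style
 * segment with selector 0x0100 must have base 0x1000 (selector << 4), a
 * 64KiB limit, and access rights 0xf3 (type 3, S=1, DPL=3, P=1).
 * rmode_seg_ok() is a local stand-in, not a KVM helper.
 */
#if 0	/* Guarded out of the build; extract to a user-space file to run. */
#include <assert.h>
#include <stdint.h>

static int rmode_seg_ok(uint16_t selector, uint32_t base, uint32_t limit,
			uint32_t ar)
{
	return base == (uint32_t)selector << 4 && limit == 0xffff && ar == 0xf3;
}

int main(void)
{
	assert(rmode_seg_ok(0x0100, 0x1000, 0xffff, 0xf3));	/* valid */
	assert(!rmode_seg_ok(0x0100, 0x2000, 0xffff, 0xf3));	/* base mismatch */
	return 0;
}
#endif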
3836
3837
static bool code_segment_valid(struct kvm_vcpu *vcpu)
3838
{
3839
struct kvm_segment cs;
3840
unsigned int cs_rpl;
3841
3842
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3843
cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3844
3845
if (cs.unusable)
3846
return false;
3847
if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3848
return false;
3849
if (!cs.s)
3850
return false;
3851
if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3852
if (cs.dpl > cs_rpl)
3853
return false;
3854
} else {
3855
if (cs.dpl != cs_rpl)
3856
return false;
3857
}
3858
if (!cs.present)
3859
return false;
3860
3861
/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3862
return true;
3863
}
3864
3865
static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3866
{
3867
struct kvm_segment ss;
3868
unsigned int ss_rpl;
3869
3870
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3871
ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3872
3873
if (ss.unusable)
3874
return true;
3875
if (ss.type != 3 && ss.type != 7)
3876
return false;
3877
if (!ss.s)
3878
return false;
3879
if (ss.dpl != ss_rpl) /* DPL != RPL */
3880
return false;
3881
if (!ss.present)
3882
return false;
3883
3884
return true;
3885
}
3886
3887
static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3888
{
3889
struct kvm_segment var;
3890
unsigned int rpl;
3891
3892
vmx_get_segment(vcpu, &var, seg);
3893
rpl = var.selector & SEGMENT_RPL_MASK;
3894
3895
if (var.unusable)
3896
return true;
3897
if (!var.s)
3898
return false;
3899
if (!var.present)
3900
return false;
3901
if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3902
if (var.dpl < rpl) /* DPL < RPL */
3903
return false;
3904
}
3905
3906
/* TODO: Add other members to kvm_segment_field to allow checking for other access
3907
* rights flags
3908
*/
3909
return true;
3910
}
3911
3912
static bool tr_valid(struct kvm_vcpu *vcpu)
3913
{
3914
struct kvm_segment tr;
3915
3916
vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3917
3918
if (tr.unusable)
3919
return false;
3920
if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3921
return false;
3922
if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3923
return false;
3924
if (!tr.present)
3925
return false;
3926
3927
return true;
3928
}
3929
3930
static bool ldtr_valid(struct kvm_vcpu *vcpu)
3931
{
3932
struct kvm_segment ldtr;
3933
3934
vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3935
3936
if (ldtr.unusable)
3937
return true;
3938
if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3939
return false;
3940
if (ldtr.type != 2)
3941
return false;
3942
if (!ldtr.present)
3943
return false;
3944
3945
return true;
3946
}
3947
3948
static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3949
{
3950
struct kvm_segment cs, ss;
3951
3952
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3953
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3954
3955
return ((cs.selector & SEGMENT_RPL_MASK) ==
3956
(ss.selector & SEGMENT_RPL_MASK));
3957
}
3958
3959
/*
3960
* Check if guest state is valid. Returns true if valid, false if
3961
* not.
3962
* We assume that registers are always usable
3963
*/
3964
bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3965
{
3966
/* real mode guest state checks */
3967
if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3968
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3969
return false;
3970
if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3971
return false;
3972
if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3973
return false;
3974
if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3975
return false;
3976
if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3977
return false;
3978
if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3979
return false;
3980
} else {
3981
/* protected mode guest state checks */
3982
if (!cs_ss_rpl_check(vcpu))
3983
return false;
3984
if (!code_segment_valid(vcpu))
3985
return false;
3986
if (!stack_segment_valid(vcpu))
3987
return false;
3988
if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3989
return false;
3990
if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3991
return false;
3992
if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3993
return false;
3994
if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3995
return false;
3996
if (!tr_valid(vcpu))
3997
return false;
3998
if (!ldtr_valid(vcpu))
3999
return false;
4000
}
4001
/* TODO:
4002
* - Add checks on RIP
4003
* - Add checks on RFLAGS
4004
*/
4005
4006
return true;
4007
}
4008
4009
static int init_rmode_tss(struct kvm *kvm, void __user *ua)
4010
{
4011
const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
4012
u16 data;
4013
int i;
4014
4015
for (i = 0; i < 3; i++) {
4016
if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
4017
return -EFAULT;
4018
}
4019
4020
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
4021
if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
4022
return -EFAULT;
4023
4024
data = ~0;
4025
if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
4026
return -EFAULT;
4027
4028
return 0;
4029
}
4030
4031
static int init_rmode_identity_map(struct kvm *kvm)
4032
{
4033
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4034
int i, r = 0;
4035
void __user *uaddr;
4036
u32 tmp;
4037
4038
/* Protect kvm_vmx->ept_identity_pagetable_done. */
4039
mutex_lock(&kvm->slots_lock);
4040
4041
if (likely(kvm_vmx->ept_identity_pagetable_done))
4042
goto out;
4043
4044
if (!kvm_vmx->ept_identity_map_addr)
4045
kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4046
4047
uaddr = __x86_set_memory_region(kvm,
4048
IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
4049
kvm_vmx->ept_identity_map_addr,
4050
PAGE_SIZE);
4051
if (IS_ERR(uaddr)) {
4052
r = PTR_ERR(uaddr);
4053
goto out;
4054
}
4055
4056
/* Set up identity-mapping pagetable for EPT in real mode */
4057
for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
4058
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
4059
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
4060
if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
4061
r = -EFAULT;
4062
goto out;
4063
}
4064
}
4065
kvm_vmx->ept_identity_pagetable_done = true;
4066
4067
out:
4068
mutex_unlock(&kvm->slots_lock);
4069
return r;
4070
}
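/*
 * Illustrative sketch (not part of the driver): the page-directory entries
 * written by init_rmode_identity_map() above. Each of the 1024 entries maps
 * a 4MiB page (PSE) at i << 22, so the table identity-maps the low 4GiB.
 * The PDE flag values are the standard 32-bit x86 bits, restated locally
 * for a stand-alone user-space snippet.
 */
#if 0	/* Guarded out of the build; extract to a user-space file to run. */
#include <assert.h>
#include <stdint.h>

#define SKETCH_PDE_PRESENT	(1u << 0)
#define SKETCH_PDE_RW		(1u << 1)
#define SKETCH_PDE_USER		(1u << 2)
#define SKETCH_PDE_ACCESSED	(1u << 5)
#define SKETCH_PDE_DIRTY	(1u << 6)
#define SKETCH_PDE_PSE		(1u << 7)

int main(void)
{
	uint32_t pde[1024];
	int i;

	for (i = 0; i < 1024; i++)
		pde[i] = ((uint32_t)i << 22) | SKETCH_PDE_PRESENT |
			 SKETCH_PDE_RW | SKETCH_PDE_USER | SKETCH_PDE_ACCESSED |
			 SKETCH_PDE_DIRTY | SKETCH_PDE_PSE;

	/* Entry 1 maps guest-physical 4MiB..8MiB onto itself. */
	assert((pde[1] & 0xffc00000u) == 0x400000u);
	return 0;
}
#endif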
4071
4072
static void seg_setup(int seg)
4073
{
4074
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4075
unsigned int ar;
4076
4077
vmcs_write16(sf->selector, 0);
4078
vmcs_writel(sf->base, 0);
4079
vmcs_write32(sf->limit, 0xffff);
4080
ar = 0x93;
4081
if (seg == VCPU_SREG_CS)
4082
ar |= 0x08; /* code segment */
4083
4084
vmcs_write32(sf->ar_bytes, ar);
4085
}
4086
4087
int allocate_vpid(void)
4088
{
4089
int vpid;
4090
4091
if (!enable_vpid)
4092
return 0;
4093
spin_lock(&vmx_vpid_lock);
4094
vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4095
if (vpid < VMX_NR_VPIDS)
4096
__set_bit(vpid, vmx_vpid_bitmap);
4097
else
4098
vpid = 0;
4099
spin_unlock(&vmx_vpid_lock);
4100
return vpid;
4101
}
4102
4103
void free_vpid(int vpid)
4104
{
4105
if (!enable_vpid || vpid == 0)
4106
return;
4107
spin_lock(&vmx_vpid_lock);
4108
__clear_bit(vpid, vmx_vpid_bitmap);
4109
spin_unlock(&vmx_vpid_lock);
4110
}
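/*
 * Illustrative sketch (not part of the driver): allocate_vpid()/free_vpid()
 * above implement a plain bitmap ID allocator in which ID 0 is reserved as
 * the "no VPID" value. The stand-alone version below uses a byte array and
 * no locking, purely to show the find-first-free / set / clear flow; the
 * names are local stand-ins.
 */
#if 0	/* Guarded out of the build; extract to a user-space file to run. */
#include <assert.h>

#define SKETCH_NR_IDS 64

static unsigned char sketch_used[SKETCH_NR_IDS] = { 1 };	/* ID 0 reserved */

static int sketch_alloc_id(void)
{
	int id;

	for (id = 0; id < SKETCH_NR_IDS; id++) {
		if (!sketch_used[id]) {
			sketch_used[id] = 1;
			return id;
		}
	}
	return 0;	/* exhausted: fall back to the shared "no ID" value */
}

static void sketch_free_id(int id)
{
	if (id)
		sketch_used[id] = 0;
}

int main(void)
{
	int a = sketch_alloc_id(), b = sketch_alloc_id();

	assert(a == 1 && b == 2);
	sketch_free_id(a);
	assert(sketch_alloc_id() == 1);	/* lowest free slot is reused */
	return 0;
}
#endif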
4111
4112
static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
4113
{
4114
/*
4115
* When KVM is a nested hypervisor on top of Hyper-V and uses
4116
* the 'Enlightened MSR Bitmap' feature, L0 needs to know that the
4117
* MSR bitmap has changed.
4118
*/
4119
if (kvm_is_using_evmcs()) {
4120
struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
4121
4122
if (evmcs->hv_enlightenments_control.msr_bitmap)
4123
evmcs->hv_clean_fields &=
4124
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
4125
}
4126
4127
vmx->nested.force_msr_bitmap_recalc = true;
4128
}
4129
4130
void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
4131
{
4132
struct vcpu_vmx *vmx = to_vmx(vcpu);
4133
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
4134
4135
if (!cpu_has_vmx_msr_bitmap())
4136
return;
4137
4138
vmx_msr_bitmap_l01_changed(vmx);
4139
4140
if (type & MSR_TYPE_R) {
4141
if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
4142
vmx_clear_msr_bitmap_read(msr_bitmap, msr);
4143
else
4144
vmx_set_msr_bitmap_read(msr_bitmap, msr);
4145
}
4146
4147
if (type & MSR_TYPE_W) {
4148
if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
4149
vmx_clear_msr_bitmap_write(msr_bitmap, msr);
4150
else
4151
vmx_set_msr_bitmap_write(msr_bitmap, msr);
4152
}
4153
}
4154
4155
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
4156
{
4157
/*
4158
* x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
4159
* of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
4160
* i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
4161
*/
4162
const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
4163
const int write_idx = read_idx + (0x800 / sizeof(u64));
4164
struct vcpu_vmx *vmx = to_vmx(vcpu);
4165
u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
4166
u8 mode;
4167
4168
if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
4169
return;
4170
4171
if (cpu_has_secondary_exec_ctrls() &&
4172
(secondary_exec_controls_get(vmx) &
4173
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4174
mode = MSR_BITMAP_MODE_X2APIC;
4175
if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4176
mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4177
} else {
4178
mode = 0;
4179
}
4180
4181
if (mode == vmx->x2apic_msr_bitmap_mode)
4182
return;
4183
4184
vmx->x2apic_msr_bitmap_mode = mode;
4185
4186
/*
4187
* Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
4188
* registers (0x840 and above) intercepted, KVM doesn't support them.
4189
* Intercept all writes by default and poke holes as needed. Pass
4190
* through reads for all valid registers by default in x2APIC+APICv
4191
* mode, only the current timer count needs on-demand emulation by KVM.
4192
*/
4193
if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
4194
msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
4195
else
4196
msr_bitmap[read_idx] = ~0ull;
4197
msr_bitmap[write_idx] = ~0ull;
4198
4199
/*
4200
* TPR reads and writes can be virtualized even if virtual interrupt
4201
* delivery is not in use.
4202
*/
4203
vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4204
!(mode & MSR_BITMAP_MODE_X2APIC));
4205
4206
if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4207
vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4208
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4209
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4210
if (enable_ipiv)
4211
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
4212
}
4213
}
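/*
 * Illustrative sketch (not part of the driver): the read_idx/write_idx
 * arithmetic above. Viewing the 4KiB MSR bitmap as 512 u64 words, the
 * low-MSR read half starts at word 0 and the low-MSR write half at byte
 * 0x800 (word 256), so the single word covering x2APIC MSRs 0x800-0x83f is
 * word 32 for reads and word 288 for writes. Constants are restated locally
 * for a stand-alone user-space snippet.
 */
#if 0	/* Guarded out of the build; extract to a user-space file to run. */
#include <assert.h>
#include <stdint.h>

#define SKETCH_APIC_BASE_MSR		0x800u
#define SKETCH_BITS_PER_U64		64u
#define SKETCH_WRITE_LOW_BYTE_OFFSET	0x800u

int main(void)
{
	unsigned int read_idx = SKETCH_APIC_BASE_MSR / SKETCH_BITS_PER_U64;
	unsigned int write_idx = read_idx +
				 SKETCH_WRITE_LOW_BYTE_OFFSET / sizeof(uint64_t);

	assert(read_idx == 32);		/* reads of MSRs 0x800..0x83f */
	assert(write_idx == 288);	/* writes of the same 64 MSRs */
	return 0;
}
#endif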
4214
4215
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4216
{
4217
struct vcpu_vmx *vmx = to_vmx(vcpu);
4218
bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4219
u32 i;
4220
4221
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4222
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4223
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4224
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
4225
for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
4226
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4227
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4228
}
4229
}
4230
4231
static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
4232
{
4233
bool intercept;
4234
4235
if (!cpu_has_vmx_msr_bitmap())
4236
return;
4237
4238
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
4239
#ifdef CONFIG_X86_64
4240
vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
4241
vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
4242
vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
4243
#endif
4244
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
4245
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
4246
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
4247
if (kvm_cstate_in_guest(vcpu->kvm)) {
4248
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
4249
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
4250
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
4251
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
4252
}
4253
if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
4254
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
4255
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
4256
}
4257
4258
/* PT MSRs can be passed through iff PT is exposed to the guest. */
4259
if (vmx_pt_mode_is_host_guest())
4260
pt_update_intercept_for_msr(vcpu);
4261
4262
if (vcpu->arch.xfd_no_write_intercept)
4263
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
4264
4265
vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
4266
!to_vmx(vcpu)->spec_ctrl);
4267
4268
if (kvm_cpu_cap_has(X86_FEATURE_XFD))
4269
vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
4270
!guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
4271
4272
if (cpu_feature_enabled(X86_FEATURE_IBPB))
4273
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
4274
!guest_has_pred_cmd_msr(vcpu));
4275
4276
if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
4277
vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
4278
!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
4279
4280
if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
4281
intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
4282
4283
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
4284
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
4285
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
4286
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
4287
}
4288
4289
if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) {
4290
intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
4291
!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
4292
4293
vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
4294
vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
4295
}
4296
4297
/*
4298
* x2APIC and LBR MSR intercepts are modified on-demand and cannot be
4299
* filtered by userspace.
4300
*/
4301
}
4302
4303
void vmx_recalc_intercepts(struct kvm_vcpu *vcpu)
4304
{
4305
vmx_recalc_msr_intercepts(vcpu);
4306
}
4307
4308
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4309
int vector)
4310
{
4311
struct vcpu_vmx *vmx = to_vmx(vcpu);
4312
4313
/*
4314
* DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
4315
* and freed, and must not be accessed outside of vcpu->mutex. The
4316
* vCPU's cached PI NV is valid if and only if posted interrupts
4317
* enabled in its vmcs12, i.e. checking the vector also checks that
4318
* L1 has enabled posted interrupts for L2.
4319
*/
4320
if (is_guest_mode(vcpu) &&
4321
vector == vmx->nested.posted_intr_nv) {
4322
/*
4323
* If a posted intr is not recognized by hardware,
4324
* we will accomplish it in the next vmentry.
4325
*/
4326
vmx->nested.pi_pending = true;
4327
kvm_make_request(KVM_REQ_EVENT, vcpu);
4328
4329
/*
4330
* This pairs with the smp_mb_*() after setting vcpu->mode in
4331
* vcpu_enter_guest() to guarantee the vCPU sees the event
4332
* request if triggering a posted interrupt "fails" because
4333
* vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
4334
* the smp_wmb() in kvm_make_request() only ensures everything
4335
* done before making the request is visible when the request
4336
* is visible, it doesn't ensure ordering between the store to
4337
* vcpu->requests and the load from vcpu->mode.
4338
*/
4339
smp_mb__after_atomic();
4340
4341
/* the PIR and ON have been set by L1. */
4342
kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
4343
return 0;
4344
}
4345
return -1;
4346
}
4347
/*
4348
* Send an interrupt to the vCPU via posted interrupt:
4349
* 1. If the target vCPU is running (non-root mode), send the posted interrupt
4350
* notification and hardware will sync PIR to vIRR atomically.
4351
* 2. If the target vCPU isn't running (root mode), kick it to pick up the
4352
* interrupt from the PIR on the next VM-Entry.
4353
*/
4354
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4355
{
4356
struct vcpu_vt *vt = to_vt(vcpu);
4357
int r;
4358
4359
r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4360
if (!r)
4361
return 0;
4362
4363
/* Note, this is called iff the local APIC is in-kernel. */
4364
if (!vcpu->arch.apic->apicv_active)
4365
return -1;
4366
4367
__vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
4368
return 0;
4369
}
4370
4371
void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4372
int trig_mode, int vector)
4373
{
4374
struct kvm_vcpu *vcpu = apic->vcpu;
4375
4376
if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4377
kvm_lapic_set_irr(vector, apic);
4378
kvm_make_request(KVM_REQ_EVENT, vcpu);
4379
kvm_vcpu_kick(vcpu);
4380
} else {
4381
trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4382
trig_mode, vector);
4383
}
4384
}
4385
4386
/*
4387
* Set up the vmcs's constant host-state fields, i.e., host-state fields that
4388
* will not change in the lifetime of the guest.
4389
* Note that host-state that does change is set elsewhere. E.g., host-state
4390
* that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4391
*/
4392
void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4393
{
4394
u32 low32, high32;
4395
unsigned long tmpl;
4396
unsigned long cr0, cr3, cr4;
4397
4398
cr0 = read_cr0();
4399
WARN_ON(cr0 & X86_CR0_TS);
4400
vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
4401
4402
/*
4403
* Save the most likely value for this task's CR3 in the VMCS.
4404
* We can't use __get_current_cr3_fast() because we're not atomic.
4405
*/
4406
cr3 = __read_cr3();
4407
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
4408
vmx->loaded_vmcs->host_state.cr3 = cr3;
4409
4410
/* Save the most likely value for this task's CR4 in the VMCS. */
4411
cr4 = cr4_read_shadow();
4412
vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
4413
vmx->loaded_vmcs->host_state.cr4 = cr4;
4414
4415
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
4416
#ifdef CONFIG_X86_64
4417
/*
4418
* Load null selectors, so we can avoid reloading them in
4419
* vmx_prepare_switch_to_host(), in case userspace uses
4420
* the null selectors too (the expected case).
4421
*/
4422
vmcs_write16(HOST_DS_SELECTOR, 0);
4423
vmcs_write16(HOST_ES_SELECTOR, 0);
4424
#else
4425
vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4426
vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4427
#endif
4428
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4429
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
4430
4431
vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
4432
4433
vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4434
4435
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4436
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4437
4438
/*
4439
* SYSENTER is used for 32-bit system calls on either 32-bit or
4440
* 64-bit kernels. It is always zero if neither is allowed, otherwise
4441
* vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4442
* have already done so!).
4443
*/
4444
if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4445
vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4446
4447
rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
4448
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
4449
4450
if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4451
rdmsr(MSR_IA32_CR_PAT, low32, high32);
4452
vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4453
}
4454
4455
if (cpu_has_load_ia32_efer())
4456
vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
4457
4458
/*
4459
* Supervisor shadow stack is not enabled on host side, i.e.,
4460
* the host IA32_S_CET.SHSTK_EN bit is guaranteed to be 0 now. Per the SDM
4461
* description of the RDSSP instruction, SSP is not readable in CPL0,
4462
* so resetting the two registers to 0s at VM-Exit does no harm
4463
* to kernel execution. When execution flow exits to userspace,
4464
* SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter
4465
* 3 and 4 for details.
4466
*/
4467
if (cpu_has_load_cet_ctrl()) {
4468
vmcs_writel(HOST_S_CET, kvm_host.s_cet);
4469
vmcs_writel(HOST_SSP, 0);
4470
vmcs_writel(HOST_INTR_SSP_TABLE, 0);
4471
}
4472
}
4473
4474
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4475
{
4476
struct kvm_vcpu *vcpu = &vmx->vcpu;
4477
4478
vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4479
~vcpu->arch.cr4_guest_rsvd_bits;
4480
if (!enable_ept) {
4481
vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4482
vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4483
}
4484
if (is_guest_mode(&vmx->vcpu))
4485
vcpu->arch.cr4_guest_owned_bits &=
4486
~get_vmcs12(vcpu)->cr4_guest_host_mask;
4487
vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4488
}
4489
4490
static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4491
{
4492
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4493
4494
if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4495
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4496
4497
if (!enable_vnmi)
4498
pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4499
4500
if (!enable_preemption_timer)
4501
pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4502
4503
return pin_based_exec_ctrl;
4504
}
4505
4506
static u32 vmx_get_initial_vmentry_ctrl(void)
4507
{
4508
u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4509
4510
if (vmx_pt_mode_is_system())
4511
vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4512
VM_ENTRY_LOAD_IA32_RTIT_CTL);
4513
/*
4514
* IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4515
*/
4516
vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4517
VM_ENTRY_LOAD_IA32_EFER |
4518
VM_ENTRY_IA32E_MODE);
4519
4520
return vmentry_ctrl;
4521
}
4522
4523
static u32 vmx_get_initial_vmexit_ctrl(void)
4524
{
4525
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4526
4527
/*
4528
* Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4529
* nested virtualization and thus allowed to be set in vmcs12.
4530
*/
4531
vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4532
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4533
4534
if (vmx_pt_mode_is_system())
4535
vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4536
VM_EXIT_CLEAR_IA32_RTIT_CTL);
4537
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4538
return vmexit_ctrl &
4539
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
4540
}
4541
4542
void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4543
{
4544
struct vcpu_vmx *vmx = to_vmx(vcpu);
4545
4546
if (is_guest_mode(vcpu)) {
4547
vmx->nested.update_vmcs01_apicv_status = true;
4548
return;
4549
}
4550
4551
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4552
4553
secondary_exec_controls_changebit(vmx,
4554
SECONDARY_EXEC_APIC_REGISTER_VIRT |
4555
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
4556
kvm_vcpu_apicv_active(vcpu));
4557
if (enable_ipiv)
4558
tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
4559
kvm_vcpu_apicv_active(vcpu));
4560
4561
vmx_update_msr_bitmap_x2apic(vcpu);
4562
}
4563
4564
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4565
{
4566
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4567
4568
/*
4569
* Not used by KVM, but fully supported for nesting, i.e. are allowed in
4570
* vmcs12 and propagated to vmcs02 when set in vmcs12.
4571
*/
4572
exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4573
CPU_BASED_USE_IO_BITMAPS |
4574
CPU_BASED_MONITOR_TRAP_FLAG |
4575
CPU_BASED_PAUSE_EXITING);
4576
4577
/* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4578
exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4579
CPU_BASED_NMI_WINDOW_EXITING);
4580
4581
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4582
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4583
4584
if (!cpu_need_tpr_shadow(&vmx->vcpu))
4585
exec_control &= ~CPU_BASED_TPR_SHADOW;
4586
4587
#ifdef CONFIG_X86_64
4588
if (exec_control & CPU_BASED_TPR_SHADOW)
4589
exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4590
CPU_BASED_CR8_STORE_EXITING);
4591
else
4592
exec_control |= CPU_BASED_CR8_STORE_EXITING |
4593
CPU_BASED_CR8_LOAD_EXITING;
4594
#endif
4595
/* No need to intercept CR3 accesses or INVLPG when using EPT. */
4596
if (enable_ept)
4597
exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4598
CPU_BASED_CR3_STORE_EXITING |
4599
CPU_BASED_INVLPG_EXITING);
4600
if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4601
exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4602
CPU_BASED_MONITOR_EXITING);
4603
if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4604
exec_control &= ~CPU_BASED_HLT_EXITING;
4605
return exec_control;
4606
}
4607
4608
static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4609
{
4610
u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4611
4612
/*
4613
* IPI virtualization relies on APICv. Disable IPI virtualization if
4614
* APICv is inhibited.
4615
*/
4616
if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4617
exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4618
4619
return exec_control;
4620
}
4621
4622
/*
4623
* Adjust a single secondary execution control bit to intercept/allow an
4624
* instruction in the guest. This is usually done based on whether or not a
4625
* feature has been exposed to the guest in order to correctly emulate faults.
4626
*/
4627
static inline void
4628
vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4629
u32 control, bool enabled, bool exiting)
4630
{
4631
/*
4632
* If the control is for an opt-in feature, clear the control if the
4633
* feature is not exposed to the guest, i.e. not enabled. If the
4634
* control is opt-out, i.e. an exiting control, clear the control if
4635
* the feature _is_ exposed to the guest, i.e. exiting/interception is
4636
* disabled for the associated instruction. Note, the caller is
4637
* responsible for presetting exec_control to set all supported bits.
4638
*/
4639
if (enabled == exiting)
4640
*exec_control &= ~control;
4641
4642
/*
4643
* Update the nested MSR settings so that a nested VMM can/can't set
4644
* controls for features that are/aren't exposed to the guest.
4645
*/
4646
if (nested &&
4647
kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
4648
/*
4649
* All features that can be added to or removed from VMX MSRs must
4650
* be supported in the first place for nested virtualization.
4651
*/
4652
if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4653
enabled = false;
4654
4655
if (enabled)
4656
vmx->nested.msrs.secondary_ctls_high |= control;
4657
else
4658
vmx->nested.msrs.secondary_ctls_high &= ~control;
4659
}
4660
}
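/*
 * Illustrative sketch (not part of the driver): the "enabled == exiting"
 * rule above, shown on both flavors of control. For an opt-in ENABLE_*
 * control (exiting == false) the bit survives only if the feature is
 * exposed; for an opt-out *_EXITING control (exiting == true) it survives
 * only if the feature is hidden. sketch_adjust() is a local stand-in.
 */
#if 0	/* Guarded out of the build; extract to a user-space file to run. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static uint32_t sketch_adjust(uint32_t ctls, uint32_t bit, bool enabled,
			      bool exiting)
{
	if (enabled == exiting)
		ctls &= ~bit;
	return ctls;
}

int main(void)
{
	const uint32_t bit = 1u << 3;	/* arbitrary control bit for the sketch */

	/* Opt-in control: kept only when the feature is exposed. */
	assert(sketch_adjust(bit, bit, true, false) == bit);
	assert(sketch_adjust(bit, bit, false, false) == 0);

	/* Opt-out control: kept only when the feature is hidden. */
	assert(sketch_adjust(bit, bit, false, true) == bit);
	assert(sketch_adjust(bit, bit, true, true) == 0);
	return 0;
}
#endif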
4661
4662
/*
4663
* Wrapper macro for the common case of adjusting a secondary execution control
4664
* based on a single guest CPUID bit, with a dedicated feature bit. This also
4665
* verifies that the control is actually supported by KVM and hardware.
4666
*/
4667
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4668
({ \
4669
struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
4670
bool __enabled; \
4671
\
4672
if (cpu_has_vmx_##name()) { \
4673
__enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
4674
vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
4675
__enabled, exiting); \
4676
} \
4677
})
4678
4679
/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4680
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4681
vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4682
4683
#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4684
vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4685
4686
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4687
{
4688
struct kvm_vcpu *vcpu = &vmx->vcpu;
4689
4690
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4691
4692
if (vmx_pt_mode_is_system())
4693
exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4694
if (!cpu_need_virtualize_apic_accesses(vcpu))
4695
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4696
if (vmx->vpid == 0)
4697
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4698
if (!enable_ept) {
4699
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4700
exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
4701
enable_unrestricted_guest = 0;
4702
}
4703
if (!enable_unrestricted_guest)
4704
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4705
if (kvm_pause_in_guest(vmx->vcpu.kvm))
4706
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4707
if (!kvm_vcpu_apicv_active(vcpu))
4708
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4709
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4710
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4711
4712
/*
4713
* KVM doesn't support VMFUNC for L1, but the control is set in KVM's
4714
* base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
4715
*/
4716
exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
4717
4718
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4719
* in vmx_set_cr4. */
4720
exec_control &= ~SECONDARY_EXEC_DESC;
4721
4722
/*
4723
* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4724
* (handle_vmptrld). We cannot enable shadow_vmcs here because we
4725
* don't yet have a current vmcs12.
4726
*/
4727
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4728
4729
/*
4730
* PML is enabled/disabled when dirty logging of memslots changes, but
4731
* it needs to be set here when dirty logging is already active, e.g.
4732
* if this vCPU was created after dirty logging was enabled.
4733
*/
4734
if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
4735
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4736
4737
vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
4738
4739
/*
4740
* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
4741
* feature is exposed to the guest. This creates a virtualization hole
4742
* if both are supported in hardware but only one is exposed to the
4743
* guest, but letting the guest execute RDTSCP or RDPID when either one
4744
* is advertised is preferable to emulating the advertised instruction
4745
* in KVM on #UD, and obviously better than incorrectly injecting #UD.
4746
*/
4747
if (cpu_has_vmx_rdtscp()) {
4748
bool rdpid_or_rdtscp_enabled =
4749
guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
4750
guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
4751
4752
vmx_adjust_secondary_exec_control(vmx, &exec_control,
4753
SECONDARY_EXEC_ENABLE_RDTSCP,
4754
rdpid_or_rdtscp_enabled, false);
4755
}
4756
4757
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4758
4759
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4760
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4761
4762
vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4763
ENABLE_USR_WAIT_PAUSE, false);
4764
4765
if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4766
exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4767
4768
if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4769
exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4770
4771
return exec_control;
4772
}
4773
4774
static inline int vmx_get_pid_table_order(struct kvm *kvm)
4775
{
4776
return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4777
}
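/*
 * Illustrative sketch (not part of the driver): the allocation-order math
 * above, assuming 8-byte PID-table entries as sizeof(*pid_table) suggests.
 * With 4KiB pages, max_vcpu_ids = 1024 needs 8KiB, i.e. an order-1 (two
 * page) allocation. sketch_order_for() re-implements the get_order()
 * rounding locally for a stand-alone user-space snippet.
 */
#if 0	/* Guarded out of the build; extract to a user-space file to run. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096u

/* Smallest order such that (PAGE_SIZE << order) >= size, for size > 0. */
static unsigned int sketch_order_for(size_t size)
{
	unsigned int order = 0;

	while ((size_t)SKETCH_PAGE_SIZE << order < size)
		order++;
	return order;
}

int main(void)
{
	assert(sketch_order_for(256 * sizeof(uint64_t)) == 0);	/* 2KiB -> 1 page */
	assert(sketch_order_for(1024 * sizeof(uint64_t)) == 1);	/* 8KiB -> 2 pages */
	assert(sketch_order_for(4096 * sizeof(uint64_t)) == 3);	/* 32KiB -> 8 pages */
	return 0;
}
#endif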
4778
4779
static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4780
{
4781
struct page *pages;
4782
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4783
4784
if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4785
return 0;
4786
4787
if (kvm_vmx->pid_table)
4788
return 0;
4789
4790
pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
4791
vmx_get_pid_table_order(kvm));
4792
if (!pages)
4793
return -ENOMEM;
4794
4795
kvm_vmx->pid_table = (void *)page_address(pages);
4796
return 0;
4797
}
4798
4799
int vmx_vcpu_precreate(struct kvm *kvm)
4800
{
4801
return vmx_alloc_ipiv_pid_table(kvm);
4802
}
4803
4804
#define VMX_XSS_EXIT_BITMAP 0
4805
4806
static void init_vmcs(struct vcpu_vmx *vmx)
4807
{
4808
struct kvm *kvm = vmx->vcpu.kvm;
4809
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4810
4811
if (nested)
4812
nested_vmx_set_vmcs_shadowing_bitmap();
4813
4814
if (cpu_has_vmx_msr_bitmap())
4815
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4816
4817
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
4818
4819
/* Control */
4820
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4821
4822
exec_controls_set(vmx, vmx_exec_control(vmx));
4823
4824
if (cpu_has_secondary_exec_ctrls()) {
4825
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
4826
if (vmx->ve_info)
4827
vmcs_write64(VE_INFORMATION_ADDRESS,
4828
__pa(vmx->ve_info));
4829
}
4830
4831
if (cpu_has_tertiary_exec_ctrls())
4832
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
4833
4834
if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
4835
vmcs_write64(EOI_EXIT_BITMAP0, 0);
4836
vmcs_write64(EOI_EXIT_BITMAP1, 0);
4837
vmcs_write64(EOI_EXIT_BITMAP2, 0);
4838
vmcs_write64(EOI_EXIT_BITMAP3, 0);
4839
4840
vmcs_write16(GUEST_INTR_STATUS, 0);
4841
4842
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4843
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
4844
}
4845
4846
if (vmx_can_use_ipiv(&vmx->vcpu)) {
4847
vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4848
vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
4849
}
4850
4851
if (!kvm_pause_in_guest(kvm)) {
4852
vmcs_write32(PLE_GAP, ple_gap);
4853
vmx->ple_window = ple_window;
4854
vmx->ple_window_dirty = true;
4855
}
4856
4857
if (kvm_notify_vmexit_enabled(kvm))
4858
vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
4859
4860
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4861
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4862
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
4863
4864
vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4865
vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
4866
vmx_set_constant_host_state(vmx);
4867
vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4868
vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4869
4870
if (cpu_has_vmx_vmfunc())
4871
vmcs_write64(VM_FUNCTION_CONTROL, 0);
4872
4873
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4874
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4875
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4876
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4877
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4878
4879
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4880
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4881
4882
vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());
4883
4884
/* 22.2.1, 20.8.1 */
4885
vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());
4886
4887
vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4888
vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
4889
4890
set_cr4_guest_host_mask(vmx);
4891
4892
if (vmx->vpid != 0)
4893
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4894
4895
if (cpu_has_vmx_xsaves())
4896
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4897
4898
if (enable_pml) {
4899
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4900
vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
4901
}
4902
4903
vmx_write_encls_bitmap(&vmx->vcpu, NULL);
4904
4905
if (vmx_pt_mode_is_host_guest()) {
4906
memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4907
/* Bits 6:0 are forced to 1; writes are ignored. */
4908
vmx->pt_desc.guest.output_mask = 0x7F;
4909
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4910
}
4911
4912
vmcs_write32(GUEST_SYSENTER_CS, 0);
4913
vmcs_writel(GUEST_SYSENTER_ESP, 0);
4914
vmcs_writel(GUEST_SYSENTER_EIP, 0);
4915
4916
vmx_guest_debugctl_write(&vmx->vcpu, 0);
4917
4918
if (cpu_has_vmx_tpr_shadow()) {
4919
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4920
if (cpu_need_tpr_shadow(&vmx->vcpu))
4921
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4922
__pa(vmx->vcpu.arch.apic->regs));
4923
vmcs_write32(TPR_THRESHOLD, 0);
4924
}
4925
4926
vmx_setup_uret_msrs(vmx);
4927
}
4928
4929
static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4930
{
4931
struct vcpu_vmx *vmx = to_vmx(vcpu);
4932
4933
init_vmcs(vmx);
4934
4935
if (nested &&
4936
kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
4937
memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4938
4939
vcpu_setup_sgx_lepubkeyhash(vcpu);
4940
4941
vmx->nested.posted_intr_nv = -1;
4942
vmx->nested.vmxon_ptr = INVALID_GPA;
4943
vmx->nested.current_vmptr = INVALID_GPA;
4944
4945
#ifdef CONFIG_KVM_HYPERV
4946
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4947
#endif
4948
4949
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
4950
vcpu->arch.microcode_version = 0x100000000ULL;
4951
vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4952
4953
/*
4954
* Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4955
* or POSTED_INTR_WAKEUP_VECTOR.
4956
*/
4957
vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
4958
__pi_set_sn(&vmx->vt.pi_desc);
4959
}
4960
4961
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4962
{
4963
struct vcpu_vmx *vmx = to_vmx(vcpu);
4964
4965
if (!init_event)
4966
__vmx_vcpu_reset(vcpu);
4967
4968
vmx->rmode.vm86_active = 0;
4969
vmx->spec_ctrl = 0;
4970
4971
vmx->msr_ia32_umwait_control = 0;
4972
4973
vmx->hv_deadline_tsc = -1;
4974
kvm_set_cr8(vcpu, 0);
4975
4976
seg_setup(VCPU_SREG_CS);
4977
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4978
vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
4979
4980
seg_setup(VCPU_SREG_DS);
4981
seg_setup(VCPU_SREG_ES);
4982
seg_setup(VCPU_SREG_FS);
4983
seg_setup(VCPU_SREG_GS);
4984
seg_setup(VCPU_SREG_SS);
4985
4986
vmcs_write16(GUEST_TR_SELECTOR, 0);
4987
vmcs_writel(GUEST_TR_BASE, 0);
4988
vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4989
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4990
4991
vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4992
vmcs_writel(GUEST_LDTR_BASE, 0);
4993
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4994
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4995
4996
vmcs_writel(GUEST_GDTR_BASE, 0);
4997
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4998
4999
vmcs_writel(GUEST_IDTR_BASE, 0);
5000
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
5001
5002
vmx_segment_cache_clear(vmx);
5003
kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
5004
5005
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5006
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5007
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5008
if (kvm_mpx_supported())
5009
vmcs_write64(GUEST_BNDCFGS, 0);
5010
5011
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
5012
5013
if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
5014
vmcs_writel(GUEST_SSP, 0);
5015
vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
5016
}
5017
if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
5018
kvm_cpu_cap_has(X86_FEATURE_SHSTK))
5019
vmcs_writel(GUEST_S_CET, 0);
5020
5021
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5022
5023
vpid_sync_context(vmx->vpid);
5024
5025
vmx_update_fb_clear_dis(vcpu, vmx);
5026
}
5027
5028
void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
5029
{
5030
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5031
}
5032
5033
void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
5034
{
5035
if (!enable_vnmi ||
5036
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5037
vmx_enable_irq_window(vcpu);
5038
return;
5039
}
5040
5041
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5042
}
5043
5044
void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
5045
{
5046
struct vcpu_vmx *vmx = to_vmx(vcpu);
5047
uint32_t intr;
5048
int irq = vcpu->arch.interrupt.nr;
5049
5050
trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
5051
5052
++vcpu->stat.irq_injections;
5053
if (vmx->rmode.vm86_active) {
5054
int inc_eip = 0;
5055
if (vcpu->arch.interrupt.soft)
5056
inc_eip = vcpu->arch.event_exit_inst_len;
5057
kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
5058
return;
5059
}
5060
intr = irq | INTR_INFO_VALID_MASK;
5061
if (vcpu->arch.interrupt.soft) {
5062
intr |= INTR_TYPE_SOFT_INTR;
5063
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5064
vmx->vcpu.arch.event_exit_inst_len);
5065
} else
5066
intr |= INTR_TYPE_EXT_INTR;
5067
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
5068
5069
vmx_clear_hlt(vcpu);
5070
}
5071
5072
void vmx_inject_nmi(struct kvm_vcpu *vcpu)
5073
{
5074
struct vcpu_vmx *vmx = to_vmx(vcpu);
5075
5076
if (!enable_vnmi) {
5077
/*
5078
* Tracking the NMI-blocked state in software is built upon
5079
* finding the next open IRQ window. This, in turn, depends on
5080
* well-behaved guests: they have to keep IRQs disabled at
5081
* least as long as the NMI handler runs. Otherwise we may
5082
* cause NMI nesting, maybe breaking the guest. But as this is
5083
* highly unlikely, we can live with the residual risk.
5084
*/
5085
vmx->loaded_vmcs->soft_vnmi_blocked = 1;
5086
vmx->loaded_vmcs->vnmi_blocked_time = 0;
5087
}
5088
5089
++vcpu->stat.nmi_injections;
5090
vmx->loaded_vmcs->nmi_known_unmasked = false;
5091
5092
if (vmx->rmode.vm86_active) {
5093
kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
5094
return;
5095
}
5096
5097
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5098
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
5099
5100
vmx_clear_hlt(vcpu);
5101
}
5102
5103
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
5104
{
5105
struct vcpu_vmx *vmx = to_vmx(vcpu);
5106
bool masked;
5107
5108
if (!enable_vnmi)
5109
return vmx->loaded_vmcs->soft_vnmi_blocked;
5110
if (vmx->loaded_vmcs->nmi_known_unmasked)
5111
return false;
5112
masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
5113
vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5114
return masked;
5115
}
5116
5117
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
5118
{
5119
struct vcpu_vmx *vmx = to_vmx(vcpu);
5120
5121
if (!enable_vnmi) {
5122
if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5123
vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5124
vmx->loaded_vmcs->vnmi_blocked_time = 0;
5125
}
5126
} else {
5127
vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5128
if (masked)
5129
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5130
GUEST_INTR_STATE_NMI);
5131
else
5132
vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5133
GUEST_INTR_STATE_NMI);
5134
}
5135
}
5136
5137
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5138
{
5139
if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5140
return false;
5141
5142
if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5143
return true;
5144
5145
return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5146
(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5147
GUEST_INTR_STATE_NMI));
5148
}
5149
5150
int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5151
{
5152
if (to_vmx(vcpu)->nested.nested_run_pending)
5153
return -EBUSY;
5154
5155
/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
5156
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5157
return -EBUSY;
5158
5159
return !vmx_nmi_blocked(vcpu);
5160
}
5161
5162
bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5163
{
5164
return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
5165
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5166
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5167
}
5168
5169
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5170
{
5171
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5172
return false;
5173
5174
return __vmx_interrupt_blocked(vcpu);
5175
}
5176
5177
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5178
{
5179
if (to_vmx(vcpu)->nested.nested_run_pending)
5180
return -EBUSY;
5181
5182
/*
5183
* An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5184
* e.g. if the IRQ arrived asynchronously after checking nested events.
5185
*/
5186
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5187
return -EBUSY;
5188
5189
return !vmx_interrupt_blocked(vcpu);
5190
}
5191
5192
int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5193
{
5194
void __user *ret;
5195
5196
if (enable_unrestricted_guest)
5197
return 0;
5198
5199
mutex_lock(&kvm->slots_lock);
5200
ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5201
PAGE_SIZE * 3);
5202
mutex_unlock(&kvm->slots_lock);
5203
5204
if (IS_ERR(ret))
5205
return PTR_ERR(ret);
5206
5207
to_kvm_vmx(kvm)->tss_addr = addr;
5208
5209
return init_rmode_tss(kvm, ret);
5210
}
5211
5212
int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5213
{
5214
to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5215
return 0;
5216
}
5217
5218
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5219
{
5220
switch (vec) {
5221
case BP_VECTOR:
5222
/*
5223
* Update instruction length as we may reinject the exception
5224
* from user space while in guest debugging mode.
5225
*/
5226
to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5227
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5228
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5229
return false;
5230
fallthrough;
5231
case DB_VECTOR:
5232
return !(vcpu->guest_debug &
5233
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
5234
case DE_VECTOR:
5235
case OF_VECTOR:
5236
case BR_VECTOR:
5237
case UD_VECTOR:
5238
case DF_VECTOR:
5239
case SS_VECTOR:
5240
case GP_VECTOR:
5241
case MF_VECTOR:
5242
return true;
5243
}
5244
return false;
5245
}
5246
5247
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5248
int vec, u32 err_code)
5249
{
5250
/*
5251
* Instruction with address size override prefix opcode 0x67
5252
* Cause the #SS fault with 0 error code in VM86 mode.
5253
*/
5254
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5255
if (kvm_emulate_instruction(vcpu, 0)) {
5256
if (vcpu->arch.halt_request) {
5257
vcpu->arch.halt_request = 0;
5258
return kvm_emulate_halt_noskip(vcpu);
5259
}
5260
return 1;
5261
}
5262
return 0;
5263
}
5264
5265
/*
5266
* Forward all other exceptions that are valid in real mode.
5267
* FIXME: Breaks guest debugging in real mode, needs to be fixed with
5268
* the required debugging infrastructure rework.
5269
*/
5270
kvm_queue_exception(vcpu, vec);
5271
return 1;
5272
}
5273
5274
static int handle_machine_check(struct kvm_vcpu *vcpu)
5275
{
5276
/* handled by vmx_vcpu_run() */
5277
return 1;
5278
}
5279
5280
/*
5281
* If the host has split lock detection disabled, then #AC is
5282
* unconditionally injected into the guest, which is the pre split lock
5283
* detection behaviour.
5284
*
5285
* If the host has split lock detection enabled then #AC is
5286
* only injected into the guest when:
5287
* - Guest CPL == 3 (user mode)
5288
* - Guest has #AC detection enabled in CR0
5289
* - Guest EFLAGS has AC bit set
5290
*/
5291
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5292
{
5293
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5294
return true;
5295
5296
return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
5297
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5298
}
5299
5300
static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
5301
{
5302
return vcpu->arch.guest_fpu.fpstate->xfd &&
5303
!kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
5304
}
5305
5306
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5307
{
5308
struct vcpu_vmx *vmx = to_vmx(vcpu);
5309
struct kvm_run *kvm_run = vcpu->run;
5310
u32 intr_info, ex_no, error_code;
5311
unsigned long cr2, dr6;
5312
u32 vect_info;
5313
5314
vect_info = vmx->idt_vectoring_info;
5315
intr_info = vmx_get_intr_info(vcpu);
5316
5317
/*
5318
* Machine checks are handled by handle_exception_irqoff(), or by
5319
* vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
5320
* vmx_vcpu_enter_exit().
5321
*/
5322
if (is_machine_check(intr_info) || is_nmi(intr_info))
5323
return 1;
5324
5325
/*
5326
* Queue the exception here instead of in handle_nm_fault_irqoff().
5327
* This ensures the nested_vmx check is not skipped so vmexit can
5328
* be reflected to L1 (when it intercepts #NM) before reaching this
5329
* point.
5330
*/
5331
if (is_nm_fault(intr_info)) {
5332
kvm_queue_exception_p(vcpu, NM_VECTOR,
5333
is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
5334
return 1;
5335
}
5336
5337
if (is_invalid_opcode(intr_info))
5338
return handle_ud(vcpu);
5339
5340
if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
5341
struct vmx_ve_information *ve_info = vmx->ve_info;
5342
5343
WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
5344
"Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
5345
dump_vmcs(vcpu);
5346
kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
5347
return 1;
5348
}
5349
5350
error_code = 0;
5351
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5352
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5353
5354
if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5355
WARN_ON_ONCE(!enable_vmware_backdoor);
5356
5357
/*
5358
* VMware backdoor emulation on #GP interception only handles
5359
* IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5360
* error code on #GP.
5361
*/
5362
if (error_code) {
5363
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5364
return 1;
5365
}
5366
return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5367
}
5368
5369
/*
5370
* A #PF with PFEC.RSVD = 1 indicates the guest is accessing MMIO;
5371
* it is better to report an internal error in that case.
5372
* See the comments in vmx_handle_exit.
5373
*/
5374
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5375
!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5376
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5377
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5378
vcpu->run->internal.ndata = 4;
5379
vcpu->run->internal.data[0] = vect_info;
5380
vcpu->run->internal.data[1] = intr_info;
5381
vcpu->run->internal.data[2] = error_code;
5382
vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
5383
return 0;
5384
}
5385
5386
if (is_page_fault(intr_info)) {
5387
cr2 = vmx_get_exit_qual(vcpu);
5388
if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
5389
/*
5390
* EPT will cause a page fault only if we need to
5391
* detect illegal GPAs.
5392
*/
5393
WARN_ON_ONCE(!allow_smaller_maxphyaddr);
5394
kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5395
return 1;
5396
} else
5397
return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5398
}
5399
5400
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5401
5402
if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5403
return handle_rmode_exception(vcpu, ex_no, error_code);
5404
5405
switch (ex_no) {
5406
case DB_VECTOR:
5407
dr6 = vmx_get_exit_qual(vcpu);
5408
if (!(vcpu->guest_debug &
5409
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5410
/*
5411
* If the #DB was due to ICEBP, a.k.a. INT1, skip the
5412
* instruction. ICEBP generates a trap-like #DB, but
5413
* despite its interception control being tied to #DB,
5414
* is an instruction intercept, i.e. the VM-Exit occurs
5415
* on the ICEBP itself. Use the inner "skip" helper to
5416
* avoid single-step #DB and MTF updates, as ICEBP is
5417
* higher priority. Note, skipping ICEBP still clears
5418
* STI and MOVSS blocking.
5419
*
5420
* For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5421
* if single-step is enabled in RFLAGS and STI or MOVSS
5422
* blocking is active, as the CPU doesn't set the bit
5423
* on VM-Exit due to #DB interception. VM-Entry has a
5424
* consistency check that a single-step #DB is pending
5425
* in this scenario as the previous instruction cannot
5426
* have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5427
* don't modify RFLAGS), therefore the one instruction
5428
* delay when activating single-step breakpoints must
5429
* have already expired. Note, the CPU sets/clears BS
5430
* as appropriate for all other VM-Exits types.
5431
*/
5432
if (is_icebp(intr_info))
5433
WARN_ON(!skip_emulated_instruction(vcpu));
5434
else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5435
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5436
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
5437
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
5438
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
5439
5440
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
5441
return 1;
5442
}
5443
kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
5444
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5445
fallthrough;
5446
case BP_VECTOR:
5447
/*
5448
* Update instruction length as we may reinject #BP from
5449
* user space while in guest debugging mode. Reading it for
5450
* #DB as well causes no harm, it is not used in that case.
5451
*/
5452
vmx->vcpu.arch.event_exit_inst_len =
5453
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5454
kvm_run->exit_reason = KVM_EXIT_DEBUG;
5455
kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5456
kvm_run->debug.arch.exception = ex_no;
5457
break;
5458
case AC_VECTOR:
5459
if (vmx_guest_inject_ac(vcpu)) {
5460
kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5461
return 1;
5462
}
5463
5464
/*
5465
* Handle split lock. Depending on detection mode this will
5466
* either warn and disable split lock detection for this
5467
* task or force SIGBUS on it.
5468
*/
5469
if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5470
return 1;
5471
fallthrough;
5472
default:
5473
kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5474
kvm_run->ex.exception = ex_no;
5475
kvm_run->ex.error_code = error_code;
5476
break;
5477
}
5478
return 0;
5479
}
5480
5481
static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5482
{
5483
++vcpu->stat.irq_exits;
5484
return 1;
5485
}
5486
5487
static int handle_triple_fault(struct kvm_vcpu *vcpu)
5488
{
5489
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5490
vcpu->mmio_needed = 0;
5491
return 0;
5492
}
5493
5494
static int handle_io(struct kvm_vcpu *vcpu)
5495
{
5496
unsigned long exit_qualification;
5497
int size, in, string;
5498
unsigned port;
5499
5500
exit_qualification = vmx_get_exit_qual(vcpu);
5501
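/*
 * Per the SDM, bits 2:0 of the exit qualification hold the access size
 * minus one, bit 3 the direction (1 = IN), bit 4 the string-instruction
 * flag (INS/OUTS), and bits 31:16 the port number.
 */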
string = (exit_qualification & 16) != 0;
5502
5503
++vcpu->stat.io_exits;
5504
5505
if (string)
5506
return kvm_emulate_instruction(vcpu, 0);
5507
5508
port = exit_qualification >> 16;
5509
size = (exit_qualification & 7) + 1;
5510
in = (exit_qualification & 8) != 0;
5511
5512
return kvm_fast_pio(vcpu, size, port, in);
5513
}
5514
5515
void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5516
{
5517
/*
5518
* Patch in the VMCALL instruction:
5519
*/
5520
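/* 0F 01 C1 is the VMCALL opcode. */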
hypercall[0] = 0x0f;
5521
hypercall[1] = 0x01;
5522
hypercall[2] = 0xc1;
5523
}
5524
5525
/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5526
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5527
{
5528
if (is_guest_mode(vcpu)) {
5529
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5530
unsigned long orig_val = val;
5531
5532
/*
5533
* We get here when L2 changed cr0 in a way that did not change
5534
* any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5535
* but did change L0 shadowed bits. So we first calculate the
5536
* effective cr0 value that L1 would like to write into the
5537
* hardware. It consists of the L2-owned bits from the new
5538
* value combined with the L1-owned bits from L1's guest_cr0.
5539
*/
5540
val = (val & ~vmcs12->cr0_guest_host_mask) |
5541
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5542
5543
if (kvm_set_cr0(vcpu, val))
5544
return 1;
5545
vmcs_writel(CR0_READ_SHADOW, orig_val);
5546
return 0;
5547
} else {
5548
return kvm_set_cr0(vcpu, val);
5549
}
5550
}
5551
5552
static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5553
{
5554
if (is_guest_mode(vcpu)) {
5555
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5556
unsigned long orig_val = val;
5557
5558
/* analogously to handle_set_cr0 */
5559
val = (val & ~vmcs12->cr4_guest_host_mask) |
5560
(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5561
if (kvm_set_cr4(vcpu, val))
5562
return 1;
5563
vmcs_writel(CR4_READ_SHADOW, orig_val);
5564
return 0;
5565
} else
5566
return kvm_set_cr4(vcpu, val);
5567
}
5568
5569
static int handle_desc(struct kvm_vcpu *vcpu)
5570
{
5571
/*
5572
* UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
5573
* and other code needs to be updated if UMIP can be guest owned.
5574
*/
5575
BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
5576
5577
WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
5578
return kvm_emulate_instruction(vcpu, 0);
5579
}
5580
5581
static int handle_cr(struct kvm_vcpu *vcpu)
5582
{
5583
unsigned long exit_qualification, val;
5584
int cr;
5585
int reg;
5586
int err;
5587
int ret;
5588
5589
exit_qualification = vmx_get_exit_qual(vcpu);
5590
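/*
 * Per the SDM, bits 3:0 of the exit qualification hold the control
 * register number, bits 5:4 the access type (0 = MOV to CR, 1 = MOV
 * from CR, 2 = CLTS, 3 = LMSW), and bits 11:8 the GPR operand.
 */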
cr = exit_qualification & 15;
5591
reg = (exit_qualification >> 8) & 15;
5592
switch ((exit_qualification >> 4) & 3) {
5593
case 0: /* mov to cr */
5594
val = kvm_register_read(vcpu, reg);
5595
trace_kvm_cr_write(cr, val);
5596
switch (cr) {
5597
case 0:
5598
err = handle_set_cr0(vcpu, val);
5599
return kvm_complete_insn_gp(vcpu, err);
5600
case 3:
5601
WARN_ON_ONCE(enable_unrestricted_guest);
5602
5603
err = kvm_set_cr3(vcpu, val);
5604
return kvm_complete_insn_gp(vcpu, err);
5605
case 4:
5606
err = handle_set_cr4(vcpu, val);
5607
return kvm_complete_insn_gp(vcpu, err);
5608
case 8: {
5609
u8 cr8_prev = kvm_get_cr8(vcpu);
5610
u8 cr8 = (u8)val;
5611
err = kvm_set_cr8(vcpu, cr8);
5612
ret = kvm_complete_insn_gp(vcpu, err);
5613
if (lapic_in_kernel(vcpu))
5614
return ret;
5615
if (cr8_prev <= cr8)
5616
return ret;
5617
/*
5618
* TODO: we might be squashing a
5619
* KVM_GUESTDBG_SINGLESTEP-triggered
5620
* KVM_EXIT_DEBUG here.
5621
*/
5622
vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5623
return 0;
5624
}
5625
}
5626
break;
5627
case 2: /* clts */
5628
KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5629
return -EIO;
5630
case 1: /*mov from cr*/
5631
switch (cr) {
5632
case 3:
5633
WARN_ON_ONCE(enable_unrestricted_guest);
5634
5635
val = kvm_read_cr3(vcpu);
5636
kvm_register_write(vcpu, reg, val);
5637
trace_kvm_cr_read(cr, val);
5638
return kvm_skip_emulated_instruction(vcpu);
5639
case 8:
5640
val = kvm_get_cr8(vcpu);
5641
kvm_register_write(vcpu, reg, val);
5642
trace_kvm_cr_read(cr, val);
5643
return kvm_skip_emulated_instruction(vcpu);
5644
}
5645
break;
5646
case 3: /* lmsw */
5647
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5648
trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
5649
kvm_lmsw(vcpu, val);
5650
5651
return kvm_skip_emulated_instruction(vcpu);
5652
default:
5653
break;
5654
}
5655
vcpu->run->exit_reason = 0;
5656
vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5657
(int)(exit_qualification >> 4) & 3, cr);
5658
return 0;
5659
}
5660
5661
static int handle_dr(struct kvm_vcpu *vcpu)
5662
{
5663
unsigned long exit_qualification;
5664
int dr, dr7, reg;
5665
int err = 1;
5666
5667
exit_qualification = vmx_get_exit_qual(vcpu);
5668
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5669
5670
/* First, if DR does not exist, trigger UD */
5671
if (!kvm_require_dr(vcpu, dr))
5672
return 1;
5673
5674
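/*
 * MOV DR at CPL > 0 faults; leave err non-zero so that
 * kvm_complete_insn_gp() injects a #GP.
 */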
if (vmx_get_cpl(vcpu) > 0)
5675
goto out;
5676
5677
dr7 = vmcs_readl(GUEST_DR7);
5678
if (dr7 & DR7_GD) {
5679
/*
5680
* As the vm-exit takes precedence over the debug trap, we
5681
* need to emulate the latter, either for the host or the
5682
* guest debugging itself.
5683
*/
5684
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5685
vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
5686
vcpu->run->debug.arch.dr7 = dr7;
5687
vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5688
vcpu->run->debug.arch.exception = DB_VECTOR;
5689
vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5690
return 0;
5691
} else {
5692
kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5693
return 1;
5694
}
5695
}
5696
5697
if (vcpu->guest_debug == 0) {
5698
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5699
5700
/*
5701
* No more DR vmexits; force a reload of the debug registers
5702
* and reenter on this instruction. The next vmexit will
5703
* retrieve the full state of the debug registers.
5704
*/
5705
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5706
return 1;
5707
}
5708
5709
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5710
if (exit_qualification & TYPE_MOV_FROM_DR) {
5711
kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
5712
err = 0;
5713
} else {
5714
err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
5715
}
5716
5717
out:
5718
return kvm_complete_insn_gp(vcpu, err);
5719
}
5720
5721
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5722
{
5723
get_debugreg(vcpu->arch.db[0], 0);
5724
get_debugreg(vcpu->arch.db[1], 1);
5725
get_debugreg(vcpu->arch.db[2], 2);
5726
get_debugreg(vcpu->arch.db[3], 3);
5727
get_debugreg(vcpu->arch.dr6, 6);
5728
vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5729
5730
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5731
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5732
5733
/*
5734
* exc_debug expects dr6 to be cleared after it runs, so avoid letting
5735
* it see a stale dr6 from the guest.
5736
*/
5737
set_debugreg(DR6_RESERVED, 6);
5738
}
5739
5740
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5741
{
5742
vmcs_writel(GUEST_DR7, val);
5743
}
5744
5745
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5746
{
5747
kvm_apic_update_ppr(vcpu);
5748
return 1;
5749
}
5750
5751
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5752
{
5753
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5754
5755
kvm_make_request(KVM_REQ_EVENT, vcpu);
5756
5757
++vcpu->stat.irq_window_exits;
5758
return 1;
5759
}
5760
5761
static int handle_invlpg(struct kvm_vcpu *vcpu)
5762
{
5763
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5764
5765
kvm_mmu_invlpg(vcpu, exit_qualification);
5766
return kvm_skip_emulated_instruction(vcpu);
5767
}
5768
5769
static int handle_apic_access(struct kvm_vcpu *vcpu)
5770
{
5771
if (likely(fasteoi)) {
5772
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5773
int access_type, offset;
5774
5775
access_type = exit_qualification & APIC_ACCESS_TYPE;
5776
offset = exit_qualification & APIC_ACCESS_OFFSET;
5777
/*
5778
* A sane guest uses MOV to write EOI and does not care about the
5779
* written value, so short-circuit here to avoid heavy instruction
5780
* emulation.
5781
*/
5782
if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5783
(offset == APIC_EOI)) {
5784
kvm_lapic_set_eoi(vcpu);
5785
return kvm_skip_emulated_instruction(vcpu);
5786
}
5787
}
5788
return kvm_emulate_instruction(vcpu, 0);
5789
}
5790
5791
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5792
{
5793
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5794
int vector = exit_qualification & 0xff;
5795
5796
/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5797
kvm_apic_set_eoi_accelerated(vcpu, vector);
5798
return 1;
5799
}
5800
5801
static int handle_apic_write(struct kvm_vcpu *vcpu)
5802
{
5803
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5804
5805
/*
5806
* APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5807
* hardware has done any necessary aliasing, offset adjustments, etc...
5808
* for the access. I.e. the correct value has already been written to
5809
* the vAPIC page for the correct 16-byte chunk. KVM needs only to
5810
* retrieve the register value and emulate the access.
5811
*/
5812
u32 offset = exit_qualification & 0xff0;
5813
5814
kvm_apic_write_nodecode(vcpu, offset);
5815
return 1;
5816
}
5817
5818
static int handle_task_switch(struct kvm_vcpu *vcpu)
5819
{
5820
struct vcpu_vmx *vmx = to_vmx(vcpu);
5821
unsigned long exit_qualification;
5822
bool has_error_code = false;
5823
u32 error_code = 0;
5824
u16 tss_selector;
5825
int reason, type, idt_v, idt_index;
5826
5827
idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5828
idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5829
type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5830
5831
exit_qualification = vmx_get_exit_qual(vcpu);
5832
5833
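/*
 * Per the SDM, bits 15:0 of the exit qualification hold the new TSS
 * selector and bits 31:30 the source of the task switch (CALL, IRET,
 * JMP, or a task gate in the IDT).
 */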
reason = (u32)exit_qualification >> 30;
5834
if (reason == TASK_SWITCH_GATE && idt_v) {
5835
switch (type) {
5836
case INTR_TYPE_NMI_INTR:
5837
vcpu->arch.nmi_injected = false;
5838
vmx_set_nmi_mask(vcpu, true);
5839
break;
5840
case INTR_TYPE_EXT_INTR:
5841
case INTR_TYPE_SOFT_INTR:
5842
kvm_clear_interrupt_queue(vcpu);
5843
break;
5844
case INTR_TYPE_HARD_EXCEPTION:
5845
if (vmx->idt_vectoring_info &
5846
VECTORING_INFO_DELIVER_CODE_MASK) {
5847
has_error_code = true;
5848
error_code =
5849
vmcs_read32(IDT_VECTORING_ERROR_CODE);
5850
}
5851
fallthrough;
5852
case INTR_TYPE_SOFT_EXCEPTION:
5853
kvm_clear_exception_queue(vcpu);
5854
break;
5855
default:
5856
break;
5857
}
5858
}
5859
tss_selector = exit_qualification;
5860
5861
if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5862
type != INTR_TYPE_EXT_INTR &&
5863
type != INTR_TYPE_NMI_INTR))
5864
WARN_ON(!skip_emulated_instruction(vcpu));
5865
5866
/*
5867
* TODO: What about debug traps on tss switch?
5868
* Are we supposed to inject them and update dr6?
5869
*/
5870
return kvm_task_switch(vcpu, tss_selector,
5871
type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5872
reason, has_error_code, error_code);
5873
}
5874
5875
static int handle_ept_violation(struct kvm_vcpu *vcpu)
5876
{
5877
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5878
gpa_t gpa;
5879
5880
/*
5881
* If the EPT violation happened while executing IRET from an NMI, the
5882
* "blocked by NMI" bit has to be set before the next VM entry.
5883
* There are errata that may cause this bit to not be set:
5884
* AAK134, BY25.
5885
*/
5886
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5887
enable_vnmi &&
5888
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
5889
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5890
5891
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5892
trace_kvm_page_fault(vcpu, gpa, exit_qualification);
5893
5894
/*
5895
* Check that the GPA doesn't exceed physical memory limits, as that is
5896
* a guest page fault. We have to emulate the instruction here, because
5897
* if the illegal address is that of a paging structure, then
5898
* EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
5899
* would also use advanced VM-exit information for EPT violations to
5900
* reconstruct the page fault error code.
5901
*/
5902
if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
5903
return kvm_emulate_instruction(vcpu, 0);
5904
5905
return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
5906
}
5907
5908
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5909
{
5910
gpa_t gpa;
5911
5912
if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
5913
return 1;
5914
5915
/*
5916
* A nested guest cannot optimize MMIO vmexits, because we have an
5917
* nGPA here instead of the required GPA.
5918
*/
5919
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5920
if (!is_guest_mode(vcpu) &&
5921
!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5922
trace_kvm_fast_mmio(gpa);
5923
return kvm_skip_emulated_instruction(vcpu);
5924
}
5925
5926
return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5927
}
5928
5929
static int handle_nmi_window(struct kvm_vcpu *vcpu)
5930
{
5931
if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5932
return -EIO;
5933
5934
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5935
++vcpu->stat.nmi_window_exits;
5936
kvm_make_request(KVM_REQ_EVENT, vcpu);
5937
5938
return 1;
5939
}
5940
5941
/*
5942
* Returns true if emulation is required (due to the vCPU having invalid state
5943
* with unrestricted guest mode disabled) and KVM can't faithfully emulate the
5944
* current vCPU state.
5945
*/
5946
static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
5947
{
5948
struct vcpu_vmx *vmx = to_vmx(vcpu);
5949
5950
if (!vmx->vt.emulation_required)
5951
return false;
5952
5953
/*
5954
* It is architecturally impossible for emulation to be required when a
5955
* nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
5956
* guest state is invalid and unrestricted guest is disabled, i.e. KVM
5957
* should synthesize VM-Fail instead of emulating L2 code. This path is
5958
* only reachable if userspace modifies L2 guest state after KVM has
5959
* performed the nested VM-Enter consistency checks.
5960
*/
5961
if (vmx->nested.nested_run_pending)
5962
return true;
5963
5964
/*
5965
* KVM only supports emulating exceptions if the vCPU is in Real Mode.
5966
* If emulation is required, KVM can't perform a successful VM-Enter to
5967
* inject the exception.
5968
*/
5969
return !vmx->rmode.vm86_active &&
5970
(kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
5971
}
5972
5973
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5974
{
5975
struct vcpu_vmx *vmx = to_vmx(vcpu);
5976
bool intr_window_requested;
5977
unsigned count = 130;
5978
5979
intr_window_requested = exec_controls_get(vmx) &
5980
CPU_BASED_INTR_WINDOW_EXITING;
5981
5982
while (vmx->vt.emulation_required && count-- != 0) {
5983
if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5984
return handle_interrupt_window(&vmx->vcpu);
5985
5986
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5987
return 1;
5988
5989
/*
5990
* Ensure that any updates to kvm->buses[] observed by the
5991
* previous instruction (emulated or otherwise) are also
5992
* visible to the instruction KVM is about to emulate.
5993
*/
5994
smp_rmb();
5995
5996
if (!kvm_emulate_instruction(vcpu, 0))
5997
return 0;
5998
5999
if (vmx_unhandleable_emulation_required(vcpu)) {
6000
kvm_prepare_emulation_failure_exit(vcpu);
6001
return 0;
6002
}
6003
6004
if (vcpu->arch.halt_request) {
6005
vcpu->arch.halt_request = 0;
6006
return kvm_emulate_halt_noskip(vcpu);
6007
}
6008
6009
/*
6010
* Note, return 1 and not 0, vcpu_run() will invoke
6011
* xfer_to_guest_mode() which will create a proper return
6012
* code.
6013
*/
6014
if (__xfer_to_guest_mode_work_pending())
6015
return 1;
6016
}
6017
6018
return 1;
6019
}
6020
6021
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
6022
{
6023
if (vmx_unhandleable_emulation_required(vcpu)) {
6024
kvm_prepare_emulation_failure_exit(vcpu);
6025
return 0;
6026
}
6027
6028
return 1;
6029
}
6030
6031
/*
6032
* Indicate a busy-waiting vcpu in a spinlock. We do not enable PAUSE
6033
* exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
6034
*/
6035
static int handle_pause(struct kvm_vcpu *vcpu)
6036
{
6037
if (!kvm_pause_in_guest(vcpu->kvm))
6038
grow_ple_window(vcpu);
6039
6040
/*
6041
* Intel SDM vol. 3, ch. 25.1.3 says: the "PAUSE-loop exiting"
6042
* VM-execution control is ignored if CPL > 0. OTOH, KVM
6043
* never sets PAUSE_EXITING and just sets PLE if supported,
6044
* so the vcpu must be at CPL 0 if it gets a PAUSE exit.
6045
*/
6046
kvm_vcpu_on_spin(vcpu, true);
6047
return kvm_skip_emulated_instruction(vcpu);
6048
}
6049
6050
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
6051
{
6052
return 1;
6053
}
6054
6055
static int handle_invpcid(struct kvm_vcpu *vcpu)
6056
{
6057
u32 vmx_instruction_info;
6058
unsigned long type;
6059
gva_t gva;
6060
struct {
6061
u64 pcid;
6062
u64 gla;
6063
} operand;
6064
int gpr_index;
6065
6066
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
6067
kvm_queue_exception(vcpu, UD_VECTOR);
6068
return 1;
6069
}
6070
6071
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6072
gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
6073
type = kvm_register_read(vcpu, gpr_index);
6074
6075
/* According to the Intel instruction reference, the memory operand
6076
* is read even if it isn't needed (e.g., for type==all)
6077
*/
6078
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
6079
vmx_instruction_info, false,
6080
sizeof(operand), &gva))
6081
return 1;
6082
6083
return kvm_handle_invpcid(vcpu, type, gva);
6084
}
6085
6086
static int handle_pml_full(struct kvm_vcpu *vcpu)
6087
{
6088
unsigned long exit_qualification;
6089
6090
trace_kvm_pml_full(vcpu->vcpu_id);
6091
6092
exit_qualification = vmx_get_exit_qual(vcpu);
6093
6094
/*
6095
* If the PML-buffer-full exit happened while executing IRET from an NMI,
6096
* the "blocked by NMI" bit has to be set before the next VM entry.
6097
*/
6098
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
6099
enable_vnmi &&
6100
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
6101
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6102
GUEST_INTR_STATE_NMI);
6103
6104
/*
6105
* PML buffer already flushed at beginning of VMEXIT. Nothing to do
6106
* here, and there's no userspace involvement needed for PML.
6107
*/
6108
return 1;
6109
}
6110
6111
static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
6112
bool force_immediate_exit)
6113
{
6114
struct vcpu_vmx *vmx = to_vmx(vcpu);
6115
6116
/*
6117
* In the *extremely* unlikely scenario that this is a spurious VM-Exit
6118
* due to the timer expiring while it was "soft" disabled, just eat the
6119
* exit and re-enter the guest.
6120
*/
6121
if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
6122
return EXIT_FASTPATH_REENTER_GUEST;
6123
6124
/*
6125
* If the timer expired because KVM used it to force an immediate exit,
6126
* then mission accomplished.
6127
*/
6128
if (force_immediate_exit)
6129
return EXIT_FASTPATH_EXIT_HANDLED;
6130
6131
/*
6132
* If L2 is active, go down the slow path as emulating the guest timer
6133
* expiration likely requires synthesizing a nested VM-Exit.
6134
*/
6135
if (is_guest_mode(vcpu))
6136
return EXIT_FASTPATH_NONE;
6137
6138
kvm_lapic_expired_hv_timer(vcpu);
6139
return EXIT_FASTPATH_REENTER_GUEST;
6140
}
6141
6142
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
6143
{
6144
/*
6145
* This non-fastpath handler is reached if and only if the preemption
6146
* timer was being used to emulate a guest timer while L2 is active.
6147
* All other scenarios are supposed to be handled in the fastpath.
6148
*/
6149
WARN_ON_ONCE(!is_guest_mode(vcpu));
6150
kvm_lapic_expired_hv_timer(vcpu);
6151
return 1;
6152
}
6153
6154
/*
6155
* When nested=0, all VMX instruction VM Exits filter here. The handlers
6156
* are overwritten by nested_vmx_hardware_setup() when nested=1.
6157
*/
6158
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6159
{
6160
kvm_queue_exception(vcpu, UD_VECTOR);
6161
return 1;
6162
}
6163
6164
static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
6165
{
6166
kvm_queue_exception(vcpu, UD_VECTOR);
6167
return 1;
6168
}
6169
6170
#ifndef CONFIG_X86_SGX_KVM
6171
static int handle_encls(struct kvm_vcpu *vcpu)
6172
{
6173
/*
6174
* SGX virtualization is disabled. There is no software enable bit for
6175
* SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6176
* the guest from executing ENCLS (when SGX is supported by hardware).
6177
*/
6178
kvm_queue_exception(vcpu, UD_VECTOR);
6179
return 1;
6180
}
6181
#endif /* CONFIG_X86_SGX_KVM */
6182
6183
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6184
{
6185
/*
6186
* Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6187
* VM-Exits. Unconditionally set the flag here and leave the handling to
6188
* vmx_handle_exit().
6189
*/
6190
to_vt(vcpu)->exit_reason.bus_lock_detected = true;
6191
return 1;
6192
}
6193
6194
static int handle_notify(struct kvm_vcpu *vcpu)
6195
{
6196
unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6197
bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6198
6199
++vcpu->stat.notify_window_exits;
6200
6201
/*
6202
* If the Notify VM exit happened while executing IRET from an NMI, the
6203
* "blocked by NMI" bit has to be set before the next VM entry.
6204
*/
6205
if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6206
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6207
GUEST_INTR_STATE_NMI);
6208
6209
if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6210
context_invalid) {
6211
vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6212
vcpu->run->notify.flags = context_invalid ?
6213
KVM_NOTIFY_CONTEXT_INVALID : 0;
6214
return 0;
6215
}
6216
6217
return 1;
6218
}
6219
6220
static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
6221
{
6222
return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
6223
}
6224
6225
static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
6226
{
6227
return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
6228
vmx_get_msr_imm_reg(vcpu));
6229
}
6230
6231
static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
6232
{
6233
return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
6234
vmx_get_msr_imm_reg(vcpu));
6235
}
6236
6237
/*
6238
* The exit handlers return 1 if the exit was handled fully and guest execution
6239
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
6240
* to be done to userspace and return 0.
6241
*/
6242
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6243
[EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
6244
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
6245
[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
6246
[EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6247
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
6248
[EXIT_REASON_CR_ACCESS] = handle_cr,
6249
[EXIT_REASON_DR_ACCESS] = handle_dr,
6250
[EXIT_REASON_CPUID] = kvm_emulate_cpuid,
6251
[EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
6252
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
6253
[EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
6254
[EXIT_REASON_HLT] = kvm_emulate_halt,
6255
[EXIT_REASON_INVD] = kvm_emulate_invd,
6256
[EXIT_REASON_INVLPG] = handle_invlpg,
6257
[EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
6258
[EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
6259
[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
6260
[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
6261
[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
6262
[EXIT_REASON_VMPTRST] = handle_vmx_instruction,
6263
[EXIT_REASON_VMREAD] = handle_vmx_instruction,
6264
[EXIT_REASON_VMRESUME] = handle_vmx_instruction,
6265
[EXIT_REASON_VMWRITE] = handle_vmx_instruction,
6266
[EXIT_REASON_VMOFF] = handle_vmx_instruction,
6267
[EXIT_REASON_VMON] = handle_vmx_instruction,
6268
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
6269
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
6270
[EXIT_REASON_APIC_WRITE] = handle_apic_write,
6271
[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
6272
[EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
6273
[EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
6274
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
6275
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
6276
[EXIT_REASON_GDTR_IDTR] = handle_desc,
6277
[EXIT_REASON_LDTR_TR] = handle_desc,
6278
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6279
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6280
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6281
[EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
6282
[EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
6283
[EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
6284
[EXIT_REASON_INVEPT] = handle_vmx_instruction,
6285
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
6286
[EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
6287
[EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
6288
[EXIT_REASON_PML_FULL] = handle_pml_full,
6289
[EXIT_REASON_INVPCID] = handle_invpcid,
6290
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
6291
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
6292
[EXIT_REASON_ENCLS] = handle_encls,
6293
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
6294
[EXIT_REASON_NOTIFY] = handle_notify,
6295
[EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
6296
[EXIT_REASON_TDCALL] = handle_tdx_instruction,
6297
[EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
6298
[EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
6299
};
6300
6301
static const int kvm_vmx_max_exit_handlers =
6302
ARRAY_SIZE(kvm_vmx_exit_handlers);
6303
6304
void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6305
u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
6306
{
6307
struct vcpu_vmx *vmx = to_vmx(vcpu);
6308
6309
*reason = vmx->vt.exit_reason.full;
6310
*info1 = vmx_get_exit_qual(vcpu);
6311
if (!(vmx->vt.exit_reason.failed_vmentry)) {
6312
*info2 = vmx->idt_vectoring_info;
6313
*intr_info = vmx_get_intr_info(vcpu);
6314
if (is_exception_with_error_code(*intr_info))
6315
*error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6316
else
6317
*error_code = 0;
6318
} else {
6319
*info2 = 0;
6320
*intr_info = 0;
6321
*error_code = 0;
6322
}
6323
}
6324
6325
void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
6326
{
6327
*intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
6328
if (is_exception_with_error_code(*intr_info))
6329
*error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
6330
else
6331
*error_code = 0;
6332
}
6333
6334
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6335
{
6336
if (vmx->pml_pg) {
6337
__free_page(vmx->pml_pg);
6338
vmx->pml_pg = NULL;
6339
}
6340
}
6341
6342
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6343
{
6344
struct vcpu_vmx *vmx = to_vmx(vcpu);
6345
u16 pml_idx, pml_tail_index;
6346
u64 *pml_buf;
6347
int i;
6348
6349
pml_idx = vmcs_read16(GUEST_PML_INDEX);
6350
6351
/* Do nothing if PML buffer is empty */
6352
if (pml_idx == PML_HEAD_INDEX)
6353
return;
6354
/*
6355
* The PML index always points to the next available PML buffer entry,
6356
* unless the PML log has just overflowed.
6357
*/
6358
pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
6359
6360
/*
6361
* The PML log is written backwards: the CPU writes entry 511 first,
6362
* then entry 510, and so on.
6363
*
6364
* Read the entries in the same order they were written, to ensure that
6365
* the dirty ring is filled in the same order the CPU wrote them.
6366
*/
6367
pml_buf = page_address(vmx->pml_pg);
6368
6369
for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
6370
u64 gpa;
6371
6372
gpa = pml_buf[i];
6373
WARN_ON(gpa & (PAGE_SIZE - 1));
6374
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6375
}
6376
6377
/* reset PML index */
6378
vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
6379
}
6380
6381
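/*
 * The VMCS encodings of the per-segment selector, limit, AR-bytes and
 * base fields are evenly spaced, so the latter three can be derived by
 * adding a fixed offset to the selector's encoding.
 */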
static void vmx_dump_sel(char *name, uint32_t sel)
6382
{
6383
pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6384
name, vmcs_read16(sel),
6385
vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6386
vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6387
vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6388
}
6389
6390
static void vmx_dump_dtsel(char *name, uint32_t limit)
6391
{
6392
pr_err("%s limit=0x%08x, base=0x%016lx\n",
6393
name, vmcs_read32(limit),
6394
vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6395
}
6396
6397
static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6398
{
6399
unsigned int i;
6400
struct vmx_msr_entry *e;
6401
6402
pr_err("MSR %s:\n", name);
6403
for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6404
pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6405
}
6406
6407
void dump_vmcs(struct kvm_vcpu *vcpu)
6408
{
6409
struct vcpu_vmx *vmx = to_vmx(vcpu);
6410
u32 vmentry_ctl, vmexit_ctl;
6411
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6412
u64 tertiary_exec_control;
6413
unsigned long cr4;
6414
int efer_slot;
6415
6416
if (!dump_invalid_vmcs) {
6417
pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6418
return;
6419
}
6420
6421
vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6422
vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6423
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6424
pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6425
cr4 = vmcs_readl(GUEST_CR4);
6426
6427
if (cpu_has_secondary_exec_ctrls())
6428
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6429
else
6430
secondary_exec_control = 0;
6431
6432
if (cpu_has_tertiary_exec_ctrls())
6433
tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6434
else
6435
tertiary_exec_control = 0;
6436
6437
pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6438
vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
6439
pr_err("*** Guest State ***\n");
6440
pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6441
vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6442
vmcs_readl(CR0_GUEST_HOST_MASK));
6443
pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6444
cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6445
pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6446
if (cpu_has_vmx_ept()) {
6447
pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
6448
vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6449
pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
6450
vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6451
}
6452
pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
6453
vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6454
pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
6455
vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6456
pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6457
vmcs_readl(GUEST_SYSENTER_ESP),
6458
vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6459
vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
6460
vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
6461
vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
6462
vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
6463
vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
6464
vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
6465
vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6466
vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6467
vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6468
vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
6469
efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
6470
if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
6471
pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
6472
else if (efer_slot >= 0)
6473
pr_err("EFER= 0x%016llx (autoload)\n",
6474
vmx->msr_autoload.guest.val[efer_slot].value);
6475
else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6476
pr_err("EFER= 0x%016llx (effective)\n",
6477
vcpu->arch.efer | (EFER_LMA | EFER_LME));
6478
else
6479
pr_err("EFER= 0x%016llx (effective)\n",
6480
vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
6481
if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
6482
pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
6483
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
6484
vmcs_read64(GUEST_IA32_DEBUGCTL),
6485
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6486
if (cpu_has_load_perf_global_ctrl() &&
6487
vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6488
pr_err("PerfGlobCtl = 0x%016llx\n",
6489
vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6490
if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6491
pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6492
pr_err("Interruptibility = %08x ActivityState = %08x\n",
6493
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6494
vmcs_read32(GUEST_ACTIVITY_STATE));
6495
if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6496
pr_err("InterruptStatus = %04x\n",
6497
vmcs_read16(GUEST_INTR_STATUS));
6498
if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6499
vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6500
if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6501
vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
6502
6503
if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
6504
pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
6505
vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
6506
vmcs_readl(GUEST_INTR_SSP_TABLE));
6507
pr_err("*** Host State ***\n");
6508
pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
6509
vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6510
pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6511
vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6512
vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6513
vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6514
vmcs_read16(HOST_TR_SELECTOR));
6515
pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6516
vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6517
vmcs_readl(HOST_TR_BASE));
6518
pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6519
vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6520
pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6521
vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6522
vmcs_readl(HOST_CR4));
6523
pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6524
vmcs_readl(HOST_IA32_SYSENTER_ESP),
6525
vmcs_read32(HOST_IA32_SYSENTER_CS),
6526
vmcs_readl(HOST_IA32_SYSENTER_EIP));
6527
if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6528
pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6529
if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6530
pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
6531
if (cpu_has_load_perf_global_ctrl() &&
6532
vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6533
pr_err("PerfGlobCtl = 0x%016llx\n",
6534
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6535
if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6536
vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
6537
if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
6538
pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
6539
vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
6540
vmcs_readl(HOST_INTR_SSP_TABLE));
6541
6542
pr_err("*** Control State ***\n");
6543
pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6544
cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6545
pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6546
pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
6547
pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6548
vmcs_read32(EXCEPTION_BITMAP),
6549
vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6550
vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6551
pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6552
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6553
vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6554
vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6555
pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6556
vmcs_read32(VM_EXIT_INTR_INFO),
6557
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6558
vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6559
pr_err(" reason=%08x qualification=%016lx\n",
6560
vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6561
pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6562
vmcs_read32(IDT_VECTORING_INFO_FIELD),
6563
vmcs_read32(IDT_VECTORING_ERROR_CODE));
6564
pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6565
if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6566
pr_err("TSC Multiplier = 0x%016llx\n",
6567
vmcs_read64(TSC_MULTIPLIER));
6568
if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6569
if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6570
u16 status = vmcs_read16(GUEST_INTR_STATUS);
6571
pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6572
}
6573
pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6574
if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6575
pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6576
pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6577
}
6578
if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6579
pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6580
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6581
pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6582
if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6583
pr_err("PLE Gap=%08x Window=%08x\n",
6584
vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6585
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6586
pr_err("Virtual processor ID = 0x%04x\n",
6587
vmcs_read16(VIRTUAL_PROCESSOR_ID));
6588
if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
6589
struct vmx_ve_information *ve_info = vmx->ve_info;
6590
u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
6591
6592
/*
6593
* If KVM is dumping the VMCS, then something has gone wrong
6594
* already. Dereferencing an address from the VMCS, which could
6595
* very well be corrupted, is a terrible idea. The virtual
6596
* address is known so use it.
6597
*/
6598
pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
6599
ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
6600
pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
6601
ve_info->exit_reason, ve_info->delivery,
6602
ve_info->exit_qualification,
6603
ve_info->guest_linear_address,
6604
ve_info->guest_physical_address, ve_info->eptp_index);
6605
}
6606
}
6607
6608
/*
6609
* The guest has exited. See if we can fix it or if we need userspace
6610
* assistance.
6611
*/
6612
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6613
{
6614
struct vcpu_vmx *vmx = to_vmx(vcpu);
6615
union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
6616
u32 vectoring_info = vmx->idt_vectoring_info;
6617
u16 exit_handler_index;
6618
6619
/*
6620
* Flush the PML buffer of logged GPAs so that dirty_bitmap is up to
6621
* date. Another benefit is that, in kvm_vm_ioctl_get_dirty_log, before
6622
* querying dirty_bitmap we only need to kick all vcpus out of guest
6623
* mode: once a vcpu is in root mode, its PML buffer must already have
6624
* been flushed. Note, PML is never enabled in hardware while
6625
* running L2.
6626
*/
6627
if (enable_pml && !is_guest_mode(vcpu))
6628
vmx_flush_pml_buffer(vcpu);
6629
6630
/*
6631
* KVM should never reach this point with a pending nested VM-Enter.
6632
* More specifically, short-circuiting VM-Entry to emulate L2 due to
6633
* invalid guest state should never happen as that means KVM knowingly
6634
* allowed a nested VM-Enter with an invalid vmcs12. More below.
6635
*/
6636
if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6637
return -EIO;
6638
6639
if (is_guest_mode(vcpu)) {
6640
/*
6641
* PML is never enabled when running L2, bail immediately if a
6642
* PML full exit occurs as something is horribly wrong.
6643
*/
6644
if (exit_reason.basic == EXIT_REASON_PML_FULL)
6645
goto unexpected_vmexit;
6646
6647
/*
6648
* The host physical addresses of some pages of guest memory
6649
* are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6650
* Page). The CPU may write to these pages via their host
6651
* physical address while L2 is running, bypassing any
6652
* address-translation-based dirty tracking (e.g. EPT write
6653
* protection).
6654
*
6655
* Mark them dirty on every exit from L2 to prevent them from
6656
* getting out of sync with dirty tracking.
6657
*/
6658
nested_mark_vmcs12_pages_dirty(vcpu);
6659
6660
/*
6661
* Synthesize a triple fault if L2 state is invalid. In normal
6662
* operation, nested VM-Enter rejects any attempt to enter L2
6663
* with invalid state. However, those checks are skipped if
6664
* state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
6665
* L2 state is invalid, it means either L1 modified SMRAM state
6666
* or userspace provided bad state. Synthesize TRIPLE_FAULT as
6667
* doing so is architecturally allowed in the RSM case, and is
6668
* the least awful solution for the userspace case without
6669
* risking false positives.
6670
*/
6671
if (vmx->vt.emulation_required) {
6672
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6673
return 1;
6674
}
6675
6676
if (nested_vmx_reflect_vmexit(vcpu))
6677
return 1;
6678
}
6679
6680
/* If guest state is invalid, start emulating. L2 is handled above. */
6681
if (vmx->vt.emulation_required)
6682
return handle_invalid_guest_state(vcpu);
6683
6684
if (exit_reason.failed_vmentry) {
6685
dump_vmcs(vcpu);
6686
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6687
vcpu->run->fail_entry.hardware_entry_failure_reason
6688
= exit_reason.full;
6689
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6690
return 0;
6691
}
6692
6693
if (unlikely(vmx->fail)) {
6694
dump_vmcs(vcpu);
6695
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6696
vcpu->run->fail_entry.hardware_entry_failure_reason
6697
= vmcs_read32(VM_INSTRUCTION_ERROR);
6698
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6699
return 0;
6700
}
6701
6702
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6703
(exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6704
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6705
exit_reason.basic != EXIT_REASON_PML_FULL &&
6706
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6707
exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6708
exit_reason.basic != EXIT_REASON_NOTIFY &&
6709
exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
6710
kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
6711
return 0;
6712
}
6713
6714
if (unlikely(!enable_vnmi &&
6715
vmx->loaded_vmcs->soft_vnmi_blocked)) {
6716
if (!vmx_interrupt_blocked(vcpu)) {
6717
vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6718
} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6719
vcpu->arch.nmi_pending) {
6720
/*
6721
* This CPU doesn't help us find the end of an
6722
* NMI-blocked window if the guest runs with IRQs
6723
* disabled. So we pull the trigger after 1 s of
6724
* futile waiting, but inform the user about this.
6725
*/
6726
printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6727
"state on VCPU %d after 1 s timeout\n",
6728
__func__, vcpu->vcpu_id);
6729
vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6730
}
6731
}
6732
6733
if (exit_fastpath != EXIT_FASTPATH_NONE)
6734
return 1;
6735
6736
if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6737
goto unexpected_vmexit;
6738
#ifdef CONFIG_MITIGATION_RETPOLINE
6739
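/*
 * With retpolines enabled, dispatch the hottest exit reasons directly
 * to avoid the overhead of an indirect call through the handler table.
 */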
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6740
return kvm_emulate_wrmsr(vcpu);
6741
else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
6742
return handle_wrmsr_imm(vcpu);
6743
else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6744
return handle_preemption_timer(vcpu);
6745
else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6746
return handle_interrupt_window(vcpu);
6747
else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6748
return handle_external_interrupt(vcpu);
6749
else if (exit_reason.basic == EXIT_REASON_HLT)
6750
return kvm_emulate_halt(vcpu);
6751
else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6752
return handle_ept_misconfig(vcpu);
6753
#endif
6754
6755
exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6756
kvm_vmx_max_exit_handlers);
6757
if (!kvm_vmx_exit_handlers[exit_handler_index])
6758
goto unexpected_vmexit;
6759
6760
return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6761
6762
unexpected_vmexit:
6763
dump_vmcs(vcpu);
6764
kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
6765
return 0;
6766
}
6767
6768
int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6769
{
6770
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6771
6772
/*
6773
* Exit to user space when a bus lock is detected, to inform userspace
6774
* that there is a bus lock in the guest.
6775
*/
6776
if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
6777
if (ret > 0)
6778
vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6779
6780
vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6781
return 0;
6782
}
6783
return ret;
6784
}
6785
6786
void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6787
{
6788
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6789
int tpr_threshold;
6790
6791
if (is_guest_mode(vcpu) &&
6792
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6793
return;
6794
6795
tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6796
if (is_guest_mode(vcpu))
6797
to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6798
else
6799
vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6800
}
6801
6802
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6803
{
6804
struct vcpu_vmx *vmx = to_vmx(vcpu);
6805
u32 sec_exec_control;
6806
6807
if (!lapic_in_kernel(vcpu))
6808
return;
6809
6810
if (!flexpriority_enabled &&
6811
!cpu_has_vmx_virtualize_x2apic_mode())
6812
return;
6813
6814
/* Postpone execution until vmcs01 is the current VMCS. */
6815
if (is_guest_mode(vcpu)) {
6816
vmx->nested.change_vmcs01_virtual_apic_mode = true;
6817
return;
6818
}
6819
6820
sec_exec_control = secondary_exec_controls_get(vmx);
6821
sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6822
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6823
6824
switch (kvm_get_apic_mode(vcpu)) {
6825
case LAPIC_MODE_INVALID:
6826
WARN_ONCE(true, "Invalid local APIC state");
6827
break;
6828
case LAPIC_MODE_DISABLED:
6829
break;
6830
case LAPIC_MODE_XAPIC:
6831
if (flexpriority_enabled) {
6832
sec_exec_control |=
6833
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6834
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6835
6836
/*
6837
* Flush the TLB, reloading the APIC access page will
6838
* only do so if its physical address has changed, but
6839
* the guest may have inserted a non-APIC mapping into
6840
* the TLB while the APIC access page was disabled.
6841
*/
6842
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6843
}
6844
break;
6845
case LAPIC_MODE_X2APIC:
6846
if (cpu_has_vmx_virtualize_x2apic_mode())
6847
sec_exec_control |=
6848
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6849
break;
6850
}
6851
secondary_exec_controls_set(vmx, sec_exec_control);
6852
6853
vmx_update_msr_bitmap_x2apic(vcpu);
6854
}
6855
6856
void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6857
{
6858
const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
6859
struct kvm *kvm = vcpu->kvm;
6860
struct kvm_memslots *slots = kvm_memslots(kvm);
6861
struct kvm_memory_slot *slot;
6862
struct page *refcounted_page;
6863
unsigned long mmu_seq;
6864
kvm_pfn_t pfn;
6865
bool writable;
6866
6867
/* Defer reload until vmcs01 is the current VMCS. */
6868
if (is_guest_mode(vcpu)) {
6869
to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6870
return;
6871
}
6872
6873
if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6874
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6875
return;
6876
6877
/*
6878
* Explicitly grab the memslot using KVM's internal slot ID to ensure
6879
* KVM doesn't unintentionally grab a userspace memslot. It _should_
6880
* be impossible for userspace to create a memslot for the APIC when
6881
* APICv is enabled, but paranoia won't hurt in this case.
6882
*/
6883
slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
6884
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
6885
return;
6886
6887
/*
6888
* Ensure that the mmu_notifier sequence count is read before KVM
6889
* retrieves the pfn from the primary MMU. Note, the memslot is
6890
* protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb()
6891
* in kvm_mmu_invalidate_end().
6892
*/
6893
mmu_seq = kvm->mmu_invalidate_seq;
6894
smp_rmb();
6895
6896
/*
6897
* No need to retry if the memslot does not exist or is invalid. KVM
6898
* controls the APIC-access page memslot, and only deletes the memslot
6899
* if APICv is permanently inhibited, i.e. the memslot won't reappear.
6900
*/
6901
pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
6902
if (is_error_noslot_pfn(pfn))
6903
return;
6904
6905
read_lock(&vcpu->kvm->mmu_lock);
6906
if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
6907
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6908
else
6909
vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
6910
6911
/*
6912
* Do not pin the APIC access page in memory so that it can be freely
6913
* migrated, the MMU notifier will call us again if it is migrated or
6914
* swapped out. KVM backs the memslot with anonymous memory, the pfn
6915
* should always point at a refcounted page (if the pfn is valid).
6916
*/
6917
if (!WARN_ON_ONCE(!refcounted_page))
6918
kvm_release_page_clean(refcounted_page);
6919
6920
/*
6921
* No need for a manual TLB flush at this point, KVM has already done a
6922
* flush if there were SPTEs pointing at the previous page.
6923
*/
6924
read_unlock(&vcpu->kvm->mmu_lock);
6925
}
6926
6927
void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
6928
{
6929
u16 status;
6930
u8 old;
6931
6932
/*
6933
* If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
6934
* is relevant if and only if Virtual Interrupt Delivery is
6935
* enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
6936
* vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
6937
* VM-Exit, otherwise L1 will run with a stale SVI.
6938
*/
6939
if (is_guest_mode(vcpu)) {
6940
to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
6941
return;
6942
}
6943
6944
if (max_isr == -1)
6945
max_isr = 0;
6946
6947
status = vmcs_read16(GUEST_INTR_STATUS);
6948
old = status >> 8;
6949
if (max_isr != old) {
6950
status &= 0xff;
6951
status |= max_isr << 8;
6952
vmcs_write16(GUEST_INTR_STATUS, status);
6953
}
6954
}
6955
6956
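/*
 * Program RVI, the low byte of the guest interrupt status field; the
 * high byte holds SVI, which is updated by vmx_hwapic_isr_update().
 */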
static void vmx_set_rvi(int vector)
6957
{
6958
u16 status;
6959
u8 old;
6960
6961
if (vector == -1)
6962
vector = 0;
6963
6964
status = vmcs_read16(GUEST_INTR_STATUS);
6965
old = (u8)status & 0xff;
6966
if ((u8)vector != old) {
6967
status &= ~0xff;
6968
status |= (u8)vector;
6969
vmcs_write16(GUEST_INTR_STATUS, status);
6970
}
6971
}
6972
6973
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6974
{
6975
struct vcpu_vt *vt = to_vt(vcpu);
6976
int max_irr;
6977
bool got_posted_interrupt;
6978
6979
if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
6980
return -EIO;
6981
6982
if (pi_test_on(&vt->pi_desc)) {
6983
pi_clear_on(&vt->pi_desc);
6984
/*
6985
* IOMMU can write to PID.ON, so the barrier matters even on UP.
6986
* But on x86 this is just a compiler barrier anyway.
6987
*/
6988
smp_mb__after_atomic();
6989
got_posted_interrupt =
6990
kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
6991
} else {
6992
max_irr = kvm_lapic_find_highest_irr(vcpu);
6993
got_posted_interrupt = false;
6994
}
6995
6996
/*
6997
* Newly recognized interrupts are injected via either virtual interrupt
6998
* delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
6999
* disabled in two cases:
7000
*
7001
* 1) If L2 is running and the vCPU has a new pending interrupt. If L1
7002
* wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
7003
* VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
7004
* into L2, but KVM doesn't use virtual interrupt delivery to inject
7005
* interrupts into L2, and so KVM_REQ_EVENT is again needed.
7006
*
7007
* 2) If APICv is disabled for this vCPU, assigned devices may still
7008
* attempt to post interrupts. The posted interrupt vector will cause
7009
* a VM-Exit and the subsequent entry will call sync_pir_to_irr.
7010
*/
7011
if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
7012
vmx_set_rvi(max_irr);
7013
else if (got_posted_interrupt)
7014
kvm_make_request(KVM_REQ_EVENT, vcpu);
7015
7016
return max_irr;
7017
}
7018
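/*
* Each EOI_EXIT_BITMAPn field covers 64 vectors, so the four fields
* together describe, for all 256 vectors, whether an EOI for that
* vector triggers an EOI-induced VM-Exit instead of being virtualized.
*/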
7019
void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
7020
{
7021
if (!kvm_vcpu_apicv_active(vcpu))
7022
return;
7023
7024
vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
7025
vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
7026
vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
7027
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
7028
}
7029
7030
void vmx_do_interrupt_irqoff(unsigned long entry);
7031
void vmx_do_nmi_irqoff(void);
7032
7033
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
7034
{
7035
/*
7036
* Save xfd_err to guest_fpu before interrupts are enabled, so the
7037
* MSR value is not clobbered by host activity before the guest
7038
* has a chance to consume it.
7039
*
7040
* Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
7041
* interception may have been caused by L1 interception. Per the SDM,
7042
* XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
7043
*
7044
* Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
7045
* unlike CR2 and DR6, the value is not a payload that is attached to
7046
* the #NM exception.
7047
*/
7048
if (is_xfd_nm_fault(vcpu))
7049
rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
7050
}
7051
7052
static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
7053
{
7054
/* if exit due to PF check for async PF */
7055
if (is_page_fault(intr_info))
7056
vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
7057
/* if exit due to NM, handle before interrupts are enabled */
7058
else if (is_nm_fault(intr_info))
7059
handle_nm_fault_irqoff(vcpu);
7060
/* Handle machine checks before interrupts are enabled */
7061
else if (is_machine_check(intr_info))
7062
kvm_machine_check();
7063
}
7064
7065
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
7066
u32 intr_info)
7067
{
7068
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
7069
7070
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
7071
"unexpected VM-Exit interrupt info: 0x%x", intr_info))
7072
return;
7073
7074
/*
7075
* Invoke the kernel's IRQ handler for the vector. Use the FRED path
7076
* when it's available even if FRED isn't fully enabled, e.g. even if
7077
* FRED isn't supported in hardware, in order to avoid the indirect
7078
* CALL in the non-FRED path.
7079
*/
7080
kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
7081
if (IS_ENABLED(CONFIG_X86_FRED))
7082
fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
7083
else
7084
vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
7085
kvm_after_interrupt(vcpu);
7086
7087
vcpu->arch.at_instruction_boundary = true;
7088
}
7089
7090
void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
7091
{
7092
if (to_vt(vcpu)->emulation_required)
7093
return;
7094
7095
switch (vmx_get_exit_reason(vcpu).basic) {
7096
case EXIT_REASON_EXTERNAL_INTERRUPT:
7097
handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
7098
break;
7099
case EXIT_REASON_EXCEPTION_NMI:
7100
handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
7101
break;
7102
case EXIT_REASON_MCE_DURING_VMENTRY:
7103
kvm_machine_check();
7104
break;
7105
default:
7106
break;
7107
}
7108
}
7109
7110
/*
7111
* The kvm parameter can be NULL (module initialization, or invocation before
7112
* VM creation). Be sure to check the kvm parameter before using it.
7113
*/
7114
bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
7115
{
7116
switch (index) {
7117
case MSR_IA32_SMBASE:
7118
if (!IS_ENABLED(CONFIG_KVM_SMM))
7119
return false;
7120
/*
7121
* We cannot do SMM unless we can run the guest in big
7122
* real mode.
7123
*/
7124
return enable_unrestricted_guest || emulate_invalid_guest_state;
7125
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
7126
return nested;
7127
case MSR_AMD64_VIRT_SPEC_CTRL:
7128
case MSR_AMD64_TSC_RATIO:
7129
/* This is AMD only. */
7130
return false;
7131
default:
7132
return true;
7133
}
7134
}
7135
7136
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
7137
{
7138
u32 exit_intr_info;
7139
bool unblock_nmi;
7140
u8 vector;
7141
bool idtv_info_valid;
7142
7143
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7144
7145
if (enable_vnmi) {
7146
if (vmx->loaded_vmcs->nmi_known_unmasked)
7147
return;
7148
7149
exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
7150
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7151
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7152
/*
7153
* SDM 3: 27.7.1.2 (September 2008)
7154
* Re-set bit "block by NMI" before VM entry if vmexit caused by
7155
* a guest IRET fault.
7156
* SDM 3: 23.2.2 (September 2008)
7157
* Bit 12 is undefined in any of the following cases:
7158
* If the VM exit sets the valid bit in the IDT-vectoring
7159
* information field.
7160
* If the VM exit is due to a double fault.
7161
*/
7162
if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7163
vector != DF_VECTOR && !idtv_info_valid)
7164
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7165
GUEST_INTR_STATE_NMI);
7166
else
7167
vmx->loaded_vmcs->nmi_known_unmasked =
7168
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7169
& GUEST_INTR_STATE_NMI);
7170
} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7171
vmx->loaded_vmcs->vnmi_blocked_time +=
7172
ktime_to_ns(ktime_sub(ktime_get(),
7173
vmx->loaded_vmcs->entry_time));
7174
}
7175
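/*
* Reconstruct any event that was pending delivery, based on either the
* IDT-vectoring info from a VM-Exit or the VM-Entry interruption info
* from a canceled VM-Entry, and re-queue it so that it can be
* re-injected on a later VM-Entry.
*/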
7176
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7177
u32 idt_vectoring_info,
7178
int instr_len_field,
7179
int error_code_field)
7180
{
7181
u8 vector;
7182
int type;
7183
bool idtv_info_valid;
7184
7185
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7186
7187
vcpu->arch.nmi_injected = false;
7188
kvm_clear_exception_queue(vcpu);
7189
kvm_clear_interrupt_queue(vcpu);
7190
7191
if (!idtv_info_valid)
7192
return;
7193
7194
kvm_make_request(KVM_REQ_EVENT, vcpu);
7195
7196
vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7197
type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7198
7199
switch (type) {
7200
case INTR_TYPE_NMI_INTR:
7201
vcpu->arch.nmi_injected = true;
7202
/*
7203
* SDM 3: 27.7.1.2 (September 2008)
7204
* Clear bit "block by NMI" before VM entry if a NMI
7205
* delivery faulted.
7206
*/
7207
vmx_set_nmi_mask(vcpu, false);
7208
break;
7209
case INTR_TYPE_SOFT_EXCEPTION:
7210
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7211
fallthrough;
7212
case INTR_TYPE_HARD_EXCEPTION: {
7213
u32 error_code = 0;
7214
7215
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
7216
error_code = vmcs_read32(error_code_field);
7217
7218
kvm_requeue_exception(vcpu, vector,
7219
idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
7220
error_code);
7221
break;
7222
}
7223
case INTR_TYPE_SOFT_INTR:
7224
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7225
fallthrough;
7226
case INTR_TYPE_EXT_INTR:
7227
kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7228
break;
7229
default:
7230
break;
7231
}
7232
}
7233
7234
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7235
{
7236
__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7237
VM_EXIT_INSTRUCTION_LEN,
7238
IDT_VECTORING_ERROR_CODE);
7239
}
7240
7241
void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7242
{
7243
__vmx_complete_interrupts(vcpu,
7244
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7245
VM_ENTRY_INSTRUCTION_LEN,
7246
VM_ENTRY_EXCEPTION_ERROR_CODE);
7247
7248
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7249
}
7250
7251
static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7252
{
7253
int i, nr_msrs;
7254
struct perf_guest_switch_msr *msrs;
7255
struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7256
7257
pmu->host_cross_mapped_mask = 0;
7258
if (pmu->pebs_enable & pmu->global_ctrl)
7259
intel_pmu_cross_mapped_check(pmu);
7260
7261
/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
7262
msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
7263
if (!msrs)
7264
return;
7265
7266
for (i = 0; i < nr_msrs; i++)
7267
if (msrs[i].host == msrs[i].guest)
7268
clear_atomic_switch_msr(vmx, msrs[i].msr);
7269
else
7270
add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7271
msrs[i].host, false);
7272
}
7273
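/*
* Program the VMX preemption timer: force an immediate exit by writing
* '0', arm it with the remaining host TSC ticks (scaled down by the
* preemption timer rate) if a deadline is pending, or "soft disable"
* it by writing the maximum 32-bit value otherwise.
*/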
7274
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
7275
{
7276
struct vcpu_vmx *vmx = to_vmx(vcpu);
7277
u64 tscl;
7278
u32 delta_tsc;
7279
7280
if (force_immediate_exit) {
7281
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7282
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7283
} else if (vmx->hv_deadline_tsc != -1) {
7284
tscl = rdtsc();
7285
if (vmx->hv_deadline_tsc > tscl)
7286
/* set_hv_timer ensures the delta fits in 32-bits */
7287
delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7288
cpu_preemption_timer_multi);
7289
else
7290
delta_tsc = 0;
7291
7292
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7293
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7294
} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7295
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7296
vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7297
}
7298
}
7299
7300
void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7301
{
7302
if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7303
vmx->loaded_vmcs->host_state.rsp = host_rsp;
7304
vmcs_writel(HOST_RSP, host_rsp);
7305
}
7306
}
7307
7308
void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7309
unsigned int flags)
7310
{
7311
u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7312
7313
if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7314
return;
7315
7316
if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7317
vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
7318
7319
/*
7320
* If the guest/host SPEC_CTRL values differ, restore the host value.
7321
*
7322
* For legacy IBRS, the IBRS bit always needs to be written after
7323
* transitioning from a less privileged predictor mode, regardless of
7324
* whether the guest/host values differ.
7325
*/
7326
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7327
vmx->spec_ctrl != hostval)
7328
native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
7329
7330
barrier_nospec();
7331
}
7332
7333
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
7334
bool force_immediate_exit)
7335
{
7336
/*
7337
* If L2 is active, some VMX preemption timer exits can be handled in
7338
* the fastpath; all other exits must use the slow path.
7339
*/
7340
if (is_guest_mode(vcpu) &&
7341
vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
7342
return EXIT_FASTPATH_NONE;
7343
7344
switch (vmx_get_exit_reason(vcpu).basic) {
7345
case EXIT_REASON_MSR_WRITE:
7346
return handle_fastpath_wrmsr(vcpu);
7347
case EXIT_REASON_MSR_WRITE_IMM:
7348
return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
7349
vmx_get_msr_imm_reg(vcpu));
7350
case EXIT_REASON_PREEMPTION_TIMER:
7351
return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
7352
case EXIT_REASON_HLT:
7353
return handle_fastpath_hlt(vcpu);
7354
case EXIT_REASON_INVD:
7355
return handle_fastpath_invd(vcpu);
7356
default:
7357
return EXIT_FASTPATH_NONE;
7358
}
7359
}
7360
7361
noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
7362
{
7363
if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
7364
!is_nmi(vmx_get_intr_info(vcpu)))
7365
return;
7366
7367
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
7368
if (cpu_feature_enabled(X86_FEATURE_FRED))
7369
fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
7370
else
7371
vmx_do_nmi_irqoff();
7372
kvm_after_interrupt(vcpu);
7373
}
7374
7375
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7376
unsigned int flags)
7377
{
7378
struct vcpu_vmx *vmx = to_vmx(vcpu);
7379
7380
guest_state_enter_irqoff();
7381
7382
vmx_l1d_flush(vcpu);
7383
7384
vmx_disable_fb_clear(vmx);
7385
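/*
* Reload the guest's CR2 ahead of VM-Enter, skipping the write when
* the value is already loaded, since writing CR2 is typically more
* expensive than reading it.
*/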
7386
if (vcpu->arch.cr2 != native_read_cr2())
7387
native_write_cr2(vcpu->arch.cr2);
7388
7389
vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7390
flags);
7391
7392
vcpu->arch.cr2 = native_read_cr2();
7393
vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7394
7395
vmx->idt_vectoring_info = 0;
7396
7397
vmx_enable_fb_clear(vmx);
7398
7399
if (unlikely(vmx->fail)) {
7400
vmx->vt.exit_reason.full = 0xdead;
7401
goto out;
7402
}
7403
7404
vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7405
if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
7406
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7407
7408
vmx_handle_nmi(vcpu);
7409
7410
out:
7411
guest_state_exit_irqoff();
7412
}
7413
7414
fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
7415
{
7416
bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
7417
struct vcpu_vmx *vmx = to_vmx(vcpu);
7418
unsigned long cr3, cr4;
7419
7420
/* Record the guest's net vcpu time for enforced NMI injections. */
7421
if (unlikely(!enable_vnmi &&
7422
vmx->loaded_vmcs->soft_vnmi_blocked))
7423
vmx->loaded_vmcs->entry_time = ktime_get();
7424
7425
/*
7426
* Don't enter VMX if guest state is invalid; let the exit handler
7427
* start emulation until we arrive back to a valid state. Synthesize a
7428
* consistency check VM-Exit due to invalid guest state and bail.
7429
*/
7430
if (unlikely(vmx->vt.emulation_required)) {
7431
vmx->fail = 0;
7432
7433
vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
7434
vmx->vt.exit_reason.failed_vmentry = 1;
7435
kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7436
vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
7437
kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7438
vmx->vt.exit_intr_info = 0;
7439
return EXIT_FASTPATH_NONE;
7440
}
7441
7442
trace_kvm_entry(vcpu, force_immediate_exit);
7443
7444
if (vmx->ple_window_dirty) {
7445
vmx->ple_window_dirty = false;
7446
vmcs_write32(PLE_WINDOW, vmx->ple_window);
7447
}
7448
7449
/*
7450
* We did this in prepare_switch_to_guest, because it needs to
7451
* be within srcu_read_lock.
7452
*/
7453
WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
7454
7455
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7456
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7457
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7458
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7459
vcpu->arch.regs_dirty = 0;
7460
7461
if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
7462
set_debugreg(vcpu->arch.dr6, 6);
7463
7464
if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
7465
vmx_reload_guest_debugctl(vcpu);
7466
7467
/*
7468
* Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
7469
* prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7470
* it switches back to the current->mm, which can occur in KVM context
7471
* when switching to a temporary mm to patch kernel code, e.g. if KVM
7472
* toggles a static key while handling a VM-Exit.
7473
*/
7474
cr3 = __get_current_cr3_fast();
7475
if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7476
vmcs_writel(HOST_CR3, cr3);
7477
vmx->loaded_vmcs->host_state.cr3 = cr3;
7478
}
7479
7480
cr4 = cr4_read_shadow();
7481
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7482
vmcs_writel(HOST_CR4, cr4);
7483
vmx->loaded_vmcs->host_state.cr4 = cr4;
7484
}
7485
7486
/* When single-stepping over STI and MOV SS, we must clear the
7487
* corresponding interruptibility bits in the guest state. Otherwise
7488
* vmentry fails as it then expects bit 14 (BS) in pending debug
7489
* exceptions to be set, but that's not correct for the guest debugging
7490
* case. */
7491
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7492
vmx_set_interrupt_shadow(vcpu, 0);
7493
7494
pt_guest_enter(vmx);
7495
7496
atomic_switch_perf_msrs(vmx);
7497
if (intel_pmu_lbr_is_enabled(vcpu))
7498
vmx_passthrough_lbr_msrs(vcpu);
7499
7500
if (enable_preemption_timer)
7501
vmx_update_hv_timer(vcpu, force_immediate_exit);
7502
else if (force_immediate_exit)
7503
smp_send_reschedule(vcpu->cpu);
7504
7505
kvm_wait_lapic_expire(vcpu);
7506
7507
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
7508
vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7509
7510
/* All fields are clean at this point */
7511
if (kvm_is_using_evmcs()) {
7512
current_evmcs->hv_clean_fields |=
7513
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7514
7515
current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7516
}
7517
7518
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7519
if (vcpu->arch.host_debugctl)
7520
update_debugctlmsr(vcpu->arch.host_debugctl);
7521
7522
#ifndef CONFIG_X86_64
7523
/*
7524
* The sysexit path does not restore ds/es, so we must set them to
7525
* a reasonable value ourselves.
7526
*
7527
* We can't defer this to vmx_prepare_switch_to_host() since that
7528
* function may be executed in interrupt context, which saves and
7529
* restores segments around it, nullifying its effect.
7530
*/
7531
loadsegment(ds, __USER_DS);
7532
loadsegment(es, __USER_DS);
7533
#endif
7534
7535
pt_guest_exit(vmx);
7536
7537
if (is_guest_mode(vcpu)) {
7538
/*
7539
* Track VMLAUNCH/VMRESUME that have made past guest state
7540
* checking.
7541
*/
7542
if (vmx->nested.nested_run_pending &&
7543
!vmx_get_exit_reason(vcpu).failed_vmentry)
7544
++vcpu->stat.nested_run;
7545
7546
vmx->nested.nested_run_pending = 0;
7547
}
7548
7549
if (unlikely(vmx->fail))
7550
return EXIT_FASTPATH_NONE;
7551
7552
trace_kvm_exit(vcpu, KVM_ISA_VMX);
7553
7554
if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
7555
return EXIT_FASTPATH_NONE;
7556
7557
vmx->loaded_vmcs->launched = 1;
7558
7559
vmx_recover_nmi_blocking(vmx);
7560
vmx_complete_interrupts(vmx);
7561
7562
return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
7563
}
7564
7565
void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7566
{
7567
struct vcpu_vmx *vmx = to_vmx(vcpu);
7568
7569
if (enable_pml)
7570
vmx_destroy_pml_buffer(vmx);
7571
free_vpid(vmx->vpid);
7572
nested_vmx_free_vcpu(vcpu);
7573
free_loaded_vmcs(vmx->loaded_vmcs);
7574
free_page((unsigned long)vmx->ve_info);
7575
}
7576
7577
int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7578
{
7579
struct vmx_uret_msr *tsx_ctrl;
7580
struct vcpu_vmx *vmx;
7581
int i, err;
7582
7583
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7584
vmx = to_vmx(vcpu);
7585
7586
INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
7587
7588
err = -ENOMEM;
7589
7590
vmx->vpid = allocate_vpid();
7591
7592
/*
7593
* If PML is turned on, failure to enable PML just results in failure to
7594
* create the vCPU, which lets us simplify the PML logic (e.g. by not
7595
* having to deal with cases such as enabling PML on only some of the
7596
* guest's vCPUs).
7597
*/
7598
if (enable_pml) {
7599
vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7600
if (!vmx->pml_pg)
7601
goto free_vpid;
7602
}
7603
7604
for (i = 0; i < kvm_nr_uret_msrs; ++i)
7605
vmx->guest_uret_msrs[i].mask = -1ull;
7606
if (boot_cpu_has(X86_FEATURE_RTM)) {
7607
/*
7608
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7609
* Keep the host value unchanged to avoid changing CPUID bits
7610
* under the host kernel's feet.
7611
*/
7612
tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7613
if (tsx_ctrl)
7614
tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7615
}
7616
7617
err = alloc_loaded_vmcs(&vmx->vmcs01);
7618
if (err < 0)
7619
goto free_pml;
7620
7621
/*
7622
* Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7623
* nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7624
* feature only for vmcs01, KVM currently isn't equipped to realize any
7625
* performance benefits from enabling it for vmcs02.
7626
*/
7627
if (kvm_is_using_evmcs() &&
7628
(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7629
struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7630
7631
evmcs->hv_enlightenments_control.msr_bitmap = 1;
7632
}
7633
7634
vmx->loaded_vmcs = &vmx->vmcs01;
7635
7636
if (cpu_need_virtualize_apic_accesses(vcpu)) {
7637
err = kvm_alloc_apic_access_page(vcpu->kvm);
7638
if (err)
7639
goto free_vmcs;
7640
}
7641
7642
if (enable_ept && !enable_unrestricted_guest) {
7643
err = init_rmode_identity_map(vcpu->kvm);
7644
if (err)
7645
goto free_vmcs;
7646
}
7647
7648
err = -ENOMEM;
7649
if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
7650
struct page *page;
7651
7652
BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
7653
7654
/* ve_info must be page aligned. */
7655
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7656
if (!page)
7657
goto free_vmcs;
7658
7659
vmx->ve_info = page_to_virt(page);
7660
}
7661
7662
if (vmx_can_use_ipiv(vcpu))
7663
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7664
__pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);
7665
7666
return 0;
7667
7668
free_vmcs:
7669
free_loaded_vmcs(vmx->loaded_vmcs);
7670
free_pml:
7671
vmx_destroy_pml_buffer(vmx);
7672
free_vpid:
7673
free_vpid(vmx->vpid);
7674
return err;
7675
}
7676
7677
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7678
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7679
7680
int vmx_vm_init(struct kvm *kvm)
7681
{
7682
if (!ple_gap)
7683
kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
7684
7685
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7686
switch (l1tf_mitigation) {
7687
case L1TF_MITIGATION_OFF:
7688
case L1TF_MITIGATION_FLUSH_NOWARN:
7689
/* 'I explicitly don't care' is set */
7690
break;
7691
case L1TF_MITIGATION_AUTO:
7692
case L1TF_MITIGATION_FLUSH:
7693
case L1TF_MITIGATION_FLUSH_NOSMT:
7694
case L1TF_MITIGATION_FULL:
7695
/*
7696
* Warn upon starting the first VM in a potentially
7697
* insecure environment.
7698
*/
7699
if (sched_smt_active())
7700
pr_warn_once(L1TF_MSG_SMT);
7701
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7702
pr_warn_once(L1TF_MSG_L1D);
7703
break;
7704
case L1TF_MITIGATION_FULL_FORCE:
7705
/* Flush is enforced */
7706
break;
7707
}
7708
}
7709
7710
if (enable_pml)
7711
kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
7712
return 0;
7713
}
7714
7715
static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
7716
{
7717
/*
7718
* Non-coherent DMA devices need the guest to flush CPU caches properly.
7719
* In that case it is not possible to map all guest RAM as WB, so
7720
* always trust guest PAT.
7721
*/
7722
return !kvm_arch_has_noncoherent_dma(kvm) &&
7723
kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
7724
}
7725
7726
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7727
{
7728
/*
7729
* Force UC for host MMIO regions, as allowing the guest to access MMIO
7730
* with cacheable accesses will result in Machine Checks.
7731
*/
7732
if (is_mmio)
7733
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7734
7735
/* Force WB if ignoring guest PAT */
7736
if (vmx_ignore_guest_pat(vcpu->kvm))
7737
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7738
7739
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
7740
}
7741
7742
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7743
{
7744
/*
7745
* These bits in the secondary execution controls field
7746
* are dynamic, the others are mostly based on the hypervisor
7747
* architecture and the guest's CPUID. Do not touch the
7748
* dynamic bits.
7749
*/
7750
u32 mask =
7751
SECONDARY_EXEC_SHADOW_VMCS |
7752
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7753
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7754
SECONDARY_EXEC_DESC;
7755
7756
u32 cur_ctl = secondary_exec_controls_get(vmx);
7757
7758
secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7759
}
7760
7761
/*
7762
* Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7763
* (indicating "allowed-1") if they are supported in the guest's CPUID.
7764
*/
7765
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7766
{
7767
struct vcpu_vmx *vmx = to_vmx(vcpu);
7768
struct kvm_cpuid_entry2 *entry;
7769
7770
vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7771
vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7772
7773
#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7774
if (entry && (entry->_reg & (_cpuid_mask))) \
7775
vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
7776
} while (0)
7777
7778
entry = kvm_find_cpuid_entry(vcpu, 0x1);
7779
cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
7780
cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
7781
cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
7782
cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
7783
cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
7784
cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
7785
cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
7786
cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
7787
cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
7788
cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7789
cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
7790
cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
7791
cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
7792
cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
7793
7794
entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
7795
cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
7796
cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
7797
cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
7798
cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
7799
cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
7800
cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
7801
cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK));
7802
cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT));
7803
7804
entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
7805
cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
7806
7807
#undef cr4_fixed1_update
7808
}
7809
7810
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7811
{
7812
struct vcpu_vmx *vmx = to_vmx(vcpu);
7813
struct kvm_cpuid_entry2 *best = NULL;
7814
int i;
7815
7816
for (i = 0; i < PT_CPUID_LEAVES; i++) {
7817
best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
7818
if (!best)
7819
return;
7820
vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7821
vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7822
vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7823
vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7824
}
7825
7826
/* Get the number of configurable Address Ranges for filtering */
7827
vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
7828
PT_CAP_num_address_ranges);
7829
7830
/* Initialize and clear the no dependency bits */
7831
vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7832
RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7833
RTIT_CTL_BRANCH_EN);
7834
7835
/*
7836
* If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
7837
* setting it will inject a #GP.
7838
*/
7839
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7840
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7841
7842
/*
7843
* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7844
* PSBFreq can be set
7845
*/
7846
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7847
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7848
RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7849
7850
/*
7851
* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7852
*/
7853
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7854
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7855
RTIT_CTL_MTC_RANGE);
7856
7857
/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7858
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7859
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7860
RTIT_CTL_PTW_EN);
7861
7862
/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7863
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7864
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7865
7866
/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7867
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7868
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7869
7870
/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7871
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7872
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7873
7874
/* unmask address range configure area */
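/*
* Each supported address range owns a 4-bit ADDRn_CFG field in
* RTIT_CTL starting at bit 32 (ADDR0_CFG is bits 35:32, ADDR1_CFG is
* bits 39:36, and so on), hence the 4-bit stride below.
*/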
7875
for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
7876
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7877
}
7878
7879
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7880
{
7881
struct vcpu_vmx *vmx = to_vmx(vcpu);
7882
7883
/*
7884
* XSAVES is effectively enabled if and only if XSAVE is also exposed
7885
* to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
7886
* set if and only if XSAVE is supported.
7887
*/
7888
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
7889
guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
7890
7891
vmx_setup_uret_msrs(vmx);
7892
7893
if (cpu_has_secondary_exec_ctrls())
7894
vmcs_set_secondary_exec_control(vmx,
7895
vmx_secondary_exec_control(vmx));
7896
7897
if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
7898
vmx->msr_ia32_feature_control_valid_bits |=
7899
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7900
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7901
else
7902
vmx->msr_ia32_feature_control_valid_bits &=
7903
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7904
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7905
7906
if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
7907
nested_vmx_cr_fixed1_bits_update(vcpu);
7908
7909
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7910
guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
7911
update_intel_pt_cfg(vcpu);
7912
7913
if (boot_cpu_has(X86_FEATURE_RTM)) {
7914
struct vmx_uret_msr *msr;
7915
msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7916
if (msr) {
7917
bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
7918
vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7919
}
7920
}
7921
7922
set_cr4_guest_host_mask(vmx);
7923
7924
vmx_write_encls_bitmap(vcpu, NULL);
7925
if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
7926
vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7927
else
7928
vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7929
7930
if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
7931
vmx->msr_ia32_feature_control_valid_bits |=
7932
FEAT_CTL_SGX_LC_ENABLED;
7933
else
7934
vmx->msr_ia32_feature_control_valid_bits &=
7935
~FEAT_CTL_SGX_LC_ENABLED;
7936
7937
/* Refresh #PF interception to account for MAXPHYADDR changes. */
7938
vmx_update_exception_bitmap(vcpu);
7939
}
7940
7941
static __init u64 vmx_get_perf_capabilities(void)
7942
{
7943
u64 perf_cap = PERF_CAP_FW_WRITES;
7944
u64 host_perf_cap = 0;
7945
7946
if (!enable_pmu)
7947
return 0;
7948
7949
if (boot_cpu_has(X86_FEATURE_PDCM))
7950
rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7951
7952
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
7953
x86_perf_get_lbr(&vmx_lbr_caps);
7954
7955
/*
7956
* KVM requires LBR callstack support, as the overhead due to
7957
* context switching LBRs without said support is too high.
7958
* See intel_pmu_create_guest_lbr_event() for more info.
7959
*/
7960
if (!vmx_lbr_caps.has_callstack)
7961
memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
7962
else if (vmx_lbr_caps.nr)
7963
perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
7964
}
7965
7966
if (vmx_pebs_supported()) {
7967
perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
7968
7969
/*
7970
* Disallow adaptive PEBS as it is functionally broken, can be
7971
* used by the guest to read *host* LBRs, and can be used to
7972
* bypass userspace event filters. To correctly and safely
7973
* support adaptive PEBS, KVM needs to:
7974
*
7975
* 1. Account for the ADAPTIVE flag when (re)programming fixed
7976
* counters.
7977
*
7978
* 2. Gain support from perf (or take direct control of counter
7979
* programming) to support events without adaptive PEBS
7980
* enabled for the hardware counter.
7981
*
7982
* 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
7983
* adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
7984
*
7985
* 4. Document which PMU events are effectively exposed to the
7986
* guest via adaptive PEBS, and make adaptive PEBS mutually
7987
* exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
7988
*/
7989
perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7990
}
7991
7992
return perf_cap;
7993
}
7994
7995
static __init void vmx_set_cpu_caps(void)
7996
{
7997
kvm_set_cpu_caps();
7998
7999
/* CPUID 0x1 */
8000
if (nested)
8001
kvm_cpu_cap_set(X86_FEATURE_VMX);
8002
8003
/* CPUID 0x7 */
8004
if (kvm_mpx_supported())
8005
kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
8006
if (!cpu_has_vmx_invpcid())
8007
kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
8008
if (vmx_pt_mode_is_host_guest())
8009
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
8010
if (vmx_pebs_supported()) {
8011
kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
8012
kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
8013
}
8014
8015
if (!enable_pmu)
8016
kvm_cpu_cap_clear(X86_FEATURE_PDCM);
8017
kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
8018
8019
if (!enable_sgx) {
8020
kvm_cpu_cap_clear(X86_FEATURE_SGX);
8021
kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
8022
kvm_cpu_cap_clear(X86_FEATURE_SGX1);
8023
kvm_cpu_cap_clear(X86_FEATURE_SGX2);
8024
kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
8025
}
8026
8027
if (vmx_umip_emulated())
8028
kvm_cpu_cap_set(X86_FEATURE_UMIP);
8029
8030
/* CPUID 0xD.1 */
8031
if (!cpu_has_vmx_xsaves())
8032
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
8033
8034
/* CPUID 0x80000001 and 0x7 (RDPID) */
8035
if (!cpu_has_vmx_rdtscp()) {
8036
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
8037
kvm_cpu_cap_clear(X86_FEATURE_RDPID);
8038
}
8039
8040
if (cpu_has_vmx_waitpkg())
8041
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
8042
8043
/*
8044
* Disable CET if unrestricted_guest is unsupported, as KVM doesn't
8045
* enforce CET hardware behaviors in the emulator. On platforms with
8046
* VMX_BASIC[bit56] == 0, injecting a #CP with an error code at
8047
* VM-Entry fails, so disable CET in that case too.
8048
*/
8049
if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
8050
!cpu_has_vmx_basic_no_hw_errcode_cc()) {
8051
kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
8052
kvm_cpu_cap_clear(X86_FEATURE_IBT);
8053
}
8054
8055
kvm_setup_xss_caps();
8056
}
8057
8058
static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
8059
struct x86_instruction_info *info,
8060
unsigned long *exit_qualification)
8061
{
8062
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8063
unsigned short port;
8064
int size;
8065
bool imm;
8066
8067
/*
8068
* If the 'use IO bitmaps' VM-execution control is 0, IO instruction
8069
* VM-exits depend on the 'unconditional IO exiting' VM-execution
8070
* control.
8071
*
8072
* Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
8073
*/
8074
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
8075
return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
8076
8077
if (info->intercept == x86_intercept_in ||
8078
info->intercept == x86_intercept_ins) {
8079
port = info->src_val;
8080
size = info->dst_bytes;
8081
imm = info->src_type == OP_IMM;
8082
} else {
8083
port = info->dst_val;
8084
size = info->src_bytes;
8085
imm = info->dst_type == OP_IMM;
8086
}
8087
8088
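/*
* Build the exit qualification in the format architected for I/O
* instruction VM-Exits: bits 2:0 hold (access size - 1), bits 31:16
* hold the port number, bit 4 flags a string instruction, bit 5 a REP
* prefix, and bit 6 an immediate port operand.
*/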
8089
*exit_qualification = ((unsigned long)port << 16) | (size - 1);
8090
8091
if (info->intercept == x86_intercept_ins ||
8092
info->intercept == x86_intercept_outs)
8093
*exit_qualification |= BIT(4);
8094
8095
if (info->rep_prefix)
8096
*exit_qualification |= BIT(5);
8097
8098
if (imm)
8099
*exit_qualification |= BIT(6);
8100
8101
return nested_vmx_check_io_bitmaps(vcpu, port, size);
8102
}
8103
8104
int vmx_check_intercept(struct kvm_vcpu *vcpu,
8105
struct x86_instruction_info *info,
8106
enum x86_intercept_stage stage,
8107
struct x86_exception *exception)
8108
{
8109
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8110
unsigned long exit_qualification = 0;
8111
u32 vm_exit_reason;
8112
u64 exit_insn_len;
8113
8114
switch (info->intercept) {
8115
case x86_intercept_rdpid:
8116
/*
8117
* RDPID causes #UD if not enabled through secondary execution
8118
* controls (ENABLE_RDTSCP). Note, the implicit MSR access to
8119
* TSC_AUX is NOT subject to interception, i.e. checking only
8120
* the dedicated execution control is architecturally correct.
8121
*/
8122
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
8123
exception->vector = UD_VECTOR;
8124
exception->error_code_valid = false;
8125
return X86EMUL_PROPAGATE_FAULT;
8126
}
8127
return X86EMUL_CONTINUE;
8128
8129
case x86_intercept_in:
8130
case x86_intercept_ins:
8131
case x86_intercept_out:
8132
case x86_intercept_outs:
8133
if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
8134
return X86EMUL_CONTINUE;
8135
8136
vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
8137
break;
8138
8139
case x86_intercept_lgdt:
8140
case x86_intercept_lidt:
8141
case x86_intercept_lldt:
8142
case x86_intercept_ltr:
8143
case x86_intercept_sgdt:
8144
case x86_intercept_sidt:
8145
case x86_intercept_sldt:
8146
case x86_intercept_str:
8147
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
8148
return X86EMUL_CONTINUE;
8149
8150
if (info->intercept == x86_intercept_lldt ||
8151
info->intercept == x86_intercept_ltr ||
8152
info->intercept == x86_intercept_sldt ||
8153
info->intercept == x86_intercept_str)
8154
vm_exit_reason = EXIT_REASON_LDTR_TR;
8155
else
8156
vm_exit_reason = EXIT_REASON_GDTR_IDTR;
8157
/*
8158
* FIXME: Decode the ModR/M to generate the correct exit
8159
* qualification for memory operands.
8160
*/
8161
break;
8162
8163
case x86_intercept_hlt:
8164
if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
8165
return X86EMUL_CONTINUE;
8166
8167
vm_exit_reason = EXIT_REASON_HLT;
8168
break;
8169
8170
case x86_intercept_pause:
8171
/*
8172
* PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
8173
* with vanilla NOPs in the emulator. Apply the interception
8174
* check only to actual PAUSE instructions. Don't check
8175
* PAUSE-loop-exiting, as software can't expect a given PAUSE to
8176
* exit, i.e. KVM is within its rights to allow L2 to execute
8177
* the PAUSE.
8178
*/
8179
if ((info->rep_prefix != REPE_PREFIX) ||
8180
!nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
8181
return X86EMUL_CONTINUE;
8182
8183
vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
8184
break;
8185
8186
/* TODO: check more intercepts... */
8187
default:
8188
return X86EMUL_UNHANDLEABLE;
8189
}
8190
8191
exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
8192
if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
8193
return X86EMUL_UNHANDLEABLE;
8194
8195
__nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
8196
exit_insn_len);
8197
return X86EMUL_INTERCEPTED;
8198
}
8199
8200
#ifdef CONFIG_X86_64
8201
/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
8202
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8203
u64 divisor, u64 *result)
8204
{
8205
u64 low = a << shift, high = a >> (64 - shift);
8206
8207
/* To avoid the overflow on divq */
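/*
* divq divides the 128-bit value high:low (RDX:RAX) by the divisor and
* faults with #DE if the quotient doesn't fit in 64 bits, which is the
* case if and only if high >= divisor.
*/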
8208
if (high >= divisor)
8209
return 1;
8210
8211
/* Low holds the result, high holds the remainder, which is discarded */
8212
asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8213
"rm" (divisor), "0" (low), "1" (high));
8214
*result = low;
8215
8216
return 0;
8217
}
8218
8219
int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8220
bool *expired)
8221
{
8222
struct vcpu_vmx *vmx;
8223
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8224
struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8225
8226
vmx = to_vmx(vcpu);
8227
tscl = rdtsc();
8228
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8229
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8230
lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8231
ktimer->timer_advance_ns);
8232
8233
if (delta_tsc > lapic_timer_advance_cycles)
8234
delta_tsc -= lapic_timer_advance_cycles;
8235
else
8236
delta_tsc = 0;
8237
8238
/* Convert to host delta tsc if tsc scaling is enabled */
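/*
* The scaling ratio is a fixed-point value with
* kvm_caps.tsc_scaling_ratio_frac_bits fractional bits, so dividing
* the (pre-shifted) guest delta by the ratio yields the host delta.
*/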
8239
if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
8240
delta_tsc && u64_shl_div_u64(delta_tsc,
8241
kvm_caps.tsc_scaling_ratio_frac_bits,
8242
vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
8243
return -ERANGE;
8244
8245
/*
8246
* If the delta TSC can't fit in 32 bits after the shift by
8247
* cpu_preemption_timer_multi, we can't use the preemption timer.
8248
* It's possible that it fits on later vmentries, but checking
8249
* on every vmentry is costly so we just use an hrtimer.
8250
*/
8251
if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8252
return -ERANGE;
8253
8254
vmx->hv_deadline_tsc = tscl + delta_tsc;
8255
*expired = !delta_tsc;
8256
return 0;
8257
}
8258
8259
void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8260
{
8261
to_vmx(vcpu)->hv_deadline_tsc = -1;
8262
}
8263
#endif
8264
8265
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8266
{
8267
struct vcpu_vmx *vmx = to_vmx(vcpu);
8268
8269
if (WARN_ON_ONCE(!enable_pml))
8270
return;
8271
8272
if (is_guest_mode(vcpu)) {
8273
vmx->nested.update_vmcs01_cpu_dirty_logging = true;
8274
return;
8275
}
8276
8277
/*
8278
* Note, nr_memslots_dirty_logging can be changed concurrent with this
8279
* code, but in that case another update request will be made and so
8280
* the guest will never run with a stale PML value.
8281
*/
8282
if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
8283
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8284
else
8285
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8286
}
8287
8288
void vmx_setup_mce(struct kvm_vcpu *vcpu)
8289
{
8290
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8291
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8292
FEAT_CTL_LMCE_ENABLED;
8293
else
8294
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8295
~FEAT_CTL_LMCE_ENABLED;
8296
}
8297
8298
#ifdef CONFIG_KVM_SMM
8299
int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8300
{
8301
/* we need a nested vmexit to enter SMM, postpone if run is pending */
8302
if (to_vmx(vcpu)->nested.nested_run_pending)
8303
return -EBUSY;
8304
return !is_smm(vcpu);
8305
}
8306
8307
int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
8308
{
8309
struct vcpu_vmx *vmx = to_vmx(vcpu);
8310
8311
/*
8312
* TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8313
* SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong
8314
* SMI and RSM only modify state that is saved and restored via SMRAM.
8315
* E.g. most MSRs are left untouched, but many are modified by VM-Exit
8316
* and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8317
*/
8318
vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8319
if (vmx->nested.smm.guest_mode)
8320
nested_vmx_vmexit(vcpu, -1, 0, 0);
8321
8322
vmx->nested.smm.vmxon = vmx->nested.vmxon;
8323
vmx->nested.vmxon = false;
8324
vmx_clear_hlt(vcpu);
8325
return 0;
8326
}
8327
8328
int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
8329
{
8330
struct vcpu_vmx *vmx = to_vmx(vcpu);
8331
int ret;
8332
8333
if (vmx->nested.smm.vmxon) {
8334
vmx->nested.vmxon = true;
8335
vmx->nested.smm.vmxon = false;
8336
}
8337
8338
if (vmx->nested.smm.guest_mode) {
8339
ret = nested_vmx_enter_non_root_mode(vcpu, false);
8340
if (ret)
8341
return ret;
8342
8343
vmx->nested.nested_run_pending = 1;
8344
vmx->nested.smm.guest_mode = false;
8345
}
8346
return 0;
8347
}
8348
8349
void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
8350
{
8351
/* RSM will cause a vmexit anyway. */
8352
}
8353
#endif
8354
8355
bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8356
{
8357
return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8358
}
8359
8360
void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8361
{
8362
if (is_guest_mode(vcpu)) {
8363
struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8364
8365
if (hrtimer_try_to_cancel(timer) == 1)
8366
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8367
}
8368
}
8369
8370
void vmx_hardware_unsetup(void)
8371
{
8372
kvm_set_posted_intr_wakeup_handler(NULL);
8373
8374
if (nested)
8375
nested_vmx_hardware_unsetup();
8376
8377
free_kvm_area();
8378
}
8379
8380
void vmx_vm_destroy(struct kvm *kvm)
8381
{
8382
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8383
8384
free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8385
}
8386
8387
/*
8388
* Note, the SDM states that the linear address is masked *after* the modified
8389
* canonicality check, whereas KVM masks (untags) the address and then performs
8390
* a "normal" canonicality check. Functionally, the two methods are identical,
8391
* and when the masking occurs relative to the canonicality check isn't visible
8392
* to software, i.e. KVM's behavior doesn't violate the SDM.
8393
*/
8394
gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
8395
{
8396
int lam_bit;
8397
unsigned long cr3_bits;
8398
8399
if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
8400
return gva;
8401
8402
if (!is_64_bit_mode(vcpu))
8403
return gva;
8404
8405
/*
8406
* Bit 63 determines if the address should be treated as user address
8407
* or a supervisor address.
8408
*/
8409
if (!(gva & BIT_ULL(63))) {
8410
cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
8411
if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
8412
return gva;
8413
8414
/* LAM_U48 is ignored if LAM_U57 is set. */
8415
lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
8416
} else {
8417
if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
8418
return gva;
8419
8420
lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
8421
}
8422
8423
/*
8424
* Untag the address by sign-extending the lam_bit, but NOT to bit 63.
8425
* Bit 63 is retained from the raw virtual address so that untagging
8426
* doesn't change a user access to a supervisor access, and vice versa.
8427
*/
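/*
* E.g. with LAM_U57 active (lam_bit == 56), bits 62:57 of a user
* pointer are replaced with copies of bit 56, while bits 56:0 and
* bit 63 are passed through unchanged.
*/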
8428
return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
8429
}
8430
8431
static unsigned int vmx_handle_intel_pt_intr(void)
8432
{
8433
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8434
8435
/* '0' on failure so that the !PT case can use a RET0 static call. */
8436
if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
8437
return 0;
8438
8439
kvm_make_request(KVM_REQ_PMI, vcpu);
8440
__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8441
(unsigned long *)&vcpu->arch.pmu.global_status);
8442
return 1;
8443
}
8444
8445
static __init void vmx_setup_user_return_msrs(void)
8446
{
8447
8448
/*
8449
* Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8450
* will emulate SYSCALL in legacy mode if the vendor string in guest
8451
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
8452
* support this emulation, MSR_STAR is included in the list for i386,
8453
* but is never loaded into hardware. MSR_CSTAR is also never loaded
8454
* into hardware and is here purely for emulation purposes.
8455
*/
8456
const u32 vmx_uret_msrs_list[] = {
8457
#ifdef CONFIG_X86_64
8458
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8459
#endif
8460
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8461
MSR_IA32_TSX_CTRL,
8462
};
8463
int i;
8464
8465
BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8466
8467
for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8468
kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
8469
}
8470
8471
static void __init vmx_setup_me_spte_mask(void)
8472
{
8473
u64 me_mask = 0;
8474
8475
/*
8476
* On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
8477
* kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
8478
* boot_cpu_data.x86_phys_bits holds the actual physical address
8479
* width without the KeyID bits, and kvm_host.maxphyaddr equals the
8480
* MAXPHYADDR reported by CPUID. The bits in between are the KeyID bits.
8481
*/
8482
if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
8483
me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8484
kvm_host.maxphyaddr - 1);
8485
8486
/*
8487
* Unlike SME, the host kernel doesn't support setting up any
8488
* MKTME KeyID on Intel platforms. No memory encryption
8489
* bits should be included in the SPTE.
8490
*/
8491
kvm_mmu_set_me_spte_mask(0, me_mask);
8492
}
8493
8494
__init int vmx_hardware_setup(void)
8495
{
8496
unsigned long host_bndcfgs;
8497
struct desc_ptr dt;
8498
int r;
8499
8500
store_idt(&dt);
8501
host_idt_base = dt.address;
8502
8503
vmx_setup_user_return_msrs();
8504
8505
8506
if (boot_cpu_has(X86_FEATURE_NX))
8507
kvm_enable_efer_bits(EFER_NX);
8508
8509
if (boot_cpu_has(X86_FEATURE_MPX)) {
8510
rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
8511
WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
8512
}
8513
8514
if (!cpu_has_vmx_mpx())
8515
kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8516
XFEATURE_MASK_BNDCSR);
8517
8518
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8519
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8520
enable_vpid = 0;
8521
8522
if (!cpu_has_vmx_ept() ||
8523
!cpu_has_vmx_ept_4levels() ||
8524
!cpu_has_vmx_ept_mt_wb() ||
8525
!cpu_has_vmx_invept_global())
8526
enable_ept = 0;
8527
8528
/* NX support is required for shadow paging. */
8529
if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8530
pr_err_ratelimited("NX (Execute Disable) not supported\n");
8531
return -EOPNOTSUPP;
8532
}
8533
8534
/*
8535
* Shadow paging doesn't have a (further) performance penalty
8536
* from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
8537
* by default
8538
*/
8539
if (!enable_ept)
8540
allow_smaller_maxphyaddr = true;
8541
8542
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8543
enable_ept_ad_bits = 0;
8544
8545
if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8546
enable_unrestricted_guest = 0;
8547
8548
if (!cpu_has_vmx_flexpriority())
8549
flexpriority_enabled = 0;
8550
8551
if (!cpu_has_virtual_nmis())
8552
enable_vnmi = 0;
8553
8554
#ifdef CONFIG_X86_SGX_KVM
8555
if (!cpu_has_vmx_encls_vmexit())
8556
enable_sgx = false;
8557
#endif
8558
8559
/*
8560
* set_apic_access_page_addr() is used to reload the APIC-access
8561
* page upon invalidation. No need to do anything if not
8562
* using the APIC_ACCESS_ADDR VMCS field.
8563
*/
8564
if (!flexpriority_enabled)
8565
vt_x86_ops.set_apic_access_page_addr = NULL;
8566
8567
if (!cpu_has_vmx_tpr_shadow())
8568
vt_x86_ops.update_cr8_intercept = NULL;
8569
8570
#if IS_ENABLED(CONFIG_HYPERV)
8571
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8572
&& enable_ept) {
8573
vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
8574
vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
8575
}
8576
#endif
8577
8578
if (!cpu_has_vmx_ple()) {
8579
ple_gap = 0;
8580
ple_window = 0;
8581
ple_window_grow = 0;
8582
ple_window_max = 0;
8583
ple_window_shrink = 0;
8584
}
8585
8586
if (!cpu_has_vmx_apicv())
8587
enable_apicv = 0;
8588
if (!enable_apicv)
8589
vt_x86_ops.sync_pir_to_irr = NULL;
8590
8591
if (!enable_apicv || !cpu_has_vmx_ipiv())
8592
enable_ipiv = false;
8593
8594
if (cpu_has_vmx_tsc_scaling())
8595
kvm_caps.has_tsc_control = true;
8596
8597
kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8598
kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8599
kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8600
kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
8601
8602
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8603
8604
if (enable_ept)
8605
kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8606
cpu_has_vmx_ept_execute_only());
8607
else
8608
vt_x86_ops.get_mt_mask = NULL;
8609
8610
/*
8611
* Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
8612
* bits to shadow_zero_check.
8613
*/
8614
vmx_setup_me_spte_mask();
8615
8616
kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
8617
ept_caps_to_lpage_level(vmx_capability.ept));
8618
8619
/*
8620
* Only enable PML when hardware supports PML feature, and both EPT
8621
* and EPT A/D bit features are enabled -- PML depends on them to work.
8622
*/
8623
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8624
enable_pml = 0;
8625
8626
if (!cpu_has_vmx_preemption_timer())
8627
enable_preemption_timer = false;
8628
8629
if (enable_preemption_timer) {
8630
u64 use_timer_freq = 5000ULL * 1000 * 1000;
8631
8632
cpu_preemption_timer_multi =
8633
vmx_misc_preemption_timer_rate(vmcs_config.misc);
8634
8635
if (tsc_khz)
8636
use_timer_freq = (u64)tsc_khz * 1000;
8637
use_timer_freq >>= cpu_preemption_timer_multi;
8638
8639
/*
8640
* KVM "disables" the preemption timer by setting it to its max
8641
* value. Don't use the timer if it might cause spurious exits
8642
* at a rate faster than 0.1 Hz (of uninterrupted guest time).
8643
*/
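/*
* I.e. require that the timer's maximum 32-bit count spans at least
* 10 seconds: 0xffffffff / use_timer_freq >= 10s is equivalent to
* use_timer_freq <= 0xffffffff / 10.
*/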
8644
if (use_timer_freq > 0xffffffffu / 10)
8645
enable_preemption_timer = false;
8646
}
8647
8648
if (!enable_preemption_timer) {
8649
vt_x86_ops.set_hv_timer = NULL;
8650
vt_x86_ops.cancel_hv_timer = NULL;
8651
}
8652
8653
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
8654
kvm_caps.supported_mce_cap |= MCG_CMCI_P;
8655
8656
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8657
return -EINVAL;
8658
if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
8659
pt_mode = PT_MODE_SYSTEM;
8660
if (pt_mode == PT_MODE_HOST_GUEST)
8661
vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8662
else
8663
vt_init_ops.handle_intel_pt_intr = NULL;
8664
8665
setup_default_sgx_lepubkeyhash();
8666
8667
vmx_set_cpu_caps();
8668
8669
/*
8670
* Configure nested capabilities after core CPU capabilities so that
8671
* nested support can be conditional on base support, e.g. so that KVM
8672
* can hide/show features based on kvm_cpu_cap_has().
8673
*/
8674
if (nested) {
8675
nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
8676
8677
r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8678
if (r)
8679
return r;
8680
}
8681
8682
r = alloc_kvm_area();
8683
if (r && nested)
8684
nested_vmx_hardware_unsetup();
8685
8686
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8687
8688
/*
8689
* On Intel CPUs that lack self-snoop feature, letting the guest control
8690
* memory types may result in unexpected behavior. So always ignore guest
8691
* PAT on those CPUs and map VM memory as write-back, not allowing userspace to
8692
* disable the quirk.
8693
*
8694
* On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
8695
* supported, UC is slow enough to cause issues with some older guests (e.g.
8696
* an old version of bochs driver uses ioremap() instead of ioremap_wc() to
8697
* map the video RAM, causing the Wayland desktop to fail to start
8698
* correctly). To avoid breaking those older guests that rely on KVM to force
8699
* memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
8700
* safer (for performance) default behavior.
8701
*
8702
* On top of this, non-coherent DMA devices need the guest to flush CPU
8703
* caches properly. This also requires honoring guest PAT, and is forced
8704
* independently of the quirk in vmx_ignore_guest_pat().
8705
*/
8706
if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
8707
kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
8708
8709
kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
8710
8711
return r;
8712
}
8713
8714
void vmx_exit(void)
8715
{
8716
allow_smaller_maxphyaddr = false;
8717
8718
vmx_cleanup_l1d_flush();
8719
8720
kvm_x86_vendor_exit();
8721
}
8722
8723
int __init vmx_init(void)
8724
{
8725
int r, cpu;
8726
8727
KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
8728
8729
if (!kvm_is_vmx_supported())
8730
return -EOPNOTSUPP;
8731
8732
/*
8733
* Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
8734
* i.e. there's nothing to unwind if a later step fails.
8735
*/
8736
hv_init_evmcs();
8737
8738
/*
8739
* Parse the VMCS config and VMX capabilities before anything else, so
8740
* that the information is available to all setup flows.
8741
*/
8742
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8743
return -EIO;
8744
8745
r = kvm_x86_vendor_init(&vt_init_ops);
8746
if (r)
8747
return r;
8748
8749
/* Must be called after common x86 init so enable_ept is setup. */
8750
r = vmx_setup_l1d_flush();
8751
if (r)
8752
goto err_l1d_flush;
8753
8754
for_each_possible_cpu(cpu) {
8755
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8756
8757
pi_init_cpu(cpu);
8758
}
8759
8760
vmx_check_vmcs12_offsets();
8761
8762
return 0;
8763
8764
err_l1d_flush:
8765
kvm_x86_vendor_exit();
8766
return r;
8767
}
8768
8769