Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/vmx/nested.c
50376 views
1
// SPDX-License-Identifier: GPL-2.0
2
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4
#include <linux/objtool.h>
5
#include <linux/percpu.h>
6
7
#include <asm/debugreg.h>
8
#include <asm/mmu_context.h>
9
#include <asm/msr.h>
10
11
#include "x86.h"
12
#include "cpuid.h"
13
#include "hyperv.h"
14
#include "mmu.h"
15
#include "nested.h"
16
#include "pmu.h"
17
#include "posted_intr.h"
18
#include "sgx.h"
19
#include "trace.h"
20
#include "vmx.h"
21
#include "smm.h"
22
#include "x86_ops.h"
23
24
static bool __read_mostly enable_shadow_vmcs = 1;
25
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
26
27
static bool __ro_after_init warn_on_missed_cc;
28
module_param(warn_on_missed_cc, bool, 0444);
29
30
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
31
32
/*
33
* Hyper-V requires all of these, so mark them as supported even though
34
* they are just treated the same as all-context.
35
*/
36
#define VMX_VPID_EXTENT_SUPPORTED_MASK \
37
(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
38
VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
39
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
40
VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
41
42
#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
43
44
enum {
45
VMX_VMREAD_BITMAP,
46
VMX_VMWRITE_BITMAP,
47
VMX_BITMAP_NR
48
};
49
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
50
51
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
52
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
53
54
struct shadow_vmcs_field {
55
u16 encoding;
56
u16 offset;
57
};
58
static struct shadow_vmcs_field shadow_read_only_fields[] = {
59
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
60
#include "vmcs_shadow_fields.h"
61
};
62
static int max_shadow_read_only_fields =
63
ARRAY_SIZE(shadow_read_only_fields);
64
65
static struct shadow_vmcs_field shadow_read_write_fields[] = {
66
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
67
#include "vmcs_shadow_fields.h"
68
};
69
static int max_shadow_read_write_fields =
70
ARRAY_SIZE(shadow_read_write_fields);
71
72
static void init_vmcs_shadow_fields(void)
73
{
74
int i, j;
75
76
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
77
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
78
79
for (i = j = 0; i < max_shadow_read_only_fields; i++) {
80
struct shadow_vmcs_field entry = shadow_read_only_fields[i];
81
u16 field = entry.encoding;
82
83
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
84
(i + 1 == max_shadow_read_only_fields ||
85
shadow_read_only_fields[i + 1].encoding != field + 1))
86
pr_err("Missing field from shadow_read_only_field %x\n",
87
field + 1);
88
89
clear_bit(field, vmx_vmread_bitmap);
90
if (field & 1)
91
#ifdef CONFIG_X86_64
92
continue;
93
#else
94
entry.offset += sizeof(u32);
95
#endif
96
shadow_read_only_fields[j++] = entry;
97
}
98
max_shadow_read_only_fields = j;
99
100
for (i = j = 0; i < max_shadow_read_write_fields; i++) {
101
struct shadow_vmcs_field entry = shadow_read_write_fields[i];
102
u16 field = entry.encoding;
103
104
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
105
(i + 1 == max_shadow_read_write_fields ||
106
shadow_read_write_fields[i + 1].encoding != field + 1))
107
pr_err("Missing field from shadow_read_write_field %x\n",
108
field + 1);
109
110
WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
111
field <= GUEST_TR_AR_BYTES,
112
"Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
113
114
/*
115
* PML and the preemption timer can be emulated, but the
116
* processor cannot vmwrite to fields that don't exist
117
* on bare metal.
118
*/
119
switch (field) {
120
case GUEST_PML_INDEX:
121
if (!cpu_has_vmx_pml())
122
continue;
123
break;
124
case VMX_PREEMPTION_TIMER_VALUE:
125
if (!cpu_has_vmx_preemption_timer())
126
continue;
127
break;
128
case GUEST_INTR_STATUS:
129
if (!cpu_has_vmx_apicv())
130
continue;
131
break;
132
default:
133
break;
134
}
135
136
clear_bit(field, vmx_vmwrite_bitmap);
137
clear_bit(field, vmx_vmread_bitmap);
138
if (field & 1)
139
#ifdef CONFIG_X86_64
140
continue;
141
#else
142
entry.offset += sizeof(u32);
143
#endif
144
shadow_read_write_fields[j++] = entry;
145
}
146
max_shadow_read_write_fields = j;
147
}
148
149
/*
150
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
151
* set the success or error code of an emulated VMX instruction (as specified
152
* by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
153
* instruction.
154
*/
155
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
156
{
157
vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
158
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
159
X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
160
return kvm_skip_emulated_instruction(vcpu);
161
}
162
163
static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
164
{
165
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
166
& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
167
X86_EFLAGS_SF | X86_EFLAGS_OF))
168
| X86_EFLAGS_CF);
169
return kvm_skip_emulated_instruction(vcpu);
170
}
171
172
static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
173
u32 vm_instruction_error)
174
{
175
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
176
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
177
X86_EFLAGS_SF | X86_EFLAGS_OF))
178
| X86_EFLAGS_ZF);
179
get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
180
/*
181
* We don't need to force sync to shadow VMCS because
182
* VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
183
* fields and thus must be synced.
184
*/
185
if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
186
to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
187
188
return kvm_skip_emulated_instruction(vcpu);
189
}
190
191
static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
192
{
193
struct vcpu_vmx *vmx = to_vmx(vcpu);
194
195
/*
196
* failValid writes the error number to the current VMCS, which
197
* can't be done if there isn't a current VMCS.
198
*/
199
if (vmx->nested.current_vmptr == INVALID_GPA &&
200
!nested_vmx_is_evmptr12_valid(vmx))
201
return nested_vmx_failInvalid(vcpu);
202
203
return nested_vmx_failValid(vcpu, vm_instruction_error);
204
}
205
206
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
207
{
208
/* TODO: not to reset guest simply here. */
209
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
210
pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
211
}
212
213
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
214
{
215
return fixed_bits_valid(control, low, high);
216
}
217
218
static inline u64 vmx_control_msr(u32 low, u32 high)
219
{
220
return low | ((u64)high << 32);
221
}
222
223
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
224
{
225
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
226
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
227
vmx->nested.need_vmcs12_to_shadow_sync = false;
228
}
229
230
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
231
{
232
#ifdef CONFIG_KVM_HYPERV
233
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
234
struct vcpu_vmx *vmx = to_vmx(vcpu);
235
236
kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
237
vmx->nested.hv_evmcs = NULL;
238
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
239
240
if (hv_vcpu) {
241
hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
242
hv_vcpu->nested.vm_id = 0;
243
hv_vcpu->nested.vp_id = 0;
244
}
245
#endif
246
}
247
248
static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
249
{
250
#ifdef CONFIG_KVM_HYPERV
251
struct vcpu_vmx *vmx = to_vmx(vcpu);
252
/*
253
* When Enlightened VMEntry is enabled on the calling CPU we treat
254
* memory area pointer by vmptr as Enlightened VMCS (as there's no good
255
* way to distinguish it from VMCS12) and we must not corrupt it by
256
* writing to the non-existent 'launch_state' field. The area doesn't
257
* have to be the currently active EVMCS on the calling CPU and there's
258
* nothing KVM has to do to transition it from 'active' to 'non-active'
259
* state. It is possible that the area will stay mapped as
260
* vmx->nested.hv_evmcs but this shouldn't be a problem.
261
*/
262
if (!guest_cpu_cap_has_evmcs(vcpu) ||
263
!evmptr_is_valid(nested_get_evmptr(vcpu)))
264
return false;
265
266
if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
267
nested_release_evmcs(vcpu);
268
269
return true;
270
#else
271
return false;
272
#endif
273
}
274
275
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
276
struct loaded_vmcs *prev)
277
{
278
struct vmcs_host_state *dest, *src;
279
280
if (unlikely(!vmx->vt.guest_state_loaded))
281
return;
282
283
src = &prev->host_state;
284
dest = &vmx->loaded_vmcs->host_state;
285
286
vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
287
dest->ldt_sel = src->ldt_sel;
288
#ifdef CONFIG_X86_64
289
dest->ds_sel = src->ds_sel;
290
dest->es_sel = src->es_sel;
291
#endif
292
}
293
294
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
295
{
296
struct vcpu_vmx *vmx = to_vmx(vcpu);
297
struct loaded_vmcs *prev;
298
int cpu;
299
300
if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
301
return;
302
303
cpu = get_cpu();
304
prev = vmx->loaded_vmcs;
305
vmx->loaded_vmcs = vmcs;
306
vmx_vcpu_load_vmcs(vcpu, cpu);
307
vmx_sync_vmcs_host_state(vmx, prev);
308
put_cpu();
309
310
vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
311
312
/*
313
* All lazily updated registers will be reloaded from VMCS12 on both
314
* vmentry and vmexit.
315
*/
316
vcpu->arch.regs_dirty = 0;
317
}
318
319
static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
320
{
321
struct vcpu_vmx *vmx = to_vmx(vcpu);
322
323
kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
324
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
325
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
326
vmx->nested.pi_desc = NULL;
327
}
328
329
/*
330
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
331
* just stops using VMX.
332
*/
333
static void free_nested(struct kvm_vcpu *vcpu)
334
{
335
struct vcpu_vmx *vmx = to_vmx(vcpu);
336
337
if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
338
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
339
340
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
341
return;
342
343
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
344
345
vmx->nested.vmxon = false;
346
vmx->nested.smm.vmxon = false;
347
vmx->nested.vmxon_ptr = INVALID_GPA;
348
free_vpid(vmx->nested.vpid02);
349
vmx->nested.posted_intr_nv = -1;
350
vmx->nested.current_vmptr = INVALID_GPA;
351
if (enable_shadow_vmcs) {
352
vmx_disable_shadow_vmcs(vmx);
353
vmcs_clear(vmx->vmcs01.shadow_vmcs);
354
free_vmcs(vmx->vmcs01.shadow_vmcs);
355
vmx->vmcs01.shadow_vmcs = NULL;
356
}
357
kfree(vmx->nested.cached_vmcs12);
358
vmx->nested.cached_vmcs12 = NULL;
359
kfree(vmx->nested.cached_shadow_vmcs12);
360
vmx->nested.cached_shadow_vmcs12 = NULL;
361
362
nested_put_vmcs12_pages(vcpu);
363
364
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
365
366
nested_release_evmcs(vcpu);
367
368
free_loaded_vmcs(&vmx->nested.vmcs02);
369
}
370
371
/*
372
* Ensure that the current vmcs of the logical processor is the
373
* vmcs01 of the vcpu before calling free_nested().
374
*/
375
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
376
{
377
vcpu_load(vcpu);
378
vmx_leave_nested(vcpu);
379
vcpu_put(vcpu);
380
}
381
382
#define EPTP_PA_MASK GENMASK_ULL(51, 12)
383
384
static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
385
{
386
return VALID_PAGE(root_hpa) &&
387
((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
388
}
389
390
static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
391
gpa_t addr)
392
{
393
unsigned long roots = 0;
394
uint i;
395
struct kvm_mmu_root_info *cached_root;
396
397
WARN_ON_ONCE(!mmu_is_nested(vcpu));
398
399
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
400
cached_root = &vcpu->arch.mmu->prev_roots[i];
401
402
if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
403
eptp))
404
roots |= KVM_MMU_ROOT_PREVIOUS(i);
405
}
406
if (roots)
407
kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
408
}
409
410
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
411
struct x86_exception *fault)
412
{
413
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
414
struct vcpu_vmx *vmx = to_vmx(vcpu);
415
unsigned long exit_qualification;
416
u32 vm_exit_reason;
417
418
if (vmx->nested.pml_full) {
419
vm_exit_reason = EXIT_REASON_PML_FULL;
420
vmx->nested.pml_full = false;
421
422
/*
423
* It should be impossible to trigger a nested PML Full VM-Exit
424
* for anything other than an EPT Violation from L2. KVM *can*
425
* trigger nEPT page fault injection in response to an EPT
426
* Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
427
* tables also changed, but KVM should not treat EPT Misconfig
428
* VM-Exits as writes.
429
*/
430
WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
431
432
/*
433
* PML Full and EPT Violation VM-Exits both use bit 12 to report
434
* "NMI unblocking due to IRET", i.e. the bit can be propagated
435
* as-is from the original EXIT_QUALIFICATION.
436
*/
437
exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
438
} else {
439
if (fault->error_code & PFERR_RSVD_MASK) {
440
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
441
exit_qualification = 0;
442
} else {
443
exit_qualification = fault->exit_qualification;
444
exit_qualification |= vmx_get_exit_qual(vcpu) &
445
(EPT_VIOLATION_GVA_IS_VALID |
446
EPT_VIOLATION_GVA_TRANSLATED);
447
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
448
}
449
450
/*
451
* Although the caller (kvm_inject_emulated_page_fault) would
452
* have already synced the faulting address in the shadow EPT
453
* tables for the current EPTP12, we also need to sync it for
454
* any other cached EPTP02s based on the same EP4TA, since the
455
* TLB associates mappings to the EP4TA rather than the full EPTP.
456
*/
457
nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
458
fault->address);
459
}
460
461
nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
462
vmcs12->guest_physical_address = fault->address;
463
}
464
465
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
466
{
467
struct vcpu_vmx *vmx = to_vmx(vcpu);
468
bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
469
int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
470
471
kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
472
nested_ept_ad_enabled(vcpu),
473
nested_ept_get_eptp(vcpu));
474
}
475
476
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
477
{
478
WARN_ON(mmu_is_nested(vcpu));
479
480
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
481
nested_ept_new_eptp(vcpu);
482
vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
483
vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
484
vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
485
486
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
487
}
488
489
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
490
{
491
vcpu->arch.mmu = &vcpu->arch.root_mmu;
492
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
493
}
494
495
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
496
u16 error_code)
497
{
498
bool inequality, bit;
499
500
bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
501
inequality =
502
(error_code & vmcs12->page_fault_error_code_mask) !=
503
vmcs12->page_fault_error_code_match;
504
return inequality ^ bit;
505
}
506
507
static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
508
u32 error_code)
509
{
510
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
511
512
/*
513
* Drop bits 31:16 of the error code when performing the #PF mask+match
514
* check. All VMCS fields involved are 32 bits, but Intel CPUs never
515
* set bits 31:16 and VMX disallows setting bits 31:16 in the injected
516
* error code. Including the to-be-dropped bits in the check might
517
* result in an "impossible" or missed exit from L1's perspective.
518
*/
519
if (vector == PF_VECTOR)
520
return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
521
522
return (vmcs12->exception_bitmap & (1u << vector));
523
}
524
525
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
526
struct vmcs12 *vmcs12)
527
{
528
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
529
return 0;
530
531
if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
532
CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
533
return -EINVAL;
534
535
return 0;
536
}
537
538
static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
539
struct vmcs12 *vmcs12)
540
{
541
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
542
return 0;
543
544
if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
545
return -EINVAL;
546
547
return 0;
548
}
549
550
static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
551
struct vmcs12 *vmcs12)
552
{
553
if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
554
return 0;
555
556
if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
557
return -EINVAL;
558
559
if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
560
return -EINVAL;
561
562
return 0;
563
}
564
565
/*
566
* For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
567
* itself utilizing x2APIC. All MSRs were previously set to be intercepted,
568
* only the "disable intercept" case needs to be handled.
569
*/
570
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
571
unsigned long *msr_bitmap_l0,
572
u32 msr, int type)
573
{
574
if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
575
vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
576
577
if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
578
vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
579
}
580
581
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
582
{
583
int msr;
584
585
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
586
unsigned word = msr / BITS_PER_LONG;
587
588
msr_bitmap[word] = ~0;
589
msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
590
}
591
}
592
593
#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
594
static inline \
595
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
596
unsigned long *msr_bitmap_l1, \
597
unsigned long *msr_bitmap_l0, u32 msr) \
598
{ \
599
if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
600
vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
601
vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
602
else \
603
vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
604
}
605
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
606
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
607
608
static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
609
unsigned long *msr_bitmap_l1,
610
unsigned long *msr_bitmap_l0,
611
u32 msr, int types)
612
{
613
if (types & MSR_TYPE_R)
614
nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
615
msr_bitmap_l0, msr);
616
if (types & MSR_TYPE_W)
617
nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
618
msr_bitmap_l0, msr);
619
}
620
621
/*
622
* Merge L0's and L1's MSR bitmap, return false to indicate that
623
* we do not use the hardware.
624
*/
625
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
626
struct vmcs12 *vmcs12)
627
{
628
struct vcpu_vmx *vmx = to_vmx(vcpu);
629
int msr;
630
unsigned long *msr_bitmap_l1;
631
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
632
struct kvm_host_map map;
633
634
/* Nothing to do if the MSR bitmap is not in use. */
635
if (!cpu_has_vmx_msr_bitmap() ||
636
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
637
return false;
638
639
/*
640
* MSR bitmap update can be skipped when:
641
* - MSR bitmap for L1 hasn't changed.
642
* - Nested hypervisor (L1) is attempting to launch the same L2 as
643
* before.
644
* - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
645
* and tells KVM (L0) there were no changes in MSR bitmap for L2.
646
*/
647
if (!vmx->nested.force_msr_bitmap_recalc) {
648
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
649
650
if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
651
evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
652
return true;
653
}
654
655
if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
656
return false;
657
658
msr_bitmap_l1 = (unsigned long *)map.hva;
659
660
/*
661
* To keep the control flow simple, pay eight 8-byte writes (sixteen
662
* 4-byte writes on 32-bit systems) up front to enable intercepts for
663
* the x2APIC MSR range and selectively toggle those relevant to L2.
664
*/
665
enable_x2apic_msr_intercepts(msr_bitmap_l0);
666
667
if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
668
if (nested_cpu_has_apic_reg_virt(vmcs12)) {
669
/*
670
* L0 need not intercept reads for MSRs between 0x800
671
* and 0x8ff, it just lets the processor take the value
672
* from the virtual-APIC page; take those 256 bits
673
* directly from the L1 bitmap.
674
*/
675
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
676
unsigned word = msr / BITS_PER_LONG;
677
678
msr_bitmap_l0[word] = msr_bitmap_l1[word];
679
}
680
}
681
682
nested_vmx_disable_intercept_for_x2apic_msr(
683
msr_bitmap_l1, msr_bitmap_l0,
684
X2APIC_MSR(APIC_TASKPRI),
685
MSR_TYPE_R | MSR_TYPE_W);
686
687
if (nested_cpu_has_vid(vmcs12)) {
688
nested_vmx_disable_intercept_for_x2apic_msr(
689
msr_bitmap_l1, msr_bitmap_l0,
690
X2APIC_MSR(APIC_EOI),
691
MSR_TYPE_W);
692
nested_vmx_disable_intercept_for_x2apic_msr(
693
msr_bitmap_l1, msr_bitmap_l0,
694
X2APIC_MSR(APIC_SELF_IPI),
695
MSR_TYPE_W);
696
}
697
}
698
699
/*
700
* Always check vmcs01's bitmap to honor userspace MSR filters and any
701
* other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
702
*/
703
#ifdef CONFIG_X86_64
704
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
705
MSR_FS_BASE, MSR_TYPE_RW);
706
707
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
708
MSR_GS_BASE, MSR_TYPE_RW);
709
710
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
711
MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
712
#endif
713
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
714
MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
715
716
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
717
MSR_IA32_PRED_CMD, MSR_TYPE_W);
718
719
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
720
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
721
722
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
723
MSR_IA32_APERF, MSR_TYPE_R);
724
725
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
726
MSR_IA32_MPERF, MSR_TYPE_R);
727
728
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
729
MSR_IA32_U_CET, MSR_TYPE_RW);
730
731
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
732
MSR_IA32_S_CET, MSR_TYPE_RW);
733
734
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
735
MSR_IA32_PL0_SSP, MSR_TYPE_RW);
736
737
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
738
MSR_IA32_PL1_SSP, MSR_TYPE_RW);
739
740
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
741
MSR_IA32_PL2_SSP, MSR_TYPE_RW);
742
743
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
744
MSR_IA32_PL3_SSP, MSR_TYPE_RW);
745
746
kvm_vcpu_unmap(vcpu, &map);
747
748
vmx->nested.force_msr_bitmap_recalc = false;
749
750
return true;
751
}
752
753
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
754
struct vmcs12 *vmcs12)
755
{
756
struct vcpu_vmx *vmx = to_vmx(vcpu);
757
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
758
759
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
760
vmcs12->vmcs_link_pointer == INVALID_GPA)
761
return;
762
763
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
764
kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
765
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
766
return;
767
768
kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
769
VMCS12_SIZE);
770
}
771
772
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
773
struct vmcs12 *vmcs12)
774
{
775
struct vcpu_vmx *vmx = to_vmx(vcpu);
776
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
777
778
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
779
vmcs12->vmcs_link_pointer == INVALID_GPA)
780
return;
781
782
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
783
kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
784
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
785
return;
786
787
kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
788
VMCS12_SIZE);
789
}
790
791
/*
792
* In nested virtualization, check if L1 has set
793
* VM_EXIT_ACK_INTR_ON_EXIT
794
*/
795
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
796
{
797
return get_vmcs12(vcpu)->vm_exit_controls &
798
VM_EXIT_ACK_INTR_ON_EXIT;
799
}
800
801
static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
802
struct vmcs12 *vmcs12)
803
{
804
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
805
CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
806
return -EINVAL;
807
else
808
return 0;
809
}
810
811
static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
812
struct vmcs12 *vmcs12)
813
{
814
if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
815
!nested_cpu_has_apic_reg_virt(vmcs12) &&
816
!nested_cpu_has_vid(vmcs12) &&
817
!nested_cpu_has_posted_intr(vmcs12))
818
return 0;
819
820
/*
821
* If virtualize x2apic mode is enabled,
822
* virtualize apic access must be disabled.
823
*/
824
if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
825
nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
826
return -EINVAL;
827
828
/*
829
* If virtual interrupt delivery is enabled,
830
* we must exit on external interrupts.
831
*/
832
if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
833
return -EINVAL;
834
835
/*
836
* bits 15:8 should be zero in posted_intr_nv,
837
* the descriptor address has been already checked
838
* in nested_get_vmcs12_pages.
839
*
840
* bits 5:0 of posted_intr_desc_addr should be zero.
841
*/
842
if (nested_cpu_has_posted_intr(vmcs12) &&
843
(CC(!nested_cpu_has_vid(vmcs12)) ||
844
CC(!nested_exit_intr_ack_set(vcpu)) ||
845
CC((vmcs12->posted_intr_nv & 0xff00)) ||
846
CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
847
return -EINVAL;
848
849
/* tpr shadow is needed by all apicv features. */
850
if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
851
return -EINVAL;
852
853
return 0;
854
}
855
856
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
857
{
858
struct vcpu_vmx *vmx = to_vmx(vcpu);
859
u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
860
vmx->nested.msrs.misc_high);
861
862
return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
863
}
864
865
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
866
u32 count, u64 addr)
867
{
868
if (count == 0)
869
return 0;
870
871
/*
872
* Exceeding the limit results in architecturally _undefined_ behavior,
873
* i.e. KVM is allowed to do literally anything in response to a bad
874
* limit. Immediately generate a consistency check so that code that
875
* consumes the count doesn't need to worry about extreme edge cases.
876
*/
877
if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
878
return -EINVAL;
879
880
if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
881
!kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
882
return -EINVAL;
883
884
return 0;
885
}
886
887
static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
888
struct vmcs12 *vmcs12)
889
{
890
if (CC(nested_vmx_check_msr_switch(vcpu,
891
vmcs12->vm_exit_msr_load_count,
892
vmcs12->vm_exit_msr_load_addr)) ||
893
CC(nested_vmx_check_msr_switch(vcpu,
894
vmcs12->vm_exit_msr_store_count,
895
vmcs12->vm_exit_msr_store_addr)))
896
return -EINVAL;
897
898
return 0;
899
}
900
901
static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
902
struct vmcs12 *vmcs12)
903
{
904
if (CC(nested_vmx_check_msr_switch(vcpu,
905
vmcs12->vm_entry_msr_load_count,
906
vmcs12->vm_entry_msr_load_addr)))
907
return -EINVAL;
908
909
return 0;
910
}
911
912
static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
913
struct vmcs12 *vmcs12)
914
{
915
if (!nested_cpu_has_pml(vmcs12))
916
return 0;
917
918
if (CC(!nested_cpu_has_ept(vmcs12)) ||
919
CC(!page_address_valid(vcpu, vmcs12->pml_address)))
920
return -EINVAL;
921
922
return 0;
923
}
924
925
static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
926
struct vmcs12 *vmcs12)
927
{
928
if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
929
!nested_cpu_has_ept(vmcs12)))
930
return -EINVAL;
931
return 0;
932
}
933
934
static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
935
struct vmcs12 *vmcs12)
936
{
937
if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
938
!nested_cpu_has_ept(vmcs12)))
939
return -EINVAL;
940
return 0;
941
}
942
943
static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
944
struct vmcs12 *vmcs12)
945
{
946
if (!nested_cpu_has_shadow_vmcs(vmcs12))
947
return 0;
948
949
if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
950
CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
951
return -EINVAL;
952
953
return 0;
954
}
955
956
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
957
struct vmx_msr_entry *e)
958
{
959
/* x2APIC MSR accesses are not allowed */
960
if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
961
return -EINVAL;
962
if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
963
CC(e->index == MSR_IA32_UCODE_REV))
964
return -EINVAL;
965
if (CC(e->reserved != 0))
966
return -EINVAL;
967
return 0;
968
}
969
970
static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
971
struct vmx_msr_entry *e)
972
{
973
if (CC(e->index == MSR_FS_BASE) ||
974
CC(e->index == MSR_GS_BASE) ||
975
CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
976
nested_vmx_msr_check_common(vcpu, e))
977
return -EINVAL;
978
return 0;
979
}
980
981
static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
982
struct vmx_msr_entry *e)
983
{
984
if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
985
nested_vmx_msr_check_common(vcpu, e))
986
return -EINVAL;
987
return 0;
988
}
989
990
/*
991
* Load guest's/host's msr at nested entry/exit.
992
* return 0 for success, entry index for failure.
993
*
994
* One of the failure modes for MSR load/store is when a list exceeds the
995
* virtual hardware's capacity. To maintain compatibility with hardware inasmuch
996
* as possible, process all valid entries before failing rather than precheck
997
* for a capacity violation.
998
*/
999
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
1000
{
1001
u32 i;
1002
struct vmx_msr_entry e;
1003
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1004
1005
for (i = 0; i < count; i++) {
1006
if (WARN_ON_ONCE(i >= max_msr_list_size))
1007
goto fail;
1008
1009
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
1010
&e, sizeof(e))) {
1011
pr_debug_ratelimited(
1012
"%s cannot read MSR entry (%u, 0x%08llx)\n",
1013
__func__, i, gpa + i * sizeof(e));
1014
goto fail;
1015
}
1016
if (nested_vmx_load_msr_check(vcpu, &e)) {
1017
pr_debug_ratelimited(
1018
"%s check failed (%u, 0x%x, 0x%x)\n",
1019
__func__, i, e.index, e.reserved);
1020
goto fail;
1021
}
1022
if (kvm_emulate_msr_write(vcpu, e.index, e.value)) {
1023
pr_debug_ratelimited(
1024
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1025
__func__, i, e.index, e.value);
1026
goto fail;
1027
}
1028
}
1029
return 0;
1030
fail:
1031
/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
1032
return i + 1;
1033
}
1034
1035
static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
1036
u32 msr_index,
1037
u64 *data)
1038
{
1039
struct vcpu_vmx *vmx = to_vmx(vcpu);
1040
1041
/*
1042
* If the L0 hypervisor stored a more accurate value for the TSC that
1043
* does not include the time taken for emulation of the L2->L1
1044
* VM-exit in L0, use the more accurate value.
1045
*/
1046
if (msr_index == MSR_IA32_TSC) {
1047
int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
1048
MSR_IA32_TSC);
1049
1050
if (i >= 0) {
1051
u64 val = vmx->msr_autostore.guest.val[i].value;
1052
1053
*data = kvm_read_l1_tsc(vcpu, val);
1054
return true;
1055
}
1056
}
1057
1058
if (kvm_emulate_msr_read(vcpu, msr_index, data)) {
1059
pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
1060
msr_index);
1061
return false;
1062
}
1063
return true;
1064
}
1065
1066
static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
1067
struct vmx_msr_entry *e)
1068
{
1069
if (kvm_vcpu_read_guest(vcpu,
1070
gpa + i * sizeof(*e),
1071
e, 2 * sizeof(u32))) {
1072
pr_debug_ratelimited(
1073
"%s cannot read MSR entry (%u, 0x%08llx)\n",
1074
__func__, i, gpa + i * sizeof(*e));
1075
return false;
1076
}
1077
if (nested_vmx_store_msr_check(vcpu, e)) {
1078
pr_debug_ratelimited(
1079
"%s check failed (%u, 0x%x, 0x%x)\n",
1080
__func__, i, e->index, e->reserved);
1081
return false;
1082
}
1083
return true;
1084
}
1085
1086
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
1087
{
1088
u64 data;
1089
u32 i;
1090
struct vmx_msr_entry e;
1091
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1092
1093
for (i = 0; i < count; i++) {
1094
if (WARN_ON_ONCE(i >= max_msr_list_size))
1095
return -EINVAL;
1096
1097
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1098
return -EINVAL;
1099
1100
if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1101
return -EINVAL;
1102
1103
if (kvm_vcpu_write_guest(vcpu,
1104
gpa + i * sizeof(e) +
1105
offsetof(struct vmx_msr_entry, value),
1106
&data, sizeof(data))) {
1107
pr_debug_ratelimited(
1108
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1109
__func__, i, e.index, data);
1110
return -EINVAL;
1111
}
1112
}
1113
return 0;
1114
}
1115
1116
static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1117
{
1118
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1119
u32 count = vmcs12->vm_exit_msr_store_count;
1120
u64 gpa = vmcs12->vm_exit_msr_store_addr;
1121
struct vmx_msr_entry e;
1122
u32 i;
1123
1124
for (i = 0; i < count; i++) {
1125
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1126
return false;
1127
1128
if (e.index == msr_index)
1129
return true;
1130
}
1131
return false;
1132
}
1133
1134
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1135
u32 msr_index)
1136
{
1137
struct vcpu_vmx *vmx = to_vmx(vcpu);
1138
struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1139
bool in_vmcs12_store_list;
1140
int msr_autostore_slot;
1141
bool in_autostore_list;
1142
int last;
1143
1144
msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1145
in_autostore_list = msr_autostore_slot >= 0;
1146
in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1147
1148
if (in_vmcs12_store_list && !in_autostore_list) {
1149
if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
1150
/*
1151
* Emulated VMEntry does not fail here. Instead a less
1152
* accurate value will be returned by
1153
* nested_vmx_get_vmexit_msr_value() by reading KVM's
1154
* internal MSR state instead of reading the value from
1155
* the vmcs02 VMExit MSR-store area.
1156
*/
1157
pr_warn_ratelimited(
1158
"Not enough msr entries in msr_autostore. Can't add msr %x\n",
1159
msr_index);
1160
return;
1161
}
1162
last = autostore->nr++;
1163
autostore->val[last].index = msr_index;
1164
} else if (!in_vmcs12_store_list && in_autostore_list) {
1165
last = --autostore->nr;
1166
autostore->val[msr_autostore_slot] = autostore->val[last];
1167
}
1168
}
1169
1170
/*
1171
* Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
1172
* emulating VM-Entry into a guest with EPT enabled. On failure, the expected
1173
* Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1174
* @entry_failure_code.
1175
*/
1176
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1177
bool nested_ept, bool reload_pdptrs,
1178
enum vm_entry_failure_code *entry_failure_code)
1179
{
1180
if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
1181
*entry_failure_code = ENTRY_FAIL_DEFAULT;
1182
return -EINVAL;
1183
}
1184
1185
/*
1186
* If PAE paging and EPT are both on, CR3 is not used by the CPU and
1187
* must not be dereferenced.
1188
*/
1189
if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
1190
CC(!load_pdptrs(vcpu, cr3))) {
1191
*entry_failure_code = ENTRY_FAIL_PDPTE;
1192
return -EINVAL;
1193
}
1194
1195
vcpu->arch.cr3 = cr3;
1196
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
1197
1198
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
1199
kvm_init_mmu(vcpu);
1200
1201
if (!nested_ept)
1202
kvm_mmu_new_pgd(vcpu, cr3);
1203
1204
return 0;
1205
}
1206
1207
/*
1208
* Returns if KVM is able to config CPU to tag TLB entries
1209
* populated by L2 differently than TLB entries populated
1210
* by L1.
1211
*
1212
* If L0 uses EPT, L1 and L2 run with different EPTP because
1213
* guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1214
* are tagged with different EPTP.
1215
*
1216
* If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1217
* with different VPID (L1 entries are tagged with vmx->vpid
1218
* while L2 entries are tagged with vmx->nested.vpid02).
1219
*/
1220
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1221
{
1222
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1223
1224
return enable_ept ||
1225
(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1226
}
1227
1228
static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1229
struct vmcs12 *vmcs12,
1230
bool is_vmenter)
1231
{
1232
struct vcpu_vmx *vmx = to_vmx(vcpu);
1233
1234
/* Handle pending Hyper-V TLB flush requests */
1235
kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
1236
1237
/*
1238
* If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
1239
* same VPID as the host, and so architecturally, linear and combined
1240
* mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
1241
* emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
1242
* and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
1243
* is required if VPID is disabled in KVM, as a TLB flush (there are no
1244
* VPIDs) still occurs from L1's perspective, and KVM may need to
1245
* synchronize the MMU in response to the guest TLB flush.
1246
*
1247
* Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
1248
* EPT is a special snowflake, as guest-physical mappings aren't
1249
* flushed on VPID invalidations, including VM-Enter or VM-Exit with
1250
* VPID disabled. As a result, KVM _never_ needs to sync nEPT
1251
* entries on VM-Enter because L1 can't rely on VM-Enter to flush
1252
* those mappings.
1253
*/
1254
if (!nested_cpu_has_vpid(vmcs12)) {
1255
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1256
return;
1257
}
1258
1259
/* L2 should never have a VPID if VPID is disabled. */
1260
WARN_ON(!enable_vpid);
1261
1262
/*
1263
* VPID is enabled and in use by vmcs12. If vpid12 is changing, then
1264
* emulate a guest TLB flush as KVM does not track vpid12 history nor
1265
* is the VPID incorporated into the MMU context. I.e. KVM must assume
1266
* that the new vpid12 has never been used and thus represents a new
1267
* guest ASID that cannot have entries in the TLB.
1268
*/
1269
if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1270
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1271
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1272
return;
1273
}
1274
1275
/*
1276
* If VPID is enabled, used by vmc12, and vpid12 is not changing but
1277
* does not have a unique TLB tag (ASID), i.e. EPT is disabled and
1278
* KVM was unable to allocate a VPID for L2, flush the current context
1279
* as the effective ASID is common to both L1 and L2.
1280
*/
1281
if (!nested_has_guest_tlb_tag(vcpu))
1282
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1283
}
1284
1285
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1286
{
1287
superset &= mask;
1288
subset &= mask;
1289
1290
return (superset | subset) == superset;
1291
}
1292
1293
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1294
{
1295
const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
1296
VMX_BASIC_INOUT |
1297
VMX_BASIC_TRUE_CTLS |
1298
VMX_BASIC_NO_HW_ERROR_CODE_CC;
1299
1300
const u64 reserved_bits = GENMASK_ULL(63, 57) |
1301
GENMASK_ULL(47, 45) |
1302
BIT_ULL(31);
1303
1304
u64 vmx_basic = vmcs_config.nested.basic;
1305
1306
BUILD_BUG_ON(feature_bits & reserved_bits);
1307
1308
/*
1309
* Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
1310
* inverted polarity), the incoming value must not set feature bits or
1311
* reserved bits that aren't allowed/supported by KVM. Fields, i.e.
1312
* multi-bit values, are explicitly checked below.
1313
*/
1314
if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
1315
return -EINVAL;
1316
1317
/*
1318
* KVM does not emulate a version of VMX that constrains physical
1319
* addresses of VMX structures (e.g. VMCS) to 32-bits.
1320
*/
1321
if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
1322
return -EINVAL;
1323
1324
if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1325
vmx_basic_vmcs_revision_id(data))
1326
return -EINVAL;
1327
1328
if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1329
return -EINVAL;
1330
1331
vmx->nested.msrs.basic = data;
1332
return 0;
1333
}
1334
1335
static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
1336
u32 **low, u32 **high)
1337
{
1338
switch (msr_index) {
1339
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1340
*low = &msrs->pinbased_ctls_low;
1341
*high = &msrs->pinbased_ctls_high;
1342
break;
1343
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1344
*low = &msrs->procbased_ctls_low;
1345
*high = &msrs->procbased_ctls_high;
1346
break;
1347
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1348
*low = &msrs->exit_ctls_low;
1349
*high = &msrs->exit_ctls_high;
1350
break;
1351
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1352
*low = &msrs->entry_ctls_low;
1353
*high = &msrs->entry_ctls_high;
1354
break;
1355
case MSR_IA32_VMX_PROCBASED_CTLS2:
1356
*low = &msrs->secondary_ctls_low;
1357
*high = &msrs->secondary_ctls_high;
1358
break;
1359
default:
1360
BUG();
1361
}
1362
}
1363
1364
static int
1365
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1366
{
1367
u32 *lowp, *highp;
1368
u64 supported;
1369
1370
vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
1371
1372
supported = vmx_control_msr(*lowp, *highp);
1373
1374
/* Check must-be-1 bits are still 1. */
1375
if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1376
return -EINVAL;
1377
1378
/* Check must-be-0 bits are still 0. */
1379
if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1380
return -EINVAL;
1381
1382
vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
1383
*lowp = data;
1384
*highp = data >> 32;
1385
return 0;
1386
}
1387
1388
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1389
{
1390
const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
1391
VMX_MISC_ACTIVITY_HLT |
1392
VMX_MISC_ACTIVITY_SHUTDOWN |
1393
VMX_MISC_ACTIVITY_WAIT_SIPI |
1394
VMX_MISC_INTEL_PT |
1395
VMX_MISC_RDMSR_IN_SMM |
1396
VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
1397
VMX_MISC_VMXOFF_BLOCK_SMI |
1398
VMX_MISC_ZERO_LEN_INS;
1399
1400
const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
1401
1402
u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
1403
vmcs_config.nested.misc_high);
1404
1405
BUILD_BUG_ON(feature_bits & reserved_bits);
1406
1407
/*
1408
* The incoming value must not set feature bits or reserved bits that
1409
* aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
1410
* explicitly checked below.
1411
*/
1412
if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
1413
return -EINVAL;
1414
1415
if ((vmx->nested.msrs.pinbased_ctls_high &
1416
PIN_BASED_VMX_PREEMPTION_TIMER) &&
1417
vmx_misc_preemption_timer_rate(data) !=
1418
vmx_misc_preemption_timer_rate(vmx_misc))
1419
return -EINVAL;
1420
1421
if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1422
return -EINVAL;
1423
1424
if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1425
return -EINVAL;
1426
1427
if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1428
return -EINVAL;
1429
1430
vmx->nested.msrs.misc_low = data;
1431
vmx->nested.msrs.misc_high = data >> 32;
1432
1433
return 0;
1434
}
1435
1436
static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1437
{
1438
u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
1439
vmcs_config.nested.vpid_caps);
1440
1441
/* Every bit is either reserved or a feature bit. */
1442
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1443
return -EINVAL;
1444
1445
vmx->nested.msrs.ept_caps = data;
1446
vmx->nested.msrs.vpid_caps = data >> 32;
1447
return 0;
1448
}
1449
1450
static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
1451
{
1452
switch (msr_index) {
1453
case MSR_IA32_VMX_CR0_FIXED0:
1454
return &msrs->cr0_fixed0;
1455
case MSR_IA32_VMX_CR4_FIXED0:
1456
return &msrs->cr4_fixed0;
1457
default:
1458
BUG();
1459
}
1460
}
1461
1462
static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1463
{
1464
const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
1465
1466
/*
1467
* 1 bits (which indicates bits which "must-be-1" during VMX operation)
1468
* must be 1 in the restored value.
1469
*/
1470
if (!is_bitwise_subset(data, *msr, -1ULL))
1471
return -EINVAL;
1472
1473
*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
1474
return 0;
1475
}
1476
1477
/*
1478
* Called when userspace is restoring VMX MSRs.
1479
*
1480
* Returns 0 on success, non-0 otherwise.
1481
*/
1482
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1483
{
1484
struct vcpu_vmx *vmx = to_vmx(vcpu);
1485
1486
/*
1487
* Don't allow changes to the VMX capability MSRs while the vCPU
1488
* is in VMX operation.
1489
*/
1490
if (vmx->nested.vmxon)
1491
return -EBUSY;
1492
1493
switch (msr_index) {
1494
case MSR_IA32_VMX_BASIC:
1495
return vmx_restore_vmx_basic(vmx, data);
1496
case MSR_IA32_VMX_PINBASED_CTLS:
1497
case MSR_IA32_VMX_PROCBASED_CTLS:
1498
case MSR_IA32_VMX_EXIT_CTLS:
1499
case MSR_IA32_VMX_ENTRY_CTLS:
1500
/*
1501
* The "non-true" VMX capability MSRs are generated from the
1502
* "true" MSRs, so we do not support restoring them directly.
1503
*
1504
* If userspace wants to emulate VMX_BASIC[55]=0, userspace
1505
* should restore the "true" MSRs with the must-be-1 bits
1506
* set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1507
* DEFAULT SETTINGS".
1508
*/
1509
return -EINVAL;
1510
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1511
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1512
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1513
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1514
case MSR_IA32_VMX_PROCBASED_CTLS2:
1515
return vmx_restore_control_msr(vmx, msr_index, data);
1516
case MSR_IA32_VMX_MISC:
1517
return vmx_restore_vmx_misc(vmx, data);
1518
case MSR_IA32_VMX_CR0_FIXED0:
1519
case MSR_IA32_VMX_CR4_FIXED0:
1520
return vmx_restore_fixed0_msr(vmx, msr_index, data);
1521
case MSR_IA32_VMX_CR0_FIXED1:
1522
case MSR_IA32_VMX_CR4_FIXED1:
1523
/*
1524
* These MSRs are generated based on the vCPU's CPUID, so we
1525
* do not support restoring them directly.
1526
*/
1527
return -EINVAL;
1528
case MSR_IA32_VMX_EPT_VPID_CAP:
1529
return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1530
case MSR_IA32_VMX_VMCS_ENUM:
1531
vmx->nested.msrs.vmcs_enum = data;
1532
return 0;
1533
case MSR_IA32_VMX_VMFUNC:
1534
if (data & ~vmcs_config.nested.vmfunc_controls)
1535
return -EINVAL;
1536
vmx->nested.msrs.vmfunc_controls = data;
1537
return 0;
1538
default:
1539
/*
1540
* The rest of the VMX capability MSRs do not support restore.
1541
*/
1542
return -EINVAL;
1543
}
1544
}
1545
1546
/* Returns 0 on success, non-0 otherwise. */
1547
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1548
{
1549
switch (msr_index) {
1550
case MSR_IA32_VMX_BASIC:
1551
*pdata = msrs->basic;
1552
break;
1553
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1554
case MSR_IA32_VMX_PINBASED_CTLS:
1555
*pdata = vmx_control_msr(
1556
msrs->pinbased_ctls_low,
1557
msrs->pinbased_ctls_high);
1558
if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1559
*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1560
break;
1561
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1562
case MSR_IA32_VMX_PROCBASED_CTLS:
1563
*pdata = vmx_control_msr(
1564
msrs->procbased_ctls_low,
1565
msrs->procbased_ctls_high);
1566
if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1567
*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1568
break;
1569
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1570
case MSR_IA32_VMX_EXIT_CTLS:
1571
*pdata = vmx_control_msr(
1572
msrs->exit_ctls_low,
1573
msrs->exit_ctls_high);
1574
if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1575
*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1576
break;
1577
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1578
case MSR_IA32_VMX_ENTRY_CTLS:
1579
*pdata = vmx_control_msr(
1580
msrs->entry_ctls_low,
1581
msrs->entry_ctls_high);
1582
if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1583
*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1584
break;
1585
case MSR_IA32_VMX_MISC:
1586
*pdata = vmx_control_msr(
1587
msrs->misc_low,
1588
msrs->misc_high);
1589
break;
1590
case MSR_IA32_VMX_CR0_FIXED0:
1591
*pdata = msrs->cr0_fixed0;
1592
break;
1593
case MSR_IA32_VMX_CR0_FIXED1:
1594
*pdata = msrs->cr0_fixed1;
1595
break;
1596
case MSR_IA32_VMX_CR4_FIXED0:
1597
*pdata = msrs->cr4_fixed0;
1598
break;
1599
case MSR_IA32_VMX_CR4_FIXED1:
1600
*pdata = msrs->cr4_fixed1;
1601
break;
1602
case MSR_IA32_VMX_VMCS_ENUM:
1603
*pdata = msrs->vmcs_enum;
1604
break;
1605
case MSR_IA32_VMX_PROCBASED_CTLS2:
1606
*pdata = vmx_control_msr(
1607
msrs->secondary_ctls_low,
1608
msrs->secondary_ctls_high);
1609
break;
1610
case MSR_IA32_VMX_EPT_VPID_CAP:
1611
*pdata = msrs->ept_caps |
1612
((u64)msrs->vpid_caps << 32);
1613
break;
1614
case MSR_IA32_VMX_VMFUNC:
1615
*pdata = msrs->vmfunc_controls;
1616
break;
1617
default:
1618
return 1;
1619
}
1620
1621
return 0;
1622
}
1623
1624
/*
1625
* Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1626
* been modified by the L1 guest. Note, "writable" in this context means
1627
* "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1628
* fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1629
* VM-exit information fields (which are actually writable if the vCPU is
1630
* configured to support "VMWRITE to any supported field in the VMCS").
1631
*/
1632
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1633
{
1634
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1635
struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1636
struct shadow_vmcs_field field;
1637
unsigned long val;
1638
int i;
1639
1640
if (WARN_ON(!shadow_vmcs))
1641
return;
1642
1643
preempt_disable();
1644
1645
vmcs_load(shadow_vmcs);
1646
1647
for (i = 0; i < max_shadow_read_write_fields; i++) {
1648
field = shadow_read_write_fields[i];
1649
val = __vmcs_readl(field.encoding);
1650
vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1651
}
1652
1653
vmcs_clear(shadow_vmcs);
1654
vmcs_load(vmx->loaded_vmcs->vmcs);
1655
1656
preempt_enable();
1657
}
1658
1659
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1660
{
1661
const struct shadow_vmcs_field *fields[] = {
1662
shadow_read_write_fields,
1663
shadow_read_only_fields
1664
};
1665
const int max_fields[] = {
1666
max_shadow_read_write_fields,
1667
max_shadow_read_only_fields
1668
};
1669
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1670
struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1671
struct shadow_vmcs_field field;
1672
unsigned long val;
1673
int i, q;
1674
1675
if (WARN_ON(!shadow_vmcs))
1676
return;
1677
1678
vmcs_load(shadow_vmcs);
1679
1680
for (q = 0; q < ARRAY_SIZE(fields); q++) {
1681
for (i = 0; i < max_fields[q]; i++) {
1682
field = fields[q][i];
1683
val = vmcs12_read_any(vmcs12, field.encoding,
1684
field.offset);
1685
__vmcs_writel(field.encoding, val);
1686
}
1687
}
1688
1689
vmcs_clear(shadow_vmcs);
1690
vmcs_load(vmx->loaded_vmcs->vmcs);
1691
}
1692
1693
static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
1694
{
1695
#ifdef CONFIG_KVM_HYPERV
1696
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1697
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
1698
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
1699
1700
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1701
vmcs12->tpr_threshold = evmcs->tpr_threshold;
1702
vmcs12->guest_rip = evmcs->guest_rip;
1703
1704
if (unlikely(!(hv_clean_fields &
1705
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
1706
hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
1707
hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
1708
hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
1709
}
1710
1711
if (unlikely(!(hv_clean_fields &
1712
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1713
vmcs12->guest_rsp = evmcs->guest_rsp;
1714
vmcs12->guest_rflags = evmcs->guest_rflags;
1715
vmcs12->guest_interruptibility_info =
1716
evmcs->guest_interruptibility_info;
1717
/*
1718
* Not present in struct vmcs12:
1719
* vmcs12->guest_ssp = evmcs->guest_ssp;
1720
*/
1721
}
1722
1723
if (unlikely(!(hv_clean_fields &
1724
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1725
vmcs12->cpu_based_vm_exec_control =
1726
evmcs->cpu_based_vm_exec_control;
1727
}
1728
1729
if (unlikely(!(hv_clean_fields &
1730
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1731
vmcs12->exception_bitmap = evmcs->exception_bitmap;
1732
}
1733
1734
if (unlikely(!(hv_clean_fields &
1735
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1736
vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1737
}
1738
1739
if (unlikely(!(hv_clean_fields &
1740
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1741
vmcs12->vm_entry_intr_info_field =
1742
evmcs->vm_entry_intr_info_field;
1743
vmcs12->vm_entry_exception_error_code =
1744
evmcs->vm_entry_exception_error_code;
1745
vmcs12->vm_entry_instruction_len =
1746
evmcs->vm_entry_instruction_len;
1747
}
1748
1749
if (unlikely(!(hv_clean_fields &
1750
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1751
vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1752
vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1753
vmcs12->host_cr0 = evmcs->host_cr0;
1754
vmcs12->host_cr3 = evmcs->host_cr3;
1755
vmcs12->host_cr4 = evmcs->host_cr4;
1756
vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1757
vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1758
vmcs12->host_rip = evmcs->host_rip;
1759
vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1760
vmcs12->host_es_selector = evmcs->host_es_selector;
1761
vmcs12->host_cs_selector = evmcs->host_cs_selector;
1762
vmcs12->host_ss_selector = evmcs->host_ss_selector;
1763
vmcs12->host_ds_selector = evmcs->host_ds_selector;
1764
vmcs12->host_fs_selector = evmcs->host_fs_selector;
1765
vmcs12->host_gs_selector = evmcs->host_gs_selector;
1766
vmcs12->host_tr_selector = evmcs->host_tr_selector;
1767
vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
1768
/*
1769
* Not present in struct vmcs12:
1770
* vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
1771
* vmcs12->host_ssp = evmcs->host_ssp;
1772
* vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
1773
*/
1774
}
1775
1776
if (unlikely(!(hv_clean_fields &
1777
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1778
vmcs12->pin_based_vm_exec_control =
1779
evmcs->pin_based_vm_exec_control;
1780
vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1781
vmcs12->secondary_vm_exec_control =
1782
evmcs->secondary_vm_exec_control;
1783
}
1784
1785
if (unlikely(!(hv_clean_fields &
1786
HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1787
vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1788
vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1789
}
1790
1791
if (unlikely(!(hv_clean_fields &
1792
HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1793
vmcs12->msr_bitmap = evmcs->msr_bitmap;
1794
}
1795
1796
if (unlikely(!(hv_clean_fields &
1797
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1798
vmcs12->guest_es_base = evmcs->guest_es_base;
1799
vmcs12->guest_cs_base = evmcs->guest_cs_base;
1800
vmcs12->guest_ss_base = evmcs->guest_ss_base;
1801
vmcs12->guest_ds_base = evmcs->guest_ds_base;
1802
vmcs12->guest_fs_base = evmcs->guest_fs_base;
1803
vmcs12->guest_gs_base = evmcs->guest_gs_base;
1804
vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1805
vmcs12->guest_tr_base = evmcs->guest_tr_base;
1806
vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1807
vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1808
vmcs12->guest_es_limit = evmcs->guest_es_limit;
1809
vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1810
vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1811
vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1812
vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1813
vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1814
vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1815
vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1816
vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1817
vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1818
vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1819
vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1820
vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1821
vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1822
vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1823
vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1824
vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1825
vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1826
vmcs12->guest_es_selector = evmcs->guest_es_selector;
1827
vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1828
vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1829
vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1830
vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1831
vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1832
vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1833
vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1834
}
1835
1836
if (unlikely(!(hv_clean_fields &
1837
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1838
vmcs12->tsc_offset = evmcs->tsc_offset;
1839
vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1840
vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1841
vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
1842
vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
1843
}
1844
1845
if (unlikely(!(hv_clean_fields &
1846
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1847
vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1848
vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1849
vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1850
vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1851
vmcs12->guest_cr0 = evmcs->guest_cr0;
1852
vmcs12->guest_cr3 = evmcs->guest_cr3;
1853
vmcs12->guest_cr4 = evmcs->guest_cr4;
1854
vmcs12->guest_dr7 = evmcs->guest_dr7;
1855
}
1856
1857
if (unlikely(!(hv_clean_fields &
1858
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1859
vmcs12->host_fs_base = evmcs->host_fs_base;
1860
vmcs12->host_gs_base = evmcs->host_gs_base;
1861
vmcs12->host_tr_base = evmcs->host_tr_base;
1862
vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1863
vmcs12->host_idtr_base = evmcs->host_idtr_base;
1864
vmcs12->host_rsp = evmcs->host_rsp;
1865
}
1866
1867
if (unlikely(!(hv_clean_fields &
1868
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1869
vmcs12->ept_pointer = evmcs->ept_pointer;
1870
vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1871
}
1872
1873
if (unlikely(!(hv_clean_fields &
1874
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1875
vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1876
vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1877
vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1878
vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1879
vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1880
vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1881
vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1882
vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1883
vmcs12->guest_pending_dbg_exceptions =
1884
evmcs->guest_pending_dbg_exceptions;
1885
vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1886
vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1887
vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1888
vmcs12->guest_activity_state = evmcs->guest_activity_state;
1889
vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1890
vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
1891
/*
1892
* Not present in struct vmcs12:
1893
* vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
1894
* vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
1895
* vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
1896
*/
1897
}
1898
1899
/*
1900
* Not used?
1901
* vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1902
* vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1903
* vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1904
* vmcs12->page_fault_error_code_mask =
1905
* evmcs->page_fault_error_code_mask;
1906
* vmcs12->page_fault_error_code_match =
1907
* evmcs->page_fault_error_code_match;
1908
* vmcs12->cr3_target_count = evmcs->cr3_target_count;
1909
* vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1910
* vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1911
* vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1912
*/
1913
1914
/*
1915
* Read only fields:
1916
* vmcs12->guest_physical_address = evmcs->guest_physical_address;
1917
* vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1918
* vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1919
* vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1920
* vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1921
* vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1922
* vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1923
* vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1924
* vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1925
* vmcs12->exit_qualification = evmcs->exit_qualification;
1926
* vmcs12->guest_linear_address = evmcs->guest_linear_address;
1927
*
1928
* Not present in struct vmcs12:
1929
* vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1930
* vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1931
* vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1932
* vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1933
*/
1934
1935
return;
1936
#else /* CONFIG_KVM_HYPERV */
1937
KVM_BUG_ON(1, vmx->vcpu.kvm);
1938
#endif /* CONFIG_KVM_HYPERV */
1939
}
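/*
 * Minimal sketch (illustrative, not part of this file) of the pattern used
 * above: a Hyper-V "clean fields" bit that is clear marks the corresponding
 * eVMCS field group as dirty, so the group must be copied into the cached
 * vmcs12. The function name is hypothetical.
 */
static inline bool example_evmcs_group_needs_sync(u32 hv_clean_fields,
						  u32 group_bit)
{
	/* Clean bit set => L1 did not touch the group => skip the copy. */
	return !(hv_clean_fields & group_bit);
}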
1940
1941
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1942
{
1943
#ifdef CONFIG_KVM_HYPERV
1944
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1945
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
1946
1947
/*
1948
* Should not be changed by KVM:
1949
*
1950
* evmcs->host_es_selector = vmcs12->host_es_selector;
1951
* evmcs->host_cs_selector = vmcs12->host_cs_selector;
1952
* evmcs->host_ss_selector = vmcs12->host_ss_selector;
1953
* evmcs->host_ds_selector = vmcs12->host_ds_selector;
1954
* evmcs->host_fs_selector = vmcs12->host_fs_selector;
1955
* evmcs->host_gs_selector = vmcs12->host_gs_selector;
1956
* evmcs->host_tr_selector = vmcs12->host_tr_selector;
1957
* evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1958
* evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1959
* evmcs->host_cr0 = vmcs12->host_cr0;
1960
* evmcs->host_cr3 = vmcs12->host_cr3;
1961
* evmcs->host_cr4 = vmcs12->host_cr4;
1962
* evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1963
* evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1964
* evmcs->host_rip = vmcs12->host_rip;
1965
* evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1966
* evmcs->host_fs_base = vmcs12->host_fs_base;
1967
* evmcs->host_gs_base = vmcs12->host_gs_base;
1968
* evmcs->host_tr_base = vmcs12->host_tr_base;
1969
* evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1970
* evmcs->host_idtr_base = vmcs12->host_idtr_base;
1971
* evmcs->host_rsp = vmcs12->host_rsp;
1972
* sync_vmcs02_to_vmcs12() doesn't read these:
1973
* evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1974
* evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1975
* evmcs->msr_bitmap = vmcs12->msr_bitmap;
1976
* evmcs->ept_pointer = vmcs12->ept_pointer;
1977
* evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1978
* evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1979
* evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1980
* evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1981
* evmcs->tpr_threshold = vmcs12->tpr_threshold;
1982
* evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1983
* evmcs->exception_bitmap = vmcs12->exception_bitmap;
1984
* evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1985
* evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1986
* evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1987
* evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1988
* evmcs->page_fault_error_code_mask =
1989
* vmcs12->page_fault_error_code_mask;
1990
* evmcs->page_fault_error_code_match =
1991
* vmcs12->page_fault_error_code_match;
1992
* evmcs->cr3_target_count = vmcs12->cr3_target_count;
1993
* evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1994
* evmcs->tsc_offset = vmcs12->tsc_offset;
1995
* evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1996
* evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1997
* evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1998
* evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1999
* evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
2000
* evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
2001
* evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
2002
* evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
2003
* evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
2004
* evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
2005
* evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
2006
* evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
2007
*
2008
* Not present in struct vmcs12:
2009
* evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
2010
* evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
2011
* evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
2012
* evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
2013
* evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
2014
* evmcs->host_ssp = vmcs12->host_ssp;
2015
* evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
2016
* evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
2017
* evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
2018
* evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
2019
* evmcs->guest_ssp = vmcs12->guest_ssp;
2020
*/
2021
2022
evmcs->guest_es_selector = vmcs12->guest_es_selector;
2023
evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
2024
evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
2025
evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
2026
evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
2027
evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
2028
evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
2029
evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
2030
2031
evmcs->guest_es_limit = vmcs12->guest_es_limit;
2032
evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
2033
evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
2034
evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
2035
evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
2036
evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
2037
evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
2038
evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
2039
evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
2040
evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
2041
2042
evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
2043
evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
2044
evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
2045
evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
2046
evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
2047
evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
2048
evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
2049
evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
2050
2051
evmcs->guest_es_base = vmcs12->guest_es_base;
2052
evmcs->guest_cs_base = vmcs12->guest_cs_base;
2053
evmcs->guest_ss_base = vmcs12->guest_ss_base;
2054
evmcs->guest_ds_base = vmcs12->guest_ds_base;
2055
evmcs->guest_fs_base = vmcs12->guest_fs_base;
2056
evmcs->guest_gs_base = vmcs12->guest_gs_base;
2057
evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
2058
evmcs->guest_tr_base = vmcs12->guest_tr_base;
2059
evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
2060
evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
2061
2062
evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
2063
evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
2064
2065
evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
2066
evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
2067
evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
2068
evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
2069
2070
evmcs->guest_pending_dbg_exceptions =
2071
vmcs12->guest_pending_dbg_exceptions;
2072
evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
2073
evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
2074
2075
evmcs->guest_activity_state = vmcs12->guest_activity_state;
2076
evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
2077
2078
evmcs->guest_cr0 = vmcs12->guest_cr0;
2079
evmcs->guest_cr3 = vmcs12->guest_cr3;
2080
evmcs->guest_cr4 = vmcs12->guest_cr4;
2081
evmcs->guest_dr7 = vmcs12->guest_dr7;
2082
2083
evmcs->guest_physical_address = vmcs12->guest_physical_address;
2084
2085
evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
2086
evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
2087
evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
2088
evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
2089
evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
2090
evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
2091
evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
2092
evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
2093
2094
evmcs->exit_qualification = vmcs12->exit_qualification;
2095
2096
evmcs->guest_linear_address = vmcs12->guest_linear_address;
2097
evmcs->guest_rsp = vmcs12->guest_rsp;
2098
evmcs->guest_rflags = vmcs12->guest_rflags;
2099
2100
evmcs->guest_interruptibility_info =
2101
vmcs12->guest_interruptibility_info;
2102
evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
2103
evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
2104
evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
2105
evmcs->vm_entry_exception_error_code =
2106
vmcs12->vm_entry_exception_error_code;
2107
evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
2108
2109
evmcs->guest_rip = vmcs12->guest_rip;
2110
2111
evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
2112
2113
return;
2114
#else /* CONFIG_KVM_HYPERV */
2115
KVM_BUG_ON(1, vmx->vcpu.kvm);
2116
#endif /* CONFIG_KVM_HYPERV */
2117
}
2118
2119
/*
2120
* This is an equivalent of the nested hypervisor executing the vmptrld
2121
* instruction.
2122
*/
2123
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
2124
struct kvm_vcpu *vcpu, bool from_launch)
2125
{
2126
#ifdef CONFIG_KVM_HYPERV
2127
struct vcpu_vmx *vmx = to_vmx(vcpu);
2128
bool evmcs_gpa_changed = false;
2129
u64 evmcs_gpa;
2130
2131
if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
2132
return EVMPTRLD_DISABLED;
2133
2134
evmcs_gpa = nested_get_evmptr(vcpu);
2135
if (!evmptr_is_valid(evmcs_gpa)) {
2136
nested_release_evmcs(vcpu);
2137
return EVMPTRLD_DISABLED;
2138
}
2139
2140
if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
2141
vmx->nested.current_vmptr = INVALID_GPA;
2142
2143
nested_release_evmcs(vcpu);
2144
2145
if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2146
&vmx->nested.hv_evmcs_map))
2147
return EVMPTRLD_ERROR;
2148
2149
vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2150
2151
/*
 * Currently, KVM only supports eVMCS version 1 (== KVM_EVMCS_VERSION)
 * and thus expects the guest to set the first u32 field of the eVMCS,
 * which specifies the eVMCS VersionNumber, to that value.
 *
 * The guest should learn the eVMCS versions supported by the host by
 * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
 * expected to set this CPUID leaf according to the value returned in
 * vmcs_version from nested_enable_evmcs().
 *
 * However, it turns out that Microsoft Hyper-V fails to comply with
 * its own invented interface: when Hyper-V uses eVMCS, it simply sets
 * the first u32 field of the eVMCS to the revision_id specified in
 * MSR_IA32_VMX_BASIC, instead of using one of the supported eVMCS
 * version numbers specified in CPUID.0x4000000A.EAX[0:15].
 *
 * To work around this Hyper-V bug, accept either a supported eVMCS
 * version or the VMCS12 revision_id as valid values for the first u32
 * field of the eVMCS.
 */
2173
if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2174
(vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2175
nested_release_evmcs(vcpu);
2176
return EVMPTRLD_VMFAIL;
2177
}
2178
2179
vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2180
2181
evmcs_gpa_changed = true;
2182
/*
 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
 * reloaded from the guest's memory (read-only fields, fields not
 * present in struct hv_enlightened_vmcs, ...). Make sure there
 * are no leftovers.
 */
2188
if (from_launch) {
2189
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2190
memset(vmcs12, 0, sizeof(*vmcs12));
2191
vmcs12->hdr.revision_id = VMCS12_REVISION;
2192
}
2193
2194
}
2195
2196
/*
2197
* Clean fields data can't be used on VMLAUNCH and when we switch
2198
* between different L2 guests as KVM keeps a single VMCS12 per L1.
2199
*/
2200
if (from_launch || evmcs_gpa_changed) {
2201
vmx->nested.hv_evmcs->hv_clean_fields &=
2202
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2203
2204
vmx->nested.force_msr_bitmap_recalc = true;
2205
}
2206
2207
return EVMPTRLD_SUCCEEDED;
2208
#else
2209
return EVMPTRLD_DISABLED;
2210
#endif
2211
}
2212
2213
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2214
{
2215
struct vcpu_vmx *vmx = to_vmx(vcpu);
2216
2217
if (nested_vmx_is_evmptr12_valid(vmx))
2218
copy_vmcs12_to_enlightened(vmx);
2219
else
2220
copy_vmcs12_to_shadow(vmx);
2221
2222
vmx->nested.need_vmcs12_to_shadow_sync = false;
2223
}
2224
2225
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
	struct vcpu_vmx *vmx =
		container_of(timer, struct vcpu_vmx, nested.preemption_timer);

	vmx->nested.preemption_timer_expired = true;
	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
	kvm_vcpu_kick(&vmx->vcpu);

	return HRTIMER_NORESTART;
}

static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
			VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;

	if (!vmx->nested.has_preemption_timer_deadline) {
		vmx->nested.preemption_timer_deadline =
			vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
		vmx->nested.has_preemption_timer_deadline = true;
	}
	return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
}

static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
				       u64 preemption_timeout)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * A timer value of zero is architecturally guaranteed to cause
	 * a VMExit prior to executing any instructions in the guest.
	 */
	if (preemption_timeout == 0) {
		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
		return;
	}

	if (vcpu->arch.virtual_tsc_khz == 0)
		return;

	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
	preemption_timeout *= 1000000;
	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
	hrtimer_start(&vmx->nested.preemption_timer,
		      ktime_add_ns(ktime_get(), preemption_timeout),
		      HRTIMER_MODE_ABS_PINNED);
}
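/*
 * Illustrative sketch (assumption, not from this file): the arithmetic used
 * above to convert a vmcs12 preemption timer value into nanoseconds for the
 * hrtimer. The emulated timer ticks once every 2^5 TSC cycles (the hardcoded
 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE), so the value is first scaled to
 * TSC cycles and then divided by the virtual TSC frequency. The function
 * name is hypothetical.
 */
static inline u64 example_preemption_timeout_to_ns(u64 timer_value,
						   u32 virtual_tsc_khz)
{
	/* Timer ticks -> TSC cycles. */
	u64 ns = timer_value << VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;

	/* ns = cycles / (virtual_tsc_khz * 1000 / 1e9) = cycles * 1000000 / kHz */
	ns *= 1000000;
	do_div(ns, virtual_tsc_khz);
	return ns;
}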
2277
2278
static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2279
{
2280
if (vmx->nested.nested_run_pending &&
2281
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2282
return vmcs12->guest_ia32_efer;
2283
else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2284
return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2285
else
2286
return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2287
}
2288
2289
static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2290
{
2291
struct kvm *kvm = vmx->vcpu.kvm;
2292
2293
/*
2294
* If vmcs02 hasn't been initialized, set the constant vmcs02 state
2295
* according to L0's settings (vmcs12 is irrelevant here). Host
2296
* fields that come from L0 and are not constant, e.g. HOST_CR3,
2297
* will be set as needed prior to VMLAUNCH/VMRESUME.
2298
*/
2299
if (vmx->nested.vmcs02_initialized)
2300
return;
2301
vmx->nested.vmcs02_initialized = true;
2302
2303
if (vmx->ve_info)
2304
vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
2305
2306
/* All VMFUNCs are currently emulated through L0 vmexits. */
2307
if (cpu_has_vmx_vmfunc())
2308
vmcs_write64(VM_FUNCTION_CONTROL, 0);
2309
2310
if (cpu_has_vmx_posted_intr())
2311
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2312
2313
if (cpu_has_vmx_msr_bitmap())
2314
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2315
2316
/*
2317
* PML is emulated for L2, but never enabled in hardware as the MMU
2318
* handles A/D emulation. Disabling PML for L2 also avoids having to
2319
* deal with filtering out L2 GPAs from the buffer.
2320
*/
2321
if (enable_pml) {
2322
vmcs_write64(PML_ADDRESS, 0);
2323
vmcs_write16(GUEST_PML_INDEX, -1);
2324
}
2325
2326
if (cpu_has_vmx_encls_vmexit())
2327
vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
2328
2329
if (kvm_notify_vmexit_enabled(kvm))
2330
vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
2331
2332
/*
2333
* Set the MSR load/store lists to match L0's settings. Only the
2334
* addresses are constant (for vmcs02), the counts can change based
2335
* on L2's behavior, e.g. switching to/from long mode.
2336
*/
2337
vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2338
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2339
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2340
2341
vmx_set_constant_host_state(vmx);
2342
}
2343
2344
static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2345
struct vmcs12 *vmcs12)
2346
{
2347
prepare_vmcs02_constant_state(vmx);
2348
2349
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
2350
2351
/*
2352
* If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
2353
* same VPID as the host. Emulate this behavior by using vpid01 for L2
2354
* if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
2355
* and VM-Exit are architecturally required to flush VPID=0, but *only*
2356
* VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
2357
* required flushes), but doing so would cause KVM to over-flush. E.g.
2358
* if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
2359
* and then runs L2 X again, then KVM can and should retain TLB entries
2360
* for VPID12=1.
2361
*/
2362
if (enable_vpid) {
2363
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2364
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2365
else
2366
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2367
}
2368
}
2369
2370
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
2371
struct vmcs12 *vmcs12)
2372
{
2373
u32 exec_control;
2374
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2375
2376
if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
2377
prepare_vmcs02_early_rare(vmx, vmcs12);
2378
2379
/*
2380
* PIN CONTROLS
2381
*/
2382
exec_control = __pin_controls_get(vmcs01);
2383
exec_control |= (vmcs12->pin_based_vm_exec_control &
2384
~PIN_BASED_VMX_PREEMPTION_TIMER);
2385
2386
/* Posted interrupts setting is only taken from vmcs12. */
2387
vmx->nested.pi_pending = false;
2388
if (nested_cpu_has_posted_intr(vmcs12)) {
2389
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2390
} else {
2391
vmx->nested.posted_intr_nv = -1;
2392
exec_control &= ~PIN_BASED_POSTED_INTR;
2393
}
2394
pin_controls_set(vmx, exec_control);
2395
2396
/*
2397
* EXEC CONTROLS
2398
*/
2399
exec_control = __exec_controls_get(vmcs01); /* L0's desires */
2400
exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
2401
exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
2402
exec_control &= ~CPU_BASED_TPR_SHADOW;
2403
exec_control |= vmcs12->cpu_based_vm_exec_control;
2404
2405
vmx->nested.l1_tpr_threshold = -1;
2406
if (exec_control & CPU_BASED_TPR_SHADOW)
2407
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2408
#ifdef CONFIG_X86_64
2409
else
2410
exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2411
CPU_BASED_CR8_STORE_EXITING;
2412
#endif
2413
2414
/*
2415
* A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2416
* for I/O port accesses.
2417
*/
2418
exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2419
exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2420
2421
/*
2422
* This bit will be computed in nested_get_vmcs12_pages, because
2423
* we do not have access to L1's MSR bitmap yet. For now, keep
2424
* the same bit as before, hoping to avoid multiple VMWRITEs that
2425
* only set/clear this bit.
2426
*/
2427
exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2428
exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2429
2430
exec_controls_set(vmx, exec_control);
2431
2432
/*
2433
* SECONDARY EXEC CONTROLS
2434
*/
2435
if (cpu_has_secondary_exec_ctrls()) {
2436
exec_control = __secondary_exec_controls_get(vmcs01);
2437
2438
/* Take the following fields only from vmcs12 */
2439
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2440
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2441
SECONDARY_EXEC_ENABLE_INVPCID |
2442
SECONDARY_EXEC_ENABLE_RDTSCP |
2443
SECONDARY_EXEC_ENABLE_XSAVES |
2444
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2445
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2446
SECONDARY_EXEC_APIC_REGISTER_VIRT |
2447
SECONDARY_EXEC_ENABLE_VMFUNC |
2448
SECONDARY_EXEC_DESC);
2449
2450
if (nested_cpu_has(vmcs12,
2451
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
2452
exec_control |= vmcs12->secondary_vm_exec_control;
2453
2454
/* PML is emulated and never enabled in hardware for L2. */
2455
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
2456
2457
/* VMCS shadowing for L2 is emulated for now */
2458
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2459
2460
/*
2461
* Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2462
* will not have to rewrite the controls just for this bit.
2463
*/
2464
if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
2465
exec_control |= SECONDARY_EXEC_DESC;
2466
2467
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2468
vmcs_write16(GUEST_INTR_STATUS,
2469
vmcs12->guest_intr_status);
2470
2471
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2472
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2473
2474
if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2475
vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
2476
2477
secondary_exec_controls_set(vmx, exec_control);
2478
}
2479
2480
/*
2481
* ENTRY CONTROLS
2482
*
2483
* vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2484
* are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2485
* on the related bits (if supported by the CPU) in the hope that
2486
* we can avoid VMWrites during vmx_set_efer().
2487
*
2488
* Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
2489
* loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
2490
* do the same for L2.
2491
*/
2492
exec_control = __vm_entry_controls_get(vmcs01);
2493
exec_control |= (vmcs12->vm_entry_controls &
2494
~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
2495
exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
2496
if (cpu_has_load_ia32_efer()) {
2497
if (guest_efer & EFER_LMA)
2498
exec_control |= VM_ENTRY_IA32E_MODE;
2499
if (guest_efer != kvm_host.efer)
2500
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2501
}
2502
vm_entry_controls_set(vmx, exec_control);
2503
2504
/*
2505
* EXIT CONTROLS
2506
*
2507
* L2->L1 exit controls are emulated - the hardware exit is to L0 so
2508
* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2509
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
2510
*/
2511
exec_control = __vm_exit_controls_get(vmcs01);
2512
if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
2513
exec_control |= VM_EXIT_LOAD_IA32_EFER;
2514
else
2515
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
2516
vm_exit_controls_set(vmx, exec_control);
2517
2518
/*
2519
* Interrupt/Exception Fields
2520
*/
2521
if (vmx->nested.nested_run_pending) {
2522
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2523
vmcs12->vm_entry_intr_info_field);
2524
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2525
vmcs12->vm_entry_exception_error_code);
2526
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2527
vmcs12->vm_entry_instruction_len);
2528
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2529
vmcs12->guest_interruptibility_info);
2530
vmx->loaded_vmcs->nmi_known_unmasked =
2531
!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2532
} else {
2533
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2534
}
2535
}
2536
2537
static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet,
2538
u64 *ssp, u64 *ssp_tbl)
2539
{
2540
if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
2541
guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
2542
*s_cet = vmcs_readl(GUEST_S_CET);
2543
2544
if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
2545
*ssp = vmcs_readl(GUEST_SSP);
2546
*ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
2547
}
2548
}
2549
2550
static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet,
2551
u64 ssp, u64 ssp_tbl)
2552
{
2553
if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
2554
guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
2555
vmcs_writel(GUEST_S_CET, s_cet);
2556
2557
if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
2558
vmcs_writel(GUEST_SSP, ssp);
2559
vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl);
2560
}
2561
}
2562
2563
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2564
{
2565
struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);
2566
2567
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2568
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2569
2570
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2571
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2572
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2573
vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2574
vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2575
vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2576
vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2577
vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2578
vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2579
vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2580
vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2581
vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2582
vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2583
vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2584
vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2585
vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2586
vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2587
vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2588
vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2589
vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2590
vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2591
vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2592
vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2593
vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2594
vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2595
vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2596
vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2597
vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2598
vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2599
vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2600
vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2601
vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2602
vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2603
vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2604
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2605
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2606
2607
vmx_segment_cache_clear(vmx);
2608
}
2609
2610
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2611
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2612
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2613
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2614
vmcs12->guest_pending_dbg_exceptions);
2615
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2616
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2617
2618
/*
 * L1 may access L2's PDPTRs, so save them in order to construct
 * vmcs12.
 */
2622
if (enable_ept) {
2623
vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2624
vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2625
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2626
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2627
}
2628
2629
if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2630
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2631
vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2632
}
2633
2634
if (nested_cpu_has_xsaves(vmcs12))
2635
vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2636
2637
/*
 * Whether page faults are trapped is determined by a combination of
 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
 * doesn't care about page faults then we should set all of these to
 * L1's desires. However, if L0 does care about (some) page faults, it
 * is not easy (if at all possible) to merge L0's and L1's desires, so
 * we simply ask to exit on each and every L2 page fault. This is done
 * by setting MASK=MATCH=0 and (see below) EB.PF=1.
 * Note that below we don't need special code to set EB.PF beyond the
 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
 */
2650
if (vmx_need_pf_intercept(&vmx->vcpu)) {
2651
/*
2652
* TODO: if both L0 and L1 need the same MASK and MATCH,
2653
* go ahead and use it?
2654
*/
2655
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2656
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2657
} else {
2658
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2659
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2660
}
2661
2662
if (cpu_has_vmx_apicv()) {
2663
vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2664
vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2665
vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2666
vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2667
}
2668
2669
/*
2670
* Make sure the msr_autostore list is up to date before we set the
2671
* count in the vmcs02.
2672
*/
2673
prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2674
2675
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2676
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2677
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2678
2679
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
2680
vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
2681
vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);
2682
2683
set_cr4_guest_host_mask(vmx);
2684
}
2685
2686
/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the
 * L2 guest in a way that is appropriate to both L1's requests and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has other necessary side effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success and -EINVAL on failure; on failure the entry
 * failure code is assigned to *entry_failure_code.
 */
2697
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2698
bool from_vmentry,
2699
enum vm_entry_failure_code *entry_failure_code)
2700
{
2701
struct vcpu_vmx *vmx = to_vmx(vcpu);
2702
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
2703
bool load_guest_pdptrs_vmcs12 = false;
2704
2705
if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
2706
prepare_vmcs02_rare(vmx, vmcs12);
2707
vmx->nested.dirty_vmcs12 = false;
2708
2709
load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
2710
!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2711
}
2712
2713
if (vmx->nested.nested_run_pending &&
2714
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2715
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2716
vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
2717
vmx_get_supported_debugctl(vcpu, false));
2718
} else {
2719
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2720
vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
2721
}
2722
2723
if (!vmx->nested.nested_run_pending ||
2724
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
2725
vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
2726
vmx->nested.pre_vmenter_ssp,
2727
vmx->nested.pre_vmenter_ssp_tbl);
2728
2729
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2730
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2731
vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
2732
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2733
2734
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2735
* bitwise-or of what L1 wants to trap for L2, and what we want to
2736
* trap. Note that CR0.TS also needs updating - we do this later.
2737
*/
2738
vmx_update_exception_bitmap(vcpu);
2739
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2740
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2741
2742
if (vmx->nested.nested_run_pending &&
2743
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2744
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2745
vcpu->arch.pat = vmcs12->guest_ia32_pat;
2746
} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2747
vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
2748
}
2749
2750
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2751
vcpu->arch.l1_tsc_offset,
2752
vmx_get_l2_tsc_offset(vcpu),
2753
vmx_get_l2_tsc_multiplier(vcpu));
2754
2755
vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2756
vcpu->arch.l1_tsc_scaling_ratio,
2757
vmx_get_l2_tsc_multiplier(vcpu));
2758
2759
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2760
if (kvm_caps.has_tsc_control)
2761
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2762
2763
nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2764
2765
if (nested_cpu_has_ept(vmcs12))
2766
nested_ept_init_mmu_context(vcpu);
2767
2768
/*
2769
* Override the CR0/CR4 read shadows after setting the effective guest
2770
* CR0/CR4. The common helpers also set the shadows, but they don't
2771
* account for vmcs12's cr0/4_guest_host_mask.
2772
*/
2773
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2774
vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2775
2776
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2777
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2778
2779
vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2780
/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2781
vmx_set_efer(vcpu, vcpu->arch.efer);
2782
2783
/*
 * Guest state is invalid and unrestricted guest is disabled,
 * which means L1 attempted VMEntry to L2 with invalid state.
 * Fail the VMEntry.
 *
 * However, when force loading the guest state (SMM exit or
 * loading nested state after migration), it is possible to
 * have invalid guest state now, which will later be fixed by
 * restoring the L2 register state.
 */
2793
if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
2794
*entry_failure_code = ENTRY_FAIL_DEFAULT;
2795
return -EINVAL;
2796
}
2797
2798
/* Load vmcs12's CR3, whether the MMU is backed by nested EPT or shadow page tables. */
2799
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2800
from_vmentry, entry_failure_code))
2801
return -EINVAL;
2802
2803
/*
2804
* Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2805
* on nested VM-Exit, which can occur without actually running L2 and
2806
* thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2807
* vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2808
* transition to HLT instead of running L2.
2809
*/
2810
if (enable_ept)
2811
vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2812
2813
/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2814
if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2815
is_pae_paging(vcpu)) {
2816
vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2817
vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2818
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2819
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2820
}
2821
2822
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2823
kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
2824
WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2825
vmcs12->guest_ia32_perf_global_ctrl))) {
2826
*entry_failure_code = ENTRY_FAIL_DEFAULT;
2827
return -EINVAL;
2828
}
2829
2830
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2831
kvm_rip_write(vcpu, vmcs12->guest_rip);
2832
2833
/*
 * It was observed that genuine Hyper-V running in L1 doesn't reset
 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
 * bits when it changes a field in the eVMCS. Mark all fields as clean
 * here.
 */
2839
if (nested_vmx_is_evmptr12_valid(vmx))
2840
evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2841
2842
return 0;
2843
}
2844
2845
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2846
{
2847
if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2848
nested_cpu_has_virtual_nmis(vmcs12)))
2849
return -EINVAL;
2850
2851
if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2852
nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2853
return -EINVAL;
2854
2855
return 0;
2856
}
2857
2858
static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Page-walk levels validity. */
	switch (new_eptp & VMX_EPTP_PWL_MASK) {
	case VMX_EPTP_PWL_5:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
			return false;
		break;
	case VMX_EPTP_PWL_4:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Reserved bits should not be set */
	if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
		return false;

	/* AD, if set, should be supported */
	if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
			return false;
	}

	return true;
}
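/*
 * Minimal sketch (assumption, not part of this file) of an EPTP value that
 * satisfies the checks above: WB memory type, a 4-level page walk, optional
 * accessed/dirty flags, and a legal 4KiB-aligned root with bits 11:7 clear.
 * The function name is hypothetical.
 */
static inline u64 example_build_eptp(u64 ept_root, bool enable_ad)
{
	u64 eptp = ept_root & PAGE_MASK;

	eptp |= VMX_EPTP_MT_WB | VMX_EPTP_PWL_4;
	if (enable_ad)
		eptp |= VMX_EPTP_AD_ENABLE_BIT;

	return eptp;
}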
2902
2903
/*
2904
* Checks related to VM-Execution Control Fields
2905
*/
2906
static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2907
struct vmcs12 *vmcs12)
2908
{
2909
struct vcpu_vmx *vmx = to_vmx(vcpu);
2910
2911
if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2912
vmx->nested.msrs.pinbased_ctls_low,
2913
vmx->nested.msrs.pinbased_ctls_high)) ||
2914
CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2915
vmx->nested.msrs.procbased_ctls_low,
2916
vmx->nested.msrs.procbased_ctls_high)))
2917
return -EINVAL;
2918
2919
if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2920
CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2921
vmx->nested.msrs.secondary_ctls_low,
2922
vmx->nested.msrs.secondary_ctls_high)))
2923
return -EINVAL;
2924
2925
if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2926
nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2927
nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2928
nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2929
nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2930
nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2931
nested_vmx_check_nmi_controls(vmcs12) ||
2932
nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2933
nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2934
nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2935
nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2936
CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2937
return -EINVAL;
2938
2939
if (!nested_cpu_has_preemption_timer(vmcs12) &&
2940
nested_cpu_has_save_preemption_timer(vmcs12))
2941
return -EINVAL;
2942
2943
if (nested_cpu_has_ept(vmcs12) &&
2944
CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
2945
return -EINVAL;
2946
2947
if (nested_cpu_has_vmfunc(vmcs12)) {
2948
if (CC(vmcs12->vm_function_control &
2949
~vmx->nested.msrs.vmfunc_controls))
2950
return -EINVAL;
2951
2952
if (nested_cpu_has_eptp_switching(vmcs12)) {
2953
if (CC(!nested_cpu_has_ept(vmcs12)) ||
2954
CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2955
return -EINVAL;
2956
}
2957
}
2958
2959
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) &&
2960
CC(!vmcs12->tsc_multiplier))
2961
return -EINVAL;
2962
2963
return 0;
2964
}
2965
2966
/*
2967
* Checks related to VM-Exit Control Fields
2968
*/
2969
static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2970
struct vmcs12 *vmcs12)
2971
{
2972
struct vcpu_vmx *vmx = to_vmx(vcpu);
2973
2974
if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2975
vmx->nested.msrs.exit_ctls_low,
2976
vmx->nested.msrs.exit_ctls_high)) ||
2977
CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2978
return -EINVAL;
2979
2980
return 0;
2981
}
2982
2983
/*
2984
* Checks related to VM-Entry Control Fields
2985
*/
2986
static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2987
struct vmcs12 *vmcs12)
2988
{
2989
struct vcpu_vmx *vmx = to_vmx(vcpu);
2990
2991
if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2992
vmx->nested.msrs.entry_ctls_low,
2993
vmx->nested.msrs.entry_ctls_high)))
2994
return -EINVAL;
2995
2996
/*
2997
* From the Intel SDM, volume 3:
2998
* Fields relevant to VM-entry event injection must be set properly.
2999
* These fields are the VM-entry interruption-information field, the
3000
* VM-entry exception error code, and the VM-entry instruction length.
3001
*/
3002
if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
3003
u32 intr_info = vmcs12->vm_entry_intr_info_field;
3004
u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
3005
u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
3006
bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
3007
bool urg = nested_cpu_has2(vmcs12,
3008
SECONDARY_EXEC_UNRESTRICTED_GUEST);
3009
bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
3010
3011
/* VM-entry interruption-info field: interruption type */
3012
if (CC(intr_type == INTR_TYPE_RESERVED) ||
3013
CC(intr_type == INTR_TYPE_OTHER_EVENT &&
3014
!nested_cpu_supports_monitor_trap_flag(vcpu)))
3015
return -EINVAL;
3016
3017
/* VM-entry interruption-info field: vector */
3018
if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
3019
CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
3020
CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
3021
return -EINVAL;
3022
3023
/*
 * Cannot deliver an error code in real mode or if the interruption
 * type is not a hardware exception. For other cases, do the
 * consistency check only if the vCPU doesn't enumerate
 * VMX_BASIC_NO_HW_ERROR_CODE_CC.
 */
3029
if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) {
3030
if (CC(has_error_code))
3031
return -EINVAL;
3032
} else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) {
3033
if (CC(has_error_code != x86_exception_has_error_code(vector)))
3034
return -EINVAL;
3035
}
3036
3037
/* VM-entry exception error code */
3038
if (CC(has_error_code &&
3039
vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
3040
return -EINVAL;
3041
3042
/* VM-entry interruption-info field: reserved bits */
3043
if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
3044
return -EINVAL;
3045
3046
/* VM-entry instruction length */
3047
switch (intr_type) {
3048
case INTR_TYPE_SOFT_EXCEPTION:
3049
case INTR_TYPE_SOFT_INTR:
3050
case INTR_TYPE_PRIV_SW_EXCEPTION:
3051
if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
3052
CC(vmcs12->vm_entry_instruction_len == 0 &&
3053
CC(!nested_cpu_has_zero_length_injection(vcpu))))
3054
return -EINVAL;
3055
}
3056
}
3057
3058
if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
3059
return -EINVAL;
3060
3061
return 0;
3062
}
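/*
 * Illustrative sketch (assumption, not part of this file): composing a
 * VM-entry interruption-information field that passes the event-injection
 * checks above, e.g. injecting a hardware exception with an error code.
 * The function name is hypothetical.
 */
static inline u32 example_build_entry_intr_info(u8 vector, bool has_error_code)
{
	u32 intr_info = vector | INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK;

	if (has_error_code)
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;

	return intr_info;
}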
3063
3064
static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
3065
struct vmcs12 *vmcs12)
3066
{
3067
if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
3068
nested_check_vm_exit_controls(vcpu, vmcs12) ||
3069
nested_check_vm_entry_controls(vcpu, vmcs12))
3070
return -EINVAL;
3071
3072
#ifdef CONFIG_KVM_HYPERV
3073
if (guest_cpu_cap_has_evmcs(vcpu))
3074
return nested_evmcs_check_controls(vmcs12);
3075
#endif
3076
3077
return 0;
3078
}
3079
3080
static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
3081
struct vmcs12 *vmcs12)
3082
{
3083
void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
3084
u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;
3085
3086
/*
 * Don't bother with the consistency checks if KVM isn't configured to
 * WARN on missed consistency checks, as KVM needs to rely on hardware
 * to fully detect an illegal vTPR vs. TPR Threshold combination due to
 * the vTPR being writable by L1 at all times (it's an in-memory value,
 * not a VMCS field). I.e. even if the check passes now, it might fail
 * at the actual VM-Enter.
 *
 * Keying off the module param also allows treating an invalid vAPIC
 * mapping as a consistency check failure without increasing the risk
 * of breaking a "real" VM.
 */
3098
if (!warn_on_missed_cc)
3099
return 0;
3100
3101
if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
3102
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
3103
!nested_cpu_has_vid(vmcs12) &&
3104
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
3105
(CC(!vapic) ||
3106
CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
3107
return -EINVAL;
3108
3109
return 0;
3110
}
3111
3112
static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
3113
struct vmcs12 *vmcs12)
3114
{
3115
#ifdef CONFIG_X86_64
3116
if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
3117
!!(vcpu->arch.efer & EFER_LMA)))
3118
return -EINVAL;
3119
#endif
3120
return 0;
3121
}
3122
3123
static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
{
	/*
	 * Check that the given linear address is canonical after a VM exit
	 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
	 */
	u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;

	return !__is_canonical_address(la, l1_address_bits_on_exit);
}
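/*
 * Minimal sketch (assumption, not from this file) of the canonicality rule
 * relied on above: a linear address is canonical for N implemented bits
 * when bits 63:N-1 are a sign extension of bit N-1, with N == 57 when
 * HOST_CR4.LA57 is set and N == 48 otherwise. The function name is
 * hypothetical.
 */
static inline bool example_is_canonical(u64 la, u8 vaddr_bits)
{
	/* Shift the upper bits out and back in; a canonical value survives. */
	return ((s64)la << (64 - vaddr_bits) >> (64 - vaddr_bits)) == (s64)la;
}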
3133
3134
static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet,
3135
u64 ssp, u64 ssp_tbl)
3136
{
3137
if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) ||
3138
CC(is_noncanonical_msr_address(ssp_tbl, vcpu)))
3139
return -EINVAL;
3140
3141
return 0;
3142
}
3143
3144
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
3145
struct vmcs12 *vmcs12)
3146
{
3147
bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
3148
3149
if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
3150
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
3151
CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
3152
return -EINVAL;
3153
3154
if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP)))
3155
return -EINVAL;
3156
3157
if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
3158
CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
3159
return -EINVAL;
3160
3161
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
3162
CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
3163
return -EINVAL;
3164
3165
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3166
CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3167
vmcs12->host_ia32_perf_global_ctrl)))
3168
return -EINVAL;
3169
3170
if (ia32e) {
3171
if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
3172
return -EINVAL;
3173
} else {
3174
if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
3175
CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
3176
CC((vmcs12->host_rip) >> 32))
3177
return -EINVAL;
3178
}
3179
3180
if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3181
CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3182
CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3183
CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3184
CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3185
CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3186
CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3187
CC(vmcs12->host_cs_selector == 0) ||
3188
CC(vmcs12->host_tr_selector == 0) ||
3189
CC(vmcs12->host_ss_selector == 0 && !ia32e))
3190
return -EINVAL;
3191
3192
if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
3193
CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
3194
CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
3195
CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
3196
CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
3197
CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
3198
return -EINVAL;
3199
3200
/*
3201
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
3202
* IA32_EFER MSR must be 0 in the field for that register. In addition,
3203
* the values of the LMA and LME bits in the field must each be that of
3204
* the host address-space size VM-exit control.
3205
*/
3206
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
3207
if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
3208
CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
3209
CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
3210
return -EINVAL;
3211
}
3212
3213
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) {
3214
if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet,
3215
vmcs12->host_ssp,
3216
vmcs12->host_ssp_tbl))
3217
return -EINVAL;
3218
3219
/*
 * IA32_S_CET and SSP must be canonical if the host will
 * enter 64-bit mode after VM-exit; otherwise, the upper
 * 32 bits must be all 0s.
 */
3224
if (ia32e) {
3225
if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) ||
3226
CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu)))
3227
return -EINVAL;
3228
} else {
3229
if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32))
3230
return -EINVAL;
3231
}
3232
}
3233
3234
return 0;
3235
}
3236
3237
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
3238
struct vmcs12 *vmcs12)
3239
{
3240
struct vcpu_vmx *vmx = to_vmx(vcpu);
3241
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
3242
struct vmcs_hdr hdr;
3243
3244
if (vmcs12->vmcs_link_pointer == INVALID_GPA)
3245
return 0;
3246
3247
if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
3248
return -EINVAL;
3249
3250
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
3251
CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
3252
vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
3253
return -EINVAL;
3254
3255
if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
3256
offsetof(struct vmcs12, hdr),
3257
sizeof(hdr))))
3258
return -EINVAL;
3259
3260
if (CC(hdr.revision_id != VMCS12_REVISION) ||
3261
CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
3262
return -EINVAL;
3263
3264
return 0;
3265
}
3266
3267
/*
3268
* Checks related to Guest Non-register State
3269
*/
3270
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
3271
{
3272
if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
3273
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
3274
vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
3275
return -EINVAL;
3276
3277
return 0;
3278
}
3279
3280
static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
3281
struct vmcs12 *vmcs12,
3282
enum vm_entry_failure_code *entry_failure_code)
3283
{
3284
bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
3285
3286
*entry_failure_code = ENTRY_FAIL_DEFAULT;
3287
3288
if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
3289
CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
3290
return -EINVAL;
3291
3292
if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP)))
3293
return -EINVAL;
3294
3295
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
3296
(CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
3297
CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
3298
return -EINVAL;
3299
3300
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
3301
CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
3302
return -EINVAL;
3303
3304
if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
3305
*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
3306
return -EINVAL;
3307
}
3308
3309
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3310
CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3311
vmcs12->guest_ia32_perf_global_ctrl)))
3312
return -EINVAL;
3313
3314
if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
3315
return -EINVAL;
3316
3317
if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
3318
CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
3319
return -EINVAL;
3320
3321
/*
3322
* If the load IA32_EFER VM-entry control is 1, the following checks
3323
* are performed on the field for the IA32_EFER MSR:
3324
* - Bits reserved in the IA32_EFER MSR must be 0.
3325
* - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3326
* the IA-32e mode guest VM-exit control. It must also be identical
3327
* to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3328
* CR0.PG) is 1.
3329
*/
3330
if (to_vmx(vcpu)->nested.nested_run_pending &&
3331
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3332
if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3333
CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3334
CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3335
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
3336
return -EINVAL;
3337
}
3338
3339
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
3340
(CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3341
CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
3342
return -EINVAL;
3343
3344
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) {
3345
if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet,
3346
vmcs12->guest_ssp,
3347
vmcs12->guest_ssp_tbl))
3348
return -EINVAL;
3349
3350
/*
3351
* Guest SSP must have 63:N bits identical, rather than
3352
* be canonical (i.e., 63:N-1 bits identical), where N is
3353
* the CPU's maximum linear-address width. Similar to
3354
* is_noncanonical_msr_address(), use the host's
3355
* linear-address width.
3356
*/
3357
if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1)))
3358
return -EINVAL;
3359
}
3360
3361
if (nested_check_guest_non_reg_state(vmcs12))
3362
return -EINVAL;
3363
3364
return 0;
3365
}
3366
3367
#ifdef CONFIG_KVM_HYPERV
3368
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
3369
{
3370
struct vcpu_vmx *vmx = to_vmx(vcpu);
3371
3372
/*
3373
* hv_evmcs may end up not being mapped after migration (when
3374
* L2 was running), map it here to make sure vmcs12 changes are
3375
* properly reflected.
3376
*/
3377
if (guest_cpu_cap_has_evmcs(vcpu) &&
3378
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
3379
enum nested_evmptrld_status evmptrld_status =
3380
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3381
3382
if (evmptrld_status == EVMPTRLD_VMFAIL ||
3383
evmptrld_status == EVMPTRLD_ERROR)
3384
return false;
3385
3386
/*
3387
* Post migration, VMCS12 always provides the most up-to-date
3388
* information; copy it to the eVMCS upon entry.
3389
*/
3390
vmx->nested.need_vmcs12_to_shadow_sync = true;
3391
}
3392
3393
return true;
3394
}
3395
#endif
3396
3397
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3398
{
3399
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3400
struct vcpu_vmx *vmx = to_vmx(vcpu);
3401
struct kvm_host_map *map;
3402
3403
if (!vcpu->arch.pdptrs_from_userspace &&
3404
!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3405
/*
3406
* Reload the guest's PDPTRs since after a migration
3407
* the guest CR3 might be restored prior to setting the nested
3408
* state, which can lead to loading the wrong PDPTRs.
3409
*/
3410
if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
3411
return false;
3412
}
3413
3414
3415
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3416
map = &vmx->nested.apic_access_page_map;
3417
3418
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
3419
vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
3420
} else {
3421
pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
3422
__func__);
3423
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3424
vcpu->run->internal.suberror =
3425
KVM_INTERNAL_ERROR_EMULATION;
3426
vcpu->run->internal.ndata = 0;
3427
return false;
3428
}
3429
}
3430
3431
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3432
map = &vmx->nested.virtual_apic_map;
3433
3434
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3435
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3436
} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3437
nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3438
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3439
/*
3440
* The processor will never use the TPR shadow, simply
3441
* clear the bit from the execution control. Such a
3442
* configuration is useless, but it happens in tests.
3443
* For any other configuration, failing the vm entry is
3444
* _not_ what the processor does but it's basically the
3445
* only possibility we have.
3446
*/
3447
exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3448
} else {
3449
/*
3450
* Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3451
* force VM-Entry to fail.
3452
*/
3453
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
3454
}
3455
}
3456
3457
if (nested_cpu_has_posted_intr(vmcs12)) {
3458
map = &vmx->nested.pi_desc_map;
3459
3460
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3461
vmx->nested.pi_desc =
3462
(struct pi_desc *)(((void *)map->hva) +
3463
offset_in_page(vmcs12->posted_intr_desc_addr));
3464
vmcs_write64(POSTED_INTR_DESC_ADDR,
3465
pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3466
} else {
3467
/*
3468
* Defer the KVM_INTERNAL_EXIT until KVM tries to
3469
* access the contents of the VMCS12 posted interrupt
3470
* descriptor. (Note that KVM may do this when it
3471
* should not, per the architectural specification.)
3472
*/
3473
vmx->nested.pi_desc = NULL;
3474
pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
3475
}
3476
}
3477
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3478
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3479
else
3480
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3481
3482
return true;
3483
}
3484
3485
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3486
{
3487
#ifdef CONFIG_KVM_HYPERV
3488
/*
3489
* Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
3490
* in 'struct kvm_vcpu_hv' in case eVMCS is in use; this is mandatory
3491
* to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
3492
* migration.
3493
*/
3494
if (!nested_get_evmcs_page(vcpu)) {
3495
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3496
__func__);
3497
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3498
vcpu->run->internal.suberror =
3499
KVM_INTERNAL_ERROR_EMULATION;
3500
vcpu->run->internal.ndata = 0;
3501
3502
return false;
3503
}
3504
#endif
3505
3506
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3507
return false;
3508
3509
return true;
3510
}
3511
3512
static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3513
{
3514
struct vmcs12 *vmcs12;
3515
struct vcpu_vmx *vmx = to_vmx(vcpu);
3516
gpa_t dst;
3517
3518
if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3519
return 0;
3520
3521
if (WARN_ON_ONCE(vmx->nested.pml_full))
3522
return 1;
3523
3524
/*
3525
* Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3526
* set is already checked as part of A/D emulation.
3527
*/
3528
vmcs12 = get_vmcs12(vcpu);
3529
if (!nested_cpu_has_pml(vmcs12))
3530
return 0;
3531
3532
if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
3533
vmx->nested.pml_full = true;
3534
return 1;
3535
}
3536
3537
gpa &= ~0xFFFull;
3538
dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3539
3540
if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3541
offset_in_page(dst), sizeof(gpa)))
3542
return 0;
3543
3544
vmcs12->guest_pml_index--;
3545
3546
return 0;
3547
}
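/*
 * Illustrative example for nested_vmx_write_pml_buffer() above, with assumed
 * values: if vmcs12->pml_address were 0x1000 and vmcs12->guest_pml_index were
 * 511, the dirty GPA would be truncated to its 4KiB page (gpa &= ~0xFFFull)
 * and written to L1 memory at dst = 0x1000 + 8 * 511 = 0x1ff8.  The index
 * then counts down toward 0; once it wraps, the guest_pml_index check at the
 * top of the function reports the PML buffer as full on the next write.
 */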
3548
3549
/*
3550
* Intel's VMX Instruction Reference specifies a common set of prerequisites
3551
* for running VMX instructions (except VMXON, whose prerequisites are
3552
* slightly different). It also specifies what exception to inject otherwise.
3553
* Note that many of these exceptions have priority over VM exits, so they
3554
* don't have to be checked again here.
3555
*/
3556
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3557
{
3558
if (!to_vmx(vcpu)->nested.vmxon) {
3559
kvm_queue_exception(vcpu, UD_VECTOR);
3560
return 0;
3561
}
3562
3563
if (vmx_get_cpl(vcpu)) {
3564
kvm_inject_gp(vcpu, 0);
3565
return 0;
3566
}
3567
3568
return 1;
3569
}
3570
3571
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3572
struct vmcs12 *vmcs12);
3573
3574
/*
3575
* If from_vmentry is false, this is being called from state restore (either RSM
3576
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
3577
*
3578
* Returns:
3579
* NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3580
* NVMX_VMENTRY_VMFAIL: Consistency check VMFail
3581
* NVMX_VMENTRY_VMEXIT: Consistency check VMExit
3582
* NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3583
*/
3584
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3585
bool from_vmentry)
3586
{
3587
struct vcpu_vmx *vmx = to_vmx(vcpu);
3588
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3589
enum vm_entry_failure_code entry_failure_code;
3590
union vmx_exit_reason exit_reason = {
3591
.basic = EXIT_REASON_INVALID_STATE,
3592
.failed_vmentry = 1,
3593
};
3594
u32 failed_index;
3595
3596
trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
3597
vmx->nested.current_vmptr,
3598
vmcs12->guest_rip,
3599
vmcs12->guest_intr_status,
3600
vmcs12->vm_entry_intr_info_field,
3601
vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
3602
vmcs12->ept_pointer,
3603
vmcs12->guest_cr3,
3604
KVM_ISA_VMX);
3605
3606
kvm_service_local_tlb_flush_requests(vcpu);
3607
3608
if (!vmx->nested.nested_run_pending ||
3609
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3610
vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
3611
if (kvm_mpx_supported() &&
3612
(!vmx->nested.nested_run_pending ||
3613
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
3614
vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3615
3616
if (!vmx->nested.nested_run_pending ||
3617
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
3618
vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet,
3619
&vmx->nested.pre_vmenter_ssp,
3620
&vmx->nested.pre_vmenter_ssp_tbl);
3621
3622
/*
3623
* Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the
3624
* event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
3625
* not KVM, KVM must unwind its software model to the pre-VM-Entry host
3626
* state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
3627
* L1's "real" CR3, which causes nested_vmx_restore_host_state() to
3628
* corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the
3629
* unwind naturally setting arch.cr3 to the correct value. Smashing
3630
* vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
3631
* reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
3632
* overwritten with a shadow CR3 prior to re-entering L1.
3633
*/
3634
if (!enable_ept)
3635
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3636
3637
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3638
3639
prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
3640
3641
if (from_vmentry) {
3642
if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3643
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3644
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3645
}
3646
3647
if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
3648
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3649
return NVMX_VMENTRY_VMFAIL;
3650
}
3651
3652
if (nested_vmx_check_guest_state(vcpu, vmcs12,
3653
&entry_failure_code)) {
3654
exit_reason.basic = EXIT_REASON_INVALID_STATE;
3655
vmcs12->exit_qualification = entry_failure_code;
3656
goto vmentry_fail_vmexit;
3657
}
3658
}
3659
3660
enter_guest_mode(vcpu);
3661
3662
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
3663
exit_reason.basic = EXIT_REASON_INVALID_STATE;
3664
vmcs12->exit_qualification = entry_failure_code;
3665
goto vmentry_fail_vmexit_guest_mode;
3666
}
3667
3668
if (from_vmentry) {
3669
failed_index = nested_vmx_load_msr(vcpu,
3670
vmcs12->vm_entry_msr_load_addr,
3671
vmcs12->vm_entry_msr_load_count);
3672
if (failed_index) {
3673
exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3674
vmcs12->exit_qualification = failed_index;
3675
goto vmentry_fail_vmexit_guest_mode;
3676
}
3677
} else {
3678
/*
3679
* The MMU is not initialized to point at the right entities yet and
3680
* "get pages" would need to read data from the guest (i.e. we will
3681
* need to perform gpa to hpa translation). Request a call
3682
* to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3683
* have already been set at vmentry time and should not be reset.
3684
*/
3685
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3686
}
3687
3688
/*
3689
* Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
3690
* when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
3691
* effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
3692
* unconditionally. Take care to pull data from vmcs01 as appropriate,
3693
* e.g. when checking for interrupt windows, as vmcs02 is now loaded.
3694
*/
3695
if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
3696
CPU_BASED_NMI_WINDOW_EXITING)) ||
3697
kvm_apic_has_pending_init_or_sipi(vcpu) ||
3698
kvm_apic_has_interrupt(vcpu))
3699
kvm_make_request(KVM_REQ_EVENT, vcpu);
3700
3701
/*
3702
* Do not start the preemption timer hrtimer until after we know
3703
* we are successful, so that only nested_vmx_vmexit needs to cancel
3704
* the timer.
3705
*/
3706
vmx->nested.preemption_timer_expired = false;
3707
if (nested_cpu_has_preemption_timer(vmcs12)) {
3708
u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3709
vmx_start_preemption_timer(vcpu, timer_value);
3710
}
3711
3712
/*
3713
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3714
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3715
* returned as far as L1 is concerned. It will only return (and set
3716
* the success flag) when L2 exits (see nested_vmx_vmexit()).
3717
*/
3718
return NVMX_VMENTRY_SUCCESS;
3719
3720
/*
3721
* A failed consistency check that leads to a VMExit during L1's
3722
* VMEnter to L2 is a variation of a normal VMexit, as explained in
3723
* 26.7 "VM-entry failures during or after loading guest state".
3724
*/
3725
vmentry_fail_vmexit_guest_mode:
3726
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3727
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3728
leave_guest_mode(vcpu);
3729
3730
vmentry_fail_vmexit:
3731
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3732
3733
if (!from_vmentry)
3734
return NVMX_VMENTRY_VMEXIT;
3735
3736
load_vmcs12_host_state(vcpu, vmcs12);
3737
vmcs12->vm_exit_reason = exit_reason.full;
3738
if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
3739
vmx->nested.need_vmcs12_to_shadow_sync = true;
3740
return NVMX_VMENTRY_VMEXIT;
3741
}
3742
3743
/*
3744
* nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3745
* for running an L2 nested guest.
3746
*/
3747
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3748
{
3749
struct vmcs12 *vmcs12;
3750
enum nvmx_vmentry_status status;
3751
struct vcpu_vmx *vmx = to_vmx(vcpu);
3752
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3753
enum nested_evmptrld_status evmptrld_status;
3754
3755
if (!nested_vmx_check_permission(vcpu))
3756
return 1;
3757
3758
evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3759
if (evmptrld_status == EVMPTRLD_ERROR) {
3760
kvm_queue_exception(vcpu, UD_VECTOR);
3761
return 1;
3762
}
3763
3764
kvm_pmu_branch_retired(vcpu);
3765
3766
if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
3767
return nested_vmx_failInvalid(vcpu);
3768
3769
if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
3770
vmx->nested.current_vmptr == INVALID_GPA))
3771
return nested_vmx_failInvalid(vcpu);
3772
3773
vmcs12 = get_vmcs12(vcpu);
3774
3775
/*
3776
* Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3777
* that there *is* a valid VMCS pointer, RFLAGS.CF is set
3778
* rather than RFLAGS.ZF, and no error number is stored to the
3779
* VM-instruction error field.
3780
*/
3781
if (CC(vmcs12->hdr.shadow_vmcs))
3782
return nested_vmx_failInvalid(vcpu);
3783
3784
if (nested_vmx_is_evmptr12_valid(vmx)) {
3785
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
3786
3787
copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
3788
/* Enlightened VMCS doesn't have launch state */
3789
vmcs12->launch_state = !launch;
3790
} else if (enable_shadow_vmcs) {
3791
copy_shadow_to_vmcs12(vmx);
3792
}
3793
3794
/*
3795
* The nested entry process starts with enforcing various prerequisites
3796
* on vmcs12 as required by the Intel SDM, and acting appropriately when
3797
* they fail: As the SDM explains, some conditions should cause the
3798
* instruction to fail, while others will cause the instruction to seem
3799
* to succeed, but return an EXIT_REASON_INVALID_STATE.
3800
* To speed up the normal (success) code path, we should avoid checking
3801
* for misconfigurations that will be caught by the processor anyway
3802
* when using the merged vmcs02.
3803
*/
3804
if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
3805
return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3806
3807
if (CC(vmcs12->launch_state == launch))
3808
return nested_vmx_fail(vcpu,
3809
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3810
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3811
3812
if (nested_vmx_check_controls(vcpu, vmcs12))
3813
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3814
3815
if (nested_vmx_check_address_space_size(vcpu, vmcs12))
3816
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3817
3818
if (nested_vmx_check_host_state(vcpu, vmcs12))
3819
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3820
3821
/*
3822
* We're finally done with prerequisite checking, and can start with
3823
* the nested entry.
3824
*/
3825
vmx->nested.nested_run_pending = 1;
3826
vmx->nested.has_preemption_timer_deadline = false;
3827
status = nested_vmx_enter_non_root_mode(vcpu, true);
3828
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3829
goto vmentry_failed;
3830
3831
/* Hide L1D cache contents from the nested guest. */
3832
kvm_request_l1tf_flush_l1d();
3833
3834
/*
3835
* Must happen outside of nested_vmx_enter_non_root_mode() as it will
3836
* also be used as part of restoring nVMX state for
3837
* snapshot restore (migration).
3838
*
3839
* In this flow, it is assumed that vmcs12 cache was
3840
* transferred as part of captured nVMX state and should
3841
* therefore not be read from guest memory (which may not
3842
* exist on destination host yet).
3843
*/
3844
nested_cache_shadow_vmcs12(vcpu, vmcs12);
3845
3846
switch (vmcs12->guest_activity_state) {
3847
case GUEST_ACTIVITY_HLT:
3848
/*
3849
* If we're entering a halted L2 vcpu and the L2 vcpu won't be
3850
* awakened by event injection or by an NMI-window VM-exit or
3851
* by an interrupt-window VM-exit, halt the vcpu.
3852
*/
3853
if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3854
!nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3855
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3856
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3857
vmx->nested.nested_run_pending = 0;
3858
return kvm_emulate_halt_noskip(vcpu);
3859
}
3860
break;
3861
case GUEST_ACTIVITY_WAIT_SIPI:
3862
vmx->nested.nested_run_pending = 0;
3863
kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
3864
break;
3865
default:
3866
break;
3867
}
3868
3869
return 1;
3870
3871
vmentry_failed:
3872
vmx->nested.nested_run_pending = 0;
3873
if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3874
return 0;
3875
if (status == NVMX_VMENTRY_VMEXIT)
3876
return 1;
3877
WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3878
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3879
}
3880
3881
/*
3882
* On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3883
* because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3884
* This function returns the new value we should put in vmcs12.guest_cr0.
3885
* It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3886
* 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3887
* available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3888
* didn't trap the bit, because if L1 did, so would L0).
3889
* 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3890
* been modified by L2, and L1 knows it. So just leave the old value of
3891
* the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3892
* isn't relevant, because if L0 traps this bit it can set it to anything.
3893
* 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3894
* changed these bits, and therefore they need to be updated, but L0
3895
* didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3896
* put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3897
*/
3898
static inline unsigned long
3899
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3900
{
3901
return
3902
/*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3903
/*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3904
/*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3905
vcpu->arch.cr0_guest_owned_bits));
3906
}
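/*
 * A minimal illustrative sketch (not part of KVM) of the three-source merge
 * performed by vmcs12_guest_cr0() above and vmcs12_guest_cr4() below, written
 * generically for a single register value.  The helper name and parameters
 * are made up for illustration: 'owned' stands for the bits L0 lets the guest
 * own, 'mask' for L1's guest/host mask taken from vmcs12.
 */
static inline unsigned long nested_merge_cr_bits(unsigned long vmcs02_val,
						 unsigned long vmcs12_val,
						 unsigned long read_shadow,
						 unsigned long owned,
						 unsigned long mask)
{
	return (vmcs02_val & owned) |		/* 1: trapped by neither L0 nor L1 */
	       (vmcs12_val & mask) |		/* 2: trapped by L1 (and thus by L0) */
	       (read_shadow & ~(mask | owned));	/* 3: trapped by L0 but not by L1 */
}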
3907
3908
static inline unsigned long
3909
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3910
{
3911
return
3912
/*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3913
/*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3914
/*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3915
vcpu->arch.cr4_guest_owned_bits));
3916
}
3917
3918
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3919
struct vmcs12 *vmcs12,
3920
u32 vm_exit_reason, u32 exit_intr_info)
3921
{
3922
u32 idt_vectoring;
3923
unsigned int nr;
3924
3925
/*
3926
* Per the SDM, VM-Exits due to double and triple faults are never
3927
* considered to occur during event delivery, even if the double/triple
3928
* fault is the result of an escalating vectoring issue.
3929
*
3930
* Note, the SDM qualifies the double fault behavior with "The original
3931
* event results in a double-fault exception". It's unclear why the
3932
* qualification exists since exits due to double fault can occur only
3933
* while vectoring a different exception (injected events are never
3934
* subject to interception), i.e. there's _always_ an original event.
3935
*
3936
* The SDM also uses NMI as a confusing example for the "original event
3937
* causes the VM exit directly" clause. NMI isn't special in any way,
3938
* the same rule applies to all events that cause an exit directly.
3939
* NMI is an odd choice for the example because NMIs can only occur on
3940
* instruction boundaries, i.e. they _can't_ occur during vectoring.
3941
*/
3942
if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
3943
((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
3944
is_double_fault(exit_intr_info))) {
3945
vmcs12->idt_vectoring_info_field = 0;
3946
} else if (vcpu->arch.exception.injected) {
3947
nr = vcpu->arch.exception.vector;
3948
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3949
3950
if (kvm_exception_is_soft(nr)) {
3951
vmcs12->vm_exit_instruction_len =
3952
vcpu->arch.event_exit_inst_len;
3953
idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3954
} else
3955
idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3956
3957
if (vcpu->arch.exception.has_error_code) {
3958
idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3959
vmcs12->idt_vectoring_error_code =
3960
vcpu->arch.exception.error_code;
3961
}
3962
3963
vmcs12->idt_vectoring_info_field = idt_vectoring;
3964
} else if (vcpu->arch.nmi_injected) {
3965
vmcs12->idt_vectoring_info_field =
3966
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3967
} else if (vcpu->arch.interrupt.injected) {
3968
nr = vcpu->arch.interrupt.nr;
3969
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3970
3971
if (vcpu->arch.interrupt.soft) {
3972
idt_vectoring |= INTR_TYPE_SOFT_INTR;
3973
vmcs12->vm_entry_instruction_len =
3974
vcpu->arch.event_exit_inst_len;
3975
} else
3976
idt_vectoring |= INTR_TYPE_EXT_INTR;
3977
3978
vmcs12->idt_vectoring_info_field = idt_vectoring;
3979
} else {
3980
vmcs12->idt_vectoring_info_field = 0;
3981
}
3982
}
3983
3984
3985
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3986
{
3987
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3988
gfn_t gfn;
3989
3990
/*
3991
* Don't need to mark the APIC access page dirty; it is never
3992
* written to by the CPU during APIC virtualization.
3993
*/
3994
3995
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3996
gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3997
kvm_vcpu_mark_page_dirty(vcpu, gfn);
3998
}
3999
4000
if (nested_cpu_has_posted_intr(vmcs12)) {
4001
gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
4002
kvm_vcpu_mark_page_dirty(vcpu, gfn);
4003
}
4004
}
4005
4006
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4007
{
4008
struct vcpu_vmx *vmx = to_vmx(vcpu);
4009
int max_irr;
4010
void *vapic_page;
4011
u16 status;
4012
4013
if (!vmx->nested.pi_pending)
4014
return 0;
4015
4016
if (!vmx->nested.pi_desc)
4017
goto mmio_needed;
4018
4019
vmx->nested.pi_pending = false;
4020
4021
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
4022
return 0;
4023
4024
max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
4025
if (max_irr > 0) {
4026
vapic_page = vmx->nested.virtual_apic_map.hva;
4027
if (!vapic_page)
4028
goto mmio_needed;
4029
4030
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
4031
vapic_page, &max_irr);
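/*
 * The low byte of GUEST_INTR_STATUS is RVI (the Requesting Virtual
 * Interrupt); raise it if the highest vector just merged from the posted
 * interrupt descriptor outranks the current value.
 */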
4032
status = vmcs_read16(GUEST_INTR_STATUS);
4033
if ((u8)max_irr > ((u8)status & 0xff)) {
4034
status &= ~0xff;
4035
status |= (u8)max_irr;
4036
vmcs_write16(GUEST_INTR_STATUS, status);
4037
}
4038
}
4039
4040
nested_mark_vmcs12_pages_dirty(vcpu);
4041
return 0;
4042
4043
mmio_needed:
4044
kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
4045
return -ENXIO;
4046
}
4047
4048
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
4049
{
4050
struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
4051
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
4052
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4053
unsigned long exit_qual;
4054
4055
if (ex->has_payload) {
4056
exit_qual = ex->payload;
4057
} else if (ex->vector == PF_VECTOR) {
4058
exit_qual = vcpu->arch.cr2;
4059
} else if (ex->vector == DB_VECTOR) {
4060
exit_qual = vcpu->arch.dr6;
4061
exit_qual &= ~DR6_BT;
4062
exit_qual ^= DR6_ACTIVE_LOW;
4063
} else {
4064
exit_qual = 0;
4065
}
4066
4067
/*
4068
* Unlike AMD's Paged Real Mode, which reports an error code on #PF
4069
* VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
4070
* "has error code" flags on VM-Exit if the CPU is in Real Mode.
4071
*/
4072
if (ex->has_error_code && is_protmode(vcpu)) {
4073
/*
4074
* Intel CPUs do not generate error codes with bits 31:16 set,
4075
* and more importantly VMX disallows setting bits 31:16 in the
4076
* injected error code for VM-Entry. Drop the bits to mimic
4077
* hardware and avoid inducing failure on nested VM-Entry if L1
4078
* chooses to inject the exception back to L2. AMD CPUs _do_
4079
* generate "full" 32-bit error codes, so KVM allows userspace
4080
* to inject exception error codes with bits 31:16 set.
4081
*/
4082
vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
4083
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
4084
}
4085
4086
if (kvm_exception_is_soft(ex->vector))
4087
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
4088
else
4089
intr_info |= INTR_TYPE_HARD_EXCEPTION;
4090
4091
if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
4092
vmx_get_nmi_mask(vcpu))
4093
intr_info |= INTR_INFO_UNBLOCK_NMI;
4094
4095
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
4096
}
4097
4098
/*
4099
* Returns true if a debug trap is (likely) pending delivery. Infer the class
4100
* of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
4101
* Using the payload is flawed because code breakpoints (fault-like) and data
4102
* breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
4103
* this will return false positives if a to-be-injected code breakpoint #DB is
4104
* pending (from KVM's perspective, but not "pending" across an instruction
4105
* boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
4106
* too is trap-like.
4107
*
4108
* KVM "works" despite these flaws as ICEBP isn't currently supported by the
4109
* emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
4110
* #DB has already happened), and MTF isn't marked pending on code breakpoints
4111
* from the emulator (because such #DBs are fault-like and thus don't trigger
4112
* actions that fire on instruction retire).
4113
*/
4114
static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
4115
{
4116
if (!ex->pending || ex->vector != DB_VECTOR)
4117
return 0;
4118
4119
/* General Detect #DBs are always fault-like. */
4120
return ex->payload & ~DR6_BD;
4121
}
4122
4123
/*
4124
* Returns true if there's a pending #DB exception that is lower priority than
4125
* a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
4126
* KVM, but could theoretically be injected by userspace. Note, this code is
4127
* imperfect, see above.
4128
*/
4129
static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
4130
{
4131
return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
4132
}
4133
4134
/*
4135
* Certain VM-exits set the 'pending debug exceptions' field to indicate a
4136
* recognized #DB (data or single-step) that has yet to be delivered. Since KVM
4137
* represents these debug traps with a payload that is said to be compatible
4138
* with the 'pending debug exceptions' field, write the payload to the VMCS
4139
* field if a VM-exit is delivered before the debug trap.
4140
*/
4141
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
4142
{
4143
unsigned long pending_dbg;
4144
4145
pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
4146
if (pending_dbg)
4147
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
4148
}
4149
4150
static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
4151
{
4152
return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
4153
to_vmx(vcpu)->nested.preemption_timer_expired;
4154
}
4155
4156
static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
4157
{
4158
struct vcpu_vmx *vmx = to_vmx(vcpu);
4159
void *vapic = vmx->nested.virtual_apic_map.hva;
4160
int max_irr, vppr;
4161
4162
if (nested_vmx_preemption_timer_pending(vcpu) ||
4163
vmx->nested.mtf_pending)
4164
return true;
4165
4166
/*
4167
* Virtual Interrupt Delivery doesn't require manual injection. Either
4168
* the interrupt is already in GUEST_RVI and will be recognized by CPU
4169
* at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
4170
* the interrupt from the PIR to RVI prior to entering the guest.
4171
*/
4172
if (for_injection)
4173
return false;
4174
4175
if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4176
__vmx_interrupt_blocked(vcpu))
4177
return false;
4178
4179
if (!vapic)
4180
return false;
4181
4182
vppr = *((u32 *)(vapic + APIC_PROCPRI));
4183
4184
max_irr = vmx_get_rvi();
4185
if ((max_irr & 0xf0) > (vppr & 0xf0))
4186
return true;
4187
4188
if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
4189
pi_test_on(vmx->nested.pi_desc)) {
4190
max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
4191
if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
4192
return true;
4193
}
4194
4195
return false;
4196
}
4197
4198
/*
4199
* Per the Intel SDM's table "Priority Among Concurrent Events", with minor
4200
* edits to fill in missing examples, e.g. #DB due to split-lock accesses,
4201
* and less minor edits to splice in the priority of VMX Non-Root specific
4202
* events, e.g. MTF and NMI/INTR-window exiting.
4203
*
4204
* 1 Hardware Reset and Machine Checks
4205
* - RESET
4206
* - Machine Check
4207
*
4208
* 2 Trap on Task Switch
4209
* - T flag in TSS is set (on task switch)
4210
*
4211
* 3 External Hardware Interventions
4212
* - FLUSH
4213
* - STOPCLK
4214
* - SMI
4215
* - INIT
4216
*
4217
* 3.5 Monitor Trap Flag (MTF) VM-exit[1]
4218
*
4219
* 4 Traps on Previous Instruction
4220
* - Breakpoints
4221
* - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
4222
* breakpoint, or #DB due to a split-lock access)
4223
*
4224
* 4.3 VMX-preemption timer expired VM-exit[2]
4225
*
4226
* 4.6 NMI-window exiting VM-exit[3]
4227
*
4228
* 5 Nonmaskable Interrupts (NMI)
4229
*
4230
* 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
4231
*
4232
* 6 Maskable Hardware Interrupts
4233
*
4234
* 7 Code Breakpoint Fault
4235
*
4236
* 8 Faults from Fetching Next Instruction
4237
* - Code-Segment Limit Violation
4238
* - Code Page Fault
4239
* - Control protection exception (missing ENDBRANCH at target of indirect
4240
* call or jump)
4241
*
4242
* 9 Faults from Decoding Next Instruction
4243
* - Instruction length > 15 bytes
4244
* - Invalid Opcode
4245
* - Coprocessor Not Available
4246
*
4247
*10 Faults on Executing Instruction
4248
* - Overflow
4249
* - Bound error
4250
* - Invalid TSS
4251
* - Segment Not Present
4252
* - Stack fault
4253
* - General Protection
4254
* - Data Page Fault
4255
* - Alignment Check
4256
* - x86 FPU Floating-point exception
4257
* - SIMD floating-point exception
4258
* - Virtualization exception
4259
* - Control protection exception
4260
*
4261
* [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
4262
* INIT signals, and higher priority events take priority over MTF VM exits.
4263
* MTF VM exits take priority over debug-trap exceptions and lower priority
4264
* events.
4265
*
4266
* [2] Debug-trap exceptions and higher priority events take priority over VM exits
4267
* caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
4268
* timer take priority over VM exits caused by the "NMI-window exiting"
4269
* VM-execution control and lower priority events.
4270
*
4271
* [3] Debug-trap exceptions and higher priority events take priority over VM exits
4272
* caused by "NMI-window exiting". VM exits caused by this control take
4273
* priority over non-maskable interrupts (NMIs) and lower priority events.
4274
*
4275
* [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
4276
* the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
4277
* non-maskable interrupts (NMIs) and higher priority events take priority over
4278
* delivery of a virtual interrupt; delivery of a virtual interrupt takes
4279
* priority over external interrupts and lower priority events.
4280
*/
4281
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
4282
{
4283
struct kvm_lapic *apic = vcpu->arch.apic;
4284
struct vcpu_vmx *vmx = to_vmx(vcpu);
4285
/*
4286
* Only a pending nested run blocks a pending exception. If there is a
4287
* previously injected event, the pending exception occurred while said
4288
* event was being delivered and thus needs to be handled.
4289
*/
4290
bool block_nested_exceptions = vmx->nested.nested_run_pending;
4291
/*
4292
* Events that don't require injection, i.e. that are virtualized by
4293
* hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
4294
* to regain control in order to deliver the event, and hardware will
4295
* handle event ordering, e.g. with respect to injected exceptions.
4296
*
4297
* But, new events (not exceptions) are only recognized at instruction
4298
* boundaries. If an event needs reinjection, then KVM is handling a
4299
* VM-Exit that occurred _during_ instruction execution; new events,
4300
* irrespective of whether or not they're injected, are blocked until
4301
* the instruction completes.
4302
*/
4303
bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
4304
/*
4305
* Injected events are blocked by nested VM-Enter, as KVM is responsible
4306
* for managing priority between concurrent events, i.e. KVM needs to
4307
* wait until after VM-Enter completes to deliver injected events.
4308
*/
4309
bool block_nested_events = block_nested_exceptions ||
4310
block_non_injected_events;
4311
4312
if (lapic_in_kernel(vcpu) &&
4313
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
4314
if (block_nested_events)
4315
return -EBUSY;
4316
nested_vmx_update_pending_dbg(vcpu);
4317
clear_bit(KVM_APIC_INIT, &apic->pending_events);
4318
if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
4319
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
4320
4321
/* MTF is discarded if the vCPU is in WFS. */
4322
vmx->nested.mtf_pending = false;
4323
return 0;
4324
}
4325
4326
if (lapic_in_kernel(vcpu) &&
4327
test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
4328
if (block_nested_events)
4329
return -EBUSY;
4330
4331
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
4332
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
4333
nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
4334
apic->sipi_vector & 0xFFUL);
4335
return 0;
4336
}
4337
/* Fallthrough, the SIPI is completely ignored. */
4338
}
4339
4340
/*
4341
* Process exceptions that are higher priority than Monitor Trap Flag:
4342
* fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
4343
* could theoretically come in from userspace), and ICEBP (INT1).
4344
*
4345
* TODO: SMIs have higher priority than MTF and trap-like #DBs (except
4346
* for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
4347
* across SMI/RSM as it should; that needs to be addressed in order to
4348
* prioritize SMI over MTF and trap-like #DBs.
4349
*/
4350
if (vcpu->arch.exception_vmexit.pending &&
4351
!vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
4352
if (block_nested_exceptions)
4353
return -EBUSY;
4354
4355
nested_vmx_inject_exception_vmexit(vcpu);
4356
return 0;
4357
}
4358
4359
if (vcpu->arch.exception.pending &&
4360
!vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
4361
if (block_nested_exceptions)
4362
return -EBUSY;
4363
goto no_vmexit;
4364
}
4365
4366
if (vmx->nested.mtf_pending) {
4367
if (block_nested_events)
4368
return -EBUSY;
4369
nested_vmx_update_pending_dbg(vcpu);
4370
nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
4371
return 0;
4372
}
4373
4374
if (vcpu->arch.exception_vmexit.pending) {
4375
if (block_nested_exceptions)
4376
return -EBUSY;
4377
4378
nested_vmx_inject_exception_vmexit(vcpu);
4379
return 0;
4380
}
4381
4382
if (vcpu->arch.exception.pending) {
4383
if (block_nested_exceptions)
4384
return -EBUSY;
4385
goto no_vmexit;
4386
}
4387
4388
if (nested_vmx_preemption_timer_pending(vcpu)) {
4389
if (block_nested_events)
4390
return -EBUSY;
4391
nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
4392
return 0;
4393
}
4394
4395
if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
4396
if (block_nested_events)
4397
return -EBUSY;
4398
goto no_vmexit;
4399
}
4400
4401
if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
4402
if (block_nested_events)
4403
return -EBUSY;
4404
if (!nested_exit_on_nmi(vcpu))
4405
goto no_vmexit;
4406
4407
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
4408
NMI_VECTOR | INTR_TYPE_NMI_INTR |
4409
INTR_INFO_VALID_MASK, 0);
4410
/*
4411
* The NMI-triggered VM exit counts as injection:
4412
* clear this one and block further NMIs.
4413
*/
4414
vcpu->arch.nmi_pending = 0;
4415
vmx_set_nmi_mask(vcpu, true);
4416
return 0;
4417
}
4418
4419
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
4420
int irq;
4421
4422
if (!nested_exit_on_intr(vcpu)) {
4423
if (block_nested_events)
4424
return -EBUSY;
4425
4426
goto no_vmexit;
4427
}
4428
4429
if (!nested_exit_intr_ack_set(vcpu)) {
4430
if (block_nested_events)
4431
return -EBUSY;
4432
4433
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
4434
return 0;
4435
}
4436
4437
irq = kvm_cpu_get_extint(vcpu);
4438
if (irq != -1) {
4439
if (block_nested_events)
4440
return -EBUSY;
4441
4442
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4443
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
4444
return 0;
4445
}
4446
4447
irq = kvm_apic_has_interrupt(vcpu);
4448
if (WARN_ON_ONCE(irq < 0))
4449
goto no_vmexit;
4450
4451
/*
4452
* If the IRQ is L2's PI notification vector, process posted
4453
* interrupts for L2 instead of injecting VM-Exit, as the
4454
* detection/morphing architecturally occurs when the IRQ is
4455
* delivered to the CPU. Note, only interrupts that are routed
4456
* through the local APIC trigger posted interrupt processing,
4457
* and enabling posted interrupts requires ACK-on-exit.
4458
*/
4459
if (irq == vmx->nested.posted_intr_nv) {
4460
/*
4461
* Nested posted interrupts are delivered via RVI, i.e.
4462
* aren't injected by KVM, and so can be queued even if
4463
* manual event injection is disallowed.
4464
*/
4465
if (block_non_injected_events)
4466
return -EBUSY;
4467
4468
vmx->nested.pi_pending = true;
4469
kvm_apic_clear_irr(vcpu, irq);
4470
goto no_vmexit;
4471
}
4472
4473
if (block_nested_events)
4474
return -EBUSY;
4475
4476
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4477
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
4478
4479
/*
4480
* ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
4481
* be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
4482
* if APICv is active.
4483
*/
4484
kvm_apic_ack_interrupt(vcpu, irq);
4485
return 0;
4486
}
4487
4488
no_vmexit:
4489
return vmx_complete_nested_posted_interrupt(vcpu);
4490
}
4491
4492
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
4493
{
4494
ktime_t remaining =
4495
hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
4496
u64 value;
4497
4498
if (ktime_to_ns(remaining) <= 0)
4499
return 0;
4500
4501
value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
4502
do_div(value, 1000000);
4503
return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
4504
}
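/*
 * Worked example for vmx_get_preemption_timer_value() above, using assumed
 * numbers: if vcpu->arch.virtual_tsc_khz were 2000000 (a 2 GHz guest TSC)
 * and 1 ms remained on the hrtimer, then value = 1000000 ns * 2000000 /
 * 1000000 = 2000000 guest TSC ticks, which the final shift scales down to
 * preemption-timer units (2000000 >> 5 = 62500, assuming a rate shift of 5).
 */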
4505
4506
static bool is_vmcs12_ext_field(unsigned long field)
4507
{
4508
switch (field) {
4509
case GUEST_ES_SELECTOR:
4510
case GUEST_CS_SELECTOR:
4511
case GUEST_SS_SELECTOR:
4512
case GUEST_DS_SELECTOR:
4513
case GUEST_FS_SELECTOR:
4514
case GUEST_GS_SELECTOR:
4515
case GUEST_LDTR_SELECTOR:
4516
case GUEST_TR_SELECTOR:
4517
case GUEST_ES_LIMIT:
4518
case GUEST_CS_LIMIT:
4519
case GUEST_SS_LIMIT:
4520
case GUEST_DS_LIMIT:
4521
case GUEST_FS_LIMIT:
4522
case GUEST_GS_LIMIT:
4523
case GUEST_LDTR_LIMIT:
4524
case GUEST_TR_LIMIT:
4525
case GUEST_GDTR_LIMIT:
4526
case GUEST_IDTR_LIMIT:
4527
case GUEST_ES_AR_BYTES:
4528
case GUEST_DS_AR_BYTES:
4529
case GUEST_FS_AR_BYTES:
4530
case GUEST_GS_AR_BYTES:
4531
case GUEST_LDTR_AR_BYTES:
4532
case GUEST_TR_AR_BYTES:
4533
case GUEST_ES_BASE:
4534
case GUEST_CS_BASE:
4535
case GUEST_SS_BASE:
4536
case GUEST_DS_BASE:
4537
case GUEST_FS_BASE:
4538
case GUEST_GS_BASE:
4539
case GUEST_LDTR_BASE:
4540
case GUEST_TR_BASE:
4541
case GUEST_GDTR_BASE:
4542
case GUEST_IDTR_BASE:
4543
case GUEST_PENDING_DBG_EXCEPTIONS:
4544
case GUEST_BNDCFGS:
4545
return true;
4546
default:
4547
break;
4548
}
4549
4550
return false;
4551
}
4552
4553
static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4554
struct vmcs12 *vmcs12)
4555
{
4556
struct vcpu_vmx *vmx = to_vmx(vcpu);
4557
4558
vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4559
vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4560
vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4561
vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4562
vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4563
vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4564
vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4565
vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4566
vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4567
vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4568
vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4569
vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4570
vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4571
vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4572
vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4573
vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4574
vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4575
vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4576
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
4577
vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4578
vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4579
vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4580
vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4581
vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4582
vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4583
vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4584
vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4585
vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4586
vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4587
vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4588
vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4589
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4590
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4591
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
4592
vmcs12->guest_pending_dbg_exceptions =
4593
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4594
4595
vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4596
}
4597
4598
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4599
struct vmcs12 *vmcs12)
4600
{
4601
struct vcpu_vmx *vmx = to_vmx(vcpu);
4602
int cpu;
4603
4604
if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4605
return;
4606
4607
4608
WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4609
4610
cpu = get_cpu();
4611
vmx->loaded_vmcs = &vmx->nested.vmcs02;
4612
vmx_vcpu_load_vmcs(vcpu, cpu);
4613
4614
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4615
4616
vmx->loaded_vmcs = &vmx->vmcs01;
4617
vmx_vcpu_load_vmcs(vcpu, cpu);
4618
put_cpu();
4619
}
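/*
 * The "rare" fields above can only be read while vmcs02 is the current VMCS
 * on this CPU, hence the temporary vmx_vcpu_load_vmcs() switch; setting
 * need_sync_vmcs02_to_vmcs12_rare in sync_vmcs02_to_vmcs12() instead of
 * reading the fields eagerly lets this relatively expensive round trip be
 * deferred until the rare fields are actually needed.
 */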
4620
4621
/*
4622
* Update the guest state fields of vmcs12 to reflect changes that
4623
* occurred while L2 was running. (The "IA-32e mode guest" bit of the
4624
* VM-entry controls is also updated, since this is really a guest
4625
* state bit.)
4626
*/
4627
static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4628
{
4629
struct vcpu_vmx *vmx = to_vmx(vcpu);
4630
4631
if (nested_vmx_is_evmptr12_valid(vmx))
4632
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4633
4634
vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4635
!nested_vmx_is_evmptr12_valid(vmx);
4636
4637
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4638
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4639
4640
vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4641
vmcs12->guest_rip = kvm_rip_read(vcpu);
4642
vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4643
4644
vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4645
vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
4646
4647
vmcs12->guest_interruptibility_info =
4648
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
4649
4650
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4651
vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
4652
else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4653
vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
4654
else
4655
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4656
4657
if (nested_cpu_has_preemption_timer(vmcs12) &&
4658
vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4659
!vmx->nested.nested_run_pending)
4660
vmcs12->vmx_preemption_timer_value =
4661
vmx_get_preemption_timer_value(vcpu);
4662
4663
/*
4664
* In some cases (usually, nested EPT), L2 is allowed to change its
4665
* own CR3 without exiting. If it has changed it, we must keep it.
4666
* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4667
* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4668
*
4669
* Additionally, restore L2's PDPTR to vmcs12.
4670
*/
4671
if (enable_ept) {
4672
vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
4673
if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4674
vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4675
vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4676
vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4677
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4678
}
4679
}
4680
4681
vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4682
4683
if (nested_cpu_has_vid(vmcs12))
4684
vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4685
4686
vmcs12->vm_entry_controls =
4687
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4688
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4689
4690
/*
4691
* Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
4692
* Writes to DEBUGCTL that aren't intercepted by L1 are immediately
4693
* propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
4694
* vmcs02 doesn't strictly track vmcs12.
4695
*/
4696
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
4697
vmcs12->guest_dr7 = vcpu->arch.dr7;
4698
4699
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4700
vmcs12->guest_ia32_efer = vcpu->arch.efer;
4701
4702
vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet,
4703
&vmcs12->guest_ssp,
4704
&vmcs12->guest_ssp_tbl);
4705
}
4706
4707
/*
4708
* prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4709
* and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4710
* and this function updates it to reflect the changes to the guest state while
4711
* L2 was running (and perhaps made some exits which were handled directly by L0
4712
* without going back to L1), and to reflect the exit reason.
4713
* Note that we do not have to copy here all VMCS fields, just those that
4714
* could have changed by the L2 guest or the exit - i.e., the guest-state and
4715
* exit-information fields only. Other fields are modified by L1 with VMWRITE,
4716
* which already writes to vmcs12 directly.
4717
*/
4718
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4719
u32 vm_exit_reason, u32 exit_intr_info,
4720
unsigned long exit_qualification, u32 exit_insn_len)
4721
{
4722
/* update exit information fields: */
4723
vmcs12->vm_exit_reason = vm_exit_reason;
4724
if (vmx_get_exit_reason(vcpu).enclave_mode)
4725
vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
4726
vmcs12->exit_qualification = exit_qualification;
4727
4728
/*
4729
* On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
4730
* and only EXIT_REASON and EXIT_QUALIFICATION are updated; all other
4731
* exit info fields are unmodified.
4732
*/
4733
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4734
vmcs12->launch_state = 1;
4735
4736
/* vm_entry_intr_info_field is cleared on exit. Emulate this
4737
* instead of reading the real value. */
4738
vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4739
4740
/*
4741
* Transfer the event that L0 or L1 may have wanted to inject into
4742
* L2 to IDT_VECTORING_INFO_FIELD.
4743
*/
4744
vmcs12_save_pending_event(vcpu, vmcs12,
4745
vm_exit_reason, exit_intr_info);
4746
4747
vmcs12->vm_exit_intr_info = exit_intr_info;
4748
vmcs12->vm_exit_instruction_len = exit_insn_len;
4749
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4750
4751
/*
4752
* According to spec, there's no need to store the guest's
4753
* MSRs if the exit is due to a VM-entry failure that occurs
4754
* during or after loading the guest state. Since this exit
4755
* does not fall in that category, we need to save the MSRs.
4756
*/
4757
if (nested_vmx_store_msr(vcpu,
4758
vmcs12->vm_exit_msr_store_addr,
4759
vmcs12->vm_exit_msr_store_count))
4760
nested_vmx_abort(vcpu,
4761
VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4762
}
4763
}
4764
4765
/*
4766
* A part of what we need to do when the nested L2 guest exits and we want to
4767
* run its L1 parent, is to reset L1's guest state to the host state specified
4768
* in vmcs12.
4769
* This function is to be called not only on normal nested exit, but also on
4770
* a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4771
* Failures During or After Loading Guest State").
4772
* This function should be called when the active VMCS is L1's (vmcs01).
4773
*/
4774
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4775
struct vmcs12 *vmcs12)
4776
{
4777
enum vm_entry_failure_code ignored;
4778
struct kvm_segment seg;
4779
4780
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4781
vcpu->arch.efer = vmcs12->host_ia32_efer;
4782
else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4783
vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4784
else
4785
vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4786
vmx_set_efer(vcpu, vcpu->arch.efer);
4787
4788
kvm_rsp_write(vcpu, vmcs12->host_rsp);
4789
kvm_rip_write(vcpu, vmcs12->host_rip);
4790
vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4791
vmx_set_interrupt_shadow(vcpu, 0);
4792
4793
/*
4794
* Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4795
* actually changed, because vmx_set_cr0 refers to efer set above.
4796
*
4797
* CR0_GUEST_HOST_MASK is already set in the original vmcs01
4798
* (KVM doesn't change it).
4799
*/
4800
vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4801
vmx_set_cr0(vcpu, vmcs12->host_cr0);
4802
4803
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
4804
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4805
vmx_set_cr4(vcpu, vmcs12->host_cr4);
4806
4807
nested_ept_uninit_mmu_context(vcpu);
4808
4809
/*
4810
* Only PDPTE load can fail as the value of cr3 was checked on entry and
4811
* couldn't have changed.
4812
*/
4813
if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
4814
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4815
4816
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
4817
4818
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4819
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4820
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4821
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4822
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
4823
vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4824
vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4825
4826
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
4827
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4828
vmcs_write64(GUEST_BNDCFGS, 0);
4829
4830
/*
4831
* Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set;
4832
* otherwise CET state should be retained across VM-exit, i.e.,
4833
* guest values should be propagated from vmcs12 to vmcs01.
4834
*/
4835
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE)
4836
vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp,
4837
vmcs12->host_ssp_tbl);
4838
else
4839
vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp,
4840
vmcs12->guest_ssp_tbl);
4841
4842
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4843
vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4844
vcpu->arch.pat = vmcs12->host_ia32_pat;
4845
}
4846
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
4847
kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
4848
WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4849
vmcs12->host_ia32_perf_global_ctrl));
4850
4851
/* Set L1 segment info according to Intel SDM
4852
27.5.2 Loading Host Segment and Descriptor-Table Registers */
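/*
 * Decoding the hard-coded attributes below: for the code/data segments
 * (.s = 1), type 11 is an accessed execute/read code segment and type 3 is
 * an accessed read/write data segment; for TR (.s = 0), type 11 is a busy
 * TSS. .g = 1 (4-KByte granularity) makes the flat 0xFFFFFFFF limits
 * expressible in the descriptor.
 */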
4853
seg = (struct kvm_segment) {
4854
.base = 0,
4855
.limit = 0xFFFFFFFF,
4856
.selector = vmcs12->host_cs_selector,
4857
.type = 11,
4858
.present = 1,
4859
.s = 1,
4860
.g = 1
4861
};
4862
if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4863
seg.l = 1;
4864
else
4865
seg.db = 1;
4866
__vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4867
seg = (struct kvm_segment) {
4868
.base = 0,
4869
.limit = 0xFFFFFFFF,
4870
.type = 3,
4871
.present = 1,
4872
.s = 1,
4873
.db = 1,
4874
.g = 1
4875
};
4876
seg.selector = vmcs12->host_ds_selector;
4877
__vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4878
seg.selector = vmcs12->host_es_selector;
4879
__vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4880
seg.selector = vmcs12->host_ss_selector;
4881
__vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4882
seg.selector = vmcs12->host_fs_selector;
4883
seg.base = vmcs12->host_fs_base;
4884
__vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4885
seg.selector = vmcs12->host_gs_selector;
4886
seg.base = vmcs12->host_gs_base;
4887
__vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4888
seg = (struct kvm_segment) {
4889
.base = vmcs12->host_tr_base,
4890
.limit = 0x67,
4891
.selector = vmcs12->host_tr_selector,
4892
.type = 11,
4893
.present = 1
4894
};
4895
__vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4896
4897
memset(&seg, 0, sizeof(seg));
4898
seg.unusable = 1;
4899
__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
4900
4901
kvm_set_dr(vcpu, 7, 0x400);
4902
vmx_guest_debugctl_write(vcpu, 0);
4903
4904
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4905
vmcs12->vm_exit_msr_load_count))
4906
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4907
4908
to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu);
4909
}
4910
4911
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4912
{
4913
struct vmx_uret_msr *efer_msr;
4914
unsigned int i;
4915
4916
if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4917
return vmcs_read64(GUEST_IA32_EFER);
4918
4919
if (cpu_has_load_ia32_efer())
4920
return kvm_host.efer;
4921
4922
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4923
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4924
return vmx->msr_autoload.guest.val[i].value;
4925
}
4926
4927
efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
4928
if (efer_msr)
4929
return efer_msr->data;
4930
4931
return kvm_host.efer;
4932
}
4933
4934
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4935
{
4936
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4937
struct vcpu_vmx *vmx = to_vmx(vcpu);
4938
struct vmx_msr_entry g, h;
4939
gpa_t gpa;
4940
u32 i, j;
4941
4942
vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4943
4944
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4945
/*
4946
* L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4947
* as vmcs01.GUEST_DR7 contains a userspace defined value
4948
* and vcpu->arch.dr7 is not squirreled away before the
4949
* nested VMENTER (not worth adding a variable in nested_vmx).
4950
*/
4951
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4952
kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4953
else
4954
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4955
}
4956
4957
/* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
4958
vmx_reload_guest_debugctl(vcpu);
4959
4960
/*
4961
* Note that calling vmx_set_{efer,cr0,cr4} is important as they
4962
* handle a variety of side effects to KVM's software model.
4963
*/
4964
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4965
4966
vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4967
vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4968
4969
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4970
vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4971
4972
nested_ept_uninit_mmu_context(vcpu);
4973
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4974
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4975
4976
/*
4977
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4978
* from vmcs01 (if necessary). The PDPTRs are not loaded on
4979
* VMFail; like everything else, we just need to ensure our
4980
* software model is up-to-date.
4981
*/
4982
if (enable_ept && is_pae_paging(vcpu))
4983
ept_save_pdptrs(vcpu);
4984
4985
kvm_mmu_reset_context(vcpu);
4986
4987
/*
4988
* This nasty bit of open coding is a compromise between blindly
4989
* loading L1's MSRs using the exit load lists (incorrect emulation
4990
* of VMFail), leaving the nested VM's MSRs in the software model
4991
* (incorrect behavior) and snapshotting the modified MSRs (too
4992
* expensive since the lists are unbounded by hardware). For each
4993
* MSR that was (prematurely) loaded from the nested VMEntry load
4994
* list, reload it from the exit load list if it exists and differs
4995
* from the guest value. The intent is to stuff host state as
4996
* silently as possible, not to fully process the exit load list.
4997
*/
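/*
 * Concretely: if the VM-entry load list already loaded, say, MSR_STAR with
 * L2's value and the VM-exit load list also contains MSR_STAR with a
 * different value, MSR_STAR is rewritten from the exit list entry; MSRs that
 * appear only in the entry list keep the value that was (prematurely)
 * loaded.
 */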
4998
for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4999
gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
5000
if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
5001
pr_debug_ratelimited(
5002
"%s read MSR index failed (%u, 0x%08llx)\n",
5003
__func__, i, gpa);
5004
goto vmabort;
5005
}
5006
5007
for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
5008
gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
5009
if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
5010
pr_debug_ratelimited(
5011
"%s read MSR failed (%u, 0x%08llx)\n",
5012
__func__, j, gpa);
5013
goto vmabort;
5014
}
5015
if (h.index != g.index)
5016
continue;
5017
if (h.value == g.value)
5018
break;
5019
5020
if (nested_vmx_load_msr_check(vcpu, &h)) {
5021
pr_debug_ratelimited(
5022
"%s check failed (%u, 0x%x, 0x%x)\n",
5023
__func__, j, h.index, h.reserved);
5024
goto vmabort;
5025
}
5026
5027
if (kvm_emulate_msr_write(vcpu, h.index, h.value)) {
5028
pr_debug_ratelimited(
5029
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
5030
__func__, j, h.index, h.value);
5031
goto vmabort;
5032
}
5033
}
5034
}
5035
5036
return;
5037
5038
vmabort:
5039
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
5040
}
5041
5042
/*
5043
* Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
5044
* and modify vmcs12 to make it see what it would expect to see there if
5045
* L2 was its real guest. Must only be called when in L2 (is_guest_mode())
5046
*/
5047
void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
5048
u32 exit_intr_info, unsigned long exit_qualification,
5049
u32 exit_insn_len)
5050
{
5051
struct vcpu_vmx *vmx = to_vmx(vcpu);
5052
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5053
5054
/* Pending MTF traps are discarded on VM-Exit. */
5055
vmx->nested.mtf_pending = false;
5056
5057
/* trying to cancel vmlaunch/vmresume is a bug */
5058
WARN_ON_ONCE(vmx->nested.nested_run_pending);
5059
5060
#ifdef CONFIG_KVM_HYPERV
5061
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
5062
/*
5063
* KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
5064
* Enlightened VMCS after migration and we still need to
5065
* do that when something is forcing L2->L1 exit prior to
5066
* the first L2 run.
5067
*/
5068
(void)nested_get_evmcs_page(vcpu);
5069
}
5070
#endif
5071
5072
/* Service pending TLB flush requests for L2 before switching to L1. */
5073
kvm_service_local_tlb_flush_requests(vcpu);
5074
5075
/*
5076
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
5077
* now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
5078
* up-to-date before switching to L1.
5079
*/
5080
if (enable_ept && is_pae_paging(vcpu))
5081
vmx_ept_load_pdptrs(vcpu);
5082
5083
leave_guest_mode(vcpu);
5084
5085
if (nested_cpu_has_preemption_timer(vmcs12))
5086
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
5087
5088
if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
5089
vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
5090
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
5091
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
5092
}
5093
5094
if (likely(!vmx->fail)) {
5095
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5096
5097
if (vm_exit_reason != -1)
5098
prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
5099
exit_intr_info, exit_qualification,
5100
exit_insn_len);
5101
5102
/*
5103
* Must happen outside of sync_vmcs02_to_vmcs12() as it will
5104
* also be used to capture vmcs12 cache as part of
5105
* capturing nVMX state for snapshot (migration).
5106
*
5107
* Otherwise, this flush will dirty guest memory at a
5108
* point it is already assumed by user-space to be
5109
* immutable.
5110
*/
5111
nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
5112
} else {
5113
/*
5114
* The only expected VM-instruction error is "VM entry with
5115
* invalid control field(s)." Anything else indicates a
5116
* problem with L0.
5117
*/
5118
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
5119
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5120
5121
/* VM-Fail at VM-Entry means KVM missed a consistency check. */
5122
WARN_ON_ONCE(warn_on_missed_cc);
5123
}
5124
5125
/*
5126
* Drop events/exceptions that were queued for re-injection to L2
5127
* (picked up via vmx_complete_interrupts()), as well as exceptions
5128
* that were pending for L2. Note, this must NOT be hoisted above
5129
* prepare_vmcs12(), events/exceptions queued for re-injection need to
5130
* be captured in vmcs12 (see vmcs12_save_pending_event()).
5131
*/
5132
vcpu->arch.nmi_injected = false;
5133
kvm_clear_exception_queue(vcpu);
5134
kvm_clear_interrupt_queue(vcpu);
5135
5136
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
5137
5138
kvm_nested_vmexit_handle_ibrs(vcpu);
5139
5140
/* Update any VMCS fields that might have changed while L2 ran */
5141
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
5142
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
5143
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
5144
if (kvm_caps.has_tsc_control)
5145
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
5146
5147
if (vmx->nested.l1_tpr_threshold != -1)
5148
vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
5149
5150
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
5151
vmx->nested.change_vmcs01_virtual_apic_mode = false;
5152
vmx_set_virtual_apic_mode(vcpu);
5153
}
5154
5155
if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
5156
vmx->nested.update_vmcs01_cpu_dirty_logging = false;
5157
vmx_update_cpu_dirty_logging(vcpu);
5158
}
5159
5160
nested_put_vmcs12_pages(vcpu);
5161
5162
if (vmx->nested.reload_vmcs01_apic_access_page) {
5163
vmx->nested.reload_vmcs01_apic_access_page = false;
5164
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5165
}
5166
5167
if (vmx->nested.update_vmcs01_apicv_status) {
5168
vmx->nested.update_vmcs01_apicv_status = false;
5169
vmx_refresh_apicv_exec_ctrl(vcpu);
5170
}
5171
5172
if (vmx->nested.update_vmcs01_hwapic_isr) {
5173
vmx->nested.update_vmcs01_hwapic_isr = false;
5174
kvm_apic_update_hwapic_isr(vcpu);
5175
}
5176
5177
if ((vm_exit_reason != -1) &&
5178
(enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
5179
vmx->nested.need_vmcs12_to_shadow_sync = true;
5180
5181
/* in case we halted in L2 */
5182
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
5183
5184
if (likely(!vmx->fail)) {
5185
if (vm_exit_reason != -1)
5186
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
5187
vmcs12->exit_qualification,
5188
vmcs12->idt_vectoring_info_field,
5189
vmcs12->vm_exit_intr_info,
5190
vmcs12->vm_exit_intr_error_code,
5191
KVM_ISA_VMX);
5192
5193
load_vmcs12_host_state(vcpu, vmcs12);
5194
5195
/*
5196
* Process events if an injectable IRQ or NMI is pending, even
5197
* if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
5198
* If an event became pending while L2 was active, KVM needs to
5199
* either inject the event or request an IRQ/NMI window. SMIs
5200
* don't need to be processed as SMM is mutually exclusive with
5201
* non-root mode. INIT/SIPI don't need to be checked as INIT
5202
* is blocked post-VMXON, and SIPIs are ignored.
5203
*/
5204
if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
5205
kvm_make_request(KVM_REQ_EVENT, vcpu);
5206
return;
5207
}
5208
5209
/*
5210
* After an early L2 VM-entry failure, we're now back
5211
* in L1 which thinks it just finished a VMLAUNCH or
5212
* VMRESUME instruction, so we need to set the failure
5213
* flag and the VM-instruction error field of the VMCS
5214
* accordingly, and skip the emulated instruction.
5215
*/
5216
(void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5217
5218
/*
5219
* Restore L1's host state to KVM's software model. We're here
5220
* because a consistency check was caught by hardware, which
5221
* means some amount of guest state has been propagated to KVM's
5222
* model and needs to be unwound to the host's state.
5223
*/
5224
nested_vmx_restore_host_state(vcpu);
5225
5226
vmx->fail = 0;
5227
}
5228
5229
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
5230
{
5231
kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5232
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
5233
}
5234
5235
/*
5236
* Decode the memory-address operand of a vmx instruction, as recorded on an
5237
* exit caused by such an instruction (run by a guest hypervisor).
5238
* On success, returns 0. When the operand is invalid, returns 1 and throws
5239
* #UD, #GP, or #SS.
5240
*/
5241
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
5242
u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
5243
{
5244
gva_t off;
5245
bool exn;
5246
struct kvm_segment s;
5247
5248
/*
5249
* According to Vol. 3B, "Information for VM Exits Due to Instruction
5250
* Execution", on an exit, vmx_instruction_info holds most of the
5251
* addressing components of the operand. Only the displacement part
5252
* is put in exit_qualification (see 3B, "Basic VM-Exit Information").
5253
* For how an actual address is calculated from all these components,
5254
* refer to Vol. 1, "Operand Addressing".
5255
*/
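/*
 * Bit layout of vmx_instruction_info as decoded below: bits 1:0 scaling,
 * bits 9:7 address size, bit 10 register (not memory) operand, bits 17:15
 * segment register, bits 21:18 index register, bit 22 index invalid,
 * bits 26:23 base register, bit 27 base invalid, and bits 31:28 a second
 * register operand (e.g. the field encoding for VMREAD/VMWRITE).
 */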
5256
int scaling = vmx_instruction_info & 3;
5257
int addr_size = (vmx_instruction_info >> 7) & 7;
5258
bool is_reg = vmx_instruction_info & (1u << 10);
5259
int seg_reg = (vmx_instruction_info >> 15) & 7;
5260
int index_reg = (vmx_instruction_info >> 18) & 0xf;
5261
bool index_is_valid = !(vmx_instruction_info & (1u << 22));
5262
int base_reg = (vmx_instruction_info >> 23) & 0xf;
5263
bool base_is_valid = !(vmx_instruction_info & (1u << 27));
5264
5265
if (is_reg) {
5266
kvm_queue_exception(vcpu, UD_VECTOR);
5267
return 1;
5268
}
5269
5270
/* Addr = segment_base + offset */
5271
/* offset = base + [index * scale] + displacement */
5272
off = exit_qualification; /* holds the displacement */
5273
if (addr_size == 1)
5274
off = (gva_t)sign_extend64(off, 31);
5275
else if (addr_size == 0)
5276
off = (gva_t)sign_extend64(off, 15);
5277
if (base_is_valid)
5278
off += kvm_register_read(vcpu, base_reg);
5279
if (index_is_valid)
5280
off += kvm_register_read(vcpu, index_reg) << scaling;
5281
vmx_get_segment(vcpu, &s, seg_reg);
5282
5283
/*
5284
* The effective address, i.e. @off, of a memory operand is truncated
5285
* based on the address size of the instruction. Note that this is
5286
* the *effective address*, i.e. the address prior to accounting for
5287
* the segment's base.
5288
*/
5289
if (addr_size == 1) /* 32 bit */
5290
off &= 0xffffffff;
5291
else if (addr_size == 0) /* 16 bit */
5292
off &= 0xffff;
5293
5294
/* Checks for #GP/#SS exceptions. */
5295
exn = false;
5296
if (is_long_mode(vcpu)) {
5297
/*
5298
* The virtual/linear address is never truncated in 64-bit
5299
* mode, e.g. a 32-bit address size can yield a 64-bit virtual
5300
* address when using FS/GS with a non-zero base.
5301
*/
5302
if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
5303
*ret = s.base + off;
5304
else
5305
*ret = off;
5306
5307
*ret = vmx_get_untagged_addr(vcpu, *ret, 0);
5308
/* Long mode: #GP(0)/#SS(0) if the memory address is in a
5309
* non-canonical form. This is the only check on the memory
5310
* destination for long mode!
5311
*/
5312
exn = is_noncanonical_address(*ret, vcpu, 0);
5313
} else {
5314
/*
5315
* When not in long mode, the virtual/linear address is
5316
* unconditionally truncated to 32 bits regardless of the
5317
* address size.
5318
*/
5319
*ret = (s.base + off) & 0xffffffff;
5320
5321
/* Protected mode: apply checks for segment validity in the
5322
* following order:
5323
* - segment type check (#GP(0) may be thrown)
5324
* - usability check (#GP(0)/#SS(0))
5325
* - limit check (#GP(0)/#SS(0))
5326
*/
5327
if (wr)
5328
/* #GP(0) if the destination operand is located in a
5329
* read-only data segment or any code segment.
5330
*/
5331
exn = ((s.type & 0xa) == 0 || (s.type & 8));
5332
else
5333
/* #GP(0) if the source operand is located in an
5334
* execute-only code segment
5335
*/
5336
exn = ((s.type & 0xa) == 8);
5337
if (exn) {
5338
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5339
return 1;
5340
}
5341
/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
5342
*/
5343
exn = (s.unusable != 0);
5344
5345
/*
5346
* Protected mode: #GP(0)/#SS(0) if the memory operand is
5347
* outside the segment limit. All CPUs that support VMX ignore
5348
* limit checks for flat segments, i.e. segments with base==0,
5349
* limit==0xffffffff and of type expand-up data or code.
5350
*/
5351
if (!(s.base == 0 && s.limit == 0xffffffff &&
5352
((s.type & 8) || !(s.type & 4))))
5353
exn = exn || ((u64)off + len - 1 > s.limit);
5354
}
5355
if (exn) {
5356
kvm_queue_exception_e(vcpu,
5357
seg_reg == VCPU_SREG_SS ?
5358
SS_VECTOR : GP_VECTOR,
5359
0);
5360
return 1;
5361
}
5362
5363
return 0;
5364
}
5365
5366
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
5367
int *ret)
5368
{
5369
gva_t gva;
5370
struct x86_exception e;
5371
int r;
5372
5373
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5374
vmcs_read32(VMX_INSTRUCTION_INFO), false,
5375
sizeof(*vmpointer), &gva)) {
5376
*ret = 1;
5377
return -EINVAL;
5378
}
5379
5380
r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
5381
if (r != X86EMUL_CONTINUE) {
5382
*ret = kvm_handle_memory_failure(vcpu, r, &e);
5383
return -EINVAL;
5384
}
5385
5386
return 0;
5387
}
5388
5389
/*
5390
* Allocate a shadow VMCS and associate it with the currently loaded
5391
* VMCS, unless such a shadow VMCS already exists. The newly allocated
5392
* VMCS is also VMCLEARed, so that it is ready for use.
5393
*/
5394
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
5395
{
5396
struct vcpu_vmx *vmx = to_vmx(vcpu);
5397
struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
5398
5399
/*
5400
* KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
5401
* when L1 executes VMXOFF or the vCPU is forced out of nested
5402
* operation. VMXON faults if the CPU is already post-VMXON, so it
5403
* should be impossible to already have an allocated shadow VMCS. KVM
5404
* doesn't support virtualization of VMCS shadowing, so vmcs01 should
5405
* always be the loaded VMCS.
5406
*/
5407
if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
5408
return loaded_vmcs->shadow_vmcs;
5409
5410
loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
5411
if (loaded_vmcs->shadow_vmcs)
5412
vmcs_clear(loaded_vmcs->shadow_vmcs);
5413
5414
return loaded_vmcs->shadow_vmcs;
5415
}
5416
5417
static int enter_vmx_operation(struct kvm_vcpu *vcpu)
5418
{
5419
struct vcpu_vmx *vmx = to_vmx(vcpu);
5420
int r;
5421
5422
r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
5423
if (r < 0)
5424
goto out_vmcs02;
5425
5426
vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
5427
if (!vmx->nested.cached_vmcs12)
5428
goto out_cached_vmcs12;
5429
5430
vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
5431
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
5432
if (!vmx->nested.cached_shadow_vmcs12)
5433
goto out_cached_shadow_vmcs12;
5434
5435
if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
5436
goto out_shadow_vmcs;
5437
5438
hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC,
5439
HRTIMER_MODE_ABS_PINNED);
5440
5441
vmx->nested.vpid02 = allocate_vpid();
5442
5443
vmx->nested.vmcs02_initialized = false;
5444
vmx->nested.vmxon = true;
5445
5446
if (vmx_pt_mode_is_host_guest()) {
5447
vmx->pt_desc.guest.ctl = 0;
5448
pt_update_intercept_for_msr(vcpu);
5449
}
5450
5451
return 0;
5452
5453
out_shadow_vmcs:
5454
kfree(vmx->nested.cached_shadow_vmcs12);
5455
5456
out_cached_shadow_vmcs12:
5457
kfree(vmx->nested.cached_vmcs12);
5458
5459
out_cached_vmcs12:
5460
free_loaded_vmcs(&vmx->nested.vmcs02);
5461
5462
out_vmcs02:
5463
return -ENOMEM;
5464
}
5465
5466
/* Emulate the VMXON instruction. */
5467
static int handle_vmxon(struct kvm_vcpu *vcpu)
5468
{
5469
int ret;
5470
gpa_t vmptr;
5471
uint32_t revision;
5472
struct vcpu_vmx *vmx = to_vmx(vcpu);
5473
const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
5474
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
5475
5476
/*
5477
* Manually perform the CR4.VMXE check; KVM must force CR4.VMXE=1 to enter
5478
* the guest and so cannot rely on hardware to perform the check,
5479
* which has higher priority than VM-Exit (see Intel SDM's pseudocode
5480
* for VMXON).
5481
*
5482
* Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
5483
* and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
5484
* force any of the relevant guest state. For a restricted guest, KVM
5485
* does force CR0.PE=1, but only to also force VM86 in order to emulate
5486
* Real Mode, and so there's no need to check CR0.PE manually.
5487
*/
5488
if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
5489
kvm_queue_exception(vcpu, UD_VECTOR);
5490
return 1;
5491
}
5492
5493
/*
5494
* The CPL is checked for "not in VMX operation" and for "in VMX root",
5495
* and has higher priority than the VM-Fail due to being post-VMXON,
5496
* i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
5497
* VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
5498
* from L2 to L1, i.e. there's no need to check for the vCPU being in
5499
* VMX non-root.
5500
*
5501
* Forwarding the VM-Exit unconditionally, i.e. without performing the
5502
* #UD checks (see above), is functionally ok because KVM doesn't allow
5503
* L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
5504
* CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are
5505
* missed by hardware due to shadowing CR0 and/or CR4.
5506
*/
5507
if (vmx_get_cpl(vcpu)) {
5508
kvm_inject_gp(vcpu, 0);
5509
return 1;
5510
}
5511
5512
if (vmx->nested.vmxon)
5513
return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
5514
5515
/*
5516
* Invalid CR0/CR4 generates #GP. These checks are performed if and
5517
* only if the vCPU isn't already in VMX operation, i.e. effectively
5518
* have lower priority than the VM-Fail above.
5519
*/
5520
if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
5521
!nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
5522
kvm_inject_gp(vcpu, 0);
5523
return 1;
5524
}
5525
5526
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5527
!= VMXON_NEEDED_FEATURES) {
5528
kvm_inject_gp(vcpu, 0);
5529
return 1;
5530
}
5531
5532
if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
5533
return ret;
5534
5535
/*
5536
* SDM 3: 24.11.5
5537
* The first 4 bytes of VMXON region contain the supported
5538
* VMCS revision identifier
5539
*
5540
* Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
5541
* which replaces physical address width with 32
5542
*/
5543
if (!page_address_valid(vcpu, vmptr))
5544
return nested_vmx_failInvalid(vcpu);
5545
5546
if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
5547
revision != VMCS12_REVISION)
5548
return nested_vmx_failInvalid(vcpu);
5549
5550
vmx->nested.vmxon_ptr = vmptr;
5551
ret = enter_vmx_operation(vcpu);
5552
if (ret)
5553
return ret;
5554
5555
return nested_vmx_succeed(vcpu);
5556
}
5557
5558
static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
5559
{
5560
struct vcpu_vmx *vmx = to_vmx(vcpu);
5561
5562
if (vmx->nested.current_vmptr == INVALID_GPA)
5563
return;
5564
5565
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
5566
5567
if (enable_shadow_vmcs) {
5568
/* copy to memory all shadowed fields in case
5569
they were modified */
5570
copy_shadow_to_vmcs12(vmx);
5571
vmx_disable_shadow_vmcs(vmx);
5572
}
5573
vmx->nested.posted_intr_nv = -1;
5574
5575
/* Flush VMCS12 to guest memory */
5576
kvm_vcpu_write_guest_page(vcpu,
5577
vmx->nested.current_vmptr >> PAGE_SHIFT,
5578
vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
5579
5580
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5581
5582
vmx->nested.current_vmptr = INVALID_GPA;
5583
}
5584
5585
/* Emulate the VMXOFF instruction */
5586
static int handle_vmxoff(struct kvm_vcpu *vcpu)
5587
{
5588
if (!nested_vmx_check_permission(vcpu))
5589
return 1;
5590
5591
free_nested(vcpu);
5592
5593
if (kvm_apic_has_pending_init_or_sipi(vcpu))
5594
kvm_make_request(KVM_REQ_EVENT, vcpu);
5595
5596
return nested_vmx_succeed(vcpu);
5597
}
5598
5599
/* Emulate the VMCLEAR instruction */
5600
static int handle_vmclear(struct kvm_vcpu *vcpu)
5601
{
5602
struct vcpu_vmx *vmx = to_vmx(vcpu);
5603
u32 zero = 0;
5604
gpa_t vmptr;
5605
int r;
5606
5607
if (!nested_vmx_check_permission(vcpu))
5608
return 1;
5609
5610
if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5611
return r;
5612
5613
if (!page_address_valid(vcpu, vmptr))
5614
return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5615
5616
if (vmptr == vmx->nested.vmxon_ptr)
5617
return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5618
5619
if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
5620
if (vmptr == vmx->nested.current_vmptr)
5621
nested_release_vmcs12(vcpu);
5622
5623
/*
5624
* Silently ignore memory errors on VMCLEAR, Intel's pseudocode
5625
* for VMCLEAR includes an "ensure that data for VMCS referenced
5626
* by the operand is in memory" clause that guards writes to
5627
* memory, i.e. doing nothing for I/O is architecturally valid.
5628
*
5629
* FIXME: Suppress failures if and only if no memslot is found,
5630
* i.e. exit to userspace if __copy_to_user() fails.
5631
*/
5632
(void)kvm_vcpu_write_guest(vcpu,
5633
vmptr + offsetof(struct vmcs12,
5634
launch_state),
5635
&zero, sizeof(zero));
5636
}
5637
5638
return nested_vmx_succeed(vcpu);
5639
}
5640
5641
/* Emulate the VMLAUNCH instruction */
5642
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5643
{
5644
return nested_vmx_run(vcpu, true);
5645
}
5646
5647
/* Emulate the VMRESUME instruction */
5648
static int handle_vmresume(struct kvm_vcpu *vcpu)
5649
{
5650
5651
return nested_vmx_run(vcpu, false);
5652
}
5653
5654
static int handle_vmread(struct kvm_vcpu *vcpu)
5655
{
5656
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5657
: get_vmcs12(vcpu);
5658
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5659
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5660
struct vcpu_vmx *vmx = to_vmx(vcpu);
5661
struct x86_exception e;
5662
unsigned long field;
5663
u64 value;
5664
gva_t gva = 0;
5665
short offset;
5666
int len, r;
5667
5668
if (!nested_vmx_check_permission(vcpu))
5669
return 1;
5670
5671
/* Decode instruction info and find the field to read */
5672
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5673
5674
if (!nested_vmx_is_evmptr12_valid(vmx)) {
5675
/*
5676
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5677
* any VMREAD sets the ALU flags for VMfailInvalid.
5678
*/
5679
if (vmx->nested.current_vmptr == INVALID_GPA ||
5680
(is_guest_mode(vcpu) &&
5681
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5682
return nested_vmx_failInvalid(vcpu);
5683
5684
offset = get_vmcs12_field_offset(field);
5685
if (offset < 0)
5686
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5687
5688
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5689
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5690
5691
/* Read the field, zero-extended to a u64 value */
5692
value = vmcs12_read_any(vmcs12, field, offset);
5693
} else {
5694
/*
5695
* Hyper-V TLFS (as of 6.0b) explicitly states, that while an
5696
* enlightened VMCS is active VMREAD/VMWRITE instructions are
5697
* unsupported. Unfortunately, certain versions of Windows 11
5698
* don't comply with this requirement which is not enforced in
5699
* genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
5700
* workaround, as misbehaving guests will panic on VM-Fail.
5701
* Note, enlightened VMCS is incompatible with shadow VMCS so
5702
* all VMREADs from L2 should go to L1.
5703
*/
5704
if (WARN_ON_ONCE(is_guest_mode(vcpu)))
5705
return nested_vmx_failInvalid(vcpu);
5706
5707
offset = evmcs_field_offset(field, NULL);
5708
if (offset < 0)
5709
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5710
5711
/* Read the field, zero-extended to a u64 value */
5712
value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
5713
}
5714
5715
/*
5716
* Now copy part of this value to register or memory, as requested.
5717
* Note that the number of bits actually copied is 32 or 64 depending
5718
* on the guest's mode (32 or 64 bit), not on the given field's length.
5719
*/
5720
if (instr_info & BIT(10)) {
5721
kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
5722
} else {
5723
len = is_64_bit_mode(vcpu) ? 8 : 4;
5724
if (get_vmx_mem_address(vcpu, exit_qualification,
5725
instr_info, true, len, &gva))
5726
return 1;
5727
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
5728
r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5729
if (r != X86EMUL_CONTINUE)
5730
return kvm_handle_memory_failure(vcpu, r, &e);
5731
}
5732
5733
return nested_vmx_succeed(vcpu);
5734
}
5735
5736
static bool is_shadow_field_rw(unsigned long field)
5737
{
5738
switch (field) {
5739
#define SHADOW_FIELD_RW(x, y) case x:
5740
#include "vmcs_shadow_fields.h"
5741
return true;
5742
default:
5743
break;
5744
}
5745
return false;
5746
}
5747
5748
static bool is_shadow_field_ro(unsigned long field)
5749
{
5750
switch (field) {
5751
#define SHADOW_FIELD_RO(x, y) case x:
5752
#include "vmcs_shadow_fields.h"
5753
return true;
5754
default:
5755
break;
5756
}
5757
return false;
5758
}
5759
5760
static int handle_vmwrite(struct kvm_vcpu *vcpu)
5761
{
5762
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5763
: get_vmcs12(vcpu);
5764
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5765
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5766
struct vcpu_vmx *vmx = to_vmx(vcpu);
5767
struct x86_exception e;
5768
unsigned long field;
5769
short offset;
5770
gva_t gva;
5771
int len, r;
5772
5773
/*
5774
* The value to write might be 32 or 64 bits, depending on L1's long
5775
* mode, and eventually we need to write that into a field of several
5776
* possible lengths. The code below first zero-extends the value to 64
5777
* bit (value), and then copies only the appropriate number of
5778
* bits into the vmcs12 field.
5779
*/
5780
u64 value = 0;
5781
5782
if (!nested_vmx_check_permission(vcpu))
5783
return 1;
5784
5785
/*
5786
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5787
* any VMWRITE sets the ALU flags for VMfailInvalid.
5788
*/
5789
if (vmx->nested.current_vmptr == INVALID_GPA ||
5790
(is_guest_mode(vcpu) &&
5791
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5792
return nested_vmx_failInvalid(vcpu);
5793
5794
if (instr_info & BIT(10))
5795
value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
5796
else {
5797
len = is_64_bit_mode(vcpu) ? 8 : 4;
5798
if (get_vmx_mem_address(vcpu, exit_qualification,
5799
instr_info, false, len, &gva))
5800
return 1;
5801
r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5802
if (r != X86EMUL_CONTINUE)
5803
return kvm_handle_memory_failure(vcpu, r, &e);
5804
}
5805
5806
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5807
5808
offset = get_vmcs12_field_offset(field);
5809
if (offset < 0)
5810
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5811
5812
/*
5813
* If the vCPU supports "VMWRITE to any supported field in the
5814
* VMCS," then the "read-only" fields are actually read/write.
5815
*/
5816
if (vmcs_field_readonly(field) &&
5817
!nested_cpu_has_vmwrite_any_field(vcpu))
5818
return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5819
5820
/*
5821
* Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5822
* vmcs12, else we may crush a field or consume a stale value.
5823
*/
5824
if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5825
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5826
5827
/*
5828
* Some Intel CPUs intentionally drop the reserved bits of the AR byte
5829
* fields on VMWRITE. Emulate this behavior to ensure consistent KVM
5830
* behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5831
* field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5832
* from L1 will return a different value than VMREAD from L2 (L1 sees
5833
* the stripped down value, L2 sees the full value as stored by KVM).
5834
*/
5835
if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
5836
value &= 0x1f0ff;
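/*
 * 0x1f0ff keeps bits 7:0 (segment type, S, DPL, P) and bits 16:12 (AVL, L,
 * D/B, G, unusable) of the access-rights value, and clears the reserved
 * bits 11:8 and 31:17, mirroring what such CPUs do on VMWRITE.
 */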
5837
5838
vmcs12_write_any(vmcs12, field, offset, value);
5839
5840
/*
5841
* Do not track vmcs12 dirty-state if in guest-mode as we actually
5842
* dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
5843
* by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5844
* "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
5845
*/
5846
if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5847
/*
5848
* L1 can read these fields without exiting, ensure the
5849
* shadow VMCS is up-to-date.
5850
*/
5851
if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5852
preempt_disable();
5853
vmcs_load(vmx->vmcs01.shadow_vmcs);
5854
5855
__vmcs_writel(field, value);
5856
5857
vmcs_clear(vmx->vmcs01.shadow_vmcs);
5858
vmcs_load(vmx->loaded_vmcs->vmcs);
5859
preempt_enable();
5860
}
5861
vmx->nested.dirty_vmcs12 = true;
5862
}
5863
5864
return nested_vmx_succeed(vcpu);
5865
}
5866
5867
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5868
{
5869
vmx->nested.current_vmptr = vmptr;
5870
if (enable_shadow_vmcs) {
5871
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5872
vmcs_write64(VMCS_LINK_POINTER,
5873
__pa(vmx->vmcs01.shadow_vmcs));
5874
vmx->nested.need_vmcs12_to_shadow_sync = true;
5875
}
5876
vmx->nested.dirty_vmcs12 = true;
5877
vmx->nested.force_msr_bitmap_recalc = true;
5878
}
5879
5880
/* Emulate the VMPTRLD instruction */
5881
static int handle_vmptrld(struct kvm_vcpu *vcpu)
5882
{
5883
struct vcpu_vmx *vmx = to_vmx(vcpu);
5884
gpa_t vmptr;
5885
int r;
5886
5887
if (!nested_vmx_check_permission(vcpu))
5888
return 1;
5889
5890
if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5891
return r;
5892
5893
if (!page_address_valid(vcpu, vmptr))
5894
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5895
5896
if (vmptr == vmx->nested.vmxon_ptr)
5897
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
5898
5899
/* Forbid normal VMPTRLD if Enlightened version was used */
5900
if (nested_vmx_is_evmptr12_valid(vmx))
5901
return 1;
5902
5903
if (vmx->nested.current_vmptr != vmptr) {
5904
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
5905
struct vmcs_hdr hdr;
5906
5907
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
5908
/*
5909
* Reads from an unbacked page return all 1s,
5910
* which means that the 32 bits located at the
5911
* given physical address won't match the required
5912
* VMCS12_REVISION identifier.
5913
*/
5914
return nested_vmx_fail(vcpu,
5915
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5916
}
5917
5918
if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
5919
offsetof(struct vmcs12, hdr),
5920
sizeof(hdr))) {
5921
return nested_vmx_fail(vcpu,
5922
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5923
}
5924
5925
if (hdr.revision_id != VMCS12_REVISION ||
5926
(hdr.shadow_vmcs &&
5927
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5928
return nested_vmx_fail(vcpu,
5929
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5930
}
5931
5932
nested_release_vmcs12(vcpu);
5933
5934
/*
5935
* Load VMCS12 from guest memory since it is not already
5936
* cached.
5937
*/
5938
if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
5939
VMCS12_SIZE)) {
5940
return nested_vmx_fail(vcpu,
5941
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5942
}
5943
5944
set_current_vmptr(vmx, vmptr);
5945
}
5946
5947
return nested_vmx_succeed(vcpu);
5948
}
5949
5950
/* Emulate the VMPTRST instruction */
5951
static int handle_vmptrst(struct kvm_vcpu *vcpu)
5952
{
5953
unsigned long exit_qual = vmx_get_exit_qual(vcpu);
5954
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5955
gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5956
struct x86_exception e;
5957
gva_t gva;
5958
int r;
5959
5960
if (!nested_vmx_check_permission(vcpu))
5961
return 1;
5962
5963
if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu))))
5964
return 1;
5965
5966
if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5967
true, sizeof(gpa_t), &gva))
5968
return 1;
5969
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5970
r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5971
sizeof(gpa_t), &e);
5972
if (r != X86EMUL_CONTINUE)
5973
return kvm_handle_memory_failure(vcpu, r, &e);
5974
5975
return nested_vmx_succeed(vcpu);
5976
}
5977
5978
/* Emulate the INVEPT instruction */
5979
static int handle_invept(struct kvm_vcpu *vcpu)
5980
{
5981
struct vcpu_vmx *vmx = to_vmx(vcpu);
5982
u32 vmx_instruction_info, types;
5983
unsigned long type, roots_to_free;
5984
struct kvm_mmu *mmu;
5985
gva_t gva;
5986
struct x86_exception e;
5987
struct {
5988
u64 eptp, gpa;
5989
} operand;
5990
int i, r, gpr_index;
5991
5992
if (!(vmx->nested.msrs.secondary_ctls_high &
5993
SECONDARY_EXEC_ENABLE_EPT) ||
5994
!(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5995
kvm_queue_exception(vcpu, UD_VECTOR);
5996
return 1;
5997
}
5998
5999
if (!nested_vmx_check_permission(vcpu))
6000
return 1;
6001
6002
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6003
gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
6004
type = kvm_register_read(vcpu, gpr_index);
6005
6006
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
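/*
 * The shift and mask above turn the single-context and all-context INVEPT
 * support bits from ept_caps into a bitmap indexed by INVEPT type, so the
 * "1 << type" test below only accepts VMX_EPT_EXTENT_CONTEXT (1) and
 * VMX_EPT_EXTENT_GLOBAL (2) when the corresponding capability is reported
 * to L1.
 */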
6007
6008
if (type >= 32 || !(types & (1 << type)))
6009
return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6010
6011
/* According to the Intel VMX instruction reference, the memory
6012
* operand is read even if it isn't needed (e.g., for type==global)
6013
*/
6014
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
6015
vmx_instruction_info, false, sizeof(operand), &gva))
6016
return 1;
6017
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
6018
if (r != X86EMUL_CONTINUE)
6019
return kvm_handle_memory_failure(vcpu, r, &e);
6020
6021
/*
6022
* Nested EPT roots are always held through guest_mmu,
6023
* not root_mmu.
6024
*/
6025
mmu = &vcpu->arch.guest_mmu;
6026
6027
switch (type) {
6028
case VMX_EPT_EXTENT_CONTEXT:
6029
if (!nested_vmx_check_eptp(vcpu, operand.eptp))
6030
return nested_vmx_fail(vcpu,
6031
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6032
6033
roots_to_free = 0;
6034
if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
6035
operand.eptp))
6036
roots_to_free |= KVM_MMU_ROOT_CURRENT;
6037
6038
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
6039
if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
6040
mmu->prev_roots[i].pgd,
6041
operand.eptp))
6042
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
6043
}
6044
break;
6045
case VMX_EPT_EXTENT_GLOBAL:
6046
roots_to_free = KVM_MMU_ROOTS_ALL;
6047
break;
6048
default:
6049
BUG();
6050
break;
6051
}
6052
6053
if (roots_to_free)
6054
kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
6055
6056
return nested_vmx_succeed(vcpu);
6057
}
6058
6059
static int handle_invvpid(struct kvm_vcpu *vcpu)
6060
{
6061
struct vcpu_vmx *vmx = to_vmx(vcpu);
6062
u32 vmx_instruction_info;
6063
unsigned long type, types;
6064
gva_t gva;
6065
struct x86_exception e;
6066
struct {
6067
u64 vpid;
6068
u64 gla;
6069
} operand;
6070
u16 vpid02;
6071
int r, gpr_index;
6072
6073
if (!(vmx->nested.msrs.secondary_ctls_high &
6074
SECONDARY_EXEC_ENABLE_VPID) ||
6075
!(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
6076
kvm_queue_exception(vcpu, UD_VECTOR);
6077
return 1;
6078
}
6079
6080
if (!nested_vmx_check_permission(vcpu))
6081
return 1;
6082
6083
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6084
gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
6085
type = kvm_register_read(vcpu, gpr_index);
6086
6087
types = (vmx->nested.msrs.vpid_caps &
6088
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
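/*
 * As with INVEPT above, the supported-extent bits from vpid_caps are
 * shifted down so that bit N of "types" corresponds to INVVPID type N
 * (individual-address, single-context, all-context, single-context
 * non-global), matching the "1 << type" check below.
 */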
6089
6090
if (type >= 32 || !(types & (1 << type)))
6091
return nested_vmx_fail(vcpu,
6092
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6093
6094
/* according to the intel vmx instruction reference, the memory
6095
* operand is read even if it isn't needed (e.g., for type==global)
6096
*/
6097
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
6098
vmx_instruction_info, false, sizeof(operand), &gva))
6099
return 1;
6100
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
6101
if (r != X86EMUL_CONTINUE)
6102
return kvm_handle_memory_failure(vcpu, r, &e);
6103
6104
if (operand.vpid >> 16)
6105
return nested_vmx_fail(vcpu,
6106
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6107
6108
/*
6109
* Always flush the effective vpid02, i.e. never flush the current VPID
6110
* and never explicitly flush vpid01. INVVPID targets a VPID, not a
6111
* VMCS, and so whether or not the current vmcs12 has VPID enabled is
6112
* irrelevant (and there may not be a loaded vmcs12).
6113
*/
6114
vpid02 = nested_get_vpid02(vcpu);
6115
switch (type) {
6116
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
6117
/*
6118
* LAM doesn't apply to addresses that are inputs to TLB
6119
* invalidation.
6120
*/
6121
if (!operand.vpid ||
6122
is_noncanonical_invlpg_address(operand.gla, vcpu))
6123
return nested_vmx_fail(vcpu,
6124
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6125
vpid_sync_vcpu_addr(vpid02, operand.gla);
6126
break;
6127
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
6128
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
6129
if (!operand.vpid)
6130
return nested_vmx_fail(vcpu,
6131
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6132
vpid_sync_context(vpid02);
6133
break;
6134
case VMX_VPID_EXTENT_ALL_CONTEXT:
6135
vpid_sync_context(vpid02);
6136
break;
6137
default:
6138
WARN_ON_ONCE(1);
6139
return kvm_skip_emulated_instruction(vcpu);
6140
}
6141
6142
/*
6143
* Sync the shadow page tables if EPT is disabled, L1 is invalidating
6144
* linear mappings for L2 (tagged with L2's VPID). Free all guest
6145
* roots as VPIDs are not tracked in the MMU role.
6146
*
6147
* Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
6148
* an MMU when EPT is disabled.
6149
*
6150
* TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
6151
*/
6152
if (!enable_ept)
6153
kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
6154
6155
return nested_vmx_succeed(vcpu);
6156
}
6157
6158
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
6159
struct vmcs12 *vmcs12)
6160
{
6161
u32 index = kvm_rcx_read(vcpu);
6162
u64 new_eptp;
6163
6164
if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
6165
return 1;
6166
if (index >= VMFUNC_EPTP_ENTRIES)
6167
return 1;
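/*
 * The EPTP list referenced by vmcs12 is a guest page holding up to
 * VMFUNC_EPTP_ENTRIES 64-bit EPT pointers, so the requested entry is read
 * from byte offset index * 8 below.
 */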
6168
6169
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
6170
&new_eptp, index * 8, 8))
6171
return 1;
6172
6173
/*
6174
* If the (L2) guest does a vmfunc to the currently
6175
* active ept pointer, we don't have to do anything else
6176
*/
6177
if (vmcs12->ept_pointer != new_eptp) {
6178
if (!nested_vmx_check_eptp(vcpu, new_eptp))
6179
return 1;
6180
6181
vmcs12->ept_pointer = new_eptp;
6182
nested_ept_new_eptp(vcpu);
6183
6184
if (!nested_cpu_has_vpid(vmcs12))
6185
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
6186
}
6187
6188
return 0;
6189
}
6190
6191
static int handle_vmfunc(struct kvm_vcpu *vcpu)
6192
{
6193
struct vcpu_vmx *vmx = to_vmx(vcpu);
6194
struct vmcs12 *vmcs12;
6195
u32 function = kvm_rax_read(vcpu);
6196
6197
/*
6198
* VMFUNC should never execute cleanly while L1 is active; KVM supports
6199
* VMFUNC for nested VMs, but not for L1.
6200
*/
6201
if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
6202
kvm_queue_exception(vcpu, UD_VECTOR);
6203
return 1;
6204
}
6205
6206
vmcs12 = get_vmcs12(vcpu);
6207
6208
/*
6209
* #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
6210
* is enabled in vmcs02 if and only if it's enabled in vmcs12.
6211
*/
6212
if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
6213
kvm_queue_exception(vcpu, UD_VECTOR);
6214
return 1;
6215
}
6216
6217
if (!(vmcs12->vm_function_control & BIT_ULL(function)))
6218
goto fail;
6219
6220
switch (function) {
6221
case 0:
6222
if (nested_vmx_eptp_switching(vcpu, vmcs12))
6223
goto fail;
6224
break;
6225
default:
6226
goto fail;
6227
}
6228
return kvm_skip_emulated_instruction(vcpu);
6229
6230
fail:
6231
/*
6232
* This is effectively a reflected VM-Exit, as opposed to a synthesized
6233
* nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
6234
* EXIT_REASON_VMFUNC as the exit reason.
6235
*/
6236
nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full,
6237
vmx_get_intr_info(vcpu),
6238
vmx_get_exit_qual(vcpu));
6239
return 1;
6240
}
6241
6242
/*
6243
* Return true if an IO instruction with the specified port and size should cause
6244
* a VM-exit into L1.
6245
*/
6246
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
6247
int size)
6248
{
6249
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6250
gpa_t bitmap, last_bitmap;
6251
u8 b;
6252
6253
last_bitmap = INVALID_GPA;
6254
b = -1;
6255
6256
while (size > 0) {
6257
if (port < 0x8000)
6258
bitmap = vmcs12->io_bitmap_a;
6259
else if (port < 0x10000)
6260
bitmap = vmcs12->io_bitmap_b;
6261
else
6262
return true;
6263
bitmap += (port & 0x7fff) / 8;
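/*
 * E.g. port 0x3f8 selects io_bitmap_a and byte offset 0x3f8 / 8 = 127,
 * while port 0x8080 selects io_bitmap_b and byte offset 0x80 / 8 = 16.
 */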
6264
6265
if (last_bitmap != bitmap)
6266
if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
6267
return true;
6268
if (b & (1 << (port & 7)))
6269
return true;
6270
6271
port++;
6272
size--;
6273
last_bitmap = bitmap;
6274
}
6275
6276
return false;
6277
}
6278
6279
static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
6280
struct vmcs12 *vmcs12)
6281
{
6282
unsigned long exit_qualification;
6283
unsigned short port;
6284
int size;
6285
6286
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
6287
return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
6288
6289
exit_qualification = vmx_get_exit_qual(vcpu);
6290
6291
port = exit_qualification >> 16;
6292
size = (exit_qualification & 7) + 1;
6293
6294
return nested_vmx_check_io_bitmaps(vcpu, port, size);
6295
}
6296
6297
/*
6298
* Return true if we should exit from L2 to L1 to handle an MSR access,
6299
* rather than handle it ourselves in L0. I.e., check whether L1 expressed
6300
* disinterest in the current event (read or write a specific MSR) by using an
6301
* MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
6302
*/
6303
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
6304
struct vmcs12 *vmcs12,
6305
union vmx_exit_reason exit_reason)
6306
{
6307
u32 msr_index;
6308
gpa_t bitmap;
6309
6310
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
6311
return true;
6312
6313
if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM ||
6314
exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
6315
msr_index = vmx_get_exit_qual(vcpu);
6316
else
6317
msr_index = kvm_rcx_read(vcpu);
6318
6319
/*
6320
* The MSR_BITMAP page is divided into four 1024-byte bitmaps,
6321
* for the four combinations of read/write and low/high MSR numbers.
6322
* First we need to figure out which of the four to use:
6323
*/
6324
bitmap = vmcs12->msr_bitmap;
6325
if (exit_reason.basic == EXIT_REASON_MSR_WRITE ||
6326
exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
6327
bitmap += 2048;
6328
if (msr_index >= 0xc0000000) {
6329
msr_index -= 0xc0000000;
6330
bitmap += 1024;
6331
}
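/*
 * Worked example: an L2 write to MSR_EFER (0xc0000080) lands in the
 * write-high bitmap, i.e. bitmap + 2048 + 1024, and is tested at byte
 * offset 0x80 / 8 = 16 within it, bit 0x80 & 7 = 0.
 */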
6332
6333
/* Then read the msr_index'th bit from this bitmap: */
6334
if (msr_index < 1024*8) {
6335
unsigned char b;
6336
if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
6337
return true;
6338
return 1 & (b >> (msr_index & 7));
6339
} else
6340
return true; /* let L1 handle the wrong parameter */
6341
}
6342
6343
/*
6344
* Return true if we should exit from L2 to L1 to handle a CR access exit,
6345
* rather than handle it ourselves in L0. I.e., check if L1 wanted to
6346
* intercept (via guest_host_mask etc.) the current event.
6347
*/
6348
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
6349
struct vmcs12 *vmcs12)
6350
{
6351
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
6352
int cr = exit_qualification & 15;
6353
int reg;
6354
unsigned long val;
6355
6356
switch ((exit_qualification >> 4) & 3) {
6357
case 0: /* mov to cr */
6358
reg = (exit_qualification >> 8) & 15;
6359
val = kvm_register_read(vcpu, reg);
6360
switch (cr) {
6361
case 0:
6362
if (vmcs12->cr0_guest_host_mask &
6363
(val ^ vmcs12->cr0_read_shadow))
6364
return true;
6365
break;
6366
case 3:
6367
if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
6368
return true;
6369
break;
6370
case 4:
6371
if (vmcs12->cr4_guest_host_mask &
6372
(vmcs12->cr4_read_shadow ^ val))
6373
return true;
6374
break;
6375
case 8:
6376
if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
6377
return true;
6378
break;
6379
}
6380
break;
6381
case 2: /* clts */
6382
if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
6383
(vmcs12->cr0_read_shadow & X86_CR0_TS))
6384
return true;
6385
break;
6386
case 1: /* mov from cr */
6387
switch (cr) {
6388
case 3:
6389
if (vmcs12->cpu_based_vm_exec_control &
6390
CPU_BASED_CR3_STORE_EXITING)
6391
return true;
6392
break;
6393
case 8:
6394
if (vmcs12->cpu_based_vm_exec_control &
6395
CPU_BASED_CR8_STORE_EXITING)
6396
return true;
6397
break;
6398
}
6399
break;
6400
case 3: /* lmsw */
6401
/*
6402
* lmsw can change bits 1..3 of cr0, and only set bit 0 of
6403
* cr0. Other attempted changes are ignored, with no exit.
6404
*/
6405
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
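/*
 * Of the four LMSW source bits extracted above, mask 0xe covers CR0.MP,
 * CR0.EM and CR0.TS (bits 3:1), which LMSW can set or clear, while mask
 * 0x1 is CR0.PE (bit 0), which LMSW can only set.
 */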
6406
if (vmcs12->cr0_guest_host_mask & 0xe &
6407
(val ^ vmcs12->cr0_read_shadow))
6408
return true;
6409
if ((vmcs12->cr0_guest_host_mask & 0x1) &&
6410
!(vmcs12->cr0_read_shadow & 0x1) &&
6411
(val & 0x1))
6412
return true;
6413
break;
6414
}
6415
return false;
6416
}
6417
6418
static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
6419
struct vmcs12 *vmcs12)
6420
{
6421
u32 encls_leaf;
6422
6423
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
6424
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
6425
return false;
6426
6427
encls_leaf = kvm_rax_read(vcpu);
6428
if (encls_leaf > 62)
6429
encls_leaf = 63;
6430
return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
6431
}
6432
6433
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
6434
struct vmcs12 *vmcs12, gpa_t bitmap)
6435
{
6436
u32 vmx_instruction_info;
6437
unsigned long field;
6438
u8 b;
6439
6440
if (!nested_cpu_has_shadow_vmcs(vmcs12))
6441
return true;
6442
6443
/* Decode instruction info and find the field to access */
6444
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6445
field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
6446
6447
/* Out-of-range fields always cause a VM exit from L2 to L1 */
6448
if (field >> 15)
6449
return true;
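/*
 * The VMREAD/VMWRITE bitmap is a single 4-KByte page, i.e. 32768 bits
 * covering field encodings 0 through 0x7fff, which is why encodings with
 * bit 15 set are treated as always intercepted above.
 */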
6450
6451
if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
6452
return true;
6453
6454
return 1 & (b >> (field & 7));
6455
}
6456
6457
static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
6458
{
6459
u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
6460
6461
if (nested_cpu_has_mtf(vmcs12))
6462
return true;
6463
6464
/*
6465
* An MTF VM-exit may be injected into the guest by setting the
6466
* interruption-type to 7 (other event) and the vector field to 0. Such
6467
* is the case regardless of the 'monitor trap flag' VM-execution
6468
* control.
6469
*/
6470
return entry_intr_info == (INTR_INFO_VALID_MASK
6471
| INTR_TYPE_OTHER_EVENT);
6472
}
6473
6474
/*
 * Return true if L0 wants to handle an exit from L2 regardless of whether
 * L1 wants the exit. Only call this when in is_guest_mode (L2).
 */
static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return vcpu->arch.apf.host_apf_flags ||
			       vmx_need_pf_intercept(vcpu);
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return true;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return true;
		else if (is_alignment_check(intr_info) &&
			 !vmx_guest_inject_ac(vcpu))
			return true;
		else if (is_ve_fault(intr_info))
			return true;
		return false;
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return true;
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault().
		 */
		return true;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never directly uses L1's EPT, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table are L0's fault.
		 */
		return true;
	case EXIT_REASON_PREEMPTION_TIMER:
		return true;
	case EXIT_REASON_PML_FULL:
		/*
		 * PML is emulated for an L1 VMM and should never be enabled in
		 * vmcs02, always "handle" PML_FULL by exiting to userspace.
		 */
		return true;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return true;
	case EXIT_REASON_BUS_LOCK:
		/*
		 * At present, bus lock VM exit is never exposed to L1.
		 * Handle L2's bus locks in L0 directly.
		 */
		return true;
#ifdef CONFIG_KVM_HYPERV
	case EXIT_REASON_VMCALL:
		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
		return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
			nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
			kvm_hv_is_tlb_flush_hcall(vcpu);
#endif
	default:
		break;
	}
	return false;
}

/*
 * Return true if L1 wants to intercept an exit from L2. Only call this when
 * in is_guest_mode (L2).
 */
static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return true;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return nested_exit_on_intr(vcpu);
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_INTERRUPT_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_MSR_READ_IMM:
	case EXIT_REASON_MSR_WRITE_IMM:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_vmx_exit_handled_mtf(vmcs12);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES:
	case EXIT_REASON_XRSTORS:
		/*
		 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize
		 * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap
		 * verbatim, i.e. any exit is due to L1's bitmap. WARN if
		 * XSAVES isn't enabled, as the CPU is supposed to inject #UD
		 * in that case, before consulting the XSS-bitmap.
		 */
		WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES));
		return true;
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	case EXIT_REASON_ENCLS:
		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
	case EXIT_REASON_NOTIFY:
		/* Notify VM exit is not exposed to L1 */
		return false;
	case EXIT_REASON_SEAMCALL:
	case EXIT_REASON_TDCALL:
		/*
		 * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
		 * virtualized by KVM for L1 hypervisors, i.e. L1 should
		 * never want or expect such an exit.
		 */
		return false;
	default:
		return true;
	}
}

/*
 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
	 */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		exit_intr_info = 0;
		exit_qual = 0;
		goto reflect_vmexit;
	}

	trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;

	/* If L1 doesn't want the exit, handle it in L0. */
	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;

	/*
	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
	exit_intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
	return true;
}

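/*
 * Save the vCPU's nested VMX state for userspace. A NULL @vcpu is a pure
 * size query and returns the maximum state size. Otherwise the header is
 * filled in and, when a vmcs12 is loaded, vmcs12 (plus the shadow vmcs12,
 * if in use) is appended after it. If the supplied buffer is too small,
 * the required size is returned without copying anything; -EFAULT is
 * returned if a copy to userspace fails.
 */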
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = INVALID_GPA,
		.hdr.vmx.vmcs12_pa = INVALID_GPA,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (nested_vmx_is_evmptr12_set(vmx))
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != INVALID_GPA)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (nested_vmx_is_evmptr12_valid(vmx))
				/*
				 * The L1 hypervisor is not obliged to keep the
				 * eVMCS clean-fields data up-to-date while not
				 * in guest mode; 'hv_clean_fields' is only
				 * guaranteed to be accurate at VM-entry, so
				 * ignore it here and do a full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}

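/*
 * Force the vCPU out of nested operation: emulate a VM-exit from L2 if the
 * vCPU is currently in guest mode, then free all nested state.
 */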
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

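/*
 * Restore nested VMX state from userspace: validate the header, re-enter
 * VMX operation, reload vmcs12 (and, if used, the shadow vmcs12) from the
 * supplied buffer and, if L2 was running at save time, re-enter non-root
 * mode via nested_vmx_enter_non_root_mode().
 */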
static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable the eVMCS capability on the vCPU. However, the code
		 * has since been changed such that the flag signals that
		 * vmcs12 should be copied into the eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow userspace
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet. EVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

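/*
 * Point the current VMCS at the global VMREAD/VMWRITE bitmaps used for
 * VMCS shadowing; a nop when shadow VMCS support is disabled.
 */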
void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

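/*
 * Example: vmcs12_field_offsets[] index 0x12 is the encoding 0x4800 (the
 * encoding of GUEST_ES_LIMIT) rotated left by 6, and VMCS12_IDX_TO_ENC()
 * undoes the rotation: (0x12 >> 6) | (0x12 << 10) = 0x0 | 0x4800 = 0x4800.
 */
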
static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so-called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1. Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

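/*
 * The nested_vmx_setup_*_ctls() helpers below compute the VMX control MSR
 * values advertised to L1: each starts from the host's vmcs_config, masks
 * it down to the controls KVM knows how to handle for a nested guest, and
 * then ORs in controls that KVM emulates regardless of hardware support.
 */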
static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
		VM_ENTRY_LOAD_CET_STATE;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE;

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support,
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context. The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

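/*
 * Bits 4:0 of MSR_IA32_VMX_MISC advertise the preemption-timer rate; the
 * value of VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE (5) tells L1 that the
 * emulated timer counts down once every 2^5 = 32 TSC cycles.
 */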
static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
						 X86_MEMTYPE_WB);

	msrs->basic |= VMX_BASIC_TRUE_CTLS;
	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
	if (cpu_has_vmx_basic_no_hw_errcode_cc())
		msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control MSRs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary for L0, it will be
	 * set in vmcs01, and prepare_vmcs02(), which bitwise-ORs the control
	 * fields of vmcs01 and vmcs12, will keep these bits set in vmcs02 -
	 * and nested_vmx_l1_wants_exit() will not pass the related exits to L1.
	 * These rules have exceptions below.
	 */
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

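/*
 * Free the global shadow-VMCS bitmaps allocated by
 * nested_vmx_hardware_setup().
 */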
void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

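/*
 * One-time setup: allocate the global VMREAD/VMWRITE shadow bitmaps (when
 * the CPU supports shadow VMCS) and install this file's handlers for the
 * VMX instructions in the caller's exit-handler table.
 */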
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	return 0;
}

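/* Nested VMX callbacks invoked from the common x86 code. */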
struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};