Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/vmx/nested.c
26489 views
1
// SPDX-License-Identifier: GPL-2.0
2
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4
#include <linux/objtool.h>
5
#include <linux/percpu.h>
6
7
#include <asm/debugreg.h>
8
#include <asm/mmu_context.h>
9
#include <asm/msr.h>
10
11
#include "x86.h"
12
#include "cpuid.h"
13
#include "hyperv.h"
14
#include "mmu.h"
15
#include "nested.h"
16
#include "pmu.h"
17
#include "posted_intr.h"
18
#include "sgx.h"
19
#include "trace.h"
20
#include "vmx.h"
21
#include "smm.h"
22
23
static bool __read_mostly enable_shadow_vmcs = 1;
24
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
25
26
static bool __read_mostly nested_early_check = 0;
27
module_param(nested_early_check, bool, S_IRUGO);
28
29
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
30
31
/*
32
* Hyper-V requires all of these, so mark them as supported even though
33
* they are just treated the same as all-context.
34
*/
35
#define VMX_VPID_EXTENT_SUPPORTED_MASK \
36
(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
37
VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
38
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
39
VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
40
41
#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
42
43
enum {
44
VMX_VMREAD_BITMAP,
45
VMX_VMWRITE_BITMAP,
46
VMX_BITMAP_NR
47
};
48
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
49
50
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
51
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
52
53
struct shadow_vmcs_field {
54
u16 encoding;
55
u16 offset;
56
};
57
static struct shadow_vmcs_field shadow_read_only_fields[] = {
58
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
59
#include "vmcs_shadow_fields.h"
60
};
61
static int max_shadow_read_only_fields =
62
ARRAY_SIZE(shadow_read_only_fields);
63
64
static struct shadow_vmcs_field shadow_read_write_fields[] = {
65
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
66
#include "vmcs_shadow_fields.h"
67
};
68
static int max_shadow_read_write_fields =
69
ARRAY_SIZE(shadow_read_write_fields);
70
71
static void init_vmcs_shadow_fields(void)
72
{
73
int i, j;
74
75
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
76
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
77
78
for (i = j = 0; i < max_shadow_read_only_fields; i++) {
79
struct shadow_vmcs_field entry = shadow_read_only_fields[i];
80
u16 field = entry.encoding;
81
82
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
83
(i + 1 == max_shadow_read_only_fields ||
84
shadow_read_only_fields[i + 1].encoding != field + 1))
85
pr_err("Missing field from shadow_read_only_field %x\n",
86
field + 1);
87
88
clear_bit(field, vmx_vmread_bitmap);
89
if (field & 1)
90
#ifdef CONFIG_X86_64
91
continue;
92
#else
93
entry.offset += sizeof(u32);
94
#endif
95
shadow_read_only_fields[j++] = entry;
96
}
97
max_shadow_read_only_fields = j;
98
99
for (i = j = 0; i < max_shadow_read_write_fields; i++) {
100
struct shadow_vmcs_field entry = shadow_read_write_fields[i];
101
u16 field = entry.encoding;
102
103
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
104
(i + 1 == max_shadow_read_write_fields ||
105
shadow_read_write_fields[i + 1].encoding != field + 1))
106
pr_err("Missing field from shadow_read_write_field %x\n",
107
field + 1);
108
109
WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
110
field <= GUEST_TR_AR_BYTES,
111
"Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
112
113
/*
114
* PML and the preemption timer can be emulated, but the
115
* processor cannot vmwrite to fields that don't exist
116
* on bare metal.
117
*/
118
switch (field) {
119
case GUEST_PML_INDEX:
120
if (!cpu_has_vmx_pml())
121
continue;
122
break;
123
case VMX_PREEMPTION_TIMER_VALUE:
124
if (!cpu_has_vmx_preemption_timer())
125
continue;
126
break;
127
case GUEST_INTR_STATUS:
128
if (!cpu_has_vmx_apicv())
129
continue;
130
break;
131
default:
132
break;
133
}
134
135
clear_bit(field, vmx_vmwrite_bitmap);
136
clear_bit(field, vmx_vmread_bitmap);
137
if (field & 1)
138
#ifdef CONFIG_X86_64
139
continue;
140
#else
141
entry.offset += sizeof(u32);
142
#endif
143
shadow_read_write_fields[j++] = entry;
144
}
145
max_shadow_read_write_fields = j;
146
}
147
148
/*
149
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
150
* set the success or error code of an emulated VMX instruction (as specified
151
* by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
152
* instruction.
153
*/
154
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
155
{
156
vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
157
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
158
X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
159
return kvm_skip_emulated_instruction(vcpu);
160
}
161
162
static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
163
{
164
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
165
& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
166
X86_EFLAGS_SF | X86_EFLAGS_OF))
167
| X86_EFLAGS_CF);
168
return kvm_skip_emulated_instruction(vcpu);
169
}
170
171
static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
172
u32 vm_instruction_error)
173
{
174
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
175
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
176
X86_EFLAGS_SF | X86_EFLAGS_OF))
177
| X86_EFLAGS_ZF);
178
get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
179
/*
180
* We don't need to force sync to shadow VMCS because
181
* VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
182
* fields and thus must be synced.
183
*/
184
if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
185
to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
186
187
return kvm_skip_emulated_instruction(vcpu);
188
}
189
190
static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
191
{
192
struct vcpu_vmx *vmx = to_vmx(vcpu);
193
194
/*
195
* failValid writes the error number to the current VMCS, which
196
* can't be done if there isn't a current VMCS.
197
*/
198
if (vmx->nested.current_vmptr == INVALID_GPA &&
199
!nested_vmx_is_evmptr12_valid(vmx))
200
return nested_vmx_failInvalid(vcpu);
201
202
return nested_vmx_failValid(vcpu, vm_instruction_error);
203
}
204
205
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
206
{
207
/* TODO: not to reset guest simply here. */
208
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
209
pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
210
}
211
212
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
213
{
214
return fixed_bits_valid(control, low, high);
215
}
216
217
static inline u64 vmx_control_msr(u32 low, u32 high)
218
{
219
return low | ((u64)high << 32);
220
}
221
222
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
223
{
224
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
225
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
226
vmx->nested.need_vmcs12_to_shadow_sync = false;
227
}
228
229
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
230
{
231
#ifdef CONFIG_KVM_HYPERV
232
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
233
struct vcpu_vmx *vmx = to_vmx(vcpu);
234
235
kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
236
vmx->nested.hv_evmcs = NULL;
237
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
238
239
if (hv_vcpu) {
240
hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
241
hv_vcpu->nested.vm_id = 0;
242
hv_vcpu->nested.vp_id = 0;
243
}
244
#endif
245
}
246
247
static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
248
{
249
#ifdef CONFIG_KVM_HYPERV
250
struct vcpu_vmx *vmx = to_vmx(vcpu);
251
/*
252
* When Enlightened VMEntry is enabled on the calling CPU we treat
253
* memory area pointer by vmptr as Enlightened VMCS (as there's no good
254
* way to distinguish it from VMCS12) and we must not corrupt it by
255
* writing to the non-existent 'launch_state' field. The area doesn't
256
* have to be the currently active EVMCS on the calling CPU and there's
257
* nothing KVM has to do to transition it from 'active' to 'non-active'
258
* state. It is possible that the area will stay mapped as
259
* vmx->nested.hv_evmcs but this shouldn't be a problem.
260
*/
261
if (!guest_cpu_cap_has_evmcs(vcpu) ||
262
!evmptr_is_valid(nested_get_evmptr(vcpu)))
263
return false;
264
265
if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
266
nested_release_evmcs(vcpu);
267
268
return true;
269
#else
270
return false;
271
#endif
272
}
273
274
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
275
struct loaded_vmcs *prev)
276
{
277
struct vmcs_host_state *dest, *src;
278
279
if (unlikely(!vmx->vt.guest_state_loaded))
280
return;
281
282
src = &prev->host_state;
283
dest = &vmx->loaded_vmcs->host_state;
284
285
vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
286
dest->ldt_sel = src->ldt_sel;
287
#ifdef CONFIG_X86_64
288
dest->ds_sel = src->ds_sel;
289
dest->es_sel = src->es_sel;
290
#endif
291
}
292
293
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
294
{
295
struct vcpu_vmx *vmx = to_vmx(vcpu);
296
struct loaded_vmcs *prev;
297
int cpu;
298
299
if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
300
return;
301
302
cpu = get_cpu();
303
prev = vmx->loaded_vmcs;
304
vmx->loaded_vmcs = vmcs;
305
vmx_vcpu_load_vmcs(vcpu, cpu);
306
vmx_sync_vmcs_host_state(vmx, prev);
307
put_cpu();
308
309
vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
310
311
/*
312
* All lazily updated registers will be reloaded from VMCS12 on both
313
* vmentry and vmexit.
314
*/
315
vcpu->arch.regs_dirty = 0;
316
}
317
318
static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
319
{
320
struct vcpu_vmx *vmx = to_vmx(vcpu);
321
322
kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
323
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
324
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
325
vmx->nested.pi_desc = NULL;
326
}
327
328
/*
329
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
330
* just stops using VMX.
331
*/
332
static void free_nested(struct kvm_vcpu *vcpu)
333
{
334
struct vcpu_vmx *vmx = to_vmx(vcpu);
335
336
if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
337
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
338
339
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
340
return;
341
342
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
343
344
vmx->nested.vmxon = false;
345
vmx->nested.smm.vmxon = false;
346
vmx->nested.vmxon_ptr = INVALID_GPA;
347
free_vpid(vmx->nested.vpid02);
348
vmx->nested.posted_intr_nv = -1;
349
vmx->nested.current_vmptr = INVALID_GPA;
350
if (enable_shadow_vmcs) {
351
vmx_disable_shadow_vmcs(vmx);
352
vmcs_clear(vmx->vmcs01.shadow_vmcs);
353
free_vmcs(vmx->vmcs01.shadow_vmcs);
354
vmx->vmcs01.shadow_vmcs = NULL;
355
}
356
kfree(vmx->nested.cached_vmcs12);
357
vmx->nested.cached_vmcs12 = NULL;
358
kfree(vmx->nested.cached_shadow_vmcs12);
359
vmx->nested.cached_shadow_vmcs12 = NULL;
360
361
nested_put_vmcs12_pages(vcpu);
362
363
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
364
365
nested_release_evmcs(vcpu);
366
367
free_loaded_vmcs(&vmx->nested.vmcs02);
368
}
369
370
/*
371
* Ensure that the current vmcs of the logical processor is the
372
* vmcs01 of the vcpu before calling free_nested().
373
*/
374
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
375
{
376
vcpu_load(vcpu);
377
vmx_leave_nested(vcpu);
378
vcpu_put(vcpu);
379
}
380
381
#define EPTP_PA_MASK GENMASK_ULL(51, 12)
382
383
static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
384
{
385
return VALID_PAGE(root_hpa) &&
386
((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
387
}
388
389
static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
390
gpa_t addr)
391
{
392
unsigned long roots = 0;
393
uint i;
394
struct kvm_mmu_root_info *cached_root;
395
396
WARN_ON_ONCE(!mmu_is_nested(vcpu));
397
398
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
399
cached_root = &vcpu->arch.mmu->prev_roots[i];
400
401
if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
402
eptp))
403
roots |= KVM_MMU_ROOT_PREVIOUS(i);
404
}
405
if (roots)
406
kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
407
}
408
409
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
410
struct x86_exception *fault)
411
{
412
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
413
struct vcpu_vmx *vmx = to_vmx(vcpu);
414
unsigned long exit_qualification;
415
u32 vm_exit_reason;
416
417
if (vmx->nested.pml_full) {
418
vm_exit_reason = EXIT_REASON_PML_FULL;
419
vmx->nested.pml_full = false;
420
421
/*
422
* It should be impossible to trigger a nested PML Full VM-Exit
423
* for anything other than an EPT Violation from L2. KVM *can*
424
* trigger nEPT page fault injection in response to an EPT
425
* Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
426
* tables also changed, but KVM should not treat EPT Misconfig
427
* VM-Exits as writes.
428
*/
429
WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
430
431
/*
432
* PML Full and EPT Violation VM-Exits both use bit 12 to report
433
* "NMI unblocking due to IRET", i.e. the bit can be propagated
434
* as-is from the original EXIT_QUALIFICATION.
435
*/
436
exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
437
} else {
438
if (fault->error_code & PFERR_RSVD_MASK) {
439
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
440
exit_qualification = 0;
441
} else {
442
exit_qualification = fault->exit_qualification;
443
exit_qualification |= vmx_get_exit_qual(vcpu) &
444
(EPT_VIOLATION_GVA_IS_VALID |
445
EPT_VIOLATION_GVA_TRANSLATED);
446
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
447
}
448
449
/*
450
* Although the caller (kvm_inject_emulated_page_fault) would
451
* have already synced the faulting address in the shadow EPT
452
* tables for the current EPTP12, we also need to sync it for
453
* any other cached EPTP02s based on the same EP4TA, since the
454
* TLB associates mappings to the EP4TA rather than the full EPTP.
455
*/
456
nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
457
fault->address);
458
}
459
460
nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
461
vmcs12->guest_physical_address = fault->address;
462
}
463
464
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
465
{
466
struct vcpu_vmx *vmx = to_vmx(vcpu);
467
bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
468
int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
469
470
kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
471
nested_ept_ad_enabled(vcpu),
472
nested_ept_get_eptp(vcpu));
473
}
474
475
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
476
{
477
WARN_ON(mmu_is_nested(vcpu));
478
479
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
480
nested_ept_new_eptp(vcpu);
481
vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
482
vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
483
vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
484
485
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
486
}
487
488
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
489
{
490
vcpu->arch.mmu = &vcpu->arch.root_mmu;
491
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
492
}
493
494
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
495
u16 error_code)
496
{
497
bool inequality, bit;
498
499
bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
500
inequality =
501
(error_code & vmcs12->page_fault_error_code_mask) !=
502
vmcs12->page_fault_error_code_match;
503
return inequality ^ bit;
504
}
505
506
static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
507
u32 error_code)
508
{
509
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
510
511
/*
512
* Drop bits 31:16 of the error code when performing the #PF mask+match
513
* check. All VMCS fields involved are 32 bits, but Intel CPUs never
514
* set bits 31:16 and VMX disallows setting bits 31:16 in the injected
515
* error code. Including the to-be-dropped bits in the check might
516
* result in an "impossible" or missed exit from L1's perspective.
517
*/
518
if (vector == PF_VECTOR)
519
return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
520
521
return (vmcs12->exception_bitmap & (1u << vector));
522
}
523
524
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
525
struct vmcs12 *vmcs12)
526
{
527
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
528
return 0;
529
530
if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
531
CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
532
return -EINVAL;
533
534
return 0;
535
}
536
537
static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
538
struct vmcs12 *vmcs12)
539
{
540
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
541
return 0;
542
543
if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
544
return -EINVAL;
545
546
return 0;
547
}
548
549
static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
550
struct vmcs12 *vmcs12)
551
{
552
if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
553
return 0;
554
555
if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
556
return -EINVAL;
557
558
return 0;
559
}
560
561
/*
562
* For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
563
* itself utilizing x2APIC. All MSRs were previously set to be intercepted,
564
* only the "disable intercept" case needs to be handled.
565
*/
566
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
567
unsigned long *msr_bitmap_l0,
568
u32 msr, int type)
569
{
570
if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
571
vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
572
573
if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
574
vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
575
}
576
577
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
578
{
579
int msr;
580
581
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
582
unsigned word = msr / BITS_PER_LONG;
583
584
msr_bitmap[word] = ~0;
585
msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
586
}
587
}
588
589
#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
590
static inline \
591
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
592
unsigned long *msr_bitmap_l1, \
593
unsigned long *msr_bitmap_l0, u32 msr) \
594
{ \
595
if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
596
vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
597
vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
598
else \
599
vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
600
}
601
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
602
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
603
604
static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
605
unsigned long *msr_bitmap_l1,
606
unsigned long *msr_bitmap_l0,
607
u32 msr, int types)
608
{
609
if (types & MSR_TYPE_R)
610
nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
611
msr_bitmap_l0, msr);
612
if (types & MSR_TYPE_W)
613
nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
614
msr_bitmap_l0, msr);
615
}
616
617
/*
618
* Merge L0's and L1's MSR bitmap, return false to indicate that
619
* we do not use the hardware.
620
*/
621
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
622
struct vmcs12 *vmcs12)
623
{
624
struct vcpu_vmx *vmx = to_vmx(vcpu);
625
int msr;
626
unsigned long *msr_bitmap_l1;
627
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
628
struct kvm_host_map map;
629
630
/* Nothing to do if the MSR bitmap is not in use. */
631
if (!cpu_has_vmx_msr_bitmap() ||
632
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
633
return false;
634
635
/*
636
* MSR bitmap update can be skipped when:
637
* - MSR bitmap for L1 hasn't changed.
638
* - Nested hypervisor (L1) is attempting to launch the same L2 as
639
* before.
640
* - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
641
* and tells KVM (L0) there were no changes in MSR bitmap for L2.
642
*/
643
if (!vmx->nested.force_msr_bitmap_recalc) {
644
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
645
646
if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
647
evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
648
return true;
649
}
650
651
if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
652
return false;
653
654
msr_bitmap_l1 = (unsigned long *)map.hva;
655
656
/*
657
* To keep the control flow simple, pay eight 8-byte writes (sixteen
658
* 4-byte writes on 32-bit systems) up front to enable intercepts for
659
* the x2APIC MSR range and selectively toggle those relevant to L2.
660
*/
661
enable_x2apic_msr_intercepts(msr_bitmap_l0);
662
663
if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
664
if (nested_cpu_has_apic_reg_virt(vmcs12)) {
665
/*
666
* L0 need not intercept reads for MSRs between 0x800
667
* and 0x8ff, it just lets the processor take the value
668
* from the virtual-APIC page; take those 256 bits
669
* directly from the L1 bitmap.
670
*/
671
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
672
unsigned word = msr / BITS_PER_LONG;
673
674
msr_bitmap_l0[word] = msr_bitmap_l1[word];
675
}
676
}
677
678
nested_vmx_disable_intercept_for_x2apic_msr(
679
msr_bitmap_l1, msr_bitmap_l0,
680
X2APIC_MSR(APIC_TASKPRI),
681
MSR_TYPE_R | MSR_TYPE_W);
682
683
if (nested_cpu_has_vid(vmcs12)) {
684
nested_vmx_disable_intercept_for_x2apic_msr(
685
msr_bitmap_l1, msr_bitmap_l0,
686
X2APIC_MSR(APIC_EOI),
687
MSR_TYPE_W);
688
nested_vmx_disable_intercept_for_x2apic_msr(
689
msr_bitmap_l1, msr_bitmap_l0,
690
X2APIC_MSR(APIC_SELF_IPI),
691
MSR_TYPE_W);
692
}
693
}
694
695
/*
696
* Always check vmcs01's bitmap to honor userspace MSR filters and any
697
* other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
698
*/
699
#ifdef CONFIG_X86_64
700
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
701
MSR_FS_BASE, MSR_TYPE_RW);
702
703
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
704
MSR_GS_BASE, MSR_TYPE_RW);
705
706
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
707
MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
708
#endif
709
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
710
MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
711
712
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
713
MSR_IA32_PRED_CMD, MSR_TYPE_W);
714
715
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
716
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
717
718
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
719
MSR_IA32_APERF, MSR_TYPE_R);
720
721
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
722
MSR_IA32_MPERF, MSR_TYPE_R);
723
724
kvm_vcpu_unmap(vcpu, &map);
725
726
vmx->nested.force_msr_bitmap_recalc = false;
727
728
return true;
729
}
730
731
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
732
struct vmcs12 *vmcs12)
733
{
734
struct vcpu_vmx *vmx = to_vmx(vcpu);
735
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
736
737
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
738
vmcs12->vmcs_link_pointer == INVALID_GPA)
739
return;
740
741
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
742
kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
743
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
744
return;
745
746
kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
747
VMCS12_SIZE);
748
}
749
750
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
751
struct vmcs12 *vmcs12)
752
{
753
struct vcpu_vmx *vmx = to_vmx(vcpu);
754
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
755
756
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
757
vmcs12->vmcs_link_pointer == INVALID_GPA)
758
return;
759
760
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
761
kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
762
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
763
return;
764
765
kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
766
VMCS12_SIZE);
767
}
768
769
/*
770
* In nested virtualization, check if L1 has set
771
* VM_EXIT_ACK_INTR_ON_EXIT
772
*/
773
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
774
{
775
return get_vmcs12(vcpu)->vm_exit_controls &
776
VM_EXIT_ACK_INTR_ON_EXIT;
777
}
778
779
static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
780
struct vmcs12 *vmcs12)
781
{
782
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
783
CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
784
return -EINVAL;
785
else
786
return 0;
787
}
788
789
static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
790
struct vmcs12 *vmcs12)
791
{
792
if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
793
!nested_cpu_has_apic_reg_virt(vmcs12) &&
794
!nested_cpu_has_vid(vmcs12) &&
795
!nested_cpu_has_posted_intr(vmcs12))
796
return 0;
797
798
/*
799
* If virtualize x2apic mode is enabled,
800
* virtualize apic access must be disabled.
801
*/
802
if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
803
nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
804
return -EINVAL;
805
806
/*
807
* If virtual interrupt delivery is enabled,
808
* we must exit on external interrupts.
809
*/
810
if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
811
return -EINVAL;
812
813
/*
814
* bits 15:8 should be zero in posted_intr_nv,
815
* the descriptor address has been already checked
816
* in nested_get_vmcs12_pages.
817
*
818
* bits 5:0 of posted_intr_desc_addr should be zero.
819
*/
820
if (nested_cpu_has_posted_intr(vmcs12) &&
821
(CC(!nested_cpu_has_vid(vmcs12)) ||
822
CC(!nested_exit_intr_ack_set(vcpu)) ||
823
CC((vmcs12->posted_intr_nv & 0xff00)) ||
824
CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
825
return -EINVAL;
826
827
/* tpr shadow is needed by all apicv features. */
828
if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
829
return -EINVAL;
830
831
return 0;
832
}
833
834
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
835
{
836
struct vcpu_vmx *vmx = to_vmx(vcpu);
837
u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
838
vmx->nested.msrs.misc_high);
839
840
return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
841
}
842
843
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
844
u32 count, u64 addr)
845
{
846
if (count == 0)
847
return 0;
848
849
/*
850
* Exceeding the limit results in architecturally _undefined_ behavior,
851
* i.e. KVM is allowed to do literally anything in response to a bad
852
* limit. Immediately generate a consistency check so that code that
853
* consumes the count doesn't need to worry about extreme edge cases.
854
*/
855
if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
856
return -EINVAL;
857
858
if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
859
!kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
860
return -EINVAL;
861
862
return 0;
863
}
864
865
static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
866
struct vmcs12 *vmcs12)
867
{
868
if (CC(nested_vmx_check_msr_switch(vcpu,
869
vmcs12->vm_exit_msr_load_count,
870
vmcs12->vm_exit_msr_load_addr)) ||
871
CC(nested_vmx_check_msr_switch(vcpu,
872
vmcs12->vm_exit_msr_store_count,
873
vmcs12->vm_exit_msr_store_addr)))
874
return -EINVAL;
875
876
return 0;
877
}
878
879
static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
880
struct vmcs12 *vmcs12)
881
{
882
if (CC(nested_vmx_check_msr_switch(vcpu,
883
vmcs12->vm_entry_msr_load_count,
884
vmcs12->vm_entry_msr_load_addr)))
885
return -EINVAL;
886
887
return 0;
888
}
889
890
static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
891
struct vmcs12 *vmcs12)
892
{
893
if (!nested_cpu_has_pml(vmcs12))
894
return 0;
895
896
if (CC(!nested_cpu_has_ept(vmcs12)) ||
897
CC(!page_address_valid(vcpu, vmcs12->pml_address)))
898
return -EINVAL;
899
900
return 0;
901
}
902
903
static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
904
struct vmcs12 *vmcs12)
905
{
906
if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
907
!nested_cpu_has_ept(vmcs12)))
908
return -EINVAL;
909
return 0;
910
}
911
912
static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
913
struct vmcs12 *vmcs12)
914
{
915
if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
916
!nested_cpu_has_ept(vmcs12)))
917
return -EINVAL;
918
return 0;
919
}
920
921
static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
922
struct vmcs12 *vmcs12)
923
{
924
if (!nested_cpu_has_shadow_vmcs(vmcs12))
925
return 0;
926
927
if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
928
CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
929
return -EINVAL;
930
931
return 0;
932
}
933
934
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
935
struct vmx_msr_entry *e)
936
{
937
/* x2APIC MSR accesses are not allowed */
938
if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
939
return -EINVAL;
940
if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
941
CC(e->index == MSR_IA32_UCODE_REV))
942
return -EINVAL;
943
if (CC(e->reserved != 0))
944
return -EINVAL;
945
return 0;
946
}
947
948
static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
949
struct vmx_msr_entry *e)
950
{
951
if (CC(e->index == MSR_FS_BASE) ||
952
CC(e->index == MSR_GS_BASE) ||
953
CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
954
nested_vmx_msr_check_common(vcpu, e))
955
return -EINVAL;
956
return 0;
957
}
958
959
static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
960
struct vmx_msr_entry *e)
961
{
962
if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
963
nested_vmx_msr_check_common(vcpu, e))
964
return -EINVAL;
965
return 0;
966
}
967
968
/*
969
* Load guest's/host's msr at nested entry/exit.
970
* return 0 for success, entry index for failure.
971
*
972
* One of the failure modes for MSR load/store is when a list exceeds the
973
* virtual hardware's capacity. To maintain compatibility with hardware inasmuch
974
* as possible, process all valid entries before failing rather than precheck
975
* for a capacity violation.
976
*/
977
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
978
{
979
u32 i;
980
struct vmx_msr_entry e;
981
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
982
983
for (i = 0; i < count; i++) {
984
if (WARN_ON_ONCE(i >= max_msr_list_size))
985
goto fail;
986
987
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
988
&e, sizeof(e))) {
989
pr_debug_ratelimited(
990
"%s cannot read MSR entry (%u, 0x%08llx)\n",
991
__func__, i, gpa + i * sizeof(e));
992
goto fail;
993
}
994
if (nested_vmx_load_msr_check(vcpu, &e)) {
995
pr_debug_ratelimited(
996
"%s check failed (%u, 0x%x, 0x%x)\n",
997
__func__, i, e.index, e.reserved);
998
goto fail;
999
}
1000
if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
1001
pr_debug_ratelimited(
1002
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1003
__func__, i, e.index, e.value);
1004
goto fail;
1005
}
1006
}
1007
return 0;
1008
fail:
1009
/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
1010
return i + 1;
1011
}
1012
1013
static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
1014
u32 msr_index,
1015
u64 *data)
1016
{
1017
struct vcpu_vmx *vmx = to_vmx(vcpu);
1018
1019
/*
1020
* If the L0 hypervisor stored a more accurate value for the TSC that
1021
* does not include the time taken for emulation of the L2->L1
1022
* VM-exit in L0, use the more accurate value.
1023
*/
1024
if (msr_index == MSR_IA32_TSC) {
1025
int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
1026
MSR_IA32_TSC);
1027
1028
if (i >= 0) {
1029
u64 val = vmx->msr_autostore.guest.val[i].value;
1030
1031
*data = kvm_read_l1_tsc(vcpu, val);
1032
return true;
1033
}
1034
}
1035
1036
if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
1037
pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
1038
msr_index);
1039
return false;
1040
}
1041
return true;
1042
}
1043
1044
static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
1045
struct vmx_msr_entry *e)
1046
{
1047
if (kvm_vcpu_read_guest(vcpu,
1048
gpa + i * sizeof(*e),
1049
e, 2 * sizeof(u32))) {
1050
pr_debug_ratelimited(
1051
"%s cannot read MSR entry (%u, 0x%08llx)\n",
1052
__func__, i, gpa + i * sizeof(*e));
1053
return false;
1054
}
1055
if (nested_vmx_store_msr_check(vcpu, e)) {
1056
pr_debug_ratelimited(
1057
"%s check failed (%u, 0x%x, 0x%x)\n",
1058
__func__, i, e->index, e->reserved);
1059
return false;
1060
}
1061
return true;
1062
}
1063
1064
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
1065
{
1066
u64 data;
1067
u32 i;
1068
struct vmx_msr_entry e;
1069
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1070
1071
for (i = 0; i < count; i++) {
1072
if (WARN_ON_ONCE(i >= max_msr_list_size))
1073
return -EINVAL;
1074
1075
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1076
return -EINVAL;
1077
1078
if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1079
return -EINVAL;
1080
1081
if (kvm_vcpu_write_guest(vcpu,
1082
gpa + i * sizeof(e) +
1083
offsetof(struct vmx_msr_entry, value),
1084
&data, sizeof(data))) {
1085
pr_debug_ratelimited(
1086
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1087
__func__, i, e.index, data);
1088
return -EINVAL;
1089
}
1090
}
1091
return 0;
1092
}
1093
1094
static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1095
{
1096
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1097
u32 count = vmcs12->vm_exit_msr_store_count;
1098
u64 gpa = vmcs12->vm_exit_msr_store_addr;
1099
struct vmx_msr_entry e;
1100
u32 i;
1101
1102
for (i = 0; i < count; i++) {
1103
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1104
return false;
1105
1106
if (e.index == msr_index)
1107
return true;
1108
}
1109
return false;
1110
}
1111
1112
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1113
u32 msr_index)
1114
{
1115
struct vcpu_vmx *vmx = to_vmx(vcpu);
1116
struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1117
bool in_vmcs12_store_list;
1118
int msr_autostore_slot;
1119
bool in_autostore_list;
1120
int last;
1121
1122
msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1123
in_autostore_list = msr_autostore_slot >= 0;
1124
in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1125
1126
if (in_vmcs12_store_list && !in_autostore_list) {
1127
if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
1128
/*
1129
* Emulated VMEntry does not fail here. Instead a less
1130
* accurate value will be returned by
1131
* nested_vmx_get_vmexit_msr_value() by reading KVM's
1132
* internal MSR state instead of reading the value from
1133
* the vmcs02 VMExit MSR-store area.
1134
*/
1135
pr_warn_ratelimited(
1136
"Not enough msr entries in msr_autostore. Can't add msr %x\n",
1137
msr_index);
1138
return;
1139
}
1140
last = autostore->nr++;
1141
autostore->val[last].index = msr_index;
1142
} else if (!in_vmcs12_store_list && in_autostore_list) {
1143
last = --autostore->nr;
1144
autostore->val[msr_autostore_slot] = autostore->val[last];
1145
}
1146
}
1147
1148
/*
1149
* Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
1150
* emulating VM-Entry into a guest with EPT enabled. On failure, the expected
1151
* Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1152
* @entry_failure_code.
1153
*/
1154
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1155
bool nested_ept, bool reload_pdptrs,
1156
enum vm_entry_failure_code *entry_failure_code)
1157
{
1158
if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
1159
*entry_failure_code = ENTRY_FAIL_DEFAULT;
1160
return -EINVAL;
1161
}
1162
1163
/*
1164
* If PAE paging and EPT are both on, CR3 is not used by the CPU and
1165
* must not be dereferenced.
1166
*/
1167
if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
1168
CC(!load_pdptrs(vcpu, cr3))) {
1169
*entry_failure_code = ENTRY_FAIL_PDPTE;
1170
return -EINVAL;
1171
}
1172
1173
vcpu->arch.cr3 = cr3;
1174
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
1175
1176
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
1177
kvm_init_mmu(vcpu);
1178
1179
if (!nested_ept)
1180
kvm_mmu_new_pgd(vcpu, cr3);
1181
1182
return 0;
1183
}
1184
1185
/*
1186
* Returns if KVM is able to config CPU to tag TLB entries
1187
* populated by L2 differently than TLB entries populated
1188
* by L1.
1189
*
1190
* If L0 uses EPT, L1 and L2 run with different EPTP because
1191
* guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1192
* are tagged with different EPTP.
1193
*
1194
* If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1195
* with different VPID (L1 entries are tagged with vmx->vpid
1196
* while L2 entries are tagged with vmx->nested.vpid02).
1197
*/
1198
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1199
{
1200
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1201
1202
return enable_ept ||
1203
(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1204
}
1205
1206
static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1207
struct vmcs12 *vmcs12,
1208
bool is_vmenter)
1209
{
1210
struct vcpu_vmx *vmx = to_vmx(vcpu);
1211
1212
/* Handle pending Hyper-V TLB flush requests */
1213
kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
1214
1215
/*
1216
* If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
1217
* same VPID as the host, and so architecturally, linear and combined
1218
* mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
1219
* emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
1220
* and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
1221
* is required if VPID is disabled in KVM, as a TLB flush (there are no
1222
* VPIDs) still occurs from L1's perspective, and KVM may need to
1223
* synchronize the MMU in response to the guest TLB flush.
1224
*
1225
* Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
1226
* EPT is a special snowflake, as guest-physical mappings aren't
1227
* flushed on VPID invalidations, including VM-Enter or VM-Exit with
1228
* VPID disabled. As a result, KVM _never_ needs to sync nEPT
1229
* entries on VM-Enter because L1 can't rely on VM-Enter to flush
1230
* those mappings.
1231
*/
1232
if (!nested_cpu_has_vpid(vmcs12)) {
1233
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1234
return;
1235
}
1236
1237
/* L2 should never have a VPID if VPID is disabled. */
1238
WARN_ON(!enable_vpid);
1239
1240
/*
1241
* VPID is enabled and in use by vmcs12. If vpid12 is changing, then
1242
* emulate a guest TLB flush as KVM does not track vpid12 history nor
1243
* is the VPID incorporated into the MMU context. I.e. KVM must assume
1244
* that the new vpid12 has never been used and thus represents a new
1245
* guest ASID that cannot have entries in the TLB.
1246
*/
1247
if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1248
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1249
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1250
return;
1251
}
1252
1253
/*
1254
* If VPID is enabled, used by vmc12, and vpid12 is not changing but
1255
* does not have a unique TLB tag (ASID), i.e. EPT is disabled and
1256
* KVM was unable to allocate a VPID for L2, flush the current context
1257
* as the effective ASID is common to both L1 and L2.
1258
*/
1259
if (!nested_has_guest_tlb_tag(vcpu))
1260
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1261
}
1262
1263
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1264
{
1265
superset &= mask;
1266
subset &= mask;
1267
1268
return (superset | subset) == superset;
1269
}
1270
1271
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1272
{
1273
const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
1274
VMX_BASIC_INOUT |
1275
VMX_BASIC_TRUE_CTLS;
1276
1277
const u64 reserved_bits = GENMASK_ULL(63, 56) |
1278
GENMASK_ULL(47, 45) |
1279
BIT_ULL(31);
1280
1281
u64 vmx_basic = vmcs_config.nested.basic;
1282
1283
BUILD_BUG_ON(feature_bits & reserved_bits);
1284
1285
/*
1286
* Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
1287
* inverted polarity), the incoming value must not set feature bits or
1288
* reserved bits that aren't allowed/supported by KVM. Fields, i.e.
1289
* multi-bit values, are explicitly checked below.
1290
*/
1291
if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
1292
return -EINVAL;
1293
1294
/*
1295
* KVM does not emulate a version of VMX that constrains physical
1296
* addresses of VMX structures (e.g. VMCS) to 32-bits.
1297
*/
1298
if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
1299
return -EINVAL;
1300
1301
if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1302
vmx_basic_vmcs_revision_id(data))
1303
return -EINVAL;
1304
1305
if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1306
return -EINVAL;
1307
1308
vmx->nested.msrs.basic = data;
1309
return 0;
1310
}
1311
1312
static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
1313
u32 **low, u32 **high)
1314
{
1315
switch (msr_index) {
1316
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1317
*low = &msrs->pinbased_ctls_low;
1318
*high = &msrs->pinbased_ctls_high;
1319
break;
1320
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1321
*low = &msrs->procbased_ctls_low;
1322
*high = &msrs->procbased_ctls_high;
1323
break;
1324
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1325
*low = &msrs->exit_ctls_low;
1326
*high = &msrs->exit_ctls_high;
1327
break;
1328
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1329
*low = &msrs->entry_ctls_low;
1330
*high = &msrs->entry_ctls_high;
1331
break;
1332
case MSR_IA32_VMX_PROCBASED_CTLS2:
1333
*low = &msrs->secondary_ctls_low;
1334
*high = &msrs->secondary_ctls_high;
1335
break;
1336
default:
1337
BUG();
1338
}
1339
}
1340
1341
static int
1342
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1343
{
1344
u32 *lowp, *highp;
1345
u64 supported;
1346
1347
vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
1348
1349
supported = vmx_control_msr(*lowp, *highp);
1350
1351
/* Check must-be-1 bits are still 1. */
1352
if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1353
return -EINVAL;
1354
1355
/* Check must-be-0 bits are still 0. */
1356
if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1357
return -EINVAL;
1358
1359
vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
1360
*lowp = data;
1361
*highp = data >> 32;
1362
return 0;
1363
}
1364
1365
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1366
{
1367
const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
1368
VMX_MISC_ACTIVITY_HLT |
1369
VMX_MISC_ACTIVITY_SHUTDOWN |
1370
VMX_MISC_ACTIVITY_WAIT_SIPI |
1371
VMX_MISC_INTEL_PT |
1372
VMX_MISC_RDMSR_IN_SMM |
1373
VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
1374
VMX_MISC_VMXOFF_BLOCK_SMI |
1375
VMX_MISC_ZERO_LEN_INS;
1376
1377
const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
1378
1379
u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
1380
vmcs_config.nested.misc_high);
1381
1382
BUILD_BUG_ON(feature_bits & reserved_bits);
1383
1384
/*
1385
* The incoming value must not set feature bits or reserved bits that
1386
* aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
1387
* explicitly checked below.
1388
*/
1389
if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
1390
return -EINVAL;
1391
1392
if ((vmx->nested.msrs.pinbased_ctls_high &
1393
PIN_BASED_VMX_PREEMPTION_TIMER) &&
1394
vmx_misc_preemption_timer_rate(data) !=
1395
vmx_misc_preemption_timer_rate(vmx_misc))
1396
return -EINVAL;
1397
1398
if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1399
return -EINVAL;
1400
1401
if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1402
return -EINVAL;
1403
1404
if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1405
return -EINVAL;
1406
1407
vmx->nested.msrs.misc_low = data;
1408
vmx->nested.msrs.misc_high = data >> 32;
1409
1410
return 0;
1411
}
1412
1413
static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1414
{
1415
u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
1416
vmcs_config.nested.vpid_caps);
1417
1418
/* Every bit is either reserved or a feature bit. */
1419
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1420
return -EINVAL;
1421
1422
vmx->nested.msrs.ept_caps = data;
1423
vmx->nested.msrs.vpid_caps = data >> 32;
1424
return 0;
1425
}
1426
1427
static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
1428
{
1429
switch (msr_index) {
1430
case MSR_IA32_VMX_CR0_FIXED0:
1431
return &msrs->cr0_fixed0;
1432
case MSR_IA32_VMX_CR4_FIXED0:
1433
return &msrs->cr4_fixed0;
1434
default:
1435
BUG();
1436
}
1437
}
1438
1439
static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1440
{
1441
const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
1442
1443
/*
1444
* 1 bits (which indicates bits which "must-be-1" during VMX operation)
1445
* must be 1 in the restored value.
1446
*/
1447
if (!is_bitwise_subset(data, *msr, -1ULL))
1448
return -EINVAL;
1449
1450
*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
1451
return 0;
1452
}
1453
1454
/*
1455
* Called when userspace is restoring VMX MSRs.
1456
*
1457
* Returns 0 on success, non-0 otherwise.
1458
*/
1459
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1460
{
1461
struct vcpu_vmx *vmx = to_vmx(vcpu);
1462
1463
/*
1464
* Don't allow changes to the VMX capability MSRs while the vCPU
1465
* is in VMX operation.
1466
*/
1467
if (vmx->nested.vmxon)
1468
return -EBUSY;
1469
1470
switch (msr_index) {
1471
case MSR_IA32_VMX_BASIC:
1472
return vmx_restore_vmx_basic(vmx, data);
1473
case MSR_IA32_VMX_PINBASED_CTLS:
1474
case MSR_IA32_VMX_PROCBASED_CTLS:
1475
case MSR_IA32_VMX_EXIT_CTLS:
1476
case MSR_IA32_VMX_ENTRY_CTLS:
1477
/*
1478
* The "non-true" VMX capability MSRs are generated from the
1479
* "true" MSRs, so we do not support restoring them directly.
1480
*
1481
* If userspace wants to emulate VMX_BASIC[55]=0, userspace
1482
* should restore the "true" MSRs with the must-be-1 bits
1483
* set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1484
* DEFAULT SETTINGS".
1485
*/
1486
return -EINVAL;
1487
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1488
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1489
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1490
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1491
case MSR_IA32_VMX_PROCBASED_CTLS2:
1492
return vmx_restore_control_msr(vmx, msr_index, data);
1493
case MSR_IA32_VMX_MISC:
1494
return vmx_restore_vmx_misc(vmx, data);
1495
case MSR_IA32_VMX_CR0_FIXED0:
1496
case MSR_IA32_VMX_CR4_FIXED0:
1497
return vmx_restore_fixed0_msr(vmx, msr_index, data);
1498
case MSR_IA32_VMX_CR0_FIXED1:
1499
case MSR_IA32_VMX_CR4_FIXED1:
1500
/*
1501
* These MSRs are generated based on the vCPU's CPUID, so we
1502
* do not support restoring them directly.
1503
*/
1504
return -EINVAL;
1505
case MSR_IA32_VMX_EPT_VPID_CAP:
1506
return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1507
case MSR_IA32_VMX_VMCS_ENUM:
1508
vmx->nested.msrs.vmcs_enum = data;
1509
return 0;
1510
case MSR_IA32_VMX_VMFUNC:
1511
if (data & ~vmcs_config.nested.vmfunc_controls)
1512
return -EINVAL;
1513
vmx->nested.msrs.vmfunc_controls = data;
1514
return 0;
1515
default:
1516
/*
1517
* The rest of the VMX capability MSRs do not support restore.
1518
*/
1519
return -EINVAL;
1520
}
1521
}
1522
1523
/* Returns 0 on success, non-0 otherwise. */
1524
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1525
{
1526
switch (msr_index) {
1527
case MSR_IA32_VMX_BASIC:
1528
*pdata = msrs->basic;
1529
break;
1530
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1531
case MSR_IA32_VMX_PINBASED_CTLS:
1532
*pdata = vmx_control_msr(
1533
msrs->pinbased_ctls_low,
1534
msrs->pinbased_ctls_high);
1535
if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1536
*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1537
break;
1538
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1539
case MSR_IA32_VMX_PROCBASED_CTLS:
1540
*pdata = vmx_control_msr(
1541
msrs->procbased_ctls_low,
1542
msrs->procbased_ctls_high);
1543
if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1544
*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1545
break;
1546
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1547
case MSR_IA32_VMX_EXIT_CTLS:
1548
*pdata = vmx_control_msr(
1549
msrs->exit_ctls_low,
1550
msrs->exit_ctls_high);
1551
if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1552
*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1553
break;
1554
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1555
case MSR_IA32_VMX_ENTRY_CTLS:
1556
*pdata = vmx_control_msr(
1557
msrs->entry_ctls_low,
1558
msrs->entry_ctls_high);
1559
if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1560
*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1561
break;
1562
case MSR_IA32_VMX_MISC:
1563
*pdata = vmx_control_msr(
1564
msrs->misc_low,
1565
msrs->misc_high);
1566
break;
1567
case MSR_IA32_VMX_CR0_FIXED0:
1568
*pdata = msrs->cr0_fixed0;
1569
break;
1570
case MSR_IA32_VMX_CR0_FIXED1:
1571
*pdata = msrs->cr0_fixed1;
1572
break;
1573
case MSR_IA32_VMX_CR4_FIXED0:
1574
*pdata = msrs->cr4_fixed0;
1575
break;
1576
case MSR_IA32_VMX_CR4_FIXED1:
1577
*pdata = msrs->cr4_fixed1;
1578
break;
1579
case MSR_IA32_VMX_VMCS_ENUM:
1580
*pdata = msrs->vmcs_enum;
1581
break;
1582
case MSR_IA32_VMX_PROCBASED_CTLS2:
1583
*pdata = vmx_control_msr(
1584
msrs->secondary_ctls_low,
1585
msrs->secondary_ctls_high);
1586
break;
1587
case MSR_IA32_VMX_EPT_VPID_CAP:
1588
*pdata = msrs->ept_caps |
1589
((u64)msrs->vpid_caps << 32);
1590
break;
1591
case MSR_IA32_VMX_VMFUNC:
1592
*pdata = msrs->vmfunc_controls;
1593
break;
1594
default:
1595
return 1;
1596
}
1597
1598
return 0;
1599
}
1600
1601
/*
1602
* Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1603
* been modified by the L1 guest. Note, "writable" in this context means
1604
* "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1605
* fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1606
* VM-exit information fields (which are actually writable if the vCPU is
1607
* configured to support "VMWRITE to any supported field in the VMCS").
1608
*/
1609
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1610
{
1611
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1612
struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1613
struct shadow_vmcs_field field;
1614
unsigned long val;
1615
int i;
1616
1617
if (WARN_ON(!shadow_vmcs))
1618
return;
1619
1620
preempt_disable();
1621
1622
vmcs_load(shadow_vmcs);
1623
1624
for (i = 0; i < max_shadow_read_write_fields; i++) {
1625
field = shadow_read_write_fields[i];
1626
val = __vmcs_readl(field.encoding);
1627
vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1628
}
1629
1630
vmcs_clear(shadow_vmcs);
1631
vmcs_load(vmx->loaded_vmcs->vmcs);
1632
1633
preempt_enable();
1634
}
1635
1636
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1637
{
1638
const struct shadow_vmcs_field *fields[] = {
1639
shadow_read_write_fields,
1640
shadow_read_only_fields
1641
};
1642
const int max_fields[] = {
1643
max_shadow_read_write_fields,
1644
max_shadow_read_only_fields
1645
};
1646
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1647
struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1648
struct shadow_vmcs_field field;
1649
unsigned long val;
1650
int i, q;
1651
1652
if (WARN_ON(!shadow_vmcs))
1653
return;
1654
1655
vmcs_load(shadow_vmcs);
1656
1657
for (q = 0; q < ARRAY_SIZE(fields); q++) {
1658
for (i = 0; i < max_fields[q]; i++) {
1659
field = fields[q][i];
1660
val = vmcs12_read_any(vmcs12, field.encoding,
1661
field.offset);
1662
__vmcs_writel(field.encoding, val);
1663
}
1664
}
1665
1666
vmcs_clear(shadow_vmcs);
1667
vmcs_load(vmx->loaded_vmcs->vmcs);
1668
}
1669
1670
static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
1671
{
1672
#ifdef CONFIG_KVM_HYPERV
1673
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1674
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
1675
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
1676
1677
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1678
vmcs12->tpr_threshold = evmcs->tpr_threshold;
1679
vmcs12->guest_rip = evmcs->guest_rip;
1680
1681
if (unlikely(!(hv_clean_fields &
1682
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
1683
hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
1684
hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
1685
hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
1686
}
1687
1688
if (unlikely(!(hv_clean_fields &
1689
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1690
vmcs12->guest_rsp = evmcs->guest_rsp;
1691
vmcs12->guest_rflags = evmcs->guest_rflags;
1692
vmcs12->guest_interruptibility_info =
1693
evmcs->guest_interruptibility_info;
1694
/*
1695
* Not present in struct vmcs12:
1696
* vmcs12->guest_ssp = evmcs->guest_ssp;
1697
*/
1698
}
1699
1700
if (unlikely(!(hv_clean_fields &
1701
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1702
vmcs12->cpu_based_vm_exec_control =
1703
evmcs->cpu_based_vm_exec_control;
1704
}
1705
1706
if (unlikely(!(hv_clean_fields &
1707
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1708
vmcs12->exception_bitmap = evmcs->exception_bitmap;
1709
}
1710
1711
if (unlikely(!(hv_clean_fields &
1712
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1713
vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1714
}
1715
1716
if (unlikely(!(hv_clean_fields &
1717
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1718
vmcs12->vm_entry_intr_info_field =
1719
evmcs->vm_entry_intr_info_field;
1720
vmcs12->vm_entry_exception_error_code =
1721
evmcs->vm_entry_exception_error_code;
1722
vmcs12->vm_entry_instruction_len =
1723
evmcs->vm_entry_instruction_len;
1724
}
1725
1726
if (unlikely(!(hv_clean_fields &
1727
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1728
vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1729
vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1730
vmcs12->host_cr0 = evmcs->host_cr0;
1731
vmcs12->host_cr3 = evmcs->host_cr3;
1732
vmcs12->host_cr4 = evmcs->host_cr4;
1733
vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1734
vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1735
vmcs12->host_rip = evmcs->host_rip;
1736
vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1737
vmcs12->host_es_selector = evmcs->host_es_selector;
1738
vmcs12->host_cs_selector = evmcs->host_cs_selector;
1739
vmcs12->host_ss_selector = evmcs->host_ss_selector;
1740
vmcs12->host_ds_selector = evmcs->host_ds_selector;
1741
vmcs12->host_fs_selector = evmcs->host_fs_selector;
1742
vmcs12->host_gs_selector = evmcs->host_gs_selector;
1743
vmcs12->host_tr_selector = evmcs->host_tr_selector;
1744
vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
1745
/*
1746
* Not present in struct vmcs12:
1747
* vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
1748
* vmcs12->host_ssp = evmcs->host_ssp;
1749
* vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
1750
*/
1751
}
1752
1753
if (unlikely(!(hv_clean_fields &
1754
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1755
vmcs12->pin_based_vm_exec_control =
1756
evmcs->pin_based_vm_exec_control;
1757
vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1758
vmcs12->secondary_vm_exec_control =
1759
evmcs->secondary_vm_exec_control;
1760
}
1761
1762
if (unlikely(!(hv_clean_fields &
1763
HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1764
vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1765
vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1766
}
1767
1768
if (unlikely(!(hv_clean_fields &
1769
HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1770
vmcs12->msr_bitmap = evmcs->msr_bitmap;
1771
}
1772
1773
if (unlikely(!(hv_clean_fields &
1774
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1775
vmcs12->guest_es_base = evmcs->guest_es_base;
1776
vmcs12->guest_cs_base = evmcs->guest_cs_base;
1777
vmcs12->guest_ss_base = evmcs->guest_ss_base;
1778
vmcs12->guest_ds_base = evmcs->guest_ds_base;
1779
vmcs12->guest_fs_base = evmcs->guest_fs_base;
1780
vmcs12->guest_gs_base = evmcs->guest_gs_base;
1781
vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1782
vmcs12->guest_tr_base = evmcs->guest_tr_base;
1783
vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1784
vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1785
vmcs12->guest_es_limit = evmcs->guest_es_limit;
1786
vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1787
vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1788
vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1789
vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1790
vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1791
vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1792
vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1793
vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1794
vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1795
vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1796
vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1797
vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1798
vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1799
vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1800
vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1801
vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1802
vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1803
vmcs12->guest_es_selector = evmcs->guest_es_selector;
1804
vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1805
vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1806
vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1807
vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1808
vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1809
vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1810
vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1811
}
1812
1813
if (unlikely(!(hv_clean_fields &
1814
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1815
vmcs12->tsc_offset = evmcs->tsc_offset;
1816
vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1817
vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1818
vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
1819
vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
1820
}
1821
1822
if (unlikely(!(hv_clean_fields &
1823
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1824
vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1825
vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1826
vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1827
vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1828
vmcs12->guest_cr0 = evmcs->guest_cr0;
1829
vmcs12->guest_cr3 = evmcs->guest_cr3;
1830
vmcs12->guest_cr4 = evmcs->guest_cr4;
1831
vmcs12->guest_dr7 = evmcs->guest_dr7;
1832
}
1833
1834
if (unlikely(!(hv_clean_fields &
1835
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1836
vmcs12->host_fs_base = evmcs->host_fs_base;
1837
vmcs12->host_gs_base = evmcs->host_gs_base;
1838
vmcs12->host_tr_base = evmcs->host_tr_base;
1839
vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1840
vmcs12->host_idtr_base = evmcs->host_idtr_base;
1841
vmcs12->host_rsp = evmcs->host_rsp;
1842
}
1843
1844
if (unlikely(!(hv_clean_fields &
1845
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1846
vmcs12->ept_pointer = evmcs->ept_pointer;
1847
vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1848
}
1849
1850
if (unlikely(!(hv_clean_fields &
1851
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1852
vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1853
vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1854
vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1855
vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1856
vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1857
vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1858
vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1859
vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1860
vmcs12->guest_pending_dbg_exceptions =
1861
evmcs->guest_pending_dbg_exceptions;
1862
vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1863
vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1864
vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1865
vmcs12->guest_activity_state = evmcs->guest_activity_state;
1866
vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1867
vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
1868
/*
1869
* Not present in struct vmcs12:
1870
* vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
1871
* vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
1872
* vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
1873
*/
1874
}
1875
1876
/*
1877
* Not used?
1878
* vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1879
* vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1880
* vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1881
* vmcs12->page_fault_error_code_mask =
1882
* evmcs->page_fault_error_code_mask;
1883
* vmcs12->page_fault_error_code_match =
1884
* evmcs->page_fault_error_code_match;
1885
* vmcs12->cr3_target_count = evmcs->cr3_target_count;
1886
* vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1887
* vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1888
* vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1889
*/
1890
1891
/*
1892
* Read only fields:
1893
* vmcs12->guest_physical_address = evmcs->guest_physical_address;
1894
* vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1895
* vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1896
* vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1897
* vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1898
* vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1899
* vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1900
* vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1901
* vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1902
* vmcs12->exit_qualification = evmcs->exit_qualification;
1903
* vmcs12->guest_linear_address = evmcs->guest_linear_address;
1904
*
1905
* Not present in struct vmcs12:
1906
* vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1907
* vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1908
* vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1909
* vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1910
*/
1911
1912
return;
1913
#else /* CONFIG_KVM_HYPERV */
1914
KVM_BUG_ON(1, vmx->vcpu.kvm);
1915
#endif /* CONFIG_KVM_HYPERV */
1916
}
1917
1918
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1919
{
1920
#ifdef CONFIG_KVM_HYPERV
1921
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1922
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
1923
1924
/*
1925
* Should not be changed by KVM:
1926
*
1927
* evmcs->host_es_selector = vmcs12->host_es_selector;
1928
* evmcs->host_cs_selector = vmcs12->host_cs_selector;
1929
* evmcs->host_ss_selector = vmcs12->host_ss_selector;
1930
* evmcs->host_ds_selector = vmcs12->host_ds_selector;
1931
* evmcs->host_fs_selector = vmcs12->host_fs_selector;
1932
* evmcs->host_gs_selector = vmcs12->host_gs_selector;
1933
* evmcs->host_tr_selector = vmcs12->host_tr_selector;
1934
* evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1935
* evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1936
* evmcs->host_cr0 = vmcs12->host_cr0;
1937
* evmcs->host_cr3 = vmcs12->host_cr3;
1938
* evmcs->host_cr4 = vmcs12->host_cr4;
1939
* evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1940
* evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1941
* evmcs->host_rip = vmcs12->host_rip;
1942
* evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1943
* evmcs->host_fs_base = vmcs12->host_fs_base;
1944
* evmcs->host_gs_base = vmcs12->host_gs_base;
1945
* evmcs->host_tr_base = vmcs12->host_tr_base;
1946
* evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1947
* evmcs->host_idtr_base = vmcs12->host_idtr_base;
1948
* evmcs->host_rsp = vmcs12->host_rsp;
1949
* sync_vmcs02_to_vmcs12() doesn't read these:
1950
* evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1951
* evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1952
* evmcs->msr_bitmap = vmcs12->msr_bitmap;
1953
* evmcs->ept_pointer = vmcs12->ept_pointer;
1954
* evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1955
* evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1956
* evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1957
* evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1958
* evmcs->tpr_threshold = vmcs12->tpr_threshold;
1959
* evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1960
* evmcs->exception_bitmap = vmcs12->exception_bitmap;
1961
* evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1962
* evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1963
* evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1964
* evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1965
* evmcs->page_fault_error_code_mask =
1966
* vmcs12->page_fault_error_code_mask;
1967
* evmcs->page_fault_error_code_match =
1968
* vmcs12->page_fault_error_code_match;
1969
* evmcs->cr3_target_count = vmcs12->cr3_target_count;
1970
* evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1971
* evmcs->tsc_offset = vmcs12->tsc_offset;
1972
* evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1973
* evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1974
* evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1975
* evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1976
* evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1977
* evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1978
* evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1979
* evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1980
* evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
1981
* evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
1982
* evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
1983
* evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
1984
*
1985
* Not present in struct vmcs12:
1986
* evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1987
* evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1988
* evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1989
* evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1990
* evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
1991
* evmcs->host_ssp = vmcs12->host_ssp;
1992
* evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
1993
* evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
1994
* evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
1995
* evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
1996
* evmcs->guest_ssp = vmcs12->guest_ssp;
1997
*/
1998
1999
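/*
* Everything below is written back unconditionally: the "clean
* fields" mechanism only covers the L1->KVM direction, so KVM copies
* out every field it may have refreshed while running L2 (see
* sync_vmcs02_to_vmcs12()).
*/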
evmcs->guest_es_selector = vmcs12->guest_es_selector;
2000
evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
2001
evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
2002
evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
2003
evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
2004
evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
2005
evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
2006
evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
2007
2008
evmcs->guest_es_limit = vmcs12->guest_es_limit;
2009
evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
2010
evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
2011
evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
2012
evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
2013
evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
2014
evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
2015
evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
2016
evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
2017
evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
2018
2019
evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
2020
evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
2021
evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
2022
evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
2023
evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
2024
evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
2025
evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
2026
evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
2027
2028
evmcs->guest_es_base = vmcs12->guest_es_base;
2029
evmcs->guest_cs_base = vmcs12->guest_cs_base;
2030
evmcs->guest_ss_base = vmcs12->guest_ss_base;
2031
evmcs->guest_ds_base = vmcs12->guest_ds_base;
2032
evmcs->guest_fs_base = vmcs12->guest_fs_base;
2033
evmcs->guest_gs_base = vmcs12->guest_gs_base;
2034
evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
2035
evmcs->guest_tr_base = vmcs12->guest_tr_base;
2036
evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
2037
evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
2038
2039
evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
2040
evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
2041
2042
evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
2043
evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
2044
evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
2045
evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
2046
2047
evmcs->guest_pending_dbg_exceptions =
2048
vmcs12->guest_pending_dbg_exceptions;
2049
evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
2050
evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
2051
2052
evmcs->guest_activity_state = vmcs12->guest_activity_state;
2053
evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
2054
2055
evmcs->guest_cr0 = vmcs12->guest_cr0;
2056
evmcs->guest_cr3 = vmcs12->guest_cr3;
2057
evmcs->guest_cr4 = vmcs12->guest_cr4;
2058
evmcs->guest_dr7 = vmcs12->guest_dr7;
2059
2060
evmcs->guest_physical_address = vmcs12->guest_physical_address;
2061
2062
evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
2063
evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
2064
evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
2065
evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
2066
evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
2067
evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
2068
evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
2069
evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
2070
2071
evmcs->exit_qualification = vmcs12->exit_qualification;
2072
2073
evmcs->guest_linear_address = vmcs12->guest_linear_address;
2074
evmcs->guest_rsp = vmcs12->guest_rsp;
2075
evmcs->guest_rflags = vmcs12->guest_rflags;
2076
2077
evmcs->guest_interruptibility_info =
2078
vmcs12->guest_interruptibility_info;
2079
evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
2080
evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
2081
evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
2082
evmcs->vm_entry_exception_error_code =
2083
vmcs12->vm_entry_exception_error_code;
2084
evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
2085
2086
evmcs->guest_rip = vmcs12->guest_rip;
2087
2088
evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
2089
2090
return;
2091
#else /* CONFIG_KVM_HYPERV */
2092
KVM_BUG_ON(1, vmx->vcpu.kvm);
2093
#endif /* CONFIG_KVM_HYPERV */
2094
}
2095
2096
/*
2097
* This is an equivalent of the nested hypervisor executing the vmptrld
2098
* instruction.
2099
*/
2100
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
2101
struct kvm_vcpu *vcpu, bool from_launch)
2102
{
2103
#ifdef CONFIG_KVM_HYPERV
2104
struct vcpu_vmx *vmx = to_vmx(vcpu);
2105
bool evmcs_gpa_changed = false;
2106
u64 evmcs_gpa;
2107
2108
if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
2109
return EVMPTRLD_DISABLED;
2110
2111
evmcs_gpa = nested_get_evmptr(vcpu);
2112
if (!evmptr_is_valid(evmcs_gpa)) {
2113
nested_release_evmcs(vcpu);
2114
return EVMPTRLD_DISABLED;
2115
}
2116
2117
if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
2118
vmx->nested.current_vmptr = INVALID_GPA;
2119
2120
nested_release_evmcs(vcpu);
2121
2122
if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2123
&vmx->nested.hv_evmcs_map))
2124
return EVMPTRLD_ERROR;
2125
2126
vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2127
2128
/*
* Currently, KVM only supports eVMCS version 1
* (== KVM_EVMCS_VERSION) and thus expects the guest to set the
* first u32 field of the eVMCS, which specifies the eVMCS
* VersionNumber, to that value.
*
* The guest should learn which eVMCS versions the host supports by
* examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
* expected to set this CPUID leaf according to the value returned
* in vmcs_version from nested_enable_evmcs().
*
* However, it turns out that Microsoft Hyper-V fails to comply
* with its own invented interface: when Hyper-V uses eVMCS, it
* just sets the first u32 field of the eVMCS to the revision_id
* specified in MSR_IA32_VMX_BASIC instead of to one of the
* supported eVMCS version numbers from CPUID.0x4000000A.EAX[0:15].
*
* To work around this Hyper-V bug, accept either a supported eVMCS
* version or the VMCS12 revision_id as valid values for the first
* u32 field of the eVMCS.
*/
2150
if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2151
(vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2152
nested_release_evmcs(vcpu);
2153
return EVMPTRLD_VMFAIL;
2154
}
2155
2156
vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2157
2158
evmcs_gpa_changed = true;
2159
/*
* Unlike a normal vmcs12, an enlightened vmcs12 is not fully
* reloaded from the guest's memory (read-only fields, fields not
* present in struct hv_enlightened_vmcs, ...). Make sure there
* are no leftovers.
*/
2165
if (from_launch) {
2166
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2167
memset(vmcs12, 0, sizeof(*vmcs12));
2168
vmcs12->hdr.revision_id = VMCS12_REVISION;
2169
}
2170
2171
}
2172
2173
/*
2174
* Clean fields data can't be used on VMLAUNCH and when we switch
2175
* between different L2 guests as KVM keeps a single VMCS12 per L1.
2176
*/
2177
if (from_launch || evmcs_gpa_changed) {
2178
vmx->nested.hv_evmcs->hv_clean_fields &=
2179
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2180
2181
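/*
* The merged vmcs02 MSR bitmap is derived from L1's bitmap, so force
* it to be rebuilt; the cached merge can no longer be trusted.
*/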
vmx->nested.force_msr_bitmap_recalc = true;
2182
}
2183
2184
return EVMPTRLD_SUCCEEDED;
2185
#else
2186
return EVMPTRLD_DISABLED;
2187
#endif
2188
}
2189
2190
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2191
{
2192
struct vcpu_vmx *vmx = to_vmx(vcpu);
2193
2194
if (nested_vmx_is_evmptr12_valid(vmx))
2195
copy_vmcs12_to_enlightened(vmx);
2196
else
2197
copy_vmcs12_to_shadow(vmx);
2198
2199
vmx->nested.need_vmcs12_to_shadow_sync = false;
2200
}
2201
2202
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2203
{
2204
struct vcpu_vmx *vmx =
2205
container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2206
2207
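/*
* This callback runs in hrtimer (interrupt) context: just record
* that the emulated VMX-preemption timer expired and kick the vCPU
* so the expiration is handled in vCPU context.
*/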
vmx->nested.preemption_timer_expired = true;
2208
kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2209
kvm_vcpu_kick(&vmx->vcpu);
2210
2211
return HRTIMER_NORESTART;
2212
}
2213
2214
static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
2215
{
2216
struct vcpu_vmx *vmx = to_vmx(vcpu);
2217
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2218
2219
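/*
* The emulated VMX-preemption timer counts in units of 2^5 TSC
* cycles (VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE), so track L1's
* TSC and the saved deadline in those same units.
*/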
u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
2220
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2221
2222
if (!vmx->nested.has_preemption_timer_deadline) {
2223
vmx->nested.preemption_timer_deadline =
2224
vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
2225
vmx->nested.has_preemption_timer_deadline = true;
2226
}
2227
return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
2228
}
2229
2230
static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
2231
u64 preemption_timeout)
2232
{
2233
struct vcpu_vmx *vmx = to_vmx(vcpu);
2234
2235
/*
2236
* A timer value of zero is architecturally guaranteed to cause
2237
* a VMExit prior to executing any instructions in the guest.
2238
*/
2239
if (preemption_timeout == 0) {
2240
vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2241
return;
2242
}
2243
2244
if (vcpu->arch.virtual_tsc_khz == 0)
2245
return;
2246
2247
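/*
* Convert the timer value from 2^5-TSC-cycle units to nanoseconds:
* ns = cycles * 1e6 / tsc_khz. E.g. with a 2 GHz guest TSC
* (virtual_tsc_khz == 2000000), a timer value of 1000 becomes
* 1000 << 5 = 32000 cycles, i.e. 16000 ns.
*/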
preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2248
preemption_timeout *= 1000000;
2249
do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2250
hrtimer_start(&vmx->nested.preemption_timer,
2251
ktime_add_ns(ktime_get(), preemption_timeout),
2252
HRTIMER_MODE_ABS_PINNED);
2253
}
2254
2255
static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2256
{
2257
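/*
* Three cases: if VM-Enter is loading L2's EFER, use vmcs12's value
* verbatim; otherwise derive it from the current EFER, forcing
* LMA/LME set or clear to match vmcs12's "IA-32e mode guest" control.
*/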
if (vmx->nested.nested_run_pending &&
2258
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2259
return vmcs12->guest_ia32_efer;
2260
else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2261
return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2262
else
2263
return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2264
}
2265
2266
static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2267
{
2268
struct kvm *kvm = vmx->vcpu.kvm;
2269
2270
/*
2271
* If vmcs02 hasn't been initialized, set the constant vmcs02 state
2272
* according to L0's settings (vmcs12 is irrelevant here). Host
2273
* fields that come from L0 and are not constant, e.g. HOST_CR3,
2274
* will be set as needed prior to VMLAUNCH/VMRESUME.
2275
*/
2276
if (vmx->nested.vmcs02_initialized)
2277
return;
2278
vmx->nested.vmcs02_initialized = true;
2279
2280
/*
* We don't care what the EPTP value is; we just need to guarantee
* it's valid so we don't get a false positive when doing early
* consistency checks.
*/
2285
if (enable_ept && nested_early_check)
2286
vmcs_write64(EPT_POINTER,
2287
construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
2288
2289
if (vmx->ve_info)
2290
vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
2291
2292
/* All VMFUNCs are currently emulated through L0 vmexits. */
2293
if (cpu_has_vmx_vmfunc())
2294
vmcs_write64(VM_FUNCTION_CONTROL, 0);
2295
2296
if (cpu_has_vmx_posted_intr())
2297
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2298
2299
if (cpu_has_vmx_msr_bitmap())
2300
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2301
2302
/*
2303
* PML is emulated for L2, but never enabled in hardware as the MMU
2304
* handles A/D emulation. Disabling PML for L2 also avoids having to
2305
* deal with filtering out L2 GPAs from the buffer.
2306
*/
2307
if (enable_pml) {
2308
vmcs_write64(PML_ADDRESS, 0);
2309
vmcs_write16(GUEST_PML_INDEX, -1);
2310
}
2311
2312
if (cpu_has_vmx_encls_vmexit())
2313
vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
2314
2315
if (kvm_notify_vmexit_enabled(kvm))
2316
vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
2317
2318
/*
* Set the MSR load/store lists to match L0's settings. Only the
* addresses are constant (for vmcs02); the counts can change based
* on L2's behavior, e.g. switching to/from long mode.
*/
2323
vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2324
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2325
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2326
2327
vmx_set_constant_host_state(vmx);
2328
}
2329
2330
static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2331
struct vmcs12 *vmcs12)
2332
{
2333
prepare_vmcs02_constant_state(vmx);
2334
2335
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
2336
2337
/*
2338
* If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
2339
* same VPID as the host. Emulate this behavior by using vpid01 for L2
2340
* if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
2341
* and VM-Exit are architecturally required to flush VPID=0, but *only*
2342
* VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
2343
* required flushes), but doing so would cause KVM to over-flush. E.g.
2344
* if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
2345
* and then runs L2 X again, then KVM can and should retain TLB entries
2346
* for VPID12=1.
2347
*/
2348
if (enable_vpid) {
2349
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2350
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2351
else
2352
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2353
}
2354
}
2355
2356
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
2357
struct vmcs12 *vmcs12)
2358
{
2359
u32 exec_control;
2360
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2361
2362
if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
2363
prepare_vmcs02_early_rare(vmx, vmcs12);
2364
2365
/*
2366
* PIN CONTROLS
2367
*/
2368
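/*
* Merge L0's (vmcs01) and L1's (vmcs12) pin-based controls. The
* VMX-preemption timer is emulated via an hrtimer (see
* vmx_start_preemption_timer()), so L1's preemption-timer bit is
* never propagated to vmcs02.
*/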
exec_control = __pin_controls_get(vmcs01);
2369
exec_control |= (vmcs12->pin_based_vm_exec_control &
2370
~PIN_BASED_VMX_PREEMPTION_TIMER);
2371
2372
/* Posted interrupts setting is only taken from vmcs12. */
2373
vmx->nested.pi_pending = false;
2374
if (nested_cpu_has_posted_intr(vmcs12)) {
2375
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2376
} else {
2377
vmx->nested.posted_intr_nv = -1;
2378
exec_control &= ~PIN_BASED_POSTED_INTR;
2379
}
2380
pin_controls_set(vmx, exec_control);
2381
2382
/*
2383
* EXEC CONTROLS
2384
*/
2385
exec_control = __exec_controls_get(vmcs01); /* L0's desires */
2386
exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
2387
exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
2388
exec_control &= ~CPU_BASED_TPR_SHADOW;
2389
exec_control |= vmcs12->cpu_based_vm_exec_control;
2390
2391
vmx->nested.l1_tpr_threshold = -1;
2392
if (exec_control & CPU_BASED_TPR_SHADOW)
2393
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2394
#ifdef CONFIG_X86_64
2395
else
2396
exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2397
CPU_BASED_CR8_STORE_EXITING;
2398
#endif
2399
2400
/*
2401
* A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2402
* for I/O port accesses.
2403
*/
2404
exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2405
exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2406
2407
/*
2408
* This bit will be computed in nested_get_vmcs12_pages, because
2409
* we do not have access to L1's MSR bitmap yet. For now, keep
2410
* the same bit as before, hoping to avoid multiple VMWRITEs that
2411
* only set/clear this bit.
2412
*/
2413
exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2414
exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2415
2416
exec_controls_set(vmx, exec_control);
2417
2418
/*
2419
* SECONDARY EXEC CONTROLS
2420
*/
2421
if (cpu_has_secondary_exec_ctrls()) {
2422
exec_control = __secondary_exec_controls_get(vmcs01);
2423
2424
/* Take the following fields only from vmcs12 */
2425
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2426
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2427
SECONDARY_EXEC_ENABLE_INVPCID |
2428
SECONDARY_EXEC_ENABLE_RDTSCP |
2429
SECONDARY_EXEC_ENABLE_XSAVES |
2430
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2431
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2432
SECONDARY_EXEC_APIC_REGISTER_VIRT |
2433
SECONDARY_EXEC_ENABLE_VMFUNC |
2434
SECONDARY_EXEC_DESC);
2435
2436
if (nested_cpu_has(vmcs12,
2437
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
2438
exec_control |= vmcs12->secondary_vm_exec_control;
2439
2440
/* PML is emulated and never enabled in hardware for L2. */
2441
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
2442
2443
/* VMCS shadowing for L2 is emulated for now */
2444
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2445
2446
/*
2447
* Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2448
* will not have to rewrite the controls just for this bit.
2449
*/
2450
if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
2451
exec_control |= SECONDARY_EXEC_DESC;
2452
2453
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2454
vmcs_write16(GUEST_INTR_STATUS,
2455
vmcs12->guest_intr_status);
2456
2457
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2458
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2459
2460
if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2461
vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
2462
2463
secondary_exec_controls_set(vmx, exec_control);
2464
}
2465
2466
/*
2467
* ENTRY CONTROLS
2468
*
2469
* vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2470
* are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2471
* on the related bits (if supported by the CPU) in the hope that
2472
* we can avoid VMWrites during vmx_set_efer().
2473
*
2474
* Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
2475
* loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
2476
* do the same for L2.
2477
*/
2478
exec_control = __vm_entry_controls_get(vmcs01);
2479
exec_control |= (vmcs12->vm_entry_controls &
2480
~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
2481
exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
2482
if (cpu_has_load_ia32_efer()) {
2483
if (guest_efer & EFER_LMA)
2484
exec_control |= VM_ENTRY_IA32E_MODE;
2485
if (guest_efer != kvm_host.efer)
2486
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2487
}
2488
vm_entry_controls_set(vmx, exec_control);
2489
2490
/*
2491
* EXIT CONTROLS
2492
*
2493
* L2->L1 exit controls are emulated - the hardware exit is to L0 so
2494
* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2495
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
2496
*/
2497
exec_control = __vm_exit_controls_get(vmcs01);
2498
if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
2499
exec_control |= VM_EXIT_LOAD_IA32_EFER;
2500
else
2501
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
2502
vm_exit_controls_set(vmx, exec_control);
2503
2504
/*
2505
* Interrupt/Exception Fields
2506
*/
2507
if (vmx->nested.nested_run_pending) {
2508
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2509
vmcs12->vm_entry_intr_info_field);
2510
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2511
vmcs12->vm_entry_exception_error_code);
2512
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2513
vmcs12->vm_entry_instruction_len);
2514
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2515
vmcs12->guest_interruptibility_info);
2516
vmx->loaded_vmcs->nmi_known_unmasked =
2517
!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2518
} else {
2519
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2520
}
2521
}
2522
2523
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2524
{
2525
struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);
2526
2527
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2528
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2529
2530
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2531
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2532
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2533
vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2534
vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2535
vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2536
vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2537
vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2538
vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2539
vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2540
vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2541
vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2542
vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2543
vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2544
vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2545
vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2546
vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2547
vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2548
vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2549
vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2550
vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2551
vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2552
vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2553
vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2554
vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2555
vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2556
vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2557
vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2558
vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2559
vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2560
vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2561
vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2562
vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2563
vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2564
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2565
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2566
2567
vmx_segment_cache_clear(vmx);
2568
}
2569
2570
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2571
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2572
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2573
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2574
vmcs12->guest_pending_dbg_exceptions);
2575
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2576
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2577
2578
/*
* L1 may access L2's PDPTRs, so save them in order to construct
* vmcs12.
*/
2582
if (enable_ept) {
2583
vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2584
vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2585
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2586
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2587
}
2588
2589
if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2590
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2591
vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2592
}
2593
2594
if (nested_cpu_has_xsaves(vmcs12))
2595
vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2596
2597
/*
* Whether page-faults are trapped is determined by a combination of
* 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
* doesn't care about page faults then we should set all of these to
* L1's desires. However, if L0 does care about (some) page faults,
* it is not easy (if at all possible?) to merge L0's and L1's
* desires, so we simply ask to exit on each and every L2 page fault.
* This is done by setting MASK=MATCH=0 and (see below) EB.PF=1.
* Note that below we don't need special code to set EB.PF beyond the
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
*/
2610
if (vmx_need_pf_intercept(&vmx->vcpu)) {
2611
/*
2612
* TODO: if both L0 and L1 need the same MASK and MATCH,
2613
* go ahead and use it?
2614
*/
2615
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2616
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2617
} else {
2618
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2619
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2620
}
2621
2622
if (cpu_has_vmx_apicv()) {
2623
vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2624
vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2625
vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2626
vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2627
}
2628
2629
/*
2630
* Make sure the msr_autostore list is up to date before we set the
2631
* count in the vmcs02.
2632
*/
2633
prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2634
2635
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2636
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2637
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2638
2639
set_cr4_guest_host_mask(vmx);
2640
}
2641
2642
/*
2643
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2644
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2645
* with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2646
* guest in a way that will both be appropriate to L1's requests, and our
2647
* needs. In addition to modifying the active vmcs (which is vmcs02), this
2648
* function also has additional necessary side-effects, like setting various
2649
* vcpu->arch fields.
2650
* Returns 0 on success and a negative error code on failure; on failure, the
* VM-entry failure code is assigned to *entry_failure_code.
*/
2653
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2654
bool from_vmentry,
2655
enum vm_entry_failure_code *entry_failure_code)
2656
{
2657
struct vcpu_vmx *vmx = to_vmx(vcpu);
2658
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
2659
bool load_guest_pdptrs_vmcs12 = false;
2660
2661
if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
2662
prepare_vmcs02_rare(vmx, vmcs12);
2663
vmx->nested.dirty_vmcs12 = false;
2664
2665
load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
2666
!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2667
}
2668
2669
if (vmx->nested.nested_run_pending &&
2670
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2671
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2672
vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
2673
vmx_get_supported_debugctl(vcpu, false));
2674
} else {
2675
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2676
vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
2677
}
2678
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2679
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2680
vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
2681
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2682
2683
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2684
* bitwise-or of what L1 wants to trap for L2, and what we want to
2685
* trap. Note that CR0.TS also needs updating - we do this later.
2686
*/
2687
vmx_update_exception_bitmap(vcpu);
2688
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2689
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2690
2691
if (vmx->nested.nested_run_pending &&
2692
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2693
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2694
vcpu->arch.pat = vmcs12->guest_ia32_pat;
2695
} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2696
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2697
}
2698
2699
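/*
* L2 observes TSC as ((host_tsc * ratio01 + offset01) * ratio12) +
* offset12, so the effective vmcs02 values are
* offset02 = offset01 * ratio12 + offset12 and
* ratio02 = ratio01 * ratio12 (in the hardware's fixed-point
* TSC-scaling format).
*/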
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2700
vcpu->arch.l1_tsc_offset,
2701
vmx_get_l2_tsc_offset(vcpu),
2702
vmx_get_l2_tsc_multiplier(vcpu));
2703
2704
vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2705
vcpu->arch.l1_tsc_scaling_ratio,
2706
vmx_get_l2_tsc_multiplier(vcpu));
2707
2708
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2709
if (kvm_caps.has_tsc_control)
2710
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2711
2712
nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2713
2714
if (nested_cpu_has_ept(vmcs12))
2715
nested_ept_init_mmu_context(vcpu);
2716
2717
/*
2718
* Override the CR0/CR4 read shadows after setting the effective guest
2719
* CR0/CR4. The common helpers also set the shadows, but they don't
2720
* account for vmcs12's cr0/4_guest_host_mask.
2721
*/
2722
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2723
vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2724
2725
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2726
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2727
2728
vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2729
/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2730
vmx_set_efer(vcpu, vcpu->arch.efer);
2731
2732
/*
* If guest state is invalid and unrestricted guest is disabled,
* L1 attempted VMEntry to L2 with invalid state. Fail the
* VMEntry.
*
* However, when force loading the guest state (on SMM exit, or
* when loading nested state after migration), it is possible to
* have invalid guest state at this point; it will be fixed up
* later when the L2 register state is restored.
*/
2742
if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
2743
*entry_failure_code = ENTRY_FAIL_DEFAULT;
2744
return -EINVAL;
2745
}
2746
2747
/* Load vmcs12's CR3, which is used with either EPT or shadow page tables. */
2748
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2749
from_vmentry, entry_failure_code))
2750
return -EINVAL;
2751
2752
/*
2753
* Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2754
* on nested VM-Exit, which can occur without actually running L2 and
2755
* thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2756
* vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2757
* transition to HLT instead of running L2.
2758
*/
2759
if (enable_ept)
2760
vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2761
2762
/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2763
if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2764
is_pae_paging(vcpu)) {
2765
vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2766
vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2767
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2768
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2769
}
2770
2771
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2772
kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
2773
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2774
vmcs12->guest_ia32_perf_global_ctrl))) {
2775
*entry_failure_code = ENTRY_FAIL_DEFAULT;
2776
return -EINVAL;
2777
}
2778
2779
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2780
kvm_rip_write(vcpu, vmcs12->guest_rip);
2781
2782
/*
* It was observed that genuine Hyper-V running in L1 doesn't reset
* 'hv_clean_fields' by itself; it only sets the corresponding dirty
* bits when it changes a field in the eVMCS. Mark all fields as
* clean here.
*/
2788
if (nested_vmx_is_evmptr12_valid(vmx))
2789
evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2790
2791
return 0;
2792
}
2793
2794
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2795
{
2796
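/*
* Per the SDM, "virtual NMIs" requires "NMI exiting", and
* "NMI-window exiting" in turn requires "virtual NMIs".
*/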
if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2797
nested_cpu_has_virtual_nmis(vmcs12)))
2798
return -EINVAL;
2799
2800
if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2801
nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2802
return -EINVAL;
2803
2804
return 0;
2805
}
2806
2807
static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
2808
{
2809
struct vcpu_vmx *vmx = to_vmx(vcpu);
2810
2811
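/*
* EPTP layout: bits 2:0 hold the memory type, bits 5:3 the
* page-walk length minus 1, bit 6 enables accessed/dirty flags,
* bits 11:7 are treated as reserved here, and the upper bits hold
* the physical address of the root table.
*/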
/* Check for memory type validity */
2812
switch (new_eptp & VMX_EPTP_MT_MASK) {
2813
case VMX_EPTP_MT_UC:
2814
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2815
return false;
2816
break;
2817
case VMX_EPTP_MT_WB:
2818
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2819
return false;
2820
break;
2821
default:
2822
return false;
2823
}
2824
2825
/* Page-walk levels validity. */
2826
switch (new_eptp & VMX_EPTP_PWL_MASK) {
2827
case VMX_EPTP_PWL_5:
2828
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
2829
return false;
2830
break;
2831
case VMX_EPTP_PWL_4:
2832
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
2833
return false;
2834
break;
2835
default:
2836
return false;
2837
}
2838
2839
/* Reserved bits should not be set */
2840
if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
2841
return false;
2842
2843
/* AD, if set, should be supported */
2844
if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
2845
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2846
return false;
2847
}
2848
2849
return true;
2850
}
2851
2852
/*
2853
* Checks related to VM-Execution Control Fields
2854
*/
2855
static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2856
struct vmcs12 *vmcs12)
2857
{
2858
struct vcpu_vmx *vmx = to_vmx(vcpu);
2859
2860
if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2861
vmx->nested.msrs.pinbased_ctls_low,
2862
vmx->nested.msrs.pinbased_ctls_high)) ||
2863
CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2864
vmx->nested.msrs.procbased_ctls_low,
2865
vmx->nested.msrs.procbased_ctls_high)))
2866
return -EINVAL;
2867
2868
if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2869
CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2870
vmx->nested.msrs.secondary_ctls_low,
2871
vmx->nested.msrs.secondary_ctls_high)))
2872
return -EINVAL;
2873
2874
if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2875
nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2876
nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2877
nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2878
nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2879
nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2880
nested_vmx_check_nmi_controls(vmcs12) ||
2881
nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2882
nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2883
nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2884
nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2885
CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2886
return -EINVAL;
2887
2888
if (!nested_cpu_has_preemption_timer(vmcs12) &&
2889
nested_cpu_has_save_preemption_timer(vmcs12))
2890
return -EINVAL;
2891
2892
if (nested_cpu_has_ept(vmcs12) &&
2893
CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
2894
return -EINVAL;
2895
2896
if (nested_cpu_has_vmfunc(vmcs12)) {
2897
if (CC(vmcs12->vm_function_control &
2898
~vmx->nested.msrs.vmfunc_controls))
2899
return -EINVAL;
2900
2901
if (nested_cpu_has_eptp_switching(vmcs12)) {
2902
if (CC(!nested_cpu_has_ept(vmcs12)) ||
2903
CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2904
return -EINVAL;
2905
}
2906
}
2907
2908
return 0;
2909
}
2910
2911
/*
2912
* Checks related to VM-Exit Control Fields
2913
*/
2914
static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2915
struct vmcs12 *vmcs12)
2916
{
2917
struct vcpu_vmx *vmx = to_vmx(vcpu);
2918
2919
if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2920
vmx->nested.msrs.exit_ctls_low,
2921
vmx->nested.msrs.exit_ctls_high)) ||
2922
CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2923
return -EINVAL;
2924
2925
return 0;
2926
}
2927
2928
/*
2929
* Checks related to VM-Entry Control Fields
2930
*/
2931
static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2932
struct vmcs12 *vmcs12)
2933
{
2934
struct vcpu_vmx *vmx = to_vmx(vcpu);
2935
2936
if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2937
vmx->nested.msrs.entry_ctls_low,
2938
vmx->nested.msrs.entry_ctls_high)))
2939
return -EINVAL;
2940
2941
/*
2942
* From the Intel SDM, volume 3:
2943
* Fields relevant to VM-entry event injection must be set properly.
2944
* These fields are the VM-entry interruption-information field, the
2945
* VM-entry exception error code, and the VM-entry instruction length.
2946
*/
2947
if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2948
u32 intr_info = vmcs12->vm_entry_intr_info_field;
2949
u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2950
u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2951
bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2952
bool should_have_error_code;
2953
bool urg = nested_cpu_has2(vmcs12,
2954
SECONDARY_EXEC_UNRESTRICTED_GUEST);
2955
bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2956
2957
/* VM-entry interruption-info field: interruption type */
2958
if (CC(intr_type == INTR_TYPE_RESERVED) ||
2959
CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2960
!nested_cpu_supports_monitor_trap_flag(vcpu)))
2961
return -EINVAL;
2962
2963
/* VM-entry interruption-info field: vector */
2964
if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2965
CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2966
CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2967
return -EINVAL;
2968
2969
/* VM-entry interruption-info field: deliver error code */
2970
should_have_error_code =
2971
intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2972
x86_exception_has_error_code(vector);
2973
if (CC(has_error_code != should_have_error_code))
2974
return -EINVAL;
2975
2976
/* VM-entry exception error code */
2977
if (CC(has_error_code &&
2978
vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2979
return -EINVAL;
2980
2981
/* VM-entry interruption-info field: reserved bits */
2982
if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2983
return -EINVAL;
2984
2985
/* VM-entry instruction length */
2986
switch (intr_type) {
2987
case INTR_TYPE_SOFT_EXCEPTION:
2988
case INTR_TYPE_SOFT_INTR:
2989
case INTR_TYPE_PRIV_SW_EXCEPTION:
2990
if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
2991
CC(vmcs12->vm_entry_instruction_len == 0 &&
2992
CC(!nested_cpu_has_zero_length_injection(vcpu))))
2993
return -EINVAL;
2994
}
2995
}
2996
2997
if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2998
return -EINVAL;
2999
3000
return 0;
3001
}
3002
3003
static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
3004
struct vmcs12 *vmcs12)
3005
{
3006
if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
3007
nested_check_vm_exit_controls(vcpu, vmcs12) ||
3008
nested_check_vm_entry_controls(vcpu, vmcs12))
3009
return -EINVAL;
3010
3011
#ifdef CONFIG_KVM_HYPERV
3012
if (guest_cpu_cap_has_evmcs(vcpu))
3013
return nested_evmcs_check_controls(vmcs12);
3014
#endif
3015
3016
return 0;
3017
}
3018
3019
static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
3020
struct vmcs12 *vmcs12)
3021
{
3022
#ifdef CONFIG_X86_64
3023
if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
3024
!!(vcpu->arch.efer & EFER_LMA)))
3025
return -EINVAL;
3026
#endif
3027
return 0;
3028
}
3029
3030
static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
3031
{
3032
/*
* Check that the given linear address is canonical after a VM exit
* from L2, based on the HOST_CR4.LA57 value that will be loaded for L1.
*/
3036
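/*
* E.g. with 4-level paging (LA57 clear), bits 63:48 must be copies
* of bit 47: 0x00007fffffffffff and 0xffff800000000000 are
* canonical, 0x0000800000000000 is not.
*/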
u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;
3037
3038
return !__is_canonical_address(la, l1_address_bits_on_exit);
3039
}
3040
3041
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
3042
struct vmcs12 *vmcs12)
3043
{
3044
bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
3045
3046
if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
3047
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
3048
CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
3049
return -EINVAL;
3050
3051
if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
3052
CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
3053
return -EINVAL;
3054
3055
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
3056
CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
3057
return -EINVAL;
3058
3059
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3060
CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3061
vmcs12->host_ia32_perf_global_ctrl)))
3062
return -EINVAL;
3063
3064
if (ia32e) {
3065
if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
3066
return -EINVAL;
3067
} else {
3068
if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
3069
CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
3070
CC((vmcs12->host_rip) >> 32))
3071
return -EINVAL;
3072
}
3073
3074
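/*
* Host selector checks: the RPL and TI bits must be zero in every
* host selector field, CS and TR cannot be the NULL selector, and
* SS cannot be NULL unless the host will be in 64-bit mode.
*/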
if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3075
CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3076
CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3077
CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3078
CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3079
CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3080
CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3081
CC(vmcs12->host_cs_selector == 0) ||
3082
CC(vmcs12->host_tr_selector == 0) ||
3083
CC(vmcs12->host_ss_selector == 0 && !ia32e))
3084
return -EINVAL;
3085
3086
if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
3087
CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
3088
CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
3089
CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
3090
CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
3091
CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
3092
return -EINVAL;
3093
3094
/*
3095
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
3096
* IA32_EFER MSR must be 0 in the field for that register. In addition,
3097
* the values of the LMA and LME bits in the field must each be that of
3098
* the host address-space size VM-exit control.
3099
*/
3100
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
3101
if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
3102
CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
3103
CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
3104
return -EINVAL;
3105
}
3106
3107
return 0;
3108
}
3109
3110
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
3111
struct vmcs12 *vmcs12)
3112
{
3113
struct vcpu_vmx *vmx = to_vmx(vcpu);
3114
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
3115
struct vmcs_hdr hdr;
3116
3117
if (vmcs12->vmcs_link_pointer == INVALID_GPA)
3118
return 0;
3119
3120
if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
3121
return -EINVAL;
3122
3123
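/*
* Only the shadow VMCS' header is read here: validate its
* revision_id and that its "shadow VMCS" indicator matches vmcs12's
* "VMCS shadowing" control.
*/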
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
3124
CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
3125
vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
3126
return -EINVAL;
3127
3128
if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
3129
offsetof(struct vmcs12, hdr),
3130
sizeof(hdr))))
3131
return -EINVAL;
3132
3133
if (CC(hdr.revision_id != VMCS12_REVISION) ||
3134
CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
3135
return -EINVAL;
3136
3137
return 0;
3138
}
3139
3140
/*
3141
* Checks related to Guest Non-register State
3142
*/
3143
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
3144
{
3145
if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
3146
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
3147
vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
3148
return -EINVAL;
3149
3150
return 0;
3151
}
3152
3153
static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
3154
struct vmcs12 *vmcs12,
3155
enum vm_entry_failure_code *entry_failure_code)
3156
{
3157
bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
3158
3159
*entry_failure_code = ENTRY_FAIL_DEFAULT;
3160
3161
if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
3162
CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
3163
return -EINVAL;
3164
3165
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
3166
(CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
3167
CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
3168
return -EINVAL;
3169
3170
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
3171
CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
3172
return -EINVAL;
3173
3174
if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
3175
*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
3176
return -EINVAL;
3177
}
3178
3179
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3180
CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3181
vmcs12->guest_ia32_perf_global_ctrl)))
3182
return -EINVAL;
3183
3184
if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
3185
return -EINVAL;
3186
3187
if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
3188
CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
3189
return -EINVAL;
3190
3191
/*
3192
* If the load IA32_EFER VM-entry control is 1, the following checks
3193
* are performed on the field for the IA32_EFER MSR:
3194
* - Bits reserved in the IA32_EFER MSR must be 0.
3195
* - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3196
* the IA-32e mode guest VM-exit control. It must also be identical
3197
* to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3198
* CR0.PG) is 1.
3199
*/
3200
if (to_vmx(vcpu)->nested.nested_run_pending &&
3201
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3202
if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3203
CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3204
CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3205
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
3206
return -EINVAL;
3207
}
3208
3209
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
3210
(CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3211
CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
3212
return -EINVAL;
3213
3214
if (nested_check_guest_non_reg_state(vmcs12))
3215
return -EINVAL;
3216
3217
return 0;
3218
}
3219
3220
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
3221
{
3222
struct vcpu_vmx *vmx = to_vmx(vcpu);
3223
unsigned long cr3, cr4;
3224
bool vm_fail;
3225
3226
if (!nested_early_check)
3227
return 0;
3228
3229
if (vmx->msr_autoload.host.nr)
3230
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3231
if (vmx->msr_autoload.guest.nr)
3232
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3233
3234
preempt_disable();
3235
3236
vmx_prepare_switch_to_guest(vcpu);
3237
3238
/*
3239
* Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3240
* which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
3241
* be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3242
* there is no need to preserve other bits or save/restore the field.
3243
*/
3244
vmcs_writel(GUEST_RFLAGS, 0);
3245
3246
cr3 = __get_current_cr3_fast();
3247
if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3248
vmcs_writel(HOST_CR3, cr3);
3249
vmx->loaded_vmcs->host_state.cr3 = cr3;
3250
}
3251
3252
cr4 = cr4_read_shadow();
3253
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3254
vmcs_writel(HOST_CR4, cr4);
3255
vmx->loaded_vmcs->host_state.cr4 = cr4;
3256
}
3257
3258
vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3259
__vmx_vcpu_run_flags(vmx));
3260
3261
if (vmx->msr_autoload.host.nr)
3262
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3263
if (vmx->msr_autoload.guest.nr)
3264
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3265
3266
if (vm_fail) {
3267
u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3268
3269
preempt_enable();
3270
3271
trace_kvm_nested_vmenter_failed(
3272
"early hardware check VM-instruction error: ", error);
3273
WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3274
return 1;
3275
}
3276
3277
/*
3278
* VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3279
*/
3280
if (hw_breakpoint_active())
3281
set_debugreg(__this_cpu_read(cpu_dr7), 7);
3282
local_irq_enable();
3283
preempt_enable();
3284
3285
/*
3286
* A non-failing VMEntry means we somehow entered guest mode with
3287
* an illegal RIP, and that's just the tip of the iceberg. There
3288
* is no telling what memory has been modified or what state has
3289
* been exposed to unknown code. Hitting this all but guarantees
3290
* a (very critical) hardware issue.
3291
*/
3292
WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3293
VMX_EXIT_REASONS_FAILED_VMENTRY));
3294
3295
return 0;
3296
}
3297
3298
#ifdef CONFIG_KVM_HYPERV
3299
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
3300
{
3301
struct vcpu_vmx *vmx = to_vmx(vcpu);
3302
3303
/*
3304
* hv_evmcs may not be mapped after migration (when
3305
* L2 was running), map it here to make sure vmcs12 changes are
3306
* properly reflected.
3307
*/
3308
if (guest_cpu_cap_has_evmcs(vcpu) &&
3309
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
3310
enum nested_evmptrld_status evmptrld_status =
3311
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3312
3313
if (evmptrld_status == EVMPTRLD_VMFAIL ||
3314
evmptrld_status == EVMPTRLD_ERROR)
3315
return false;
3316
3317
/*
3318
* Post migration, VMCS12 always provides the most up-to-date
3319
* information; copy it to the eVMCS upon entry.
3320
*/
3321
vmx->nested.need_vmcs12_to_shadow_sync = true;
3322
}
3323
3324
return true;
3325
}
3326
#endif
3327
3328
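/*
* Map the guest memory referenced by vmcs12 (APIC-access page, virtual-APIC
* page, posted-interrupt descriptor) and point vmcs02 at the new mappings,
* adjusting the relevant controls if a mapping can't be established.  Returns
* false if a required mapping or the guest's PDPTRs can't be loaded, in which
* case the VM-Enter is aborted with an internal error.
*/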
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3329
{
3330
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3331
struct vcpu_vmx *vmx = to_vmx(vcpu);
3332
struct kvm_host_map *map;
3333
3334
if (!vcpu->arch.pdptrs_from_userspace &&
3335
!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3336
/*
3337
* Reload the guest's PDPTRs since after a migration
3338
* the guest CR3 might be restored prior to setting the nested
3339
* state which can lead to a load of wrong PDPTRs.
3340
*/
3341
if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
3342
return false;
3343
}
3344
3345
3346
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3347
map = &vmx->nested.apic_access_page_map;
3348
3349
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
3350
vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
3351
} else {
3352
pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
3353
__func__);
3354
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3355
vcpu->run->internal.suberror =
3356
KVM_INTERNAL_ERROR_EMULATION;
3357
vcpu->run->internal.ndata = 0;
3358
return false;
3359
}
3360
}
3361
3362
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3363
map = &vmx->nested.virtual_apic_map;
3364
3365
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3366
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3367
} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3368
nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3369
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3370
/*
3371
* The processor will never use the TPR shadow; simply
3372
* clear the bit from the execution control. Such a
3373
* configuration is useless, but it happens in tests.
3374
* For any other configuration, failing the VM-Entry is
3375
* _not_ what the processor does but it's basically the
3376
* only possibility we have.
3377
*/
3378
exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3379
} else {
3380
/*
3381
* Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3382
* force VM-Entry to fail.
3383
*/
3384
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
3385
}
3386
}
3387
3388
if (nested_cpu_has_posted_intr(vmcs12)) {
3389
map = &vmx->nested.pi_desc_map;
3390
3391
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3392
vmx->nested.pi_desc =
3393
(struct pi_desc *)(((void *)map->hva) +
3394
offset_in_page(vmcs12->posted_intr_desc_addr));
3395
vmcs_write64(POSTED_INTR_DESC_ADDR,
3396
pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3397
} else {
3398
/*
3399
* Defer the KVM_INTERNAL_EXIT until KVM tries to
3400
* access the contents of the VMCS12 posted interrupt
3401
* descriptor. (Note that KVM may do this when it
3402
* should not, per the architectural specification.)
3403
*/
3404
vmx->nested.pi_desc = NULL;
3405
pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
3406
}
3407
}
3408
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3409
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3410
else
3411
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3412
3413
return true;
3414
}
3415
3416
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3417
{
3418
#ifdef CONFIG_KVM_HYPERV
3419
/*
3420
* Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
3421
* in 'struct kvm_vcpu_hv' in case eVMCS is in use; this is mandatory
3422
* to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
3423
* migration.
3424
*/
3425
if (!nested_get_evmcs_page(vcpu)) {
3426
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3427
__func__);
3428
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3429
vcpu->run->internal.suberror =
3430
KVM_INTERNAL_ERROR_EMULATION;
3431
vcpu->run->internal.ndata = 0;
3432
3433
return false;
3434
}
3435
#endif
3436
3437
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3438
return false;
3439
3440
return true;
3441
}
3442
3443
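/*
* Emulate PML for L1: record a GPA dirtied by L2 in L1's PML buffer and
* decrement vmcs12's guest PML index.  Returns 1 with nested.pml_full set if
* L1's buffer is exhausted, i.e. if a PML-full VM-Exit is due.
*/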
static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3444
{
3445
struct vmcs12 *vmcs12;
3446
struct vcpu_vmx *vmx = to_vmx(vcpu);
3447
gpa_t dst;
3448
3449
if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3450
return 0;
3451
3452
if (WARN_ON_ONCE(vmx->nested.pml_full))
3453
return 1;
3454
3455
/*
3456
* Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3457
* set is already checked as part of A/D emulation.
3458
*/
3459
vmcs12 = get_vmcs12(vcpu);
3460
if (!nested_cpu_has_pml(vmcs12))
3461
return 0;
3462
3463
if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
3464
vmx->nested.pml_full = true;
3465
return 1;
3466
}
3467
3468
gpa &= ~0xFFFull;
3469
dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3470
3471
if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3472
offset_in_page(dst), sizeof(gpa)))
3473
return 0;
3474
3475
vmcs12->guest_pml_index--;
3476
3477
return 0;
3478
}
3479
3480
/*
3481
* Intel's VMX Instruction Reference specifies a common set of prerequisites
3482
* for running VMX instructions (except VMXON, whose prerequisites are
3483
* slightly different). It also specifies what exception to inject otherwise.
3484
* Note that many of these exceptions have priority over VM exits, so they
3485
* don't have to be checked again here.
3486
*/
3487
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3488
{
3489
if (!to_vmx(vcpu)->nested.vmxon) {
3490
kvm_queue_exception(vcpu, UD_VECTOR);
3491
return 0;
3492
}
3493
3494
if (vmx_get_cpl(vcpu)) {
3495
kvm_inject_gp(vcpu, 0);
3496
return 0;
3497
}
3498
3499
return 1;
3500
}
3501
3502
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3503
struct vmcs12 *vmcs12);
3504
3505
/*
3506
* If from_vmentry is false, this is being called from state restore (either RSM
3507
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
3508
*
3509
* Returns:
3510
* NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3511
* NVMX_VMENTRY_VMFAIL: Consistency check VMFail
3512
* NVMX_VMENTRY_VMEXIT: Consistency check VMExit
3513
* NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3514
*/
3515
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3516
bool from_vmentry)
3517
{
3518
struct vcpu_vmx *vmx = to_vmx(vcpu);
3519
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3520
enum vm_entry_failure_code entry_failure_code;
3521
union vmx_exit_reason exit_reason = {
3522
.basic = EXIT_REASON_INVALID_STATE,
3523
.failed_vmentry = 1,
3524
};
3525
u32 failed_index;
3526
3527
trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
3528
vmx->nested.current_vmptr,
3529
vmcs12->guest_rip,
3530
vmcs12->guest_intr_status,
3531
vmcs12->vm_entry_intr_info_field,
3532
vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
3533
vmcs12->ept_pointer,
3534
vmcs12->guest_cr3,
3535
KVM_ISA_VMX);
3536
3537
kvm_service_local_tlb_flush_requests(vcpu);
3538
3539
if (!vmx->nested.nested_run_pending ||
3540
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3541
vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
3542
if (kvm_mpx_supported() &&
3543
(!vmx->nested.nested_run_pending ||
3544
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
3545
vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3546
3547
/*
3548
* Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3549
* nested early checks are disabled. In the event of a "late" VM-Fail,
3550
* i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3551
* software model to the pre-VMEntry host state. When EPT is disabled,
3552
* GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3553
* nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3554
* vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3555
* the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3556
* VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3557
* guaranteed to be overwritten with a shadow CR3 prior to re-entering
3558
* L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3559
* KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3560
* pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3561
* path would need to manually save/restore vmcs01.GUEST_CR3.
3562
*/
3563
if (!enable_ept && !nested_early_check)
3564
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3565
3566
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3567
3568
prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
3569
3570
if (from_vmentry) {
3571
if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3572
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3573
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3574
}
3575
3576
if (nested_vmx_check_vmentry_hw(vcpu)) {
3577
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3578
return NVMX_VMENTRY_VMFAIL;
3579
}
3580
3581
if (nested_vmx_check_guest_state(vcpu, vmcs12,
3582
&entry_failure_code)) {
3583
exit_reason.basic = EXIT_REASON_INVALID_STATE;
3584
vmcs12->exit_qualification = entry_failure_code;
3585
goto vmentry_fail_vmexit;
3586
}
3587
}
3588
3589
enter_guest_mode(vcpu);
3590
3591
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
3592
exit_reason.basic = EXIT_REASON_INVALID_STATE;
3593
vmcs12->exit_qualification = entry_failure_code;
3594
goto vmentry_fail_vmexit_guest_mode;
3595
}
3596
3597
if (from_vmentry) {
3598
failed_index = nested_vmx_load_msr(vcpu,
3599
vmcs12->vm_entry_msr_load_addr,
3600
vmcs12->vm_entry_msr_load_count);
3601
if (failed_index) {
3602
exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3603
vmcs12->exit_qualification = failed_index;
3604
goto vmentry_fail_vmexit_guest_mode;
3605
}
3606
} else {
3607
/*
3608
* The MMU is not initialized to point at the right entities yet and
3609
* "get pages" would need to read data from the guest (i.e. we will
3610
* need to perform gpa to hpa translation). Request a call
3611
* to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3612
* have already been set at vmentry time and should not be reset.
3613
*/
3614
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3615
}
3616
3617
/*
3618
* Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
3619
* when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
3620
* effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
3621
* unconditionally. Take care to pull data from vmcs01 as appropriate,
3622
* e.g. when checking for interrupt windows, as vmcs02 is now loaded.
3623
*/
3624
if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
3625
CPU_BASED_NMI_WINDOW_EXITING)) ||
3626
kvm_apic_has_pending_init_or_sipi(vcpu) ||
3627
kvm_apic_has_interrupt(vcpu))
3628
kvm_make_request(KVM_REQ_EVENT, vcpu);
3629
3630
/*
3631
* Do not start the preemption timer hrtimer until after we know
3632
* we are successful, so that only nested_vmx_vmexit needs to cancel
3633
* the timer.
3634
*/
3635
vmx->nested.preemption_timer_expired = false;
3636
if (nested_cpu_has_preemption_timer(vmcs12)) {
3637
u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3638
vmx_start_preemption_timer(vcpu, timer_value);
3639
}
3640
3641
/*
3642
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3643
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3644
* returned as far as L1 is concerned. It will only return (and set
3645
* the success flag) when L2 exits (see nested_vmx_vmexit()).
3646
*/
3647
return NVMX_VMENTRY_SUCCESS;
3648
3649
/*
3650
* A failed consistency check that leads to a VMExit during L1's
3651
* VMEnter to L2 is a variation of a normal VMexit, as explained in
3652
* 26.7 "VM-entry failures during or after loading guest state".
3653
*/
3654
vmentry_fail_vmexit_guest_mode:
3655
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3656
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3657
leave_guest_mode(vcpu);
3658
3659
vmentry_fail_vmexit:
3660
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3661
3662
if (!from_vmentry)
3663
return NVMX_VMENTRY_VMEXIT;
3664
3665
load_vmcs12_host_state(vcpu, vmcs12);
3666
vmcs12->vm_exit_reason = exit_reason.full;
3667
if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
3668
vmx->nested.need_vmcs12_to_shadow_sync = true;
3669
return NVMX_VMENTRY_VMEXIT;
3670
}
3671
3672
/*
3673
* nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3674
* for running an L2 nested guest.
3675
*/
3676
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3677
{
3678
struct vmcs12 *vmcs12;
3679
enum nvmx_vmentry_status status;
3680
struct vcpu_vmx *vmx = to_vmx(vcpu);
3681
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3682
enum nested_evmptrld_status evmptrld_status;
3683
3684
if (!nested_vmx_check_permission(vcpu))
3685
return 1;
3686
3687
evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3688
if (evmptrld_status == EVMPTRLD_ERROR) {
3689
kvm_queue_exception(vcpu, UD_VECTOR);
3690
return 1;
3691
}
3692
3693
kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
3694
3695
if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
3696
return nested_vmx_failInvalid(vcpu);
3697
3698
if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
3699
vmx->nested.current_vmptr == INVALID_GPA))
3700
return nested_vmx_failInvalid(vcpu);
3701
3702
vmcs12 = get_vmcs12(vcpu);
3703
3704
/*
3705
* Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3706
* that there *is* a valid VMCS pointer, RFLAGS.CF is set
3707
* rather than RFLAGS.ZF, and no error number is stored to the
3708
* VM-instruction error field.
3709
*/
3710
if (CC(vmcs12->hdr.shadow_vmcs))
3711
return nested_vmx_failInvalid(vcpu);
3712
3713
if (nested_vmx_is_evmptr12_valid(vmx)) {
3714
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
3715
3716
copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
3717
/* Enlightened VMCS doesn't have launch state */
3718
vmcs12->launch_state = !launch;
3719
} else if (enable_shadow_vmcs) {
3720
copy_shadow_to_vmcs12(vmx);
3721
}
3722
3723
/*
3724
* The nested entry process starts with enforcing various prerequisites
3725
* on vmcs12 as required by the Intel SDM, and acts appropriately when
3726
* they fail: As the SDM explains, some conditions should cause the
3727
* instruction to fail, while others will cause the instruction to seem
3728
* to succeed, but return an EXIT_REASON_INVALID_STATE.
3729
* To speed up the normal (success) code path, we should avoid checking
3730
* for misconfigurations which will anyway be caught by the processor
3731
* when using the merged vmcs02.
3732
*/
3733
if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
3734
return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3735
3736
if (CC(vmcs12->launch_state == launch))
3737
return nested_vmx_fail(vcpu,
3738
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3739
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3740
3741
if (nested_vmx_check_controls(vcpu, vmcs12))
3742
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3743
3744
if (nested_vmx_check_address_space_size(vcpu, vmcs12))
3745
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3746
3747
if (nested_vmx_check_host_state(vcpu, vmcs12))
3748
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3749
3750
/*
3751
* We're finally done with prerequisite checking, and can start with
3752
* the nested entry.
3753
*/
3754
vmx->nested.nested_run_pending = 1;
3755
vmx->nested.has_preemption_timer_deadline = false;
3756
status = nested_vmx_enter_non_root_mode(vcpu, true);
3757
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3758
goto vmentry_failed;
3759
3760
/* Hide L1D cache contents from the nested guest. */
3761
vmx->vcpu.arch.l1tf_flush_l1d = true;
3762
3763
/*
3764
* Must happen outside of nested_vmx_enter_non_root_mode() as it will
3765
* also be used as part of restoring nVMX state for
3766
* snapshot restore (migration).
3767
*
3768
* In this flow, it is assumed that the vmcs12 cache was
3769
* transferred as part of captured nVMX state and should
3770
* therefore not be read from guest memory (which may not
3771
* exist on destination host yet).
3772
*/
3773
nested_cache_shadow_vmcs12(vcpu, vmcs12);
3774
3775
switch (vmcs12->guest_activity_state) {
3776
case GUEST_ACTIVITY_HLT:
3777
/*
3778
* If we're entering a halted L2 vcpu and the L2 vcpu won't be
3779
* awakened by event injection or by an NMI-window VM-exit or
3780
* by an interrupt-window VM-exit, halt the vcpu.
3781
*/
3782
if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3783
!nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3784
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3785
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3786
vmx->nested.nested_run_pending = 0;
3787
return kvm_emulate_halt_noskip(vcpu);
3788
}
3789
break;
3790
case GUEST_ACTIVITY_WAIT_SIPI:
3791
vmx->nested.nested_run_pending = 0;
3792
kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
3793
break;
3794
default:
3795
break;
3796
}
3797
3798
return 1;
3799
3800
vmentry_failed:
3801
vmx->nested.nested_run_pending = 0;
3802
if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3803
return 0;
3804
if (status == NVMX_VMENTRY_VMEXIT)
3805
return 1;
3806
WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3807
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3808
}
3809
3810
/*
3811
* On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3812
* because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3813
* This function returns the new value we should put in vmcs12.guest_cr0.
3814
* It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3815
* 1. Bits that neither L0 nor L1 trapped were set directly by L2 and are now
3816
* available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3817
* didn't trap the bit, because if L1 did, so would L0).
3818
* 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3819
* been modified by L2, and L1 knows it. So just leave the old value of
3820
* the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3821
* isn't relevant, because if L0 traps this bit it can set it to anything.
3822
* 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3823
* changed these bits, and therefore they need to be updated, but L0
3824
* didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3825
* put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3826
*/
3827
static inline unsigned long
3828
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3829
{
3830
return
3831
/*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3832
/*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3833
/*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3834
vcpu->arch.cr0_guest_owned_bits));
3835
}
3836
3837
static inline unsigned long
3838
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3839
{
3840
return
3841
/*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3842
/*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3843
/*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3844
vcpu->arch.cr4_guest_owned_bits));
3845
}
3846
3847
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3848
struct vmcs12 *vmcs12,
3849
u32 vm_exit_reason, u32 exit_intr_info)
3850
{
3851
u32 idt_vectoring;
3852
unsigned int nr;
3853
3854
/*
3855
* Per the SDM, VM-Exits due to double and triple faults are never
3856
* considered to occur during event delivery, even if the double/triple
3857
* fault is the result of an escalating vectoring issue.
3858
*
3859
* Note, the SDM qualifies the double fault behavior with "The original
3860
* event results in a double-fault exception". It's unclear why the
3861
* qualification exists since exits due to double fault can occur only
3862
* while vectoring a different exception (injected events are never
3863
* subject to interception), i.e. there's _always_ an original event.
3864
*
3865
* The SDM also uses NMI as a confusing example for the "original event
3866
* causes the VM exit directly" clause. NMI isn't special in any way,
3867
* the same rule applies to all events that cause an exit directly.
3868
* NMI is an odd choice for the example because NMIs can only occur on
3869
* instruction boundaries, i.e. they _can't_ occur during vectoring.
3870
*/
3871
if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
3872
((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
3873
is_double_fault(exit_intr_info))) {
3874
vmcs12->idt_vectoring_info_field = 0;
3875
} else if (vcpu->arch.exception.injected) {
3876
nr = vcpu->arch.exception.vector;
3877
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3878
3879
if (kvm_exception_is_soft(nr)) {
3880
vmcs12->vm_exit_instruction_len =
3881
vcpu->arch.event_exit_inst_len;
3882
idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3883
} else
3884
idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3885
3886
if (vcpu->arch.exception.has_error_code) {
3887
idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3888
vmcs12->idt_vectoring_error_code =
3889
vcpu->arch.exception.error_code;
3890
}
3891
3892
vmcs12->idt_vectoring_info_field = idt_vectoring;
3893
} else if (vcpu->arch.nmi_injected) {
3894
vmcs12->idt_vectoring_info_field =
3895
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3896
} else if (vcpu->arch.interrupt.injected) {
3897
nr = vcpu->arch.interrupt.nr;
3898
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3899
3900
if (vcpu->arch.interrupt.soft) {
3901
idt_vectoring |= INTR_TYPE_SOFT_INTR;
3902
vmcs12->vm_entry_instruction_len =
3903
vcpu->arch.event_exit_inst_len;
3904
} else
3905
idt_vectoring |= INTR_TYPE_EXT_INTR;
3906
3907
vmcs12->idt_vectoring_info_field = idt_vectoring;
3908
} else {
3909
vmcs12->idt_vectoring_info_field = 0;
3910
}
3911
}
3912
3913
3914
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3915
{
3916
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3917
gfn_t gfn;
3918
3919
/*
3920
* Don't need to mark the APIC access page dirty; it is never
3921
* written to by the CPU during APIC virtualization.
3922
*/
3923
3924
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3925
gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3926
kvm_vcpu_mark_page_dirty(vcpu, gfn);
3927
}
3928
3929
if (nested_cpu_has_posted_intr(vmcs12)) {
3930
gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3931
kvm_vcpu_mark_page_dirty(vcpu, gfn);
3932
}
3933
}
3934
3935
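/*
* Process posted interrupts that accrued for L2: move pending vectors from the
* posted-interrupt descriptor's PIR into L2's virtual-APIC page and raise RVI
* in vmcs02 so the highest priority interrupt is delivered to L2.  Fails with
* -ENXIO if the descriptor or the virtual-APIC page isn't backed by a mapping.
*/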
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3936
{
3937
struct vcpu_vmx *vmx = to_vmx(vcpu);
3938
int max_irr;
3939
void *vapic_page;
3940
u16 status;
3941
3942
if (!vmx->nested.pi_pending)
3943
return 0;
3944
3945
if (!vmx->nested.pi_desc)
3946
goto mmio_needed;
3947
3948
vmx->nested.pi_pending = false;
3949
3950
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3951
return 0;
3952
3953
max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
3954
if (max_irr > 0) {
3955
vapic_page = vmx->nested.virtual_apic_map.hva;
3956
if (!vapic_page)
3957
goto mmio_needed;
3958
3959
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3960
vapic_page, &max_irr);
3961
status = vmcs_read16(GUEST_INTR_STATUS);
3962
if ((u8)max_irr > ((u8)status & 0xff)) {
3963
status &= ~0xff;
3964
status |= (u8)max_irr;
3965
vmcs_write16(GUEST_INTR_STATUS, status);
3966
}
3967
}
3968
3969
nested_mark_vmcs12_pages_dirty(vcpu);
3970
return 0;
3971
3972
mmio_needed:
3973
kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
3974
return -ENXIO;
3975
}
3976
3977
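/*
* Synthesize an EXCEPTION_NMI VM-Exit to L1 for a pending exception that L1
* intercepts, deriving the exit qualification from the exception payload
* (CR2 for #PF, DR6 for #DB).
*/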
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
3978
{
3979
struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
3980
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
3981
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3982
unsigned long exit_qual;
3983
3984
if (ex->has_payload) {
3985
exit_qual = ex->payload;
3986
} else if (ex->vector == PF_VECTOR) {
3987
exit_qual = vcpu->arch.cr2;
3988
} else if (ex->vector == DB_VECTOR) {
3989
exit_qual = vcpu->arch.dr6;
3990
exit_qual &= ~DR6_BT;
3991
exit_qual ^= DR6_ACTIVE_LOW;
3992
} else {
3993
exit_qual = 0;
3994
}
3995
3996
/*
3997
* Unlike AMD's Paged Real Mode, which reports an error code on #PF
3998
* VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
3999
* "has error code" flags on VM-Exit if the CPU is in Real Mode.
4000
*/
4001
if (ex->has_error_code && is_protmode(vcpu)) {
4002
/*
4003
* Intel CPUs do not generate error codes with bits 31:16 set,
4004
* and more importantly VMX disallows setting bits 31:16 in the
4005
* injected error code for VM-Entry. Drop the bits to mimic
4006
* hardware and avoid inducing failure on nested VM-Entry if L1
4007
* chooses to inject the exception back to L2. AMD CPUs _do_
4008
* generate "full" 32-bit error codes, so KVM allows userspace
4009
* to inject exception error codes with bits 31:16 set.
4010
*/
4011
vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
4012
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
4013
}
4014
4015
if (kvm_exception_is_soft(ex->vector))
4016
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
4017
else
4018
intr_info |= INTR_TYPE_HARD_EXCEPTION;
4019
4020
if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
4021
vmx_get_nmi_mask(vcpu))
4022
intr_info |= INTR_INFO_UNBLOCK_NMI;
4023
4024
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
4025
}
4026
4027
/*
4028
* Returns true if a debug trap is (likely) pending delivery. Infer the class
4029
* of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
4030
* Using the payload is flawed because code breakpoints (fault-like) and data
4031
* breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
4032
* this will return false positives if a to-be-injected code breakpoint #DB is
4033
* pending (from KVM's perspective, but not "pending" across an instruction
4034
* boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
4035
* too is trap-like.
4036
*
4037
* KVM "works" despite these flaws as ICEBP isn't currently supported by the
4038
* emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
4039
* #DB has already happened), and MTF isn't marked pending on code breakpoints
4040
* from the emulator (because such #DBs are fault-like and thus don't trigger
4041
* actions that fire on instruction retire).
4042
*/
4043
static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
4044
{
4045
if (!ex->pending || ex->vector != DB_VECTOR)
4046
return 0;
4047
4048
/* General Detect #DBs are always fault-like. */
4049
return ex->payload & ~DR6_BD;
4050
}
4051
4052
/*
4053
* Returns true if there's a pending #DB exception that is lower priority than
4054
* a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
4055
* KVM, but could theoretically be injected by userspace. Note, this code is
4056
* imperfect, see above.
4057
*/
4058
static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
4059
{
4060
return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
4061
}
4062
4063
/*
4064
* Certain VM-exits set the 'pending debug exceptions' field to indicate a
4065
* recognized #DB (data or single-step) that has yet to be delivered. Since KVM
4066
* represents these debug traps with a payload that is said to be compatible
4067
* with the 'pending debug exceptions' field, write the payload to the VMCS
4068
* field if a VM-exit is delivered before the debug trap.
4069
*/
4070
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
4071
{
4072
unsigned long pending_dbg;
4073
4074
pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
4075
if (pending_dbg)
4076
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
4077
}
4078
4079
static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
4080
{
4081
return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
4082
to_vmx(vcpu)->nested.preemption_timer_expired;
4083
}
4084
4085
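/*
* Returns true if an event specific to VMX non-root operation needs attention
* before (re)entering L2, e.g. an expired preemption timer, a pending MTF
* VM-Exit, or a virtual interrupt that outranks the vPPR.
*/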
static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
4086
{
4087
struct vcpu_vmx *vmx = to_vmx(vcpu);
4088
void *vapic = vmx->nested.virtual_apic_map.hva;
4089
int max_irr, vppr;
4090
4091
if (nested_vmx_preemption_timer_pending(vcpu) ||
4092
vmx->nested.mtf_pending)
4093
return true;
4094
4095
/*
4096
* Virtual Interrupt Delivery doesn't require manual injection. Either
4097
* the interrupt is already in GUEST_RVI and will be recognized by CPU
4098
* at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
4099
* the interrupt from the PIR to RVI prior to entering the guest.
4100
*/
4101
if (for_injection)
4102
return false;
4103
4104
if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4105
__vmx_interrupt_blocked(vcpu))
4106
return false;
4107
4108
if (!vapic)
4109
return false;
4110
4111
vppr = *((u32 *)(vapic + APIC_PROCPRI));
4112
4113
max_irr = vmx_get_rvi();
4114
if ((max_irr & 0xf0) > (vppr & 0xf0))
4115
return true;
4116
4117
if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
4118
pi_test_on(vmx->nested.pi_desc)) {
4119
max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
4120
if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
4121
return true;
4122
}
4123
4124
return false;
4125
}
4126
4127
/*
4128
* Per the Intel SDM's table "Priority Among Concurrent Events", with minor
4129
* edits to fill in missing examples, e.g. #DB due to split-lock accesses,
4130
* and less minor edits to splice in the priority of VMX Non-Root specific
4131
* events, e.g. MTF and NMI/INTR-window exiting.
4132
*
4133
* 1 Hardware Reset and Machine Checks
4134
* - RESET
4135
* - Machine Check
4136
*
4137
* 2 Trap on Task Switch
4138
* - T flag in TSS is set (on task switch)
4139
*
4140
* 3 External Hardware Interventions
4141
* - FLUSH
4142
* - STOPCLK
4143
* - SMI
4144
* - INIT
4145
*
4146
* 3.5 Monitor Trap Flag (MTF) VM-exit[1]
4147
*
4148
* 4 Traps on Previous Instruction
4149
* - Breakpoints
4150
* - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
4151
* breakpoint, or #DB due to a split-lock access)
4152
*
4153
* 4.3 VMX-preemption timer expired VM-exit
4154
*
4155
* 4.6 NMI-window exiting VM-exit[2]
4156
*
4157
* 5 Nonmaskable Interrupts (NMI)
4158
*
4159
* 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery
4160
*
4161
* 6 Maskable Hardware Interrupts
4162
*
4163
* 7 Code Breakpoint Fault
4164
*
4165
* 8 Faults from Fetching Next Instruction
4166
* - Code-Segment Limit Violation
4167
* - Code Page Fault
4168
* - Control protection exception (missing ENDBRANCH at target of indirect
4169
* call or jump)
4170
*
4171
* 9 Faults from Decoding Next Instruction
4172
* - Instruction length > 15 bytes
4173
* - Invalid Opcode
4174
* - Coprocessor Not Available
4175
*
4176
*10 Faults on Executing Instruction
4177
* - Overflow
4178
* - Bound error
4179
* - Invalid TSS
4180
* - Segment Not Present
4181
* - Stack fault
4182
* - General Protection
4183
* - Data Page Fault
4184
* - Alignment Check
4185
* - x86 FPU Floating-point exception
4186
* - SIMD floating-point exception
4187
* - Virtualization exception
4188
* - Control protection exception
4189
*
4190
* [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
4191
* INIT signals, and higher priority events take priority over MTF VM exits.
4192
* MTF VM exits take priority over debug-trap exceptions and lower priority
4193
* events.
4194
*
4195
* [2] Debug-trap exceptions and higher priority events take priority over VM exits
4196
* caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
4197
* timer take priority over VM exits caused by the "NMI-window exiting"
4198
* VM-execution control and lower priority events.
4199
*
4200
* [3] Debug-trap exceptions and higher priority events take priority over VM exits
4201
* caused by "NMI-window exiting". VM exits caused by this control take
4202
* priority over non-maskable interrupts (NMIs) and lower priority events.
4203
*
4204
* [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
4205
* the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
4206
* non-maskable interrupts (NMIs) and higher priority events take priority over
4207
* delivery of a virtual interrupt; delivery of a virtual interrupt takes
4208
* priority over external interrupts and lower priority events.
4209
*/
4210
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
4211
{
4212
struct kvm_lapic *apic = vcpu->arch.apic;
4213
struct vcpu_vmx *vmx = to_vmx(vcpu);
4214
/*
4215
* Only a pending nested run blocks a pending exception. If there is a
4216
* previously injected event, the pending exception occurred while said
4217
* event was being delivered and thus needs to be handled.
4218
*/
4219
bool block_nested_exceptions = vmx->nested.nested_run_pending;
4220
/*
4221
* Events that don't require injection, i.e. that are virtualized by
4222
* hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
4223
* to regain control in order to deliver the event, and hardware will
4224
* handle event ordering, e.g. with respect to injected exceptions.
4225
*
4226
* But, new events (not exceptions) are only recognized at instruction
4227
* boundaries. If an event needs reinjection, then KVM is handling a
4228
* VM-Exit that occurred _during_ instruction execution; new events,
4229
* irrespective of whether or not they're injected, are blocked until
4230
* the instruction completes.
4231
*/
4232
bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
4233
/*
4234
* Injected events are blocked by nested VM-Enter, as KVM is responsible
4235
* for managing priority between concurrent events, i.e. KVM needs to
4236
* wait until after VM-Enter completes to deliver injected events.
4237
*/
4238
bool block_nested_events = block_nested_exceptions ||
4239
block_non_injected_events;
4240
4241
if (lapic_in_kernel(vcpu) &&
4242
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
4243
if (block_nested_events)
4244
return -EBUSY;
4245
nested_vmx_update_pending_dbg(vcpu);
4246
clear_bit(KVM_APIC_INIT, &apic->pending_events);
4247
if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
4248
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
4249
4250
/* MTF is discarded if the vCPU is in WFS. */
4251
vmx->nested.mtf_pending = false;
4252
return 0;
4253
}
4254
4255
if (lapic_in_kernel(vcpu) &&
4256
test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
4257
if (block_nested_events)
4258
return -EBUSY;
4259
4260
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
4261
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
4262
nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
4263
apic->sipi_vector & 0xFFUL);
4264
return 0;
4265
}
4266
/* Fallthrough, the SIPI is completely ignored. */
4267
}
4268
4269
/*
4270
* Process exceptions that are higher priority than Monitor Trap Flag:
4271
* fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
4272
* could theoretically come in from userspace), and ICEBP (INT1).
4273
*
4274
* TODO: SMIs have higher priority than MTF and trap-like #DBs (except
4275
* for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
4276
* across SMI/RSM as it should; that needs to be addressed in order to
4277
* prioritize SMI over MTF and trap-like #DBs.
4278
*/
4279
if (vcpu->arch.exception_vmexit.pending &&
4280
!vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
4281
if (block_nested_exceptions)
4282
return -EBUSY;
4283
4284
nested_vmx_inject_exception_vmexit(vcpu);
4285
return 0;
4286
}
4287
4288
if (vcpu->arch.exception.pending &&
4289
!vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
4290
if (block_nested_exceptions)
4291
return -EBUSY;
4292
goto no_vmexit;
4293
}
4294
4295
if (vmx->nested.mtf_pending) {
4296
if (block_nested_events)
4297
return -EBUSY;
4298
nested_vmx_update_pending_dbg(vcpu);
4299
nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
4300
return 0;
4301
}
4302
4303
if (vcpu->arch.exception_vmexit.pending) {
4304
if (block_nested_exceptions)
4305
return -EBUSY;
4306
4307
nested_vmx_inject_exception_vmexit(vcpu);
4308
return 0;
4309
}
4310
4311
if (vcpu->arch.exception.pending) {
4312
if (block_nested_exceptions)
4313
return -EBUSY;
4314
goto no_vmexit;
4315
}
4316
4317
if (nested_vmx_preemption_timer_pending(vcpu)) {
4318
if (block_nested_events)
4319
return -EBUSY;
4320
nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
4321
return 0;
4322
}
4323
4324
if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
4325
if (block_nested_events)
4326
return -EBUSY;
4327
goto no_vmexit;
4328
}
4329
4330
if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
4331
if (block_nested_events)
4332
return -EBUSY;
4333
if (!nested_exit_on_nmi(vcpu))
4334
goto no_vmexit;
4335
4336
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
4337
NMI_VECTOR | INTR_TYPE_NMI_INTR |
4338
INTR_INFO_VALID_MASK, 0);
4339
/*
4340
* The NMI-triggered VM exit counts as injection:
4341
* clear this one and block further NMIs.
4342
*/
4343
vcpu->arch.nmi_pending = 0;
4344
vmx_set_nmi_mask(vcpu, true);
4345
return 0;
4346
}
4347
4348
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
4349
int irq;
4350
4351
if (!nested_exit_on_intr(vcpu)) {
4352
if (block_nested_events)
4353
return -EBUSY;
4354
4355
goto no_vmexit;
4356
}
4357
4358
if (!nested_exit_intr_ack_set(vcpu)) {
4359
if (block_nested_events)
4360
return -EBUSY;
4361
4362
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
4363
return 0;
4364
}
4365
4366
irq = kvm_cpu_get_extint(vcpu);
4367
if (irq != -1) {
4368
if (block_nested_events)
4369
return -EBUSY;
4370
4371
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4372
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
4373
return 0;
4374
}
4375
4376
irq = kvm_apic_has_interrupt(vcpu);
4377
if (WARN_ON_ONCE(irq < 0))
4378
goto no_vmexit;
4379
4380
/*
4381
* If the IRQ is L2's PI notification vector, process posted
4382
* interrupts for L2 instead of injecting VM-Exit, as the
4383
* detection/morphing architecturally occurs when the IRQ is
4384
* delivered to the CPU. Note, only interrupts that are routed
4385
* through the local APIC trigger posted interrupt processing,
4386
* and enabling posted interrupts requires ACK-on-exit.
4387
*/
4388
if (irq == vmx->nested.posted_intr_nv) {
4389
/*
4390
* Nested posted interrupts are delivered via RVI, i.e.
4391
* aren't injected by KVM, and so can be queued even if
4392
* manual event injection is disallowed.
4393
*/
4394
if (block_non_injected_events)
4395
return -EBUSY;
4396
4397
vmx->nested.pi_pending = true;
4398
kvm_apic_clear_irr(vcpu, irq);
4399
goto no_vmexit;
4400
}
4401
4402
if (block_nested_events)
4403
return -EBUSY;
4404
4405
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4406
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
4407
4408
/*
4409
* ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
4410
* be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
4411
* if APICv is active.
4412
*/
4413
kvm_apic_ack_interrupt(vcpu, irq);
4414
return 0;
4415
}
4416
4417
no_vmexit:
4418
return vmx_complete_nested_posted_interrupt(vcpu);
4419
}
4420
4421
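/*
* Convert the time remaining on the emulated preemption timer's hrtimer back
* into VMX-preemption timer ticks, i.e. TSC cycles shifted right by the
* emulated preemption timer rate.
*/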
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
4422
{
4423
ktime_t remaining =
4424
hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
4425
u64 value;
4426
4427
if (ktime_to_ns(remaining) <= 0)
4428
return 0;
4429
4430
value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
4431
do_div(value, 1000000);
4432
return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
4433
}
4434
4435
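/*
* Returns true for vmcs12 fields that are synced from vmcs02 only on demand,
* via sync_vmcs02_to_vmcs12_rare(), as they are rarely accessed by L1 and are
* comparatively expensive to read out of vmcs02.
*/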
static bool is_vmcs12_ext_field(unsigned long field)
4436
{
4437
switch (field) {
4438
case GUEST_ES_SELECTOR:
4439
case GUEST_CS_SELECTOR:
4440
case GUEST_SS_SELECTOR:
4441
case GUEST_DS_SELECTOR:
4442
case GUEST_FS_SELECTOR:
4443
case GUEST_GS_SELECTOR:
4444
case GUEST_LDTR_SELECTOR:
4445
case GUEST_TR_SELECTOR:
4446
case GUEST_ES_LIMIT:
4447
case GUEST_CS_LIMIT:
4448
case GUEST_SS_LIMIT:
4449
case GUEST_DS_LIMIT:
4450
case GUEST_FS_LIMIT:
4451
case GUEST_GS_LIMIT:
4452
case GUEST_LDTR_LIMIT:
4453
case GUEST_TR_LIMIT:
4454
case GUEST_GDTR_LIMIT:
4455
case GUEST_IDTR_LIMIT:
4456
case GUEST_ES_AR_BYTES:
4457
case GUEST_DS_AR_BYTES:
4458
case GUEST_FS_AR_BYTES:
4459
case GUEST_GS_AR_BYTES:
4460
case GUEST_LDTR_AR_BYTES:
4461
case GUEST_TR_AR_BYTES:
4462
case GUEST_ES_BASE:
4463
case GUEST_CS_BASE:
4464
case GUEST_SS_BASE:
4465
case GUEST_DS_BASE:
4466
case GUEST_FS_BASE:
4467
case GUEST_GS_BASE:
4468
case GUEST_LDTR_BASE:
4469
case GUEST_TR_BASE:
4470
case GUEST_GDTR_BASE:
4471
case GUEST_IDTR_BASE:
4472
case GUEST_PENDING_DBG_EXCEPTIONS:
4473
case GUEST_BNDCFGS:
4474
return true;
4475
default:
4476
break;
4477
}
4478
4479
return false;
4480
}
4481
4482
static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4483
struct vmcs12 *vmcs12)
4484
{
4485
struct vcpu_vmx *vmx = to_vmx(vcpu);
4486
4487
vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4488
vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4489
vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4490
vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4491
vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4492
vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4493
vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4494
vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4495
vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4496
vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4497
vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4498
vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4499
vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4500
vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4501
vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4502
vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4503
vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4504
vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4505
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
4506
vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4507
vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4508
vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4509
vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4510
vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4511
vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4512
vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4513
vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4514
vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4515
vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4516
vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4517
vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4518
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4519
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4520
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
4521
vmcs12->guest_pending_dbg_exceptions =
4522
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4523
4524
vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4525
}
4526
4527
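/*
* Flush a pending sync of the "rare" fields from vmcs02 to vmcs12.  vmcs01 is
* expected to be the loaded VMCS, so temporarily switch to vmcs02 for the
* VMREADs and then switch back.
*/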
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4528
struct vmcs12 *vmcs12)
4529
{
4530
struct vcpu_vmx *vmx = to_vmx(vcpu);
4531
int cpu;
4532
4533
if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4534
return;
4535
4536
4537
WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4538
4539
cpu = get_cpu();
4540
vmx->loaded_vmcs = &vmx->nested.vmcs02;
4541
vmx_vcpu_load_vmcs(vcpu, cpu);
4542
4543
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4544
4545
vmx->loaded_vmcs = &vmx->vmcs01;
4546
vmx_vcpu_load_vmcs(vcpu, cpu);
4547
put_cpu();
4548
}
4549
4550
/*
4551
* Update the guest state fields of vmcs12 to reflect changes that
4552
* occurred while L2 was running. (The "IA-32e mode guest" bit of the
4553
* VM-entry controls is also updated, since this is really a guest
4554
* state bit.)
4555
*/
4556
static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4557
{
4558
struct vcpu_vmx *vmx = to_vmx(vcpu);
4559
4560
if (nested_vmx_is_evmptr12_valid(vmx))
4561
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4562
4563
vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4564
!nested_vmx_is_evmptr12_valid(vmx);
4565
4566
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4567
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4568
4569
vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4570
vmcs12->guest_rip = kvm_rip_read(vcpu);
4571
vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4572
4573
vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4574
vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
4575
4576
vmcs12->guest_interruptibility_info =
4577
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
4578
4579
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4580
vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
4581
else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4582
vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
4583
else
4584
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4585
4586
if (nested_cpu_has_preemption_timer(vmcs12) &&
4587
vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4588
!vmx->nested.nested_run_pending)
4589
vmcs12->vmx_preemption_timer_value =
4590
vmx_get_preemption_timer_value(vcpu);
4591
4592
/*
4593
* In some cases (usually, nested EPT), L2 is allowed to change its
4594
* own CR3 without exiting. If it has changed it, we must keep it.
4595
* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4596
* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4597
*
4598
* Additionally, restore L2's PDPTRs to vmcs12.
4599
*/
4600
if (enable_ept) {
4601
vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
4602
if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4603
vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4604
vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4605
vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4606
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4607
}
4608
}
4609
4610
vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4611
4612
if (nested_cpu_has_vid(vmcs12))
4613
vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4614
4615
vmcs12->vm_entry_controls =
4616
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4617
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4618
4619
/*
4620
* Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
4621
* Writes to DEBUGCTL that aren't intercepted by L1 are immediately
4622
* propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
4623
* vmcs02 doesn't strictly track vmcs12.
4624
*/
4625
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
4626
vmcs12->guest_dr7 = vcpu->arch.dr7;
4627
4628
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4629
vmcs12->guest_ia32_efer = vcpu->arch.efer;
4630
}
4631
4632
/*
4633
* prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4634
* and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4635
* and this function updates it to reflect the changes to the guest state while
4636
* L2 was running (and perhaps made some exits which were handled directly by L0
4637
* without going back to L1), and to reflect the exit reason.
4638
* Note that we do not have to copy all VMCS fields here, just those that
4639
* could have changed by the L2 guest or the exit - i.e., the guest-state and
4640
* exit-information fields only. Other fields are modified by L1 with VMWRITE,
4641
* which already writes to vmcs12 directly.
4642
*/
4643
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4644
u32 vm_exit_reason, u32 exit_intr_info,
4645
unsigned long exit_qualification, u32 exit_insn_len)
4646
{
4647
/* update exit information fields: */
4648
vmcs12->vm_exit_reason = vm_exit_reason;
4649
if (vmx_get_exit_reason(vcpu).enclave_mode)
4650
vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
4651
vmcs12->exit_qualification = exit_qualification;
4652
4653
/*
4654
* On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
4655
* and only EXIT_REASON and EXIT_QUALIFICATION are updated; all other
4656
* exit info fields are unmodified.
4657
*/
4658
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4659
vmcs12->launch_state = 1;
4660
4661
/* vm_entry_intr_info_field is cleared on exit. Emulate this
4662
* instead of reading the real value. */
4663
vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4664
4665
/*
4666
* Transfer the event that L0 or L1 may have wanted to inject into
4667
* L2 to IDT_VECTORING_INFO_FIELD.
4668
*/
4669
vmcs12_save_pending_event(vcpu, vmcs12,
4670
vm_exit_reason, exit_intr_info);
4671
4672
vmcs12->vm_exit_intr_info = exit_intr_info;
4673
vmcs12->vm_exit_instruction_len = exit_insn_len;
4674
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4675
4676
/*
4677
* According to the spec, there's no need to store the guest's
4678
* MSRs if the exit is due to a VM-entry failure that occurs
4679
* during or after loading the guest state. Since this exit
4680
* does not fall in that category, we need to save the MSRs.
4681
*/
4682
if (nested_vmx_store_msr(vcpu,
4683
vmcs12->vm_exit_msr_store_addr,
4684
vmcs12->vm_exit_msr_store_count))
4685
nested_vmx_abort(vcpu,
4686
VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4687
}
4688
}
4689
4690
/*
4691
* A part of what we need to do when the nested L2 guest exits and we want to
4692
* run its L1 parent is to reset L1's guest state to the host state specified
4693
* in vmcs12.
4694
* This function is to be called not only on normal nested exit, but also on
4695
* a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4696
* Failures During or After Loading Guest State").
4697
* This function should be called when the active VMCS is L1's (vmcs01).
4698
*/
4699
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4700
struct vmcs12 *vmcs12)
4701
{
4702
enum vm_entry_failure_code ignored;
4703
struct kvm_segment seg;
4704
4705
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4706
vcpu->arch.efer = vmcs12->host_ia32_efer;
4707
else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4708
vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4709
else
4710
vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4711
vmx_set_efer(vcpu, vcpu->arch.efer);
4712
4713
kvm_rsp_write(vcpu, vmcs12->host_rsp);
4714
kvm_rip_write(vcpu, vmcs12->host_rip);
4715
vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4716
vmx_set_interrupt_shadow(vcpu, 0);
4717
4718
/*
4719
* Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4720
* actually changed, because vmx_set_cr0 refers to efer set above.
4721
*
4722
* CR0_GUEST_HOST_MASK is already set in the original vmcs01
4723
* (KVM doesn't change it);
4724
*/
4725
vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4726
vmx_set_cr0(vcpu, vmcs12->host_cr0);
4727
4728
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
4729
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4730
vmx_set_cr4(vcpu, vmcs12->host_cr4);
4731
4732
nested_ept_uninit_mmu_context(vcpu);
4733
4734
/*
4735
* Only PDPTE load can fail as the value of cr3 was checked on entry and
4736
* couldn't have changed.
4737
*/
4738
if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
4739
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4740
4741
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
4742
4743
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4744
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4745
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4746
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4747
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
4748
vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4749
vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4750
4751
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
4752
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4753
vmcs_write64(GUEST_BNDCFGS, 0);
4754
4755
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4756
vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4757
vcpu->arch.pat = vmcs12->host_ia32_pat;
4758
}
4759
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
4760
kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
4761
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4762
vmcs12->host_ia32_perf_global_ctrl));
4763
4764
/* Set L1 segment info according to Intel SDM
4765
27.5.2 Loading Host Segment and Descriptor-Table Registers */
4766
seg = (struct kvm_segment) {
4767
.base = 0,
4768
.limit = 0xFFFFFFFF,
4769
.selector = vmcs12->host_cs_selector,
4770
.type = 11,
4771
.present = 1,
4772
.s = 1,
4773
.g = 1
4774
};
4775
if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4776
seg.l = 1;
4777
else
4778
seg.db = 1;
4779
__vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4780
seg = (struct kvm_segment) {
4781
.base = 0,
4782
.limit = 0xFFFFFFFF,
4783
.type = 3,
4784
.present = 1,
4785
.s = 1,
4786
.db = 1,
4787
.g = 1
4788
};
4789
seg.selector = vmcs12->host_ds_selector;
4790
__vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4791
seg.selector = vmcs12->host_es_selector;
4792
__vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4793
seg.selector = vmcs12->host_ss_selector;
4794
__vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4795
seg.selector = vmcs12->host_fs_selector;
4796
seg.base = vmcs12->host_fs_base;
4797
__vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4798
seg.selector = vmcs12->host_gs_selector;
4799
seg.base = vmcs12->host_gs_base;
4800
__vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4801
seg = (struct kvm_segment) {
4802
.base = vmcs12->host_tr_base,
4803
.limit = 0x67,
4804
.selector = vmcs12->host_tr_selector,
4805
.type = 11,
4806
.present = 1
4807
};
4808
__vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4809
4810
memset(&seg, 0, sizeof(seg));
4811
seg.unusable = 1;
4812
__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
4813
4814
kvm_set_dr(vcpu, 7, 0x400);
4815
vmx_guest_debugctl_write(vcpu, 0);
4816
4817
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4818
vmcs12->vm_exit_msr_load_count))
4819
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4820
4821
to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu);
4822
}
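/*
 * Reading aid for the host segment loads above (illustrative, the exact
 * packing is done by __vmx_set_segment()): a flat 64-bit host CS with
 * type=11, S=1, P=1, G=1, L=1 corresponds to access rights 0xa09b
 * (0xc09b when D/B=1 is used instead), the flat data segments with
 * type=3 correspond to 0xc093, and the busy TSS with type=11, S=0 and
 * limit 0x67 corresponds to 0x008b.
 */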
4823
4824
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4825
{
4826
struct vmx_uret_msr *efer_msr;
4827
unsigned int i;
4828
4829
if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4830
return vmcs_read64(GUEST_IA32_EFER);
4831
4832
if (cpu_has_load_ia32_efer())
4833
return kvm_host.efer;
4834
4835
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4836
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4837
return vmx->msr_autoload.guest.val[i].value;
4838
}
4839
4840
efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
4841
if (efer_msr)
4842
return efer_msr->data;
4843
4844
return kvm_host.efer;
4845
}
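/*
 * Reading of the fallback chain above: L1's EFER is taken from the first
 * source that applies: vmcs01's GUEST_IA32_EFER when the VM-entry
 * load-EFER control is in use, the host's EFER when the dedicated
 * load-EFER controls exist (in practice KVM leaves them clear only when
 * guest and host EFER match), the guest MSR autoload list, the
 * user-return MSR slot, and finally the host's EFER as a last resort.
 */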
4846
4847
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4848
{
4849
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4850
struct vcpu_vmx *vmx = to_vmx(vcpu);
4851
struct vmx_msr_entry g, h;
4852
gpa_t gpa;
4853
u32 i, j;
4854
4855
vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4856
4857
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4858
/*
4859
* L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4860
* as vmcs01.GUEST_DR7 contains a userspace defined value
4861
* and vcpu->arch.dr7 is not squirreled away before the
4862
* nested VMENTER (not worth adding a variable in nested_vmx).
4863
*/
4864
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4865
kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4866
else
4867
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4868
}
4869
4870
/* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
4871
vmx_reload_guest_debugctl(vcpu);
4872
4873
/*
4874
* Note that calling vmx_set_{efer,cr0,cr4} is important as they
4875
* handle a variety of side effects to KVM's software model.
4876
*/
4877
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4878
4879
vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4880
vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4881
4882
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4883
vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4884
4885
nested_ept_uninit_mmu_context(vcpu);
4886
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4887
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4888
4889
/*
4890
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4891
* from vmcs01 (if necessary). The PDPTRs are not loaded on
4892
* VMFail; like everything else, we just need to ensure our
4893
* software model is up-to-date.
4894
*/
4895
if (enable_ept && is_pae_paging(vcpu))
4896
ept_save_pdptrs(vcpu);
4897
4898
kvm_mmu_reset_context(vcpu);
4899
4900
/*
4901
* This nasty bit of open coding is a compromise between blindly
4902
* loading L1's MSRs using the exit load lists (incorrect emulation
4903
* of VMFail), leaving the nested VM's MSRs in the software model
4904
* (incorrect behavior) and snapshotting the modified MSRs (too
4905
* expensive since the lists are unbound by hardware). For each
4906
* MSR that was (prematurely) loaded from the nested VMEntry load
4907
* list, reload it from the exit load list if it exists and differs
4908
* from the guest value. The intent is to stuff host state as
4909
* silently as possible, not to fully process the exit load list.
4910
*/
4911
for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4912
gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4913
if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4914
pr_debug_ratelimited(
4915
"%s read MSR index failed (%u, 0x%08llx)\n",
4916
__func__, i, gpa);
4917
goto vmabort;
4918
}
4919
4920
for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4921
gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4922
if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4923
pr_debug_ratelimited(
4924
"%s read MSR failed (%u, 0x%08llx)\n",
4925
__func__, j, gpa);
4926
goto vmabort;
4927
}
4928
if (h.index != g.index)
4929
continue;
4930
if (h.value == g.value)
4931
break;
4932
4933
if (nested_vmx_load_msr_check(vcpu, &h)) {
4934
pr_debug_ratelimited(
4935
"%s check failed (%u, 0x%x, 0x%x)\n",
4936
__func__, j, h.index, h.reserved);
4937
goto vmabort;
4938
}
4939
4940
if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
4941
pr_debug_ratelimited(
4942
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4943
__func__, j, h.index, h.value);
4944
goto vmabort;
4945
}
4946
}
4947
}
4948
4949
return;
4950
4951
vmabort:
4952
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4953
}
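/*
 * Reading aid for the unwind loop above: for every MSR that the aborted
 * VM-Entry already loaded from vmcs12's VM-entry MSR-load list, scan
 * vmcs12's VM-exit MSR-load list for the same index and, if the listed
 * value differs from what was loaded, write that value back so L1 sees
 * its own "host" value again.  Any read, validation or WRMSR failure
 * escalates to VMX_ABORT_LOAD_HOST_MSR_FAIL instead of leaving the
 * software model half-unwound.
 */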
4954
4955
/*
4956
* Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4957
* and modify vmcs12 to make it see what it would expect to see there if
4958
* L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4959
*/
4960
void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
4961
u32 exit_intr_info, unsigned long exit_qualification,
4962
u32 exit_insn_len)
4963
{
4964
struct vcpu_vmx *vmx = to_vmx(vcpu);
4965
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4966
4967
/* Pending MTF traps are discarded on VM-Exit. */
4968
vmx->nested.mtf_pending = false;
4969
4970
/* trying to cancel vmlaunch/vmresume is a bug */
4971
WARN_ON_ONCE(vmx->nested.nested_run_pending);
4972
4973
#ifdef CONFIG_KVM_HYPERV
4974
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
4975
/*
4976
* KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4977
* Enlightened VMCS after migration and we still need to
4978
* do that when something is forcing L2->L1 exit prior to
4979
* the first L2 run.
4980
*/
4981
(void)nested_get_evmcs_page(vcpu);
4982
}
4983
#endif
4984
4985
/* Service pending TLB flush requests for L2 before switching to L1. */
4986
kvm_service_local_tlb_flush_requests(vcpu);
4987
4988
/*
4989
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4990
* now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
4991
* up-to-date before switching to L1.
4992
*/
4993
if (enable_ept && is_pae_paging(vcpu))
4994
vmx_ept_load_pdptrs(vcpu);
4995
4996
leave_guest_mode(vcpu);
4997
4998
if (nested_cpu_has_preemption_timer(vmcs12))
4999
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
5000
5001
if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
5002
vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
5003
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
5004
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
5005
}
5006
5007
if (likely(!vmx->fail)) {
5008
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5009
5010
if (vm_exit_reason != -1)
5011
prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
5012
exit_intr_info, exit_qualification,
5013
exit_insn_len);
5014
5015
/*
5016
* Must happen outside of sync_vmcs02_to_vmcs12() as it will
5017
* also be used to capture vmcs12 cache as part of
5018
* capturing nVMX state for snapshot (migration).
5019
*
5020
* Otherwise, this flush will dirty guest memory at a
5021
* point it is already assumed by user-space to be
5022
* immutable.
5023
*/
5024
nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
5025
} else {
5026
/*
5027
* The only expected VM-instruction error is "VM entry with
5028
* invalid control field(s)." Anything else indicates a
5029
* problem with L0. And we should never get here with a
5030
* VMFail of any type if early consistency checks are enabled.
5031
*/
5032
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
5033
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5034
WARN_ON_ONCE(nested_early_check);
5035
}
5036
5037
/*
5038
* Drop events/exceptions that were queued for re-injection to L2
5039
* (picked up via vmx_complete_interrupts()), as well as exceptions
5040
* that were pending for L2. Note, this must NOT be hoisted above
5041
* prepare_vmcs12(), events/exceptions queued for re-injection need to
5042
* be captured in vmcs12 (see vmcs12_save_pending_event()).
5043
*/
5044
vcpu->arch.nmi_injected = false;
5045
kvm_clear_exception_queue(vcpu);
5046
kvm_clear_interrupt_queue(vcpu);
5047
5048
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
5049
5050
kvm_nested_vmexit_handle_ibrs(vcpu);
5051
5052
/* Update any VMCS fields that might have changed while L2 ran */
5053
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
5054
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
5055
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
5056
if (kvm_caps.has_tsc_control)
5057
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
5058
5059
if (vmx->nested.l1_tpr_threshold != -1)
5060
vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
5061
5062
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
5063
vmx->nested.change_vmcs01_virtual_apic_mode = false;
5064
vmx_set_virtual_apic_mode(vcpu);
5065
}
5066
5067
if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
5068
vmx->nested.update_vmcs01_cpu_dirty_logging = false;
5069
vmx_update_cpu_dirty_logging(vcpu);
5070
}
5071
5072
nested_put_vmcs12_pages(vcpu);
5073
5074
if (vmx->nested.reload_vmcs01_apic_access_page) {
5075
vmx->nested.reload_vmcs01_apic_access_page = false;
5076
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5077
}
5078
5079
if (vmx->nested.update_vmcs01_apicv_status) {
5080
vmx->nested.update_vmcs01_apicv_status = false;
5081
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
5082
}
5083
5084
if (vmx->nested.update_vmcs01_hwapic_isr) {
5085
vmx->nested.update_vmcs01_hwapic_isr = false;
5086
kvm_apic_update_hwapic_isr(vcpu);
5087
}
5088
5089
if ((vm_exit_reason != -1) &&
5090
(enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
5091
vmx->nested.need_vmcs12_to_shadow_sync = true;
5092
5093
/* in case we halted in L2 */
5094
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
5095
5096
if (likely(!vmx->fail)) {
5097
if (vm_exit_reason != -1)
5098
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
5099
vmcs12->exit_qualification,
5100
vmcs12->idt_vectoring_info_field,
5101
vmcs12->vm_exit_intr_info,
5102
vmcs12->vm_exit_intr_error_code,
5103
KVM_ISA_VMX);
5104
5105
load_vmcs12_host_state(vcpu, vmcs12);
5106
5107
/*
5108
* Process events if an injectable IRQ or NMI is pending, even
5109
* if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
5110
* If an event became pending while L2 was active, KVM needs to
5111
* either inject the event or request an IRQ/NMI window. SMIs
5112
* don't need to be processed as SMM is mutually exclusive with
5113
* non-root mode. INIT/SIPI don't need to be checked as INIT
5114
* is blocked post-VMXON, and SIPIs are ignored.
5115
*/
5116
if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
5117
kvm_make_request(KVM_REQ_EVENT, vcpu);
5118
return;
5119
}
5120
5121
/*
5122
* After an early L2 VM-entry failure, we're now back
5123
* in L1 which thinks it just finished a VMLAUNCH or
5124
* VMRESUME instruction, so we need to set the failure
5125
* flag and the VM-instruction error field of the VMCS
5126
* accordingly, and skip the emulated instruction.
5127
*/
5128
(void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5129
5130
/*
5131
* Restore L1's host state to KVM's software model. We're here
5132
* because a consistency check was caught by hardware, which
5133
* means some amount of guest state has been propagated to KVM's
5134
* model and needs to be unwound to the host's state.
5135
*/
5136
nested_vmx_restore_host_state(vcpu);
5137
5138
vmx->fail = 0;
5139
}
5140
5141
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
5142
{
5143
kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5144
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
5145
}
5146
5147
/*
5148
* Decode the memory-address operand of a vmx instruction, as recorded on an
5149
* exit caused by such an instruction (run by a guest hypervisor).
5150
* On success, returns 0. When the operand is invalid, returns 1 and throws
5151
* #UD, #GP, or #SS.
5152
*/
5153
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
5154
u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
5155
{
5156
gva_t off;
5157
bool exn;
5158
struct kvm_segment s;
5159
5160
/*
5161
* According to Vol. 3B, "Information for VM Exits Due to Instruction
5162
* Execution", on an exit, vmx_instruction_info holds most of the
5163
* addressing components of the operand. Only the displacement part
5164
* is put in exit_qualification (see 3B, "Basic VM-Exit Information").
5165
* For how an actual address is calculated from all these components,
5166
* refer to Vol. 1, "Operand Addressing".
5167
*/
5168
int scaling = vmx_instruction_info & 3;
5169
int addr_size = (vmx_instruction_info >> 7) & 7;
5170
bool is_reg = vmx_instruction_info & (1u << 10);
5171
int seg_reg = (vmx_instruction_info >> 15) & 7;
5172
int index_reg = (vmx_instruction_info >> 18) & 0xf;
5173
bool index_is_valid = !(vmx_instruction_info & (1u << 22));
5174
int base_reg = (vmx_instruction_info >> 23) & 0xf;
5175
bool base_is_valid = !(vmx_instruction_info & (1u << 27));
5176
5177
if (is_reg) {
5178
kvm_queue_exception(vcpu, UD_VECTOR);
5179
return 1;
5180
}
5181
5182
/* Addr = segment_base + offset */
5183
/* offset = base + [index * scale] + displacement */
5184
off = exit_qualification; /* holds the displacement */
5185
if (addr_size == 1)
5186
off = (gva_t)sign_extend64(off, 31);
5187
else if (addr_size == 0)
5188
off = (gva_t)sign_extend64(off, 15);
5189
if (base_is_valid)
5190
off += kvm_register_read(vcpu, base_reg);
5191
if (index_is_valid)
5192
off += kvm_register_read(vcpu, index_reg) << scaling;
5193
vmx_get_segment(vcpu, &s, seg_reg);
5194
5195
/*
5196
* The effective address, i.e. @off, of a memory operand is truncated
5197
* based on the address size of the instruction. Note that this is
5198
* the *effective address*, i.e. the address prior to accounting for
5199
* the segment's base.
5200
*/
5201
if (addr_size == 1) /* 32 bit */
5202
off &= 0xffffffff;
5203
else if (addr_size == 0) /* 16 bit */
5204
off &= 0xffff;
5205
5206
/* Checks for #GP/#SS exceptions. */
5207
exn = false;
5208
if (is_long_mode(vcpu)) {
5209
/*
5210
* The virtual/linear address is never truncated in 64-bit
5211
* mode, e.g. a 32-bit address size can yield a 64-bit virtual
5212
* address when using FS/GS with a non-zero base.
5213
*/
5214
if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
5215
*ret = s.base + off;
5216
else
5217
*ret = off;
5218
5219
*ret = vmx_get_untagged_addr(vcpu, *ret, 0);
5220
/* Long mode: #GP(0)/#SS(0) if the memory address is in a
5221
* non-canonical form. This is the only check on the memory
5222
* destination for long mode!
5223
*/
5224
exn = is_noncanonical_address(*ret, vcpu, 0);
5225
} else {
5226
/*
5227
* When not in long mode, the virtual/linear address is
5228
* unconditionally truncated to 32 bits regardless of the
5229
* address size.
5230
*/
5231
*ret = (s.base + off) & 0xffffffff;
5232
5233
/* Protected mode: apply checks for segment validity in the
5234
* following order:
5235
* - segment type check (#GP(0) may be thrown)
5236
* - usability check (#GP(0)/#SS(0))
5237
* - limit check (#GP(0)/#SS(0))
5238
*/
5239
if (wr)
5240
/* #GP(0) if the destination operand is located in a
5241
* read-only data segment or any code segment.
5242
*/
5243
exn = ((s.type & 0xa) == 0 || (s.type & 8));
5244
else
5245
/* #GP(0) if the source operand is located in an
5246
* execute-only code segment
5247
*/
5248
exn = ((s.type & 0xa) == 8);
5249
if (exn) {
5250
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5251
return 1;
5252
}
5253
/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
5254
*/
5255
exn = (s.unusable != 0);
5256
5257
/*
5258
* Protected mode: #GP(0)/#SS(0) if the memory operand is
5259
* outside the segment limit. All CPUs that support VMX ignore
5260
* limit checks for flat segments, i.e. segments with base==0,
5261
* limit==0xffffffff and of type expand-up data or code.
5262
*/
5263
if (!(s.base == 0 && s.limit == 0xffffffff &&
5264
((s.type & 8) || !(s.type & 4))))
5265
exn = exn || ((u64)off + len - 1 > s.limit);
5266
}
5267
if (exn) {
5268
kvm_queue_exception_e(vcpu,
5269
seg_reg == VCPU_SREG_SS ?
5270
SS_VECTOR : GP_VECTOR,
5271
0);
5272
return 1;
5273
}
5274
5275
return 0;
5276
}
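/*
 * Worked example for the decode above (the encoding is hypothetical):
 * with vmx_instruction_info == 0x00418100,
 *   scaling   = 0            (bits 1:0)
 *   addr_size = 2, 64-bit    (bits 9:7)
 *   is_reg    = 0, memory    (bit 10)
 *   seg_reg   = 3, DS        (bits 17:15)
 *   index     = invalid      (bit 22 set)
 *   base_reg  = 0, RAX       (bits 26:23, bit 27 clear)
 * so the effective address is RAX plus the sign-extended displacement
 * from the exit qualification; in 64-bit mode the DS base is ignored,
 * only FS/GS bases are applied above.
 */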
5277
5278
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
5279
int *ret)
5280
{
5281
gva_t gva;
5282
struct x86_exception e;
5283
int r;
5284
5285
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5286
vmcs_read32(VMX_INSTRUCTION_INFO), false,
5287
sizeof(*vmpointer), &gva)) {
5288
*ret = 1;
5289
return -EINVAL;
5290
}
5291
5292
r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
5293
if (r != X86EMUL_CONTINUE) {
5294
*ret = kvm_handle_memory_failure(vcpu, r, &e);
5295
return -EINVAL;
5296
}
5297
5298
return 0;
5299
}
5300
5301
/*
5302
* Allocate a shadow VMCS and associate it with the currently loaded
5303
* VMCS, unless such a shadow VMCS already exists. The newly allocated
5304
* VMCS is also VMCLEARed, so that it is ready for use.
5305
*/
5306
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
5307
{
5308
struct vcpu_vmx *vmx = to_vmx(vcpu);
5309
struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
5310
5311
/*
5312
* KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
5313
* when L1 executes VMXOFF or the vCPU is forced out of nested
5314
* operation. VMXON faults if the CPU is already post-VMXON, so it
5315
* should be impossible to already have an allocated shadow VMCS. KVM
5316
* doesn't support virtualization of VMCS shadowing, so vmcs01 should
5317
* always be the loaded VMCS.
5318
*/
5319
if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
5320
return loaded_vmcs->shadow_vmcs;
5321
5322
loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
5323
if (loaded_vmcs->shadow_vmcs)
5324
vmcs_clear(loaded_vmcs->shadow_vmcs);
5325
5326
return loaded_vmcs->shadow_vmcs;
5327
}
5328
5329
static int enter_vmx_operation(struct kvm_vcpu *vcpu)
5330
{
5331
struct vcpu_vmx *vmx = to_vmx(vcpu);
5332
int r;
5333
5334
r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
5335
if (r < 0)
5336
goto out_vmcs02;
5337
5338
vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
5339
if (!vmx->nested.cached_vmcs12)
5340
goto out_cached_vmcs12;
5341
5342
vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
5343
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
5344
if (!vmx->nested.cached_shadow_vmcs12)
5345
goto out_cached_shadow_vmcs12;
5346
5347
if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
5348
goto out_shadow_vmcs;
5349
5350
hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC,
5351
HRTIMER_MODE_ABS_PINNED);
5352
5353
vmx->nested.vpid02 = allocate_vpid();
5354
5355
vmx->nested.vmcs02_initialized = false;
5356
vmx->nested.vmxon = true;
5357
5358
if (vmx_pt_mode_is_host_guest()) {
5359
vmx->pt_desc.guest.ctl = 0;
5360
pt_update_intercept_for_msr(vcpu);
5361
}
5362
5363
return 0;
5364
5365
out_shadow_vmcs:
5366
kfree(vmx->nested.cached_shadow_vmcs12);
5367
5368
out_cached_shadow_vmcs12:
5369
kfree(vmx->nested.cached_vmcs12);
5370
5371
out_cached_vmcs12:
5372
free_loaded_vmcs(&vmx->nested.vmcs02);
5373
5374
out_vmcs02:
5375
return -ENOMEM;
5376
}
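/*
 * Note on the error paths above: the labels unwind in reverse order of
 * allocation, i.e. a failed shadow VMCS allocation frees the cached
 * shadow vmcs12, then the cached vmcs12, then the vmcs02, mirroring the
 * setup sequence at the top of the function.
 */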
5377
5378
/* Emulate the VMXON instruction. */
5379
static int handle_vmxon(struct kvm_vcpu *vcpu)
5380
{
5381
int ret;
5382
gpa_t vmptr;
5383
uint32_t revision;
5384
struct vcpu_vmx *vmx = to_vmx(vcpu);
5385
const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
5386
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
5387
5388
/*
5389
* Manually perform the CR4.VMXE check: KVM must force CR4.VMXE=1 to enter
5390
* the guest and so cannot rely on hardware to perform the check,
5391
* which has higher priority than VM-Exit (see Intel SDM's pseudocode
5392
* for VMXON).
5393
*
5394
* Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
5395
* and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
5396
* force any of the relevant guest state. For a restricted guest, KVM
5397
* does force CR0.PE=1, but only to also force VM86 in order to emulate
5398
* Real Mode, and so there's no need to check CR0.PE manually.
5399
*/
5400
if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
5401
kvm_queue_exception(vcpu, UD_VECTOR);
5402
return 1;
5403
}
5404
5405
/*
5406
* The CPL is checked for "not in VMX operation" and for "in VMX root",
5407
* and has higher priority than the VM-Fail due to being post-VMXON,
5408
* i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
5409
* VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
5410
* from L2 to L1, i.e. there's no need to check for the vCPU being in
5411
* VMX non-root.
5412
*
5413
* Forwarding the VM-Exit unconditionally, i.e. without performing the
5414
* #UD checks (see above), is functionally ok because KVM doesn't allow
5415
* L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
5416
* CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are
5417
* missed by hardware due to shadowing CR0 and/or CR4.
5418
*/
5419
if (vmx_get_cpl(vcpu)) {
5420
kvm_inject_gp(vcpu, 0);
5421
return 1;
5422
}
5423
5424
if (vmx->nested.vmxon)
5425
return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
5426
5427
/*
5428
* Invalid CR0/CR4 generates #GP. These checks are performed if and
5429
* only if the vCPU isn't already in VMX operation, i.e. effectively
5430
* have lower priority than the VM-Fail above.
5431
*/
5432
if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
5433
!nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
5434
kvm_inject_gp(vcpu, 0);
5435
return 1;
5436
}
5437
5438
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5439
!= VMXON_NEEDED_FEATURES) {
5440
kvm_inject_gp(vcpu, 0);
5441
return 1;
5442
}
5443
5444
if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
5445
return ret;
5446
5447
/*
5448
* SDM 3: 24.11.5
5449
* The first 4 bytes of VMXON region contain the supported
5450
* VMCS revision identifier
5451
*
5452
* Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
5453
* which would limit physical addresses to 32 bits.
5454
*/
5455
if (!page_address_valid(vcpu, vmptr))
5456
return nested_vmx_failInvalid(vcpu);
5457
5458
if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
5459
revision != VMCS12_REVISION)
5460
return nested_vmx_failInvalid(vcpu);
5461
5462
vmx->nested.vmxon_ptr = vmptr;
5463
ret = enter_vmx_operation(vcpu);
5464
if (ret)
5465
return ret;
5466
5467
return nested_vmx_succeed(vcpu);
5468
}
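/*
 * Summary of the priority order implemented above: #UD if CR4.VMXE=0,
 * then #GP(0) if CPL != 0, then VMfail if the vCPU is already post-VMXON,
 * then #GP(0) for invalid CR0/CR4 or a missing IA32_FEATURE_CONTROL
 * configuration, and only then is the memory operand decoded and the
 * VMXON pointer/revision validated (VMfailInvalid on mismatch).
 */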
5469
5470
static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
5471
{
5472
struct vcpu_vmx *vmx = to_vmx(vcpu);
5473
5474
if (vmx->nested.current_vmptr == INVALID_GPA)
5475
return;
5476
5477
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
5478
5479
if (enable_shadow_vmcs) {
5480
/* copy to memory all shadowed fields in case
5481
they were modified */
5482
copy_shadow_to_vmcs12(vmx);
5483
vmx_disable_shadow_vmcs(vmx);
5484
}
5485
vmx->nested.posted_intr_nv = -1;
5486
5487
/* Flush VMCS12 to guest memory */
5488
kvm_vcpu_write_guest_page(vcpu,
5489
vmx->nested.current_vmptr >> PAGE_SHIFT,
5490
vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
5491
5492
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5493
5494
vmx->nested.current_vmptr = INVALID_GPA;
5495
}
5496
5497
/* Emulate the VMXOFF instruction */
5498
static int handle_vmxoff(struct kvm_vcpu *vcpu)
5499
{
5500
if (!nested_vmx_check_permission(vcpu))
5501
return 1;
5502
5503
free_nested(vcpu);
5504
5505
if (kvm_apic_has_pending_init_or_sipi(vcpu))
5506
kvm_make_request(KVM_REQ_EVENT, vcpu);
5507
5508
return nested_vmx_succeed(vcpu);
5509
}
5510
5511
/* Emulate the VMCLEAR instruction */
5512
static int handle_vmclear(struct kvm_vcpu *vcpu)
5513
{
5514
struct vcpu_vmx *vmx = to_vmx(vcpu);
5515
u32 zero = 0;
5516
gpa_t vmptr;
5517
int r;
5518
5519
if (!nested_vmx_check_permission(vcpu))
5520
return 1;
5521
5522
if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5523
return r;
5524
5525
if (!page_address_valid(vcpu, vmptr))
5526
return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5527
5528
if (vmptr == vmx->nested.vmxon_ptr)
5529
return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5530
5531
if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
5532
if (vmptr == vmx->nested.current_vmptr)
5533
nested_release_vmcs12(vcpu);
5534
5535
/*
5536
* Silently ignore memory errors on VMCLEAR, Intel's pseudocode
5537
* for VMCLEAR includes a "ensure that data for VMCS referenced
5538
* by the operand is in memory" clause that guards writes to
5539
* memory, i.e. doing nothing for I/O is architecturally valid.
5540
*
5541
* FIXME: Suppress failures if and only if no memslot is found,
5542
* i.e. exit to userspace if __copy_to_user() fails.
5543
*/
5544
(void)kvm_vcpu_write_guest(vcpu,
5545
vmptr + offsetof(struct vmcs12,
5546
launch_state),
5547
&zero, sizeof(zero));
5548
}
5549
5550
return nested_vmx_succeed(vcpu);
5551
}
5552
5553
/* Emulate the VMLAUNCH instruction */
5554
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5555
{
5556
return nested_vmx_run(vcpu, true);
5557
}
5558
5559
/* Emulate the VMRESUME instruction */
5560
static int handle_vmresume(struct kvm_vcpu *vcpu)
5561
{
5562
5563
return nested_vmx_run(vcpu, false);
5564
}
5565
5566
static int handle_vmread(struct kvm_vcpu *vcpu)
5567
{
5568
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5569
: get_vmcs12(vcpu);
5570
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5571
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5572
struct vcpu_vmx *vmx = to_vmx(vcpu);
5573
struct x86_exception e;
5574
unsigned long field;
5575
u64 value;
5576
gva_t gva = 0;
5577
short offset;
5578
int len, r;
5579
5580
if (!nested_vmx_check_permission(vcpu))
5581
return 1;
5582
5583
/* Decode instruction info and find the field to read */
5584
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5585
5586
if (!nested_vmx_is_evmptr12_valid(vmx)) {
5587
/*
5588
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5589
* any VMREAD sets the ALU flags for VMfailInvalid.
5590
*/
5591
if (vmx->nested.current_vmptr == INVALID_GPA ||
5592
(is_guest_mode(vcpu) &&
5593
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5594
return nested_vmx_failInvalid(vcpu);
5595
5596
offset = get_vmcs12_field_offset(field);
5597
if (offset < 0)
5598
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5599
5600
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5601
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5602
5603
/* Read the field, zero-extended to a u64 value */
5604
value = vmcs12_read_any(vmcs12, field, offset);
5605
} else {
5606
/*
5607
* Hyper-V TLFS (as of 6.0b) explicitly states, that while an
5608
* enlightened VMCS is active VMREAD/VMWRITE instructions are
5609
* unsupported. Unfortunately, certain versions of Windows 11
5610
* don't comply with this requirement which is not enforced in
5611
* genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
5612
* workaround, as misbehaving guests will panic on VM-Fail.
5613
* Note, enlightened VMCS is incompatible with shadow VMCS so
5614
* all VMREADs from L2 should go to L1.
5615
*/
5616
if (WARN_ON_ONCE(is_guest_mode(vcpu)))
5617
return nested_vmx_failInvalid(vcpu);
5618
5619
offset = evmcs_field_offset(field, NULL);
5620
if (offset < 0)
5621
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5622
5623
/* Read the field, zero-extended to a u64 value */
5624
value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
5625
}
5626
5627
/*
5628
* Now copy part of this value to register or memory, as requested.
5629
* Note that the number of bits actually copied is 32 or 64 depending
5630
* on the guest's mode (32 or 64 bit), not on the given field's length.
5631
*/
5632
if (instr_info & BIT(10)) {
5633
kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
5634
} else {
5635
len = is_64_bit_mode(vcpu) ? 8 : 4;
5636
if (get_vmx_mem_address(vcpu, exit_qualification,
5637
instr_info, true, len, &gva))
5638
return 1;
5639
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
5640
r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5641
if (r != X86EMUL_CONTINUE)
5642
return kvm_handle_memory_failure(vcpu, r, &e);
5643
}
5644
5645
return nested_vmx_succeed(vcpu);
5646
}
5647
5648
static bool is_shadow_field_rw(unsigned long field)
5649
{
5650
switch (field) {
5651
#define SHADOW_FIELD_RW(x, y) case x:
5652
#include "vmcs_shadow_fields.h"
5653
return true;
5654
default:
5655
break;
5656
}
5657
return false;
5658
}
5659
5660
static bool is_shadow_field_ro(unsigned long field)
5661
{
5662
switch (field) {
5663
#define SHADOW_FIELD_RO(x, y) case x:
5664
#include "vmcs_shadow_fields.h"
5665
return true;
5666
default:
5667
break;
5668
}
5669
return false;
5670
}
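/*
 * Note on the construct above: vmcs_shadow_fields.h is an x-macro header.
 * Including it with SHADOW_FIELD_RW()/SHADOW_FIELD_RO() defined to expand
 * to "case x:" turns every shadowed field encoding into a case label, so
 * these helpers stay in sync with the shadow VMCS field lists without
 * duplicating them.
 */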
5671
5672
static int handle_vmwrite(struct kvm_vcpu *vcpu)
5673
{
5674
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5675
: get_vmcs12(vcpu);
5676
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5677
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5678
struct vcpu_vmx *vmx = to_vmx(vcpu);
5679
struct x86_exception e;
5680
unsigned long field;
5681
short offset;
5682
gva_t gva;
5683
int len, r;
5684
5685
/*
5686
* The value to write might be 32 or 64 bits, depending on L1's long
5687
* mode, and eventually we need to write that into a field of several
5688
* possible lengths. The code below first zero-extends the value to 64
5689
* bit (value), and then copies only the appropriate number of
5690
* bits into the vmcs12 field.
5691
*/
5692
u64 value = 0;
5693
5694
if (!nested_vmx_check_permission(vcpu))
5695
return 1;
5696
5697
/*
5698
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5699
* any VMWRITE sets the ALU flags for VMfailInvalid.
5700
*/
5701
if (vmx->nested.current_vmptr == INVALID_GPA ||
5702
(is_guest_mode(vcpu) &&
5703
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5704
return nested_vmx_failInvalid(vcpu);
5705
5706
if (instr_info & BIT(10))
5707
value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
5708
else {
5709
len = is_64_bit_mode(vcpu) ? 8 : 4;
5710
if (get_vmx_mem_address(vcpu, exit_qualification,
5711
instr_info, false, len, &gva))
5712
return 1;
5713
r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5714
if (r != X86EMUL_CONTINUE)
5715
return kvm_handle_memory_failure(vcpu, r, &e);
5716
}
5717
5718
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5719
5720
offset = get_vmcs12_field_offset(field);
5721
if (offset < 0)
5722
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5723
5724
/*
5725
* If the vCPU supports "VMWRITE to any supported field in the
5726
* VMCS," then the "read-only" fields are actually read/write.
5727
*/
5728
if (vmcs_field_readonly(field) &&
5729
!nested_cpu_has_vmwrite_any_field(vcpu))
5730
return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5731
5732
/*
5733
* Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5734
* vmcs12, else we may clobber a field or consume a stale value.
5735
*/
5736
if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5737
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5738
5739
/*
5740
* Some Intel CPUs intentionally drop the reserved bits of the AR byte
5741
* fields on VMWRITE. Emulate this behavior to ensure consistent KVM
5742
* behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5743
* field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5744
* from L1 will return a different value than VMREAD from L2 (L1 sees
5745
* the stripped down value, L2 sees the full value as stored by KVM).
5746
*/
5747
if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
5748
value &= 0x1f0ff;
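/*
 * For reference: 0x1f0ff keeps bits 7:0 (type, S, DPL, P) and bits 16:12
 * (AVL, L, D/B, G, unusable) of the access-rights value and clears the
 * reserved bits 11:8.
 */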
5749
5750
vmcs12_write_any(vmcs12, field, offset, value);
5751
5752
/*
5753
* Do not track vmcs12 dirty-state if in guest-mode as we actually
5754
* dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
5755
* by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5756
* "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
5757
*/
5758
if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5759
/*
5760
* L1 can read these fields without exiting, ensure the
5761
* shadow VMCS is up-to-date.
5762
*/
5763
if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5764
preempt_disable();
5765
vmcs_load(vmx->vmcs01.shadow_vmcs);
5766
5767
__vmcs_writel(field, value);
5768
5769
vmcs_clear(vmx->vmcs01.shadow_vmcs);
5770
vmcs_load(vmx->loaded_vmcs->vmcs);
5771
preempt_enable();
5772
}
5773
vmx->nested.dirty_vmcs12 = true;
5774
}
5775
5776
return nested_vmx_succeed(vcpu);
5777
}
5778
5779
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5780
{
5781
vmx->nested.current_vmptr = vmptr;
5782
if (enable_shadow_vmcs) {
5783
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5784
vmcs_write64(VMCS_LINK_POINTER,
5785
__pa(vmx->vmcs01.shadow_vmcs));
5786
vmx->nested.need_vmcs12_to_shadow_sync = true;
5787
}
5788
vmx->nested.dirty_vmcs12 = true;
5789
vmx->nested.force_msr_bitmap_recalc = true;
5790
}
5791
5792
/* Emulate the VMPTRLD instruction */
5793
static int handle_vmptrld(struct kvm_vcpu *vcpu)
5794
{
5795
struct vcpu_vmx *vmx = to_vmx(vcpu);
5796
gpa_t vmptr;
5797
int r;
5798
5799
if (!nested_vmx_check_permission(vcpu))
5800
return 1;
5801
5802
if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5803
return r;
5804
5805
if (!page_address_valid(vcpu, vmptr))
5806
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5807
5808
if (vmptr == vmx->nested.vmxon_ptr)
5809
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
5810
5811
/* Forbid normal VMPTRLD if Enlightened version was used */
5812
if (nested_vmx_is_evmptr12_valid(vmx))
5813
return 1;
5814
5815
if (vmx->nested.current_vmptr != vmptr) {
5816
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
5817
struct vmcs_hdr hdr;
5818
5819
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
5820
/*
5821
* Reads from an unbacked page return all 1s,
5822
* which means that the 32 bits located at the
5823
* given physical address won't match the required
5824
* VMCS12_REVISION identifier.
5825
*/
5826
return nested_vmx_fail(vcpu,
5827
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5828
}
5829
5830
if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
5831
offsetof(struct vmcs12, hdr),
5832
sizeof(hdr))) {
5833
return nested_vmx_fail(vcpu,
5834
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5835
}
5836
5837
if (hdr.revision_id != VMCS12_REVISION ||
5838
(hdr.shadow_vmcs &&
5839
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5840
return nested_vmx_fail(vcpu,
5841
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5842
}
5843
5844
nested_release_vmcs12(vcpu);
5845
5846
/*
5847
* Load VMCS12 from guest memory since it is not already
5848
* cached.
5849
*/
5850
if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
5851
VMCS12_SIZE)) {
5852
return nested_vmx_fail(vcpu,
5853
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5854
}
5855
5856
set_current_vmptr(vmx, vmptr);
5857
}
5858
5859
return nested_vmx_succeed(vcpu);
5860
}
5861
5862
/* Emulate the VMPTRST instruction */
5863
static int handle_vmptrst(struct kvm_vcpu *vcpu)
5864
{
5865
unsigned long exit_qual = vmx_get_exit_qual(vcpu);
5866
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5867
gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5868
struct x86_exception e;
5869
gva_t gva;
5870
int r;
5871
5872
if (!nested_vmx_check_permission(vcpu))
5873
return 1;
5874
5875
if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu))))
5876
return 1;
5877
5878
if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5879
true, sizeof(gpa_t), &gva))
5880
return 1;
5881
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5882
r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5883
sizeof(gpa_t), &e);
5884
if (r != X86EMUL_CONTINUE)
5885
return kvm_handle_memory_failure(vcpu, r, &e);
5886
5887
return nested_vmx_succeed(vcpu);
5888
}
5889
5890
/* Emulate the INVEPT instruction */
5891
static int handle_invept(struct kvm_vcpu *vcpu)
5892
{
5893
struct vcpu_vmx *vmx = to_vmx(vcpu);
5894
u32 vmx_instruction_info, types;
5895
unsigned long type, roots_to_free;
5896
struct kvm_mmu *mmu;
5897
gva_t gva;
5898
struct x86_exception e;
5899
struct {
5900
u64 eptp, gpa;
5901
} operand;
5902
int i, r, gpr_index;
5903
5904
if (!(vmx->nested.msrs.secondary_ctls_high &
5905
SECONDARY_EXEC_ENABLE_EPT) ||
5906
!(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5907
kvm_queue_exception(vcpu, UD_VECTOR);
5908
return 1;
5909
}
5910
5911
if (!nested_vmx_check_permission(vcpu))
5912
return 1;
5913
5914
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5915
gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5916
type = kvm_register_read(vcpu, gpr_index);
5917
5918
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
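/*
 * types is a bitmask indexed by INVEPT type: bit 1 = single-context,
 * bit 2 = all-context; the "& 6" drops everything else so that the
 * (1 << type) test below only accepts advertised extents.
 */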
5919
5920
if (type >= 32 || !(types & (1 << type)))
5921
return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5922
5923
/* According to the Intel VMX instruction reference, the memory
5924
* operand is read even if it isn't needed (e.g., for type==global)
5925
*/
5926
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5927
vmx_instruction_info, false, sizeof(operand), &gva))
5928
return 1;
5929
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5930
if (r != X86EMUL_CONTINUE)
5931
return kvm_handle_memory_failure(vcpu, r, &e);
5932
5933
/*
5934
* Nested EPT roots are always held through guest_mmu,
5935
* not root_mmu.
5936
*/
5937
mmu = &vcpu->arch.guest_mmu;
5938
5939
switch (type) {
5940
case VMX_EPT_EXTENT_CONTEXT:
5941
if (!nested_vmx_check_eptp(vcpu, operand.eptp))
5942
return nested_vmx_fail(vcpu,
5943
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5944
5945
roots_to_free = 0;
5946
if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
5947
operand.eptp))
5948
roots_to_free |= KVM_MMU_ROOT_CURRENT;
5949
5950
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5951
if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
5952
mmu->prev_roots[i].pgd,
5953
operand.eptp))
5954
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5955
}
5956
break;
5957
case VMX_EPT_EXTENT_GLOBAL:
5958
roots_to_free = KVM_MMU_ROOTS_ALL;
5959
break;
5960
default:
5961
BUG();
5962
break;
5963
}
5964
5965
if (roots_to_free)
5966
kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
5967
5968
return nested_vmx_succeed(vcpu);
5969
}
5970
5971
static int handle_invvpid(struct kvm_vcpu *vcpu)
5972
{
5973
struct vcpu_vmx *vmx = to_vmx(vcpu);
5974
u32 vmx_instruction_info;
5975
unsigned long type, types;
5976
gva_t gva;
5977
struct x86_exception e;
5978
struct {
5979
u64 vpid;
5980
u64 gla;
5981
} operand;
5982
u16 vpid02;
5983
int r, gpr_index;
5984
5985
if (!(vmx->nested.msrs.secondary_ctls_high &
5986
SECONDARY_EXEC_ENABLE_VPID) ||
5987
!(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5988
kvm_queue_exception(vcpu, UD_VECTOR);
5989
return 1;
5990
}
5991
5992
if (!nested_vmx_check_permission(vcpu))
5993
return 1;
5994
5995
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5996
gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5997
type = kvm_register_read(vcpu, gpr_index);
5998
5999
types = (vmx->nested.msrs.vpid_caps &
6000
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
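/*
 * As with INVEPT, types is a bitmask indexed by INVVPID type (0 =
 * individual-address, 1 = single-context, 2 = all-context, 3 =
 * single-context-retaining-globals); the ">> 8" aligns the VPID
 * capability bits with those type numbers.
 */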
6001
6002
if (type >= 32 || !(types & (1 << type)))
6003
return nested_vmx_fail(vcpu,
6004
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6005
6006
/* According to the Intel VMX instruction reference, the memory
6007
* operand is read even if it isn't needed (e.g., for type==global)
6008
*/
6009
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
6010
vmx_instruction_info, false, sizeof(operand), &gva))
6011
return 1;
6012
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
6013
if (r != X86EMUL_CONTINUE)
6014
return kvm_handle_memory_failure(vcpu, r, &e);
6015
6016
if (operand.vpid >> 16)
6017
return nested_vmx_fail(vcpu,
6018
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6019
6020
/*
6021
* Always flush the effective vpid02, i.e. never flush the current VPID
6022
* and never explicitly flush vpid01. INVVPID targets a VPID, not a
6023
* VMCS, and so whether or not the current vmcs12 has VPID enabled is
6024
* irrelevant (and there may not be a loaded vmcs12).
6025
*/
6026
vpid02 = nested_get_vpid02(vcpu);
6027
switch (type) {
6028
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
6029
/*
6030
* LAM doesn't apply to addresses that are inputs to TLB
6031
* invalidation.
6032
*/
6033
if (!operand.vpid ||
6034
is_noncanonical_invlpg_address(operand.gla, vcpu))
6035
return nested_vmx_fail(vcpu,
6036
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6037
vpid_sync_vcpu_addr(vpid02, operand.gla);
6038
break;
6039
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
6040
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
6041
if (!operand.vpid)
6042
return nested_vmx_fail(vcpu,
6043
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6044
vpid_sync_context(vpid02);
6045
break;
6046
case VMX_VPID_EXTENT_ALL_CONTEXT:
6047
vpid_sync_context(vpid02);
6048
break;
6049
default:
6050
WARN_ON_ONCE(1);
6051
return kvm_skip_emulated_instruction(vcpu);
6052
}
6053
6054
/*
6055
* Sync the shadow page tables if EPT is disabled, L1 is invalidating
6056
* linear mappings for L2 (tagged with L2's VPID). Free all guest
6057
* roots as VPIDs are not tracked in the MMU role.
6058
*
6059
* Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
6060
* an MMU when EPT is disabled.
6061
*
6062
* TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
6063
*/
6064
if (!enable_ept)
6065
kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
6066
6067
return nested_vmx_succeed(vcpu);
6068
}
6069
6070
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
6071
struct vmcs12 *vmcs12)
6072
{
6073
u32 index = kvm_rcx_read(vcpu);
6074
u64 new_eptp;
6075
6076
if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
6077
return 1;
6078
if (index >= VMFUNC_EPTP_ENTRIES)
6079
return 1;
6080
6081
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
6082
&new_eptp, index * 8, 8))
6083
return 1;
6084
6085
/*
6086
* If the (L2) guest does a vmfunc to the currently
6087
* active ept pointer, we don't have to do anything else
6088
*/
6089
if (vmcs12->ept_pointer != new_eptp) {
6090
if (!nested_vmx_check_eptp(vcpu, new_eptp))
6091
return 1;
6092
6093
vmcs12->ept_pointer = new_eptp;
6094
nested_ept_new_eptp(vcpu);
6095
6096
if (!nested_cpu_has_vpid(vmcs12))
6097
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
6098
}
6099
6100
return 0;
6101
}
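/*
 * Note on the EPTP list above: vmcs12->eptp_list_address names a 4 KiB
 * page holding up to VMFUNC_EPTP_ENTRIES (512) eight-byte EPTP values,
 * indexed by ECX; a vmfunc that selects the already-active EPTP skips
 * the update entirely.
 */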
6102
6103
static int handle_vmfunc(struct kvm_vcpu *vcpu)
6104
{
6105
struct vcpu_vmx *vmx = to_vmx(vcpu);
6106
struct vmcs12 *vmcs12;
6107
u32 function = kvm_rax_read(vcpu);
6108
6109
/*
6110
* VMFUNC should never execute cleanly while L1 is active; KVM supports
6111
* VMFUNC for nested VMs, but not for L1.
6112
*/
6113
if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
6114
kvm_queue_exception(vcpu, UD_VECTOR);
6115
return 1;
6116
}
6117
6118
vmcs12 = get_vmcs12(vcpu);
6119
6120
/*
6121
* #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
6122
* is enabled in vmcs02 if and only if it's enabled in vmcs12.
6123
*/
6124
if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
6125
kvm_queue_exception(vcpu, UD_VECTOR);
6126
return 1;
6127
}
6128
6129
if (!(vmcs12->vm_function_control & BIT_ULL(function)))
6130
goto fail;
6131
6132
switch (function) {
6133
case 0:
6134
if (nested_vmx_eptp_switching(vcpu, vmcs12))
6135
goto fail;
6136
break;
6137
default:
6138
goto fail;
6139
}
6140
return kvm_skip_emulated_instruction(vcpu);
6141
6142
fail:
6143
/*
6144
* This is effectively a reflected VM-Exit, as opposed to a synthesized
6145
* nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
6146
* EXIT_REASON_VMFUNC as the exit reason.
6147
*/
6148
nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full,
6149
vmx_get_intr_info(vcpu),
6150
vmx_get_exit_qual(vcpu));
6151
return 1;
6152
}
6153
6154
/*
6155
* Return true if an IO instruction with the specified port and size should cause
6156
* a VM-exit into L1.
6157
*/
6158
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
6159
int size)
6160
{
6161
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6162
gpa_t bitmap, last_bitmap;
6163
u8 b;
6164
6165
last_bitmap = INVALID_GPA;
6166
b = -1;
6167
6168
while (size > 0) {
6169
if (port < 0x8000)
6170
bitmap = vmcs12->io_bitmap_a;
6171
else if (port < 0x10000)
6172
bitmap = vmcs12->io_bitmap_b;
6173
else
6174
return true;
6175
bitmap += (port & 0x7fff) / 8;
6176
6177
if (last_bitmap != bitmap)
6178
if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
6179
return true;
6180
if (b & (1 << (port & 7)))
6181
return true;
6182
6183
port++;
6184
size--;
6185
last_bitmap = bitmap;
6186
}
6187
6188
return false;
6189
}
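/*
 * Worked example for the lookup above (the access is hypothetical): a
 * one-byte access to port 0x3f8 consults io_bitmap_a (port < 0x8000),
 * byte 0x3f8 / 8 = 127, bit 0x3f8 & 7 = 0.  Multi-byte accesses are
 * checked port by port, so an access straddling 0x7fff/0x8000 consults
 * both bitmaps.
 */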
6190
6191
static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
6192
struct vmcs12 *vmcs12)
6193
{
6194
unsigned long exit_qualification;
6195
unsigned short port;
6196
int size;
6197
6198
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
6199
return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
6200
6201
exit_qualification = vmx_get_exit_qual(vcpu);
6202
6203
port = exit_qualification >> 16;
6204
size = (exit_qualification & 7) + 1;
6205
6206
return nested_vmx_check_io_bitmaps(vcpu, port, size);
6207
}
6208
6209
/*
6210
* Return 1 if we should exit from L2 to L1 to handle an MSR access,
6211
* rather than handle it ourselves in L0. I.e., check whether L1 expressed
6212
* disinterest in the current event (read or write a specific MSR) by using an
6213
* MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
6214
*/
6215
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
6216
struct vmcs12 *vmcs12,
6217
union vmx_exit_reason exit_reason)
6218
{
6219
u32 msr_index = kvm_rcx_read(vcpu);
6220
gpa_t bitmap;
6221
6222
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
6223
return true;
6224
6225
/*
6226
* The MSR_BITMAP page is divided into four 1024-byte bitmaps,
6227
* for the four combinations of read/write and low/high MSR numbers.
6228
* First we need to figure out which of the four to use:
6229
*/
6230
bitmap = vmcs12->msr_bitmap;
6231
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6232
bitmap += 2048;
6233
if (msr_index >= 0xc0000000) {
6234
msr_index -= 0xc0000000;
6235
bitmap += 1024;
6236
}
6237
6238
/* Then read the msr_index'th bit from this bitmap: */
6239
if (msr_index < 1024*8) {
6240
unsigned char b;
6241
if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
6242
return true;
6243
return 1 & (b >> (msr_index & 7));
6244
} else
6245
return true; /* let L1 handle the wrong parameter */
6246
}
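/*
 * Worked example for the bitmap walk above (the access is hypothetical):
 * a WRMSR of MSR_EFER (0xc0000080) selects the write half (+2048) and
 * the high-MSR quarter (+1024), so the byte consulted lives at offset
 * 3072 + 0x80 / 8 = 3088 of the vmcs12 MSR bitmap, bit 0; a set bit
 * means L1 wants the exit.
 */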
6247
6248
/*
6249
* Return 1 if we should exit from L2 to L1 to handle a CR access exit,
6250
* rather than handle it ourselves in L0. I.e., check if L1 wanted to
6251
* intercept (via guest_host_mask etc.) the current event.
6252
*/
6253
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
6254
struct vmcs12 *vmcs12)
6255
{
6256
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
6257
int cr = exit_qualification & 15;
6258
int reg;
6259
unsigned long val;
6260
6261
switch ((exit_qualification >> 4) & 3) {
6262
case 0: /* mov to cr */
6263
reg = (exit_qualification >> 8) & 15;
6264
val = kvm_register_read(vcpu, reg);
6265
switch (cr) {
6266
case 0:
6267
if (vmcs12->cr0_guest_host_mask &
6268
(val ^ vmcs12->cr0_read_shadow))
6269
return true;
6270
break;
6271
case 3:
6272
if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
6273
return true;
6274
break;
6275
case 4:
6276
if (vmcs12->cr4_guest_host_mask &
6277
(vmcs12->cr4_read_shadow ^ val))
6278
return true;
6279
break;
6280
case 8:
6281
if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
6282
return true;
6283
break;
6284
}
6285
break;
6286
case 2: /* clts */
6287
if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
6288
(vmcs12->cr0_read_shadow & X86_CR0_TS))
6289
return true;
6290
break;
6291
case 1: /* mov from cr */
6292
switch (cr) {
6293
case 3:
6294
if (vmcs12->cpu_based_vm_exec_control &
6295
CPU_BASED_CR3_STORE_EXITING)
6296
return true;
6297
break;
6298
case 8:
6299
if (vmcs12->cpu_based_vm_exec_control &
6300
CPU_BASED_CR8_STORE_EXITING)
6301
return true;
6302
break;
6303
}
6304
break;
6305
case 3: /* lmsw */
6306
/*
6307
* lmsw can change bits 1..3 of cr0, and only set bit 0 of
6308
* cr0. Other attempted changes are ignored, with no exit.
6309
*/
6310
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
6311
if (vmcs12->cr0_guest_host_mask & 0xe &
6312
(val ^ vmcs12->cr0_read_shadow))
6313
return true;
6314
if ((vmcs12->cr0_guest_host_mask & 0x1) &&
6315
!(vmcs12->cr0_read_shadow & 0x1) &&
6316
(val & 0x1))
6317
return true;
6318
break;
6319
}
6320
return false;
6321
}
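/*
 * Illustration for the lmsw case above: lmsw only conveys CR0 bits 3:0
 * (PE/MP/EM/TS).  The "& 0xe" term catches changes to MP/EM/TS that L1
 * intercepts via cr0_guest_host_mask, while the second test catches an
 * attempt to set PE when the read shadow has PE=0 (lmsw can set but
 * never clear PE).  E.g. if L1 owns CR0.TS and L2's lmsw source flips TS
 * relative to cr0_read_shadow, the exit is reflected to L1.
 */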
6322
6323
static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
6324
struct vmcs12 *vmcs12)
6325
{
6326
u32 encls_leaf;
6327
6328
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
6329
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
6330
return false;
6331
6332
encls_leaf = kvm_rax_read(vcpu);
6333
if (encls_leaf > 62)
6334
encls_leaf = 63;
6335
return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
6336
}
6337
6338
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
6339
struct vmcs12 *vmcs12, gpa_t bitmap)
6340
{
6341
u32 vmx_instruction_info;
6342
unsigned long field;
6343
u8 b;
6344
6345
if (!nested_cpu_has_shadow_vmcs(vmcs12))
6346
return true;
6347
6348
/* Decode instruction info and find the field to access */
6349
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6350
field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
6351
6352
/* Out-of-range fields always cause a VM exit from L2 to L1 */
6353
if (field >> 15)
6354
return true;
6355
6356
if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
6357
return true;
6358
6359
return 1 & (b >> (field & 7));
6360
}
6361
6362
static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
6363
{
6364
u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
6365
6366
if (nested_cpu_has_mtf(vmcs12))
6367
return true;
6368
6369
/*
6370
* An MTF VM-exit may be injected into the guest by setting the
6371
* interruption-type to 7 (other event) and the vector field to 0. Such
6372
* is the case regardless of the 'monitor trap flag' VM-execution
6373
* control.
6374
*/
6375
return entry_intr_info == (INTR_INFO_VALID_MASK
6376
| INTR_TYPE_OTHER_EVENT);
6377
}
6378
6379
/*
6380
* Return true if L0 wants to handle an exit from L2 regardless of whether or not
6381
* L1 wants the exit. Only call this when in is_guest_mode (L2).
6382
*/
6383
static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
6384
union vmx_exit_reason exit_reason)
6385
{
6386
u32 intr_info;
6387
6388
switch ((u16)exit_reason.basic) {
6389
case EXIT_REASON_EXCEPTION_NMI:
6390
intr_info = vmx_get_intr_info(vcpu);
6391
if (is_nmi(intr_info))
6392
return true;
6393
else if (is_page_fault(intr_info))
6394
return vcpu->arch.apf.host_apf_flags ||
6395
vmx_need_pf_intercept(vcpu);
6396
else if (is_debug(intr_info) &&
6397
vcpu->guest_debug &
6398
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
6399
return true;
6400
else if (is_breakpoint(intr_info) &&
6401
vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
6402
return true;
6403
else if (is_alignment_check(intr_info) &&
6404
!vmx_guest_inject_ac(vcpu))
6405
return true;
6406
else if (is_ve_fault(intr_info))
6407
return true;
6408
return false;
6409
case EXIT_REASON_EXTERNAL_INTERRUPT:
6410
return true;
6411
case EXIT_REASON_MCE_DURING_VMENTRY:
6412
return true;
6413
case EXIT_REASON_EPT_VIOLATION:
6414
/*
6415
* L0 always deals with the EPT violation. If nested EPT is
6416
* used, and the nested mmu code discovers that the address is
6417
* missing in the guest EPT table (EPT12), the EPT violation
6418
* will be injected with nested_ept_inject_page_fault()
6419
*/
6420
return true;
6421
case EXIT_REASON_EPT_MISCONFIG:
6422
/*
6423
* L2 never directly uses L1's EPT, but rather L0's own EPT
6424
* table (shadow on EPT) or a merged EPT table that L0 built
6425
* (EPT on EPT). So any problems with the structure of the
6426
* table are L0's fault.
6427
*/
6428
return true;
6429
case EXIT_REASON_PREEMPTION_TIMER:
6430
return true;
6431
case EXIT_REASON_PML_FULL:
6432
/*
6433
* PML is emulated for an L1 VMM and should never be enabled in
6434
* vmcs02, always "handle" PML_FULL by exiting to userspace.
6435
*/
6436
return true;
6437
case EXIT_REASON_VMFUNC:
6438
/* VM functions are emulated through L2->L0 vmexits. */
6439
return true;
6440
	case EXIT_REASON_BUS_LOCK:
		/*
		 * At present, bus lock VM exit is never exposed to L1.
		 * Handle L2's bus locks in L0 directly.
		 */
		return true;
#ifdef CONFIG_KVM_HYPERV
	case EXIT_REASON_VMCALL:
		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
		return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
			nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
			kvm_hv_is_tlb_flush_hcall(vcpu);
#endif
	default:
		break;
	}
	return false;
}

/*
 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in
 * is_guest_mode (L2).
 */
static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return true;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return nested_exit_on_intr(vcpu);
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_INTERRUPT_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_vmx_exit_handled_mtf(vmcs12);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
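	/*
	 * Per the SDM (as best understood here), INVPCID #UDs when "enable
	 * INVPCID" is clear and otherwise shares the "INVLPG exiting"
	 * control, hence the INVPCID case below checks both vmcs12 controls.
	 */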
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value---neither in L1 nor in L2.
		 * If it were, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	case EXIT_REASON_ENCLS:
		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
	case EXIT_REASON_NOTIFY:
		/* Notify VM exit is not exposed to L1 */
		return false;
	default:
		return true;
	}
}

/*
 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
	 */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		exit_intr_info = 0;
		exit_qual = 0;
		goto reflect_vmexit;
	}

	trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;

	/* If L1 doesn't want the exit, handle it in L0. */
	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;

	/*
	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
	exit_intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
	return true;
}

static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = INVALID_GPA,
		.hdr.vmx.vmcs12_pa = INVALID_GPA,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

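	/*
	 * A NULL vCPU appears to be the "report required buffer size" path
	 * (e.g. the KVM_CAP_NESTED_STATE capability query); there is no
	 * state to inspect yet, so only the maximum size is returned.
	 */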
	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (nested_vmx_is_evmptr12_set(vmx))
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != INVALID_GPA)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
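	/*
	 * In short: L2 active -> sync out of vmcs02; L1 active with
	 * need_vmcs12_to_shadow_sync set -> vmcs12 already holds the
	 * authoritative state; L1 active otherwise -> copy it back from
	 * the enlightened or shadow VMCS.
	 */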
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (nested_vmx_is_evmptr12_valid(vmx))
				/*
				 * L1 hypervisor is not obliged to keep eVMCS
				 * clean fields data always up-to-date while
				 * not in guest mode, 'hv_clean_fields' is only
				 * supposed to be actual upon vmentry so we need
				 * to ignore it here and do full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}

void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable eVMCS capability on vCPU. However, since then
		 * code was changed such that flag signals vmcs12 should
		 * be copied into eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow user
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet. EVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
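/*
 * Illustrative sketch: undoing a rotate-left-by-6 of a 16-bit value is a
 * rotate-right-by-6, e.g. for an arbitrary example value of 0x2800:
 *
 *	u16 enc = 0x2800;
 *	u16 idx = (u16)((enc << 6) | (enc >> 10));	(i.e. rotl16(enc, 6))
 *	WARN_ON(VMCS12_IDX_TO_ENC(idx) != enc);
 */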

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1. Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support,
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context. The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
						 X86_MEMTYPE_WB);
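	/*
	 * Layout note (per the SDM): IA32_VMX_BASIC carries the VMCS revision
	 * identifier in bits 30:0, the VMCS region size in bits 44:32 and the
	 * memory type in bits 53:50, which is what the helper above packs
	 * from VMCS12_REVISION, VMCS12_SIZE and X86_MEMTYPE_WB.
	 */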

	msrs->basic |= VMX_BASIC_TRUE_CTLS;
	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
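	/*
	 * Taken together (restating the two comments above), a CR0/CR4 value
	 * is legal while in VMX operation only if every bit set in the
	 * corresponding FIXED0 value is also set and every bit clear in the
	 * FIXED1 value is also clear, i.e. roughly:
	 *
	 *	(cr & fixed0) == fixed0 && (cr & ~fixed1) == 0
	 */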

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. This
	 * is safe because if one of these bits is necessary, it will appear
	 * in vmcs01, and prepare_vmcs02, which bitwise-OR's the control
	 * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
	 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */
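	/*
	 * Put differently (an illustrative restatement; see also
	 * vmx_control_verify()): a 32-bit control value "val" taken from
	 * vmcs12 is acceptable only if
	 *
	 *	(val & low) == low && (val & ~high) == 0
	 *
	 * i.e. every required bit is set and nothing unsupported is set.
	 */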
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};