GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/amd64/vmm/intel/vmx.c
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 2011 NetApp, Inc.
5
* All rights reserved.
6
* Copyright (c) 2018 Joyent, Inc.
7
*
8
* Redistribution and use in source and binary forms, with or without
9
* modification, are permitted provided that the following conditions
10
* are met:
11
* 1. Redistributions of source code must retain the above copyright
12
* notice, this list of conditions and the following disclaimer.
13
* 2. Redistributions in binary form must reproduce the above copyright
14
* notice, this list of conditions and the following disclaimer in the
15
* documentation and/or other materials provided with the distribution.
16
*
17
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
18
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
21
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27
* SUCH DAMAGE.
28
*/
29
30
#include <sys/cdefs.h>
31
#include "opt_bhyve_snapshot.h"
32
33
#include <sys/param.h>
34
#include <sys/systm.h>
35
#include <sys/smp.h>
36
#include <sys/kernel.h>
37
#include <sys/malloc.h>
38
#include <sys/pcpu.h>
39
#include <sys/proc.h>
40
#include <sys/reg.h>
41
#include <sys/smr.h>
42
#include <sys/sysctl.h>
43
44
#include <vm/vm.h>
45
#include <vm/vm_extern.h>
46
#include <vm/pmap.h>
47
48
#include <machine/psl.h>
49
#include <machine/cpufunc.h>
50
#include <machine/md_var.h>
51
#include <machine/segments.h>
52
#include <machine/smp.h>
53
#include <machine/specialreg.h>
54
#include <machine/vmparam.h>
55
56
#include <machine/vmm.h>
57
#include <machine/vmm_dev.h>
58
#include <machine/vmm_instruction_emul.h>
59
#include <machine/vmm_snapshot.h>
60
61
#include <dev/vmm/vmm_ktr.h>
62
#include <dev/vmm/vmm_mem.h>
63
64
#include "vmm_lapic.h"
65
#include "vmm_host.h"
66
#include "vmm_ioport.h"
67
#include "vmm_stat.h"
68
#include "vatpic.h"
69
#include "vlapic.h"
70
#include "vlapic_priv.h"
71
72
#include "ept.h"
73
#include "vmx_cpufunc.h"
74
#include "vmx.h"
75
#include "vmx_msr.h"
76
#include "x86.h"
77
#include "vmx_controls.h"
78
#include "io/ppt.h"
79
80
#define PINBASED_CTLS_ONE_SETTING \
81
(PINBASED_EXTINT_EXITING | \
82
PINBASED_NMI_EXITING | \
83
PINBASED_VIRTUAL_NMI)
84
#define PINBASED_CTLS_ZERO_SETTING 0
85
86
#define PROCBASED_CTLS_WINDOW_SETTING \
87
(PROCBASED_INT_WINDOW_EXITING | \
88
PROCBASED_NMI_WINDOW_EXITING)
89
90
#define PROCBASED_CTLS_ONE_SETTING \
91
(PROCBASED_SECONDARY_CONTROLS | \
92
PROCBASED_MWAIT_EXITING | \
93
PROCBASED_MONITOR_EXITING | \
94
PROCBASED_IO_EXITING | \
95
PROCBASED_MSR_BITMAPS | \
96
PROCBASED_CTLS_WINDOW_SETTING | \
97
PROCBASED_CR8_LOAD_EXITING | \
98
PROCBASED_CR8_STORE_EXITING)
99
#define PROCBASED_CTLS_ZERO_SETTING \
100
(PROCBASED_CR3_LOAD_EXITING | \
101
PROCBASED_CR3_STORE_EXITING | \
102
PROCBASED_IO_BITMAPS)
103
104
#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
105
#define PROCBASED_CTLS2_ZERO_SETTING 0
106
107
#define VM_EXIT_CTLS_ONE_SETTING \
108
(VM_EXIT_SAVE_DEBUG_CONTROLS | \
109
VM_EXIT_HOST_LMA | \
110
VM_EXIT_SAVE_EFER | \
111
VM_EXIT_LOAD_EFER | \
112
VM_EXIT_ACKNOWLEDGE_INTERRUPT)
113
114
#define VM_EXIT_CTLS_ZERO_SETTING 0
115
116
#define VM_ENTRY_CTLS_ONE_SETTING \
117
(VM_ENTRY_LOAD_DEBUG_CONTROLS | \
118
VM_ENTRY_LOAD_EFER)
119
120
#define VM_ENTRY_CTLS_ZERO_SETTING \
121
(VM_ENTRY_INTO_SMM | \
122
VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
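/*
 * The ONE_SETTING and ZERO_SETTING macros above list the control bits this
 * driver insists on having set or clear.  vmx_set_ctlreg() (see vmx_msr.c)
 * validates them against the VMX capability MSRs; roughly, for a control MSR
 * such as MSR_VMX_TRUE_PROCBASED_CTLS the low 32 bits are the bits the CPU
 * forces to 1 and the high 32 bits are the bits it allows to be 1.  A sketch
 * of the check (illustrative only, not the actual implementation):
 *
 *	cap = rdmsr(MSR_VMX_TRUE_PROCBASED_CTLS);
 *	must_be_one = (uint32_t)cap;
 *	may_be_one  = (uint32_t)(cap >> 32);
 *	ok = (ones_mask & ~may_be_one) == 0 &&	(requested 1s are allowed)
 *	     (zeros_mask & must_be_one) == 0;	(requested 0s are not forced on)
 */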
123
124
#define HANDLED 1
125
#define UNHANDLED 0
126
127
static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
128
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
129
130
bool vmx_have_msr_tsc_aux;
131
132
SYSCTL_DECL(_hw_vmm);
133
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
134
NULL);
135
136
int vmxon_enabled[MAXCPU];
137
static uint8_t *vmxon_region;
138
139
static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
140
static uint32_t exit_ctls, entry_ctls;
141
142
static uint64_t cr0_ones_mask, cr0_zeros_mask;
143
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
144
&cr0_ones_mask, 0, NULL);
145
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
146
&cr0_zeros_mask, 0, NULL);
147
148
static uint64_t cr4_ones_mask, cr4_zeros_mask;
149
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
150
&cr4_ones_mask, 0, NULL);
151
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
152
&cr4_zeros_mask, 0, NULL);
153
154
static int vmx_initialized;
155
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
156
&vmx_initialized, 0, "Intel VMX initialized");
157
158
/*
159
* Optional capabilities
160
*/
161
static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap,
162
CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
163
NULL);
164
165
static int cap_halt_exit;
166
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
167
"HLT triggers a VM-exit");
168
169
static int cap_pause_exit;
170
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
171
0, "PAUSE triggers a VM-exit");
172
173
static int cap_wbinvd_exit;
174
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, wbinvd_exit, CTLFLAG_RD, &cap_wbinvd_exit,
175
0, "WBINVD triggers a VM-exit");
176
177
static int cap_rdpid;
178
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdpid, CTLFLAG_RD, &cap_rdpid, 0,
179
"Guests are allowed to use RDPID");
180
181
static int cap_rdtscp;
182
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdtscp, CTLFLAG_RD, &cap_rdtscp, 0,
183
"Guests are allowed to use RDTSCP");
184
185
static int cap_unrestricted_guest;
186
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
187
&cap_unrestricted_guest, 0, "Unrestricted guests");
188
189
static int cap_monitor_trap;
190
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
191
&cap_monitor_trap, 0, "Monitor trap flag");
192
193
static int cap_invpcid;
194
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
195
0, "Guests are allowed to use INVPCID");
196
197
static int tpr_shadowing;
198
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing,
199
CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
200
&tpr_shadowing, 0, "TPR shadowing support");
201
202
static int virtual_interrupt_delivery;
203
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery,
204
CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
205
&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
206
207
static int posted_interrupts;
208
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts,
209
CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
210
&posted_interrupts, 0, "APICv posted interrupt support");
211
212
static int pirvec = -1;
213
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
214
&pirvec, 0, "APICv posted interrupt vector");
215
216
static struct unrhdr *vpid_unr;
217
static u_int vpid_alloc_failed;
218
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
219
&vpid_alloc_failed, 0, NULL);
220
221
int guest_l1d_flush;
222
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
223
&guest_l1d_flush, 0, NULL);
224
int guest_l1d_flush_sw;
225
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
226
&guest_l1d_flush_sw, 0, NULL);
227
228
static struct msr_entry msr_load_list[1] __aligned(16);
229
230
/*
231
* The definitions of SDT probes for VMX.
232
*/
233
234
SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
235
"struct vmx *", "int", "struct vm_exit *");
236
237
SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
238
"struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");
239
240
SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
241
"struct vmx *", "int", "struct vm_exit *", "uint64_t");
242
243
SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
244
"struct vmx *", "int", "struct vm_exit *", "uint32_t");
245
246
SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
247
"struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");
248
249
SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
250
"struct vmx *", "int", "struct vm_exit *");
251
252
SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
253
"struct vmx *", "int", "struct vm_exit *");
254
255
SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
256
"struct vmx *", "int", "struct vm_exit *");
257
258
SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
259
"struct vmx *", "int", "struct vm_exit *");
260
261
SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
262
"struct vmx *", "int", "struct vm_exit *", "uint32_t");
263
264
SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
265
"struct vmx *", "int", "struct vm_exit *");
266
267
SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
268
"struct vmx *", "int", "struct vm_exit *");
269
270
SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
271
"struct vmx *", "int", "struct vm_exit *");
272
273
SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
274
"struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");
275
276
SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
277
"struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");
278
279
SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
280
"struct vmx *", "int", "struct vm_exit *", "uint64_t");
281
282
SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
283
"struct vmx *", "int", "struct vm_exit *");
284
285
SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
286
"struct vmx *", "int", "struct vm_exit *");
287
288
SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
289
"struct vmx *", "int", "struct vm_exit *", "struct vlapic *");
290
291
SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
292
"struct vmx *", "int", "struct vm_exit *");
293
294
SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
295
"struct vmx *", "int", "struct vm_exit *");
296
297
SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
298
"struct vmx *", "int", "struct vm_exit *");
299
300
SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
301
"struct vmx *", "int", "struct vm_exit *");
302
303
SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
304
"struct vmx *", "int", "struct vm_exit *", "uint32_t");
305
306
SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
307
"struct vmx *", "int", "struct vm_exit *", "int");
308
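/*
 * These probes can be inspected from userland with dtrace(1).  As a rough
 * example (the exact probe spelling can be confirmed with "dtrace -l -P vmm"),
 * counting guest RDMSR exits by MSR number might look like:
 *
 *	dtrace -n 'vmm:vmx:exit:rdmsr { @[arg3] = count(); }'
 *
 * where arg3 corresponds to the trailing "uint32_t" argument in the probe
 * definition above.
 */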
309
/*
310
* Use the last page below 4GB as the APIC access address. This address is
311
* occupied by the boot firmware so it is guaranteed that it will not conflict
312
* with a page in system memory.
313
*/
314
#define APIC_ACCESS_ADDRESS 0xFFFFF000
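/*
 * When virtual interrupt delivery is enabled, vmx_init() maps the guest
 * physical range at DEFAULT_APIC_BASE onto this address with vm_map_mmio()
 * and vmx_vcpu_init() programs it into VMCS_APIC_ACCESS, so guest accesses
 * to the local APIC page are recognized by the CPU instead of hitting
 * ordinary memory.
 */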
315
316
static int vmx_getdesc(void *vcpui, int reg, struct seg_desc *desc);
317
static int vmx_getreg(void *vcpui, int reg, uint64_t *retval);
318
static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
319
static void vmx_inject_pir(struct vlapic *vlapic);
320
#ifdef BHYVE_SNAPSHOT
321
static int vmx_restore_tsc(void *vcpui, uint64_t now);
322
#endif
323
324
static inline bool
325
host_has_rdpid(void)
326
{
327
return ((cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0);
328
}
329
330
static inline bool
331
host_has_rdtscp(void)
332
{
333
return ((amd_feature & AMDID_RDTSCP) != 0);
334
}
335
336
#ifdef KTR
337
static const char *
338
exit_reason_to_str(int reason)
339
{
340
static char reasonbuf[32];
341
342
switch (reason) {
343
case EXIT_REASON_EXCEPTION:
344
return "exception";
345
case EXIT_REASON_EXT_INTR:
346
return "extint";
347
case EXIT_REASON_TRIPLE_FAULT:
348
return "triplefault";
349
case EXIT_REASON_INIT:
350
return "init";
351
case EXIT_REASON_SIPI:
352
return "sipi";
353
case EXIT_REASON_IO_SMI:
354
return "iosmi";
355
case EXIT_REASON_SMI:
356
return "smi";
357
case EXIT_REASON_INTR_WINDOW:
358
return "intrwindow";
359
case EXIT_REASON_NMI_WINDOW:
360
return "nmiwindow";
361
case EXIT_REASON_TASK_SWITCH:
362
return "taskswitch";
363
case EXIT_REASON_CPUID:
364
return "cpuid";
365
case EXIT_REASON_GETSEC:
366
return "getsec";
367
case EXIT_REASON_HLT:
368
return "hlt";
369
case EXIT_REASON_INVD:
370
return "invd";
371
case EXIT_REASON_INVLPG:
372
return "invlpg";
373
case EXIT_REASON_RDPMC:
374
return "rdpmc";
375
case EXIT_REASON_RDTSC:
376
return "rdtsc";
377
case EXIT_REASON_RSM:
378
return "rsm";
379
case EXIT_REASON_VMCALL:
380
return "vmcall";
381
case EXIT_REASON_VMCLEAR:
382
return "vmclear";
383
case EXIT_REASON_VMLAUNCH:
384
return "vmlaunch";
385
case EXIT_REASON_VMPTRLD:
386
return "vmptrld";
387
case EXIT_REASON_VMPTRST:
388
return "vmptrst";
389
case EXIT_REASON_VMREAD:
390
return "vmread";
391
case EXIT_REASON_VMRESUME:
392
return "vmresume";
393
case EXIT_REASON_VMWRITE:
394
return "vmwrite";
395
case EXIT_REASON_VMXOFF:
396
return "vmxoff";
397
case EXIT_REASON_VMXON:
398
return "vmxon";
399
case EXIT_REASON_CR_ACCESS:
400
return "craccess";
401
case EXIT_REASON_DR_ACCESS:
402
return "draccess";
403
case EXIT_REASON_INOUT:
404
return "inout";
405
case EXIT_REASON_RDMSR:
406
return "rdmsr";
407
case EXIT_REASON_WRMSR:
408
return "wrmsr";
409
case EXIT_REASON_INVAL_VMCS:
410
return "invalvmcs";
411
case EXIT_REASON_INVAL_MSR:
412
return "invalmsr";
413
case EXIT_REASON_MWAIT:
414
return "mwait";
415
case EXIT_REASON_MTF:
416
return "mtf";
417
case EXIT_REASON_MONITOR:
418
return "monitor";
419
case EXIT_REASON_PAUSE:
420
return "pause";
421
case EXIT_REASON_MCE_DURING_ENTRY:
422
return "mce-during-entry";
423
case EXIT_REASON_TPR:
424
return "tpr";
425
case EXIT_REASON_APIC_ACCESS:
426
return "apic-access";
427
case EXIT_REASON_GDTR_IDTR:
428
return "gdtridtr";
429
case EXIT_REASON_LDTR_TR:
430
return "ldtrtr";
431
case EXIT_REASON_EPT_FAULT:
432
return "eptfault";
433
case EXIT_REASON_EPT_MISCONFIG:
434
return "eptmisconfig";
435
case EXIT_REASON_INVEPT:
436
return "invept";
437
case EXIT_REASON_RDTSCP:
438
return "rdtscp";
439
case EXIT_REASON_VMX_PREEMPT:
440
return "vmxpreempt";
441
case EXIT_REASON_INVVPID:
442
return "invvpid";
443
case EXIT_REASON_WBINVD:
444
return "wbinvd";
445
case EXIT_REASON_XSETBV:
446
return "xsetbv";
447
case EXIT_REASON_APIC_WRITE:
448
return "apic-write";
449
default:
450
snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
451
return (reasonbuf);
452
}
453
}
454
#endif /* KTR */
455
456
static int
457
vmx_allow_x2apic_msrs(struct vmx *vmx)
458
{
459
int i, error;
460
461
error = 0;
462
463
/*
464
* Allow readonly access to the following x2APIC MSRs from the guest.
465
*/
466
error += guest_msr_ro(vmx, MSR_APIC_ID);
467
error += guest_msr_ro(vmx, MSR_APIC_VERSION);
468
error += guest_msr_ro(vmx, MSR_APIC_LDR);
469
error += guest_msr_ro(vmx, MSR_APIC_SVR);
470
471
for (i = 0; i < 8; i++)
472
error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
473
474
for (i = 0; i < 8; i++)
475
error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
476
477
for (i = 0; i < 8; i++)
478
error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
479
480
error += guest_msr_ro(vmx, MSR_APIC_ESR);
481
error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
482
error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
483
error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
484
error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
485
error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
486
error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
487
error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
488
error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
489
error += guest_msr_ro(vmx, MSR_APIC_ICR);
490
491
/*
492
* Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
493
*
494
* These registers get special treatment described in the section
495
* "Virtualizing MSR-Based APIC Accesses".
496
*/
497
error += guest_msr_rw(vmx, MSR_APIC_TPR);
498
error += guest_msr_rw(vmx, MSR_APIC_EOI);
499
error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
500
501
return (error);
502
}
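/*
 * guest_msr_ro() and guest_msr_rw() (vmx_msr.c) flip bits in the per-VM MSR
 * bitmap later installed via VMCS_MSR_BITMAP.  Per the SDM, the 4KB bitmap
 * holds separate read and write bitmaps for the MSR ranges 0x00000000-0x1fff
 * and 0xc0000000-0xc0001fff; a clear bit lets the corresponding RDMSR or
 * WRMSR run without a VM-exit while a set bit forces an exit.  So "ro" here
 * presumably clears only the read bit (reads pass through, writes still
 * exit) and "rw" clears both.
 */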
503
504
u_long
505
vmx_fix_cr0(u_long cr0)
506
{
507
508
return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
509
}
510
511
u_long
512
vmx_fix_cr4(u_long cr4)
513
{
514
515
return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
516
}
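/*
 * The masks consumed above are derived in vmx_modinit() from the
 * IA32_VMX_CR0/CR4_FIXED0 and _FIXED1 MSRs:
 *
 *	ones_mask  =  fixed0 &  fixed1		(bits that must be 1 in VMX operation)
 *	zeros_mask = ~fixed0 & ~fixed1		(bits that must be 0)
 *
 * As a rough example, many CPUs report CR0_FIXED0 = 0x80000021 and
 * CR0_FIXED1 = 0xffffffff, giving cr0_ones_mask = PG | NE | PE and
 * cr0_zeros_mask = 0 before the unrestricted-guest and CD/NW adjustments
 * applied later in vmx_modinit().
 */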
517
518
static void
519
vpid_free(int vpid)
520
{
521
if (vpid < 0 || vpid > 0xffff)
522
panic("vpid_free: invalid vpid %d", vpid);
523
524
/*
525
* VPIDs [0,vm_maxcpu] are special and are not allocated from
526
* the unit number allocator.
527
*/
528
529
if (vpid > vm_maxcpu)
530
free_unr(vpid_unr, vpid);
531
}
532
533
static uint16_t
534
vpid_alloc(int vcpuid)
535
{
536
int x;
537
538
/*
539
* If the "enable vpid" execution control is not enabled then the
540
* VPID is required to be 0 for all vcpus.
541
*/
542
if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0)
543
return (0);
544
545
/*
546
* Try to allocate a unique VPID for each vcpu from the unit number
547
* allocator.
548
*/
549
x = alloc_unr(vpid_unr);
550
551
if (x == -1) {
552
atomic_add_int(&vpid_alloc_failed, 1);
553
554
/*
555
* If the unit number allocator does not have enough unique
556
* VPIDs then we need to allocate from the [1,vm_maxcpu] range.
557
*
558
* These VPIDs need not be unique across VMs but this does not
559
* affect correctness because the combined mappings are also
560
* tagged with the EP4TA which is unique for each VM.
561
*
562
* It is still sub-optimal because the invvpid will invalidate
563
* combined mappings for a particular VPID across all EP4TAs.
564
*/
565
return (vcpuid + 1);
566
}
567
568
return (x);
569
}
570
571
static void
572
vpid_init(void)
573
{
574
/*
575
* VPID 0 is required when the "enable VPID" execution control is
576
* disabled.
577
*
578
* VPIDs [1,vm_maxcpu] are used as the "overflow namespace" when the
579
* unit number allocator does not have sufficient unique VPIDs to
580
* satisfy the allocation.
581
*
582
* The remaining VPIDs are managed by the unit number allocator.
583
*/
584
vpid_unr = new_unrhdr(vm_maxcpu + 1, 0xffff, NULL);
585
}
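/*
 * The resulting VPID namespace, pieced together from vpid_alloc() and
 * vpid_free() above:
 *
 *	0			- used when "enable VPID" is unavailable
 *	[1, vm_maxcpu]		- overflow values (vcpuid + 1), shared across VMs
 *	[vm_maxcpu + 1, 0xffff]	- handed out by the unit number allocator
 *
 * Only the last range is ever returned to vpid_unr by vpid_free().
 */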
586
587
static void
588
vmx_disable(void *arg __unused)
589
{
590
struct invvpid_desc invvpid_desc = { 0 };
591
struct invept_desc invept_desc = { 0 };
592
593
if (vmxon_enabled[curcpu]) {
594
/*
595
* See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
596
*
597
* VMXON or VMXOFF are not required to invalidate any TLB
598
* caching structures. This prevents potential retention of
599
* cached information in the TLB between distinct VMX episodes.
600
*/
601
invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
602
invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
603
vmxoff();
604
}
605
load_cr4(rcr4() & ~CR4_VMXE);
606
}
607
608
static int
609
vmx_modcleanup(void)
610
{
611
612
if (pirvec >= 0)
613
lapic_ipi_free(pirvec);
614
615
if (vpid_unr != NULL) {
616
delete_unrhdr(vpid_unr);
617
vpid_unr = NULL;
618
}
619
620
if (nmi_flush_l1d_sw == 1)
621
nmi_flush_l1d_sw = 0;
622
623
smp_rendezvous(NULL, vmx_disable, NULL, NULL);
624
625
if (vmxon_region != NULL)
626
kmem_free(vmxon_region, (mp_maxid + 1) * PAGE_SIZE);
627
628
return (0);
629
}
630
631
static void
632
vmx_enable(void *arg __unused)
633
{
634
int error;
635
uint64_t feature_control;
636
637
feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
638
if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
639
(feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
640
wrmsr(MSR_IA32_FEATURE_CONTROL,
641
feature_control | IA32_FEATURE_CONTROL_VMX_EN |
642
IA32_FEATURE_CONTROL_LOCK);
643
}
644
645
load_cr4(rcr4() | CR4_VMXE);
646
647
*(uint32_t *)&vmxon_region[curcpu * PAGE_SIZE] = vmx_revision();
648
error = vmxon(&vmxon_region[curcpu * PAGE_SIZE]);
649
if (error == 0)
650
vmxon_enabled[curcpu] = 1;
651
}
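/*
 * Each CPU gets its own page within vmxon_region (allocated in
 * vmx_modinit()).  The SDM requires the VMXON region to be page aligned and
 * to begin with the VMCS revision identifier, which is what the 32-bit
 * store above provides; the VMXON instruction itself then operates on the
 * physical address of that page.
 */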
652
653
static void
654
vmx_modsuspend(void)
655
{
656
657
if (vmxon_enabled[curcpu])
658
vmx_disable(NULL);
659
}
660
661
static void
662
vmx_modresume(void)
663
{
664
665
if (vmxon_enabled[curcpu])
666
vmx_enable(NULL);
667
}
668
669
static int
670
vmx_modinit(int ipinum)
671
{
672
int error;
673
uint64_t basic, fixed0, fixed1, feature_control;
674
uint32_t tmp, procbased2_vid_bits;
675
676
/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
677
if (!(cpu_feature2 & CPUID2_VMX)) {
678
printf("vmx_modinit: processor does not support VMX "
679
"operation\n");
680
return (ENXIO);
681
}
682
683
/*
684
* Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
685
* are set (bits 0 and 2 respectively).
686
*/
687
feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
688
if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
689
(feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
690
printf("vmx_modinit: VMX operation disabled by BIOS\n");
691
return (ENXIO);
692
}
693
694
/*
695
* Verify capabilities MSR_VMX_BASIC:
696
* - bit 54 indicates support for INS/OUTS decoding
697
*/
698
basic = rdmsr(MSR_VMX_BASIC);
699
if ((basic & (1UL << 54)) == 0) {
700
printf("vmx_modinit: processor does not support desired basic "
701
"capabilities\n");
702
return (EINVAL);
703
}
704
705
/* Check support for primary processor-based VM-execution controls */
706
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
707
MSR_VMX_TRUE_PROCBASED_CTLS,
708
PROCBASED_CTLS_ONE_SETTING,
709
PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
710
if (error) {
711
printf("vmx_modinit: processor does not support desired "
712
"primary processor-based controls\n");
713
return (error);
714
}
715
716
/* Clear the processor-based ctl bits that are set on demand */
717
procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
718
719
/* Check support for secondary processor-based VM-execution controls */
720
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
721
MSR_VMX_PROCBASED_CTLS2,
722
PROCBASED_CTLS2_ONE_SETTING,
723
PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
724
if (error) {
725
printf("vmx_modinit: processor does not support desired "
726
"secondary processor-based controls\n");
727
return (error);
728
}
729
730
/* Check support for VPID */
731
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
732
PROCBASED2_ENABLE_VPID, 0, &tmp);
733
if (error == 0)
734
procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
735
736
/* Check support for pin-based VM-execution controls */
737
error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
738
MSR_VMX_TRUE_PINBASED_CTLS,
739
PINBASED_CTLS_ONE_SETTING,
740
PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
741
if (error) {
742
printf("vmx_modinit: processor does not support desired "
743
"pin-based controls\n");
744
return (error);
745
}
746
747
/* Check support for VM-exit controls */
748
error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
749
VM_EXIT_CTLS_ONE_SETTING,
750
VM_EXIT_CTLS_ZERO_SETTING,
751
&exit_ctls);
752
if (error) {
753
printf("vmx_modinit: processor does not support desired "
754
"exit controls\n");
755
return (error);
756
}
757
758
/* Check support for VM-entry controls */
759
error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
760
VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
761
&entry_ctls);
762
if (error) {
763
printf("vmx_modinit: processor does not support desired "
764
"entry controls\n");
765
return (error);
766
}
767
768
/*
769
* Check support for optional features by testing them
770
* as individual bits
771
*/
772
cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
773
MSR_VMX_TRUE_PROCBASED_CTLS,
774
PROCBASED_HLT_EXITING, 0,
775
&tmp) == 0);
776
777
cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
778
MSR_VMX_PROCBASED_CTLS,
779
PROCBASED_MTF, 0,
780
&tmp) == 0);
781
782
cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
783
MSR_VMX_TRUE_PROCBASED_CTLS,
784
PROCBASED_PAUSE_EXITING, 0,
785
&tmp) == 0);
786
787
cap_wbinvd_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
788
MSR_VMX_PROCBASED_CTLS2,
789
PROCBASED2_WBINVD_EXITING,
790
0,
791
&tmp) == 0);
792
793
/*
794
* Check support for RDPID and/or RDTSCP.
795
*
796
* Support a pass-through-based implementation of these via the
797
* "enable RDTSCP" VM-execution control and the "RDTSC exiting"
798
* VM-execution control.
799
*
800
* The "enable RDTSCP" VM-execution control applies to both RDPID
801
* and RDTSCP (see SDM volume 3, section 25.3, "Changes to
802
* Instruction Behavior in VMX Non-root operation"); this is why
803
* only this VM-execution control needs to be enabled in order to
804
* enable passing through whichever of RDPID and/or RDTSCP are
805
* supported by the host.
806
*
807
* The "RDTSC exiting" VM-execution control applies to both RDTSC
808
* and RDTSCP (again, per SDM volume 3, section 25.3), and is
809
* already set up for RDTSC and RDTSCP pass-through by the current
810
* implementation of RDTSC.
811
*
812
* Although RDPID and RDTSCP are optional capabilities, since there
813
* does not currently seem to be a use case for enabling/disabling
814
* these via libvmmapi, choose not to support this and, instead,
815
* just statically always enable or always disable this support
816
* across all vCPUs on all VMs. (Note that there may be some
817
* complications to providing this functionality, e.g., the MSR
818
* bitmap is currently per-VM rather than per-vCPU while the
819
* capability API wants to be able to control capabilities on a
820
* per-vCPU basis).
821
*/
822
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
823
MSR_VMX_PROCBASED_CTLS2,
824
PROCBASED2_ENABLE_RDTSCP, 0, &tmp);
825
cap_rdpid = error == 0 && host_has_rdpid();
826
cap_rdtscp = error == 0 && host_has_rdtscp();
827
if (cap_rdpid || cap_rdtscp) {
828
procbased_ctls2 |= PROCBASED2_ENABLE_RDTSCP;
829
vmx_have_msr_tsc_aux = true;
830
}
831
832
cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
833
MSR_VMX_PROCBASED_CTLS2,
834
PROCBASED2_UNRESTRICTED_GUEST, 0,
835
&tmp) == 0);
836
837
cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
838
MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
839
&tmp) == 0);
840
841
/*
842
* Check support for TPR shadow.
843
*/
844
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
845
MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
846
&tmp);
847
if (error == 0) {
848
tpr_shadowing = 1;
849
#ifndef BURN_BRIDGES
850
TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing",
851
&tpr_shadowing);
852
#endif
853
TUNABLE_INT_FETCH("hw.vmm.vmx.cap.tpr_shadowing",
854
&tpr_shadowing);
855
}
856
857
if (tpr_shadowing) {
858
procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
859
procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
860
procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
861
}
862
863
/*
864
* Check support for virtual interrupt delivery.
865
*/
866
procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
867
PROCBASED2_VIRTUALIZE_X2APIC_MODE |
868
PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
869
PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
870
871
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
872
procbased2_vid_bits, 0, &tmp);
873
if (error == 0 && tpr_shadowing) {
874
virtual_interrupt_delivery = 1;
875
#ifndef BURN_BRIDGES
876
TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
877
&virtual_interrupt_delivery);
878
#endif
879
TUNABLE_INT_FETCH("hw.vmm.vmx.cap.virtual_interrupt_delivery",
880
&virtual_interrupt_delivery);
881
}
882
883
if (virtual_interrupt_delivery) {
884
procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
885
procbased_ctls2 |= procbased2_vid_bits;
886
procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
887
888
/*
889
* Check for Posted Interrupts only if Virtual Interrupt
890
* Delivery is enabled.
891
*/
892
error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
893
MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
894
&tmp);
895
if (error == 0) {
896
pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
897
&IDTVEC(justreturn));
898
if (pirvec < 0) {
899
if (bootverbose) {
900
printf("vmx_modinit: unable to "
901
"allocate posted interrupt "
902
"vector\n");
903
}
904
} else {
905
posted_interrupts = 1;
906
#ifndef BURN_BRIDGES
907
TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
908
&posted_interrupts);
909
#endif
910
TUNABLE_INT_FETCH("hw.vmm.vmx.cap.posted_interrupts",
911
&posted_interrupts);
912
}
913
}
914
}
915
916
if (posted_interrupts)
917
pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
918
919
/* Initialize EPT */
920
error = ept_init(ipinum);
921
if (error) {
922
printf("vmx_modinit: ept initialization failed (%d)\n", error);
923
return (error);
924
}
925
926
guest_l1d_flush = (cpu_ia32_arch_caps &
927
IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
928
#ifndef BURN_BRIDGES
929
TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
930
#endif
931
TUNABLE_INT_FETCH("hw.vmm.vmx.l1d_flush", &guest_l1d_flush);
932
933
/*
934
* L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when
935
* available. Otherwise fall back to the software flush
936
* method which loads enough data from the kernel text to
937
* flush existing L1D content, both on VMX entry and on NMI
938
* return.
939
*/
940
if (guest_l1d_flush) {
941
if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
942
guest_l1d_flush_sw = 1;
943
#ifndef BURN_BRIDGES
944
TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
945
&guest_l1d_flush_sw);
946
#endif
947
TUNABLE_INT_FETCH("hw.vmm.vmx.l1d_flush_sw",
948
&guest_l1d_flush_sw);
949
}
950
if (guest_l1d_flush_sw) {
951
if (nmi_flush_l1d_sw <= 1)
952
nmi_flush_l1d_sw = 1;
953
} else {
954
msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
955
msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
956
}
957
}
958
959
/*
960
* Stash the cr0 and cr4 bits that must be fixed to 0 or 1
961
*/
962
fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
963
fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
964
cr0_ones_mask = fixed0 & fixed1;
965
cr0_zeros_mask = ~fixed0 & ~fixed1;
966
967
/*
968
* CR0_PE and CR0_PG can be set to zero in VMX non-root operation
969
* if unrestricted guest execution is allowed.
970
*/
971
if (cap_unrestricted_guest)
972
cr0_ones_mask &= ~(CR0_PG | CR0_PE);
973
974
/*
975
* Do not allow the guest to set CR0_NW or CR0_CD.
976
*/
977
cr0_zeros_mask |= (CR0_NW | CR0_CD);
978
979
fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
980
fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
981
cr4_ones_mask = fixed0 & fixed1;
982
cr4_zeros_mask = ~fixed0 & ~fixed1;
983
984
vpid_init();
985
986
vmx_msr_init();
987
988
/* enable VMX operation */
989
vmxon_region = kmem_malloc((mp_maxid + 1) * PAGE_SIZE,
990
M_WAITOK | M_ZERO);
991
smp_rendezvous(NULL, vmx_enable, NULL, NULL);
992
993
vmx_initialized = 1;
994
995
return (0);
996
}
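/*
 * Most of the capabilities probed above can be turned off at boot via the
 * loader tunables fetched in this function, e.g. in loader.conf:
 *
 *	hw.vmm.vmx.cap.tpr_shadowing=0
 *	hw.vmm.vmx.cap.virtual_interrupt_delivery=0
 *	hw.vmm.vmx.cap.posted_interrupts=0
 *	hw.vmm.vmx.l1d_flush=0
 *
 * The resulting state is visible read-only under the hw.vmm.vmx and
 * hw.vmm.vmx.cap sysctl trees declared near the top of this file.
 */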
997
998
static void
999
vmx_trigger_hostintr(int vector)
1000
{
1001
uintptr_t func;
1002
struct gate_descriptor *gd;
1003
1004
gd = &idt[vector];
1005
1006
KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
1007
"invalid vector %d", vector));
1008
KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
1009
vector));
1010
KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
1011
"has invalid type %d", vector, gd->gd_type));
1012
KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
1013
"has invalid dpl %d", vector, gd->gd_dpl));
1014
KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
1015
"for vector %d has invalid selector %d", vector, gd->gd_selector));
1016
KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
1017
"IST %d", vector, gd->gd_ist));
1018
1019
func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
1020
vmx_call_isr(func);
1021
}
1022
1023
static int
1024
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
1025
{
1026
int error, mask_ident, shadow_ident;
1027
uint64_t mask_value;
1028
1029
if (which != 0 && which != 4)
1030
panic("vmx_setup_cr_shadow: unknown cr%d", which);
1031
1032
if (which == 0) {
1033
mask_ident = VMCS_CR0_MASK;
1034
mask_value = cr0_ones_mask | cr0_zeros_mask;
1035
shadow_ident = VMCS_CR0_SHADOW;
1036
} else {
1037
mask_ident = VMCS_CR4_MASK;
1038
mask_value = cr4_ones_mask | cr4_zeros_mask;
1039
shadow_ident = VMCS_CR4_SHADOW;
1040
}
1041
1042
error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
1043
if (error)
1044
return (error);
1045
1046
error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
1047
if (error)
1048
return (error);
1049
1050
return (0);
1051
}
1052
#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
1053
#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
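/*
 * Bits set in the CR0/CR4 guest/host mask written above are owned by the
 * hypervisor: guest reads of those bits return the value from the read
 * shadow, and guest writes that differ from the shadow for any masked bit
 * cause a CR-access VM-exit (handled by vmx_emulate_cr0_access() and
 * vmx_emulate_cr4_access() below).  Bits clear in the mask are read and
 * written by the guest directly in the real control register.
 */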
1054
1055
static void *
1056
vmx_init(struct vm *vm, pmap_t pmap)
1057
{
1058
int error __diagused;
1059
struct vmx *vmx;
1060
1061
vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
1062
vmx->vm = vm;
1063
1064
vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop));
1065
1066
/*
1067
* Clean up EPTP-tagged guest physical and combined mappings
1068
*
1069
* VMX transitions are not required to invalidate any guest physical
1070
* mappings. So, it may be possible for stale guest physical mappings
1071
* to be present in the processor TLBs.
1072
*
1073
* Combined mappings for this EP4TA are also invalidated for all VPIDs.
1074
*/
1075
ept_invalidate_mappings(vmx->eptp);
1076
1077
vmx->msr_bitmap = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_VMX,
1078
M_WAITOK | M_ZERO);
1079
msr_bitmap_initialize(vmx->msr_bitmap);
1080
1081
/*
1082
* It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
1083
* The guest FSBASE and GSBASE are saved and restored during
1084
* vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
1085
* always restored from the vmcs host state area on vm-exit.
1086
*
1087
* The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
1088
* how they are saved/restored so can be directly accessed by the
1089
* guest.
1090
*
1091
* MSR_EFER is saved and restored in the guest VMCS area on a
1092
* VM exit and entry respectively. It is also restored from the
1093
* host VMCS area on a VM exit.
1094
*
1095
* The TSC MSR is exposed read-only. Writes are disallowed as
1096
* that would impact the host TSC. If the guest does a write,
1097
* the "use TSC offsetting" execution control is enabled and the
1098
* difference between the host TSC and the guest TSC is written
1099
* into the TSC offset in the VMCS.
1100
*
1101
* Guest TSC_AUX support is enabled if any of guest RDPID and/or
1102
* guest RDTSCP support are enabled (since, as per Table 2-2 in SDM
1103
* volume 4, TSC_AUX is supported if any of RDPID and/or RDTSCP are
1104
* supported). If guest TSC_AUX support is enabled, TSC_AUX is
1105
* exposed read-only so that the VMM can do one fewer MSR read per
1106
* exit than if this register were exposed read-write; the guest
1107
* restore value can be updated during guest writes (expected to be
1108
* rare) instead of during all exits (common).
1109
*/
1110
if (guest_msr_rw(vmx, MSR_GSBASE) ||
1111
guest_msr_rw(vmx, MSR_FSBASE) ||
1112
guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
1113
guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
1114
guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
1115
guest_msr_rw(vmx, MSR_EFER) ||
1116
guest_msr_ro(vmx, MSR_TSC) ||
1117
((cap_rdpid || cap_rdtscp) && guest_msr_ro(vmx, MSR_TSC_AUX)))
1118
panic("vmx_init: error setting guest msr access");
1119
1120
if (virtual_interrupt_delivery) {
1121
error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
1122
APIC_ACCESS_ADDRESS);
1123
/* XXX this should really return an error to the caller */
1124
KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
1125
}
1126
1127
vmx->pmap = pmap;
1128
return (vmx);
1129
}
1130
1131
static void *
1132
vmx_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid)
1133
{
1134
struct vmx *vmx = vmi;
1135
struct vmcs *vmcs;
1136
struct vmx_vcpu *vcpu;
1137
uint32_t exc_bitmap;
1138
uint16_t vpid;
1139
int error;
1140
1141
vpid = vpid_alloc(vcpuid);
1142
1143
vcpu = malloc(sizeof(*vcpu), M_VMX, M_WAITOK | M_ZERO);
1144
vcpu->vmx = vmx;
1145
vcpu->vcpu = vcpu1;
1146
vcpu->vcpuid = vcpuid;
1147
vcpu->vmcs = malloc_aligned(sizeof(*vmcs), PAGE_SIZE, M_VMX,
1148
M_WAITOK | M_ZERO);
1149
vcpu->apic_page = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_VMX,
1150
M_WAITOK | M_ZERO);
1151
vcpu->pir_desc = malloc_aligned(sizeof(*vcpu->pir_desc), 64, M_VMX,
1152
M_WAITOK | M_ZERO);
1153
1154
vmcs = vcpu->vmcs;
1155
vmcs->identifier = vmx_revision();
1156
error = vmclear(vmcs);
1157
if (error != 0) {
1158
panic("vmx_init: vmclear error %d on vcpu %d\n",
1159
error, vcpuid);
1160
}
1161
1162
vmx_msr_guest_init(vmx, vcpu);
1163
1164
error = vmcs_init(vmcs);
1165
KASSERT(error == 0, ("vmcs_init error %d", error));
1166
1167
VMPTRLD(vmcs);
1168
error = 0;
1169
error += vmwrite(VMCS_HOST_RSP, (u_long)&vcpu->ctx);
1170
error += vmwrite(VMCS_EPTP, vmx->eptp);
1171
error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
1172
error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
1173
if (vcpu_trap_wbinvd(vcpu->vcpu)) {
1174
KASSERT(cap_wbinvd_exit, ("WBINVD trap not available"));
1175
procbased_ctls2 |= PROCBASED2_WBINVD_EXITING;
1176
}
1177
error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
1178
error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
1179
error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
1180
error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
1181
error += vmwrite(VMCS_VPID, vpid);
1182
1183
if (guest_l1d_flush && !guest_l1d_flush_sw) {
1184
vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
1185
(vm_offset_t)&msr_load_list[0]));
1186
vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
1187
nitems(msr_load_list));
1188
vmcs_write(VMCS_EXIT_MSR_STORE, 0);
1189
vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
1190
}
1191
1192
/* exception bitmap */
1193
if (vcpu_trace_exceptions(vcpu->vcpu))
1194
exc_bitmap = 0xffffffff;
1195
else
1196
exc_bitmap = 1 << IDT_MC;
1197
error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
1198
1199
vcpu->ctx.guest_dr6 = DBREG_DR6_RESERVED1;
1200
error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);
1201
1202
if (tpr_shadowing) {
1203
error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(vcpu->apic_page));
1204
}
1205
1206
if (virtual_interrupt_delivery) {
1207
error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
1208
error += vmwrite(VMCS_EOI_EXIT0, 0);
1209
error += vmwrite(VMCS_EOI_EXIT1, 0);
1210
error += vmwrite(VMCS_EOI_EXIT2, 0);
1211
error += vmwrite(VMCS_EOI_EXIT3, 0);
1212
}
1213
if (posted_interrupts) {
1214
error += vmwrite(VMCS_PIR_VECTOR, pirvec);
1215
error += vmwrite(VMCS_PIR_DESC, vtophys(vcpu->pir_desc));
1216
}
1217
VMCLEAR(vmcs);
1218
KASSERT(error == 0, ("vmx_init: error customizing the vmcs"));
1219
1220
vcpu->cap.set = 0;
1221
vcpu->cap.set |= cap_rdpid != 0 ? 1 << VM_CAP_RDPID : 0;
1222
vcpu->cap.set |= cap_rdtscp != 0 ? 1 << VM_CAP_RDTSCP : 0;
1223
vcpu->cap.proc_ctls = procbased_ctls;
1224
vcpu->cap.proc_ctls2 = procbased_ctls2;
1225
vcpu->cap.exc_bitmap = exc_bitmap;
1226
1227
vcpu->state.nextrip = ~0;
1228
vcpu->state.lastcpu = NOCPU;
1229
vcpu->state.vpid = vpid;
1230
1231
/*
1232
* Set up the CR0/4 shadows, and init the read shadow
1233
* to the power-on register value from the Intel Sys Arch.
1234
* CR0 - 0x60000010
1235
* CR4 - 0
1236
*/
1237
error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
1238
if (error != 0)
1239
panic("vmx_setup_cr0_shadow %d", error);
1240
1241
error = vmx_setup_cr4_shadow(vmcs, 0);
1242
if (error != 0)
1243
panic("vmx_setup_cr4_shadow %d", error);
1244
1245
vcpu->ctx.pmap = vmx->pmap;
1246
1247
return (vcpu);
1248
}
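/*
 * For reference, the CR0 power-on value 0x60000010 used for the read shadow
 * above decodes to CR0_CD | CR0_NW | CR0_ET, matching the architectural
 * reset state; CR4 resets to 0.
 */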
1249
1250
static int
1251
vmx_handle_cpuid(struct vmx_vcpu *vcpu, struct vmxctx *vmxctx)
1252
{
1253
int handled;
1254
1255
handled = x86_emulate_cpuid(vcpu->vcpu, (uint64_t *)&vmxctx->guest_rax,
1256
(uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx,
1257
(uint64_t *)&vmxctx->guest_rdx);
1258
return (handled);
1259
}
1260
1261
static __inline void
1262
vmx_run_trace(struct vmx_vcpu *vcpu)
1263
{
1264
VMX_CTR1(vcpu, "Resume execution at %#lx", vmcs_guest_rip());
1265
}
1266
1267
static __inline void
1268
vmx_exit_trace(struct vmx_vcpu *vcpu, uint64_t rip, uint32_t exit_reason,
1269
int handled)
1270
{
1271
VMX_CTR3(vcpu, "%s %s vmexit at 0x%0lx",
1272
handled ? "handled" : "unhandled",
1273
exit_reason_to_str(exit_reason), rip);
1274
}
1275
1276
static __inline void
1277
vmx_astpending_trace(struct vmx_vcpu *vcpu, uint64_t rip)
1278
{
1279
VMX_CTR1(vcpu, "astpending vmexit at 0x%0lx", rip);
1280
}
1281
1282
static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
1283
static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
1284
1285
/*
1286
* Invalidate guest mappings identified by its vpid from the TLB.
1287
*/
1288
static __inline void
1289
vmx_invvpid(struct vmx *vmx, struct vmx_vcpu *vcpu, pmap_t pmap, int running)
1290
{
1291
struct vmxstate *vmxstate;
1292
struct invvpid_desc invvpid_desc;
1293
1294
vmxstate = &vcpu->state;
1295
if (vmxstate->vpid == 0)
1296
return;
1297
1298
if (!running) {
1299
/*
1300
* Set the 'lastcpu' to an invalid host cpu.
1301
*
1302
* This will invalidate TLB entries tagged with the vcpu's
1303
* vpid the next time it runs via vmx_set_pcpu_defaults().
1304
*/
1305
vmxstate->lastcpu = NOCPU;
1306
return;
1307
}
1308
1309
KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
1310
"critical section", __func__, vcpu->vcpuid));
1311
1312
/*
1313
* Invalidate all mappings tagged with 'vpid'
1314
*
1315
* We do this because this vcpu was executing on a different host
1316
* cpu when it last ran. We do not track whether it invalidated
1317
* mappings associated with its 'vpid' during that run. So we must
1318
* assume that the mappings associated with 'vpid' on 'curcpu' are
1319
* stale and invalidate them.
1320
*
1321
* Note that we incur this penalty only when the scheduler chooses to
1322
* move the thread associated with this vcpu between host cpus.
1323
*
1324
* Note also that this will invalidate mappings tagged with 'vpid'
1325
* for "all" EP4TAs.
1326
*/
1327
if (atomic_load_long(&pmap->pm_eptgen) == vmx->eptgen[curcpu]) {
1328
invvpid_desc._res1 = 0;
1329
invvpid_desc._res2 = 0;
1330
invvpid_desc.vpid = vmxstate->vpid;
1331
invvpid_desc.linear_addr = 0;
1332
invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
1333
vmm_stat_incr(vcpu->vcpu, VCPU_INVVPID_DONE, 1);
1334
} else {
1335
/*
1336
* The invvpid can be skipped if an invept is going to
1337
* be performed before entering the guest. The invept
1338
* will invalidate combined mappings tagged with
1339
* 'vmx->eptp' for all vpids.
1340
*/
1341
vmm_stat_incr(vcpu->vcpu, VCPU_INVVPID_SAVED, 1);
1342
}
1343
}
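/*
 * The single-context INVVPID above takes a 128-bit descriptor: a 16-bit
 * VPID in the low word (the adjacent reserved fields must be zero, hence
 * the explicit _res1/_res2 stores) and a linear address that is only
 * consulted by the individual-address invalidation type, so it is left at
 * zero here.  The invalidation covers mappings tagged with this VPID under
 * every EPT root, which is why the comment above describes it as spanning
 * "all" EP4TAs.
 */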
1344
1345
static void
1346
vmx_set_pcpu_defaults(struct vmx *vmx, struct vmx_vcpu *vcpu, pmap_t pmap)
1347
{
1348
struct vmxstate *vmxstate;
1349
1350
vmxstate = &vcpu->state;
1351
if (vmxstate->lastcpu == curcpu)
1352
return;
1353
1354
vmxstate->lastcpu = curcpu;
1355
1356
vmm_stat_incr(vcpu->vcpu, VCPU_MIGRATIONS, 1);
1357
1358
vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
1359
vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
1360
vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
1361
vmx_invvpid(vmx, vcpu, pmap, 1);
1362
}
1363
1364
/*
1365
* We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1366
*/
1367
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1368
1369
static void __inline
1370
vmx_set_int_window_exiting(struct vmx_vcpu *vcpu)
1371
{
1372
1373
if ((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1374
vcpu->cap.proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1375
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls);
1376
VMX_CTR0(vcpu, "Enabling interrupt window exiting");
1377
}
1378
}
1379
1380
static void __inline
1381
vmx_clear_int_window_exiting(struct vmx_vcpu *vcpu)
1382
{
1383
1384
KASSERT((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1385
("intr_window_exiting not set: %#x", vcpu->cap.proc_ctls));
1386
vcpu->cap.proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1387
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls);
1388
VMX_CTR0(vcpu, "Disabling interrupt window exiting");
1389
}
1390
1391
static void __inline
1392
vmx_set_nmi_window_exiting(struct vmx_vcpu *vcpu)
1393
{
1394
1395
if ((vcpu->cap.proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1396
vcpu->cap.proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1397
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls);
1398
VMX_CTR0(vcpu, "Enabling NMI window exiting");
1399
}
1400
}
1401
1402
static void __inline
1403
vmx_clear_nmi_window_exiting(struct vmx_vcpu *vcpu)
1404
{
1405
1406
KASSERT((vcpu->cap.proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1407
("nmi_window_exiting not set %#x", vcpu->cap.proc_ctls));
1408
vcpu->cap.proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1409
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls);
1410
VMX_CTR0(vcpu, "Disabling NMI window exiting");
1411
}
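/*
 * The window-exiting controls above implement a simple deferral protocol:
 * when vmx_inject_interrupts() finds that an interrupt or NMI cannot be
 * injected right now (RFLAGS.IF clear, STI/MOV-SS blocking, or the VM-entry
 * interruption field already in use), it sets the corresponding
 * window-exiting control.  The CPU then forces a VM-exit as soon as the
 * guest can accept the event, giving vmx_run() another pass through the
 * injection path, after which the control is cleared again.
 */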
1412
1413
int
1414
vmx_set_tsc_offset(struct vmx_vcpu *vcpu, uint64_t offset)
1415
{
1416
int error;
1417
1418
if ((vcpu->cap.proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
1419
vcpu->cap.proc_ctls |= PROCBASED_TSC_OFFSET;
1420
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls);
1421
VMX_CTR0(vcpu, "Enabling TSC offsetting");
1422
}
1423
1424
error = vmwrite(VMCS_TSC_OFFSET, offset);
1425
#ifdef BHYVE_SNAPSHOT
1426
if (error == 0)
1427
vm_set_tsc_offset(vcpu->vcpu, offset);
1428
#endif
1429
return (error);
1430
}
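/*
 * With PROCBASED_TSC_OFFSET enabled, guest RDTSC/RDTSCP and reads of
 * MSR_TSC observe roughly
 *
 *	guest_tsc = host_tsc + VMCS_TSC_OFFSET
 *
 * so a negative offset makes the guest TSC appear to start near zero at
 * boot.
 */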
1431
1432
#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \
1433
VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1434
#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \
1435
VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1436
1437
static void
1438
vmx_inject_nmi(struct vmx_vcpu *vcpu)
1439
{
1440
uint32_t gi __diagused, info;
1441
1442
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1443
KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1444
"interruptibility-state %#x", gi));
1445
1446
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1447
KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1448
"VM-entry interruption information %#x", info));
1449
1450
/*
1451
* Inject the virtual NMI. The vector must be the NMI IDT entry
1452
* or the VMCS entry check will fail.
1453
*/
1454
info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1455
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1456
1457
VMX_CTR0(vcpu, "Injecting vNMI");
1458
1459
/* Clear the request */
1460
vm_nmi_clear(vcpu->vcpu);
1461
}
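/*
 * Layout of the VM-entry interruption-information field written above and
 * in vmx_inject_interrupts() below, per the SDM:
 *
 *	bits  7:0	- vector
 *	bits 10:8	- type (VMCS_INTR_T_HWINTR, _NMI, _SWEXCEPTION, ...)
 *	bit  11		- deliver error code (VMCS_INTR_DEL_ERRCODE)
 *	bit  31		- valid (VMCS_INTR_VALID)
 */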
1462
1463
static void
1464
vmx_inject_interrupts(struct vmx_vcpu *vcpu, struct vlapic *vlapic,
1465
uint64_t guestrip)
1466
{
1467
int vector, need_nmi_exiting, extint_pending;
1468
uint64_t rflags, entryinfo;
1469
uint32_t gi, info;
1470
1471
if (vcpu->cap.set & (1 << VM_CAP_MASK_HWINTR)) {
1472
return;
1473
}
1474
1475
if (vcpu->state.nextrip != guestrip) {
1476
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1477
if (gi & HWINTR_BLOCKING) {
1478
VMX_CTR2(vcpu, "Guest interrupt blocking "
1479
"cleared due to rip change: %#lx/%#lx",
1480
vcpu->state.nextrip, guestrip);
1481
gi &= ~HWINTR_BLOCKING;
1482
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1483
}
1484
}
1485
1486
if (vm_entry_intinfo(vcpu->vcpu, &entryinfo)) {
1487
KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
1488
"intinfo is not valid: %#lx", __func__, entryinfo));
1489
1490
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1491
KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
1492
"pending exception: %#lx/%#x", __func__, entryinfo, info));
1493
1494
info = entryinfo;
1495
vector = info & 0xff;
1496
if (vector == IDT_BP || vector == IDT_OF) {
1497
/*
1498
* VT-x requires #BP and #OF to be injected as software
1499
* exceptions.
1500
*/
1501
info &= ~VMCS_INTR_T_MASK;
1502
info |= VMCS_INTR_T_SWEXCEPTION;
1503
}
1504
1505
if (info & VMCS_INTR_DEL_ERRCODE)
1506
vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
1507
1508
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1509
}
1510
1511
if (vm_nmi_pending(vcpu->vcpu)) {
1512
/*
1513
* If there are no conditions blocking NMI injection then
1514
* inject it directly here otherwise enable "NMI window
1515
* exiting" to inject it as soon as we can.
1516
*
1517
* We also check for STI_BLOCKING because some implementations
1518
* don't allow NMI injection in this case. If we are running
1519
* on a processor that doesn't have this restriction it will
1520
* immediately exit and the NMI will be injected in the
1521
* "NMI window exiting" handler.
1522
*/
1523
need_nmi_exiting = 1;
1524
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1525
if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1526
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1527
if ((info & VMCS_INTR_VALID) == 0) {
1528
vmx_inject_nmi(vcpu);
1529
need_nmi_exiting = 0;
1530
} else {
1531
VMX_CTR1(vcpu, "Cannot inject NMI "
1532
"due to VM-entry intr info %#x", info);
1533
}
1534
} else {
1535
VMX_CTR1(vcpu, "Cannot inject NMI due to "
1536
"Guest Interruptibility-state %#x", gi);
1537
}
1538
1539
if (need_nmi_exiting)
1540
vmx_set_nmi_window_exiting(vcpu);
1541
}
1542
1543
extint_pending = vm_extint_pending(vcpu->vcpu);
1544
1545
if (!extint_pending && virtual_interrupt_delivery) {
1546
vmx_inject_pir(vlapic);
1547
return;
1548
}
1549
1550
/*
1551
* If interrupt-window exiting is already in effect then don't bother
1552
* checking for pending interrupts. This is just an optimization and
1553
* not needed for correctness.
1554
*/
1555
if ((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1556
VMX_CTR0(vcpu, "Skip interrupt injection due to "
1557
"pending int_window_exiting");
1558
return;
1559
}
1560
1561
if (!extint_pending) {
1562
/* Ask the local apic for a vector to inject */
1563
if (!vlapic_pending_intr(vlapic, &vector))
1564
return;
1565
1566
/*
1567
* From the Intel SDM, Volume 3, Section "Maskable
1568
* Hardware Interrupts":
1569
* - maskable interrupt vectors [16,255] can be delivered
1570
* through the local APIC.
1571
*/
1572
KASSERT(vector >= 16 && vector <= 255,
1573
("invalid vector %d from local APIC", vector));
1574
} else {
1575
/* Ask the legacy pic for a vector to inject */
1576
vatpic_pending_intr(vcpu->vmx->vm, &vector);
1577
1578
/*
1579
* From the Intel SDM, Volume 3, Section "Maskable
1580
* Hardware Interrupts":
1581
* - maskable interrupt vectors [0,255] can be delivered
1582
* through the INTR pin.
1583
*/
1584
KASSERT(vector >= 0 && vector <= 255,
1585
("invalid vector %d from INTR", vector));
1586
}
1587
1588
/* Check RFLAGS.IF and the interruptibility state of the guest */
1589
rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1590
if ((rflags & PSL_I) == 0) {
1591
VMX_CTR2(vcpu, "Cannot inject vector %d due to "
1592
"rflags %#lx", vector, rflags);
1593
goto cantinject;
1594
}
1595
1596
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1597
if (gi & HWINTR_BLOCKING) {
1598
VMX_CTR2(vcpu, "Cannot inject vector %d due to "
1599
"Guest Interruptibility-state %#x", vector, gi);
1600
goto cantinject;
1601
}
1602
1603
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1604
if (info & VMCS_INTR_VALID) {
1605
/*
1606
* This is expected and could happen for multiple reasons:
1607
* - A vectoring VM-entry was aborted due to astpending
1608
* - A VM-exit happened during event injection.
1609
* - An exception was injected above.
1610
* - An NMI was injected above or after "NMI window exiting"
1611
*/
1612
VMX_CTR2(vcpu, "Cannot inject vector %d due to "
1613
"VM-entry intr info %#x", vector, info);
1614
goto cantinject;
1615
}
1616
1617
/* Inject the interrupt */
1618
info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1619
info |= vector;
1620
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1621
1622
if (!extint_pending) {
1623
/* Update the Local APIC ISR */
1624
vlapic_intr_accepted(vlapic, vector);
1625
} else {
1626
vm_extint_clear(vcpu->vcpu);
1627
vatpic_intr_accepted(vcpu->vmx->vm, vector);
1628
1629
/*
1630
* After we accepted the current ExtINT the PIC may
1631
* have posted another one. If that is the case, set
1632
* the Interrupt Window Exiting execution control so
1633
* we can inject that one too.
1634
*
1635
* Also, interrupt window exiting allows us to inject any
1636
* pending APIC vector that was preempted by the ExtINT
1637
* as soon as possible. This applies both for the software
1638
* emulated vlapic and the hardware assisted virtual APIC.
1639
*/
1640
vmx_set_int_window_exiting(vcpu);
1641
}
1642
1643
VMX_CTR1(vcpu, "Injecting hwintr at vector %d", vector);
1644
1645
return;
1646
1647
cantinject:
1648
/*
1649
* Set the Interrupt Window Exiting execution control so we can inject
1650
* the interrupt as soon as blocking condition goes away.
1651
*/
1652
vmx_set_int_window_exiting(vcpu);
1653
}
1654
1655
/*
1656
* If the Virtual NMIs execution control is '1' then the logical processor
1657
* tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1658
* the VMCS. An IRET instruction in VMX non-root operation will remove any
1659
* virtual-NMI blocking.
1660
*
1661
* This unblocking occurs even if the IRET causes a fault. In this case the
1662
* hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1663
*/
1664
static void
1665
vmx_restore_nmi_blocking(struct vmx_vcpu *vcpu)
1666
{
1667
uint32_t gi;
1668
1669
VMX_CTR0(vcpu, "Restore Virtual-NMI blocking");
1670
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1671
gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1672
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1673
}
1674
1675
static void
1676
vmx_clear_nmi_blocking(struct vmx_vcpu *vcpu)
1677
{
1678
uint32_t gi;
1679
1680
VMX_CTR0(vcpu, "Clear Virtual-NMI blocking");
1681
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1682
gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1683
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1684
}
1685
1686
static void
1687
vmx_assert_nmi_blocking(struct vmx_vcpu *vcpu)
1688
{
1689
uint32_t gi __diagused;
1690
1691
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1692
KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
1693
("NMI blocking is not in effect %#x", gi));
1694
}
1695
1696
static int
1697
vmx_emulate_xsetbv(struct vmx *vmx, struct vmx_vcpu *vcpu,
1698
struct vm_exit *vmexit)
1699
{
1700
struct vmxctx *vmxctx;
1701
uint64_t xcrval;
1702
const struct xsave_limits *limits;
1703
1704
vmxctx = &vcpu->ctx;
1705
limits = vmm_get_xsave_limits();
1706
1707
/*
1708
* Note that the processor raises a GP# fault on its own if
1709
* xsetbv is executed for CPL != 0, so we do not have to
1710
* emulate that fault here.
1711
*/
1712
1713
/* Only xcr0 is supported. */
1714
if (vmxctx->guest_rcx != 0) {
1715
vm_inject_gp(vcpu->vcpu);
1716
return (HANDLED);
1717
}
1718
1719
/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1720
if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1721
vm_inject_ud(vcpu->vcpu);
1722
return (HANDLED);
1723
}
1724
1725
xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1726
if ((xcrval & ~limits->xcr0_allowed) != 0) {
1727
vm_inject_gp(vcpu->vcpu);
1728
return (HANDLED);
1729
}
1730
1731
if (!(xcrval & XFEATURE_ENABLED_X87)) {
1732
vm_inject_gp(vcpu->vcpu);
1733
return (HANDLED);
1734
}
1735
1736
/* AVX (YMM_Hi128) requires SSE. */
1737
if (xcrval & XFEATURE_ENABLED_AVX &&
1738
(xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
1739
vm_inject_gp(vcpu->vcpu);
1740
return (HANDLED);
1741
}
1742
1743
/*
1744
* AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
1745
* ZMM_Hi256, and Hi16_ZMM.
1746
*/
1747
if (xcrval & XFEATURE_AVX512 &&
1748
(xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
1749
(XFEATURE_AVX512 | XFEATURE_AVX)) {
1750
vm_inject_gp(vcpu->vcpu);
1751
return (HANDLED);
1752
}
1753
1754
/*
1755
* Intel MPX requires both bound register state flags to be
1756
* set.
1757
*/
1758
if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
1759
((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
1760
vm_inject_gp(vcpu->vcpu);
1761
return (HANDLED);
1762
}
1763
1764
/*
1765
* This runs "inside" vmrun() with the guest's FPU state, so
1766
* modifying xcr0 directly modifies the guest's xcr0, not the
1767
* host's.
1768
*/
1769
load_xcr(0, xcrval);
1770
return (HANDLED);
1771
}
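/*
 * Worked example of the checks above, assuming XFEATURE_AVX is the usual
 * x87 | SSE | AVX triple: a guest XSETBV of XCR0 = 0x7 is accepted as long
 * as the host's xcr0_allowed covers those bits, while 0x6 (missing x87) and
 * 0x5 (AVX without SSE) are both rejected with an injected #GP.
 */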
1772
1773
static uint64_t
1774
vmx_get_guest_reg(struct vmx_vcpu *vcpu, int ident)
1775
{
1776
const struct vmxctx *vmxctx;
1777
1778
vmxctx = &vcpu->ctx;
1779
1780
switch (ident) {
1781
case 0:
1782
return (vmxctx->guest_rax);
1783
case 1:
1784
return (vmxctx->guest_rcx);
1785
case 2:
1786
return (vmxctx->guest_rdx);
1787
case 3:
1788
return (vmxctx->guest_rbx);
1789
case 4:
1790
return (vmcs_read(VMCS_GUEST_RSP));
1791
case 5:
1792
return (vmxctx->guest_rbp);
1793
case 6:
1794
return (vmxctx->guest_rsi);
1795
case 7:
1796
return (vmxctx->guest_rdi);
1797
case 8:
1798
return (vmxctx->guest_r8);
1799
case 9:
1800
return (vmxctx->guest_r9);
1801
case 10:
1802
return (vmxctx->guest_r10);
1803
case 11:
1804
return (vmxctx->guest_r11);
1805
case 12:
1806
return (vmxctx->guest_r12);
1807
case 13:
1808
return (vmxctx->guest_r13);
1809
case 14:
1810
return (vmxctx->guest_r14);
1811
case 15:
1812
return (vmxctx->guest_r15);
1813
default:
1814
panic("invalid vmx register %d", ident);
1815
}
1816
}
1817
1818
static void
1819
vmx_set_guest_reg(struct vmx_vcpu *vcpu, int ident, uint64_t regval)
1820
{
1821
struct vmxctx *vmxctx;
1822
1823
vmxctx = &vcpu->ctx;
1824
1825
switch (ident) {
1826
case 0:
1827
vmxctx->guest_rax = regval;
1828
break;
1829
case 1:
1830
vmxctx->guest_rcx = regval;
1831
break;
1832
case 2:
1833
vmxctx->guest_rdx = regval;
1834
break;
1835
case 3:
1836
vmxctx->guest_rbx = regval;
1837
break;
1838
case 4:
1839
vmcs_write(VMCS_GUEST_RSP, regval);
1840
break;
1841
case 5:
1842
vmxctx->guest_rbp = regval;
1843
break;
1844
case 6:
1845
vmxctx->guest_rsi = regval;
1846
break;
1847
case 7:
1848
vmxctx->guest_rdi = regval;
1849
break;
1850
case 8:
1851
vmxctx->guest_r8 = regval;
1852
break;
1853
case 9:
1854
vmxctx->guest_r9 = regval;
1855
break;
1856
case 10:
1857
vmxctx->guest_r10 = regval;
1858
break;
1859
case 11:
1860
vmxctx->guest_r11 = regval;
1861
break;
1862
case 12:
1863
vmxctx->guest_r12 = regval;
1864
break;
1865
case 13:
1866
vmxctx->guest_r13 = regval;
1867
break;
1868
case 14:
1869
vmxctx->guest_r14 = regval;
1870
break;
1871
case 15:
1872
vmxctx->guest_r15 = regval;
1873
break;
1874
default:
1875
panic("invalid vmx register %d", ident);
1876
}
1877
}
1878
1879
static int
1880
vmx_emulate_cr0_access(struct vmx_vcpu *vcpu, uint64_t exitqual)
1881
{
1882
uint64_t crval, regval;
1883
1884
/* We only handle mov to %cr0 at this time */
1885
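/* Exit qualification: bits 3:0 = CR number, bits 5:4 = access type (0 is MOV to CR), bits 11:8 = source GPR. */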
if ((exitqual & 0xf0) != 0x00)
1886
return (UNHANDLED);
1887
1888
regval = vmx_get_guest_reg(vcpu, (exitqual >> 8) & 0xf);
1889
1890
vmcs_write(VMCS_CR0_SHADOW, regval);
1891
1892
crval = regval | cr0_ones_mask;
1893
crval &= ~cr0_zeros_mask;
1894
vmcs_write(VMCS_GUEST_CR0, crval);
1895
1896
if (regval & CR0_PG) {
1897
uint64_t efer, entry_ctls;
1898
1899
/*
1900
* If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1901
* the "IA-32e mode guest" bit in VM-entry control must be
1902
* equal.
1903
*/
1904
efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1905
if (efer & EFER_LME) {
1906
efer |= EFER_LMA;
1907
vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1908
entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1909
entry_ctls |= VM_ENTRY_GUEST_LMA;
1910
vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1911
}
1912
}
1913
1914
return (HANDLED);
1915
}
1916
1917
static int
1918
vmx_emulate_cr4_access(struct vmx_vcpu *vcpu, uint64_t exitqual)
1919
{
1920
uint64_t crval, regval;
1921
1922
/* We only handle mov to %cr4 at this time */
1923
if ((exitqual & 0xf0) != 0x00)
1924
return (UNHANDLED);
1925
1926
regval = vmx_get_guest_reg(vcpu, (exitqual >> 8) & 0xf);
1927
1928
vmcs_write(VMCS_CR4_SHADOW, regval);
1929
1930
crval = regval | cr4_ones_mask;
1931
crval &= ~cr4_zeros_mask;
1932
vmcs_write(VMCS_GUEST_CR4, crval);
1933
1934
return (HANDLED);
1935
}
1936
1937
static int
1938
vmx_emulate_cr8_access(struct vmx *vmx, struct vmx_vcpu *vcpu,
1939
uint64_t exitqual)
1940
{
1941
struct vlapic *vlapic;
1942
uint64_t cr8;
1943
int regnum;
1944
1945
/* We only handle mov %cr8 to/from a register at this time. */
1946
if ((exitqual & 0xe0) != 0x00) {
1947
return (UNHANDLED);
1948
}
1949
1950
vlapic = vm_lapic(vcpu->vcpu);
1951
regnum = (exitqual >> 8) & 0xf;
1952
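/* Bit 4 of the exit qualification distinguishes a read (MOV from %cr8) from a write (MOV to %cr8). */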
if (exitqual & 0x10) {
1953
cr8 = vlapic_get_cr8(vlapic);
1954
vmx_set_guest_reg(vcpu, regnum, cr8);
1955
} else {
1956
cr8 = vmx_get_guest_reg(vcpu, regnum);
1957
vlapic_set_cr8(vlapic, cr8);
1958
}
1959
1960
return (HANDLED);
1961
}
1962
1963
/*
1964
* From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
1965
*/
1966
static int
1967
vmx_cpl(void)
1968
{
1969
uint32_t ssar;
1970
1971
ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
1972
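/* The DPL occupies bits 6:5 of the access-rights field. */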
return ((ssar >> 5) & 0x3);
1973
}
1974
1975
static enum vm_cpu_mode
1976
vmx_cpu_mode(void)
1977
{
1978
uint32_t csar;
1979
1980
if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
1981
csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
1982
if (csar & 0x2000)
1983
return (CPU_MODE_64BIT); /* CS.L = 1 */
1984
else
1985
return (CPU_MODE_COMPATIBILITY);
1986
} else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
1987
return (CPU_MODE_PROTECTED);
1988
} else {
1989
return (CPU_MODE_REAL);
1990
}
1991
}
1992
1993
static enum vm_paging_mode
1994
vmx_paging_mode(void)
1995
{
1996
uint64_t cr4;
1997
1998
if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1999
return (PAGING_MODE_FLAT);
2000
cr4 = vmcs_read(VMCS_GUEST_CR4);
2001
if (!(cr4 & CR4_PAE))
2002
return (PAGING_MODE_32);
2003
if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) {
2004
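/* In long mode, CR4.LA57 selects 5-level paging. */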
if (!(cr4 & CR4_LA57))
2005
return (PAGING_MODE_64);
2006
return (PAGING_MODE_64_LA57);
2007
} else
2008
return (PAGING_MODE_PAE);
2009
}
2010
2011
static uint64_t
2012
inout_str_index(struct vmx_vcpu *vcpu, int in)
2013
{
2014
uint64_t val;
2015
int error __diagused;
2016
enum vm_reg_name reg;
2017
2018
reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
2019
error = vmx_getreg(vcpu, reg, &val);
2020
KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
2021
return (val);
2022
}
2023
2024
static uint64_t
2025
inout_str_count(struct vmx_vcpu *vcpu, int rep)
2026
{
2027
uint64_t val;
2028
int error __diagused;
2029
2030
if (rep) {
2031
error = vmx_getreg(vcpu, VM_REG_GUEST_RCX, &val);
2032
KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
2033
} else {
2034
val = 1;
2035
}
2036
return (val);
2037
}
2038
2039
static int
2040
inout_str_addrsize(uint32_t inst_info)
2041
{
2042
uint32_t size;
2043
2044
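/* Bits 9:7 of the instruction-information field encode the address size. */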
size = (inst_info >> 7) & 0x7;
2045
switch (size) {
2046
case 0:
2047
return (2); /* 16 bit */
2048
case 1:
2049
return (4); /* 32 bit */
2050
case 2:
2051
return (8); /* 64 bit */
2052
default:
2053
panic("%s: invalid size encoding %d", __func__, size);
2054
}
2055
}
2056
2057
static void
2058
inout_str_seginfo(struct vmx_vcpu *vcpu, uint32_t inst_info, int in,
2059
struct vm_inout_str *vis)
2060
{
2061
int error __diagused, s;
2062
2063
if (in) {
2064
vis->seg_name = VM_REG_GUEST_ES;
2065
} else {
2066
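/* For OUTS, bits 17:15 of the instruction-information field give the source segment. */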
s = (inst_info >> 15) & 0x7;
2067
vis->seg_name = vm_segment_name(s);
2068
}
2069
2070
error = vmx_getdesc(vcpu, vis->seg_name, &vis->seg_desc);
2071
KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
2072
}
2073
2074
static void
2075
vmx_paging_info(struct vm_guest_paging *paging)
2076
{
2077
paging->cr3 = vmcs_guest_cr3();
2078
paging->cpl = vmx_cpl();
2079
paging->cpu_mode = vmx_cpu_mode();
2080
paging->paging_mode = vmx_paging_mode();
2081
}
2082
2083
static void
2084
vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
2085
{
2086
struct vm_guest_paging *paging;
2087
uint32_t csar;
2088
2089
paging = &vmexit->u.inst_emul.paging;
2090
2091
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
2092
vmexit->inst_length = 0;
2093
vmexit->u.inst_emul.gpa = gpa;
2094
vmexit->u.inst_emul.gla = gla;
2095
vmx_paging_info(paging);
2096
switch (paging->cpu_mode) {
2097
case CPU_MODE_REAL:
2098
vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
2099
vmexit->u.inst_emul.cs_d = 0;
2100
break;
2101
case CPU_MODE_PROTECTED:
2102
case CPU_MODE_COMPATIBILITY:
2103
vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
2104
csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
2105
vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
2106
break;
2107
default:
2108
vmexit->u.inst_emul.cs_base = 0;
2109
vmexit->u.inst_emul.cs_d = 0;
2110
break;
2111
}
2112
vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
2113
}
2114
2115
static int
2116
ept_fault_type(uint64_t ept_qual)
2117
{
2118
int fault_type;
2119
2120
if (ept_qual & EPT_VIOLATION_DATA_WRITE)
2121
fault_type = VM_PROT_WRITE;
2122
else if (ept_qual & EPT_VIOLATION_INST_FETCH)
2123
fault_type = VM_PROT_EXECUTE;
2124
else
2125
fault_type = VM_PROT_READ;
2126
2127
return (fault_type);
2128
}
2129
2130
static bool
2131
ept_emulation_fault(uint64_t ept_qual)
2132
{
2133
int read, write;
2134
2135
/* EPT fault on an instruction fetch doesn't make sense here */
2136
if (ept_qual & EPT_VIOLATION_INST_FETCH)
2137
return (false);
2138
2139
/* EPT fault must be a read fault or a write fault */
2140
read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
2141
write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
2142
if ((read | write) == 0)
2143
return (false);
2144
2145
/*
2146
* The EPT violation must have been caused by accessing a
2147
* guest-physical address that is a translation of a guest-linear
2148
* address.
2149
*/
2150
if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
2151
(ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
2152
return (false);
2153
}
2154
2155
return (true);
2156
}
2157
2158
static __inline int
2159
apic_access_virtualization(struct vmx_vcpu *vcpu)
2160
{
2161
uint32_t proc_ctls2;
2162
2163
proc_ctls2 = vcpu->cap.proc_ctls2;
2164
return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
2165
}
2166
2167
static __inline int
2168
x2apic_virtualization(struct vmx_vcpu *vcpu)
2169
{
2170
uint32_t proc_ctls2;
2171
2172
proc_ctls2 = vcpu->cap.proc_ctls2;
2173
return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
2174
}
2175
2176
static int
2177
vmx_handle_apic_write(struct vmx_vcpu *vcpu, struct vlapic *vlapic,
2178
uint64_t qual)
2179
{
2180
int error, handled, offset;
2181
uint32_t *apic_regs, vector;
2182
bool retu;
2183
2184
handled = HANDLED;
2185
offset = APIC_WRITE_OFFSET(qual);
2186
2187
if (!apic_access_virtualization(vcpu)) {
2188
/*
2189
* In general there should not be any APIC write VM-exits
2190
* unless APIC-access virtualization is enabled.
2191
*
2192
* However self-IPI virtualization can legitimately trigger
2193
* an APIC-write VM-exit so treat it specially.
2194
*/
2195
if (x2apic_virtualization(vcpu) &&
2196
offset == APIC_OFFSET_SELF_IPI) {
2197
apic_regs = (uint32_t *)(vlapic->apic_page);
2198
vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
2199
vlapic_self_ipi_handler(vlapic, vector);
2200
return (HANDLED);
2201
} else
2202
return (UNHANDLED);
2203
}
2204
2205
switch (offset) {
2206
case APIC_OFFSET_ID:
2207
vlapic_id_write_handler(vlapic);
2208
break;
2209
case APIC_OFFSET_LDR:
2210
vlapic_ldr_write_handler(vlapic);
2211
break;
2212
case APIC_OFFSET_DFR:
2213
vlapic_dfr_write_handler(vlapic);
2214
break;
2215
case APIC_OFFSET_SVR:
2216
vlapic_svr_write_handler(vlapic);
2217
break;
2218
case APIC_OFFSET_ESR:
2219
vlapic_esr_write_handler(vlapic);
2220
break;
2221
case APIC_OFFSET_ICR_LOW:
2222
retu = false;
2223
error = vlapic_icrlo_write_handler(vlapic, &retu);
2224
if (error != 0 || retu)
2225
handled = UNHANDLED;
2226
break;
2227
case APIC_OFFSET_CMCI_LVT:
2228
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
2229
vlapic_lvt_write_handler(vlapic, offset);
2230
break;
2231
case APIC_OFFSET_TIMER_ICR:
2232
vlapic_icrtmr_write_handler(vlapic);
2233
break;
2234
case APIC_OFFSET_TIMER_DCR:
2235
vlapic_dcr_write_handler(vlapic);
2236
break;
2237
default:
2238
handled = UNHANDLED;
2239
break;
2240
}
2241
return (handled);
2242
}
2243
2244
static bool
2245
apic_access_fault(struct vmx_vcpu *vcpu, uint64_t gpa)
2246
{
2247
2248
if (apic_access_virtualization(vcpu) &&
2249
(gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
2250
return (true);
2251
else
2252
return (false);
2253
}
2254
2255
static int
2256
vmx_handle_apic_access(struct vmx_vcpu *vcpu, struct vm_exit *vmexit)
2257
{
2258
uint64_t qual;
2259
int access_type, offset, allowed;
2260
2261
if (!apic_access_virtualization(vcpu))
2262
return (UNHANDLED);
2263
2264
qual = vmexit->u.vmx.exit_qualification;
2265
access_type = APIC_ACCESS_TYPE(qual);
2266
offset = APIC_ACCESS_OFFSET(qual);
2267
2268
allowed = 0;
2269
if (access_type == 0) {
2270
/*
2271
* Read data access to the following registers is expected.
2272
*/
2273
switch (offset) {
2274
case APIC_OFFSET_APR:
2275
case APIC_OFFSET_PPR:
2276
case APIC_OFFSET_RRR:
2277
case APIC_OFFSET_CMCI_LVT:
2278
case APIC_OFFSET_TIMER_CCR:
2279
allowed = 1;
2280
break;
2281
default:
2282
break;
2283
}
2284
} else if (access_type == 1) {
2285
/*
2286
* Write data access to the following registers is expected.
2287
*/
2288
switch (offset) {
2289
case APIC_OFFSET_VER:
2290
case APIC_OFFSET_APR:
2291
case APIC_OFFSET_PPR:
2292
case APIC_OFFSET_RRR:
2293
case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
2294
case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
2295
case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
2296
case APIC_OFFSET_CMCI_LVT:
2297
case APIC_OFFSET_TIMER_CCR:
2298
allowed = 1;
2299
break;
2300
default:
2301
break;
2302
}
2303
}
2304
2305
if (allowed) {
2306
vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
2307
VIE_INVALID_GLA);
2308
}
2309
2310
/*
2311
* Regardless of whether the APIC-access is allowed this handler
2312
* always returns UNHANDLED:
2313
* - if the access is allowed then it is handled by emulating the
2314
* instruction that caused the VM-exit (outside the critical section)
2315
* - if the access is not allowed then it will be converted to an
2316
* exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
2317
*/
2318
return (UNHANDLED);
2319
}
2320
2321
static enum task_switch_reason
2322
vmx_task_switch_reason(uint64_t qual)
2323
{
2324
int reason;
2325
2326
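/* Bits 31:30 of the exit qualification encode the task-switch source. */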
reason = (qual >> 30) & 0x3;
2327
switch (reason) {
2328
case 0:
2329
return (TSR_CALL);
2330
case 1:
2331
return (TSR_IRET);
2332
case 2:
2333
return (TSR_JMP);
2334
case 3:
2335
return (TSR_IDT_GATE);
2336
default:
2337
panic("%s: invalid reason %d", __func__, reason);
2338
}
2339
}
2340
2341
static int
2342
emulate_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu)
2343
{
2344
int error;
2345
2346
if (lapic_msr(num))
2347
error = lapic_wrmsr(vcpu->vcpu, num, val, retu);
2348
else
2349
error = vmx_wrmsr(vcpu, num, val, retu);
2350
2351
return (error);
2352
}
2353
2354
static int
2355
emulate_rdmsr(struct vmx_vcpu *vcpu, u_int num, bool *retu)
2356
{
2357
struct vmxctx *vmxctx;
2358
uint64_t result;
2359
uint32_t eax, edx;
2360
int error;
2361
2362
if (lapic_msr(num))
2363
error = lapic_rdmsr(vcpu->vcpu, num, &result, retu);
2364
else
2365
error = vmx_rdmsr(vcpu, num, &result, retu);
2366
2367
if (error == 0) {
2368
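/* rdmsr returns the MSR value in %edx:%eax. */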
eax = result;
2369
vmxctx = &vcpu->ctx;
2370
error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
2371
KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
2372
2373
edx = result >> 32;
2374
error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
2375
KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
2376
}
2377
2378
return (error);
2379
}
2380
2381
static int
2382
vmx_exit_process(struct vmx *vmx, struct vmx_vcpu *vcpu, struct vm_exit *vmexit)
2383
{
2384
int error, errcode, errcode_valid, handled, in;
2385
struct vmxctx *vmxctx;
2386
struct vlapic *vlapic;
2387
struct vm_inout_str *vis;
2388
struct vm_task_switch *ts;
2389
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
2390
uint32_t intr_type, intr_vec, reason;
2391
uint64_t exitintinfo, qual, gpa;
2392
#ifdef KDTRACE_HOOKS
2393
int vcpuid;
2394
#endif
2395
bool retu;
2396
2397
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
2398
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
2399
2400
handled = UNHANDLED;
2401
vmxctx = &vcpu->ctx;
2402
#ifdef KDTRACE_HOOKS
2403
vcpuid = vcpu->vcpuid;
2404
#endif
2405
2406
qual = vmexit->u.vmx.exit_qualification;
2407
reason = vmexit->u.vmx.exit_reason;
2408
vmexit->exitcode = VM_EXITCODE_BOGUS;
2409
2410
vmm_stat_incr(vcpu->vcpu, VMEXIT_COUNT, 1);
2411
SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpuid, vmexit);
2412
2413
/*
2414
* VM-entry failures during or after loading guest state.
2415
*
2416
* These VM-exits are uncommon but must be handled specially
2417
* as most VM-exit fields are not populated as usual.
2418
*/
2419
if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
2420
VMX_CTR0(vcpu, "Handling MCE during VM-entry");
2421
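/* Vector 18 is the machine-check exception (#MC). */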
__asm __volatile("int $18");
2422
return (1);
2423
}
2424
2425
/*
2426
* VM exits that can be triggered during event delivery need to
2427
* be handled specially by re-injecting the event if the IDT
2428
* vectoring information field's valid bit is set.
2429
*
2430
* See "Information for VM Exits During Event Delivery" in Intel SDM
2431
* for details.
2432
*/
2433
idtvec_info = vmcs_idt_vectoring_info();
2434
if (idtvec_info & VMCS_IDT_VEC_VALID) {
2435
idtvec_info &= ~(1 << 12); /* clear undefined bit */
2436
exitintinfo = idtvec_info;
2437
if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2438
idtvec_err = vmcs_idt_vectoring_err();
2439
exitintinfo |= (uint64_t)idtvec_err << 32;
2440
}
2441
error = vm_exit_intinfo(vcpu->vcpu, exitintinfo);
2442
KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
2443
__func__, error));
2444
2445
/*
2446
* If 'virtual NMIs' are being used and the VM-exit
2447
* happened while injecting an NMI during the previous
2448
* VM-entry, then clear "blocking by NMI" in the
2449
* Guest Interruptibility-State so the NMI can be
2450
* reinjected on the subsequent VM-entry.
2451
*
2452
* However, if the NMI was being delivered through a task
2453
* gate, then the new task must start execution with NMIs
2454
* blocked so don't clear NMI blocking in this case.
2455
*/
2456
intr_type = idtvec_info & VMCS_INTR_T_MASK;
2457
if (intr_type == VMCS_INTR_T_NMI) {
2458
if (reason != EXIT_REASON_TASK_SWITCH)
2459
vmx_clear_nmi_blocking(vcpu);
2460
else
2461
vmx_assert_nmi_blocking(vcpu);
2462
}
2463
2464
/*
2465
* Update VM-entry instruction length if the event being
2466
* delivered was a software interrupt or software exception.
2467
*/
2468
if (intr_type == VMCS_INTR_T_SWINTR ||
2469
intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
2470
intr_type == VMCS_INTR_T_SWEXCEPTION) {
2471
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2472
}
2473
}
2474
2475
switch (reason) {
2476
case EXIT_REASON_TASK_SWITCH:
2477
ts = &vmexit->u.task_switch;
2478
ts->tsssel = qual & 0xffff;
2479
ts->reason = vmx_task_switch_reason(qual);
2480
ts->ext = 0;
2481
ts->errcode_valid = 0;
2482
vmx_paging_info(&ts->paging);
2483
/*
2484
* If the task switch was due to a CALL, JMP, IRET, software
2485
* interrupt (INT n) or software exception (INT3, INTO),
2486
* then the saved %rip references the instruction that caused
2487
* the task switch. The instruction length field in the VMCS
2488
* is valid in this case.
2489
*
2490
* In all other cases (e.g., NMI, hardware exception) the
2491
* saved %rip is one that would have been saved in the old TSS
2492
* had the task switch completed normally so the instruction
2493
* length field is not needed in this case and is explicitly
2494
* set to 0.
2495
*/
2496
if (ts->reason == TSR_IDT_GATE) {
2497
KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
2498
("invalid idtvec_info %#x for IDT task switch",
2499
idtvec_info));
2500
intr_type = idtvec_info & VMCS_INTR_T_MASK;
2501
if (intr_type != VMCS_INTR_T_SWINTR &&
2502
intr_type != VMCS_INTR_T_SWEXCEPTION &&
2503
intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
2504
/* Task switch triggered by external event */
2505
ts->ext = 1;
2506
vmexit->inst_length = 0;
2507
if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2508
ts->errcode_valid = 1;
2509
ts->errcode = vmcs_idt_vectoring_err();
2510
}
2511
}
2512
}
2513
vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
2514
SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpuid, vmexit, ts);
2515
VMX_CTR4(vcpu, "task switch reason %d, tss 0x%04x, "
2516
"%s errcode 0x%016lx", ts->reason, ts->tsssel,
2517
ts->ext ? "external" : "internal",
2518
((uint64_t)ts->errcode << 32) | ts->errcode_valid);
2519
break;
2520
case EXIT_REASON_CR_ACCESS:
2521
vmm_stat_incr(vcpu->vcpu, VMEXIT_CR_ACCESS, 1);
2522
SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpuid, vmexit, qual);
2523
switch (qual & 0xf) {
2524
case 0:
2525
handled = vmx_emulate_cr0_access(vcpu, qual);
2526
break;
2527
case 4:
2528
handled = vmx_emulate_cr4_access(vcpu, qual);
2529
break;
2530
case 8:
2531
handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
2532
break;
2533
}
2534
break;
2535
case EXIT_REASON_RDMSR:
2536
vmm_stat_incr(vcpu->vcpu, VMEXIT_RDMSR, 1);
2537
retu = false;
2538
ecx = vmxctx->guest_rcx;
2539
VMX_CTR1(vcpu, "rdmsr 0x%08x", ecx);
2540
SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpuid, vmexit, ecx);
2541
error = emulate_rdmsr(vcpu, ecx, &retu);
2542
if (error) {
2543
vmexit->exitcode = VM_EXITCODE_RDMSR;
2544
vmexit->u.msr.code = ecx;
2545
} else if (!retu) {
2546
handled = HANDLED;
2547
} else {
2548
/* Return to userspace with a valid exitcode */
2549
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
2550
("emulate_rdmsr retu with bogus exitcode"));
2551
}
2552
break;
2553
case EXIT_REASON_WRMSR:
2554
vmm_stat_incr(vcpu->vcpu, VMEXIT_WRMSR, 1);
2555
retu = false;
2556
eax = vmxctx->guest_rax;
2557
ecx = vmxctx->guest_rcx;
2558
edx = vmxctx->guest_rdx;
2559
VMX_CTR2(vcpu, "wrmsr 0x%08x value 0x%016lx",
2560
ecx, (uint64_t)edx << 32 | eax);
2561
SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpuid, ecx,
2562
(uint64_t)edx << 32 | eax);
2563
error = emulate_wrmsr(vcpu, ecx, (uint64_t)edx << 32 | eax,
2564
&retu);
2565
if (error) {
2566
vmexit->exitcode = VM_EXITCODE_WRMSR;
2567
vmexit->u.msr.code = ecx;
2568
vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
2569
} else if (!retu) {
2570
handled = HANDLED;
2571
} else {
2572
/* Return to userspace with a valid exitcode */
2573
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
2574
("emulate_wrmsr retu with bogus exitcode"));
2575
}
2576
break;
2577
case EXIT_REASON_HLT:
2578
vmm_stat_incr(vcpu->vcpu, VMEXIT_HLT, 1);
2579
SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpuid, vmexit);
2580
vmexit->exitcode = VM_EXITCODE_HLT;
2581
vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2582
if (virtual_interrupt_delivery)
2583
vmexit->u.hlt.intr_status =
2584
vmcs_read(VMCS_GUEST_INTR_STATUS);
2585
else
2586
vmexit->u.hlt.intr_status = 0;
2587
break;
2588
case EXIT_REASON_MTF:
2589
vmm_stat_incr(vcpu->vcpu, VMEXIT_MTRAP, 1);
2590
SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpuid, vmexit);
2591
vmexit->exitcode = VM_EXITCODE_MTRAP;
2592
vmexit->inst_length = 0;
2593
break;
2594
case EXIT_REASON_PAUSE:
2595
vmm_stat_incr(vcpu->vcpu, VMEXIT_PAUSE, 1);
2596
SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpuid, vmexit);
2597
vmexit->exitcode = VM_EXITCODE_PAUSE;
2598
break;
2599
case EXIT_REASON_INTR_WINDOW:
2600
vmm_stat_incr(vcpu->vcpu, VMEXIT_INTR_WINDOW, 1);
2601
SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpuid, vmexit);
2602
vmx_clear_int_window_exiting(vcpu);
2603
return (1);
2604
case EXIT_REASON_EXT_INTR:
2605
/*
2606
* External interrupts serve only to cause VM exits and allow
2607
* the host interrupt handler to run.
2608
*
2609
* If this external interrupt triggers a virtual interrupt
2610
* to a VM, then that state will be recorded by the
2611
* host interrupt handler in the VM's softc. We will inject
2612
* this virtual interrupt during the subsequent VM enter.
2613
*/
2614
intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2615
SDT_PROBE4(vmm, vmx, exit, interrupt,
2616
vmx, vcpuid, vmexit, intr_info);
2617
2618
/*
2619
* XXX: Ignore this exit if VMCS_INTR_VALID is not set.
2620
* This appears to be a bug in VMware Fusion?
2621
*/
2622
if (!(intr_info & VMCS_INTR_VALID))
2623
return (1);
2624
KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
2625
(intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
2626
("VM exit interruption info invalid: %#x", intr_info));
2627
vmx_trigger_hostintr(intr_info & 0xff);
2628
2629
/*
2630
* This is special. We want to treat this as a 'handled'
2631
* VM-exit but without incrementing the instruction pointer.
2632
*/
2633
vmm_stat_incr(vcpu->vcpu, VMEXIT_EXTINT, 1);
2634
return (1);
2635
case EXIT_REASON_NMI_WINDOW:
2636
SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpuid, vmexit);
2637
/* Exit to allow the pending virtual NMI to be injected */
2638
if (vm_nmi_pending(vcpu->vcpu))
2639
vmx_inject_nmi(vcpu);
2640
vmx_clear_nmi_window_exiting(vcpu);
2641
vmm_stat_incr(vcpu->vcpu, VMEXIT_NMI_WINDOW, 1);
2642
return (1);
2643
case EXIT_REASON_INOUT:
2644
vmm_stat_incr(vcpu->vcpu, VMEXIT_INOUT, 1);
2645
vmexit->exitcode = VM_EXITCODE_INOUT;
2646
vmexit->u.inout.bytes = (qual & 0x7) + 1;
2647
vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
2648
vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
2649
vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
2650
vmexit->u.inout.port = (uint16_t)(qual >> 16);
2651
vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
2652
if (vmexit->u.inout.string) {
2653
inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
2654
vmexit->exitcode = VM_EXITCODE_INOUT_STR;
2655
vis = &vmexit->u.inout_str;
2656
vmx_paging_info(&vis->paging);
2657
vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2658
vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
2659
vis->index = inout_str_index(vcpu, in);
2660
vis->count = inout_str_count(vcpu, vis->inout.rep);
2661
vis->addrsize = inout_str_addrsize(inst_info);
2662
vis->cs_d = 0;
2663
vis->cs_base = 0;
2664
inout_str_seginfo(vcpu, inst_info, in, vis);
2665
}
2666
SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpuid, vmexit);
2667
break;
2668
case EXIT_REASON_CPUID:
2669
vmm_stat_incr(vcpu->vcpu, VMEXIT_CPUID, 1);
2670
SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpuid, vmexit);
2671
handled = vmx_handle_cpuid(vcpu, vmxctx);
2672
break;
2673
case EXIT_REASON_EXCEPTION:
2674
vmm_stat_incr(vcpu->vcpu, VMEXIT_EXCEPTION, 1);
2675
intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2676
KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2677
("VM exit interruption info invalid: %#x", intr_info));
2678
2679
intr_vec = intr_info & 0xff;
2680
intr_type = intr_info & VMCS_INTR_T_MASK;
2681
2682
/*
2683
* If Virtual NMIs control is 1 and the VM-exit is due to a
2684
* fault encountered during the execution of IRET then we must
2685
* restore the state of "virtual-NMI blocking" before resuming
2686
* the guest.
2687
*
2688
* See "Resuming Guest Software after Handling an Exception".
2689
* See "Information for VM Exits Due to Vectored Events".
2690
*/
2691
if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2692
(intr_vec != IDT_DF) &&
2693
(intr_info & EXIT_QUAL_NMIUDTI) != 0)
2694
vmx_restore_nmi_blocking(vcpu);
2695
2696
/*
2697
* The NMI has already been handled in vmx_exit_handle_nmi().
2698
*/
2699
if (intr_type == VMCS_INTR_T_NMI)
2700
return (1);
2701
2702
/*
2703
* Call the machine check handler by hand. Also don't reflect
2704
* the machine check back into the guest.
2705
*/
2706
if (intr_vec == IDT_MC) {
2707
VMX_CTR0(vcpu, "Vectoring to MCE handler");
2708
__asm __volatile("int $18");
2709
return (1);
2710
}
2711
2712
/*
2713
* If the hypervisor has requested user exits for
2714
* debug exceptions, bounce them out to userland.
2715
*/
2716
if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP &&
2717
(vcpu->cap.set & (1 << VM_CAP_BPT_EXIT))) {
2718
vmexit->exitcode = VM_EXITCODE_BPT;
2719
vmexit->u.bpt.inst_length = vmexit->inst_length;
2720
vmexit->inst_length = 0;
2721
break;
2722
}
2723
2724
if (intr_vec == IDT_PF) {
2725
error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
2726
KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
2727
__func__, error));
2728
}
2729
2730
/*
2731
* Software exceptions exhibit trap-like behavior. This in
2732
* turn requires populating the VM-entry instruction length
2733
* so that the %rip in the trap frame is past the INT3/INTO
2734
* instruction.
2735
*/
2736
if (intr_type == VMCS_INTR_T_SWEXCEPTION)
2737
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2738
2739
/* Reflect all other exceptions back into the guest */
2740
errcode_valid = errcode = 0;
2741
if (intr_info & VMCS_INTR_DEL_ERRCODE) {
2742
errcode_valid = 1;
2743
errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
2744
}
2745
VMX_CTR2(vcpu, "Reflecting exception %d/%#x into "
2746
"the guest", intr_vec, errcode);
2747
SDT_PROBE5(vmm, vmx, exit, exception,
2748
vmx, vcpuid, vmexit, intr_vec, errcode);
2749
error = vm_inject_exception(vcpu->vcpu, intr_vec,
2750
errcode_valid, errcode, 0);
2751
KASSERT(error == 0, ("%s: vm_inject_exception error %d",
2752
__func__, error));
2753
return (1);
2754
2755
case EXIT_REASON_EPT_FAULT:
2756
/*
2757
* If 'gpa' lies within the address space allocated to
2758
* memory then this must be a nested page fault otherwise
2759
* this must be an instruction that accesses MMIO space.
2760
*/
2761
gpa = vmcs_gpa();
2762
if (vm_mem_allocated(vcpu->vcpu, gpa) ||
2763
ppt_is_mmio(vmx->vm, gpa) || apic_access_fault(vcpu, gpa)) {
2764
vmexit->exitcode = VM_EXITCODE_PAGING;
2765
vmexit->inst_length = 0;
2766
vmexit->u.paging.gpa = gpa;
2767
vmexit->u.paging.fault_type = ept_fault_type(qual);
2768
vmm_stat_incr(vcpu->vcpu, VMEXIT_NESTED_FAULT, 1);
2769
SDT_PROBE5(vmm, vmx, exit, nestedfault,
2770
vmx, vcpuid, vmexit, gpa, qual);
2771
} else if (ept_emulation_fault(qual)) {
2772
vmexit_inst_emul(vmexit, gpa, vmcs_gla());
2773
vmm_stat_incr(vcpu->vcpu, VMEXIT_INST_EMUL, 1);
2774
SDT_PROBE4(vmm, vmx, exit, mmiofault,
2775
vmx, vcpuid, vmexit, gpa);
2776
}
2777
/*
2778
* If Virtual NMIs control is 1 and the VM-exit is due to an
2779
* EPT fault during the execution of IRET then we must restore
2780
* the state of "virtual-NMI blocking" before resuming.
2781
*
2782
* See description of "NMI unblocking due to IRET" in
2783
* "Exit Qualification for EPT Violations".
2784
*/
2785
if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2786
(qual & EXIT_QUAL_NMIUDTI) != 0)
2787
vmx_restore_nmi_blocking(vcpu);
2788
break;
2789
case EXIT_REASON_VIRTUALIZED_EOI:
2790
vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
2791
vmexit->u.ioapic_eoi.vector = qual & 0xFF;
2792
SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpuid, vmexit);
2793
vmexit->inst_length = 0; /* trap-like */
2794
break;
2795
case EXIT_REASON_APIC_ACCESS:
2796
SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpuid, vmexit);
2797
handled = vmx_handle_apic_access(vcpu, vmexit);
2798
break;
2799
case EXIT_REASON_APIC_WRITE:
2800
/*
2801
* APIC-write VM exit is trap-like so the %rip is already
2802
* pointing to the next instruction.
2803
*/
2804
vmexit->inst_length = 0;
2805
vlapic = vm_lapic(vcpu->vcpu);
2806
SDT_PROBE4(vmm, vmx, exit, apicwrite,
2807
vmx, vcpuid, vmexit, vlapic);
2808
handled = vmx_handle_apic_write(vcpu, vlapic, qual);
2809
break;
2810
case EXIT_REASON_XSETBV:
2811
SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpuid, vmexit);
2812
handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
2813
break;
2814
case EXIT_REASON_MONITOR:
2815
SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpuid, vmexit);
2816
vmexit->exitcode = VM_EXITCODE_MONITOR;
2817
break;
2818
case EXIT_REASON_MWAIT:
2819
SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpuid, vmexit);
2820
vmexit->exitcode = VM_EXITCODE_MWAIT;
2821
break;
2822
case EXIT_REASON_TPR:
2823
vlapic = vm_lapic(vcpu->vcpu);
2824
vlapic_sync_tpr(vlapic);
2825
vmexit->inst_length = 0;
2826
handled = HANDLED;
2827
break;
2828
case EXIT_REASON_VMCALL:
2829
case EXIT_REASON_VMCLEAR:
2830
case EXIT_REASON_VMLAUNCH:
2831
case EXIT_REASON_VMPTRLD:
2832
case EXIT_REASON_VMPTRST:
2833
case EXIT_REASON_VMREAD:
2834
case EXIT_REASON_VMRESUME:
2835
case EXIT_REASON_VMWRITE:
2836
case EXIT_REASON_VMXOFF:
2837
case EXIT_REASON_VMXON:
2838
SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpuid, vmexit);
2839
vmexit->exitcode = VM_EXITCODE_VMINSN;
2840
break;
2841
case EXIT_REASON_INVD:
2842
case EXIT_REASON_WBINVD:
2843
/* ignore exit */
2844
handled = HANDLED;
2845
break;
2846
default:
2847
SDT_PROBE4(vmm, vmx, exit, unknown,
2848
vmx, vcpuid, vmexit, reason);
2849
vmm_stat_incr(vcpu->vcpu, VMEXIT_UNKNOWN, 1);
2850
break;
2851
}
2852
2853
if (handled) {
2854
/*
2855
* It is possible that control is returned to userland
2856
* even though we were able to handle the VM exit in the
2857
* kernel.
2858
*
2859
* In such a case we want to make sure that the userland
2860
* restarts guest execution at the instruction *after*
2861
* the one we just processed. Therefore we update the
2862
* guest rip in the VMCS and in 'vmexit'.
2863
*/
2864
vmexit->rip += vmexit->inst_length;
2865
vmexit->inst_length = 0;
2866
vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
2867
} else {
2868
if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
2869
/*
2870
* If this VM exit was not claimed by anybody then
2871
* treat it as a generic VMX exit.
2872
*/
2873
vmexit->exitcode = VM_EXITCODE_VMX;
2874
vmexit->u.vmx.status = VM_SUCCESS;
2875
vmexit->u.vmx.inst_type = 0;
2876
vmexit->u.vmx.inst_error = 0;
2877
} else {
2878
/*
2879
* The exitcode and collateral have been populated.
2880
* The VM exit will be processed further in userland.
2881
*/
2882
}
2883
}
2884
2885
SDT_PROBE4(vmm, vmx, exit, return,
2886
vmx, vcpuid, vmexit, handled);
2887
return (handled);
2888
}
2889
2890
static __inline void
2891
vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
2892
{
2893
2894
KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
2895
("vmx_exit_inst_error: invalid inst_fail_status %d",
2896
vmxctx->inst_fail_status));
2897
2898
vmexit->inst_length = 0;
2899
vmexit->exitcode = VM_EXITCODE_VMX;
2900
vmexit->u.vmx.status = vmxctx->inst_fail_status;
2901
vmexit->u.vmx.inst_error = vmcs_instruction_error();
2902
vmexit->u.vmx.exit_reason = ~0;
2903
vmexit->u.vmx.exit_qualification = ~0;
2904
2905
switch (rc) {
2906
case VMX_VMRESUME_ERROR:
2907
case VMX_VMLAUNCH_ERROR:
2908
vmexit->u.vmx.inst_type = rc;
2909
break;
2910
default:
2911
panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
2912
}
2913
}
2914
2915
/*
2916
* If the NMI-exiting VM execution control is set to '1' then an NMI in
2917
* non-root operation causes a VM-exit. NMI blocking is in effect so it is
2918
* sufficient to simply vector to the NMI handler via a software interrupt.
2919
* However, this must be done before maskable interrupts are enabled
2920
* otherwise the "iret" issued by an interrupt handler will incorrectly
2921
* clear NMI blocking.
2922
*/
2923
static __inline void
2924
vmx_exit_handle_nmi(struct vmx_vcpu *vcpu, struct vm_exit *vmexit)
2925
{
2926
uint32_t intr_info;
2927
2928
KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
2929
2930
if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
2931
return;
2932
2933
intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2934
KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2935
("VM exit interruption info invalid: %#x", intr_info));
2936
2937
if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2938
KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
2939
"to NMI has invalid vector: %#x", intr_info));
2940
VMX_CTR0(vcpu, "Vectoring to NMI handler");
2941
__asm __volatile("int $2");
2942
}
2943
}
2944
2945
static __inline void
2946
vmx_dr_enter_guest(struct vmxctx *vmxctx)
2947
{
2948
register_t rflags;
2949
2950
/* Save host control debug registers. */
2951
vmxctx->host_dr7 = rdr7();
2952
vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
2953
2954
/*
2955
* Disable debugging in DR7 and DEBUGCTL to avoid triggering
2956
* exceptions in the host based on the guest DRx values. The
2957
* guest DR7 and DEBUGCTL are saved/restored in the VMCS.
2958
*/
2959
load_dr7(0);
2960
wrmsr(MSR_DEBUGCTLMSR, 0);
2961
2962
/*
2963
* Disable single stepping the kernel to avoid corrupting the
2964
* guest DR6. A debugger might still be able to corrupt the
2965
* guest DR6 by setting a breakpoint after this point and then
2966
* single stepping.
2967
*/
2968
rflags = read_rflags();
2969
vmxctx->host_tf = rflags & PSL_T;
2970
write_rflags(rflags & ~PSL_T);
2971
2972
/* Save host debug registers. */
2973
vmxctx->host_dr0 = rdr0();
2974
vmxctx->host_dr1 = rdr1();
2975
vmxctx->host_dr2 = rdr2();
2976
vmxctx->host_dr3 = rdr3();
2977
vmxctx->host_dr6 = rdr6();
2978
2979
/* Restore guest debug registers. */
2980
load_dr0(vmxctx->guest_dr0);
2981
load_dr1(vmxctx->guest_dr1);
2982
load_dr2(vmxctx->guest_dr2);
2983
load_dr3(vmxctx->guest_dr3);
2984
load_dr6(vmxctx->guest_dr6);
2985
}
2986
2987
static __inline void
2988
vmx_dr_leave_guest(struct vmxctx *vmxctx)
2989
{
2990
2991
/* Save guest debug registers. */
2992
vmxctx->guest_dr0 = rdr0();
2993
vmxctx->guest_dr1 = rdr1();
2994
vmxctx->guest_dr2 = rdr2();
2995
vmxctx->guest_dr3 = rdr3();
2996
vmxctx->guest_dr6 = rdr6();
2997
2998
/*
2999
* Restore host debug registers. Restore DR7, DEBUGCTL, and
3000
* PSL_T last.
3001
*/
3002
load_dr0(vmxctx->host_dr0);
3003
load_dr1(vmxctx->host_dr1);
3004
load_dr2(vmxctx->host_dr2);
3005
load_dr3(vmxctx->host_dr3);
3006
load_dr6(vmxctx->host_dr6);
3007
wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl);
3008
load_dr7(vmxctx->host_dr7);
3009
write_rflags(read_rflags() | vmxctx->host_tf);
3010
}
3011
3012
static __inline void
3013
vmx_pmap_activate(struct vmx *vmx, pmap_t pmap)
3014
{
3015
long eptgen;
3016
int cpu;
3017
3018
cpu = curcpu;
3019
3020
CPU_SET_ATOMIC(cpu, &pmap->pm_active);
3021
smr_enter(pmap->pm_eptsmr);
3022
eptgen = atomic_load_long(&pmap->pm_eptgen);
3023
if (eptgen != vmx->eptgen[cpu]) {
3024
vmx->eptgen[cpu] = eptgen;
3025
invept(INVEPT_TYPE_SINGLE_CONTEXT,
3026
(struct invept_desc){ .eptp = vmx->eptp, ._res = 0 });
3027
}
3028
}
3029
3030
static __inline void
3031
vmx_pmap_deactivate(struct vmx *vmx, pmap_t pmap)
3032
{
3033
smr_exit(pmap->pm_eptsmr);
3034
CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
3035
}
3036
3037
static int
3038
vmx_run(void *vcpui, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo)
3039
{
3040
int rc, handled, launched;
3041
struct vmx *vmx;
3042
struct vmx_vcpu *vcpu;
3043
struct vmxctx *vmxctx;
3044
struct vmcs *vmcs;
3045
struct vm_exit *vmexit;
3046
struct vlapic *vlapic;
3047
uint32_t exit_reason;
3048
struct region_descriptor gdtr, idtr;
3049
uint16_t ldt_sel;
3050
3051
vcpu = vcpui;
3052
vmx = vcpu->vmx;
3053
vmcs = vcpu->vmcs;
3054
vmxctx = &vcpu->ctx;
3055
vlapic = vm_lapic(vcpu->vcpu);
3056
vmexit = vm_exitinfo(vcpu->vcpu);
3057
launched = 0;
3058
3059
KASSERT(vmxctx->pmap == pmap,
3060
("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
3061
3062
vmx_msr_guest_enter(vcpu);
3063
3064
VMPTRLD(vmcs);
3065
3066
/*
3067
* XXX
3068
* We do this every time because we may setup the virtual machine
3069
* from a different process than the one that actually runs it.
3070
*
3071
* If the life of a virtual machine was spent entirely in the context
3072
* of a single process we could do this once in vmx_init().
3073
*/
3074
vmcs_write(VMCS_HOST_CR3, rcr3());
3075
3076
vmcs_write(VMCS_GUEST_RIP, rip);
3077
vmx_set_pcpu_defaults(vmx, vcpu, pmap);
3078
do {
3079
KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
3080
"%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
3081
3082
handled = UNHANDLED;
3083
/*
3084
* Interrupts are disabled from this point on until the
3085
* guest starts executing. This is done for the following
3086
* reasons:
3087
*
3088
* If an AST is asserted on this thread after the check below,
3089
* then the IPI_AST notification will not be lost, because it
3090
* will cause a VM exit due to external interrupt as soon as
3091
* the guest state is loaded.
3092
*
3093
* A posted interrupt after 'vmx_inject_interrupts()' will
3094
* not be "lost" because it will be held pending in the host
3095
* APIC because interrupts are disabled. The pending interrupt
3096
* will be recognized as soon as the guest state is loaded.
3097
*
3098
* The same reasoning applies to the IPI generated by
3099
* pmap_invalidate_ept().
3100
*/
3101
disable_intr();
3102
vmx_inject_interrupts(vcpu, vlapic, rip);
3103
3104
/*
3105
* Check for vcpu suspension after injecting events because
3106
* vmx_inject_interrupts() can suspend the vcpu due to a
3107
* triple fault.
3108
*/
3109
if (vcpu_suspended(evinfo)) {
3110
enable_intr();
3111
vm_exit_suspended(vcpu->vcpu, rip);
3112
break;
3113
}
3114
3115
if (vcpu_rendezvous_pending(vcpu->vcpu, evinfo)) {
3116
enable_intr();
3117
vm_exit_rendezvous(vcpu->vcpu, rip);
3118
break;
3119
}
3120
3121
if (vcpu_reqidle(evinfo)) {
3122
enable_intr();
3123
vm_exit_reqidle(vcpu->vcpu, rip);
3124
break;
3125
}
3126
3127
if (vcpu_should_yield(vcpu->vcpu)) {
3128
enable_intr();
3129
vm_exit_astpending(vcpu->vcpu, rip);
3130
vmx_astpending_trace(vcpu, rip);
3131
handled = HANDLED;
3132
break;
3133
}
3134
3135
if (vcpu_debugged(vcpu->vcpu)) {
3136
enable_intr();
3137
vm_exit_debug(vcpu->vcpu, rip);
3138
break;
3139
}
3140
3141
/*
3142
* If TPR Shadowing is enabled, the TPR Threshold
3143
* must be updated right before entering the guest.
3144
*/
3145
if (tpr_shadowing && !virtual_interrupt_delivery) {
3146
if ((vcpu->cap.proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) {
3147
vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic));
3148
}
3149
}
3150
3151
/*
3152
* VM exits restore the base address but not the
3153
* limits of GDTR and IDTR. The VMCS only stores the
3154
* base address, so VM exits set the limits to 0xffff.
3155
* Save and restore the full GDTR and IDTR to restore
3156
* the limits.
3157
*
3158
* The VMCS does not save the LDTR at all, and VM
3159
* exits clear LDTR as if a NULL selector were loaded.
3160
* The userspace hypervisor probably doesn't use an
3161
* LDT, but save and restore it to be safe.
3162
*/
3163
sgdt(&gdtr);
3164
sidt(&idtr);
3165
ldt_sel = sldt();
3166
3167
/*
3168
* The TSC_AUX MSR must be saved/restored while interrupts
3169
* are disabled so that it is not possible for the guest
3170
* TSC_AUX MSR value to be overwritten by the resume
3171
* portion of the IPI_SUSPEND codepath. This is why the
3172
* transition of this MSR is handled separately from those
3173
* handled by vmx_msr_guest_{enter,exit}(), which are ok to
3174
* be transitioned with preemption disabled but interrupts
3175
* enabled.
3176
*
3177
* These vmx_msr_guest_{enter,exit}_tsc_aux() calls can be
3178
* anywhere in this loop so long as they happen with
3179
* interrupts disabled. This location is chosen for
3180
* simplicity.
3181
*/
3182
vmx_msr_guest_enter_tsc_aux(vmx, vcpu);
3183
3184
vmx_dr_enter_guest(vmxctx);
3185
3186
/*
3187
* Mark the EPT as active on this host CPU and invalidate
3188
* EPTP-tagged TLB entries if required.
3189
*/
3190
vmx_pmap_activate(vmx, pmap);
3191
3192
vmx_run_trace(vcpu);
3193
rc = vmx_enter_guest(vmxctx, vmx, launched);
3194
3195
vmx_pmap_deactivate(vmx, pmap);
3196
vmx_dr_leave_guest(vmxctx);
3197
vmx_msr_guest_exit_tsc_aux(vmx, vcpu);
3198
3199
bare_lgdt(&gdtr);
3200
lidt(&idtr);
3201
lldt(ldt_sel);
3202
3203
/* Collect some information for VM exit processing */
3204
vmexit->rip = rip = vmcs_guest_rip();
3205
vmexit->inst_length = vmexit_instruction_length();
3206
vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
3207
vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
3208
3209
/* Update 'nextrip' */
3210
vcpu->state.nextrip = rip;
3211
3212
if (rc == VMX_GUEST_VMEXIT) {
3213
vmx_exit_handle_nmi(vcpu, vmexit);
3214
enable_intr();
3215
handled = vmx_exit_process(vmx, vcpu, vmexit);
3216
} else {
3217
enable_intr();
3218
vmx_exit_inst_error(vmxctx, rc, vmexit);
3219
}
3220
launched = 1;
3221
vmx_exit_trace(vcpu, rip, exit_reason, handled);
3222
rip = vmexit->rip;
3223
} while (handled);
3224
3225
/*
3226
* If a VM exit has been handled then the exitcode must be BOGUS
3227
* If a VM exit is not handled then the exitcode must not be BOGUS
3228
*/
3229
if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
3230
(!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
3231
panic("Mismatch between handled (%d) and exitcode (%d)",
3232
handled, vmexit->exitcode);
3233
}
3234
3235
VMX_CTR1(vcpu, "returning from vmx_run: exitcode %d",
3236
vmexit->exitcode);
3237
3238
VMCLEAR(vmcs);
3239
vmx_msr_guest_exit(vcpu);
3240
3241
return (0);
3242
}
3243
3244
static void
3245
vmx_vcpu_cleanup(void *vcpui)
3246
{
3247
struct vmx_vcpu *vcpu = vcpui;
3248
3249
vpid_free(vcpu->state.vpid);
3250
free(vcpu->pir_desc, M_VMX);
3251
free(vcpu->apic_page, M_VMX);
3252
free(vcpu->vmcs, M_VMX);
3253
free(vcpu, M_VMX);
3254
}
3255
3256
static void
3257
vmx_cleanup(void *vmi)
3258
{
3259
struct vmx *vmx = vmi;
3260
3261
if (virtual_interrupt_delivery)
3262
vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
3263
3264
free(vmx->msr_bitmap, M_VMX);
3265
free(vmx, M_VMX);
3266
3267
return;
3268
}
3269
3270
static register_t *
3271
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
3272
{
3273
3274
switch (reg) {
3275
case VM_REG_GUEST_RAX:
3276
return (&vmxctx->guest_rax);
3277
case VM_REG_GUEST_RBX:
3278
return (&vmxctx->guest_rbx);
3279
case VM_REG_GUEST_RCX:
3280
return (&vmxctx->guest_rcx);
3281
case VM_REG_GUEST_RDX:
3282
return (&vmxctx->guest_rdx);
3283
case VM_REG_GUEST_RSI:
3284
return (&vmxctx->guest_rsi);
3285
case VM_REG_GUEST_RDI:
3286
return (&vmxctx->guest_rdi);
3287
case VM_REG_GUEST_RBP:
3288
return (&vmxctx->guest_rbp);
3289
case VM_REG_GUEST_R8:
3290
return (&vmxctx->guest_r8);
3291
case VM_REG_GUEST_R9:
3292
return (&vmxctx->guest_r9);
3293
case VM_REG_GUEST_R10:
3294
return (&vmxctx->guest_r10);
3295
case VM_REG_GUEST_R11:
3296
return (&vmxctx->guest_r11);
3297
case VM_REG_GUEST_R12:
3298
return (&vmxctx->guest_r12);
3299
case VM_REG_GUEST_R13:
3300
return (&vmxctx->guest_r13);
3301
case VM_REG_GUEST_R14:
3302
return (&vmxctx->guest_r14);
3303
case VM_REG_GUEST_R15:
3304
return (&vmxctx->guest_r15);
3305
case VM_REG_GUEST_CR2:
3306
return (&vmxctx->guest_cr2);
3307
case VM_REG_GUEST_DR0:
3308
return (&vmxctx->guest_dr0);
3309
case VM_REG_GUEST_DR1:
3310
return (&vmxctx->guest_dr1);
3311
case VM_REG_GUEST_DR2:
3312
return (&vmxctx->guest_dr2);
3313
case VM_REG_GUEST_DR3:
3314
return (&vmxctx->guest_dr3);
3315
case VM_REG_GUEST_DR6:
3316
return (&vmxctx->guest_dr6);
3317
default:
3318
break;
3319
}
3320
return (NULL);
3321
}
3322
3323
static int
3324
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
3325
{
3326
register_t *regp;
3327
3328
if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
3329
*retval = *regp;
3330
return (0);
3331
} else
3332
return (EINVAL);
3333
}
3334
3335
static int
3336
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
3337
{
3338
register_t *regp;
3339
3340
if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
3341
*regp = val;
3342
return (0);
3343
} else
3344
return (EINVAL);
3345
}
3346
3347
static int
3348
vmx_get_intr_shadow(struct vmx_vcpu *vcpu, int running, uint64_t *retval)
3349
{
3350
uint64_t gi;
3351
int error;
3352
3353
error = vmcs_getreg(vcpu->vmcs, running,
3354
VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
3355
*retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
3356
return (error);
3357
}
3358
3359
static int
3360
vmx_modify_intr_shadow(struct vmx_vcpu *vcpu, int running, uint64_t val)
3361
{
3362
struct vmcs *vmcs;
3363
uint64_t gi;
3364
int error, ident;
3365
3366
/*
3367
* Forcing the vcpu into an interrupt shadow is not supported.
3368
*/
3369
if (val) {
3370
error = EINVAL;
3371
goto done;
3372
}
3373
3374
vmcs = vcpu->vmcs;
3375
ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
3376
error = vmcs_getreg(vmcs, running, ident, &gi);
3377
if (error == 0) {
3378
gi &= ~HWINTR_BLOCKING;
3379
error = vmcs_setreg(vmcs, running, ident, gi);
3380
}
3381
done:
3382
VMX_CTR2(vcpu, "Setting intr_shadow to %#lx %s", val,
3383
error ? "failed" : "succeeded");
3384
return (error);
3385
}
3386
3387
static int
3388
vmx_shadow_reg(int reg)
3389
{
3390
int shreg;
3391
3392
shreg = -1;
3393
3394
switch (reg) {
3395
case VM_REG_GUEST_CR0:
3396
shreg = VMCS_CR0_SHADOW;
3397
break;
3398
case VM_REG_GUEST_CR4:
3399
shreg = VMCS_CR4_SHADOW;
3400
break;
3401
default:
3402
break;
3403
}
3404
3405
return (shreg);
3406
}
3407
3408
static int
3409
vmx_getreg(void *vcpui, int reg, uint64_t *retval)
3410
{
3411
int running, hostcpu;
3412
struct vmx_vcpu *vcpu = vcpui;
3413
struct vmx *vmx = vcpu->vmx;
3414
3415
running = vcpu_is_running(vcpu->vcpu, &hostcpu);
3416
if (running && hostcpu != curcpu)
3417
panic("vmx_getreg: %s%d is running", vm_name(vmx->vm),
3418
vcpu->vcpuid);
3419
3420
switch (reg) {
3421
case VM_REG_GUEST_INTR_SHADOW:
3422
return (vmx_get_intr_shadow(vcpu, running, retval));
3423
case VM_REG_GUEST_KGS_BASE:
3424
*retval = vcpu->guest_msrs[IDX_MSR_KGSBASE];
3425
return (0);
3426
case VM_REG_GUEST_TPR:
3427
*retval = vlapic_get_cr8(vm_lapic(vcpu->vcpu));
3428
return (0);
3429
}
3430
3431
if (vmxctx_getreg(&vcpu->ctx, reg, retval) == 0)
3432
return (0);
3433
3434
return (vmcs_getreg(vcpu->vmcs, running, reg, retval));
3435
}
3436
3437
static int
3438
vmx_setreg(void *vcpui, int reg, uint64_t val)
3439
{
3440
int error, hostcpu, running, shadow;
3441
uint64_t ctls;
3442
pmap_t pmap;
3443
struct vmx_vcpu *vcpu = vcpui;
3444
struct vmx *vmx = vcpu->vmx;
3445
3446
running = vcpu_is_running(vcpu->vcpu, &hostcpu);
3447
if (running && hostcpu != curcpu)
3448
panic("vmx_setreg: %s%d is running", vm_name(vmx->vm),
3449
vcpu->vcpuid);
3450
3451
if (reg == VM_REG_GUEST_INTR_SHADOW)
3452
return (vmx_modify_intr_shadow(vcpu, running, val));
3453
3454
if (vmxctx_setreg(&vcpu->ctx, reg, val) == 0)
3455
return (0);
3456
3457
/* Do not permit user write access to VMCS fields by offset. */
3458
if (reg < 0)
3459
return (EINVAL);
3460
3461
error = vmcs_setreg(vcpu->vmcs, running, reg, val);
3462
3463
if (error == 0) {
3464
/*
3465
* If the "load EFER" VM-entry control is 1 then the
3466
* value of EFER.LMA must be identical to "IA-32e mode guest"
3467
* bit in the VM-entry control.
3468
*/
3469
if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
3470
(reg == VM_REG_GUEST_EFER)) {
3471
vmcs_getreg(vcpu->vmcs, running,
3472
VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
3473
if (val & EFER_LMA)
3474
ctls |= VM_ENTRY_GUEST_LMA;
3475
else
3476
ctls &= ~VM_ENTRY_GUEST_LMA;
3477
vmcs_setreg(vcpu->vmcs, running,
3478
VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
3479
}
3480
3481
shadow = vmx_shadow_reg(reg);
3482
if (shadow > 0) {
3483
/*
3484
* Store the unmodified value in the shadow
3485
*/
3486
error = vmcs_setreg(vcpu->vmcs, running,
3487
VMCS_IDENT(shadow), val);
3488
}
3489
3490
if (reg == VM_REG_GUEST_CR3) {
3491
/*
3492
* Invalidate the guest vcpu's TLB mappings to emulate
3493
* the behavior of updating %cr3.
3494
*
3495
* XXX the processor retains global mappings when %cr3
3496
* is updated but vmx_invvpid() does not.
3497
*/
3498
pmap = vcpu->ctx.pmap;
3499
vmx_invvpid(vmx, vcpu, pmap, running);
3500
}
3501
}
3502
3503
return (error);
3504
}
3505
3506
static int
3507
vmx_getdesc(void *vcpui, int reg, struct seg_desc *desc)
3508
{
3509
int hostcpu, running;
3510
struct vmx_vcpu *vcpu = vcpui;
3511
struct vmx *vmx = vcpu->vmx;
3512
3513
running = vcpu_is_running(vcpu->vcpu, &hostcpu);
3514
if (running && hostcpu != curcpu)
3515
panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm),
3516
vcpu->vcpuid);
3517
3518
return (vmcs_getdesc(vcpu->vmcs, running, reg, desc));
3519
}
3520
3521
static int
3522
vmx_setdesc(void *vcpui, int reg, struct seg_desc *desc)
3523
{
3524
int hostcpu, running;
3525
struct vmx_vcpu *vcpu = vcpui;
3526
struct vmx *vmx = vcpu->vmx;
3527
3528
running = vcpu_is_running(vcpu->vcpu, &hostcpu);
3529
if (running && hostcpu != curcpu)
3530
panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm),
3531
vcpu->vcpuid);
3532
3533
return (vmcs_setdesc(vcpu->vmcs, running, reg, desc));
3534
}
3535
3536
static int
3537
vmx_getcap(void *vcpui, int type, int *retval)
3538
{
3539
struct vmx_vcpu *vcpu = vcpui;
3540
int vcap;
3541
int ret;
3542
3543
ret = ENOENT;
3544
3545
vcap = vcpu->cap.set;
3546
3547
switch (type) {
3548
case VM_CAP_HALT_EXIT:
3549
if (cap_halt_exit)
3550
ret = 0;
3551
break;
3552
case VM_CAP_PAUSE_EXIT:
3553
if (cap_pause_exit)
3554
ret = 0;
3555
break;
3556
case VM_CAP_MTRAP_EXIT:
3557
if (cap_monitor_trap)
3558
ret = 0;
3559
break;
3560
case VM_CAP_RDPID:
3561
if (cap_rdpid)
3562
ret = 0;
3563
break;
3564
case VM_CAP_RDTSCP:
3565
if (cap_rdtscp)
3566
ret = 0;
3567
break;
3568
case VM_CAP_UNRESTRICTED_GUEST:
3569
if (cap_unrestricted_guest)
3570
ret = 0;
3571
break;
3572
case VM_CAP_ENABLE_INVPCID:
3573
if (cap_invpcid)
3574
ret = 0;
3575
break;
3576
case VM_CAP_BPT_EXIT:
3577
case VM_CAP_IPI_EXIT:
3578
ret = 0;
3579
break;
3580
default:
3581
break;
3582
}
3583
3584
if (ret == 0)
3585
*retval = (vcap & (1 << type)) ? 1 : 0;
3586
3587
return (ret);
3588
}
3589
3590
static int
3591
vmx_setcap(void *vcpui, int type, int val)
3592
{
3593
struct vmx_vcpu *vcpu = vcpui;
3594
struct vmcs *vmcs = vcpu->vmcs;
3595
struct vlapic *vlapic;
3596
uint32_t baseval;
3597
uint32_t *pptr;
3598
int error;
3599
int flag;
3600
int reg;
3601
int retval;
3602
3603
retval = ENOENT;
3604
pptr = NULL;
3605
3606
switch (type) {
3607
case VM_CAP_HALT_EXIT:
3608
if (cap_halt_exit) {
3609
retval = 0;
3610
pptr = &vcpu->cap.proc_ctls;
3611
baseval = *pptr;
3612
flag = PROCBASED_HLT_EXITING;
3613
reg = VMCS_PRI_PROC_BASED_CTLS;
3614
}
3615
break;
3616
case VM_CAP_MTRAP_EXIT:
3617
if (cap_monitor_trap) {
3618
retval = 0;
3619
pptr = &vcpu->cap.proc_ctls;
3620
baseval = *pptr;
3621
flag = PROCBASED_MTF;
3622
reg = VMCS_PRI_PROC_BASED_CTLS;
3623
}
3624
break;
3625
case VM_CAP_PAUSE_EXIT:
3626
if (cap_pause_exit) {
3627
retval = 0;
3628
pptr = &vcpu->cap.proc_ctls;
3629
baseval = *pptr;
3630
flag = PROCBASED_PAUSE_EXITING;
3631
reg = VMCS_PRI_PROC_BASED_CTLS;
3632
}
3633
break;
3634
case VM_CAP_RDPID:
3635
case VM_CAP_RDTSCP:
3636
if (cap_rdpid || cap_rdtscp)
3637
/*
3638
* Choose not to support enabling/disabling
3639
* RDPID/RDTSCP via libvmmapi since, as per the
3640
* discussion in vmx_modinit(), RDPID/RDTSCP are
3641
* either always enabled or always disabled.
3642
*/
3643
error = EOPNOTSUPP;
3644
break;
3645
case VM_CAP_UNRESTRICTED_GUEST:
3646
if (cap_unrestricted_guest) {
3647
retval = 0;
3648
pptr = &vcpu->cap.proc_ctls2;
3649
baseval = *pptr;
3650
flag = PROCBASED2_UNRESTRICTED_GUEST;
3651
reg = VMCS_SEC_PROC_BASED_CTLS;
3652
}
3653
break;
3654
case VM_CAP_ENABLE_INVPCID:
3655
if (cap_invpcid) {
3656
retval = 0;
3657
pptr = &vcpu->cap.proc_ctls2;
3658
baseval = *pptr;
3659
flag = PROCBASED2_ENABLE_INVPCID;
3660
reg = VMCS_SEC_PROC_BASED_CTLS;
3661
}
3662
break;
3663
case VM_CAP_BPT_EXIT:
3664
retval = 0;
3665
3666
/* Don't change the bitmap if we are tracing all exceptions. */
3667
if (vcpu->cap.exc_bitmap != 0xffffffff) {
3668
pptr = &vcpu->cap.exc_bitmap;
3669
baseval = *pptr;
3670
flag = (1 << IDT_BP);
3671
reg = VMCS_EXCEPTION_BITMAP;
3672
}
3673
break;
3674
case VM_CAP_IPI_EXIT:
3675
retval = 0;
3676
3677
vlapic = vm_lapic(vcpu->vcpu);
3678
vlapic->ipi_exit = val;
3679
break;
3680
case VM_CAP_MASK_HWINTR:
3681
retval = 0;
3682
break;
3683
default:
3684
break;
3685
}
3686
3687
if (retval)
3688
return (retval);
3689
3690
if (pptr != NULL) {
3691
if (val) {
3692
baseval |= flag;
3693
} else {
3694
baseval &= ~flag;
3695
}
3696
VMPTRLD(vmcs);
3697
error = vmwrite(reg, baseval);
3698
VMCLEAR(vmcs);
3699
3700
if (error)
3701
return (error);
3702
3703
/*
3704
* Update optional stored flags, and record
3705
* setting
3706
*/
3707
*pptr = baseval;
3708
}
3709
3710
if (val) {
3711
vcpu->cap.set |= (1 << type);
3712
} else {
3713
vcpu->cap.set &= ~(1 << type);
3714
}
3715
3716
return (0);
3717
}
3718
3719
static struct vmspace *
3720
vmx_vmspace_alloc(vm_offset_t min, vm_offset_t max)
3721
{
3722
return (ept_vmspace_alloc(min, max));
3723
}
3724
3725
static void
3726
vmx_vmspace_free(struct vmspace *vmspace)
3727
{
3728
ept_vmspace_free(vmspace);
3729
}
3730
3731
struct vlapic_vtx {
3732
struct vlapic vlapic;
3733
struct pir_desc *pir_desc;
3734
struct vmx_vcpu *vcpu;
3735
u_int pending_prio;
3736
};
3737
3738
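/* One bit per 16-vector priority class (the upper nibble of the vector/priority). */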
#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4))
3739
3740
#define VMX_CTR_PIR(vlapic, pir_desc, notify, vector, level, msg) \
3741
do { \
3742
VLAPIC_CTR2(vlapic, msg " assert %s-triggered vector %d", \
3743
level ? "level" : "edge", vector); \
3744
VLAPIC_CTR1(vlapic, msg " pir0 0x%016lx", pir_desc->pir[0]); \
3745
VLAPIC_CTR1(vlapic, msg " pir1 0x%016lx", pir_desc->pir[1]); \
3746
VLAPIC_CTR1(vlapic, msg " pir2 0x%016lx", pir_desc->pir[2]); \
3747
VLAPIC_CTR1(vlapic, msg " pir3 0x%016lx", pir_desc->pir[3]); \
3748
VLAPIC_CTR1(vlapic, msg " notify: %s", notify ? "yes" : "no"); \
3749
} while (0)
3750
3751
/*
 * vlapic->ops handlers that utilize the APICv hardware assist described in
 * Chapter 29 of the Intel SDM.
 */
static int
vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
	struct vlapic_vtx *vlapic_vtx;
	struct pir_desc *pir_desc;
	uint64_t mask;
	int idx, notify = 0;

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	pir_desc = vlapic_vtx->pir_desc;

	/*
	 * Keep track of interrupt requests in the PIR descriptor. This is
	 * because the virtual APIC page pointed to by the VMCS cannot be
	 * modified if the vcpu is running.
	 */
	idx = vector / 64;
	mask = 1UL << (vector % 64);
	atomic_set_long(&pir_desc->pir[idx], mask);

	/*
	 * A notification is required whenever the 'pending' bit makes a
	 * transition from 0->1.
	 *
	 * Even if the 'pending' bit is already asserted, notification about
	 * the incoming interrupt may still be necessary. For example, if a
	 * vCPU is HLTed with a high PPR, a low priority interrupt would cause
	 * the 0->1 'pending' transition with a notification, but the vCPU
	 * would ignore the interrupt for the time being. The same vCPU would
	 * need to then be notified if a high-priority interrupt arrived which
	 * satisfied the PPR.
	 *
	 * The priorities of interrupts injected while 'pending' is asserted
	 * are tracked in a custom bitfield 'pending_prio'. Should the
	 * to-be-injected interrupt exceed the priorities already present, the
	 * notification is sent. The priorities recorded in 'pending_prio' are
	 * cleared whenever the 'pending' bit makes another 0->1 transition.
	 */
	if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) {
		notify = 1;
		vlapic_vtx->pending_prio = 0;
	} else {
		const u_int old_prio = vlapic_vtx->pending_prio;
		const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT);

		if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) {
			atomic_set_int(&vlapic_vtx->pending_prio, prio_bit);
			notify = 1;
		}
	}

	VMX_CTR_PIR(vlapic, pir_desc, notify, vector, level,
	    "vmx_set_intr_ready");
	return (notify);
}

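/*
 * Called from the 'HLT' exit handler to determine whether the vCPU has a
 * virtual interrupt that can actually be delivered to it.
 */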
static int
vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
{
	struct vlapic_vtx *vlapic_vtx;
	struct pir_desc *pir_desc;
	struct LAPIC *lapic;
	uint64_t pending, pirval;
	uint8_t ppr, vpr, rvi;
	struct vm_exit *vmexit;
	int i;

	/*
	 * This function is only expected to be called from the 'HLT' exit
	 * handler which does not care about the vector that is pending.
	 */
	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	pir_desc = vlapic_vtx->pir_desc;
	lapic = vlapic->apic_page;

	/*
	 * While a virtual interrupt may have already been processed, its
	 * actual delivery may still be pending on the interruptibility of
	 * the guest. Recognize a pending interrupt by reevaluating virtual
	 * interrupts following Section 30.2.1 in the Intel SDM Volume 3.
	 */
	vmexit = vm_exitinfo(vlapic->vcpu);
	KASSERT(vmexit->exitcode == VM_EXITCODE_HLT,
	    ("vmx_pending_intr: exitcode not 'HLT'"));
	rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT;
	ppr = lapic->ppr & APIC_TPR_INT;
	if (rvi > ppr)
		return (1);

	pending = atomic_load_acq_long(&pir_desc->pending);
	if (!pending)
		return (0);

	/*
	 * If there is an interrupt pending then it will be recognized only
	 * if its priority is greater than the processor priority.
	 *
	 * Special case: if the processor priority is zero then any pending
	 * interrupt will be recognized.
	 */
	if (ppr == 0)
		return (1);

	VLAPIC_CTR1(vlapic, "HLT with non-zero PPR %d", lapic->ppr);

	vpr = 0;
	for (i = 3; i >= 0; i--) {
		pirval = pir_desc->pir[i];
		if (pirval != 0) {
			vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT;
			break;
		}
	}

	/*
	 * If the highest-priority pending interrupt falls short of the
	 * processor priority of this vCPU, ensure that 'pending_prio' does not
	 * have any stale bits which would preclude a higher-priority interrupt
	 * from incurring a notification later.
	 */
	if (vpr <= ppr) {
		const u_int prio_bit = VPR_PRIO_BIT(vpr);
		const u_int old = vlapic_vtx->pending_prio;

		if (old > prio_bit && (old & prio_bit) == 0) {
			vlapic_vtx->pending_prio = prio_bit;
		}
		return (0);
	}
	return (1);
}

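/*
 * With virtual interrupt delivery enabled the processor performs interrupt
 * acceptance (ISR/PPR updates) itself, so this callback should never run.
 */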
static void
vmx_intr_accepted(struct vlapic *vlapic, int vector)
{

	panic("vmx_intr_accepted: not expected to be called");
}

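/*
 * Update the EOI-exit bitmap in the VMCS to match the trigger mode of the
 * given vector. This may only be done while the vCPU is not running.
 */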
static void
vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
{
	struct vlapic_vtx *vlapic_vtx;
	struct vmcs *vmcs;
	uint64_t mask, val;

	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
	KASSERT(!vcpu_is_running(vlapic->vcpu, NULL),
	    ("vmx_set_tmr: vcpu cannot be running"));

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	vmcs = vlapic_vtx->vcpu->vmcs;
	mask = 1UL << (vector % 64);

	VMPTRLD(vmcs);
	val = vmcs_read(VMCS_EOI_EXIT(vector));
	if (level)
		val |= mask;
	else
		val &= ~mask;
	vmcs_write(VMCS_EOI_EXIT(vector), val);
	VMCLEAR(vmcs);
}

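/*
 * TPR-shadowing variant of the x2APIC switch: stop using the TPR shadow and
 * fall back to exiting on CR8 loads and stores.
 */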
static void
vmx_enable_x2apic_mode_ts(struct vlapic *vlapic)
{
	struct vlapic_vtx *vlapic_vtx;
	struct vmx_vcpu *vcpu;
	struct vmcs *vmcs;
	uint32_t proc_ctls;

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	vcpu = vlapic_vtx->vcpu;
	vmcs = vcpu->vmcs;

	proc_ctls = vcpu->cap.proc_ctls;
	proc_ctls &= ~PROCBASED_USE_TPR_SHADOW;
	proc_ctls |= PROCBASED_CR8_LOAD_EXITING;
	proc_ctls |= PROCBASED_CR8_STORE_EXITING;
	vcpu->cap.proc_ctls = proc_ctls;

	VMPTRLD(vmcs);
	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls);
	VMCLEAR(vmcs);
}

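/*
 * Virtual-interrupt-delivery variant of the x2APIC switch: replace
 * virtualization of APIC-access page accesses with x2APIC MSR
 * virtualization.
 */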
static void
vmx_enable_x2apic_mode_vid(struct vlapic *vlapic)
{
	struct vlapic_vtx *vlapic_vtx;
	struct vmx *vmx;
	struct vmx_vcpu *vcpu;
	struct vmcs *vmcs;
	uint32_t proc_ctls2;
	int error __diagused;

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	vcpu = vlapic_vtx->vcpu;
	vmx = vcpu->vmx;
	vmcs = vcpu->vmcs;

	proc_ctls2 = vcpu->cap.proc_ctls2;
	KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
	    ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));

	proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
	proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
	vcpu->cap.proc_ctls2 = proc_ctls2;

	VMPTRLD(vmcs);
	vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
	VMCLEAR(vmcs);

	if (vlapic->vcpuid == 0) {
		/*
		 * The nested page table mappings are shared by all vcpus
		 * so unmap the APIC access page just once.
		 */
		error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
		KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
		    __func__, error));

		/*
		 * The MSR bitmap is shared by all vcpus so modify it only
		 * once in the context of vcpu 0.
		 */
		error = vmx_allow_x2apic_msrs(vmx);
		KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
		    __func__, error));
	}
}

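/*
 * Send the posted-interrupt notification vector as an IPI to the host cpu
 * on which the target vCPU is running.
 */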
static void
vmx_post_intr(struct vlapic *vlapic, int hostcpu)
{

	ipi_cpu(hostcpu, pirvec);
}

/*
 * Transfer the pending interrupts in the PIR descriptor to the IRR
 * in the virtual APIC page.
 */
static void
vmx_inject_pir(struct vlapic *vlapic)
{
	struct vlapic_vtx *vlapic_vtx;
	struct pir_desc *pir_desc;
	struct LAPIC *lapic;
	uint64_t val, pirval;
	int rvi, pirbase = -1;
	uint16_t intr_status_old, intr_status_new;

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	pir_desc = vlapic_vtx->pir_desc;
	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
		VLAPIC_CTR0(vlapic, "vmx_inject_pir: "
		    "no posted interrupt pending");
		return;
	}

	pirval = 0;
	pirbase = -1;
	lapic = vlapic->apic_page;

	val = atomic_readandclear_long(&pir_desc->pir[0]);
	if (val != 0) {
		lapic->irr0 |= val;
		lapic->irr1 |= val >> 32;
		pirbase = 0;
		pirval = val;
	}

	val = atomic_readandclear_long(&pir_desc->pir[1]);
	if (val != 0) {
		lapic->irr2 |= val;
		lapic->irr3 |= val >> 32;
		pirbase = 64;
		pirval = val;
	}

	val = atomic_readandclear_long(&pir_desc->pir[2]);
	if (val != 0) {
		lapic->irr4 |= val;
		lapic->irr5 |= val >> 32;
		pirbase = 128;
		pirval = val;
	}

	val = atomic_readandclear_long(&pir_desc->pir[3]);
	if (val != 0) {
		lapic->irr6 |= val;
		lapic->irr7 |= val >> 32;
		pirbase = 192;
		pirval = val;
	}

	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");

	/*
	 * Update RVI so the processor can evaluate pending virtual
	 * interrupts on VM-entry.
	 *
	 * It is possible for pirval to be 0 here, even though the
	 * pending bit has been set. The scenario is:
	 * CPU-Y is sending a posted interrupt to CPU-X, which
	 * is running a guest and processing posted interrupts in h/w.
	 * CPU-X will eventually exit and the state seen in s/w is
	 * the pending bit set, but no PIR bits set.
	 *
	 *      CPU-X                      CPU-Y
	 *   (vm running)                (host running)
	 *
	 *   rx posted interrupt
	 *   CLEAR pending bit
	 *                                SET PIR bit
	 *   READ/CLEAR PIR bits
	 *                                SET pending bit
	 *   (vm exit)
	 *   pending bit set, PIR 0
	 */
	if (pirval != 0) {
		rvi = pirbase + flsl(pirval) - 1;
		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
		intr_status_new = (intr_status_old & 0xFF00) | rvi;
		if (intr_status_new > intr_status_old) {
			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
			VLAPIC_CTR2(vlapic, "vmx_inject_pir: "
			    "guest_intr_status changed from 0x%04x to 0x%04x",
			    intr_status_old, intr_status_new);
		}
	}
}

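/*
 * Allocate and initialize the software vlapic for a vCPU, wiring up the
 * APICv and posted-interrupt handlers that the host supports.
 */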
static struct vlapic *
vmx_vlapic_init(void *vcpui)
{
	struct vmx *vmx;
	struct vmx_vcpu *vcpu;
	struct vlapic *vlapic;
	struct vlapic_vtx *vlapic_vtx;

	vcpu = vcpui;
	vmx = vcpu->vmx;

	vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
	vlapic->vm = vmx->vm;
	vlapic->vcpu = vcpu->vcpu;
	vlapic->vcpuid = vcpu->vcpuid;
	vlapic->apic_page = (struct LAPIC *)vcpu->apic_page;

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	vlapic_vtx->pir_desc = vcpu->pir_desc;
	vlapic_vtx->vcpu = vcpu;

	if (tpr_shadowing) {
		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts;
	}

	if (virtual_interrupt_delivery) {
		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
		vlapic->ops.pending_intr = vmx_pending_intr;
		vlapic->ops.intr_accepted = vmx_intr_accepted;
		vlapic->ops.set_tmr = vmx_set_tmr;
		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid;
	}

	if (posted_interrupts)
		vlapic->ops.post_intr = vmx_post_intr;

	vlapic_init(vlapic);

	return (vlapic);
}

static void
vmx_vlapic_cleanup(struct vlapic *vlapic)
{

	vlapic_cleanup(vlapic);
	free(vlapic, M_VLAPIC);
}

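/*
 * Snapshot support: save or restore the per-vCPU guest state kept in the
 * VMCS, along with the guest MSRs, MTRRs, the posted interrupt descriptor
 * and the vmxctx register file.
 */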
#ifdef BHYVE_SNAPSHOT
static int
vmx_vcpu_snapshot(void *vcpui, struct vm_snapshot_meta *meta)
{
	struct vmcs *vmcs;
	struct vmx *vmx;
	struct vmx_vcpu *vcpu;
	struct vmxctx *vmxctx;
	int err, run, hostcpu;

	err = 0;
	vcpu = vcpui;
	vmx = vcpu->vmx;
	vmcs = vcpu->vmcs;

	run = vcpu_is_running(vcpu->vcpu, &hostcpu);
	if (run && hostcpu != curcpu) {
		printf("%s: %s%d is running", __func__, vm_name(vmx->vm),
		    vcpu->vcpuid);
		return (EINVAL);
	}

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR0, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR3, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR4, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DR7, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RSP, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RIP, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RFLAGS, meta);

	/* Guest segments */
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_ES, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_ES, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_CS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_SS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_SS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_DS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_FS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_FS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_GS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_TR, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_TR, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_LDTR, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_LDTR, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta);

	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta);

	/* Guest page tables */
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE0, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE1, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE2, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE3, meta);

	/* Other guest state */
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_CS, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_ESP, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_EIP, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_INTERRUPTIBILITY, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_ACTIVITY, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_ENTRY_CTLS, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_EXIT_CTLS, meta);
	if (err != 0)
		goto done;

	SNAPSHOT_BUF_OR_LEAVE(vcpu->guest_msrs,
	    sizeof(vcpu->guest_msrs), meta, err, done);

	SNAPSHOT_BUF_OR_LEAVE(vcpu->pir_desc,
	    sizeof(*vcpu->pir_desc), meta, err, done);

	SNAPSHOT_BUF_OR_LEAVE(&vcpu->mtrr,
	    sizeof(vcpu->mtrr), meta, err, done);

	vmxctx = &vcpu->ctx;
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, err, done);
	SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, err, done);

done:
	return (err);
}

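/*
 * Re-establish the guest TSC offset after restore. The VMCS is loaded and
 * cleared around the update unless the vCPU is already running on this cpu.
 */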
static int
vmx_restore_tsc(void *vcpui, uint64_t offset)
{
	struct vmx_vcpu *vcpu = vcpui;
	struct vmcs *vmcs;
	struct vmx *vmx;
	int error, running, hostcpu;

	vmx = vcpu->vmx;
	vmcs = vcpu->vmcs;

	running = vcpu_is_running(vcpu->vcpu, &hostcpu);
	if (running && hostcpu != curcpu) {
		printf("%s: %s%d is running", __func__, vm_name(vmx->vm),
		    vcpu->vcpuid);
		return (EINVAL);
	}

	if (!running)
		VMPTRLD(vmcs);

	error = vmx_set_tsc_offset(vcpu, offset);

	if (!running)
		VMCLEAR(vmcs);
	return (error);
}
#endif

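/*
 * Method table exported to the generic vmm(4) layer for the Intel VT-x
 * backend.
 */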
const struct vmm_ops vmm_ops_intel = {
	.modinit	= vmx_modinit,
	.modcleanup	= vmx_modcleanup,
	.modsuspend	= vmx_modsuspend,
	.modresume	= vmx_modresume,
	.init		= vmx_init,
	.run		= vmx_run,
	.cleanup	= vmx_cleanup,
	.vcpu_init	= vmx_vcpu_init,
	.vcpu_cleanup	= vmx_vcpu_cleanup,
	.getreg		= vmx_getreg,
	.setreg		= vmx_setreg,
	.getdesc	= vmx_getdesc,
	.setdesc	= vmx_setdesc,
	.getcap		= vmx_getcap,
	.setcap		= vmx_setcap,
	.vmspace_alloc	= vmx_vmspace_alloc,
	.vmspace_free	= vmx_vmspace_free,
	.vlapic_init	= vmx_vlapic_init,
	.vlapic_cleanup	= vmx_vlapic_cleanup,
#ifdef BHYVE_SNAPSHOT
	.vcpu_snapshot	= vmx_vcpu_snapshot,
	.restore_tsc	= vmx_restore_tsc,
#endif
};