GitHub Repository: freebsd/freebsd-src
Path: blob/main/usr.sbin/bhyve/amd64/task_switch.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Neel Natu <[email protected]>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");

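/*
 * A segment selector is 16 bits wide: bits 15:3 index the descriptor table,
 * bit 2 is the table indicator (0 = GDT, 1 = LDT) and bits 1:0 hold the RPL.
 * SEL_START() is therefore the byte offset of the selected descriptor within
 * its table and SEL_LIMIT() the offset of that descriptor's last byte, which
 * is what the limit checks below compare against.  For system TSS descriptor
 * types, bit 1 distinguishes busy from available (e.g. SDT_SYS386BSY vs.
 * SDT_SYS386TSS), which is what TSS_BUSY() tests.
 */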
#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)

static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(vcpu, reg, val);
	assert(error == 0);
}

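/*
 * Convert an 8-byte GDT/LDT entry ('struct user_segment_descriptor') into
 * the 'struct seg_desc' form used by vmm(4).  The access word assembled
 * below follows the SEG_DESC_* layout from <machine/vmm.h> (type in the low
 * bits, DPL at bit 5, present at bit 7, D/B at bit 14, granularity at bit
 * 15), so the result can be passed directly to vm_set_desc().
 */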
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(iov, desc, sizeof(*desc));
	else
		vm_copyout(desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

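/*
 * Predicates on the 5-bit descriptor type field (the S bit plus the 4-bit
 * type): bit 4 set means a code or data descriptor, bit 3 set means code,
 * and bit 1 means writable for data segments or readable for code segments,
 * hence the masks tested below.
 */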
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(vcpu, sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

static void
tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(vcpu);
					return (1);
				}
			}
			SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(&errcode, iov, bytes);
	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *);

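/*
 * Emulate a hardware task switch on behalf of the guest.  The flow below
 * follows the "Task Switching" section of the Intel SDM, Vol 3: fetch and
 * validate the new TSS descriptor, save the outgoing register state into
 * the old TSS, update the busy bits and the task register, load the
 * incoming state from the new TSS and, if necessary, push an error code
 * onto the new task's stack.
 */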
int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	struct vm_exit *vmexit;
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	vmexit = vmrun->vm_exit;
	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1, IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}