Path: blob/main/usr.sbin/bhyve/amd64/task_switch.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Neel Natu <[email protected]>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
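
/*
 * Editorial note (not in the original source): a 16-bit segment selector
 * packs a 13-bit descriptor-table index, a table-indicator bit (TI) and a
 * 2-bit requested privilege level (RPL):
 *
 *	15                       3    2     1   0
 *	+------------------------+----+---------+
 *	|          index         | TI |   RPL   |
 *	+------------------------+----+---------+
 *
 * For example, sel = 0x1b has index 3, TI = 0 (GDT) and RPL = 3, so
 * SEL_START(0x1b) == 0x18 is the byte offset of its 8-byte descriptor and
 * SEL_LIMIT(0x1b) == 0x1f is the offset of that descriptor's last byte,
 * which is what gets compared against the descriptor-table limit below.
 */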

static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}
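
/*
 * Editorial note (not in the original source): the two-entry iovec arrays
 * used below exist because a copied region may straddle a page boundary in
 * the guest's linear address space (the GDT/LDT base need not be aligned),
 * in which case vm_copy_setup() maps the region as two host segments.
 */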

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(iov, desc, sizeof(*desc));
	else
		vm_copyout(desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}
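
/*
 * Editorial note (not in the original source): the bit tests above decode
 * the 5-bit descriptor type field (S flag plus 4 type bits):
 *
 *	bit 4 (0x10)	S: 1 = code/data segment, 0 = system segment
 *	bit 3 (0x08)	1 = code, 0 = data
 *	bit 1 (0x02)	data: writable; code: readable
 *
 * So (type & 0x18) == 0x18 matches any code segment, (type & 0x1A) == 0x12
 * matches a writable data segment (required for SS), and data_desc()
 * additionally accepts a readable code segment such as type 0x1A.
 */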

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(vcpu, sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

static void
tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}
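
/*
 * Editorial note (not in the original source): a task switch saves only
 * the dynamic state of the outgoing task (general purpose registers,
 * segment selectors, EFLAGS and EIP).  tss32_save() above therefore
 * updates just those fields in the previously read-in copy of the old
 * TSS before copying the whole structure back to guest memory; static
 * fields such as tss_cr3 and tss_ldt are preserved unchanged.
 */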

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(vcpu);
					return (1);
				}
			}
			SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(tss, iov, sizeof(*tss));
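
	/*
	 * Editorial note (not in the original source): LDTR is validated
	 * before the other segment registers because the CS/SS/DS/ES/FS/GS
	 * selectors loaded above may have their TI bit set, in which case
	 * their descriptors are looked up in the LDT that was just loaded.
	 */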

	/* Validate segment descriptors */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them.  This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

/*
 * Push an error code on the stack of the new task.  This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(&errcode, iov, bytes);
	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)
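
/*
 * Editorial note (not in the original source): the helpers used with
 * CHKERR() distinguish two failure modes.  A non-zero 'error' means the
 * emulation itself failed (e.g. a guest address could not be mapped), so
 * the virtual machine is torn down via VMEXIT_ABORT.  A non-zero 'fault'
 * means an exception was injected into the guest; execution continues
 * with VMEXIT_CONTINUE so the guest can handle it.
 */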

int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *);

int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	struct vm_exit *vmexit;
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	vmexit = vmrun->vm_exit;
	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(nt_iov, &newtss, minlimit + 1);
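
	/*
	 * Editorial note (not in the original source): the new TSS is mapped
	 * with PROT_READ | PROT_WRITE even though it is only read here,
	 * because tss32_restore() reuses 'nt_iov' to write the previous task
	 * link field back into guest memory for nested task switches.
	 */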

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR.  In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
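
	/*
	 * Editorial note (not in the original source): the busy-bit handling
	 * above follows the Intel SDM rules for each task switch reason:
	 *
	 *	IRET:		old TSS: busy cleared; new TSS: stays busy
	 *	JMP:		old TSS: busy cleared; new TSS: busy set
	 *	CALL/INT/exc:	old TSS: stays busy;   new TSS: busy set
	 *
	 * Leaving the old TSS busy for CALL/INT/exception allows the nested
	 * task to IRET back through the link field written by
	 * tss32_restore().
	 */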

	/*
	 * We are now committed to the task switch.  Any exceptions
	 * encountered after this point will be handled in the context of
	 * the new task and the saved instruction pointer will belong to
	 * the new task.
	 */
	error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation":
	 * If the "virtual NMIs" control is 1, IRET removes any virtual-NMI
	 * blocking.  This unblocking of virtual-NMI occurs even if IRET
	 * causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}