Path: sys/amd64/vmm/vmm_instruction_emul.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>

#include <dev/vmm/vmm_mem.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <err.h>
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <vmmapi.h>
#define	__diagused
#define	KASSERT(exp,msg)	assert((exp))
#define	panic(...)		errx(4, __VA_ARGS__)
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)	/* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)	/* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)	/* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION	(1 << 4)

static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};

static const struct vie_op two_byte_opcodes[256] = {
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x6E] = {
		.op_byte = 0x6E,
		.op_type = VIE_OP_TYPE_OUTS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
	},
	[0x6F] = {
		.op_byte = 0x6F,
		.op_type = VIE_OP_TYPE_OUTS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF6] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF6,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};
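/*
 * A minimal sketch of how a decoder consumes the tables above (assuming,
 * as the checks in decode_opcode() below imply, that VIE_OP_TYPE_NONE is
 * the zero value of the enum so unlisted opcodes decode to an all-zero
 * entry):
 *
 *	uint8_t op = 0x89;			// mov r/m32, r32
 *	struct vie_op entry = one_byte_opcodes[op];
 *	if (entry.op_type == VIE_OP_TYPE_NONE)
 *		return (-1);			// unsupported opcode
 *	if (entry.op_flags & VIE_OP_F_IMM8)
 *		;				// fetch an 8-bit immediate
 */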
/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB	4
#define	VIE_RM_DISP32	5

#define	GB	(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vcpu, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vcpu, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vcpu, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vcpu, reg, val);
	}
	return (error);
}

int
vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vcpu, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vcpu, reg, val);
	return (error);
}
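/*
 * A minimal worked example of the width rules encoded above (hypothetical
 * values, for illustration only): with %rax = 0x1122334455667788,
 *
 *	vie_update_register(vcpu, VM_REG_GUEST_RAX, 0xAABB, 2)
 *		-> %rax = 0x112233445566AABB  (upper 48 bits preserved)
 *	vie_update_register(vcpu, VM_REG_GUEST_RAX, 0xAABB, 4)
 *		-> %rax = 0x000000000000AABB  (32-bit writes zero-extend)
 *
 * This matches the x86-64 rule that 8/16-bit writes merge into the
 * destination while 32-bit writes clear the upper half of the register.
 */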
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

/*
 * Macro creation of functions getaddflags{8,16,32,64}
 */
#define	GETADDFLAGS(sz)							\
static u_long								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);

static u_long
getaddflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getaddflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getaddflags8(x, y));
	else if (opsize == 2)
		return (getaddflags16(x, y));
	else if (opsize == 4)
		return (getaddflags32(x, y));
	else
		return (getaddflags64(x, y));
}

/*
 * Return the status flags that would result from doing (x & y).
 */
#define	GETANDFLAGS(sz)							\
static u_long								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);

static u_long
getandflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getandflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getandflags8(x, y));
	else if (opsize == 2)
		return (getandflags16(x, y));
	else if (opsize == 4)
		return (getandflags32(x, y));
	else
		return (getandflags64(x, y));
}
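/*
 * How the GETCC-style helpers work, as a sketch: the inline assembly
 * performs the operation on the host CPU and captures the host %rflags
 * with pushfq/popq, so the guest's status flags are computed by the
 * hardware itself rather than reimplemented bit by bit.  For example
 * (values purely illustrative):
 *
 *	getcc(1, 0x00, 0x01)	// 0x00 - 0x01 borrows
 *		-> returned flags have PSL_C and PSL_N set, PSL_Z clear
 *	getcc(4, 5, 5)		// equal operands
 *		-> PSL_Z set, PSL_C clear
 *
 * Only the bits in RFLAGS_STATUS_BITS are meaningful to callers; the rest
 * of the returned word is host state and is masked off before use.
 */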
static int
emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vcpu, vie, &byte);
		if (error == 0)
			error = memwrite(vcpu, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vcpu, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vcpu, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vcpu, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vcpu, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vcpu, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vcpu, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vcpu, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vcpu, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vcpu, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vcpu, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vcpu, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}
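/*
 * The casts above do all of the extension work; a small sketch with
 * illustrative values:
 *
 *	val = 0x80;
 *	val = (uint8_t)val;	// movzx: val == 0x0000000000000080
 *	val = 0x80;
 *	val = (int8_t)val;	// movsx: val == 0xffffffffffffff80
 *
 * vie_update_register() then truncates or merges that 64-bit value
 * according to the destination operand size.
 */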
/*
 * Helper function to calculate and validate a linear address.
 */
static int
get_gla(struct vcpu *vcpu, struct vie *vie __unused,
    struct vm_guest_paging *paging, int opsize, int addrsize, int prot,
    enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error __diagused;

	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vcpu, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vie_read_register(vcpu, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vcpu, 0);
		else
			vm_inject_gp(vcpu);
		goto guest_fault;
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vcpu, 0);
		else
			vm_inject_gp(vcpu);
		goto guest_fault;
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vcpu, 0);
		goto guest_fault;
	}

	*fault = 0;
	return (0);

guest_fault:
	*fault = 1;
	return (0);
}
static int
emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)  memory		memory		n/a
	 * (2)  memory		mmio		emulated
	 * (3)  mmio		memory		emulated
	 * (4)  mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
	if (error || fault)
		goto done;

	error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error == 0) {
		if (fault)
			goto done;	/* Resume guest to handle fault */

		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(copyinfo, &val, opsize);
		vm_copy_teardown(copyinfo, nitems(copyinfo));
		error = memwrite(vcpu, gpa, val, opsize, arg);
		if (error)
			goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
		    &fault);
		if (error || fault)
			goto done;

		error = vm_copy_setup(vcpu, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
		if (error == 0) {
			if (fault)
				goto done;    /* Resume guest to handle fault */

			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * A MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = memread(vcpu, gpa, &val, opsize, arg);
			if (error)
				goto done;

			vm_copyout(&val, copyinfo, opsize);
			vm_copy_teardown(copyinfo, nitems(copyinfo));
		} else {
			/*
			 * Case (4): read from and write to mmio.
			 *
			 * Commit to the MMIO read/write (with potential
			 * side-effects) only after we are sure that the
			 * instruction is not going to be restarted due
			 * to address translation faults.
			 */
			error = vm_gla2gpa(vcpu, paging, srcaddr,
			    PROT_READ, &srcgpa, &fault);
			if (error || fault)
				goto done;

			error = vm_gla2gpa(vcpu, paging, dstaddr,
			    PROT_WRITE, &dstgpa, &fault);
			if (error || fault)
				goto done;

			error = memread(vcpu, srcgpa, &val, opsize, arg);
			if (error)
				goto done;

			error = memwrite(vcpu, dstgpa, val, opsize, arg);
			if (error)
				goto done;
		}
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vcpu);
	}
done:
	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
	    __func__, error));
	return (error);
}
static int
emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused,
    mem_region_write_t memwrite, void *arg)
{
	int error, opsize, repeat;
	uint64_t val;
	uint64_t rcx, rdi, rflags;

	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
			return (0);
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
	KASSERT(!error, ("%s: error %d getting rax", __func__, error));

	error = memwrite(vcpu, gpa, val, opsize, arg);
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D)
		rdi -= opsize;
	else
		rdi += opsize;

	error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vcpu);
	}

	return (0);
}
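/*
 * Note how the repeated string instructions above emulate one iteration
 * per VM exit and then use vm_restart_instruction() to re-execute the
 * same guest %rip until the count register drains.  A sketch of the
 * effect for a hypothetical guest running "rep stosd" with a 32-bit
 * address size and %ecx = 3 against an MMIO region:
 *
 *	exit 1: write 4 bytes, %rdi += 4, %ecx = 2, restart
 *	exit 2: write 4 bytes, %rdi += 4, %ecx = 1, restart
 *	exit 3: write 4 bytes, %rdi += 4, %ecx = 0, done
 *
 * Masking %rcx with vie_size2mask(addrsize) is what selects %cx/%ecx/%rcx
 * as the architecturally correct count register.
 */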
static int
emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vcpu, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * 83 /4		and r/m16, imm8 sign-extended to 16
		 * 83 /4		and r/m32, imm8 sign-extended to 32
		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 & vie->immediate;
		error = memwrite(vcpu, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x0B:
		/*
		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 0b/r		or r16, r/m16
		 * 0b/r		or r32, r/m32
		 * REX.W + 0b/r	or r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 | val2;
		error = vie_update_register(vcpu, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /1		or r/m16, imm16
		 * 81 /1		or r/m32, imm32
		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
		 *
		 * 83 /1		or r/m16, imm8 sign-extended to 16
		 * 83 /1		or r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = memwrite(vcpu, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
static int
emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t regop, memop, op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x39:
	case 0x3B:
		/*
		 * 39/r		CMP r/m16, r16
		 * 39/r		CMP r/m32, r32
		 * REX.W 39/r	CMP r/m64, r64
		 *
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare the first operand with the second operand and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = memread(vcpu, gpa, &memop, size, arg);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results.  The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 *
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = memread(vcpu, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
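/*
 * The imm8 forms above lean on the decoder having already sign-extended
 * vie->immediate.  A worked example (illustrative encoding): for "83 /7"
 * with an immediate byte of 0xFF and a 32-bit operand size,
 * vie->immediate holds 0xFFFFFFFFFFFFFFFF (-1), so
 *
 *	getcc(4, op1, vie->immediate)
 *
 * compares op1 against 0xFFFFFFFF, exactly as the hardware would for
 * "cmp dword ptr [mem], -1".
 */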
static int
emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF6:
		/*
		 * F6 /0		test r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		/* FALLTHROUGH */
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = memread(vcpu, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
static int
emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite __unused, void *arg)
{
	uint64_t src1, src2, dst, rflags;
	unsigned start, len, size;
	int error;

	size = vie->opsize;
	error = EINVAL;

	/*
	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
	 *
	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m
	 * and VEX.vvvv.
	 *
	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
	 */
	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
		size = 4;

	/*
	 * Extracts contiguous bits from the first /source/ operand (second
	 * operand) using an index and length specified in the second /source/
	 * operand (third operand).
	 */
	error = memread(vcpu, gpa, &src1, size, arg);
	if (error)
		return (error);
	error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2);
	if (error)
		return (error);
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	start = (src2 & 0xff);
	len = (src2 & 0xff00) >> 8;

	/* If no bits are extracted, the destination register is cleared. */
	dst = 0;

	/* If START exceeds the operand size, no bits are extracted. */
	if (start > size * 8)
		goto done;
	/* Length is bounded by both the destination size and start offset. */
	if (start + len > size * 8)
		len = (size * 8) - start;
	if (len == 0)
		goto done;

	if (start > 0)
		src1 = (src1 >> start);
	if (len < 64)
		src1 = src1 & ((1ull << len) - 1);
	dst = src1;

done:
	error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size);
	if (error)
		return (error);

	/*
	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	if (dst == 0)
		rflags |= PSL_Z;
	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags,
	    8);
	return (error);
}
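/*
 * A worked BEXTR example with illustrative values: src2 = 0x0804 encodes
 * start = 4 and len = 8, so for src1 = 0xABCD the extraction is
 *
 *	src1 >> 4		-> 0x0ABC
 *	& ((1ull << 8) - 1)	-> 0xBC
 *
 * i.e. bits [11:4] of the source land in the low byte of the destination
 * and the rest of the destination register is cleared.
 */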
static int
emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x03:
		/*
		 * ADD r/m to r and store the result in r
		 *
		 * 03/r			ADD r16, r/m16
		 * 03/r			ADD r32, r/m32
		 * REX.W + 03/r		ADD r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 + val2;
		error = vie_update_register(vcpu, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getaddflags(size, val1, val2);
		error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r			SUB r16, r/m16
		 * 2B/r			SUB r32, r/m32
		 * REX.W + 2B/r		SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vcpu, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, fault, size, stackaddrsize, pushop;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vcpu, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vcpu, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vcpu, 0);
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
	    &fault);
	if (error || fault)
		return (error);

	if (pushop) {
		error = memread(vcpu, mmio_gpa, &val, size, arg);
		if (error == 0)
			vm_copyout(&val, copyinfo, size);
	} else {
		vm_copyin(copyinfo, &val, size);
		error = memwrite(vcpu, mmio_gpa, val, size, arg);
		rsp += size;
	}
	vm_copy_teardown(copyinfo, nitems(copyinfo));

	if (error == 0) {
		error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}
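/*
 * The ordering above mirrors the hardware: for PUSH the stack pointer is
 * decremented *before* the store, for POP it is incremented only after a
 * successful read.  A sketch for a hypothetical 64-bit guest executing
 * "push qword ptr [mmio]" (FF /6) while %rsp = 0x8000:
 *
 *	rsp -= 8		-> stack_gla computed for ss:0x7FF8
 *	memread(mmio_gpa)	-> MMIO read of the source operand
 *	vm_copyout()		-> store the value at ss:0x7FF8
 *	%rsp becomes 0x7FF8 (stackaddrsize = 8)
 *
 * With the 66H prefix the pushed quantity and the %rsp adjustment shrink
 * to 2 bytes, per the operand-size rule noted above.
 */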
static int
emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging __unused, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	switch (vie->reg & 7) {
	case 0x1:	/* OR */
		error = emulate_or(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x4:	/* AND */
		error = emulate_and(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x7:	/* CMP */
		error = emulate_cmp(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static int
emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused,
    void *memarg)
{
	uint64_t val, rflags;
	int error, bitmask, bitoff;

	/*
	 * 0F BA is a Group 8 extended opcode.
	 *
	 * Currently we only emulate the 'Bit Test' instruction which is
	 * identified by a ModR/M:reg encoding of 100b.
	 */
	if ((vie->reg & 7) != 4)
		return (EINVAL);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = memread(vcpu, gpa, &val, vie->opsize, memarg);
	if (error)
		return (error);

	/*
	 * Intel SDM, Vol 2, Table 3-2:
	 * "Range of Bit Positions Specified by Bit Offset Operands"
	 */
	bitmask = vie->opsize * 8 - 1;
	bitoff = vie->immediate & bitmask;

	/* Copy the bit into the Carry flag in %rflags */
	if (val & (1UL << bitoff))
		rflags |= PSL_C;
	else
		rflags &= ~PSL_C;

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));

	return (0);
}
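/*
 * Bit-offset wrap-around, worked through with illustrative numbers: for
 * "bt dword ptr [mem], 35" the operand size is 4, so
 *
 *	bitmask = 4 * 8 - 1 = 31
 *	bitoff  = 35 & 31   = 3
 *
 * and the emulation copies bit 3 of the 32-bit memory operand into CF,
 * matching the SDM rule that an imm8 bit offset is taken modulo the
 * operand size for BT.
 */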
static int
emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused,
    void *memarg)
{
	int error;
	uint64_t buf;

	switch (vie->reg & 7) {
	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
		if (vie->mod == 0x3) {
			/*
			 * SFENCE.  Ignore it, VM exit provides enough
			 * barriers on its own.
			 */
			error = 0;
		} else {
			/*
			 * CLFLUSH, CLFLUSHOPT.  Only check for access
			 * rights.
			 */
			error = memread(vcpu, gpa, &buf, 1, memarg);
		}
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_GROUP1:
		error = emulate_group1(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_POP:
		error = emulate_pop(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVS:
		error = emulate_movs(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_STOS:
		error = emulate_stos(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_SUB:
		error = emulate_sub(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_BITTEST:
		error = emulate_bittest(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_TWOB_GRP15:
		error = emulate_twob_group15(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_ADD:
		error = emulate_add(vcpu, gpa, vie, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_TEST:
		error = emulate_test(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_BEXTR:
		error = emulate_bextr(vcpu, gpa, vie, paging,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}
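/*
 * Alignment checks fire only when all three enabling conditions hold:
 * CPL == 3, CR0.AM set and RFLAGS.AC set.  A quick sketch with made-up
 * values:
 *
 *	vie_alignment_check(3, 4, CR0_AM, PSL_AC, 0x1002) -> 1  (#AC)
 *	vie_alignment_check(0, 4, CR0_AM, PSL_AC, 0x1002) -> 0  (kernel mode)
 *	vie_alignment_check(3, 4, CR0_AM, PSL_AC, 0x1004) -> 0  (aligned)
 *
 * The power-of-two operand size makes 'gla & (size - 1)' a cheap
 * misalignment test.
 */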
int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of the bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}
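/*
 * Canonical-form examples for the 48-bit case handled above (illustrative
 * addresses):
 *
 *	0x00007fffffffffff	bit 47 = 0, upper bits all 0	-> canonical
 *	0xffff800000000000	bit 47 = 1, upper bits all 1	-> canonical
 *	0x0000800000000000	bit 47 = 1, upper bits 0	-> faults
 *
 * Note this helper assumes 48-bit virtual addressing; with LA57 paging
 * the sign bit would be bit 56 instead, which this check does not model.
 */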
int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}

/*
 * Prepare a partially decoded vie for a 2nd attempt.
 */
void
vie_restart(struct vie *vie)
{
	_Static_assert(
	    offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
	    offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
	    "restart should not erase instruction length or contents");

	memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
	    sizeof(*vie) - offsetof(struct vie, vie_startzero));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
	vie->segment_register = VM_REG_LAST;
}

void
vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
{
	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
	    ("%s: invalid instruction length (%d)", __func__, inst_length));

	vie_restart(vie);
	memset(vie->inst, 0, sizeof(vie->inst));
	if (inst_length != 0)
		memcpy(vie->inst, inst_bytes, inst_length);
	vie->num_valid = inst_length;
}

#ifdef _KERNEL
static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}

static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}
static int
_vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	*guest_fault = 0;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vcpu);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 0,
					    pte32);
					vm_inject_pf(vcpu, pfcode, gla);
				}
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}
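	/*
	 * Shift arithmetic for the two-level walk above, with a made-up
	 * example: PAGE_SHIFT is 12, so nlevels = 1 gives ptpshift = 22
	 * (the page-directory level indexed by gla[31:22]) and nlevels = 0
	 * gives ptpshift = 12 (the page-table level indexed by gla[21:12]).
	 * A 4MB PG_PS mapping short-circuits at the directory level, where
	 * pgsize = 1 << 22, and the low 22 bits of 'gla' become the page
	 * offset.
	 */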
	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vcpu, pfcode, gla);
			}
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else if (paging->paging_mode == PAGING_MODE_64_LA57) {
		nlevels = 5;
	} else {
		nlevels = 4;
	}

	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vcpu, pfcode, gla);
			}
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 1,
					    pte);
					vm_inject_pf(vcpu, pfcode, gla);
				}
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
error:
	retval = EFAULT;
	goto done;
fault:
	*guest_fault = 1;
	goto done;
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{

	return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
	    false));
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{

	return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
	    true));
}

int
vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyin(copyinfo, vie->inst, inst_length);
	vm_copy_teardown(copyinfo, nitems(copyinfo));
	vie->num_valid = inst_length;
	return (0);
}
#endif	/* _KERNEL */

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static bool
segment_override(uint8_t x, int *seg)
{

	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

int
vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyin(copyinfo, vie->inst, inst_length);
	vm_copy_teardown(copyinfo, nitems(copyinfo));
	vie->num_valid = inst_length;
	return (0);
}
#endif	/* _KERNEL */

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static bool
segment_override(uint8_t x, int *seg)
{

	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}
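
	/*
	 * Example (illustrative): for the byte sequence 48 89 08
	 * ("mov %rcx,(%rax)") the prefix loop above stops at 0x48, which
	 * is then consumed here as a REX prefix with W=1, R=X=B=0; rex_w
	 * later selects the 64-bit operand size.
	 */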

	/*
	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
	 */
	if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
	    && x == 0xC4) {
		const struct vie_op *optab;

		/* 3-byte VEX prefix. */
		vie->vex_present = 1;

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/*
		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
		 * relative to REX encoding.
		 */
		vie->rex_r = x & 0x80 ? 0 : 1;
		vie->rex_x = x & 0x40 ? 0 : 1;
		vie->rex_b = x & 0x20 ? 0 : 1;

		switch (x & 0x1F) {
		case 0x2:
			/* 0F 38. */
			optab = three_byte_opcodes_0f38;
			break;
		case 0x1:
			/* 0F class - nothing handled here yet. */
			/* FALLTHROUGH */
		case 0x3:
			/* 0F 3A class - nothing handled here yet. */
			/* FALLTHROUGH */
		default:
			/* Reserved (#UD). */
			return (-1);
		}

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
		vie->rex_w = x & 0x80 ? 1 : 0;

		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
		vie->vex_l = !!(x & 0x4);
		vie->vex_pp = (x & 0x3);

		/* PP: 1=66, 2=F3, 3=F2 prefixes. */
		switch (vie->vex_pp) {
		case 0x1:
			vie->opsize_override = 1;
			break;
		case 0x2:
			vie->repz_present = 1;
			break;
		case 0x3:
			vie->repnz_present = 1;
			break;
		}

		vie_advance(vie);

		/* Opcode, sans the literal escape-byte prefix. */
		if (vie_peek(vie, &x))
			return (-1);

		vie->op = optab[x];
		if (vie->op.op_type == VIE_OP_TYPE_NONE)
			return (-1);

		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}
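
/*
 * Illustrative summary of the size computation above (sizes in bytes,
 * derived from the code rather than quoted from the SDM):
 *
 *	mode			default		0x66		0x67
 *	64-bit			op=4 addr=8	op=2 addr=8	op=4 addr=4
 *	32-bit (cs_d set)	op=4 addr=4	op=2 addr=4	op=4 addr=2
 *	16-bit			op=2 addr=2	op=4 addr=2	op=2 addr=4
 *
 * In 64-bit mode REX.W forces op=8 and takes precedence over 0x66.
 */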

static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	/* Already did this via VEX prefix. */
	if (vie->op.op_type != VIE_OP_TYPE_NONE)
		return (0);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm =  (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault.  There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}
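
/*
 * Worked example (illustrative): "mov %eax,0xc(%rax,%rcx,4)" is
 * encoded as 89 44 88 0c, so decode_modrm() and decode_sib() see:
 *
 *	ModRM 0x44: mod=01 (disp8), reg=000 (%eax), rm=100 (SIB follows)
 *	SIB   0x88: ss=10 (scale=4), index=001 (%rcx), base=000 (%rax)
 *	disp8 0x0c: consumed next by decode_displacement()
 */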

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits.  When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}
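
/*
 * Example (illustrative): "movl $0x12345678,(%rax)" is encoded as
 * c7 00 78 56 34 12; with opsize 4 this gives imm_bytes = 4 and
 * vie->immediate = 0x12345678.  An 8-bit immediate such as the 0xff
 * in "80 08 ff" (orb $0xff,(%rax)) is instead sign-extended, yielding
 * an immediate of -1.
 */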

#ifdef _KERNEL
/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches our instruction decoding.
 */
static int
verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie,
    enum vm_cpu_mode cpu_mode)
{
	int error;
	uint64_t base, segbase, idx, gla2;
	enum vm_reg_name seg;
	struct seg_desc desc;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vcpu, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_processed;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vcpu, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/*
	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
	 *
	 * In 64-bit mode, segmentation is generally (but not
	 * completely) disabled.  The exceptions are the FS and GS
	 * segments.
	 *
	 * In legacy IA-32 mode, when the ESP or EBP register is used
	 * as the base, the SS segment is the default segment.  For
	 * other data references, except when relative to stack or
	 * string destination, the DS segment is the default.  These
	 * can be overridden to allow other segments to be accessed.
	 */
	if (vie->segment_override)
		seg = vie->segment_register;
	else if (vie->base_register == VM_REG_GUEST_RSP ||
	    vie->base_register == VM_REG_GUEST_RBP)
		seg = VM_REG_GUEST_SS;
	else
		seg = VM_REG_GUEST_DS;
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		error = vm_get_seg_desc(vcpu, seg, &desc);
		if (error) {
			printf("verify_gla: error %d getting segment"
			    " descriptor %d\n", error, seg);
			return (-1);
		}
		segbase = desc.base;
	}

	gla2 = segbase + base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: segbase(0x%0lx), "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    segbase, base, vie->scale, idx, vie->displacement,
		    gla, gla2);
		return (-1);
	}

	return (0);
}
#endif	/* _KERNEL */

int
#ifdef _KERNEL
vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
#else
vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
#endif
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

#ifdef _KERNEL
	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
		if (verify_gla(vcpu, gla, vie, cpu_mode))
			return (-1);
	}
#endif

	vie->decoded = 1;	/* success */

	return (0);
}
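
/*
 * End-to-end example (illustrative): decoding "mov %dl,(%rax)",
 * encoded as 88 10, proceeds through the stages above as follows:
 * no prefixes are present; one_byte_opcodes[0x88] selects
 * VIE_OP_TYPE_MOV; ModRM 0x10 yields mod=00, reg=010 (%rdx, so %dl)
 * and rm=000 (%rax); no SIB, displacement, immediate or moffset bytes
 * follow.  num_processed ends up at 2 and vie->decoded is set.
 */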