#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <machine/fpu.h>
#include <machine/machdep.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/vm.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <dev/pci/pcireg.h>
#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_ktr.h>
#include <dev/vmm/vmm_mem.h>
#include <dev/vmm/vmm_stat.h>
#include "arm64.h"
#include "mmu.h"
#include "io/vgic.h"
#include "io/vtimer.h"
struct vcpu {
int flags;
enum vcpu_state state;
struct mtx mtx;
int hostcpu; /* host cpu this vcpu last ran on, or NOCPU */
int vcpuid;
void *stats;
struct vm_exit exitinfo;
uint64_t nextpc; /* guest PC to resume execution at */
struct vm *vm; /* parent virtual machine */
void *cookie; /* vmmops per-vcpu data */
struct vfpstate *guestfpu; /* guest FPU/SIMD state */
};
#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx))
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
struct vmm_mmio_region {
uint64_t start;
uint64_t end;
mem_region_read_t read;
mem_region_write_t write;
};
#define VM_MAX_MMIO_REGIONS 4
struct vmm_special_reg {
uint32_t esr_iss;
uint32_t esr_mask;
reg_read_t reg_read;
reg_write_t reg_write;
void *arg;
};
#define VM_MAX_SPECIAL_REGS 16
struct vm {
void *cookie; /* vmmops per-vm data */
volatile cpuset_t active_cpus; /* vcpus that have been started */
volatile cpuset_t debug_cpus; /* vcpus stopped for debugging */
int suspend; /* non-zero once a suspend has been requested */
bool dying; /* set when no new vcpus may be created */
volatile cpuset_t suspended_cpus;
volatile cpuset_t halted_cpus;
struct vmspace *vmspace; /* guest physical address space */
struct vm_mem mem; /* guest memory */
char name[VM_MAX_NAMELEN];
struct vcpu **vcpu; /* lazily allocated vcpu array */
struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS];
uint16_t sockets; /* guest topology */
uint16_t cores;
uint16_t threads;
uint16_t maxcpus;
struct sx vcpus_init_lock; /* protects vcpu creation */
};
static bool vmm_initialized = false;
static int vm_handle_wfi(struct vcpu *vcpu,
struct vm_exit *vme, bool *retu);
static MALLOC_DEFINE(M_VMM, "vmm", "vmm");
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
"IPI vector used for vcpu notifications");
struct vmm_regs {
uint64_t id_aa64afr0;
uint64_t id_aa64afr1;
uint64_t id_aa64dfr0;
uint64_t id_aa64dfr1;
uint64_t id_aa64isar0;
uint64_t id_aa64isar1;
uint64_t id_aa64isar2;
uint64_t id_aa64mmfr0;
uint64_t id_aa64mmfr1;
uint64_t id_aa64mmfr2;
uint64_t id_aa64pfr0;
uint64_t id_aa64pfr1;
};
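/*
 * ID register fields that vmm is willing to expose to a guest.
 * Fields not set here are hidden from the guest's view of the register.
 */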
static const struct vmm_regs vmm_arch_regs_masks = {
.id_aa64dfr0 =
ID_AA64DFR0_CTX_CMPs_MASK |
ID_AA64DFR0_WRPs_MASK |
ID_AA64DFR0_BRPs_MASK |
ID_AA64DFR0_PMUVer_3 |
ID_AA64DFR0_DebugVer_8,
.id_aa64isar0 =
ID_AA64ISAR0_TLB_TLBIOSR |
ID_AA64ISAR0_SHA3_IMPL |
ID_AA64ISAR0_RDM_IMPL |
ID_AA64ISAR0_Atomic_IMPL |
ID_AA64ISAR0_CRC32_BASE |
ID_AA64ISAR0_SHA2_512 |
ID_AA64ISAR0_SHA1_BASE |
ID_AA64ISAR0_AES_PMULL,
.id_aa64mmfr0 =
ID_AA64MMFR0_TGran4_IMPL |
ID_AA64MMFR0_TGran64_IMPL |
ID_AA64MMFR0_TGran16_IMPL |
ID_AA64MMFR0_ASIDBits_16 |
ID_AA64MMFR0_PARange_4P,
.id_aa64mmfr1 =
ID_AA64MMFR1_SpecSEI_IMPL |
ID_AA64MMFR1_PAN_ATS1E1 |
ID_AA64MMFR1_HAFDBS_AF,
.id_aa64pfr0 =
ID_AA64PFR0_GIC_CPUIF_NONE |
ID_AA64PFR0_AdvSIMD_HP |
ID_AA64PFR0_FP_HP |
ID_AA64PFR0_EL3_64 |
ID_AA64PFR0_EL2_64 |
ID_AA64PFR0_EL1_64 |
ID_AA64PFR0_EL0_64,
};
static struct vmm_regs vmm_arch_regs;
u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&vm_maxcpu, 0, "Maximum number of vCPUs");
static void vcpu_notify_event_locked(struct vcpu *vcpu);
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception");
VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted");
VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted");
VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted");
VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted");
VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort");
VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort");
VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_FIQ, "number of vmexits for an fiq");
VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception");
VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception");
VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");
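/*
 * Upper bound on vm_maxcpu: the value must fit in the uint16_t topology
 * fields and within a cpuset_t.
 */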
#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE)
static int
vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
{
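/*
 * Read the host's value of each ID register, masked to the fields in the
 * corresponding mask, falling back to zero if the register cannot be read.
 */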
#define _FETCH_KERN_REG(reg, field) do { \
regs->field = vmm_arch_regs_masks.field; \
if (!get_kernel_reg_iss_masked(reg ## _ISS, &regs->field, \
masks->field)) \
regs->field = 0; \
} while (0)
_FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
_FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
_FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
_FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
_FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
_FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
_FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
_FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
_FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
_FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
_FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
_FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
#undef _FETCH_KERN_REG
return (0);
}
static void
vcpu_cleanup(struct vcpu *vcpu, bool destroy)
{
vmmops_vcpu_cleanup(vcpu->cookie);
vcpu->cookie = NULL;
if (destroy) {
vmm_stat_free(vcpu->stats);
fpu_save_area_free(vcpu->guestfpu);
vcpu_lock_destroy(vcpu);
}
}
static struct vcpu *
vcpu_alloc(struct vm *vm, int vcpu_id)
{
struct vcpu *vcpu;
KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
("vcpu_alloc: invalid vcpu %d", vcpu_id));
vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
vcpu_lock_init(vcpu);
vcpu->state = VCPU_IDLE;
vcpu->hostcpu = NOCPU;
vcpu->vcpuid = vcpu_id;
vcpu->vm = vm;
vcpu->guestfpu = fpu_save_area_alloc();
vcpu->stats = vmm_stat_alloc();
return (vcpu);
}
static void
vcpu_init(struct vcpu *vcpu)
{
vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
MPASS(vcpu->cookie != NULL);
fpu_save_area_reset(vcpu->guestfpu);
vmm_stat_init(vcpu->stats);
}
struct vm_exit *
vm_exitinfo(struct vcpu *vcpu)
{
return (&vcpu->exitinfo);
}
static int
vmm_unsupported_quirk(void)
{
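/* vmm is not supported on the Ampere eMAG 8180; refuse to load there. */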
if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_APM,
CPU_PART_EMAG8180, 0, 0))
return (ENXIO);
return (0);
}
static int
vmm_init(void)
{
int error;
vm_maxcpu = mp_ncpus;
TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
if (vm_maxcpu > VM_MAXCPU) {
printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
vm_maxcpu = VM_MAXCPU;
}
if (vm_maxcpu == 0)
vm_maxcpu = 1;
error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
if (error != 0)
return (error);
return (vmmops_modinit(0));
}
static int
vmm_handler(module_t mod, int what, void *arg)
{
int error;
switch (what) {
case MOD_LOAD:
error = vmm_unsupported_quirk();
if (error != 0)
break;
error = vmmdev_init();
if (error != 0)
break;
error = vmm_init();
if (error == 0)
vmm_initialized = true;
else
(void)vmmdev_cleanup();
break;
case MOD_UNLOAD:
error = vmmdev_cleanup();
if (error == 0 && vmm_initialized) {
error = vmmops_modcleanup();
if (error == 0)
vmm_initialized = false;
}
break;
default:
error = 0;
break;
}
return (error);
}
static moduledata_t vmm_kmod = {
"vmm",
vmm_handler,
NULL
};
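/*
 * Initialize only after both SMP (SI_SUB_SMP) and devfs (SI_SUB_DEVFS) are
 * up, as vmm depends on both.
 */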
DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
static void
vm_init(struct vm *vm, bool create)
{
int i;
vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
MPASS(vm->cookie != NULL);
CPU_ZERO(&vm->active_cpus);
CPU_ZERO(&vm->debug_cpus);
vm->suspend = 0;
CPU_ZERO(&vm->suspended_cpus);
memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
memset(vm->special_reg, 0, sizeof(vm->special_reg));
if (!create) {
for (i = 0; i < vm->maxcpus; i++) {
if (vm->vcpu[i] != NULL)
vcpu_init(vm->vcpu[i]);
}
}
}
void
vm_disable_vcpu_creation(struct vm *vm)
{
sx_xlock(&vm->vcpus_init_lock);
vm->dying = true;
sx_xunlock(&vm->vcpus_init_lock);
}
struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
return (NULL);
if (vcpuid >= vgic_max_cpu_count(vm->cookie))
return (NULL);
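/* Fast path: look up an existing vcpu without taking the lock. */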
vcpu = (struct vcpu *)
atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
if (__predict_true(vcpu != NULL))
return (vcpu);
sx_xlock(&vm->vcpus_init_lock);
vcpu = vm->vcpu[vcpuid];
if (vcpu == NULL && !vm->dying) {
vcpu = vcpu_alloc(vm, vcpuid);
vcpu_init(vcpu);
atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
(uintptr_t)vcpu);
}
sx_xunlock(&vm->vcpus_init_lock);
return (vcpu);
}
void
vm_slock_vcpus(struct vm *vm)
{
sx_slock(&vm->vcpus_init_lock);
}
void
vm_unlock_vcpus(struct vm *vm)
{
sx_unlock(&vm->vcpus_init_lock);
}
int
vm_create(const char *name, struct vm **retvm)
{
struct vm *vm;
struct vmspace *vmspace;
if (!vmm_initialized)
return (ENXIO);
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
if (vmspace == NULL)
return (ENOMEM);
vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->vmspace = vmspace;
vm_mem_init(&vm->mem);
sx_init(&vm->vcpus_init_lock, "vm vcpus");
vm->sockets = 1;
vm->cores = 1;
vm->threads = 1;
vm->maxcpus = vm_maxcpu;
vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
M_WAITOK | M_ZERO);
vm_init(vm, true);
*retvm = vm;
return (0);
}
void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
uint16_t *threads, uint16_t *maxcpus)
{
*sockets = vm->sockets;
*cores = vm->cores;
*threads = vm->threads;
*maxcpus = vm->maxcpus;
}
uint16_t
vm_get_maxcpus(struct vm *vm)
{
return (vm->maxcpus);
}
int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
uint16_t threads, uint16_t maxcpus)
{
if ((sockets * cores * threads) > vm->maxcpus)
return (EINVAL);
vm->sockets = sockets;
vm->cores = cores;
vm->threads = threads;
return (0);
}
static void
vm_cleanup(struct vm *vm, bool destroy)
{
pmap_t pmap __diagused;
int i;
if (destroy) {
vm_xlock_memsegs(vm);
pmap = vmspace_pmap(vm->vmspace);
sched_pin();
PCPU_SET(curvmpmap, NULL);
sched_unpin();
CPU_FOREACH(i) {
MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
}
} else
vm_assert_memseg_xlocked(vm);
vgic_detach_from_vm(vm->cookie);
for (i = 0; i < vm->maxcpus; i++) {
if (vm->vcpu[i] != NULL)
vcpu_cleanup(vm->vcpu[i], destroy);
}
vmmops_cleanup(vm->cookie);
vm_mem_cleanup(vm);
if (destroy) {
vm_mem_destroy(vm);
vmmops_vmspace_free(vm->vmspace);
vm->vmspace = NULL;
for (i = 0; i < vm->maxcpus; i++)
free(vm->vcpu[i], M_VMM);
free(vm->vcpu, M_VMM);
sx_destroy(&vm->vcpus_init_lock);
}
}
void
vm_destroy(struct vm *vm)
{
vm_cleanup(vm, true);
free(vm, M_VMM);
}
int
vm_reinit(struct vm *vm)
{
int error;
if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
vm_cleanup(vm, false);
vm_init(vm, false);
error = 0;
} else {
error = EBUSY;
}
return (error);
}
const char *
vm_name(struct vm *vm)
{
return (vm->name);
}
int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
{
return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
}
static int
vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
*rval = 0;
return (0);
}
static int
vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
*rval = *(uint64_t *)arg;
return (0);
}
static int
vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
{
return (0);
}
static const struct vmm_special_reg vmm_special_regs[] = {
#define SPECIAL_REG(_reg, _read, _write) \
{ \
.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \
((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \
((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \
((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \
((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \
.esr_mask = ISS_MSR_REG_MASK, \
.reg_read = (_read), \
.reg_write = (_write), \
.arg = NULL, \
}
#define ID_SPECIAL_REG(_reg, _name) \
{ \
.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \
((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \
((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \
((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \
((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \
.esr_mask = ISS_MSR_REG_MASK, \
.reg_read = vmm_reg_read_arg, \
.reg_write = vmm_reg_wi, \
.arg = &(vmm_arch_regs._name), \
}
ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),
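/*
 * Any other ID register in the op0 == 3, op1 == 0, CRn == 0, CRm == 0-7
 * space is read as zero and ignores writes.
 */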
{
.esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
(0 << ISS_MSR_OP1_SHIFT) |
(0 << ISS_MSR_CRn_SHIFT) |
(0 << ISS_MSR_CRm_SHIFT),
.esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
.reg_read = vmm_reg_raz,
.reg_write = vmm_reg_wi,
.arg = NULL,
},
SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
vtimer_phys_cval_write),
SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
vtimer_phys_tval_write),
SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),
#undef SPECIAL_REG
};
void
vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
reg_read_t reg_read, reg_write_t reg_write, void *arg)
{
int i;
for (i = 0; i < nitems(vm->special_reg); i++) {
if (vm->special_reg[i].esr_iss == 0 &&
vm->special_reg[i].esr_mask == 0) {
vm->special_reg[i].esr_iss = iss;
vm->special_reg[i].esr_mask = mask;
vm->special_reg[i].reg_read = reg_read;
vm->special_reg[i].reg_write = reg_write;
vm->special_reg[i].arg = arg;
return;
}
}
panic("%s: No free special register slot", __func__);
}
void
vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
{
int i;
for (i = 0; i < nitems(vm->special_reg); i++) {
if (vm->special_reg[i].esr_iss == iss &&
vm->special_reg[i].esr_mask == mask) {
memset(&vm->special_reg[i], 0,
sizeof(vm->special_reg[i]));
return;
}
}
panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
mask);
}
static int
vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
{
struct vm *vm;
struct vm_exit *vme;
struct vre *vre;
int i, rv;
vm = vcpu->vm;
vme = &vcpu->exitinfo;
vre = &vme->u.reg_emul.vre;
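/* Check the handlers registered with vm_register_reg_handler() first. */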
for (i = 0; i < nitems(vm->special_reg); i++) {
if (vm->special_reg[i].esr_iss == 0 &&
vm->special_reg[i].esr_mask == 0)
continue;
if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) ==
vm->special_reg[i].esr_iss) {
rv = vmm_emulate_register(vcpu, vre,
vm->special_reg[i].reg_read,
vm->special_reg[i].reg_write,
vm->special_reg[i].arg);
if (rv == 0) {
*retu = false;
}
return (rv);
}
}
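/* Fall back to the built-in special register handlers. */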
for (i = 0; i < nitems(vmm_special_regs); i++) {
if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) ==
vmm_special_regs[i].esr_iss) {
rv = vmm_emulate_register(vcpu, vre,
vmm_special_regs[i].reg_read,
vmm_special_regs[i].reg_write,
vmm_special_regs[i].arg);
if (rv == 0) {
*retu = false;
}
return (rv);
}
}
*retu = true;
return (0);
}
void
vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
mem_region_read_t mmio_read, mem_region_write_t mmio_write)
{
int i;
for (i = 0; i < nitems(vm->mmio_region); i++) {
if (vm->mmio_region[i].start == 0 &&
vm->mmio_region[i].end == 0) {
vm->mmio_region[i].start = start;
vm->mmio_region[i].end = start + size;
vm->mmio_region[i].read = mmio_read;
vm->mmio_region[i].write = mmio_write;
return;
}
}
panic("%s: No free MMIO region", __func__);
}
void
vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
{
int i;
for (i = 0; i < nitems(vm->mmio_region); i++) {
if (vm->mmio_region[i].start == start &&
vm->mmio_region[i].end == start + size) {
memset(&vm->mmio_region[i], 0,
sizeof(vm->mmio_region[i]));
return;
}
}
panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
start + size);
}
static int
vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
{
struct vm *vm;
struct vm_exit *vme;
struct vie *vie;
struct hyp *hyp;
uint64_t fault_ipa;
struct vm_guest_paging *paging;
struct vmm_mmio_region *vmr;
int error, i;
vm = vcpu->vm;
hyp = vm->cookie;
if (!hyp->vgic_attached)
goto out_user;
vme = &vcpu->exitinfo;
vie = &vme->u.inst_emul.vie;
paging = &vme->u.inst_emul.paging;
fault_ipa = vme->u.inst_emul.gpa;
vmr = NULL;
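/* Look for a registered MMIO handler covering the faulting IPA. */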
for (i = 0; i < nitems(vm->mmio_region); i++) {
if (vm->mmio_region[i].start <= fault_ipa &&
vm->mmio_region[i].end > fault_ipa) {
vmr = &vm->mmio_region[i];
break;
}
}
if (vmr == NULL)
goto out_user;
error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
vmr->read, vmr->write, retu);
return (error);
out_user:
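/* Not handled in the kernel; let userspace emulate the access. */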
*retu = true;
return (0);
}
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
int i;
if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
return (EINVAL);
if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
VM_CTR2(vm, "virtual machine already suspended %d/%d",
vm->suspend, how);
return (EALREADY);
}
VM_CTR1(vm, "virtual machine successfully suspended %d", how);
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->active_cpus))
vcpu_notify_event(vm_vcpu(vm, i));
}
return (0);
}
void
vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
{
struct vm *vm = vcpu->vm;
struct vm_exit *vmexit;
KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
("vm_exit_suspended: invalid suspend type %d", vm->suspend));
vmexit = vm_exitinfo(vcpu);
vmexit->pc = pc;
vmexit->inst_length = 4;
vmexit->exitcode = VM_EXITCODE_SUSPENDED;
vmexit->u.suspended.how = vm->suspend;
}
void
vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
{
struct vm_exit *vmexit;
vmexit = vm_exitinfo(vcpu);
vmexit->pc = pc;
vmexit->inst_length = 4;
vmexit->exitcode = VM_EXITCODE_DEBUG;
}
int
vm_activate_cpu(struct vcpu *vcpu)
{
struct vm *vm = vcpu->vm;
if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
return (EBUSY);
CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
return (0);
}
int
vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
{
if (vcpu == NULL) {
vm->debug_cpus = vm->active_cpus;
for (int i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->active_cpus))
vcpu_notify_event(vm_vcpu(vm, i));
}
} else {
if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
return (EINVAL);
CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
vcpu_notify_event(vcpu);
}
return (0);
}
int
vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
{
if (vcpu == NULL) {
CPU_ZERO(&vm->debug_cpus);
} else {
if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
return (EINVAL);
CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
}
return (0);
}
int
vcpu_debugged(struct vcpu *vcpu)
{
return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
}
cpuset_t
vm_active_cpus(struct vm *vm)
{
return (vm->active_cpus);
}
cpuset_t
vm_debug_cpus(struct vm *vm)
{
return (vm->debug_cpus);
}
cpuset_t
vm_suspended_cpus(struct vm *vm)
{
return (vm->suspended_cpus);
}
void *
vcpu_stats(struct vcpu *vcpu)
{
return (vcpu->stats);
}
static void
vcpu_notify_event_locked(struct vcpu *vcpu)
{
int hostcpu;
hostcpu = vcpu->hostcpu;
if (vcpu->state == VCPU_RUNNING) {
KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
if (hostcpu != curcpu) {
ipi_cpu(hostcpu, vmm_ipinum);
} else {
/*
 * The vcpu is notifying itself; the pending event will be
 * picked up when it next enters the guest, so no IPI is
 * needed.
 */
}
} else {
KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
"with hostcpu %d", vcpu->state, hostcpu));
if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
}
}
void
vcpu_notify_event(struct vcpu *vcpu)
{
vcpu_lock(vcpu);
vcpu_notify_event_locked(vcpu);
vcpu_unlock(vcpu);
}
struct vmspace *
vm_vmspace(struct vm *vm)
{
return (vm->vmspace);
}
struct vm_mem *
vm_mem(struct vm *vm)
{
return (&vm->mem);
}
static void
restore_guest_fpustate(struct vcpu *vcpu)
{
/* Flush the host FPU state to the pcb and force it to be reloaded later. */
vfp_save_state(curthread, curthread->td_pcb);
PCPU_SET(fpcurthread, NULL);
/*
 * Load the guest FPU state, then disable the VFP so any host access
 * traps until the guest state is saved again.
 */
vfp_enable();
vfp_restore(vcpu->guestfpu);
vfp_disable();
}
static void
save_guest_fpustate(struct vcpu *vcpu)
{
if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
CPACR_FPEN_TRAP_ALL1)
panic("VFP not enabled in host!");
vfp_enable();
vfp_store(vcpu->guestfpu);
vfp_disable();
KASSERT(PCPU_GET(fpcurthread) == NULL,
("%s: fpcurthread set with guest registers", __func__));
}
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
bool from_idle)
{
int error;
vcpu_assert_locked(vcpu);
if (from_idle) {
while (vcpu->state != VCPU_IDLE) {
vcpu_notify_event_locked(vcpu);
msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
}
} else {
KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
"vcpu idle state"));
}
if (vcpu->state == VCPU_RUNNING) {
KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
"mismatch for running vcpu", curcpu, vcpu->hostcpu));
} else {
KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
"vcpu that is not running", vcpu->hostcpu));
}
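/*
 * The only transitions allowed are IDLE -> FROZEN -> IDLE,
 * FROZEN -> RUNNING -> FROZEN and FROZEN -> SLEEPING -> FROZEN.
 */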
switch (vcpu->state) {
case VCPU_IDLE:
case VCPU_RUNNING:
case VCPU_SLEEPING:
error = (newstate != VCPU_FROZEN);
break;
case VCPU_FROZEN:
error = (newstate == VCPU_FROZEN);
break;
default:
error = 1;
break;
}
if (error)
return (EBUSY);
vcpu->state = newstate;
if (newstate == VCPU_RUNNING)
vcpu->hostcpu = curcpu;
else
vcpu->hostcpu = NOCPU;
if (newstate == VCPU_IDLE)
wakeup(&vcpu->state);
return (0);
}
static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
panic("Error %d setting state to %d\n", error, newstate);
}
static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
panic("Error %d setting state to %d", error, newstate);
}
int
vm_get_capability(struct vcpu *vcpu, int type, int *retval)
{
if (type < 0 || type >= VM_CAP_MAX)
return (EINVAL);
return (vmmops_getcap(vcpu->cookie, type, retval));
}
int
vm_set_capability(struct vcpu *vcpu, int type, int val)
{
if (type < 0 || type >= VM_CAP_MAX)
return (EINVAL);
return (vmmops_setcap(vcpu->cookie, type, val));
}
struct vm *
vcpu_vm(struct vcpu *vcpu)
{
return (vcpu->vm);
}
int
vcpu_vcpuid(struct vcpu *vcpu)
{
return (vcpu->vcpuid);
}
void *
vcpu_get_cookie(struct vcpu *vcpu)
{
return (vcpu->cookie);
}
struct vcpu *
vm_vcpu(struct vm *vm, int vcpuid)
{
return (vm->vcpu[vcpuid]);
}
int
vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
{
int error;
vcpu_lock(vcpu);
error = vcpu_set_state_locked(vcpu, newstate, from_idle);
vcpu_unlock(vcpu);
return (error);
}
enum vcpu_state
vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
{
enum vcpu_state state;
vcpu_lock(vcpu);
state = vcpu->state;
if (hostcpu != NULL)
*hostcpu = vcpu->hostcpu;
vcpu_unlock(vcpu);
return (state);
}
int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{
if (reg >= VM_REG_LAST)
return (EINVAL);
return (vmmops_getreg(vcpu->cookie, reg, retval));
}
int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
int error;
if (reg >= VM_REG_LAST)
return (EINVAL);
error = vmmops_setreg(vcpu->cookie, reg, val);
if (error || reg != VM_REG_GUEST_PC)
return (error);
vcpu->nextpc = val;
return (0);
}
void *
vm_get_cookie(struct vm *vm)
{
return (vm->cookie);
}
int
vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
{
return (vmmops_exception(vcpu->cookie, esr, far));
}
int
vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
{
return (vgic_attach_to_vm(vm->cookie, descr));
}
int
vm_assert_irq(struct vm *vm, uint32_t irq)
{
return (vgic_inject_irq(vm->cookie, -1, irq, true));
}
int
vm_deassert_irq(struct vm *vm, uint32_t irq)
{
return (vgic_inject_irq(vm->cookie, -1, irq, false));
}
int
vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
int func)
{
return (vgic_inject_msi(vm->cookie, msg, addr));
}
static int
vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
struct hypctx *hypctx;
int i;
hypctx = vcpu_get_cookie(vcpu);
if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
return (1);
vme->exitcode = VM_EXITCODE_SMCCC;
vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];
*retu = true;
return (0);
}
static int
vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
struct vm *vm;
vm = vcpu->vm;
vcpu_lock(vcpu);
while (1) {
if (vm->suspend)
break;
if (vgic_has_pending_irq(vcpu->cookie))
break;
if (vcpu_should_yield(vcpu))
break;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
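/*
 * msleep_spin() cannot be interrupted by signals, so wake up
 * periodically to re-check the conditions above.
 */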
msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
}
vcpu_unlock(vcpu);
*retu = false;
return (0);
}
static int
vm_handle_paging(struct vcpu *vcpu, bool *retu)
{
struct vm *vm = vcpu->vm;
struct vm_exit *vme;
struct vm_map *map;
uint64_t addr, esr;
pmap_t pmap;
int ftype, rv;
vme = &vcpu->exitinfo;
pmap = vmspace_pmap(vcpu->vm->vmspace);
addr = vme->u.paging.gpa;
esr = vme->u.paging.esr;
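/*
 * Let the stage 2 pmap try to resolve the fault before falling back
 * to vm_fault().
 */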
if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
return (0);
switch (ESR_ELx_EXCEPTION(esr)) {
case EXCP_INSN_ABORT_L:
case EXCP_DATA_ABORT_L:
ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE;
break;
default:
panic("%s: Invalid exception (esr = %lx)", __func__, esr);
}
map = &vm->vmspace->vm_map;
rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
if (rv != KERN_SUCCESS)
return (EFAULT);
return (0);
}
static int
vm_handle_suspend(struct vcpu *vcpu, bool *retu)
{
struct vm *vm = vcpu->vm;
int error, i;
struct thread *td;
error = 0;
td = curthread;
CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
vcpu_lock(vcpu);
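/* Wait until all active vcpus have suspended themselves. */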
while (error == 0) {
if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
break;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
if (td_ast_pending(td, TDA_SUSPEND)) {
vcpu_unlock(vcpu);
error = thread_check_susp(td, false);
vcpu_lock(vcpu);
}
}
vcpu_unlock(vcpu);
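/* Wake up any vcpus still sleeping in the loop above. */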
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->suspended_cpus)) {
vcpu_notify_event(vm_vcpu(vm, i));
}
}
*retu = true;
return (error);
}
int
vm_run(struct vcpu *vcpu)
{
struct vm *vm = vcpu->vm;
struct vm_eventinfo evinfo;
int error, vcpuid;
struct vm_exit *vme;
bool retu;
pmap_t pmap;
vcpuid = vcpu->vcpuid;
if (!CPU_ISSET(vcpuid, &vm->active_cpus))
return (EINVAL);
if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
return (EINVAL);
pmap = vmspace_pmap(vm->vmspace);
vme = &vcpu->exitinfo;
evinfo.rptr = NULL;
evinfo.sptr = &vm->suspend;
evinfo.iptr = NULL;
restart:
critical_enter();
restore_guest_fpustate(vcpu);
vcpu_require_state(vcpu, VCPU_RUNNING);
error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
vcpu_require_state(vcpu, VCPU_FROZEN);
save_guest_fpustate(vcpu);
critical_exit();
if (error == 0) {
retu = false;
switch (vme->exitcode) {
case VM_EXITCODE_INST_EMUL:
vcpu->nextpc = vme->pc + vme->inst_length;
error = vm_handle_inst_emul(vcpu, &retu);
break;
case VM_EXITCODE_REG_EMUL:
vcpu->nextpc = vme->pc + vme->inst_length;
error = vm_handle_reg_emul(vcpu, &retu);
break;
case VM_EXITCODE_HVC:
vcpu->nextpc = vme->pc;
error = vm_handle_smccc_call(vcpu, vme, &retu);
break;
case VM_EXITCODE_WFI:
vcpu->nextpc = vme->pc + vme->inst_length;
error = vm_handle_wfi(vcpu, vme, &retu);
break;
case VM_EXITCODE_PAGING:
vcpu->nextpc = vme->pc;
error = vm_handle_paging(vcpu, &retu);
break;
case VM_EXITCODE_SUSPENDED:
vcpu->nextpc = vme->pc;
error = vm_handle_suspend(vcpu, &retu);
break;
default:
vcpu->nextpc = vme->pc;
retu = true;
break;
}
}
if (error == 0 && retu == false)
goto restart;
return (error);
}