1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* tools/testing/selftests/kvm/lib/kvm_util.c
4
*
5
* Copyright (C) 2018, Google LLC.
6
*/
7
#include "test_util.h"
8
#include "kvm_util.h"
9
#include "processor.h"
10
#include "ucall_common.h"
11
12
#include <assert.h>
13
#include <sched.h>
14
#include <sys/mman.h>
15
#include <sys/resource.h>
16
#include <sys/types.h>
17
#include <sys/stat.h>
18
#include <unistd.h>
19
#include <linux/kernel.h>
20
21
#define KVM_UTIL_MIN_PFN 2
22
23
uint32_t guest_random_seed;
24
struct guest_random_state guest_rng;
25
static uint32_t last_guest_seed;
26
27
static size_t vcpu_mmap_sz(void);
28
29
int __open_path_or_exit(const char *path, int flags, const char *enoent_help)
30
{
31
int fd;
32
33
fd = open(path, flags);
34
if (fd < 0)
35
goto error;
36
37
return fd;
38
39
error:
40
if (errno == EACCES || errno == ENOENT)
41
ksft_exit_skip("- Cannot open '%s': %s. %s\n",
42
path, strerror(errno),
43
errno == EACCES ? "Root required?" : enoent_help);
44
TEST_FAIL("Failed to open '%s'", path);
45
}
46
47
int open_path_or_exit(const char *path, int flags)
48
{
49
return __open_path_or_exit(path, flags, "");
50
}
51
52
/*
53
* Open KVM_DEV_PATH if available, otherwise exit the entire program.
54
*
55
* Input Args:
56
* flags - The flags to pass when opening KVM_DEV_PATH.
57
*
58
* Return:
59
* The opened file descriptor of /dev/kvm.
60
*/
61
static int _open_kvm_dev_path_or_exit(int flags)
62
{
63
return __open_path_or_exit(KVM_DEV_PATH, flags, "Is KVM loaded and enabled?");
64
}
65
66
int open_kvm_dev_path_or_exit(void)
67
{
68
return _open_kvm_dev_path_or_exit(O_RDONLY);
69
}
70
71
static ssize_t get_module_param(const char *module_name, const char *param,
72
void *buffer, size_t buffer_size)
73
{
74
const int path_size = 128;
75
char path[path_size];
76
ssize_t bytes_read;
77
int fd, r;
78
79
/* Verify KVM is loaded, to provide a more helpful SKIP message. */
80
close(open_kvm_dev_path_or_exit());
81
82
r = snprintf(path, path_size, "/sys/module/%s/parameters/%s",
83
module_name, param);
84
TEST_ASSERT(r < path_size,
85
"Failed to construct sysfs path in %d bytes.", path_size);
86
87
fd = open_path_or_exit(path, O_RDONLY);
88
89
bytes_read = read(fd, buffer, buffer_size);
90
TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes",
91
path, bytes_read, buffer_size);
92
93
r = close(fd);
94
TEST_ASSERT(!r, "close(%s) failed", path);
95
return bytes_read;
96
}
97
98
int kvm_get_module_param_integer(const char *module_name, const char *param)
99
{
100
/*
101
* 16 bytes to hold a 64-bit value (1 byte per char), 1 byte for the
102
* NUL char, and 1 byte because the kernel sucks and inserts a newline
103
* at the end.
104
*/
105
char value[16 + 1 + 1];
106
ssize_t r;
107
108
memset(value, '\0', sizeof(value));
109
110
r = get_module_param(module_name, param, value, sizeof(value));
111
TEST_ASSERT(value[r - 1] == '\n',
112
"Expected trailing newline, got char '%c'", value[r - 1]);
113
114
/*
115
* Squash the newline, otherwise atoi_paranoid() will complain about
116
* trailing non-NUL characters in the string.
117
*/
118
value[r - 1] = '\0';
119
return atoi_paranoid(value);
120
}
121
122
bool kvm_get_module_param_bool(const char *module_name, const char *param)
123
{
124
char value;
125
ssize_t r;
126
127
r = get_module_param(module_name, param, &value, sizeof(value));
128
TEST_ASSERT_EQ(r, 1);
129
130
if (value == 'Y')
131
return true;
132
else if (value == 'N')
133
return false;
134
135
TEST_FAIL("Unrecognized value '%c' for boolean module param", value);
136
}
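/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * file): reading KVM module parameters from a test. The module/parameter
 * names below are only examples; availability depends on the architecture
 * and kernel configuration.
 *
 *	// get_module_param() already SKIPs the test if /dev/kvm can't be opened.
 *	bool ept = kvm_get_module_param_bool("kvm_intel", "ept");
 *	int period = kvm_get_module_param_integer("kvm", "nx_huge_pages_recovery_period_ms");
 *
 *	pr_info("ept=%d, recovery period=%d ms\n", ept, period);
 */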
137
138
/*
139
* Capability
140
*
141
* Input Args:
142
* cap - Capability
143
*
144
* Output Args: None
145
*
146
* Return:
147
* On success, the Value corresponding to the capability (KVM_CAP_*)
148
* specified by the value of cap. On failure a TEST_ASSERT failure
149
* is produced.
150
*
151
* Looks up and returns the value corresponding to the capability
152
* (KVM_CAP_*) given by cap.
153
*/
154
unsigned int kvm_check_cap(long cap)
155
{
156
int ret;
157
int kvm_fd;
158
159
kvm_fd = open_kvm_dev_path_or_exit();
160
ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap);
161
TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret));
162
163
close(kvm_fd);
164
165
return (unsigned int)ret;
166
}
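/*
 * Illustrative sketch (editor's addition): gating a test on a capability.
 * kvm_check_cap() returns the raw KVM_CHECK_EXTENSION value, so it can be
 * used both as a boolean and to read a limit such as KVM_CAP_MAX_VCPUS.
 *
 *	unsigned int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
 *
 *	// TEST_REQUIRE()/kvm_has_cap() (from test_util.h/kvm_util.h) are the
 *	// usual way to skip when a capability is absent.
 *	TEST_REQUIRE(kvm_check_cap(KVM_CAP_USER_MEMORY2));
 */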
167
168
void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
169
{
170
if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL))
171
vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size);
172
else
173
vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
174
vm->dirty_ring_size = ring_size;
175
}
176
177
static void vm_open(struct kvm_vm *vm)
178
{
179
vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR);
180
181
TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT));
182
183
vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type);
184
TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd));
185
186
if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
187
vm->stats.fd = vm_get_stats_fd(vm);
188
else
189
vm->stats.fd = -1;
190
}
191
192
const char *vm_guest_mode_string(uint32_t i)
193
{
194
static const char * const strings[] = {
195
[VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages",
196
[VM_MODE_P52V48_16K] = "PA-bits:52, VA-bits:48, 16K pages",
197
[VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages",
198
[VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages",
199
[VM_MODE_P48V48_16K] = "PA-bits:48, VA-bits:48, 16K pages",
200
[VM_MODE_P48V48_64K] = "PA-bits:48, VA-bits:48, 64K pages",
201
[VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages",
202
[VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages",
203
[VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages",
204
[VM_MODE_PXXVYY_4K] = "PA-bits:ANY, VA-bits:48 or 57, 4K pages",
205
[VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages",
206
[VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages",
207
[VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages",
208
[VM_MODE_P36V48_16K] = "PA-bits:36, VA-bits:48, 16K pages",
209
[VM_MODE_P36V48_64K] = "PA-bits:36, VA-bits:48, 64K pages",
210
[VM_MODE_P47V47_16K] = "PA-bits:47, VA-bits:47, 16K pages",
211
[VM_MODE_P36V47_16K] = "PA-bits:36, VA-bits:47, 16K pages",
212
};
213
_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
214
"Missing new mode strings?");
215
216
TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
217
218
return strings[i];
219
}
220
221
const struct vm_guest_mode_params vm_guest_mode_params[] = {
222
[VM_MODE_P52V48_4K] = { 52, 48, 0x1000, 12 },
223
[VM_MODE_P52V48_16K] = { 52, 48, 0x4000, 14 },
224
[VM_MODE_P52V48_64K] = { 52, 48, 0x10000, 16 },
225
[VM_MODE_P48V48_4K] = { 48, 48, 0x1000, 12 },
226
[VM_MODE_P48V48_16K] = { 48, 48, 0x4000, 14 },
227
[VM_MODE_P48V48_64K] = { 48, 48, 0x10000, 16 },
228
[VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 },
229
[VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 },
230
[VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 },
231
[VM_MODE_PXXVYY_4K] = { 0, 0, 0x1000, 12 },
232
[VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 },
233
[VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 },
234
[VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 },
235
[VM_MODE_P36V48_16K] = { 36, 48, 0x4000, 14 },
236
[VM_MODE_P36V48_64K] = { 36, 48, 0x10000, 16 },
237
[VM_MODE_P47V47_16K] = { 47, 47, 0x4000, 14 },
238
[VM_MODE_P36V47_16K] = { 36, 47, 0x4000, 14 },
239
};
240
_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
241
"Missing new mode params?");
242
243
/*
244
* Initializes vm->vpages_valid to match the canonical VA space of the
245
* architecture.
246
*
247
* The default implementation is valid for architectures which split the
248
* range addressed by a single page table into a low and high region
249
* based on the MSB of the VA. On architectures with this behavior
250
* the VA region spans [0, 2^(va_bits - 1)) and [-(2^(va_bits - 1)), -1].
251
*/
252
__weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm)
253
{
254
sparsebit_set_num(vm->vpages_valid,
255
0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
256
sparsebit_set_num(vm->vpages_valid,
257
(~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift,
258
(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
259
}
260
261
struct kvm_vm *____vm_create(struct vm_shape shape)
262
{
263
struct kvm_vm *vm;
264
265
vm = calloc(1, sizeof(*vm));
266
TEST_ASSERT(vm != NULL, "Insufficient Memory");
267
268
INIT_LIST_HEAD(&vm->vcpus);
269
vm->regions.gpa_tree = RB_ROOT;
270
vm->regions.hva_tree = RB_ROOT;
271
hash_init(vm->regions.slot_hash);
272
273
vm->mode = shape.mode;
274
vm->type = shape.type;
275
276
vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits;
277
vm->va_bits = vm_guest_mode_params[vm->mode].va_bits;
278
vm->page_size = vm_guest_mode_params[vm->mode].page_size;
279
vm->page_shift = vm_guest_mode_params[vm->mode].page_shift;
280
281
/* Setup mode specific traits. */
282
switch (vm->mode) {
283
case VM_MODE_P52V48_4K:
284
vm->pgtable_levels = 4;
285
break;
286
case VM_MODE_P52V48_64K:
287
vm->pgtable_levels = 3;
288
break;
289
case VM_MODE_P48V48_4K:
290
vm->pgtable_levels = 4;
291
break;
292
case VM_MODE_P48V48_64K:
293
vm->pgtable_levels = 3;
294
break;
295
case VM_MODE_P40V48_4K:
296
case VM_MODE_P36V48_4K:
297
vm->pgtable_levels = 4;
298
break;
299
case VM_MODE_P40V48_64K:
300
case VM_MODE_P36V48_64K:
301
vm->pgtable_levels = 3;
302
break;
303
case VM_MODE_P52V48_16K:
304
case VM_MODE_P48V48_16K:
305
case VM_MODE_P40V48_16K:
306
case VM_MODE_P36V48_16K:
307
vm->pgtable_levels = 4;
308
break;
309
case VM_MODE_P47V47_16K:
310
case VM_MODE_P36V47_16K:
311
vm->pgtable_levels = 3;
312
break;
313
case VM_MODE_PXXVYY_4K:
314
#ifdef __x86_64__
315
kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
316
kvm_init_vm_address_properties(vm);
317
318
pr_debug("Guest physical address width detected: %d\n",
319
vm->pa_bits);
320
pr_debug("Guest virtual address width detected: %d\n",
321
vm->va_bits);
322
323
if (vm->va_bits == 57) {
324
vm->pgtable_levels = 5;
325
} else {
326
TEST_ASSERT(vm->va_bits == 48,
327
"Unexpected guest virtual address width: %d",
328
vm->va_bits);
329
vm->pgtable_levels = 4;
330
}
331
#else
332
TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms");
333
#endif
334
break;
335
case VM_MODE_P47V64_4K:
336
vm->pgtable_levels = 5;
337
break;
338
case VM_MODE_P44V64_4K:
339
vm->pgtable_levels = 5;
340
break;
341
default:
342
TEST_FAIL("Unknown guest mode: 0x%x", vm->mode);
343
}
344
345
#ifdef __aarch64__
346
TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types");
347
if (vm->pa_bits != 40)
348
vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
349
#endif
350
351
vm_open(vm);
352
353
/* Limit to VA-bit canonical virtual addresses. */
354
vm->vpages_valid = sparsebit_alloc();
355
vm_vaddr_populate_bitmap(vm);
356
357
/* Limit physical addresses to PA-bits. */
358
vm->max_gfn = vm_compute_max_gfn(vm);
359
360
/* Allocate and setup memory for guest. */
361
vm->vpages_mapped = sparsebit_alloc();
362
363
return vm;
364
}
365
366
static uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
367
uint32_t nr_runnable_vcpus,
368
uint64_t extra_mem_pages)
369
{
370
uint64_t page_size = vm_guest_mode_params[mode].page_size;
371
uint64_t nr_pages;
372
373
TEST_ASSERT(nr_runnable_vcpus,
374
"Use vm_create_barebones() for VMs that _never_ have vCPUs");
375
376
TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS),
377
"nr_vcpus = %d too large for host, max-vcpus = %d",
378
nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS));
379
380
/*
381
* Arbitrarily allocate 512 pages (2mb when page size is 4kb) for the
382
* test code and other per-VM assets that will be loaded into memslot0.
383
*/
384
nr_pages = 512;
385
386
/* Account for the per-vCPU stacks on behalf of the test. */
387
nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS;
388
389
/*
390
* Account for the number of pages needed for the page tables. The
391
* maximum page table size for a memory region will be when the
392
* smallest page size is used. Considering each page contains x page
393
* table descriptors, the total extra size for page tables (for extra
394
* N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller
395
* than N/x*2.
396
*/
397
nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2;
398
399
/* Account for the number of pages needed by ucall. */
400
nr_pages += ucall_nr_pages_required(page_size);
401
402
return vm_adjust_num_guest_pages(mode, nr_pages);
403
}
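/*
 * Editor's note, a worked instance of the estimate above: with 4K pages and
 * 8-byte descriptors, each page table page holds x = 512 entries
 * (PTES_PER_MIN_PAGE on x86), so covering N data pages needs roughly
 * N/512 + N/512^2 + ... page table pages, which the code over-approximates
 * as 2*N/512. E.g. for nr_pages + extra_mem_pages = 10240 (40 MiB of 4K
 * pages), 10240 / 512 * 2 = 40 extra pages are reserved for page tables.
 */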
404
405
void kvm_set_files_rlimit(uint32_t nr_vcpus)
406
{
407
/*
408
* Each vCPU will open two file descriptors: the vCPU itself and the
409
* vCPU's binary stats file descriptor. Add an arbitrary amount of
410
* buffer for all other files a test may open.
411
*/
412
int nr_fds_wanted = nr_vcpus * 2 + 100;
413
struct rlimit rl;
414
415
/*
416
* Check that we're allowed to open nr_fds_wanted file descriptors and
417
* try raising the limits if needed.
418
*/
419
TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!");
420
421
if (rl.rlim_cur < nr_fds_wanted) {
422
rl.rlim_cur = nr_fds_wanted;
423
if (rl.rlim_max < nr_fds_wanted) {
424
int old_rlim_max = rl.rlim_max;
425
426
rl.rlim_max = nr_fds_wanted;
427
__TEST_REQUIRE(setrlimit(RLIMIT_NOFILE, &rl) >= 0,
428
"RLIMIT_NOFILE hard limit is too low (%d, wanted %d)",
429
old_rlim_max, nr_fds_wanted);
430
} else {
431
TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!");
432
}
433
}
434
435
}
436
437
static bool is_guest_memfd_required(struct vm_shape shape)
438
{
439
#ifdef __x86_64__
440
return shape.type == KVM_X86_SNP_VM;
441
#else
442
return false;
443
#endif
444
}
445
446
struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
447
uint64_t nr_extra_pages)
448
{
449
uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus,
450
nr_extra_pages);
451
struct userspace_mem_region *slot0;
452
struct kvm_vm *vm;
453
int i, flags;
454
455
kvm_set_files_rlimit(nr_runnable_vcpus);
456
457
pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__,
458
vm_guest_mode_string(shape.mode), shape.type, nr_pages);
459
460
vm = ____vm_create(shape);
461
462
/*
463
* Force GUEST_MEMFD for the primary memory region if necessary, e.g.
464
* for CoCo VMs that require GUEST_MEMFD backed private memory.
465
*/
466
flags = 0;
467
if (is_guest_memfd_required(shape))
468
flags |= KVM_MEM_GUEST_MEMFD;
469
470
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, flags);
471
for (i = 0; i < NR_MEM_REGIONS; i++)
472
vm->memslots[i] = 0;
473
474
kvm_vm_elf_load(vm, program_invocation_name);
475
476
/*
477
* TODO: Add proper defines to protect the library's memslots, and then
478
* carve out memslot1 for the ucall MMIO address. KVM treats writes to
479
* read-only memslots as MMIO, and creating a read-only memslot for the
480
* MMIO region would prevent silently clobbering the MMIO region.
481
*/
482
slot0 = memslot2region(vm, 0);
483
ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
484
485
if (guest_random_seed != last_guest_seed) {
486
pr_info("Random seed: 0x%x\n", guest_random_seed);
487
last_guest_seed = guest_random_seed;
488
}
489
guest_rng = new_guest_random_state(guest_random_seed);
490
sync_global_to_guest(vm, guest_rng);
491
492
kvm_arch_vm_post_create(vm, nr_runnable_vcpus);
493
494
return vm;
495
}
496
497
/*
498
* VM Create with customized parameters
499
*
500
* Input Args:
501
* shape - VM shape: guest mode (e.g. VM_MODE_P52V48_4K) and VM type
502
* nr_vcpus - VCPU count
503
* extra_mem_pages - Non-slot0 physical memory total size
504
* guest_code - Guest entry point
505
* vcpus - Array to be populated with the created vCPUs
506
*
507
* Output Args: None
508
*
509
* Return:
510
* Pointer to opaque structure that describes the created VM.
511
*
512
* Creates a VM with the mode specified by shape.mode (e.g. VM_MODE_P52V48_4K).
513
* extra_mem_pages is only used to calculate the maximum page table size,
514
* no real memory allocation for non-slot0 memory in this function.
515
*/
516
struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus,
517
uint64_t extra_mem_pages,
518
void *guest_code, struct kvm_vcpu *vcpus[])
519
{
520
struct kvm_vm *vm;
521
int i;
522
523
TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array");
524
525
vm = __vm_create(shape, nr_vcpus, extra_mem_pages);
526
527
for (i = 0; i < nr_vcpus; ++i)
528
vcpus[i] = vm_vcpu_add(vm, i, guest_code);
529
530
kvm_arch_vm_finalize_vcpus(vm);
531
return vm;
532
}
533
534
struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape,
535
struct kvm_vcpu **vcpu,
536
uint64_t extra_mem_pages,
537
void *guest_code)
538
{
539
struct kvm_vcpu *vcpus[1];
540
struct kvm_vm *vm;
541
542
vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus);
543
544
*vcpu = vcpus[0];
545
return vm;
546
}
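/*
 * Illustrative sketch (editor's addition): the typical way a test obtains a
 * VM plus one vCPU, runs it, and tears it down. VM_SHAPE_DEFAULT and
 * GUEST_DONE() are assumed to come from kvm_util.h/ucall_common.h;
 * guest_main() is a hypothetical guest function.
 *
 *	static void guest_main(void)
 *	{
 *		GUEST_DONE();
 *	}
 *
 *	...
 *	struct kvm_vcpu *vcpu;
 *	struct kvm_vm *vm;
 *
 *	vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 0, guest_main);
 *	vcpu_run(vcpu);
 *	kvm_vm_free(vm);
 */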
547
548
/*
549
* VM Restart
550
*
551
* Input Args:
552
* vm - VM that has been released before
553
*
554
* Output Args: None
555
*
556
* Reopens the file descriptors associated with the VM and reinstates the
557
* global state, such as the irqchip and the memory regions that are mapped
558
* into the guest.
559
*/
560
void kvm_vm_restart(struct kvm_vm *vmp)
561
{
562
int ctr;
563
struct userspace_mem_region *region;
564
565
vm_open(vmp);
566
if (vmp->has_irqchip)
567
vm_create_irqchip(vmp);
568
569
hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) {
570
int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, &region->region);
571
572
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
573
" rc: %i errno: %i\n"
574
" slot: %u flags: 0x%x\n"
575
" guest_phys_addr: 0x%llx size: 0x%llx",
576
ret, errno, region->region.slot,
577
region->region.flags,
578
region->region.guest_phys_addr,
579
region->region.memory_size);
580
}
581
}
582
583
__weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm,
584
uint32_t vcpu_id)
585
{
586
return __vm_vcpu_add(vm, vcpu_id);
587
}
588
589
struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm)
590
{
591
kvm_vm_restart(vm);
592
593
return vm_vcpu_recreate(vm, 0);
594
}
595
596
int __pin_task_to_cpu(pthread_t task, int cpu)
597
{
598
cpu_set_t cpuset;
599
600
CPU_ZERO(&cpuset);
601
CPU_SET(cpu, &cpuset);
602
603
return pthread_setaffinity_np(task, sizeof(cpuset), &cpuset);
604
}
605
606
static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask)
607
{
608
uint32_t pcpu = atoi_non_negative("CPU number", cpu_str);
609
610
TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask),
611
"Not allowed to run on pCPU '%d', check cgroups?", pcpu);
612
return pcpu;
613
}
614
615
void kvm_print_vcpu_pinning_help(void)
616
{
617
const char *name = program_invocation_name;
618
619
printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n"
620
" values (target pCPU), one for each vCPU, plus an optional\n"
621
" entry for the main application task (specified via entry\n"
622
" <nr_vcpus + 1>). If used, entries must be provided for all\n"
623
" vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
624
" E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
625
" vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
626
" %s -v 3 -c 22,23,24,50\n\n"
627
" To leave the application task unpinned, drop the final entry:\n\n"
628
" %s -v 3 -c 22,23,24\n\n"
629
" (default: no pinning)\n", name, name);
630
}
631
632
void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
633
int nr_vcpus)
634
{
635
cpu_set_t allowed_mask;
636
char *cpu, *cpu_list;
637
char delim[2] = ",";
638
int i, r;
639
640
cpu_list = strdup(pcpus_string);
641
TEST_ASSERT(cpu_list, "strdup() allocation failed.");
642
643
r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask);
644
TEST_ASSERT(!r, "sched_getaffinity() failed");
645
646
cpu = strtok(cpu_list, delim);
647
648
/* 1. Get all pcpus for vcpus. */
649
for (i = 0; i < nr_vcpus; i++) {
650
TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'", i);
651
vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask);
652
cpu = strtok(NULL, delim);
653
}
654
655
/* 2. Check if the main worker needs to be pinned. */
656
if (cpu) {
657
pin_self_to_cpu(parse_pcpu(cpu, &allowed_mask));
658
cpu = strtok(NULL, delim);
659
}
660
661
TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu);
662
free(cpu_list);
663
}
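/*
 * Illustrative sketch (editor's addition): parsing a '-c' style pinning
 * argument for a test with three vCPUs, matching the help text printed by
 * kvm_print_vcpu_pinning_help(). The pCPU numbers are arbitrary examples.
 *
 *	uint32_t vcpu_to_pcpu[3];
 *
 *	// "22,23,24,50": one pCPU per vCPU, plus an optional trailing entry
 *	// that pins the main application task (here to pCPU50).
 *	kvm_parse_vcpu_pinning("22,23,24,50", vcpu_to_pcpu, 3);
 */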
664
665
/*
666
* Userspace Memory Region Find
667
*
668
* Input Args:
669
* vm - Virtual Machine
670
* start - Starting VM physical address
671
* end - Ending VM physical address, inclusive.
672
*
673
* Output Args: None
674
*
675
* Return:
676
* Pointer to overlapping region, NULL if no such region.
677
*
678
* Searches for a region with any physical memory that overlaps with
679
* any portion of the guest physical addresses from start to end
680
* inclusive. If multiple overlapping regions exist, a pointer to any
681
* of the regions is returned. Null is returned only when no overlapping
682
* region exists.
683
*/
684
static struct userspace_mem_region *
685
userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
686
{
687
struct rb_node *node;
688
689
for (node = vm->regions.gpa_tree.rb_node; node; ) {
690
struct userspace_mem_region *region =
691
container_of(node, struct userspace_mem_region, gpa_node);
692
uint64_t existing_start = region->region.guest_phys_addr;
693
uint64_t existing_end = region->region.guest_phys_addr
694
+ region->region.memory_size - 1;
695
if (start <= existing_end && end >= existing_start)
696
return region;
697
698
if (start < existing_start)
699
node = node->rb_left;
700
else
701
node = node->rb_right;
702
}
703
704
return NULL;
705
}
706
707
static void kvm_stats_release(struct kvm_binary_stats *stats)
708
{
709
if (stats->fd < 0)
710
return;
711
712
if (stats->desc) {
713
free(stats->desc);
714
stats->desc = NULL;
715
}
716
717
kvm_close(stats->fd);
718
stats->fd = -1;
719
}
720
721
__weak void vcpu_arch_free(struct kvm_vcpu *vcpu)
722
{
723
724
}
725
726
/*
727
* VM VCPU Remove
728
*
729
* Input Args:
730
* vcpu - VCPU to remove
731
*
732
* Output Args: None
733
*
734
* Return: None, TEST_ASSERT failures for all error conditions
735
*
736
* Removes a vCPU from a VM and frees its resources.
737
*/
738
static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
739
{
740
if (vcpu->dirty_gfns) {
741
kvm_munmap(vcpu->dirty_gfns, vm->dirty_ring_size);
742
vcpu->dirty_gfns = NULL;
743
}
744
745
kvm_munmap(vcpu->run, vcpu_mmap_sz());
746
747
kvm_close(vcpu->fd);
748
kvm_stats_release(&vcpu->stats);
749
750
list_del(&vcpu->list);
751
752
vcpu_arch_free(vcpu);
753
free(vcpu);
754
}
755
756
void kvm_vm_release(struct kvm_vm *vmp)
757
{
758
struct kvm_vcpu *vcpu, *tmp;
759
760
list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
761
vm_vcpu_rm(vmp, vcpu);
762
763
kvm_close(vmp->fd);
764
kvm_close(vmp->kvm_fd);
765
766
/* Free cached stats metadata and close FD */
767
kvm_stats_release(&vmp->stats);
768
769
kvm_arch_vm_release(vmp);
770
}
771
772
static void __vm_mem_region_delete(struct kvm_vm *vm,
773
struct userspace_mem_region *region)
774
{
775
rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
776
rb_erase(&region->hva_node, &vm->regions.hva_tree);
777
hash_del(&region->slot_node);
778
779
sparsebit_free(&region->unused_phy_pages);
780
sparsebit_free(&region->protected_phy_pages);
781
kvm_munmap(region->mmap_start, region->mmap_size);
782
if (region->fd >= 0) {
783
/* There's an extra map when using shared memory. */
784
kvm_munmap(region->mmap_alias, region->mmap_size);
785
close(region->fd);
786
}
787
if (region->region.guest_memfd >= 0)
788
close(region->region.guest_memfd);
789
790
free(region);
791
}
792
793
/*
794
* Destroys and frees the VM pointed to by vmp.
795
*/
796
void kvm_vm_free(struct kvm_vm *vmp)
797
{
798
int ctr;
799
struct hlist_node *node;
800
struct userspace_mem_region *region;
801
802
if (vmp == NULL)
803
return;
804
805
/* Free userspace_mem_regions. */
806
hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node)
807
__vm_mem_region_delete(vmp, region);
808
809
/* Free sparsebit arrays. */
810
sparsebit_free(&vmp->vpages_valid);
811
sparsebit_free(&vmp->vpages_mapped);
812
813
kvm_vm_release(vmp);
814
815
/* Free the structure describing the VM. */
816
free(vmp);
817
}
818
819
int kvm_memfd_alloc(size_t size, bool hugepages)
820
{
821
int memfd_flags = MFD_CLOEXEC;
822
int fd;
823
824
if (hugepages)
825
memfd_flags |= MFD_HUGETLB;
826
827
fd = memfd_create("kvm_selftest", memfd_flags);
828
TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd));
829
830
kvm_ftruncate(fd, size);
831
kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size);
832
833
return fd;
834
}
835
836
static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree,
837
struct userspace_mem_region *region)
838
{
839
struct rb_node **cur, *parent;
840
841
for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) {
842
struct userspace_mem_region *cregion;
843
844
cregion = container_of(*cur, typeof(*cregion), gpa_node);
845
parent = *cur;
846
if (region->region.guest_phys_addr <
847
cregion->region.guest_phys_addr)
848
cur = &(*cur)->rb_left;
849
else {
850
TEST_ASSERT(region->region.guest_phys_addr !=
851
cregion->region.guest_phys_addr,
852
"Duplicate GPA in region tree");
853
854
cur = &(*cur)->rb_right;
855
}
856
}
857
858
rb_link_node(&region->gpa_node, parent, cur);
859
rb_insert_color(&region->gpa_node, gpa_tree);
860
}
861
862
static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree,
863
struct userspace_mem_region *region)
864
{
865
struct rb_node **cur, *parent;
866
867
for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) {
868
struct userspace_mem_region *cregion;
869
870
cregion = container_of(*cur, typeof(*cregion), hva_node);
871
parent = *cur;
872
if (region->host_mem < cregion->host_mem)
873
cur = &(*cur)->rb_left;
874
else {
875
TEST_ASSERT(region->host_mem !=
876
cregion->host_mem,
877
"Duplicate HVA in region tree");
878
879
cur = &(*cur)->rb_right;
880
}
881
}
882
883
rb_link_node(&region->hva_node, parent, cur);
884
rb_insert_color(&region->hva_node, hva_tree);
885
}
886
887
888
int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
889
uint64_t gpa, uint64_t size, void *hva)
890
{
891
struct kvm_userspace_memory_region region = {
892
.slot = slot,
893
.flags = flags,
894
.guest_phys_addr = gpa,
895
.memory_size = size,
896
.userspace_addr = (uintptr_t)hva,
897
};
898
899
return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region);
900
}
901
902
void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
903
uint64_t gpa, uint64_t size, void *hva)
904
{
905
int ret = __vm_set_user_memory_region(vm, slot, flags, gpa, size, hva);
906
907
TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)",
908
errno, strerror(errno));
909
}
910
911
#define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \
912
__TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \
913
"KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)")
914
915
int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
916
uint64_t gpa, uint64_t size, void *hva,
917
uint32_t guest_memfd, uint64_t guest_memfd_offset)
918
{
919
struct kvm_userspace_memory_region2 region = {
920
.slot = slot,
921
.flags = flags,
922
.guest_phys_addr = gpa,
923
.memory_size = size,
924
.userspace_addr = (uintptr_t)hva,
925
.guest_memfd = guest_memfd,
926
.guest_memfd_offset = guest_memfd_offset,
927
};
928
929
TEST_REQUIRE_SET_USER_MEMORY_REGION2();
930
931
return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, &region);
932
}
933
934
void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
935
uint64_t gpa, uint64_t size, void *hva,
936
uint32_t guest_memfd, uint64_t guest_memfd_offset)
937
{
938
int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva,
939
guest_memfd, guest_memfd_offset);
940
941
TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)",
942
errno, strerror(errno));
943
}
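/*
 * Illustrative sketch (editor's addition): driving KVM_SET_USER_MEMORY_REGION
 * directly, e.g. for memslot API tests that manage their own backing memory.
 * The slot number, GPA and size are arbitrary example values.
 *
 *	size_t size = 2 * 1024 * 1024;
 *	void *hva = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	vm_set_user_memory_region(vm, 10, 0, 0x10000000, size, hva);
 *	// Deleting a slot == setting its size to 0.
 *	vm_set_user_memory_region(vm, 10, 0, 0x10000000, 0, NULL);
 */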
944
945
946
/* FIXME: This thing needs to be ripped apart and rewritten. */
947
void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
948
uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags,
949
int guest_memfd, uint64_t guest_memfd_offset)
950
{
951
int ret;
952
struct userspace_mem_region *region;
953
size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
954
size_t mem_size = npages * vm->page_size;
955
size_t alignment;
956
957
TEST_REQUIRE_SET_USER_MEMORY_REGION2();
958
959
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
960
"Number of guest pages is not compatible with the host. "
961
"Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
962
963
TEST_ASSERT((gpa % vm->page_size) == 0, "Guest physical "
964
"address not on a page boundary.\n"
965
" gpa: 0x%lx vm->page_size: 0x%x",
966
gpa, vm->page_size);
967
TEST_ASSERT((((gpa >> vm->page_shift) + npages) - 1)
968
<= vm->max_gfn, "Physical range beyond maximum "
969
"supported physical address,\n"
970
" gpa: 0x%lx npages: 0x%lx\n"
971
" vm->max_gfn: 0x%lx vm->page_size: 0x%x",
972
gpa, npages, vm->max_gfn, vm->page_size);
973
974
/*
975
* Confirm a mem region with an overlapping address doesn't
976
* already exist.
977
*/
978
region = (struct userspace_mem_region *) userspace_mem_region_find(
979
vm, gpa, (gpa + npages * vm->page_size) - 1);
980
if (region != NULL)
981
TEST_FAIL("overlapping userspace_mem_region already "
982
"exists\n"
983
" requested gpa: 0x%lx npages: 0x%lx page_size: 0x%x\n"
984
" existing gpa: 0x%lx size: 0x%lx",
985
gpa, npages, vm->page_size,
986
(uint64_t) region->region.guest_phys_addr,
987
(uint64_t) region->region.memory_size);
988
989
/* Confirm no region with the requested slot already exists. */
990
hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
991
slot) {
992
if (region->region.slot != slot)
993
continue;
994
995
TEST_FAIL("A mem region with the requested slot "
996
"already exists.\n"
997
" requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
998
" existing slot: %u paddr: 0x%lx size: 0x%lx",
999
slot, gpa, npages, region->region.slot,
1000
(uint64_t) region->region.guest_phys_addr,
1001
(uint64_t) region->region.memory_size);
1002
}
1003
1004
/* Allocate and initialize new mem region structure. */
1005
region = calloc(1, sizeof(*region));
1006
TEST_ASSERT(region != NULL, "Insufficient Memory");
1007
region->mmap_size = mem_size;
1008
1009
#ifdef __s390x__
1010
/* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
1011
alignment = 0x100000;
1012
#else
1013
alignment = 1;
1014
#endif
1015
1016
/*
1017
* When using THP, mmap is not guaranteed to return a hugepage aligned
1018
* address so we have to pad the mmap. Padding is not needed for HugeTLB
1019
* because mmap will always return an address aligned to the HugeTLB
1020
* page size.
1021
*/
1022
if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
1023
alignment = max(backing_src_pagesz, alignment);
1024
1025
TEST_ASSERT_EQ(gpa, align_up(gpa, backing_src_pagesz));
1026
1027
/* Add enough memory to align up if necessary */
1028
if (alignment > 1)
1029
region->mmap_size += alignment;
1030
1031
region->fd = -1;
1032
if (backing_src_is_shared(src_type))
1033
region->fd = kvm_memfd_alloc(region->mmap_size,
1034
src_type == VM_MEM_SRC_SHARED_HUGETLB);
1035
1036
region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
1037
vm_mem_backing_src_alias(src_type)->flag,
1038
region->fd);
1039
1040
TEST_ASSERT(!is_backing_src_hugetlb(src_type) ||
1041
region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz),
1042
"mmap_start %p is not aligned to HugeTLB page size 0x%lx",
1043
region->mmap_start, backing_src_pagesz);
1044
1045
/* Align host address */
1046
region->host_mem = align_ptr_up(region->mmap_start, alignment);
1047
1048
/* As needed perform madvise */
1049
if ((src_type == VM_MEM_SRC_ANONYMOUS ||
1050
src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
1051
ret = madvise(region->host_mem, mem_size,
1052
src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
1053
TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
1054
region->host_mem, mem_size,
1055
vm_mem_backing_src_alias(src_type)->name);
1056
}
1057
1058
region->backing_src_type = src_type;
1059
1060
if (flags & KVM_MEM_GUEST_MEMFD) {
1061
if (guest_memfd < 0) {
1062
uint32_t guest_memfd_flags = 0;
1063
TEST_ASSERT(!guest_memfd_offset,
1064
"Offset must be zero when creating new guest_memfd");
1065
guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
1066
} else {
1067
/*
1068
* Install a unique fd for each memslot so that the fd
1069
* can be closed when the region is deleted without
1070
* needing to track if the fd is owned by the framework
1071
* or by the caller.
1072
*/
1073
guest_memfd = kvm_dup(guest_memfd);
1074
}
1075
1076
region->region.guest_memfd = guest_memfd;
1077
region->region.guest_memfd_offset = guest_memfd_offset;
1078
} else {
1079
region->region.guest_memfd = -1;
1080
}
1081
1082
region->unused_phy_pages = sparsebit_alloc();
1083
if (vm_arch_has_protected_memory(vm))
1084
region->protected_phy_pages = sparsebit_alloc();
1085
sparsebit_set_num(region->unused_phy_pages, gpa >> vm->page_shift, npages);
1086
region->region.slot = slot;
1087
region->region.flags = flags;
1088
region->region.guest_phys_addr = gpa;
1089
region->region.memory_size = npages * vm->page_size;
1090
region->region.userspace_addr = (uintptr_t) region->host_mem;
1091
ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1092
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
1093
" rc: %i errno: %i\n"
1094
" slot: %u flags: 0x%x\n"
1095
" guest_phys_addr: 0x%lx size: 0x%llx guest_memfd: %d",
1096
ret, errno, slot, flags, gpa, region->region.memory_size,
1097
region->region.guest_memfd);
1098
1099
/* Add to quick lookup data structures */
1100
vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
1101
vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
1102
hash_add(vm->regions.slot_hash, &region->slot_node, slot);
1103
1104
/* If shared memory, create an alias. */
1105
if (region->fd >= 0) {
1106
region->mmap_alias = kvm_mmap(region->mmap_size,
1107
PROT_READ | PROT_WRITE,
1108
vm_mem_backing_src_alias(src_type)->flag,
1109
region->fd);
1110
1111
/* Align host alias address */
1112
region->host_alias = align_ptr_up(region->mmap_alias, alignment);
1113
}
1114
}
1115
1116
void vm_userspace_mem_region_add(struct kvm_vm *vm,
1117
enum vm_mem_backing_src_type src_type,
1118
uint64_t gpa, uint32_t slot, uint64_t npages,
1119
uint32_t flags)
1120
{
1121
vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0);
1122
}
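/*
 * Illustrative sketch (editor's addition): adding an extra anonymous memslot
 * to an existing VM and touching it from the host. The GPA, slot and page
 * count are arbitrary example values; addr_gpa2hva() is defined later in
 * this file.
 *
 *	#define EXTRA_SLOT	1
 *	#define EXTRA_GPA	0xc0000000
 *
 *	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
 *				    EXTRA_GPA, EXTRA_SLOT, 16, 0);
 *	memset(addr_gpa2hva(vm, EXTRA_GPA), 0xaa, 16 * vm->page_size);
 */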
1123
1124
/*
1125
* Memslot to region
1126
*
1127
* Input Args:
1128
* vm - Virtual Machine
1129
* memslot - KVM memory slot ID
1130
*
1131
* Output Args: None
1132
*
1133
* Return:
1134
* Pointer to the memory region structure that describes the memory region
1135
* using kvm memory slot ID given by memslot. TEST_ASSERT failure
1136
* on error (e.g. currently no memory region using memslot as a KVM
1137
* memory slot ID).
1138
*/
1139
struct userspace_mem_region *
1140
memslot2region(struct kvm_vm *vm, uint32_t memslot)
1141
{
1142
struct userspace_mem_region *region;
1143
1144
hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
1145
memslot)
1146
if (region->region.slot == memslot)
1147
return region;
1148
1149
fprintf(stderr, "No mem region with the requested slot found,\n"
1150
" requested slot: %u\n", memslot);
1151
fputs("---- vm dump ----\n", stderr);
1152
vm_dump(stderr, vm, 2);
1153
TEST_FAIL("Mem region not found");
1154
return NULL;
1155
}
1156
1157
/*
1158
* VM Memory Region Flags Set
1159
*
1160
* Input Args:
1161
* vm - Virtual Machine
1162
* slot - Slot of the memory region to modify
* flags - Flags to set for the memory region
1163
*
1164
* Output Args: None
1165
*
1166
* Return: None
1167
*
1168
* Sets the flags of the memory region specified by the value of slot,
1169
* to the values given by flags.
1170
*/
1171
void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
1172
{
1173
int ret;
1174
struct userspace_mem_region *region;
1175
1176
region = memslot2region(vm, slot);
1177
1178
region->region.flags = flags;
1179
1180
ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1181
1182
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
1183
" rc: %i errno: %i slot: %u flags: 0x%x",
1184
ret, errno, slot, flags);
1185
}
1186
1187
void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot)
1188
{
1189
struct userspace_mem_region *region = memslot2region(vm, slot);
1190
struct kvm_userspace_memory_region2 tmp = region->region;
1191
1192
tmp.memory_size = 0;
1193
vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &tmp);
1194
vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1195
}
1196
1197
/*
1198
* VM Memory Region Move
1199
*
1200
* Input Args:
1201
* vm - Virtual Machine
1202
* slot - Slot of the memory region to move
1203
* new_gpa - Starting guest physical address
1204
*
1205
* Output Args: None
1206
*
1207
* Return: None
1208
*
1209
* Change the gpa of a memory region.
1210
*/
1211
void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
1212
{
1213
struct userspace_mem_region *region;
1214
int ret;
1215
1216
region = memslot2region(vm, slot);
1217
1218
region->region.guest_phys_addr = new_gpa;
1219
1220
ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1221
1222
TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n"
1223
"ret: %i errno: %i slot: %u new_gpa: 0x%lx",
1224
ret, errno, slot, new_gpa);
1225
}
1226
1227
/*
1228
* VM Memory Region Delete
1229
*
1230
* Input Args:
1231
* vm - Virtual Machine
1232
* slot - Slot of the memory region to delete
1233
*
1234
* Output Args: None
1235
*
1236
* Return: None
1237
*
1238
* Delete a memory region.
1239
*/
1240
void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
1241
{
1242
struct userspace_mem_region *region = memslot2region(vm, slot);
1243
1244
region->region.memory_size = 0;
1245
vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1246
1247
__vm_mem_region_delete(vm, region);
1248
}
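/*
 * Illustrative sketch (editor's addition): the flags/move/delete helpers all
 * operate on an existing slot by ID, e.g. on a slot created as in the
 * example above (slot, flag and address values are arbitrary):
 *
 *	vm_mem_region_set_flags(vm, EXTRA_SLOT, KVM_MEM_LOG_DIRTY_PAGES);
 *	vm_mem_region_move(vm, EXTRA_SLOT, 0xd0000000);
 *	vm_mem_region_delete(vm, EXTRA_SLOT);
 */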
1249
1250
void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
1251
bool punch_hole)
1252
{
1253
const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0);
1254
struct userspace_mem_region *region;
1255
uint64_t end = base + size;
1256
uint64_t gpa, len;
1257
off_t fd_offset;
1258
int ret;
1259
1260
for (gpa = base; gpa < end; gpa += len) {
1261
uint64_t offset;
1262
1263
region = userspace_mem_region_find(vm, gpa, gpa);
1264
TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
1265
"Private memory region not found for GPA 0x%lx", gpa);
1266
1267
offset = gpa - region->region.guest_phys_addr;
1268
fd_offset = region->region.guest_memfd_offset + offset;
1269
len = min_t(uint64_t, end - gpa, region->region.memory_size - offset);
1270
1271
ret = fallocate(region->region.guest_memfd, mode, fd_offset, len);
1272
TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx",
1273
punch_hole ? "punch hole" : "allocate", gpa, len,
1274
region->region.guest_memfd, mode, fd_offset);
1275
}
1276
}
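/*
 * Illustrative sketch (editor's addition): for a guest_memfd-backed (private)
 * region, backing pages can be allocated or discarded by GPA range, e.g. to
 * emulate conversions in private memory tests (gpa and size are arbitrary):
 *
 *	vm_guest_mem_fallocate(vm, gpa, 0x200000, false);	// allocate
 *	vm_guest_mem_fallocate(vm, gpa, 0x200000, true);	// punch hole
 */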
1277
1278
/* Returns the size of a vCPU's kvm_run structure. */
1279
static size_t vcpu_mmap_sz(void)
1280
{
1281
int dev_fd, ret;
1282
1283
dev_fd = open_kvm_dev_path_or_exit();
1284
1285
ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
1286
TEST_ASSERT(ret >= 0 && ret >= sizeof(struct kvm_run),
1287
KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret));
1288
1289
close(dev_fd);
1290
1291
return ret;
1292
}
1293
1294
static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id)
1295
{
1296
struct kvm_vcpu *vcpu;
1297
1298
list_for_each_entry(vcpu, &vm->vcpus, list) {
1299
if (vcpu->id == vcpu_id)
1300
return true;
1301
}
1302
1303
return false;
1304
}
1305
1306
/*
1307
* Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id.
1308
* No additional vCPU setup is done. Returns the vCPU.
1309
*/
1310
struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
1311
{
1312
struct kvm_vcpu *vcpu;
1313
1314
/* Confirm a vcpu with the specified id doesn't already exist. */
1315
TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists", vcpu_id);
1316
1317
/* Allocate and initialize new vcpu structure. */
1318
vcpu = calloc(1, sizeof(*vcpu));
1319
TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
1320
1321
vcpu->vm = vm;
1322
vcpu->id = vcpu_id;
1323
vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id);
1324
TEST_ASSERT_VM_VCPU_IOCTL(vcpu->fd >= 0, KVM_CREATE_VCPU, vcpu->fd, vm);
1325
1326
TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size "
1327
"smaller than expected, vcpu_mmap_sz: %zi expected_min: %zi",
1328
vcpu_mmap_sz(), sizeof(*vcpu->run));
1329
vcpu->run = kvm_mmap(vcpu_mmap_sz(), PROT_READ | PROT_WRITE,
1330
MAP_SHARED, vcpu->fd);
1331
1332
if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
1333
vcpu->stats.fd = vcpu_get_stats_fd(vcpu);
1334
else
1335
vcpu->stats.fd = -1;
1336
1337
/* Add to linked-list of VCPUs. */
1338
list_add(&vcpu->list, &vm->vcpus);
1339
1340
return vcpu;
1341
}
1342
1343
/*
1344
* VM Virtual Address Unused Gap
1345
*
1346
* Input Args:
1347
* vm - Virtual Machine
1348
* sz - Size (bytes)
1349
* vaddr_min - Minimum Virtual Address
1350
*
1351
* Output Args: None
1352
*
1353
* Return:
1354
* Lowest virtual address at or below vaddr_min, with at least
1355
* sz unused bytes. TEST_ASSERT failure if no area of at least
1356
* size sz is available.
1357
*
1358
* Within the VM specified by vm, locates the lowest starting virtual
1359
* address >= vaddr_min, that has at least sz unallocated bytes. A
1360
* TEST_ASSERT failure occurs for invalid input or no area of at least
1361
* sz unallocated bytes >= vaddr_min is available.
1362
*/
1363
vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
1364
vm_vaddr_t vaddr_min)
1365
{
1366
uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
1367
1368
/* Determine lowest permitted virtual page index. */
1369
uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
1370
if ((pgidx_start * vm->page_size) < vaddr_min)
1371
goto no_va_found;
1372
1373
/* Loop over section with enough valid virtual page indexes. */
1374
if (!sparsebit_is_set_num(vm->vpages_valid,
1375
pgidx_start, pages))
1376
pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
1377
pgidx_start, pages);
1378
do {
1379
/*
1380
* Are there enough unused virtual pages available at
1381
* the currently proposed starting virtual page index.
1382
* If not, adjust proposed starting index to next
1383
* possible.
1384
*/
1385
if (sparsebit_is_clear_num(vm->vpages_mapped,
1386
pgidx_start, pages))
1387
goto va_found;
1388
pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
1389
pgidx_start, pages);
1390
if (pgidx_start == 0)
1391
goto no_va_found;
1392
1393
/*
1394
* If needed, adjust proposed starting virtual address,
1395
* to next range of valid virtual addresses.
1396
*/
1397
if (!sparsebit_is_set_num(vm->vpages_valid,
1398
pgidx_start, pages)) {
1399
pgidx_start = sparsebit_next_set_num(
1400
vm->vpages_valid, pgidx_start, pages);
1401
if (pgidx_start == 0)
1402
goto no_va_found;
1403
}
1404
} while (pgidx_start != 0);
1405
1406
no_va_found:
1407
TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages);
1408
1409
/* NOT REACHED */
1410
return -1;
1411
1412
va_found:
1413
TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
1414
pgidx_start, pages),
1415
"Unexpected, invalid virtual page index range,\n"
1416
" pgidx_start: 0x%lx\n"
1417
" pages: 0x%lx",
1418
pgidx_start, pages);
1419
TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
1420
pgidx_start, pages),
1421
"Unexpected, pages already mapped,\n"
1422
" pgidx_start: 0x%lx\n"
1423
" pages: 0x%lx",
1424
pgidx_start, pages);
1425
1426
return pgidx_start * vm->page_size;
1427
}
1428
1429
static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz,
1430
vm_vaddr_t vaddr_min,
1431
enum kvm_mem_region_type type,
1432
bool protected)
1433
{
1434
uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
1435
1436
virt_pgd_alloc(vm);
1437
vm_paddr_t paddr = __vm_phy_pages_alloc(vm, pages,
1438
KVM_UTIL_MIN_PFN * vm->page_size,
1439
vm->memslots[type], protected);
1440
1441
/*
1442
* Find an unused range of virtual page addresses of at least
1443
* pages in length.
1444
*/
1445
vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
1446
1447
/* Map the virtual pages. */
1448
for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
1449
pages--, vaddr += vm->page_size, paddr += vm->page_size) {
1450
1451
virt_pg_map(vm, vaddr, paddr);
1452
}
1453
1454
return vaddr_start;
1455
}
1456
1457
vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
1458
enum kvm_mem_region_type type)
1459
{
1460
return ____vm_vaddr_alloc(vm, sz, vaddr_min, type,
1461
vm_arch_has_protected_memory(vm));
1462
}
1463
1464
vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz,
1465
vm_vaddr_t vaddr_min,
1466
enum kvm_mem_region_type type)
1467
{
1468
return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, false);
1469
}
1470
1471
/*
1472
* VM Virtual Address Allocate
1473
*
1474
* Input Args:
1475
* vm - Virtual Machine
1476
* sz - Size in bytes
1477
* vaddr_min - Minimum starting virtual address
1478
*
1479
* Output Args: None
1480
*
1481
* Return:
1482
* Starting guest virtual address
1483
*
1484
* Allocates at least sz bytes within the virtual address space of the vm
1485
* given by vm. The allocated bytes are mapped to a virtual address >=
1486
* the address given by vaddr_min. Note that each allocation uses
1487
* a unique set of pages, with the minimum real allocation being at least
1488
* a page. The allocated physical space comes from the TEST_DATA memory region.
1489
*/
1490
vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
1491
{
1492
return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA);
1493
}
1494
1495
/*
1496
* VM Virtual Address Allocate Pages
1497
*
1498
* Input Args:
1499
* vm - Virtual Machine
1500
*
1501
* Output Args: None
1502
*
1503
* Return:
1504
* Starting guest virtual address
1505
*
1506
* Allocates at least N system pages worth of bytes within the virtual address
1507
* space of the vm.
1508
*/
1509
vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages)
1510
{
1511
return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR);
1512
}
1513
1514
vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type)
1515
{
1516
return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type);
1517
}
1518
1519
/*
1520
* VM Virtual Address Allocate Page
1521
*
1522
* Input Args:
1523
* vm - Virtual Machine
1524
*
1525
* Output Args: None
1526
*
1527
* Return:
1528
* Starting guest virtual address
1529
*
1530
* Allocates at least one system page worth of bytes within the virtual address
1531
* space of the vm.
1532
*/
1533
vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm)
1534
{
1535
return vm_vaddr_alloc_pages(vm, 1);
1536
}
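/*
 * Illustrative sketch (editor's addition): allocating guest virtual memory
 * from the host side and initializing it before the guest runs.
 * addr_gva2hva() is assumed to be provided by the library headers/per-arch
 * code rather than this file.
 *
 *	vm_vaddr_t gva = vm_vaddr_alloc_pages(vm, 2);
 *
 *	// Write through the host mapping of the freshly mapped pages.
 *	memset(addr_gva2hva(vm, gva), 0, 2 * getpagesize());
 */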
1537
1538
/*
1539
* Map a range of VM virtual address to the VM's physical address
1540
*
1541
* Input Args:
1542
* vm - Virtual Machine
1543
* vaddr - Virtual address to map
1544
* paddr - VM Physical Address
1545
* npages - The number of pages to map
1546
*
1547
* Output Args: None
1548
*
1549
* Return: None
1550
*
1551
* Within the VM given by @vm, creates a virtual translation for
1552
* @npages starting at @vaddr to the page range starting at @paddr.
1553
*/
1554
void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
1555
unsigned int npages)
1556
{
1557
size_t page_size = vm->page_size;
1558
size_t size = npages * page_size;
1559
1560
TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow");
1561
TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
1562
1563
while (npages--) {
1564
virt_pg_map(vm, vaddr, paddr);
1565
1566
vaddr += page_size;
1567
paddr += page_size;
1568
}
1569
}
1570
1571
/*
1572
* Address VM Physical to Host Virtual
1573
*
1574
* Input Args:
1575
* vm - Virtual Machine
1576
* gpa - VM physical address
1577
*
1578
* Output Args: None
1579
*
1580
* Return:
1581
* Equivalent host virtual address
1582
*
1583
* Locates the memory region containing the VM physical address given
1584
* by gpa, within the VM given by vm. When found, the host virtual
1585
* address providing the memory to the vm physical address is returned.
1586
* A TEST_ASSERT failure occurs if no region containing gpa exists.
1587
*/
1588
void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
1589
{
1590
struct userspace_mem_region *region;
1591
1592
gpa = vm_untag_gpa(vm, gpa);
1593
1594
region = userspace_mem_region_find(vm, gpa, gpa);
1595
if (!region) {
1596
TEST_FAIL("No vm physical memory at 0x%lx", gpa);
1597
return NULL;
1598
}
1599
1600
return (void *)((uintptr_t)region->host_mem
1601
+ (gpa - region->region.guest_phys_addr));
1602
}
1603
1604
/*
1605
* Address Host Virtual to VM Physical
1606
*
1607
* Input Args:
1608
* vm - Virtual Machine
1609
* hva - Host virtual address
1610
*
1611
* Output Args: None
1612
*
1613
* Return:
1614
* Equivalent VM physical address
1615
*
1616
* Locates the memory region containing the host virtual address given
1617
* by hva, within the VM given by vm. When found, the equivalent
1618
* VM physical address is returned. A TEST_ASSERT failure occurs if no
1619
* region containing hva exists.
1620
*/
1621
vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
1622
{
1623
struct rb_node *node;
1624
1625
for (node = vm->regions.hva_tree.rb_node; node; ) {
1626
struct userspace_mem_region *region =
1627
container_of(node, struct userspace_mem_region, hva_node);
1628
1629
if (hva >= region->host_mem) {
1630
if (hva <= (region->host_mem
1631
+ region->region.memory_size - 1))
1632
return (vm_paddr_t)((uintptr_t)
1633
region->region.guest_phys_addr
1634
+ (hva - (uintptr_t)region->host_mem));
1635
1636
node = node->rb_right;
1637
} else
1638
node = node->rb_left;
1639
}
1640
1641
TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
1642
return -1;
1643
}
1644
1645
/*
1646
* Address VM physical to Host Virtual *alias*.
1647
*
1648
* Input Args:
1649
* vm - Virtual Machine
1650
* gpa - VM physical address
1651
*
1652
* Output Args: None
1653
*
1654
* Return:
1655
* Equivalent address within the host virtual *alias* area, or NULL
1656
* (without failing the test) if the guest memory is not shared (so
1657
* no alias exists).
1658
*
1659
* Create a writable, shared virtual=>physical alias for the specific GPA.
1660
* The primary use case is to allow the host selftest to manipulate guest
1661
* memory without mapping said memory in the guest's address space. And, for
1662
* userfaultfd-based demand paging, to do so without triggering userfaults.
1663
*/
1664
void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
1665
{
1666
struct userspace_mem_region *region;
1667
uintptr_t offset;
1668
1669
region = userspace_mem_region_find(vm, gpa, gpa);
1670
if (!region)
1671
return NULL;
1672
1673
if (!region->host_alias)
1674
return NULL;
1675
1676
offset = gpa - region->region.guest_phys_addr;
1677
return (void *) ((uintptr_t) region->host_alias + offset);
1678
}
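/*
 * Illustrative sketch (editor's addition): the GPA<->HVA helpers are inverse
 * views of the same memslot, e.g. for any (untagged) GPA inside a region
 * added earlier:
 *
 *	void *hva = addr_gpa2hva(vm, gpa);
 *	TEST_ASSERT_EQ(addr_hva2gpa(vm, hva), gpa);
 *
 *	// For shared backing memory, a second mapping that avoids userfaults:
 *	void *alias = addr_gpa2alias(vm, gpa);
 */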
1679
1680
/* Create an interrupt controller chip for the specified VM. */
1681
void vm_create_irqchip(struct kvm_vm *vm)
1682
{
1683
int r;
1684
1685
/*
1686
* Allocate a fully in-kernel IRQ chip by default, but fall back to a
1687
* split model (x86 only) if that fails (KVM x86 allows compiling out
1688
* support for KVM_CREATE_IRQCHIP).
1689
*/
1690
r = __vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
1691
if (r && errno == ENOTTY && kvm_has_cap(KVM_CAP_SPLIT_IRQCHIP))
1692
vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24);
1693
else
1694
TEST_ASSERT_VM_VCPU_IOCTL(!r, KVM_CREATE_IRQCHIP, r, vm);
1695
1696
vm->has_irqchip = true;
1697
}
1698
1699
int _vcpu_run(struct kvm_vcpu *vcpu)
1700
{
1701
int rc;
1702
1703
do {
1704
rc = __vcpu_run(vcpu);
1705
} while (rc == -1 && errno == EINTR);
1706
1707
if (!rc)
1708
assert_on_unhandled_exception(vcpu);
1709
1710
return rc;
1711
}
1712
1713
/*
1714
* Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR.
1715
* Assert if KVM returns an error (other than -EINTR).
1716
*/
1717
void vcpu_run(struct kvm_vcpu *vcpu)
1718
{
1719
int ret = _vcpu_run(vcpu);
1720
1721
TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret));
1722
}
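/*
 * Illustrative sketch (editor's addition): the canonical run loop most tests
 * build on top of vcpu_run(). The ucall API (get_ucall(), UCALL_*,
 * REPORT_GUEST_ASSERT()) is assumed to come from ucall_common.h, included
 * above.
 *
 *	struct ucall uc;
 *
 *	for (;;) {
 *		vcpu_run(vcpu);
 *
 *		switch (get_ucall(vcpu, &uc)) {
 *		case UCALL_SYNC:
 *			// Per-iteration work, keyed off uc.args[], goes here.
 *			break;
 *		case UCALL_ABORT:
 *			REPORT_GUEST_ASSERT(uc);
 *		case UCALL_DONE:
 *			return;
 *		default:
 *			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
 *		}
 *	}
 */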
1723
1724
void vcpu_run_complete_io(struct kvm_vcpu *vcpu)
1725
{
1726
int ret;
1727
1728
vcpu->run->immediate_exit = 1;
1729
ret = __vcpu_run(vcpu);
1730
vcpu->run->immediate_exit = 0;
1731
1732
TEST_ASSERT(ret == -1 && errno == EINTR,
1733
"KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
1734
ret, errno);
1735
}
1736
1737
/*
1738
* Get the list of guest registers which are supported for
1739
* KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. Returns a kvm_reg_list pointer,
1740
* it is the caller's responsibility to free the list.
1741
*/
1742
struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu)
1743
{
1744
struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list;
1745
int ret;
1746
1747
ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &reg_list_n);
1748
TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0");
1749
1750
reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64));
1751
reg_list->n = reg_list_n.n;
1752
vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list);
1753
return reg_list;
1754
}
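/*
 * Illustrative sketch (editor's addition): walking the register list; the
 * caller owns, and must free, the returned buffer.
 *
 *	struct kvm_reg_list *list = vcpu_get_reg_list(vcpu);
 *	__u64 i;
 *
 *	for (i = 0; i < list->n; i++)
 *		pr_debug("reg[%llu] = 0x%llx\n", i, list->reg[i]);
 *	free(list);
 */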
1755
1756
void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu)
1757
{
1758
uint32_t page_size = getpagesize();
1759
uint32_t size = vcpu->vm->dirty_ring_size;
1760
1761
TEST_ASSERT(size > 0, "Should enable dirty ring first");
1762
1763
if (!vcpu->dirty_gfns) {
1764
void *addr;
1765
1766
addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd,
1767
page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1768
TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private");
1769
1770
addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd,
1771
page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1772
TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec");
1773
1774
addr = __kvm_mmap(size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd,
1775
page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1776
1777
vcpu->dirty_gfns = addr;
1778
vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
1779
}
1780
1781
return vcpu->dirty_gfns;
1782
}
1783
1784
/*
1785
* Device Ioctl
1786
*/
1787
1788
int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr)
1789
{
1790
struct kvm_device_attr attribute = {
1791
.group = group,
1792
.attr = attr,
1793
.flags = 0,
1794
};
1795
1796
return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
1797
}
1798
1799
int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type)
1800
{
1801
struct kvm_create_device create_dev = {
1802
.type = type,
1803
.flags = KVM_CREATE_DEVICE_TEST,
1804
};
1805
1806
return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
1807
}
1808
1809
int __kvm_create_device(struct kvm_vm *vm, uint64_t type)
1810
{
1811
struct kvm_create_device create_dev = {
1812
.type = type,
1813
.fd = -1,
1814
.flags = 0,
1815
};
1816
int err;
1817
1818
err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
1819
TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value");
1820
return err ? : create_dev.fd;
1821
}
1822
1823
int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val)
1824
{
1825
struct kvm_device_attr kvmattr = {
1826
.group = group,
1827
.attr = attr,
1828
.flags = 0,
1829
.addr = (uintptr_t)val,
1830
};
1831
1832
return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr);
1833
}
1834
1835
int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val)
1836
{
1837
struct kvm_device_attr kvmattr = {
1838
.group = group,
1839
.attr = attr,
1840
.flags = 0,
1841
.addr = (uintptr_t)val,
1842
};
1843
1844
return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr);
1845
}
1846
1847
/*
 * IRQ related functions.
 */

int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
{
	struct kvm_irq_level irq_level = {
		.irq = irq,
		.level = level,
	};

	return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level);
}

void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
{
	int ret = _kvm_irq_line(vm, irq, level);

	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret));
}

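/*
 * Example usage (illustrative sketch, not part of this library): pulse an
 * interrupt line on a VM that has an in-kernel irqchip. "MY_IRQ" is a
 * hypothetical placeholder for an architecture-specific interrupt number.
 *
 *	kvm_irq_line(vm, MY_IRQ, 1);	// assert the line
 *	kvm_irq_line(vm, MY_IRQ, 0);	// deassert the line
 */
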
struct kvm_irq_routing *kvm_gsi_routing_create(void)
{
	struct kvm_irq_routing *routing;
	size_t size;

	size = sizeof(struct kvm_irq_routing);
	/* Allocate space for the max number of entries: this wastes 196 KBs. */
	size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry);
	routing = calloc(1, size);
	assert(routing);

	return routing;
}

void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
				 uint32_t gsi, uint32_t pin)
{
	int i;

	assert(routing);
	assert(routing->nr < KVM_MAX_IRQ_ROUTES);

	i = routing->nr;
	routing->entries[i].gsi = gsi;
	routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
	routing->entries[i].flags = 0;
	routing->entries[i].u.irqchip.irqchip = 0;
	routing->entries[i].u.irqchip.pin = pin;
	routing->nr++;
}

int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
{
	int ret;

	assert(routing);
	ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing);
	free(routing);

	return ret;
}

void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
{
	int ret;

	ret = _kvm_gsi_routing_write(vm, routing);
	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret));
}

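/*
 * Example usage (illustrative sketch, not part of this library): build an
 * identity GSI-to-pin routing table for the first 8 GSIs on irqchip 0 and
 * install it. Note that both write helpers free the routing table, so it must
 * not be reused afterwards.
 *
 *	struct kvm_irq_routing *routing = kvm_gsi_routing_create();
 *	uint32_t gsi;
 *
 *	for (gsi = 0; gsi < 8; gsi++)
 *		kvm_gsi_routing_irqchip_add(routing, gsi, gsi);
 *	kvm_gsi_routing_write(vm, routing);
 */
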
/*
 * VM Dump
 *
 * Input Args:
 *   vm - Virtual Machine
 *   indent - Left margin indent amount
 *
 * Output Args:
 *   stream - Output FILE stream
 *
 * Return: None
 *
 * Dumps the current state of the VM given by vm, to the FILE stream
 * given by stream.
 */
void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
	int ctr;
	struct userspace_mem_region *region;
	struct kvm_vcpu *vcpu;

	fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
	fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
	fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
	fprintf(stream, "%*sMem Regions:\n", indent, "");
	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
		fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
			"host_virt: %p\n", indent + 2, "",
			(uint64_t) region->region.guest_phys_addr,
			(uint64_t) region->region.memory_size,
			region->host_mem);
		fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
		sparsebit_dump(stream, region->unused_phy_pages, 0);
		if (region->protected_phy_pages) {
			fprintf(stream, "%*sprotected_phy_pages: ", indent + 2, "");
			sparsebit_dump(stream, region->protected_phy_pages, 0);
		}
	}
	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
	sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
	fprintf(stream, "%*spgd_created: %u\n", indent, "",
		vm->pgd_created);
	if (vm->pgd_created) {
		fprintf(stream, "%*sVirtual Translation Tables:\n",
			indent + 2, "");
		virt_dump(stream, vm, indent + 4);
	}
	fprintf(stream, "%*sVCPUs:\n", indent, "");

	list_for_each_entry(vcpu, &vm->vcpus, list)
		vcpu_dump(stream, vcpu, indent + 2);
}

#define KVM_EXIT_STRING(x) {KVM_EXIT_##x, #x}

/* Known KVM exit reasons */
static struct exit_reason {
	unsigned int reason;
	const char *name;
} exit_reasons_known[] = {
	KVM_EXIT_STRING(UNKNOWN),
	KVM_EXIT_STRING(EXCEPTION),
	KVM_EXIT_STRING(IO),
	KVM_EXIT_STRING(HYPERCALL),
	KVM_EXIT_STRING(DEBUG),
	KVM_EXIT_STRING(HLT),
	KVM_EXIT_STRING(MMIO),
	KVM_EXIT_STRING(IRQ_WINDOW_OPEN),
	KVM_EXIT_STRING(SHUTDOWN),
	KVM_EXIT_STRING(FAIL_ENTRY),
	KVM_EXIT_STRING(INTR),
	KVM_EXIT_STRING(SET_TPR),
	KVM_EXIT_STRING(TPR_ACCESS),
	KVM_EXIT_STRING(S390_SIEIC),
	KVM_EXIT_STRING(S390_RESET),
	KVM_EXIT_STRING(DCR),
	KVM_EXIT_STRING(NMI),
	KVM_EXIT_STRING(INTERNAL_ERROR),
	KVM_EXIT_STRING(OSI),
	KVM_EXIT_STRING(PAPR_HCALL),
	KVM_EXIT_STRING(S390_UCONTROL),
	KVM_EXIT_STRING(WATCHDOG),
	KVM_EXIT_STRING(S390_TSCH),
	KVM_EXIT_STRING(EPR),
	KVM_EXIT_STRING(SYSTEM_EVENT),
	KVM_EXIT_STRING(S390_STSI),
	KVM_EXIT_STRING(IOAPIC_EOI),
	KVM_EXIT_STRING(HYPERV),
	KVM_EXIT_STRING(ARM_NISV),
	KVM_EXIT_STRING(X86_RDMSR),
	KVM_EXIT_STRING(X86_WRMSR),
	KVM_EXIT_STRING(DIRTY_RING_FULL),
	KVM_EXIT_STRING(AP_RESET_HOLD),
	KVM_EXIT_STRING(X86_BUS_LOCK),
	KVM_EXIT_STRING(XEN),
	KVM_EXIT_STRING(RISCV_SBI),
	KVM_EXIT_STRING(RISCV_CSR),
	KVM_EXIT_STRING(NOTIFY),
	KVM_EXIT_STRING(LOONGARCH_IOCSR),
	KVM_EXIT_STRING(MEMORY_FAULT),
	KVM_EXIT_STRING(ARM_SEA),
};

/*
 * Exit Reason String
 *
 * Input Args:
 *   exit_reason - Exit reason
 *
 * Output Args: None
 *
 * Return:
 *   Constant string pointer describing the exit reason.
 *
 * Locates and returns a constant string that describes the KVM exit
 * reason given by exit_reason. If no such string is found, a constant
 * string of "Unknown" is returned.
 */
const char *exit_reason_str(unsigned int exit_reason)
{
	unsigned int n1;

	for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
		if (exit_reason == exit_reasons_known[n1].reason)
			return exit_reasons_known[n1].name;
	}

	return "Unknown";
}

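/*
 * Example usage (illustrative sketch, not part of this library): report a
 * human-readable exit reason when a vCPU exits with something unexpected.
 *
 *	vcpu_run(vcpu);
 *	TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_IO,
 *		    "Unexpected exit reason: %u (%s)",
 *		    vcpu->run->exit_reason,
 *		    exit_reason_str(vcpu->run->exit_reason));
 */
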
/*
 * Physical Contiguous Page Allocator
 *
 * Input Args:
 *   vm - Virtual Machine
 *   num - number of pages
 *   paddr_min - Physical address minimum
 *   memslot - Memory region to allocate page from
 *   protected - True if the pages will be used as protected/private memory
 *
 * Output Args: None
 *
 * Return:
 *   Starting physical address
 *
 * Within the VM specified by vm, locates a range of available physical
 * pages at or above paddr_min. If found, the pages are marked as in use
 * and their base address is returned. A TEST_ASSERT failure occurs if
 * not enough pages are available at or above paddr_min.
 */
vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
				vm_paddr_t paddr_min, uint32_t memslot,
				bool protected)
{
	struct userspace_mem_region *region;
	sparsebit_idx_t pg, base;

	TEST_ASSERT(num > 0, "Must allocate at least one page");

	TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
		"not divisible by page size.\n"
		" paddr_min: 0x%lx page_size: 0x%x",
		paddr_min, vm->page_size);

	region = memslot2region(vm, memslot);
	TEST_ASSERT(!protected || region->protected_phy_pages,
		    "Region doesn't support protected memory");

	base = pg = paddr_min >> vm->page_shift;
	do {
		for (; pg < base + num; ++pg) {
			if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
				base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
				break;
			}
		}
	} while (pg && pg != base + num);

	if (pg == 0) {
		fprintf(stderr, "No guest physical page available, "
			"paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
			paddr_min, vm->page_size, memslot);
		fputs("---- vm dump ----\n", stderr);
		vm_dump(stderr, vm, 2);
		abort();
	}

	for (pg = base; pg < base + num; ++pg) {
		sparsebit_clear(region->unused_phy_pages, pg);
		if (protected)
			sparsebit_set(region->protected_phy_pages, pg);
	}

	return base * vm->page_size;
}

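/*
 * Example usage (illustrative sketch, not part of this library): grab four
 * contiguous guest-physical pages from memslot 0, starting no lower than
 * KVM_UTIL_MIN_PFN (mirroring what this library's allocators do), then fill
 * them from the host side.
 *
 *	vm_paddr_t gpa = vm_phy_pages_alloc(vm, 4, KVM_UTIL_MIN_PFN * vm->page_size, 0);
 *
 *	memset(addr_gpa2hva(vm, gpa), 0xaa, 4 * vm->page_size);
 */
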
vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
			     uint32_t memslot)
{
	return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
}

vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
{
	return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
				 vm->memslots[MEM_REGION_PT]);
}

/*
 * Address Guest Virtual to Host Virtual
 *
 * Input Args:
 *   vm - Virtual Machine
 *   gva - VM virtual address
 *
 * Output Args: None
 *
 * Return:
 *   Equivalent host virtual address
 */
void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
{
	return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
}

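/*
 * Example usage (illustrative sketch, not part of this library): peek at a
 * guest variable from the host. "guest_va" is assumed to be a guest virtual
 * address obtained earlier, e.g. from vm_vaddr_alloc().
 *
 *	uint64_t host_copy;
 *
 *	memcpy(&host_copy, addr_gva2hva(vm, guest_va), sizeof(host_copy));
 *	pr_info("guest value: 0x%lx\n", host_copy);
 */
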
unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm)
{
	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
}

static unsigned int vm_calc_num_pages(unsigned int num_pages,
				      unsigned int page_shift,
				      unsigned int new_page_shift,
				      bool ceil)
{
	unsigned int n = 1 << (new_page_shift - page_shift);

	if (page_shift >= new_page_shift)
		return num_pages * (1 << (page_shift - new_page_shift));

	return num_pages / n + !!(ceil && num_pages % n);
}

static inline int getpageshift(void)
{
	return __builtin_ffs(getpagesize()) - 1;
}

unsigned int
vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
{
	return vm_calc_num_pages(num_guest_pages,
				 vm_guest_mode_params[mode].page_shift,
				 getpageshift(), true);
}

unsigned int
vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
{
	return vm_calc_num_pages(num_host_pages, getpageshift(),
				 vm_guest_mode_params[mode].page_shift, false);
}

unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
{
	unsigned int n;
	n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
	return vm_adjust_num_guest_pages(mode, n);
}

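/*
 * Worked example (illustrative, assuming a guest mode with 64 KiB pages,
 * i.e. page_shift == 16, running on a 4 KiB-page host, i.e. getpageshift()
 * == 12):
 *
 *	vm_num_host_pages(mode, 3)    = 3 * 2^(16 - 12)   = 48 host pages
 *	vm_num_guest_pages(mode, 100) = 100 / 2^(16 - 12) = 6 guest pages
 *	                                (rounded down, since ceil == false)
 *	vm_calc_num_guest_pages(mode, 100 * 4096)
 *	                              = DIV_ROUND_UP(409600, 65536) = 7,
 *	                                which vm_adjust_num_guest_pages() may
 *	                                round up further for the architecture.
 */
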
/*
 * Read binary stats descriptors
 *
 * Input Args:
 *   stats_fd - the file descriptor for the binary stats file from which to read
 *   header - the binary stats metadata header corresponding to the given FD
 *
 * Output Args: None
 *
 * Return:
 *   A pointer to a newly allocated series of stat descriptors.
 *   Caller is responsible for freeing the returned kvm_stats_desc.
 *
 * Read the stats descriptors from the binary stats interface.
 */
struct kvm_stats_desc *read_stats_descriptors(int stats_fd,
					      struct kvm_stats_header *header)
{
	struct kvm_stats_desc *stats_desc;
	ssize_t desc_size, total_size, ret;

	desc_size = get_stats_descriptor_size(header);
	total_size = header->num_desc * desc_size;

	stats_desc = calloc(header->num_desc, desc_size);
	TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors");

	ret = pread(stats_fd, stats_desc, total_size, header->desc_offset);
	TEST_ASSERT(ret == total_size, "Read KVM stats descriptors");

	return stats_desc;
}

/*
 * Read stat data for a particular stat
 *
 * Input Args:
 *   stats_fd - the file descriptor for the binary stats file from which to read
 *   header - the binary stats metadata header corresponding to the given FD
 *   desc - the binary stat metadata for the particular stat to be read
 *   max_elements - the maximum number of 8-byte values to read into data
 *
 * Output Args:
 *   data - the buffer into which stat data should be read
 *
 * Read the data values of a specified stat from the binary stats interface.
 */
void read_stat_data(int stats_fd, struct kvm_stats_header *header,
		    struct kvm_stats_desc *desc, uint64_t *data,
		    size_t max_elements)
{
	size_t nr_elements = min_t(ssize_t, desc->size, max_elements);
	size_t size = nr_elements * sizeof(*data);
	ssize_t ret;

	TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name);
	TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name);

	ret = pread(stats_fd, data, size,
		    header->data_offset + desc->offset);

	TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)",
		    desc->name, errno, strerror(errno));
	TEST_ASSERT(ret == size,
		    "pread() on stat '%s' read %ld bytes, wanted %lu bytes",
		    desc->name, ret, size);
}

void kvm_get_stat(struct kvm_binary_stats *stats, const char *name,
		  uint64_t *data, size_t max_elements)
{
	struct kvm_stats_desc *desc;
	size_t size_desc;
	int i;

	if (!stats->desc) {
		read_stats_header(stats->fd, &stats->header);
		stats->desc = read_stats_descriptors(stats->fd, &stats->header);
	}

	size_desc = get_stats_descriptor_size(&stats->header);

	for (i = 0; i < stats->header.num_desc; ++i) {
		desc = (void *)stats->desc + (i * size_desc);

		if (strcmp(desc->name, name))
			continue;

		read_stat_data(stats->fd, &stats->header, desc, data, max_elements);
		return;
	}

	TEST_FAIL("Unable to find stat '%s'", name);
}

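/*
 * Example usage (illustrative sketch, not part of this library): read a single
 * scalar stat for a VM. This assumes the VM's binary-stats handle is reachable
 * as vm->stats (as set up elsewhere in this library); "pages_4k" is an example
 * stat name that exists on x86.
 *
 *	uint64_t pages_4k;
 *
 *	kvm_get_stat(&vm->stats, "pages_4k", &pages_4k, 1);
 *	pr_info("4K pages mapped: %lu\n", pages_4k);
 */
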
__weak void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus)
{
}

__weak void kvm_arch_vm_finalize_vcpus(struct kvm_vm *vm)
{
}

__weak void kvm_arch_vm_release(struct kvm_vm *vm)
{
}

__weak void kvm_selftest_arch_init(void)
{
}

static void report_unexpected_signal(int signum)
{
#define KVM_CASE_SIGNUM(sig) \
	case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum)

	switch (signum) {
	KVM_CASE_SIGNUM(SIGBUS);
	KVM_CASE_SIGNUM(SIGSEGV);
	KVM_CASE_SIGNUM(SIGILL);
	KVM_CASE_SIGNUM(SIGFPE);
	default:
		TEST_FAIL("Unexpected signal %d\n", signum);
	}
}

void __attribute((constructor)) kvm_selftest_init(void)
{
	struct sigaction sig_sa = {
		.sa_handler = report_unexpected_signal,
	};

	/* Tell stdout not to buffer its content. */
	setbuf(stdout, NULL);

	sigaction(SIGBUS, &sig_sa, NULL);
	sigaction(SIGSEGV, &sig_sa, NULL);
	sigaction(SIGILL, &sig_sa, NULL);
	sigaction(SIGFPE, &sig_sa, NULL);

	guest_random_seed = last_guest_seed = random();
	pr_info("Random seed: 0x%x\n", guest_random_seed);

	kvm_selftest_arch_init();
}

bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr)
{
	sparsebit_idx_t pg = 0;
	struct userspace_mem_region *region;

	if (!vm_arch_has_protected_memory(vm))
		return false;

	region = userspace_mem_region_find(vm, paddr, paddr);
	TEST_ASSERT(region, "No vm physical memory at 0x%lx", paddr);

	pg = paddr >> vm->page_shift;
	return sparsebit_is_set(region->protected_phy_pages, pg);
}

__weak bool kvm_arch_has_default_irqchip(void)
{
	return false;
}