1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* tools/testing/selftests/kvm/lib/kvm_util.c
4
*
5
* Copyright (C) 2018, Google LLC.
6
*/
7
#include "test_util.h"
8
#include "kvm_util.h"
9
#include "processor.h"
10
#include "ucall_common.h"
11
12
#include <assert.h>
13
#include <sched.h>
14
#include <sys/mman.h>
15
#include <sys/resource.h>
16
#include <sys/types.h>
17
#include <sys/stat.h>
18
#include <unistd.h>
19
#include <linux/kernel.h>
20
21
#define KVM_UTIL_MIN_PFN 2
22
23
uint32_t guest_random_seed;
24
struct guest_random_state guest_rng;
25
static uint32_t last_guest_seed;
26
27
static size_t vcpu_mmap_sz(void);
28
29
int __open_path_or_exit(const char *path, int flags, const char *enoent_help)
30
{
31
int fd;
32
33
fd = open(path, flags);
34
if (fd < 0)
35
goto error;
36
37
return fd;
38
39
error:
40
if (errno == EACCES || errno == ENOENT)
41
ksft_exit_skip("- Cannot open '%s': %s. %s\n",
42
path, strerror(errno),
43
errno == EACCES ? "Root required?" : enoent_help);
44
TEST_FAIL("Failed to open '%s'", path);
45
}
46
47
int open_path_or_exit(const char *path, int flags)
48
{
49
return __open_path_or_exit(path, flags, "");
50
}
51
52
/*
53
* Open KVM_DEV_PATH if available, otherwise exit the entire program.
54
*
55
* Input Args:
56
* flags - The flags to pass when opening KVM_DEV_PATH.
57
*
58
* Return:
59
* The opened file descriptor of /dev/kvm.
60
*/
61
static int _open_kvm_dev_path_or_exit(int flags)
62
{
63
return __open_path_or_exit(KVM_DEV_PATH, flags, "Is KVM loaded and enabled?");
64
}
65
66
int open_kvm_dev_path_or_exit(void)
67
{
68
return _open_kvm_dev_path_or_exit(O_RDONLY);
69
}
70
71
static ssize_t get_module_param(const char *module_name, const char *param,
72
void *buffer, size_t buffer_size)
73
{
74
const int path_size = 128;
75
char path[path_size];
76
ssize_t bytes_read;
77
int fd, r;
78
79
/* Verify KVM is loaded, to provide a more helpful SKIP message. */
80
close(open_kvm_dev_path_or_exit());
81
82
r = snprintf(path, path_size, "/sys/module/%s/parameters/%s",
83
module_name, param);
84
TEST_ASSERT(r < path_size,
85
"Failed to construct sysfs path in %d bytes.", path_size);
86
87
fd = open_path_or_exit(path, O_RDONLY);
88
89
bytes_read = read(fd, buffer, buffer_size);
90
TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes",
91
path, bytes_read, buffer_size);
92
93
r = close(fd);
94
TEST_ASSERT(!r, "close(%s) failed", path);
95
return bytes_read;
96
}
97
98
int kvm_get_module_param_integer(const char *module_name, const char *param)
99
{
100
/*
101
* 16 bytes to hold a 64-bit value (1 byte per char), 1 byte for the
102
* NUL char, and 1 byte because the kernel sucks and inserts a newline
103
* at the end.
104
*/
105
char value[16 + 1 + 1];
106
ssize_t r;
107
108
memset(value, '\0', sizeof(value));
109
110
r = get_module_param(module_name, param, value, sizeof(value));
111
TEST_ASSERT(value[r - 1] == '\n',
112
"Expected trailing newline, got char '%c'", value[r - 1]);
113
114
/*
115
* Squash the newline, otherwise atoi_paranoid() will complain about
116
* trailing non-NUL characters in the string.
117
*/
118
value[r - 1] = '\0';
119
return atoi_paranoid(value);
120
}
121
122
bool kvm_get_module_param_bool(const char *module_name, const char *param)
123
{
124
char value;
125
ssize_t r;
126
127
r = get_module_param(module_name, param, &value, sizeof(value));
128
TEST_ASSERT_EQ(r, 1);
129
130
if (value == 'Y')
131
return true;
132
else if (value == 'N')
133
return false;
134
135
TEST_FAIL("Unrecognized value '%c' for boolean module param", value);
136
}
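/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * file): reading KVM module parameters from a test. The module/parameter
 * names below are only examples; availability depends on the architecture
 * and kernel configuration.
 *
 *	// get_module_param() already SKIPs the test if /dev/kvm can't be opened.
 *	bool ept = kvm_get_module_param_bool("kvm_intel", "ept");
 *	int period = kvm_get_module_param_integer("kvm", "nx_huge_pages_recovery_period_ms");
 *
 *	pr_info("ept=%d, recovery period=%d ms\n", ept, period);
 */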
137
138
/*
139
* Capability
140
*
141
* Input Args:
142
* cap - Capability
143
*
144
* Output Args: None
145
*
146
* Return:
147
* On success, the Value corresponding to the capability (KVM_CAP_*)
148
* specified by the value of cap. On failure a TEST_ASSERT failure
149
* is produced.
150
*
151
* Looks up and returns the value corresponding to the capability
152
* (KVM_CAP_*) given by cap.
153
*/
154
unsigned int kvm_check_cap(long cap)
155
{
156
int ret;
157
int kvm_fd;
158
159
kvm_fd = open_kvm_dev_path_or_exit();
160
ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap);
161
TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret));
162
163
close(kvm_fd);
164
165
return (unsigned int)ret;
166
}
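/*
 * Illustrative sketch (editor's addition): gating a test on a capability.
 * kvm_check_cap() returns the raw KVM_CHECK_EXTENSION value, so it can be
 * used both as a boolean and to read a limit such as KVM_CAP_MAX_VCPUS.
 *
 *	unsigned int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
 *
 *	// TEST_REQUIRE()/kvm_has_cap() (from test_util.h/kvm_util.h) are the
 *	// usual way to skip when a capability is absent.
 *	TEST_REQUIRE(kvm_check_cap(KVM_CAP_USER_MEMORY2));
 */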
167
168
void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
169
{
170
if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL))
171
vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size);
172
else
173
vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
174
vm->dirty_ring_size = ring_size;
175
}
176
177
static void vm_open(struct kvm_vm *vm)
178
{
179
vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR);
180
181
TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT));
182
183
vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type);
184
TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd));
185
186
if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
187
vm->stats.fd = vm_get_stats_fd(vm);
188
else
189
vm->stats.fd = -1;
190
}
191
192
const char *vm_guest_mode_string(uint32_t i)
193
{
194
static const char * const strings[] = {
195
[VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages",
196
[VM_MODE_P52V48_16K] = "PA-bits:52, VA-bits:48, 16K pages",
197
[VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages",
198
[VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages",
199
[VM_MODE_P48V48_16K] = "PA-bits:48, VA-bits:48, 16K pages",
200
[VM_MODE_P48V48_64K] = "PA-bits:48, VA-bits:48, 64K pages",
201
[VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages",
202
[VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages",
203
[VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages",
204
[VM_MODE_PXXVYY_4K] = "PA-bits:ANY, VA-bits:48 or 57, 4K pages",
205
[VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages",
206
[VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages",
207
[VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages",
208
[VM_MODE_P36V48_16K] = "PA-bits:36, VA-bits:48, 16K pages",
209
[VM_MODE_P36V48_64K] = "PA-bits:36, VA-bits:48, 64K pages",
210
[VM_MODE_P47V47_16K] = "PA-bits:47, VA-bits:47, 16K pages",
211
[VM_MODE_P36V47_16K] = "PA-bits:36, VA-bits:47, 16K pages",
212
};
213
_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
214
"Missing new mode strings?");
215
216
TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
217
218
return strings[i];
219
}
220
221
const struct vm_guest_mode_params vm_guest_mode_params[] = {
222
[VM_MODE_P52V48_4K] = { 52, 48, 0x1000, 12 },
223
[VM_MODE_P52V48_16K] = { 52, 48, 0x4000, 14 },
224
[VM_MODE_P52V48_64K] = { 52, 48, 0x10000, 16 },
225
[VM_MODE_P48V48_4K] = { 48, 48, 0x1000, 12 },
226
[VM_MODE_P48V48_16K] = { 48, 48, 0x4000, 14 },
227
[VM_MODE_P48V48_64K] = { 48, 48, 0x10000, 16 },
228
[VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 },
229
[VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 },
230
[VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 },
231
[VM_MODE_PXXVYY_4K] = { 0, 0, 0x1000, 12 },
232
[VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 },
233
[VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 },
234
[VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 },
235
[VM_MODE_P36V48_16K] = { 36, 48, 0x4000, 14 },
236
[VM_MODE_P36V48_64K] = { 36, 48, 0x10000, 16 },
237
[VM_MODE_P47V47_16K] = { 47, 47, 0x4000, 14 },
238
[VM_MODE_P36V47_16K] = { 36, 47, 0x4000, 14 },
239
};
240
_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
241
"Missing new mode params?");
242
243
/*
244
* Initializes vm->vpages_valid to match the canonical VA space of the
245
* architecture.
246
*
247
* The default implementation is valid for architectures which split the
248
* range addressed by a single page table into a low and high region
249
* based on the MSB of the VA. On architectures with this behavior
250
* the VA region spans [0, 2^(va_bits - 1)) and [-(2^(va_bits - 1)), -1].
251
*/
252
__weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm)
253
{
254
sparsebit_set_num(vm->vpages_valid,
255
0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
256
sparsebit_set_num(vm->vpages_valid,
257
(~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift,
258
(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
259
}
260
261
struct kvm_vm *____vm_create(struct vm_shape shape)
262
{
263
struct kvm_vm *vm;
264
265
vm = calloc(1, sizeof(*vm));
266
TEST_ASSERT(vm != NULL, "Insufficient Memory");
267
268
INIT_LIST_HEAD(&vm->vcpus);
269
vm->regions.gpa_tree = RB_ROOT;
270
vm->regions.hva_tree = RB_ROOT;
271
hash_init(vm->regions.slot_hash);
272
273
vm->mode = shape.mode;
274
vm->type = shape.type;
275
276
vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits;
277
vm->va_bits = vm_guest_mode_params[vm->mode].va_bits;
278
vm->page_size = vm_guest_mode_params[vm->mode].page_size;
279
vm->page_shift = vm_guest_mode_params[vm->mode].page_shift;
280
281
/* Setup mode specific traits. */
282
switch (vm->mode) {
283
case VM_MODE_P52V48_4K:
284
vm->pgtable_levels = 4;
285
break;
286
case VM_MODE_P52V48_64K:
287
vm->pgtable_levels = 3;
288
break;
289
case VM_MODE_P48V48_4K:
290
vm->pgtable_levels = 4;
291
break;
292
case VM_MODE_P48V48_64K:
293
vm->pgtable_levels = 3;
294
break;
295
case VM_MODE_P40V48_4K:
296
case VM_MODE_P36V48_4K:
297
vm->pgtable_levels = 4;
298
break;
299
case VM_MODE_P40V48_64K:
300
case VM_MODE_P36V48_64K:
301
vm->pgtable_levels = 3;
302
break;
303
case VM_MODE_P52V48_16K:
304
case VM_MODE_P48V48_16K:
305
case VM_MODE_P40V48_16K:
306
case VM_MODE_P36V48_16K:
307
vm->pgtable_levels = 4;
308
break;
309
case VM_MODE_P47V47_16K:
310
case VM_MODE_P36V47_16K:
311
vm->pgtable_levels = 3;
312
break;
313
case VM_MODE_PXXVYY_4K:
314
#ifdef __x86_64__
315
kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
316
kvm_init_vm_address_properties(vm);
317
318
pr_debug("Guest physical address width detected: %d\n",
319
vm->pa_bits);
320
pr_debug("Guest virtual address width detected: %d\n",
321
vm->va_bits);
322
323
if (vm->va_bits == 57) {
324
vm->pgtable_levels = 5;
325
} else {
326
TEST_ASSERT(vm->va_bits == 48,
327
"Unexpected guest virtual address width: %d",
328
vm->va_bits);
329
vm->pgtable_levels = 4;
330
}
331
#else
332
TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms");
333
#endif
334
break;
335
case VM_MODE_P47V64_4K:
336
vm->pgtable_levels = 5;
337
break;
338
case VM_MODE_P44V64_4K:
339
vm->pgtable_levels = 5;
340
break;
341
default:
342
TEST_FAIL("Unknown guest mode: 0x%x", vm->mode);
343
}
344
345
#ifdef __aarch64__
346
TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types");
347
if (vm->pa_bits != 40)
348
vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
349
#endif
350
351
vm_open(vm);
352
353
/* Limit to VA-bit canonical virtual addresses. */
354
vm->vpages_valid = sparsebit_alloc();
355
vm_vaddr_populate_bitmap(vm);
356
357
/* Limit physical addresses to PA-bits. */
358
vm->max_gfn = vm_compute_max_gfn(vm);
359
360
/* Allocate and setup memory for guest. */
361
vm->vpages_mapped = sparsebit_alloc();
362
363
return vm;
364
}
365
366
static uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
367
uint32_t nr_runnable_vcpus,
368
uint64_t extra_mem_pages)
369
{
370
uint64_t page_size = vm_guest_mode_params[mode].page_size;
371
uint64_t nr_pages;
372
373
TEST_ASSERT(nr_runnable_vcpus,
374
"Use vm_create_barebones() for VMs that _never_ have vCPUs");
375
376
TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS),
377
"nr_vcpus = %d too large for host, max-vcpus = %d",
378
nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS));
379
380
/*
381
* Arbitrarily allocate 512 pages (2mb when page size is 4kb) for the
382
* test code and other per-VM assets that will be loaded into memslot0.
383
*/
384
nr_pages = 512;
385
386
/* Account for the per-vCPU stacks on behalf of the test. */
387
nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS;
388
389
/*
390
* Account for the number of pages needed for the page tables. The
391
* maximum page table size for a memory region will be when the
392
* smallest page size is used. Considering each page contains x page
393
* table descriptors, the total extra size for page tables (for extra
394
* N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller
395
* than N/x*2.
396
*/
397
nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2;
398
399
/* Account for the number of pages needed by ucall. */
400
nr_pages += ucall_nr_pages_required(page_size);
401
402
return vm_adjust_num_guest_pages(mode, nr_pages);
403
}
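/*
 * Editor's note, a worked instance of the estimate above: with 4K pages and
 * 8-byte descriptors, each page table page holds x = 512 entries
 * (PTES_PER_MIN_PAGE on x86), so covering N data pages needs roughly
 * N/512 + N/512^2 + ... page table pages, which the code over-approximates
 * as 2*N/512. E.g. for nr_pages + extra_mem_pages = 10240 (40 MiB of 4K
 * pages), 10240 / 512 * 2 = 40 extra pages are reserved for page tables.
 */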
404
405
void kvm_set_files_rlimit(uint32_t nr_vcpus)
406
{
407
/*
408
* Each vCPU will open two file descriptors: the vCPU itself and the
409
* vCPU's binary stats file descriptor. Add an arbitrary amount of
410
* buffer for all other files a test may open.
411
*/
412
int nr_fds_wanted = nr_vcpus * 2 + 100;
413
struct rlimit rl;
414
415
/*
416
* Check that we're allowed to open nr_fds_wanted file descriptors and
417
* try raising the limits if needed.
418
*/
419
TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!");
420
421
if (rl.rlim_cur < nr_fds_wanted) {
422
rl.rlim_cur = nr_fds_wanted;
423
if (rl.rlim_max < nr_fds_wanted) {
424
int old_rlim_max = rl.rlim_max;
425
426
rl.rlim_max = nr_fds_wanted;
427
__TEST_REQUIRE(setrlimit(RLIMIT_NOFILE, &rl) >= 0,
428
"RLIMIT_NOFILE hard limit is too low (%d, wanted %d)",
429
old_rlim_max, nr_fds_wanted);
430
} else {
431
TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!");
432
}
433
}
434
435
}
436
437
static bool is_guest_memfd_required(struct vm_shape shape)
438
{
439
#ifdef __x86_64__
440
return shape.type == KVM_X86_SNP_VM;
441
#else
442
return false;
443
#endif
444
}
445
446
struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
447
uint64_t nr_extra_pages)
448
{
449
uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus,
450
nr_extra_pages);
451
struct userspace_mem_region *slot0;
452
struct kvm_vm *vm;
453
int i, flags;
454
455
kvm_set_files_rlimit(nr_runnable_vcpus);
456
457
pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__,
458
vm_guest_mode_string(shape.mode), shape.type, nr_pages);
459
460
vm = ____vm_create(shape);
461
462
/*
463
* Force GUEST_MEMFD for the primary memory region if necessary, e.g.
464
* for CoCo VMs that require GUEST_MEMFD backed private memory.
465
*/
466
flags = 0;
467
if (is_guest_memfd_required(shape))
468
flags |= KVM_MEM_GUEST_MEMFD;
469
470
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, flags);
471
for (i = 0; i < NR_MEM_REGIONS; i++)
472
vm->memslots[i] = 0;
473
474
kvm_vm_elf_load(vm, program_invocation_name);
475
476
/*
477
* TODO: Add proper defines to protect the library's memslots, and then
478
* carve out memslot1 for the ucall MMIO address. KVM treats writes to
479
* read-only memslots as MMIO, and creating a read-only memslot for the
480
* MMIO region would prevent silently clobbering the MMIO region.
481
*/
482
slot0 = memslot2region(vm, 0);
483
ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
484
485
if (guest_random_seed != last_guest_seed) {
486
pr_info("Random seed: 0x%x\n", guest_random_seed);
487
last_guest_seed = guest_random_seed;
488
}
489
guest_rng = new_guest_random_state(guest_random_seed);
490
sync_global_to_guest(vm, guest_rng);
491
492
kvm_arch_vm_post_create(vm, nr_runnable_vcpus);
493
494
return vm;
495
}
496
497
/*
498
* VM Create with customized parameters
499
*
500
* Input Args:
501
* shape - VM shape: guest mode (e.g. VM_MODE_P52V48_4K) and VM type
502
* nr_vcpus - VCPU count
503
* extra_mem_pages - Non-slot0 physical memory total size
504
* guest_code - Guest entry point
505
* vcpus - Array to be populated with the created vCPUs
506
*
507
* Output Args: None
508
*
509
* Return:
510
* Pointer to opaque structure that describes the created VM.
511
*
512
* Creates a VM with the mode specified by shape.mode (e.g. VM_MODE_P52V48_4K).
513
* extra_mem_pages is only used to calculate the maximum page table size,
514
* no real memory allocation for non-slot0 memory in this function.
515
*/
516
struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus,
517
uint64_t extra_mem_pages,
518
void *guest_code, struct kvm_vcpu *vcpus[])
519
{
520
struct kvm_vm *vm;
521
int i;
522
523
TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array");
524
525
vm = __vm_create(shape, nr_vcpus, extra_mem_pages);
526
527
for (i = 0; i < nr_vcpus; ++i)
528
vcpus[i] = vm_vcpu_add(vm, i, guest_code);
529
530
kvm_arch_vm_finalize_vcpus(vm);
531
return vm;
532
}
533
534
struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape,
535
struct kvm_vcpu **vcpu,
536
uint64_t extra_mem_pages,
537
void *guest_code)
538
{
539
struct kvm_vcpu *vcpus[1];
540
struct kvm_vm *vm;
541
542
vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus);
543
544
*vcpu = vcpus[0];
545
return vm;
546
}
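/*
 * Illustrative sketch (editor's addition): the typical way a test obtains a
 * VM plus one vCPU, runs it, and tears it down. VM_SHAPE_DEFAULT and
 * GUEST_DONE() are assumed to come from kvm_util.h/ucall_common.h;
 * guest_main() is a hypothetical guest function.
 *
 *	static void guest_main(void)
 *	{
 *		GUEST_DONE();
 *	}
 *
 *	...
 *	struct kvm_vcpu *vcpu;
 *	struct kvm_vm *vm;
 *
 *	vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 0, guest_main);
 *	vcpu_run(vcpu);
 *	kvm_vm_free(vm);
 */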
547
548
/*
549
* VM Restart
550
*
551
* Input Args:
552
* vm - VM that has been released before
553
*
554
* Output Args: None
555
*
556
* Reopens the file descriptors associated with the VM and reinstates the
557
* global state, such as the irqchip and the memory regions that are mapped
558
* into the guest.
559
*/
560
void kvm_vm_restart(struct kvm_vm *vmp)
561
{
562
int ctr;
563
struct userspace_mem_region *region;
564
565
vm_open(vmp);
566
if (vmp->has_irqchip)
567
vm_create_irqchip(vmp);
568
569
hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) {
570
int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, &region->region);
571
572
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
573
" rc: %i errno: %i\n"
574
" slot: %u flags: 0x%x\n"
575
" guest_phys_addr: 0x%llx size: 0x%llx",
576
ret, errno, region->region.slot,
577
region->region.flags,
578
region->region.guest_phys_addr,
579
region->region.memory_size);
580
}
581
}
582
583
__weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm,
584
uint32_t vcpu_id)
585
{
586
return __vm_vcpu_add(vm, vcpu_id);
587
}
588
589
struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm)
590
{
591
kvm_vm_restart(vm);
592
593
return vm_vcpu_recreate(vm, 0);
594
}
595
596
int __pin_task_to_cpu(pthread_t task, int cpu)
597
{
598
cpu_set_t cpuset;
599
600
CPU_ZERO(&cpuset);
601
CPU_SET(cpu, &cpuset);
602
603
return pthread_setaffinity_np(task, sizeof(cpuset), &cpuset);
604
}
605
606
static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask)
607
{
608
uint32_t pcpu = atoi_non_negative("CPU number", cpu_str);
609
610
TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask),
611
"Not allowed to run on pCPU '%d', check cgroups?", pcpu);
612
return pcpu;
613
}
614
615
void kvm_print_vcpu_pinning_help(void)
616
{
617
const char *name = program_invocation_name;
618
619
printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n"
620
" values (target pCPU), one for each vCPU, plus an optional\n"
621
" entry for the main application task (specified via entry\n"
622
" <nr_vcpus + 1>). If used, entries must be provided for all\n"
623
" vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
624
" E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
625
" vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
626
" %s -v 3 -c 22,23,24,50\n\n"
627
" To leave the application task unpinned, drop the final entry:\n\n"
628
" %s -v 3 -c 22,23,24\n\n"
629
" (default: no pinning)\n", name, name);
630
}
631
632
void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
633
int nr_vcpus)
634
{
635
cpu_set_t allowed_mask;
636
char *cpu, *cpu_list;
637
char delim[2] = ",";
638
int i, r;
639
640
cpu_list = strdup(pcpus_string);
641
TEST_ASSERT(cpu_list, "strdup() allocation failed.");
642
643
r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask);
644
TEST_ASSERT(!r, "sched_getaffinity() failed");
645
646
cpu = strtok(cpu_list, delim);
647
648
/* 1. Get all pcpus for vcpus. */
649
for (i = 0; i < nr_vcpus; i++) {
650
TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'", i);
651
vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask);
652
cpu = strtok(NULL, delim);
653
}
654
655
/* 2. Check if the main worker needs to be pinned. */
656
if (cpu) {
657
pin_self_to_cpu(parse_pcpu(cpu, &allowed_mask));
658
cpu = strtok(NULL, delim);
659
}
660
661
TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu);
662
free(cpu_list);
663
}
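/*
 * Illustrative sketch (editor's addition): parsing a '-c' style pinning
 * argument for a test with three vCPUs, matching the help text printed by
 * kvm_print_vcpu_pinning_help(). The pCPU numbers are arbitrary examples.
 *
 *	uint32_t vcpu_to_pcpu[3];
 *
 *	// "22,23,24,50": one pCPU per vCPU, plus an optional trailing entry
 *	// that pins the main application task (here to pCPU50).
 *	kvm_parse_vcpu_pinning("22,23,24,50", vcpu_to_pcpu, 3);
 */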
664
665
/*
666
* Userspace Memory Region Find
667
*
668
* Input Args:
669
* vm - Virtual Machine
670
* start - Starting VM physical address
671
* end - Ending VM physical address, inclusive.
672
*
673
* Output Args: None
674
*
675
* Return:
676
* Pointer to overlapping region, NULL if no such region.
677
*
678
* Searches for a region with any physical memory that overlaps with
679
* any portion of the guest physical addresses from start to end
680
* inclusive. If multiple overlapping regions exist, a pointer to any
681
* of the regions is returned. Null is returned only when no overlapping
682
* region exists.
683
*/
684
static struct userspace_mem_region *
685
userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
686
{
687
struct rb_node *node;
688
689
for (node = vm->regions.gpa_tree.rb_node; node; ) {
690
struct userspace_mem_region *region =
691
container_of(node, struct userspace_mem_region, gpa_node);
692
uint64_t existing_start = region->region.guest_phys_addr;
693
uint64_t existing_end = region->region.guest_phys_addr
694
+ region->region.memory_size - 1;
695
if (start <= existing_end && end >= existing_start)
696
return region;
697
698
if (start < existing_start)
699
node = node->rb_left;
700
else
701
node = node->rb_right;
702
}
703
704
return NULL;
705
}
706
707
static void kvm_stats_release(struct kvm_binary_stats *stats)
708
{
709
if (stats->fd < 0)
710
return;
711
712
if (stats->desc) {
713
free(stats->desc);
714
stats->desc = NULL;
715
}
716
717
kvm_close(stats->fd);
718
stats->fd = -1;
719
}
720
721
__weak void vcpu_arch_free(struct kvm_vcpu *vcpu)
722
{
723
724
}
725
726
/*
727
* VM VCPU Remove
728
*
729
* Input Args:
730
* vcpu - VCPU to remove
731
*
732
* Output Args: None
733
*
734
* Return: None, TEST_ASSERT failures for all error conditions
735
*
736
* Removes a vCPU from a VM and frees its resources.
737
*/
738
static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
739
{
740
if (vcpu->dirty_gfns) {
741
kvm_munmap(vcpu->dirty_gfns, vm->dirty_ring_size);
742
vcpu->dirty_gfns = NULL;
743
}
744
745
kvm_munmap(vcpu->run, vcpu_mmap_sz());
746
747
kvm_close(vcpu->fd);
748
kvm_stats_release(&vcpu->stats);
749
750
list_del(&vcpu->list);
751
752
vcpu_arch_free(vcpu);
753
free(vcpu);
754
}
755
756
void kvm_vm_release(struct kvm_vm *vmp)
757
{
758
struct kvm_vcpu *vcpu, *tmp;
759
760
list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
761
vm_vcpu_rm(vmp, vcpu);
762
763
kvm_close(vmp->fd);
764
kvm_close(vmp->kvm_fd);
765
766
/* Free cached stats metadata and close FD */
767
kvm_stats_release(&vmp->stats);
768
769
kvm_arch_vm_release(vmp);
770
}
771
772
static void __vm_mem_region_delete(struct kvm_vm *vm,
773
struct userspace_mem_region *region)
774
{
775
rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
776
rb_erase(&region->hva_node, &vm->regions.hva_tree);
777
hash_del(&region->slot_node);
778
779
sparsebit_free(&region->unused_phy_pages);
780
sparsebit_free(&region->protected_phy_pages);
781
kvm_munmap(region->mmap_start, region->mmap_size);
782
if (region->fd >= 0) {
783
/* There's an extra map when using shared memory. */
784
kvm_munmap(region->mmap_alias, region->mmap_size);
785
close(region->fd);
786
}
787
if (region->region.guest_memfd >= 0)
788
close(region->region.guest_memfd);
789
790
free(region);
791
}
792
793
/*
794
* Destroys and frees the VM pointed to by vmp.
795
*/
796
void kvm_vm_free(struct kvm_vm *vmp)
797
{
798
int ctr;
799
struct hlist_node *node;
800
struct userspace_mem_region *region;
801
802
if (vmp == NULL)
803
return;
804
805
/* Free userspace_mem_regions. */
806
hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node)
807
__vm_mem_region_delete(vmp, region);
808
809
/* Free sparsebit arrays. */
810
sparsebit_free(&vmp->vpages_valid);
811
sparsebit_free(&vmp->vpages_mapped);
812
813
kvm_vm_release(vmp);
814
815
/* Free the structure describing the VM. */
816
free(vmp);
817
}
818
819
int kvm_memfd_alloc(size_t size, bool hugepages)
820
{
821
int memfd_flags = MFD_CLOEXEC;
822
int fd;
823
824
if (hugepages)
825
memfd_flags |= MFD_HUGETLB;
826
827
fd = memfd_create("kvm_selftest", memfd_flags);
828
TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd));
829
830
kvm_ftruncate(fd, size);
831
kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size);
832
833
return fd;
834
}
835
836
static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree,
837
struct userspace_mem_region *region)
838
{
839
struct rb_node **cur, *parent;
840
841
for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) {
842
struct userspace_mem_region *cregion;
843
844
cregion = container_of(*cur, typeof(*cregion), gpa_node);
845
parent = *cur;
846
if (region->region.guest_phys_addr <
847
cregion->region.guest_phys_addr)
848
cur = &(*cur)->rb_left;
849
else {
850
TEST_ASSERT(region->region.guest_phys_addr !=
851
cregion->region.guest_phys_addr,
852
"Duplicate GPA in region tree");
853
854
cur = &(*cur)->rb_right;
855
}
856
}
857
858
rb_link_node(&region->gpa_node, parent, cur);
859
rb_insert_color(&region->gpa_node, gpa_tree);
860
}
861
862
static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree,
863
struct userspace_mem_region *region)
864
{
865
struct rb_node **cur, *parent;
866
867
for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) {
868
struct userspace_mem_region *cregion;
869
870
cregion = container_of(*cur, typeof(*cregion), hva_node);
871
parent = *cur;
872
if (region->host_mem < cregion->host_mem)
873
cur = &(*cur)->rb_left;
874
else {
875
TEST_ASSERT(region->host_mem !=
876
cregion->host_mem,
877
"Duplicate HVA in region tree");
878
879
cur = &(*cur)->rb_right;
880
}
881
}
882
883
rb_link_node(&region->hva_node, parent, cur);
884
rb_insert_color(&region->hva_node, hva_tree);
885
}
886
887
888
int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
889
uint64_t gpa, uint64_t size, void *hva)
890
{
891
struct kvm_userspace_memory_region region = {
892
.slot = slot,
893
.flags = flags,
894
.guest_phys_addr = gpa,
895
.memory_size = size,
896
.userspace_addr = (uintptr_t)hva,
897
};
898
899
return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region);
900
}
901
902
void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
903
uint64_t gpa, uint64_t size, void *hva)
904
{
905
int ret = __vm_set_user_memory_region(vm, slot, flags, gpa, size, hva);
906
907
TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)",
908
errno, strerror(errno));
909
}
910
911
#define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \
912
__TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \
913
"KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)")
914
915
int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
916
uint64_t gpa, uint64_t size, void *hva,
917
uint32_t guest_memfd, uint64_t guest_memfd_offset)
918
{
919
struct kvm_userspace_memory_region2 region = {
920
.slot = slot,
921
.flags = flags,
922
.guest_phys_addr = gpa,
923
.memory_size = size,
924
.userspace_addr = (uintptr_t)hva,
925
.guest_memfd = guest_memfd,
926
.guest_memfd_offset = guest_memfd_offset,
927
};
928
929
TEST_REQUIRE_SET_USER_MEMORY_REGION2();
930
931
return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, &region);
932
}
933
934
void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
935
uint64_t gpa, uint64_t size, void *hva,
936
uint32_t guest_memfd, uint64_t guest_memfd_offset)
937
{
938
int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva,
939
guest_memfd, guest_memfd_offset);
940
941
TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)",
942
errno, strerror(errno));
943
}
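/*
 * Illustrative sketch (editor's addition): driving KVM_SET_USER_MEMORY_REGION
 * directly, e.g. for memslot API tests that manage their own backing memory.
 * The slot number, GPA and size are arbitrary example values.
 *
 *	size_t size = 2 * 1024 * 1024;
 *	void *hva = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	vm_set_user_memory_region(vm, 10, 0, 0x10000000, size, hva);
 *	// Deleting a slot == setting its size to 0.
 *	vm_set_user_memory_region(vm, 10, 0, 0x10000000, 0, NULL);
 */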
944
945
946
/* FIXME: This thing needs to be ripped apart and rewritten. */
947
void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
948
uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags,
949
int guest_memfd, uint64_t guest_memfd_offset)
950
{
951
int ret;
952
struct userspace_mem_region *region;
953
size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
954
size_t mem_size = npages * vm->page_size;
955
size_t alignment;
956
957
TEST_REQUIRE_SET_USER_MEMORY_REGION2();
958
959
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
960
"Number of guest pages is not compatible with the host. "
961
"Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
962
963
TEST_ASSERT((gpa % vm->page_size) == 0, "Guest physical "
964
"address not on a page boundary.\n"
965
" gpa: 0x%lx vm->page_size: 0x%x",
966
gpa, vm->page_size);
967
TEST_ASSERT((((gpa >> vm->page_shift) + npages) - 1)
968
<= vm->max_gfn, "Physical range beyond maximum "
969
"supported physical address,\n"
970
" gpa: 0x%lx npages: 0x%lx\n"
971
" vm->max_gfn: 0x%lx vm->page_size: 0x%x",
972
gpa, npages, vm->max_gfn, vm->page_size);
973
974
/*
975
* Confirm a mem region with an overlapping address doesn't
976
* already exist.
977
*/
978
region = (struct userspace_mem_region *) userspace_mem_region_find(
979
vm, gpa, (gpa + npages * vm->page_size) - 1);
980
if (region != NULL)
981
TEST_FAIL("overlapping userspace_mem_region already "
982
"exists\n"
983
" requested gpa: 0x%lx npages: 0x%lx page_size: 0x%x\n"
984
" existing gpa: 0x%lx size: 0x%lx",
985
gpa, npages, vm->page_size,
986
(uint64_t) region->region.guest_phys_addr,
987
(uint64_t) region->region.memory_size);
988
989
/* Confirm no region with the requested slot already exists. */
990
hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
991
slot) {
992
if (region->region.slot != slot)
993
continue;
994
995
TEST_FAIL("A mem region with the requested slot "
996
"already exists.\n"
997
" requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
998
" existing slot: %u paddr: 0x%lx size: 0x%lx",
999
slot, gpa, npages, region->region.slot,
1000
(uint64_t) region->region.guest_phys_addr,
1001
(uint64_t) region->region.memory_size);
1002
}
1003
1004
/* Allocate and initialize new mem region structure. */
1005
region = calloc(1, sizeof(*region));
1006
TEST_ASSERT(region != NULL, "Insufficient Memory");
1007
region->mmap_size = mem_size;
1008
1009
#ifdef __s390x__
1010
/* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
1011
alignment = 0x100000;
1012
#else
1013
alignment = 1;
1014
#endif
1015
1016
/*
1017
* When using THP, mmap is not guaranteed to return a hugepage aligned
1018
* address so we have to pad the mmap. Padding is not needed for HugeTLB
1019
* because mmap will always return an address aligned to the HugeTLB
1020
* page size.
1021
*/
1022
if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
1023
alignment = max(backing_src_pagesz, alignment);
1024
1025
TEST_ASSERT_EQ(gpa, align_up(gpa, backing_src_pagesz));
1026
1027
/* Add enough memory to align up if necessary */
1028
if (alignment > 1)
1029
region->mmap_size += alignment;
1030
1031
region->fd = -1;
1032
if (backing_src_is_shared(src_type))
1033
region->fd = kvm_memfd_alloc(region->mmap_size,
1034
src_type == VM_MEM_SRC_SHARED_HUGETLB);
1035
1036
region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
1037
vm_mem_backing_src_alias(src_type)->flag,
1038
region->fd);
1039
1040
TEST_ASSERT(!is_backing_src_hugetlb(src_type) ||
1041
region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz),
1042
"mmap_start %p is not aligned to HugeTLB page size 0x%lx",
1043
region->mmap_start, backing_src_pagesz);
1044
1045
/* Align host address */
1046
region->host_mem = align_ptr_up(region->mmap_start, alignment);
1047
1048
/* As needed perform madvise */
1049
if ((src_type == VM_MEM_SRC_ANONYMOUS ||
1050
src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
1051
ret = madvise(region->host_mem, mem_size,
1052
src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
1053
TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
1054
region->host_mem, mem_size,
1055
vm_mem_backing_src_alias(src_type)->name);
1056
}
1057
1058
region->backing_src_type = src_type;
1059
1060
if (flags & KVM_MEM_GUEST_MEMFD) {
1061
if (guest_memfd < 0) {
1062
uint32_t guest_memfd_flags = 0;
1063
TEST_ASSERT(!guest_memfd_offset,
1064
"Offset must be zero when creating new guest_memfd");
1065
guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
1066
} else {
1067
/*
1068
* Install a unique fd for each memslot so that the fd
1069
* can be closed when the region is deleted without
1070
* needing to track if the fd is owned by the framework
1071
* or by the caller.
1072
*/
1073
guest_memfd = kvm_dup(guest_memfd);
1074
}
1075
1076
region->region.guest_memfd = guest_memfd;
1077
region->region.guest_memfd_offset = guest_memfd_offset;
1078
} else {
1079
region->region.guest_memfd = -1;
1080
}
1081
1082
region->unused_phy_pages = sparsebit_alloc();
1083
if (vm_arch_has_protected_memory(vm))
1084
region->protected_phy_pages = sparsebit_alloc();
1085
sparsebit_set_num(region->unused_phy_pages, gpa >> vm->page_shift, npages);
1086
region->region.slot = slot;
1087
region->region.flags = flags;
1088
region->region.guest_phys_addr = gpa;
1089
region->region.memory_size = npages * vm->page_size;
1090
region->region.userspace_addr = (uintptr_t) region->host_mem;
1091
ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1092
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
1093
" rc: %i errno: %i\n"
1094
" slot: %u flags: 0x%x\n"
1095
" guest_phys_addr: 0x%lx size: 0x%llx guest_memfd: %d",
1096
ret, errno, slot, flags, gpa, region->region.memory_size,
1097
region->region.guest_memfd);
1098
1099
/* Add to quick lookup data structures */
1100
vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
1101
vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
1102
hash_add(vm->regions.slot_hash, &region->slot_node, slot);
1103
1104
/* If shared memory, create an alias. */
1105
if (region->fd >= 0) {
1106
region->mmap_alias = kvm_mmap(region->mmap_size,
1107
PROT_READ | PROT_WRITE,
1108
vm_mem_backing_src_alias(src_type)->flag,
1109
region->fd);
1110
1111
/* Align host alias address */
1112
region->host_alias = align_ptr_up(region->mmap_alias, alignment);
1113
}
1114
}
1115
1116
void vm_userspace_mem_region_add(struct kvm_vm *vm,
1117
enum vm_mem_backing_src_type src_type,
1118
uint64_t gpa, uint32_t slot, uint64_t npages,
1119
uint32_t flags)
1120
{
1121
vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0);
1122
}
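/*
 * Illustrative sketch (editor's addition): adding an extra anonymous memslot
 * to an existing VM and touching it from the host. The GPA, slot and page
 * count are arbitrary example values; addr_gpa2hva() is defined later in
 * this file.
 *
 *	#define EXTRA_SLOT	1
 *	#define EXTRA_GPA	0xc0000000
 *
 *	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
 *				    EXTRA_GPA, EXTRA_SLOT, 16, 0);
 *	memset(addr_gpa2hva(vm, EXTRA_GPA), 0xaa, 16 * vm->page_size);
 */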
1123
1124
/*
1125
* Memslot to region
1126
*
1127
* Input Args:
1128
* vm - Virtual Machine
1129
* memslot - KVM memory slot ID
1130
*
1131
* Output Args: None
1132
*
1133
* Return:
1134
* Pointer to the memory region structure that describes the memory region
1135
* using kvm memory slot ID given by memslot. TEST_ASSERT failure
1136
* on error (e.g. currently no memory region using memslot as a KVM
1137
* memory slot ID).
1138
*/
1139
struct userspace_mem_region *
1140
memslot2region(struct kvm_vm *vm, uint32_t memslot)
1141
{
1142
struct userspace_mem_region *region;
1143
1144
hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
1145
memslot)
1146
if (region->region.slot == memslot)
1147
return region;
1148
1149
fprintf(stderr, "No mem region with the requested slot found,\n"
1150
" requested slot: %u\n", memslot);
1151
fputs("---- vm dump ----\n", stderr);
1152
vm_dump(stderr, vm, 2);
1153
TEST_FAIL("Mem region not found");
1154
return NULL;
1155
}
1156
1157
/*
1158
* VM Memory Region Flags Set
1159
*
1160
* Input Args:
1161
* vm - Virtual Machine
1162
* slot - Slot of the memory region to modify
* flags - Flags to set for the memory region
1163
*
1164
* Output Args: None
1165
*
1166
* Return: None
1167
*
1168
* Sets the flags of the memory region specified by the value of slot,
1169
* to the values given by flags.
1170
*/
1171
void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
1172
{
1173
int ret;
1174
struct userspace_mem_region *region;
1175
1176
region = memslot2region(vm, slot);
1177
1178
region->region.flags = flags;
1179
1180
ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1181
1182
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
1183
" rc: %i errno: %i slot: %u flags: 0x%x",
1184
ret, errno, slot, flags);
1185
}
1186
1187
void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot)
1188
{
1189
struct userspace_mem_region *region = memslot2region(vm, slot);
1190
struct kvm_userspace_memory_region2 tmp = region->region;
1191
1192
tmp.memory_size = 0;
1193
vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &tmp);
1194
vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1195
}
1196
1197
/*
1198
* VM Memory Region Move
1199
*
1200
* Input Args:
1201
* vm - Virtual Machine
1202
* slot - Slot of the memory region to move
1203
* new_gpa - Starting guest physical address
1204
*
1205
* Output Args: None
1206
*
1207
* Return: None
1208
*
1209
* Change the gpa of a memory region.
1210
*/
1211
void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
1212
{
1213
struct userspace_mem_region *region;
1214
int ret;
1215
1216
region = memslot2region(vm, slot);
1217
1218
region->region.guest_phys_addr = new_gpa;
1219
1220
ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1221
1222
TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n"
1223
"ret: %i errno: %i slot: %u new_gpa: 0x%lx",
1224
ret, errno, slot, new_gpa);
1225
}
1226
1227
/*
1228
* VM Memory Region Delete
1229
*
1230
* Input Args:
1231
* vm - Virtual Machine
1232
* slot - Slot of the memory region to delete
1233
*
1234
* Output Args: None
1235
*
1236
* Return: None
1237
*
1238
* Delete a memory region.
1239
*/
1240
void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
1241
{
1242
struct userspace_mem_region *region = memslot2region(vm, slot);
1243
1244
region->region.memory_size = 0;
1245
vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1246
1247
__vm_mem_region_delete(vm, region);
1248
}
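/*
 * Illustrative sketch (editor's addition): the flags/move/delete helpers all
 * operate on an existing slot by ID, e.g. on a slot created as in the
 * example above (slot, flag and address values are arbitrary):
 *
 *	vm_mem_region_set_flags(vm, EXTRA_SLOT, KVM_MEM_LOG_DIRTY_PAGES);
 *	vm_mem_region_move(vm, EXTRA_SLOT, 0xd0000000);
 *	vm_mem_region_delete(vm, EXTRA_SLOT);
 */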
1249
1250
void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
1251
bool punch_hole)
1252
{
1253
const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0);
1254
struct userspace_mem_region *region;
1255
uint64_t end = base + size;
1256
uint64_t gpa, len;
1257
off_t fd_offset;
1258
int ret;
1259
1260
for (gpa = base; gpa < end; gpa += len) {
1261
uint64_t offset;
1262
1263
region = userspace_mem_region_find(vm, gpa, gpa);
1264
TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
1265
"Private memory region not found for GPA 0x%lx", gpa);
1266
1267
offset = gpa - region->region.guest_phys_addr;
1268
fd_offset = region->region.guest_memfd_offset + offset;
1269
len = min_t(uint64_t, end - gpa, region->region.memory_size - offset);
1270
1271
ret = fallocate(region->region.guest_memfd, mode, fd_offset, len);
1272
TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx",
1273
punch_hole ? "punch hole" : "allocate", gpa, len,
1274
region->region.guest_memfd, mode, fd_offset);
1275
}
1276
}
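/*
 * Illustrative sketch (editor's addition): for a guest_memfd-backed (private)
 * region, backing pages can be allocated or discarded by GPA range, e.g. to
 * emulate conversions in private memory tests (gpa and size are arbitrary):
 *
 *	vm_guest_mem_fallocate(vm, gpa, 0x200000, false);	// allocate
 *	vm_guest_mem_fallocate(vm, gpa, 0x200000, true);	// punch hole
 */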
1277
1278
/* Returns the size of a vCPU's kvm_run structure. */
1279
static size_t vcpu_mmap_sz(void)
1280
{
1281
int dev_fd, ret;
1282
1283
dev_fd = open_kvm_dev_path_or_exit();
1284
1285
ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
1286
TEST_ASSERT(ret >= 0 && ret >= sizeof(struct kvm_run),
1287
KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret));
1288
1289
close(dev_fd);
1290
1291
return ret;
1292
}
1293
1294
static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id)
1295
{
1296
struct kvm_vcpu *vcpu;
1297
1298
list_for_each_entry(vcpu, &vm->vcpus, list) {
1299
if (vcpu->id == vcpu_id)
1300
return true;
1301
}
1302
1303
return false;
1304
}
1305
1306
/*
1307
* Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id.
1308
* No additional vCPU setup is done. Returns the vCPU.
1309
*/
1310
struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
1311
{
1312
struct kvm_vcpu *vcpu;
1313
1314
/* Confirm a vcpu with the specified id doesn't already exist. */
1315
TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists", vcpu_id);
1316
1317
/* Allocate and initialize new vcpu structure. */
1318
vcpu = calloc(1, sizeof(*vcpu));
1319
TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
1320
1321
vcpu->vm = vm;
1322
vcpu->id = vcpu_id;
1323
vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id);
1324
TEST_ASSERT_VM_VCPU_IOCTL(vcpu->fd >= 0, KVM_CREATE_VCPU, vcpu->fd, vm);
1325
1326
TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size "
1327
"smaller than expected, vcpu_mmap_sz: %zi expected_min: %zi",
1328
vcpu_mmap_sz(), sizeof(*vcpu->run));
1329
vcpu->run = kvm_mmap(vcpu_mmap_sz(), PROT_READ | PROT_WRITE,
1330
MAP_SHARED, vcpu->fd);
1331
1332
if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
1333
vcpu->stats.fd = vcpu_get_stats_fd(vcpu);
1334
else
1335
vcpu->stats.fd = -1;
1336
1337
/* Add to linked-list of VCPUs. */
1338
list_add(&vcpu->list, &vm->vcpus);
1339
1340
return vcpu;
1341
}
1342
1343
/*
1344
* VM Virtual Address Unused Gap
1345
*
1346
* Input Args:
1347
* vm - Virtual Machine
1348
* sz - Size (bytes)
1349
* vaddr_min - Minimum Virtual Address
1350
*
1351
* Output Args: None
1352
*
1353
* Return:
1354
* Lowest virtual address at or below vaddr_min, with at least
1355
* sz unused bytes. TEST_ASSERT failure if no area of at least
1356
* size sz is available.
1357
*
1358
* Within the VM specified by vm, locates the lowest starting virtual
1359
* address >= vaddr_min, that has at least sz unallocated bytes. A
1360
* TEST_ASSERT failure occurs for invalid input or no area of at least
1361
* sz unallocated bytes >= vaddr_min is available.
1362
*/
1363
vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
1364
vm_vaddr_t vaddr_min)
1365
{
1366
uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
1367
1368
/* Determine lowest permitted virtual page index. */
1369
uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
1370
if ((pgidx_start * vm->page_size) < vaddr_min)
1371
goto no_va_found;
1372
1373
/* Loop over section with enough valid virtual page indexes. */
1374
if (!sparsebit_is_set_num(vm->vpages_valid,
1375
pgidx_start, pages))
1376
pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
1377
pgidx_start, pages);
1378
do {
1379
/*
1380
* Are there enough unused virtual pages available at
1381
* the currently proposed starting virtual page index.
1382
* If not, adjust proposed starting index to next
1383
* possible.
1384
*/
1385
if (sparsebit_is_clear_num(vm->vpages_mapped,
1386
pgidx_start, pages))
1387
goto va_found;
1388
pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
1389
pgidx_start, pages);
1390
if (pgidx_start == 0)
1391
goto no_va_found;
1392
1393
/*
1394
* If needed, adjust proposed starting virtual address,
1395
* to next range of valid virtual addresses.
1396
*/
1397
if (!sparsebit_is_set_num(vm->vpages_valid,
1398
pgidx_start, pages)) {
1399
pgidx_start = sparsebit_next_set_num(
1400
vm->vpages_valid, pgidx_start, pages);
1401
if (pgidx_start == 0)
1402
goto no_va_found;
1403
}
1404
} while (pgidx_start != 0);
1405
1406
no_va_found:
1407
TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages);
1408
1409
/* NOT REACHED */
1410
return -1;
1411
1412
va_found:
1413
TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
1414
pgidx_start, pages),
1415
"Unexpected, invalid virtual page index range,\n"
1416
" pgidx_start: 0x%lx\n"
1417
" pages: 0x%lx",
1418
pgidx_start, pages);
1419
TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
1420
pgidx_start, pages),
1421
"Unexpected, pages already mapped,\n"
1422
" pgidx_start: 0x%lx\n"
1423
" pages: 0x%lx",
1424
pgidx_start, pages);
1425
1426
return pgidx_start * vm->page_size;
1427
}
1428
1429
static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz,
1430
vm_vaddr_t vaddr_min,
1431
enum kvm_mem_region_type type,
1432
bool protected)
1433
{
1434
uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
1435
1436
virt_pgd_alloc(vm);
1437
vm_paddr_t paddr = __vm_phy_pages_alloc(vm, pages,
1438
KVM_UTIL_MIN_PFN * vm->page_size,
1439
vm->memslots[type], protected);
1440
1441
/*
1442
* Find an unused range of virtual page addresses of at least
1443
* pages in length.
1444
*/
1445
vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
1446
1447
/* Map the virtual pages. */
1448
for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
1449
pages--, vaddr += vm->page_size, paddr += vm->page_size) {
1450
1451
virt_pg_map(vm, vaddr, paddr);
1452
}
1453
1454
return vaddr_start;
1455
}
1456
1457
vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
1458
enum kvm_mem_region_type type)
1459
{
1460
return ____vm_vaddr_alloc(vm, sz, vaddr_min, type,
1461
vm_arch_has_protected_memory(vm));
1462
}
1463
1464
vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz,
1465
vm_vaddr_t vaddr_min,
1466
enum kvm_mem_region_type type)
1467
{
1468
return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, false);
1469
}
1470
1471
/*
1472
* VM Virtual Address Allocate
1473
*
1474
* Input Args:
1475
* vm - Virtual Machine
1476
* sz - Size in bytes
1477
* vaddr_min - Minimum starting virtual address
1478
*
1479
* Output Args: None
1480
*
1481
* Return:
1482
* Starting guest virtual address
1483
*
1484
* Allocates at least sz bytes within the virtual address space of the vm
1485
* given by vm. The allocated bytes are mapped to a virtual address >=
1486
* the address given by vaddr_min. Note that each allocation uses
1487
* a unique set of pages, with the minimum real allocation being at least
1488
* a page. The allocated physical space comes from the TEST_DATA memory region.
1489
*/
1490
vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
1491
{
1492
return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA);
1493
}
1494
1495
/*
1496
* VM Virtual Address Allocate Pages
1497
*
1498
* Input Args:
1499
* vm - Virtual Machine
1500
*
1501
* Output Args: None
1502
*
1503
* Return:
1504
* Starting guest virtual address
1505
*
1506
* Allocates at least N system pages worth of bytes within the virtual address
1507
* space of the vm.
1508
*/
1509
vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages)
1510
{
1511
return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR);
1512
}
1513
1514
vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type)
1515
{
1516
return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type);
1517
}
1518
1519
/*
1520
* VM Virtual Address Allocate Page
1521
*
1522
* Input Args:
1523
* vm - Virtual Machine
1524
*
1525
* Output Args: None
1526
*
1527
* Return:
1528
* Starting guest virtual address
1529
*
1530
* Allocates at least one system page worth of bytes within the virtual address
1531
* space of the vm.
1532
*/
1533
vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm)
1534
{
1535
return vm_vaddr_alloc_pages(vm, 1);
1536
}
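/*
 * Illustrative sketch (editor's addition): allocating guest virtual memory
 * from the host side and initializing it before the guest runs.
 * addr_gva2hva() is assumed to be provided by the library headers/per-arch
 * code rather than this file.
 *
 *	vm_vaddr_t gva = vm_vaddr_alloc_pages(vm, 2);
 *
 *	// Write through the host mapping of the freshly mapped pages.
 *	memset(addr_gva2hva(vm, gva), 0, 2 * getpagesize());
 */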
1537
1538
/*
1539
* Map a range of VM virtual address to the VM's physical address
1540
*
1541
* Input Args:
1542
* vm - Virtual Machine
1543
* vaddr - Virtual address to map
1544
* paddr - VM Physical Address
1545
* npages - The number of pages to map
1546
*
1547
* Output Args: None
1548
*
1549
* Return: None
1550
*
1551
* Within the VM given by @vm, creates a virtual translation for
1552
* @npages starting at @vaddr to the page range starting at @paddr.
1553
*/
1554
void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
1555
unsigned int npages)
1556
{
1557
size_t page_size = vm->page_size;
1558
size_t size = npages * page_size;
1559
1560
TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow");
1561
TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
1562
1563
while (npages--) {
1564
virt_pg_map(vm, vaddr, paddr);
1565
1566
vaddr += page_size;
1567
paddr += page_size;
1568
}
1569
}
1570
1571
/*
1572
* Address VM Physical to Host Virtual
1573
*
1574
* Input Args:
1575
* vm - Virtual Machine
1576
* gpa - VM physical address
1577
*
1578
* Output Args: None
1579
*
1580
* Return:
1581
* Equivalent host virtual address
1582
*
1583
* Locates the memory region containing the VM physical address given
1584
* by gpa, within the VM given by vm. When found, the host virtual
1585
* address providing the memory to the vm physical address is returned.
1586
* A TEST_ASSERT failure occurs if no region containing gpa exists.
1587
*/
1588
void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
1589
{
1590
struct userspace_mem_region *region;
1591
1592
gpa = vm_untag_gpa(vm, gpa);
1593
1594
region = userspace_mem_region_find(vm, gpa, gpa);
1595
if (!region) {
1596
TEST_FAIL("No vm physical memory at 0x%lx", gpa);
1597
return NULL;
1598
}
1599
1600
return (void *)((uintptr_t)region->host_mem
1601
+ (gpa - region->region.guest_phys_addr));
1602
}
1603
1604
/*
1605
* Address Host Virtual to VM Physical
1606
*
1607
* Input Args:
1608
* vm - Virtual Machine
1609
* hva - Host virtual address
1610
*
1611
* Output Args: None
1612
*
1613
* Return:
1614
* Equivalent VM physical address
1615
*
1616
* Locates the memory region containing the host virtual address given
1617
* by hva, within the VM given by vm. When found, the equivalent
1618
* VM physical address is returned. A TEST_ASSERT failure occurs if no
1619
* region containing hva exists.
1620
*/
1621
vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
1622
{
1623
struct rb_node *node;
1624
1625
for (node = vm->regions.hva_tree.rb_node; node; ) {
1626
struct userspace_mem_region *region =
1627
container_of(node, struct userspace_mem_region, hva_node);
1628
1629
if (hva >= region->host_mem) {
1630
if (hva <= (region->host_mem
1631
+ region->region.memory_size - 1))
1632
return (vm_paddr_t)((uintptr_t)
1633
region->region.guest_phys_addr
1634
+ (hva - (uintptr_t)region->host_mem));
1635
1636
node = node->rb_right;
1637
} else
1638
node = node->rb_left;
1639
}
1640
1641
TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
1642
return -1;
1643
}
1644
1645
/*
1646
* Address VM physical to Host Virtual *alias*.
1647
*
1648
* Input Args:
1649
* vm - Virtual Machine
1650
* gpa - VM physical address
1651
*
1652
* Output Args: None
1653
*
1654
* Return:
1655
* Equivalent address within the host virtual *alias* area, or NULL
1656
* (without failing the test) if the guest memory is not shared (so
1657
* no alias exists).
1658
*
1659
* Create a writable, shared virtual=>physical alias for the specific GPA.
1660
* The primary use case is to allow the host selftest to manipulate guest
1661
* memory without mapping said memory in the guest's address space. And, for
1662
* userfaultfd-based demand paging, to do so without triggering userfaults.
1663
*/
1664
void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
1665
{
1666
struct userspace_mem_region *region;
1667
uintptr_t offset;
1668
1669
region = userspace_mem_region_find(vm, gpa, gpa);
1670
if (!region)
1671
return NULL;
1672
1673
if (!region->host_alias)
1674
return NULL;
1675
1676
offset = gpa - region->region.guest_phys_addr;
1677
return (void *) ((uintptr_t) region->host_alias + offset);
1678
}
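/*
 * Illustrative sketch (editor's addition): the GPA<->HVA helpers are inverse
 * views of the same memslot, e.g. for any (untagged) GPA inside a region
 * added earlier:
 *
 *	void *hva = addr_gpa2hva(vm, gpa);
 *	TEST_ASSERT_EQ(addr_hva2gpa(vm, hva), gpa);
 *
 *	// For shared backing memory, a second mapping that avoids userfaults:
 *	void *alias = addr_gpa2alias(vm, gpa);
 */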
1679
1680
/* Create an interrupt controller chip for the specified VM. */
1681
void vm_create_irqchip(struct kvm_vm *vm)
1682
{
1683
int r;
1684
1685
/*
1686
* Allocate a fully in-kernel IRQ chip by default, but fall back to a
1687
* split model (x86 only) if that fails (KVM x86 allows compiling out
1688
* support for KVM_CREATE_IRQCHIP).
1689
*/
1690
r = __vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
1691
if (r && errno == ENOTTY && kvm_has_cap(KVM_CAP_SPLIT_IRQCHIP))
1692
vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24);
1693
else
1694
TEST_ASSERT_VM_VCPU_IOCTL(!r, KVM_CREATE_IRQCHIP, r, vm);
1695
1696
vm->has_irqchip = true;
1697
}
1698
1699
int _vcpu_run(struct kvm_vcpu *vcpu)
1700
{
1701
int rc;
1702
1703
do {
1704
rc = __vcpu_run(vcpu);
1705
} while (rc == -1 && errno == EINTR);
1706
1707
if (!rc)
1708
assert_on_unhandled_exception(vcpu);
1709
1710
return rc;
1711
}
1712
1713
/*
1714
* Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR.
1715
* Assert if KVM returns an error (other than -EINTR).
1716
*/
1717
void vcpu_run(struct kvm_vcpu *vcpu)
1718
{
1719
int ret = _vcpu_run(vcpu);
1720
1721
TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret));
1722
}
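/*
 * Illustrative sketch (editor's addition): the canonical run loop most tests
 * build on top of vcpu_run(). The ucall API (get_ucall(), UCALL_*,
 * REPORT_GUEST_ASSERT()) is assumed to come from ucall_common.h, included
 * above.
 *
 *	struct ucall uc;
 *
 *	for (;;) {
 *		vcpu_run(vcpu);
 *
 *		switch (get_ucall(vcpu, &uc)) {
 *		case UCALL_SYNC:
 *			// Per-iteration work, keyed off uc.args[], goes here.
 *			break;
 *		case UCALL_ABORT:
 *			REPORT_GUEST_ASSERT(uc);
 *		case UCALL_DONE:
 *			return;
 *		default:
 *			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
 *		}
 *	}
 */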
1723
1724
void vcpu_run_complete_io(struct kvm_vcpu *vcpu)
1725
{
1726
int ret;
1727
1728
vcpu->run->immediate_exit = 1;
1729
ret = __vcpu_run(vcpu);
1730
vcpu->run->immediate_exit = 0;
1731
1732
TEST_ASSERT(ret == -1 && errno == EINTR,
1733
"KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
1734
ret, errno);
1735
}
1736
1737
/*
1738
* Get the list of guest registers which are supported for
1739
* KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. Returns a kvm_reg_list pointer,
1740
* it is the caller's responsibility to free the list.
1741
*/
1742
struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu)
1743
{
1744
struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list;
1745
int ret;
1746
1747
ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &reg_list_n);
1748
TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0");
1749
1750
reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64));
1751
reg_list->n = reg_list_n.n;
1752
vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list);
1753
return reg_list;
1754
}
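/*
 * Illustrative sketch (editor's addition): walking the register list; the
 * caller owns, and must free, the returned buffer.
 *
 *	struct kvm_reg_list *list = vcpu_get_reg_list(vcpu);
 *	__u64 i;
 *
 *	for (i = 0; i < list->n; i++)
 *		pr_debug("reg[%llu] = 0x%llx\n", i, list->reg[i]);
 *	free(list);
 */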
1755
1756
void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu)
1757
{
1758
uint32_t page_size = getpagesize();
1759
uint32_t size = vcpu->vm->dirty_ring_size;
1760
1761
TEST_ASSERT(size > 0, "Should enable dirty ring first");
1762
1763
if (!vcpu->dirty_gfns) {
1764
void *addr;
1765
1766
addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd,
1767
page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1768
TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private");
1769
1770
addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd,
1771
page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1772
TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec");
1773
1774
addr = __kvm_mmap(size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd,
1775
page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1776
1777
vcpu->dirty_gfns = addr;
1778
vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
1779
}
1780
1781
return vcpu->dirty_gfns;
1782
}
1783
1784
/*
1785
* Device Ioctl
1786
*/
1787
1788
int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr)
1789
{
1790
struct kvm_device_attr attribute = {
1791
.group = group,
1792
.attr = attr,
1793
.flags = 0,
1794
};
1795
1796
return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
1797
}
1798
1799
int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type)
1800
{
1801
struct kvm_create_device create_dev = {
1802
.type = type,
1803
.flags = KVM_CREATE_DEVICE_TEST,
1804
};
1805
1806
return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
1807
}
1808
1809
int __kvm_create_device(struct kvm_vm *vm, uint64_t type)
1810
{
1811
struct kvm_create_device create_dev = {
1812
.type = type,
1813
.fd = -1,
1814
.flags = 0,
1815
};
1816
int err;
1817
1818
err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
1819
TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value");
1820
return err ? : create_dev.fd;
1821
}
1822
1823
int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val)
1824
{
1825
struct kvm_device_attr kvmattr = {
1826
.group = group,
1827
.attr = attr,
1828
.flags = 0,
1829
.addr = (uintptr_t)val,
1830
};
1831
1832
return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr);
1833
}
1834
1835
int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val)
1836
{
1837
struct kvm_device_attr kvmattr = {
1838
.group = group,
1839
.attr = attr,
1840
.flags = 0,
1841
.addr = (uintptr_t)val,
1842
};
1843
1844
return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr);
1845
}
1846
1847
/*
 * IRQ related functions.
 */

int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
{
	struct kvm_irq_level irq_level = {
		.irq = irq,
		.level = level,
	};

	return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level);
}

void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
{
	int ret = _kvm_irq_line(vm, irq, level);

	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret));
}

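/*
 * Example usage (illustrative sketch, not part of this library): pulse an
 * interrupt line on a VM that has an in-kernel irqchip. "MY_IRQ" is a
 * hypothetical placeholder for an architecture-specific interrupt number.
 *
 *	kvm_irq_line(vm, MY_IRQ, 1);	// assert the line
 *	kvm_irq_line(vm, MY_IRQ, 0);	// deassert the line
 */
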
struct kvm_irq_routing *kvm_gsi_routing_create(void)
{
	struct kvm_irq_routing *routing;
	size_t size;

	size = sizeof(struct kvm_irq_routing);
	/* Allocate space for the max number of entries: this wastes 196 KBs. */
	size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry);
	routing = calloc(1, size);
	assert(routing);

	return routing;
}

void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
				 uint32_t gsi, uint32_t pin)
{
	int i;

	assert(routing);
	assert(routing->nr < KVM_MAX_IRQ_ROUTES);

	i = routing->nr;
	routing->entries[i].gsi = gsi;
	routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
	routing->entries[i].flags = 0;
	routing->entries[i].u.irqchip.irqchip = 0;
	routing->entries[i].u.irqchip.pin = pin;
	routing->nr++;
}

int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
{
	int ret;

	assert(routing);
	ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing);
	free(routing);

	return ret;
}

void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
{
	int ret;

	ret = _kvm_gsi_routing_write(vm, routing);
	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret));
}

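/*
 * Example usage (illustrative sketch, not part of this library): build an
 * identity GSI-to-pin routing table for the first 8 GSIs on irqchip 0 and
 * install it. Note that both write helpers free the routing table, so it must
 * not be reused afterwards.
 *
 *	struct kvm_irq_routing *routing = kvm_gsi_routing_create();
 *	uint32_t gsi;
 *
 *	for (gsi = 0; gsi < 8; gsi++)
 *		kvm_gsi_routing_irqchip_add(routing, gsi, gsi);
 *	kvm_gsi_routing_write(vm, routing);
 */
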
/*
 * VM Dump
 *
 * Input Args:
 *   vm - Virtual Machine
 *   indent - Left margin indent amount
 *
 * Output Args:
 *   stream - Output FILE stream
 *
 * Return: None
 *
 * Dumps the current state of the VM given by vm, to the FILE stream
 * given by stream.
 */
void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
	int ctr;
	struct userspace_mem_region *region;
	struct kvm_vcpu *vcpu;

	fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
	fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
	fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
	fprintf(stream, "%*sMem Regions:\n", indent, "");
	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
		fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
			"host_virt: %p\n", indent + 2, "",
			(uint64_t) region->region.guest_phys_addr,
			(uint64_t) region->region.memory_size,
			region->host_mem);
		fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
		sparsebit_dump(stream, region->unused_phy_pages, 0);
		if (region->protected_phy_pages) {
			fprintf(stream, "%*sprotected_phy_pages: ", indent + 2, "");
			sparsebit_dump(stream, region->protected_phy_pages, 0);
		}
	}
	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
	sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
	fprintf(stream, "%*spgd_created: %u\n", indent, "",
		vm->pgd_created);
	if (vm->pgd_created) {
		fprintf(stream, "%*sVirtual Translation Tables:\n",
			indent + 2, "");
		virt_dump(stream, vm, indent + 4);
	}
	fprintf(stream, "%*sVCPUs:\n", indent, "");

	list_for_each_entry(vcpu, &vm->vcpus, list)
		vcpu_dump(stream, vcpu, indent + 2);
}

#define KVM_EXIT_STRING(x) {KVM_EXIT_##x, #x}

/* Known KVM exit reasons */
static struct exit_reason {
	unsigned int reason;
	const char *name;
} exit_reasons_known[] = {
	KVM_EXIT_STRING(UNKNOWN),
	KVM_EXIT_STRING(EXCEPTION),
	KVM_EXIT_STRING(IO),
	KVM_EXIT_STRING(HYPERCALL),
	KVM_EXIT_STRING(DEBUG),
	KVM_EXIT_STRING(HLT),
	KVM_EXIT_STRING(MMIO),
	KVM_EXIT_STRING(IRQ_WINDOW_OPEN),
	KVM_EXIT_STRING(SHUTDOWN),
	KVM_EXIT_STRING(FAIL_ENTRY),
	KVM_EXIT_STRING(INTR),
	KVM_EXIT_STRING(SET_TPR),
	KVM_EXIT_STRING(TPR_ACCESS),
	KVM_EXIT_STRING(S390_SIEIC),
	KVM_EXIT_STRING(S390_RESET),
	KVM_EXIT_STRING(DCR),
	KVM_EXIT_STRING(NMI),
	KVM_EXIT_STRING(INTERNAL_ERROR),
	KVM_EXIT_STRING(OSI),
	KVM_EXIT_STRING(PAPR_HCALL),
	KVM_EXIT_STRING(S390_UCONTROL),
	KVM_EXIT_STRING(WATCHDOG),
	KVM_EXIT_STRING(S390_TSCH),
	KVM_EXIT_STRING(EPR),
	KVM_EXIT_STRING(SYSTEM_EVENT),
	KVM_EXIT_STRING(S390_STSI),
	KVM_EXIT_STRING(IOAPIC_EOI),
	KVM_EXIT_STRING(HYPERV),
	KVM_EXIT_STRING(ARM_NISV),
	KVM_EXIT_STRING(X86_RDMSR),
	KVM_EXIT_STRING(X86_WRMSR),
	KVM_EXIT_STRING(DIRTY_RING_FULL),
	KVM_EXIT_STRING(AP_RESET_HOLD),
	KVM_EXIT_STRING(X86_BUS_LOCK),
	KVM_EXIT_STRING(XEN),
	KVM_EXIT_STRING(RISCV_SBI),
	KVM_EXIT_STRING(RISCV_CSR),
	KVM_EXIT_STRING(NOTIFY),
	KVM_EXIT_STRING(LOONGARCH_IOCSR),
	KVM_EXIT_STRING(MEMORY_FAULT),
	KVM_EXIT_STRING(ARM_SEA),
};

/*
 * Exit Reason String
 *
 * Input Args:
 *   exit_reason - Exit reason
 *
 * Output Args: None
 *
 * Return:
 *   Constant string pointer describing the exit reason.
 *
 * Locates and returns a constant string that describes the KVM exit
 * reason given by exit_reason. If no such string is found, a constant
 * string of "Unknown" is returned.
 */
const char *exit_reason_str(unsigned int exit_reason)
{
	unsigned int n1;

	for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
		if (exit_reason == exit_reasons_known[n1].reason)
			return exit_reasons_known[n1].name;
	}

	return "Unknown";
}

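/*
 * Example usage (illustrative sketch, not part of this library): report a
 * human-readable exit reason when a vCPU exits with something unexpected.
 *
 *	vcpu_run(vcpu);
 *	TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_IO,
 *		    "Unexpected exit reason: %u (%s)",
 *		    vcpu->run->exit_reason,
 *		    exit_reason_str(vcpu->run->exit_reason));
 */
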
/*
 * Physical Contiguous Page Allocator
 *
 * Input Args:
 *   vm - Virtual Machine
 *   num - number of pages
 *   paddr_min - Physical address minimum
 *   memslot - Memory region to allocate page from
 *   protected - True if the pages will be used as protected/private memory
 *
 * Output Args: None
 *
 * Return:
 *   Starting physical address
 *
 * Within the VM specified by vm, locates a range of available physical
 * pages at or above paddr_min. If found, the pages are marked as in use
 * and their base address is returned. A TEST_ASSERT failure occurs if
 * not enough pages are available at or above paddr_min.
 */
vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
				vm_paddr_t paddr_min, uint32_t memslot,
				bool protected)
{
	struct userspace_mem_region *region;
	sparsebit_idx_t pg, base;

	TEST_ASSERT(num > 0, "Must allocate at least one page");

	TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
		"not divisible by page size.\n"
		" paddr_min: 0x%lx page_size: 0x%x",
		paddr_min, vm->page_size);

	region = memslot2region(vm, memslot);
	TEST_ASSERT(!protected || region->protected_phy_pages,
		    "Region doesn't support protected memory");

	base = pg = paddr_min >> vm->page_shift;
	do {
		for (; pg < base + num; ++pg) {
			if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
				base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
				break;
			}
		}
	} while (pg && pg != base + num);

	if (pg == 0) {
		fprintf(stderr, "No guest physical page available, "
			"paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
			paddr_min, vm->page_size, memslot);
		fputs("---- vm dump ----\n", stderr);
		vm_dump(stderr, vm, 2);
		abort();
	}

	for (pg = base; pg < base + num; ++pg) {
		sparsebit_clear(region->unused_phy_pages, pg);
		if (protected)
			sparsebit_set(region->protected_phy_pages, pg);
	}

	return base * vm->page_size;
}

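/*
 * Example usage (illustrative sketch, not part of this library): grab four
 * contiguous guest-physical pages from memslot 0, starting no lower than
 * KVM_UTIL_MIN_PFN (mirroring what this library's allocators do), then fill
 * them from the host side.
 *
 *	vm_paddr_t gpa = vm_phy_pages_alloc(vm, 4, KVM_UTIL_MIN_PFN * vm->page_size, 0);
 *
 *	memset(addr_gpa2hva(vm, gpa), 0xaa, 4 * vm->page_size);
 */
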
vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
			     uint32_t memslot)
{
	return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
}

vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
{
	return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
				 vm->memslots[MEM_REGION_PT]);
}

/*
 * Address Guest Virtual to Host Virtual
 *
 * Input Args:
 *   vm - Virtual Machine
 *   gva - VM virtual address
 *
 * Output Args: None
 *
 * Return:
 *   Equivalent host virtual address
 */
void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
{
	return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
}

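/*
 * Example usage (illustrative sketch, not part of this library): peek at a
 * guest variable from the host. "guest_va" is assumed to be a guest virtual
 * address obtained earlier, e.g. from vm_vaddr_alloc().
 *
 *	uint64_t host_copy;
 *
 *	memcpy(&host_copy, addr_gva2hva(vm, guest_va), sizeof(host_copy));
 *	pr_info("guest value: 0x%lx\n", host_copy);
 */
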
unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm)
{
	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
}

static unsigned int vm_calc_num_pages(unsigned int num_pages,
				      unsigned int page_shift,
				      unsigned int new_page_shift,
				      bool ceil)
{
	unsigned int n = 1 << (new_page_shift - page_shift);

	if (page_shift >= new_page_shift)
		return num_pages * (1 << (page_shift - new_page_shift));

	return num_pages / n + !!(ceil && num_pages % n);
}

static inline int getpageshift(void)
{
	return __builtin_ffs(getpagesize()) - 1;
}

unsigned int
vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
{
	return vm_calc_num_pages(num_guest_pages,
				 vm_guest_mode_params[mode].page_shift,
				 getpageshift(), true);
}

unsigned int
vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
{
	return vm_calc_num_pages(num_host_pages, getpageshift(),
				 vm_guest_mode_params[mode].page_shift, false);
}

unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
{
	unsigned int n;
	n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
	return vm_adjust_num_guest_pages(mode, n);
}

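/*
 * Worked example (illustrative, assuming a guest mode with 64 KiB pages,
 * i.e. page_shift == 16, running on a 4 KiB-page host, i.e. getpageshift()
 * == 12):
 *
 *	vm_num_host_pages(mode, 3)    = 3 * 2^(16 - 12)   = 48 host pages
 *	vm_num_guest_pages(mode, 100) = 100 / 2^(16 - 12) = 6 guest pages
 *	                                (rounded down, since ceil == false)
 *	vm_calc_num_guest_pages(mode, 100 * 4096)
 *	                              = DIV_ROUND_UP(409600, 65536) = 7,
 *	                                which vm_adjust_num_guest_pages() may
 *	                                round up further for the architecture.
 */
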
/*
 * Read binary stats descriptors
 *
 * Input Args:
 *   stats_fd - the file descriptor for the binary stats file from which to read
 *   header - the binary stats metadata header corresponding to the given FD
 *
 * Output Args: None
 *
 * Return:
 *   A pointer to a newly allocated series of stat descriptors.
 *   Caller is responsible for freeing the returned kvm_stats_desc.
 *
 * Read the stats descriptors from the binary stats interface.
 */
struct kvm_stats_desc *read_stats_descriptors(int stats_fd,
					      struct kvm_stats_header *header)
{
	struct kvm_stats_desc *stats_desc;
	ssize_t desc_size, total_size, ret;

	desc_size = get_stats_descriptor_size(header);
	total_size = header->num_desc * desc_size;

	stats_desc = calloc(header->num_desc, desc_size);
	TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors");

	ret = pread(stats_fd, stats_desc, total_size, header->desc_offset);
	TEST_ASSERT(ret == total_size, "Read KVM stats descriptors");

	return stats_desc;
}

/*
 * Read stat data for a particular stat
 *
 * Input Args:
 *   stats_fd - the file descriptor for the binary stats file from which to read
 *   header - the binary stats metadata header corresponding to the given FD
 *   desc - the binary stat metadata for the particular stat to be read
 *   max_elements - the maximum number of 8-byte values to read into data
 *
 * Output Args:
 *   data - the buffer into which stat data should be read
 *
 * Read the data values of a specified stat from the binary stats interface.
 */
void read_stat_data(int stats_fd, struct kvm_stats_header *header,
		    struct kvm_stats_desc *desc, uint64_t *data,
		    size_t max_elements)
{
	size_t nr_elements = min_t(ssize_t, desc->size, max_elements);
	size_t size = nr_elements * sizeof(*data);
	ssize_t ret;

	TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name);
	TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name);

	ret = pread(stats_fd, data, size,
		    header->data_offset + desc->offset);

	TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)",
		    desc->name, errno, strerror(errno));
	TEST_ASSERT(ret == size,
		    "pread() on stat '%s' read %ld bytes, wanted %lu bytes",
		    desc->name, ret, size);
}

void kvm_get_stat(struct kvm_binary_stats *stats, const char *name,
		  uint64_t *data, size_t max_elements)
{
	struct kvm_stats_desc *desc;
	size_t size_desc;
	int i;

	if (!stats->desc) {
		read_stats_header(stats->fd, &stats->header);
		stats->desc = read_stats_descriptors(stats->fd, &stats->header);
	}

	size_desc = get_stats_descriptor_size(&stats->header);

	for (i = 0; i < stats->header.num_desc; ++i) {
		desc = (void *)stats->desc + (i * size_desc);

		if (strcmp(desc->name, name))
			continue;

		read_stat_data(stats->fd, &stats->header, desc, data, max_elements);
		return;
	}

	TEST_FAIL("Unable to find stat '%s'", name);
}

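/*
 * Example usage (illustrative sketch, not part of this library): read a single
 * scalar stat for a VM. This assumes the VM's binary-stats handle is reachable
 * as vm->stats (as set up elsewhere in this library); "pages_4k" is an example
 * stat name that exists on x86.
 *
 *	uint64_t pages_4k;
 *
 *	kvm_get_stat(&vm->stats, "pages_4k", &pages_4k, 1);
 *	pr_info("4K pages mapped: %lu\n", pages_4k);
 */
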
__weak void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus)
{
}

__weak void kvm_arch_vm_finalize_vcpus(struct kvm_vm *vm)
{
}

__weak void kvm_arch_vm_release(struct kvm_vm *vm)
{
}

__weak void kvm_selftest_arch_init(void)
{
}

static void report_unexpected_signal(int signum)
{
#define KVM_CASE_SIGNUM(sig) \
	case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum)

	switch (signum) {
	KVM_CASE_SIGNUM(SIGBUS);
	KVM_CASE_SIGNUM(SIGSEGV);
	KVM_CASE_SIGNUM(SIGILL);
	KVM_CASE_SIGNUM(SIGFPE);
	default:
		TEST_FAIL("Unexpected signal %d\n", signum);
	}
}

void __attribute((constructor)) kvm_selftest_init(void)
{
	struct sigaction sig_sa = {
		.sa_handler = report_unexpected_signal,
	};

	/* Tell stdout not to buffer its content. */
	setbuf(stdout, NULL);

	sigaction(SIGBUS, &sig_sa, NULL);
	sigaction(SIGSEGV, &sig_sa, NULL);
	sigaction(SIGILL, &sig_sa, NULL);
	sigaction(SIGFPE, &sig_sa, NULL);

	guest_random_seed = last_guest_seed = random();
	pr_info("Random seed: 0x%x\n", guest_random_seed);

	kvm_selftest_arch_init();
}

bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr)
{
	sparsebit_idx_t pg = 0;
	struct userspace_mem_region *region;

	if (!vm_arch_has_protected_memory(vm))
		return false;

	region = userspace_mem_region_find(vm, paddr, paddr);
	TEST_ASSERT(region, "No vm physical memory at 0x%lx", paddr);

	pg = paddr >> vm->page_shift;
	return sparsebit_is_set(region->protected_phy_pages, pg);
}

__weak bool kvm_arch_has_default_irqchip(void)
{
	return false;
}