GitHub Repository: torvalds/linux
Path: blob/master/virt/kvm/kvm_main.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Kernel-based Virtual Machine (KVM) Hypervisor
4
*
5
* Copyright (C) 2006 Qumranet, Inc.
6
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
7
*
8
* Authors:
9
* Avi Kivity <[email protected]>
10
* Yaniv Kamay <[email protected]>
11
*/
12
13
#include <kvm/iodev.h>
14
15
#include <linux/kvm_host.h>
16
#include <linux/kvm.h>
17
#include <linux/module.h>
18
#include <linux/errno.h>
19
#include <linux/percpu.h>
20
#include <linux/mm.h>
21
#include <linux/miscdevice.h>
22
#include <linux/vmalloc.h>
23
#include <linux/reboot.h>
24
#include <linux/debugfs.h>
25
#include <linux/highmem.h>
26
#include <linux/file.h>
27
#include <linux/syscore_ops.h>
28
#include <linux/cpu.h>
29
#include <linux/sched/signal.h>
30
#include <linux/sched/mm.h>
31
#include <linux/sched/stat.h>
32
#include <linux/cpumask.h>
33
#include <linux/smp.h>
34
#include <linux/anon_inodes.h>
35
#include <linux/profile.h>
36
#include <linux/kvm_para.h>
37
#include <linux/pagemap.h>
38
#include <linux/mman.h>
39
#include <linux/swap.h>
40
#include <linux/bitops.h>
41
#include <linux/spinlock.h>
42
#include <linux/compat.h>
43
#include <linux/srcu.h>
44
#include <linux/hugetlb.h>
45
#include <linux/slab.h>
46
#include <linux/sort.h>
47
#include <linux/bsearch.h>
48
#include <linux/io.h>
49
#include <linux/lockdep.h>
50
#include <linux/kthread.h>
51
#include <linux/suspend.h>
52
#include <linux/rseq.h>
53
54
#include <asm/processor.h>
55
#include <asm/ioctl.h>
56
#include <linux/uaccess.h>
57
58
#include "coalesced_mmio.h"
59
#include "async_pf.h"
60
#include "kvm_mm.h"
61
#include "vfio.h"
62
63
#include <trace/events/ipi.h>
64
65
#define CREATE_TRACE_POINTS
66
#include <trace/events/kvm.h>
67
68
#include <linux/kvm_dirty_ring.h>
69
70
71
/* Worst case buffer size needed for holding an integer. */
72
#define ITOA_MAX_LEN 12
73
74
MODULE_AUTHOR("Qumranet");
75
MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
76
MODULE_LICENSE("GPL");
77
78
/* Architectures should define their poll value according to the halt latency */
79
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
80
module_param(halt_poll_ns, uint, 0644);
81
EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns);
82
83
/* Default doubles per-vcpu halt_poll_ns. */
84
unsigned int halt_poll_ns_grow = 2;
85
module_param(halt_poll_ns_grow, uint, 0644);
86
EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow);
87
88
/* The start value to grow halt_poll_ns from */
89
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
90
module_param(halt_poll_ns_grow_start, uint, 0644);
91
EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow_start);
92
93
/* Default halves per-vcpu halt_poll_ns. */
94
unsigned int halt_poll_ns_shrink = 2;
95
module_param(halt_poll_ns_shrink, uint, 0644);
96
EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink);
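/*
 * Illustrative sketch (editor's addition, not part of the upstream file): the
 * arithmetic implied by the grow/shrink module parameters above. The real
 * adjustment helpers live further down in kvm_main.c; this hypothetical
 * function only shows the intent of the defaults.
 */
static unsigned int example_next_poll_ns(unsigned int cur, bool grew)
{
	if (grew) {
		if (!cur)
			return halt_poll_ns_grow_start;	/* start polling at 10us */
		return cur * halt_poll_ns_grow;		/* default: double it */
	}
	/* default: halve it; a shrink factor of 0 disables polling entirely */
	return halt_poll_ns_shrink ? cur / halt_poll_ns_shrink : 0;
}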
97
98
/*
99
* Allow direct access (from KVM or the CPU) without MMU notifier protection
100
* to unpinned pages.
101
*/
102
static bool allow_unsafe_mappings;
103
module_param(allow_unsafe_mappings, bool, 0444);
104
105
/*
106
* Ordering of locks:
107
*
108
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
109
*/
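/*
 * Illustrative sketch (editor's addition, not part of the upstream file): a
 * hypothetical caller that respects the documented nesting order above,
 * releasing the locks in the reverse order they were taken.
 */
static void example_lock_nesting(struct kvm *kvm)
{
	mutex_lock(&kvm->lock);
	mutex_lock(&kvm->slots_lock);
	mutex_lock(&kvm->irq_lock);
	/* ... work that needs all three locks ... */
	mutex_unlock(&kvm->irq_lock);
	mutex_unlock(&kvm->slots_lock);
	mutex_unlock(&kvm->lock);
}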
110
111
DEFINE_MUTEX(kvm_lock);
112
LIST_HEAD(vm_list);
113
114
static struct kmem_cache *kvm_vcpu_cache;
115
116
static __read_mostly struct preempt_ops kvm_preempt_ops;
117
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
118
119
static struct dentry *kvm_debugfs_dir;
120
121
static const struct file_operations stat_fops_per_vm;
122
123
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
124
unsigned long arg);
125
#ifdef CONFIG_KVM_COMPAT
126
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
127
unsigned long arg);
128
#define KVM_COMPAT(c) .compat_ioctl = (c)
129
#else
130
/*
131
* For architectures that don't implement a compat infrastructure,
132
* adopt a double line of defense:
133
* - Prevent a compat task from opening /dev/kvm
134
* - If the open has been done by a 64bit task, and the KVM fd
135
* passed to a compat task, let the ioctls fail.
136
*/
137
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
138
unsigned long arg) { return -EINVAL; }
139
140
static int kvm_no_compat_open(struct inode *inode, struct file *file)
141
{
142
return is_compat_task() ? -ENODEV : 0;
143
}
144
#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
145
.open = kvm_no_compat_open
146
#endif
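/*
 * Illustrative sketch (editor's addition): how the KVM_COMPAT() macro above
 * is consumed when defining a file_operations table. The struct name here is
 * hypothetical; the real vCPU/VM fops that use this pattern appear later in
 * the file.
 */
static const struct file_operations example_fops = {
	.unlocked_ioctl	= kvm_vcpu_ioctl,
	KVM_COMPAT(kvm_vcpu_compat_ioctl),
};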
147
148
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
149
150
#define KVM_EVENT_CREATE_VM 0
151
#define KVM_EVENT_DESTROY_VM 1
152
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
153
static unsigned long long kvm_createvm_count;
154
static unsigned long long kvm_active_vms;
155
156
static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
157
158
__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
159
{
160
}
161
162
/*
163
* Switches to the specified vcpu, until a matching vcpu_put()
164
*/
165
void vcpu_load(struct kvm_vcpu *vcpu)
166
{
167
int cpu = get_cpu();
168
169
__this_cpu_write(kvm_running_vcpu, vcpu);
170
preempt_notifier_register(&vcpu->preempt_notifier);
171
kvm_arch_vcpu_load(vcpu, cpu);
172
put_cpu();
173
}
174
EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_load);
175
176
void vcpu_put(struct kvm_vcpu *vcpu)
177
{
178
preempt_disable();
179
kvm_arch_vcpu_put(vcpu);
180
preempt_notifier_unregister(&vcpu->preempt_notifier);
181
__this_cpu_write(kvm_running_vcpu, NULL);
182
preempt_enable();
183
}
184
EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_put);
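/*
 * Illustrative sketch (editor's addition, hypothetical function): the usual
 * bracketing of per-vCPU work between vcpu_load() and vcpu_put(), as done by
 * the vCPU ioctl paths later in this file.
 */
static void example_with_vcpu_loaded(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);	/* registers the preempt notifier, calls kvm_arch_vcpu_load() */
	/* ... arch work that requires the vCPU state to be loaded ... */
	vcpu_put(vcpu);		/* matching put: kvm_arch_vcpu_put() + notifier unregister */
}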
185
186
/* TODO: merge with kvm_arch_vcpu_should_kick */
187
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
188
{
189
int mode = kvm_vcpu_exiting_guest_mode(vcpu);
190
191
/*
192
* We need to wait for the VCPU to reenable interrupts and get out of
193
* READING_SHADOW_PAGE_TABLES mode.
194
*/
195
if (req & KVM_REQUEST_WAIT)
196
return mode != OUTSIDE_GUEST_MODE;
197
198
/*
199
* Need to kick a running VCPU, but otherwise there is nothing to do.
200
*/
201
return mode == IN_GUEST_MODE;
202
}
203
204
static void ack_kick(void *_completed)
205
{
206
}
207
208
static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
209
{
210
if (cpumask_empty(cpus))
211
return false;
212
213
smp_call_function_many(cpus, ack_kick, NULL, wait);
214
return true;
215
}
216
217
static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
218
struct cpumask *tmp, int current_cpu)
219
{
220
int cpu;
221
222
if (likely(!(req & KVM_REQUEST_NO_ACTION)))
223
__kvm_make_request(req, vcpu);
224
225
if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
226
return;
227
228
/*
229
* Note, the vCPU could get migrated to a different pCPU at any point
230
* after kvm_request_needs_ipi(), which could result in sending an IPI
231
* to the previous pCPU. But, that's OK because the purpose of the IPI
232
* is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
233
* satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
234
* after this point is also OK, as the requirement is only that KVM wait
235
* for vCPUs that were reading SPTEs _before_ any changes were
236
* finalized. See kvm_vcpu_kick() for more details on handling requests.
237
*/
238
if (kvm_request_needs_ipi(vcpu, req)) {
239
cpu = READ_ONCE(vcpu->cpu);
240
if (cpu != -1 && cpu != current_cpu)
241
__cpumask_set_cpu(cpu, tmp);
242
}
243
}
244
245
bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
246
unsigned long *vcpu_bitmap)
247
{
248
struct kvm_vcpu *vcpu;
249
struct cpumask *cpus;
250
int i, me;
251
bool called;
252
253
me = get_cpu();
254
255
cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
256
cpumask_clear(cpus);
257
258
for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
259
vcpu = kvm_get_vcpu(kvm, i);
260
if (!vcpu)
261
continue;
262
kvm_make_vcpu_request(vcpu, req, cpus, me);
263
}
264
265
called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
266
put_cpu();
267
268
return called;
269
}
270
271
bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
272
{
273
struct kvm_vcpu *vcpu;
274
struct cpumask *cpus;
275
unsigned long i;
276
bool called;
277
int me;
278
279
me = get_cpu();
280
281
cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
282
cpumask_clear(cpus);
283
284
kvm_for_each_vcpu(i, vcpu, kvm)
285
kvm_make_vcpu_request(vcpu, req, cpus, me);
286
287
called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
288
put_cpu();
289
290
return called;
291
}
292
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_make_all_cpus_request);
293
294
void kvm_flush_remote_tlbs(struct kvm *kvm)
295
{
296
++kvm->stat.generic.remote_tlb_flush_requests;
297
298
/*
299
* We want to publish modifications to the page tables before reading
300
* mode. Pairs with a memory barrier in arch-specific code.
301
* - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
302
* and smp_mb in walk_shadow_page_lockless_begin/end.
303
* - powerpc: smp_mb in kvmppc_prepare_to_enter.
304
*
305
* There is already an smp_mb__after_atomic() before
306
* kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
307
* barrier here.
308
*/
309
if (!kvm_arch_flush_remote_tlbs(kvm)
310
|| kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
311
++kvm->stat.generic.remote_tlb_flush;
312
}
313
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_flush_remote_tlbs);
314
315
void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
316
{
317
if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
318
return;
319
320
/*
321
* Fall back to flushing the entire TLB if the architecture range-based
322
* TLB invalidation is unsupported or can't be performed for whatever
323
* reason.
324
*/
325
kvm_flush_remote_tlbs(kvm);
326
}
327
328
void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
329
const struct kvm_memory_slot *memslot)
330
{
331
/*
332
* All current use cases for flushing the TLBs for a specific memslot
333
* are related to dirty logging, and many do the TLB flush out of
334
* mmu_lock. The interaction between the various operations on memslot
335
* must be serialized by slots_lock to ensure the TLB flush from one
336
* operation is observed by any other operation on the same memslot.
337
*/
338
lockdep_assert_held(&kvm->slots_lock);
339
kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
340
}
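/*
 * Illustrative sketch (editor's addition, hypothetical caller): per the
 * lockdep assertion above, the memslot-scoped flush must run with
 * slots_lock held.
 */
static void example_flush_memslot(struct kvm *kvm,
				  const struct kvm_memory_slot *memslot)
{
	mutex_lock(&kvm->slots_lock);
	kvm_flush_remote_tlbs_memslot(kvm, memslot);
	mutex_unlock(&kvm->slots_lock);
}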
341
342
static void kvm_flush_shadow_all(struct kvm *kvm)
343
{
344
kvm_arch_flush_shadow_all(kvm);
345
kvm_arch_guest_memory_reclaimed(kvm);
346
}
347
348
#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
349
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
350
gfp_t gfp_flags)
351
{
352
void *page;
353
354
gfp_flags |= mc->gfp_zero;
355
356
if (mc->kmem_cache)
357
return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
358
359
page = (void *)__get_free_page(gfp_flags);
360
if (page && mc->init_value)
361
memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
362
return page;
363
}
364
365
int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
366
{
367
gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
368
void *obj;
369
370
if (mc->nobjs >= min)
371
return 0;
372
373
if (unlikely(!mc->objects)) {
374
if (WARN_ON_ONCE(!capacity))
375
return -EIO;
376
377
/*
378
* Custom init values can be used only for page allocations,
379
* and obviously conflict with __GFP_ZERO.
380
*/
381
if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
382
return -EIO;
383
384
mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
385
if (!mc->objects)
386
return -ENOMEM;
387
388
mc->capacity = capacity;
389
}
390
391
/* It is illegal to request a different capacity across topups. */
392
if (WARN_ON_ONCE(mc->capacity != capacity))
393
return -EIO;
394
395
while (mc->nobjs < mc->capacity) {
396
obj = mmu_memory_cache_alloc_obj(mc, gfp);
397
if (!obj)
398
return mc->nobjs >= min ? 0 : -ENOMEM;
399
mc->objects[mc->nobjs++] = obj;
400
}
401
return 0;
402
}
403
404
int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
405
{
406
return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
407
}
408
409
int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
410
{
411
return mc->nobjs;
412
}
413
414
void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
415
{
416
while (mc->nobjs) {
417
if (mc->kmem_cache)
418
kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
419
else
420
free_page((unsigned long)mc->objects[--mc->nobjs]);
421
}
422
423
kvfree(mc->objects);
424
425
mc->objects = NULL;
426
mc->capacity = 0;
427
}
428
429
void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
430
{
431
void *p;
432
433
if (WARN_ON(!mc->nobjs))
434
p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
435
else
436
p = mc->objects[--mc->nobjs];
437
BUG_ON(!p);
438
return p;
439
}
440
#endif
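/*
 * Illustrative sketch (editor's addition, hypothetical caller): the intended
 * split between topping up the cache in a sleepable context and consuming it
 * in a context that must not fail or sleep (e.g. with mmu_lock held).
 */
static void *example_cache_usage(struct kvm_mmu_memory_cache *mc)
{
	/* Sleepable context: make sure at least one object is available. */
	if (kvm_mmu_topup_memory_cache(mc, 1))
		return NULL;

	/* Atomic context: the pre-filled object is handed out without sleeping. */
	return kvm_mmu_memory_cache_alloc(mc);
}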
441
442
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
443
{
444
mutex_init(&vcpu->mutex);
445
vcpu->cpu = -1;
446
vcpu->kvm = kvm;
447
vcpu->vcpu_id = id;
448
vcpu->pid = NULL;
449
rwlock_init(&vcpu->pid_lock);
450
#ifndef __KVM_HAVE_ARCH_WQP
451
rcuwait_init(&vcpu->wait);
452
#endif
453
kvm_async_pf_vcpu_init(vcpu);
454
455
kvm_vcpu_set_in_spin_loop(vcpu, false);
456
kvm_vcpu_set_dy_eligible(vcpu, false);
457
vcpu->preempted = false;
458
vcpu->ready = false;
459
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
460
vcpu->last_used_slot = NULL;
461
462
/* Fill the stats id string for the vcpu */
463
snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
464
task_pid_nr(current), id);
465
}
466
467
static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
468
{
469
kvm_arch_vcpu_destroy(vcpu);
470
kvm_dirty_ring_free(&vcpu->dirty_ring);
471
472
/*
473
* No need for rcu_read_lock as VCPU_RUN is the only place that changes
474
* the vcpu->pid pointer, and at destruction time all file descriptors
475
* are already gone.
476
*/
477
put_pid(vcpu->pid);
478
479
free_page((unsigned long)vcpu->run);
480
kmem_cache_free(kvm_vcpu_cache, vcpu);
481
}
482
483
void kvm_destroy_vcpus(struct kvm *kvm)
484
{
485
unsigned long i;
486
struct kvm_vcpu *vcpu;
487
488
kvm_for_each_vcpu(i, vcpu, kvm) {
489
kvm_vcpu_destroy(vcpu);
490
xa_erase(&kvm->vcpu_array, i);
491
492
/*
493
* Assert that the vCPU isn't visible in any way, to ensure KVM
494
* doesn't trigger a use-after-free if destroying vCPUs results
495
* in VM-wide request, e.g. to flush remote TLBs when tearing
496
* down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires.
497
*/
498
WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i));
499
}
500
501
atomic_set(&kvm->online_vcpus, 0);
502
}
503
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_destroy_vcpus);
504
505
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
506
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
507
{
508
return container_of(mn, struct kvm, mmu_notifier);
509
}
510
511
typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
512
513
typedef void (*on_lock_fn_t)(struct kvm *kvm);
514
515
struct kvm_mmu_notifier_range {
516
/*
517
* 64-bit addresses, as KVM notifiers can operate on host virtual
518
* addresses (unsigned long) and guest physical addresses (64-bit).
519
*/
520
u64 start;
521
u64 end;
522
union kvm_mmu_notifier_arg arg;
523
gfn_handler_t handler;
524
on_lock_fn_t on_lock;
525
bool flush_on_ret;
526
bool may_block;
527
bool lockless;
528
};
529
530
/*
531
* The inner-most helper returns a tuple containing the return value from the
532
* arch- and action-specific handler, plus a flag indicating whether or not at
533
* least one memslot was found, i.e. if the handler found guest memory.
534
*
535
* Note, most notifiers are averse to booleans, so even though KVM tracks the
536
* return from arch code as a bool, outer helpers will cast it to an int. :-(
537
*/
538
typedef struct kvm_mmu_notifier_return {
539
bool ret;
540
bool found_memslot;
541
} kvm_mn_ret_t;
542
543
/*
544
* Use a dedicated stub instead of NULL to indicate that there is no callback
545
* function/handler. The compiler technically can't guarantee that a real
546
* function will have a non-zero address, and so it will generate code to
547
* check for !NULL, whereas comparing against a stub will be elided at compile
548
* time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
549
*/
550
static void kvm_null_fn(void)
551
{
552
553
}
554
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
555
556
/* Iterate over each memslot intersecting [start, last] (inclusive) range */
557
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \
558
for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
559
node; \
560
node = interval_tree_iter_next(node, start, last)) \
561
562
static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm,
563
const struct kvm_mmu_notifier_range *range)
564
{
565
struct kvm_mmu_notifier_return r = {
566
.ret = false,
567
.found_memslot = false,
568
};
569
struct kvm_gfn_range gfn_range;
570
struct kvm_memory_slot *slot;
571
struct kvm_memslots *slots;
572
int i, idx;
573
574
if (WARN_ON_ONCE(range->end <= range->start))
575
return r;
576
577
/* A null handler is allowed if and only if on_lock() is provided. */
578
if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
579
IS_KVM_NULL_FN(range->handler)))
580
return r;
581
582
/* on_lock will never be called for lockless walks */
583
if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock)))
584
return r;
585
586
idx = srcu_read_lock(&kvm->srcu);
587
588
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
589
struct interval_tree_node *node;
590
591
slots = __kvm_memslots(kvm, i);
592
kvm_for_each_memslot_in_hva_range(node, slots,
593
range->start, range->end - 1) {
594
unsigned long hva_start, hva_end;
595
596
slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
597
hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
598
hva_end = min_t(unsigned long, range->end,
599
slot->userspace_addr + (slot->npages << PAGE_SHIFT));
600
601
/*
602
* To optimize for the likely case where the address
603
* range is covered by zero or one memslots, don't
604
* bother making these conditional (to avoid writes on
605
* the second or later invocation of the handler).
606
*/
607
gfn_range.arg = range->arg;
608
gfn_range.may_block = range->may_block;
609
/*
610
* HVA-based notifications aren't relevant to private
611
* mappings as they don't have a userspace mapping.
612
*/
613
gfn_range.attr_filter = KVM_FILTER_SHARED;
614
615
/*
616
* {gfn(page) | page intersects with [hva_start, hva_end)} =
617
* {gfn_start, gfn_start+1, ..., gfn_end-1}.
618
*/
619
gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
620
gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
621
gfn_range.slot = slot;
622
gfn_range.lockless = range->lockless;
623
624
if (!r.found_memslot) {
625
r.found_memslot = true;
626
if (!range->lockless) {
627
KVM_MMU_LOCK(kvm);
628
if (!IS_KVM_NULL_FN(range->on_lock))
629
range->on_lock(kvm);
630
631
if (IS_KVM_NULL_FN(range->handler))
632
goto mmu_unlock;
633
}
634
}
635
r.ret |= range->handler(kvm, &gfn_range);
636
}
637
}
638
639
if (range->flush_on_ret && r.ret)
640
kvm_flush_remote_tlbs(kvm);
641
642
mmu_unlock:
643
if (r.found_memslot && !range->lockless)
644
KVM_MMU_UNLOCK(kvm);
645
646
srcu_read_unlock(&kvm->srcu, idx);
647
648
return r;
649
}
650
651
static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn,
652
unsigned long start,
653
unsigned long end,
654
gfn_handler_t handler,
655
bool flush_on_ret)
656
{
657
struct kvm *kvm = mmu_notifier_to_kvm(mn);
658
const struct kvm_mmu_notifier_range range = {
659
.start = start,
660
.end = end,
661
.handler = handler,
662
.on_lock = (void *)kvm_null_fn,
663
.flush_on_ret = flush_on_ret,
664
.may_block = false,
665
.lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING),
666
};
667
668
return kvm_handle_hva_range(kvm, &range).ret;
669
}
670
671
static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn,
672
unsigned long start,
673
unsigned long end,
674
gfn_handler_t handler)
675
{
676
return kvm_age_hva_range(mn, start, end, handler, false);
677
}
678
679
void kvm_mmu_invalidate_begin(struct kvm *kvm)
680
{
681
lockdep_assert_held_write(&kvm->mmu_lock);
682
/*
683
* The count increase must become visible at unlock time as no
684
* spte can be established without taking the mmu_lock and
685
* count is also read inside the mmu_lock critical section.
686
*/
687
kvm->mmu_invalidate_in_progress++;
688
689
if (likely(kvm->mmu_invalidate_in_progress == 1)) {
690
kvm->mmu_invalidate_range_start = INVALID_GPA;
691
kvm->mmu_invalidate_range_end = INVALID_GPA;
692
}
693
}
694
695
void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
696
{
697
lockdep_assert_held_write(&kvm->mmu_lock);
698
699
WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
700
701
if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
702
kvm->mmu_invalidate_range_start = start;
703
kvm->mmu_invalidate_range_end = end;
704
} else {
705
/*
706
* Fully tracking multiple concurrent ranges has diminishing
707
* returns. Keep things simple and just find the minimal range
708
* which includes the current and new ranges. As there won't be
709
* enough information to subtract a range after its invalidate
710
* completes, any ranges invalidated concurrently will
711
* accumulate and persist until all outstanding invalidates
712
* complete.
713
*/
714
kvm->mmu_invalidate_range_start =
715
min(kvm->mmu_invalidate_range_start, start);
716
kvm->mmu_invalidate_range_end =
717
max(kvm->mmu_invalidate_range_end, end);
718
}
719
}
720
721
bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
722
{
723
kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
724
return kvm_unmap_gfn_range(kvm, range);
725
}
726
727
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
728
const struct mmu_notifier_range *range)
729
{
730
struct kvm *kvm = mmu_notifier_to_kvm(mn);
731
const struct kvm_mmu_notifier_range hva_range = {
732
.start = range->start,
733
.end = range->end,
734
.handler = kvm_mmu_unmap_gfn_range,
735
.on_lock = kvm_mmu_invalidate_begin,
736
.flush_on_ret = true,
737
.may_block = mmu_notifier_range_blockable(range),
738
};
739
740
trace_kvm_unmap_hva_range(range->start, range->end);
741
742
/*
743
* Prevent memslot modification between range_start() and range_end()
744
* so that conditionally locking provides the same result in both
745
* functions. Without that guarantee, the mmu_invalidate_in_progress
746
* adjustments will be imbalanced.
747
*
748
* Pairs with the decrement in range_end().
749
*/
750
spin_lock(&kvm->mn_invalidate_lock);
751
kvm->mn_active_invalidate_count++;
752
spin_unlock(&kvm->mn_invalidate_lock);
753
754
/*
755
* Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
756
* before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
757
* each cache's lock. There are relatively few caches in existence at
758
* any given time, and the caches themselves can check for hva overlap,
759
* i.e. don't need to rely on memslot overlap checks for performance.
760
* Because this runs without holding mmu_lock, the pfn caches must use
761
* mn_active_invalidate_count (see above) instead of
762
* mmu_invalidate_in_progress.
763
*/
764
gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
765
766
/*
767
* If one or more memslots were found and thus zapped, notify arch code
768
* that guest memory has been reclaimed. This needs to be done *after*
769
* dropping mmu_lock, as x86's reclaim path is slooooow.
770
*/
771
if (kvm_handle_hva_range(kvm, &hva_range).found_memslot)
772
kvm_arch_guest_memory_reclaimed(kvm);
773
774
return 0;
775
}
776
777
void kvm_mmu_invalidate_end(struct kvm *kvm)
778
{
779
lockdep_assert_held_write(&kvm->mmu_lock);
780
781
/*
782
* This sequence increase will notify the kvm page fault handler that
783
* the page that is going to be mapped in the spte could have
784
* been freed.
785
*/
786
kvm->mmu_invalidate_seq++;
787
smp_wmb();
788
/*
789
* The above sequence increase must be visible before the
790
* below count decrease, which is ensured by the smp_wmb above
791
* in conjunction with the smp_rmb in mmu_invalidate_retry().
792
*/
793
kvm->mmu_invalidate_in_progress--;
794
KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
795
796
/*
797
* Assert that at least one range was added between start() and end().
798
* Not adding a range isn't fatal, but it is a KVM bug.
799
*/
800
WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
801
}
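/*
 * Illustrative sketch (editor's addition): the consumer side of
 * mmu_invalidate_seq as used by arch page-fault handlers. The surrounding
 * function is hypothetical; mmu_invalidate_retry() is the helper named in
 * the comment above.
 */
static bool example_fault_raced_with_invalidate(struct kvm *kvm)
{
	unsigned long mmu_seq = kvm->mmu_invalidate_seq;
	bool retry;

	smp_rmb();	/* read the sequence count before doing the pfn lookup */

	/* ... resolve hva -> pfn without holding mmu_lock ... */

	KVM_MMU_LOCK(kvm);
	retry = mmu_invalidate_retry(kvm, mmu_seq);
	KVM_MMU_UNLOCK(kvm);
	return retry;	/* true: an invalidation ran concurrently, redo the fault */
}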
802
803
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
804
const struct mmu_notifier_range *range)
805
{
806
struct kvm *kvm = mmu_notifier_to_kvm(mn);
807
const struct kvm_mmu_notifier_range hva_range = {
808
.start = range->start,
809
.end = range->end,
810
.handler = (void *)kvm_null_fn,
811
.on_lock = kvm_mmu_invalidate_end,
812
.flush_on_ret = false,
813
.may_block = mmu_notifier_range_blockable(range),
814
};
815
bool wake;
816
817
kvm_handle_hva_range(kvm, &hva_range);
818
819
/* Pairs with the increment in range_start(). */
820
spin_lock(&kvm->mn_invalidate_lock);
821
if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
822
--kvm->mn_active_invalidate_count;
823
wake = !kvm->mn_active_invalidate_count;
824
spin_unlock(&kvm->mn_invalidate_lock);
825
826
/*
827
* There can only be one waiter, since the wait happens under
828
* slots_lock.
829
*/
830
if (wake)
831
rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
832
}
833
834
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
835
struct mm_struct *mm,
836
unsigned long start,
837
unsigned long end)
838
{
839
trace_kvm_age_hva(start, end);
840
841
return kvm_age_hva_range(mn, start, end, kvm_age_gfn,
842
!IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
843
}
844
845
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
846
struct mm_struct *mm,
847
unsigned long start,
848
unsigned long end)
849
{
850
trace_kvm_age_hva(start, end);
851
852
/*
853
* Even though we do not flush TLB, this will still adversely
854
* affect performance on pre-Haswell Intel EPT, where there is
855
* no EPT Access Bit to clear so that we have to tear down EPT
856
* tables instead. If we find this unacceptable, we can always
857
* add a parameter to kvm_age_hva so that it effectively doesn't
858
* do anything on clear_young.
859
*
860
* Also note that currently we never issue secondary TLB flushes
861
* from clear_young, leaving this job up to the regular system
862
* cadence. If we find this inaccurate, we might come up with a
863
* more sophisticated heuristic later.
864
*/
865
return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn);
866
}
867
868
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
869
struct mm_struct *mm,
870
unsigned long address)
871
{
872
trace_kvm_test_age_hva(address);
873
874
return kvm_age_hva_range_no_flush(mn, address, address + 1,
875
kvm_test_age_gfn);
876
}
877
878
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
879
struct mm_struct *mm)
880
{
881
struct kvm *kvm = mmu_notifier_to_kvm(mn);
882
int idx;
883
884
idx = srcu_read_lock(&kvm->srcu);
885
kvm_flush_shadow_all(kvm);
886
srcu_read_unlock(&kvm->srcu, idx);
887
}
888
889
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
890
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
891
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
892
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
893
.clear_young = kvm_mmu_notifier_clear_young,
894
.test_young = kvm_mmu_notifier_test_young,
895
.release = kvm_mmu_notifier_release,
896
};
897
898
static int kvm_init_mmu_notifier(struct kvm *kvm)
899
{
900
kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
901
return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
902
}
903
904
#else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
905
906
static int kvm_init_mmu_notifier(struct kvm *kvm)
907
{
908
return 0;
909
}
910
911
#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
912
913
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
914
static int kvm_pm_notifier_call(struct notifier_block *bl,
915
unsigned long state,
916
void *unused)
917
{
918
struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
919
920
return kvm_arch_pm_notifier(kvm, state);
921
}
922
923
static void kvm_init_pm_notifier(struct kvm *kvm)
924
{
925
kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
926
/* Suspend KVM before we suspend ftrace, RCU, etc. */
927
kvm->pm_notifier.priority = INT_MAX;
928
register_pm_notifier(&kvm->pm_notifier);
929
}
930
931
static void kvm_destroy_pm_notifier(struct kvm *kvm)
932
{
933
unregister_pm_notifier(&kvm->pm_notifier);
934
}
935
#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
936
static void kvm_init_pm_notifier(struct kvm *kvm)
937
{
938
}
939
940
static void kvm_destroy_pm_notifier(struct kvm *kvm)
941
{
942
}
943
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
944
945
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
946
{
947
if (!memslot->dirty_bitmap)
948
return;
949
950
vfree(memslot->dirty_bitmap);
951
memslot->dirty_bitmap = NULL;
952
}
953
954
/* This does not remove the slot from struct kvm_memslots data structures */
955
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
956
{
957
if (slot->flags & KVM_MEM_GUEST_MEMFD)
958
kvm_gmem_unbind(slot);
959
960
kvm_destroy_dirty_bitmap(slot);
961
962
kvm_arch_free_memslot(kvm, slot);
963
964
kfree(slot);
965
}
966
967
static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
968
{
969
struct hlist_node *idnode;
970
struct kvm_memory_slot *memslot;
971
int bkt;
972
973
/*
974
* The same memslot objects live in both active and inactive sets,
975
* arbitrarily free using index '1' so the second invocation of this
976
* function isn't operating over a structure with dangling pointers
977
* (even though this function isn't actually touching them).
978
*/
979
if (!slots->node_idx)
980
return;
981
982
hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
983
kvm_free_memslot(kvm, memslot);
984
}
985
986
static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
987
{
988
switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
989
case KVM_STATS_TYPE_INSTANT:
990
return 0444;
991
case KVM_STATS_TYPE_CUMULATIVE:
992
case KVM_STATS_TYPE_PEAK:
993
default:
994
return 0644;
995
}
996
}
997
998
999
static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1000
{
1001
int i;
1002
int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1003
kvm_vcpu_stats_header.num_desc;
1004
1005
if (IS_ERR(kvm->debugfs_dentry))
1006
return;
1007
1008
debugfs_remove_recursive(kvm->debugfs_dentry);
1009
1010
if (kvm->debugfs_stat_data) {
1011
for (i = 0; i < kvm_debugfs_num_entries; i++)
1012
kfree(kvm->debugfs_stat_data[i]);
1013
kfree(kvm->debugfs_stat_data);
1014
}
1015
}
1016
1017
static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1018
{
1019
static DEFINE_MUTEX(kvm_debugfs_lock);
1020
struct dentry *dent;
1021
char dir_name[ITOA_MAX_LEN * 2];
1022
struct kvm_stat_data *stat_data;
1023
const struct _kvm_stats_desc *pdesc;
1024
int i, ret = -ENOMEM;
1025
int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1026
kvm_vcpu_stats_header.num_desc;
1027
1028
if (!debugfs_initialized())
1029
return 0;
1030
1031
snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
1032
mutex_lock(&kvm_debugfs_lock);
1033
dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
1034
if (dent) {
1035
pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1036
dput(dent);
1037
mutex_unlock(&kvm_debugfs_lock);
1038
return 0;
1039
}
1040
dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
1041
mutex_unlock(&kvm_debugfs_lock);
1042
if (IS_ERR(dent))
1043
return 0;
1044
1045
kvm->debugfs_dentry = dent;
1046
kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1047
sizeof(*kvm->debugfs_stat_data),
1048
GFP_KERNEL_ACCOUNT);
1049
if (!kvm->debugfs_stat_data)
1050
goto out_err;
1051
1052
for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1053
pdesc = &kvm_vm_stats_desc[i];
1054
stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1055
if (!stat_data)
1056
goto out_err;
1057
1058
stat_data->kvm = kvm;
1059
stat_data->desc = pdesc;
1060
stat_data->kind = KVM_STAT_VM;
1061
kvm->debugfs_stat_data[i] = stat_data;
1062
debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1063
kvm->debugfs_dentry, stat_data,
1064
&stat_fops_per_vm);
1065
}
1066
1067
for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1068
pdesc = &kvm_vcpu_stats_desc[i];
1069
stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1070
if (!stat_data)
1071
goto out_err;
1072
1073
stat_data->kvm = kvm;
1074
stat_data->desc = pdesc;
1075
stat_data->kind = KVM_STAT_VCPU;
1076
kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1077
debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1078
kvm->debugfs_dentry, stat_data,
1079
&stat_fops_per_vm);
1080
}
1081
1082
kvm_arch_create_vm_debugfs(kvm);
1083
return 0;
1084
out_err:
1085
kvm_destroy_vm_debugfs(kvm);
1086
return ret;
1087
}
1088
1089
/*
1090
* Called just after removing the VM from the vm_list, but before doing any
1091
* other destruction.
1092
*/
1093
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1094
{
1095
}
1096
1097
/*
1098
* Called after the per-vm debugfs is created. When called, kvm->debugfs_dentry should
1099
* be set up already, so we can create arch-specific debugfs entries under it.
1100
* Cleanup should be done automatically in kvm_destroy_vm_debugfs() recursively, so
1101
* a per-arch destroy interface is not needed.
1102
*/
1103
void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1104
{
1105
}
1106
1107
/* Called only on cleanup and destruction paths when there are no users. */
1108
static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm,
1109
enum kvm_bus idx)
1110
{
1111
return rcu_dereference_protected(kvm->buses[idx],
1112
!refcount_read(&kvm->users_count));
1113
}
1114
1115
static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
1116
{
1117
struct kvm *kvm = kvm_arch_alloc_vm();
1118
struct kvm_memslots *slots;
1119
int r, i, j;
1120
1121
if (!kvm)
1122
return ERR_PTR(-ENOMEM);
1123
1124
KVM_MMU_LOCK_INIT(kvm);
1125
mmgrab(current->mm);
1126
kvm->mm = current->mm;
1127
kvm_eventfd_init(kvm);
1128
mutex_init(&kvm->lock);
1129
mutex_init(&kvm->irq_lock);
1130
mutex_init(&kvm->slots_lock);
1131
mutex_init(&kvm->slots_arch_lock);
1132
spin_lock_init(&kvm->mn_invalidate_lock);
1133
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1134
xa_init(&kvm->vcpu_array);
1135
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1136
xa_init(&kvm->mem_attr_array);
1137
#endif
1138
1139
INIT_LIST_HEAD(&kvm->gpc_list);
1140
spin_lock_init(&kvm->gpc_lock);
1141
1142
INIT_LIST_HEAD(&kvm->devices);
1143
kvm->max_vcpus = KVM_MAX_VCPUS;
1144
1145
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1146
1147
/*
1148
* Force subsequent debugfs file creations to fail if the VM directory
1149
* is not created (by kvm_create_vm_debugfs()).
1150
*/
1151
kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1152
1153
snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1154
task_pid_nr(current));
1155
1156
r = -ENOMEM;
1157
if (init_srcu_struct(&kvm->srcu))
1158
goto out_err_no_srcu;
1159
if (init_srcu_struct(&kvm->irq_srcu))
1160
goto out_err_no_irq_srcu;
1161
1162
r = kvm_init_irq_routing(kvm);
1163
if (r)
1164
goto out_err_no_irq_routing;
1165
1166
refcount_set(&kvm->users_count, 1);
1167
1168
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1169
for (j = 0; j < 2; j++) {
1170
slots = &kvm->__memslots[i][j];
1171
1172
atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1173
slots->hva_tree = RB_ROOT_CACHED;
1174
slots->gfn_tree = RB_ROOT;
1175
hash_init(slots->id_hash);
1176
slots->node_idx = j;
1177
1178
/* Generations must be different for each address space. */
1179
slots->generation = i;
1180
}
1181
1182
rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1183
}
1184
1185
r = -ENOMEM;
1186
for (i = 0; i < KVM_NR_BUSES; i++) {
1187
rcu_assign_pointer(kvm->buses[i],
1188
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1189
if (!kvm->buses[i])
1190
goto out_err_no_arch_destroy_vm;
1191
}
1192
1193
r = kvm_arch_init_vm(kvm, type);
1194
if (r)
1195
goto out_err_no_arch_destroy_vm;
1196
1197
r = kvm_enable_virtualization();
1198
if (r)
1199
goto out_err_no_disable;
1200
1201
#ifdef CONFIG_HAVE_KVM_IRQCHIP
1202
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1203
#endif
1204
1205
r = kvm_init_mmu_notifier(kvm);
1206
if (r)
1207
goto out_err_no_mmu_notifier;
1208
1209
r = kvm_coalesced_mmio_init(kvm);
1210
if (r < 0)
1211
goto out_no_coalesced_mmio;
1212
1213
r = kvm_create_vm_debugfs(kvm, fdname);
1214
if (r)
1215
goto out_err_no_debugfs;
1216
1217
mutex_lock(&kvm_lock);
1218
list_add(&kvm->vm_list, &vm_list);
1219
mutex_unlock(&kvm_lock);
1220
1221
preempt_notifier_inc();
1222
kvm_init_pm_notifier(kvm);
1223
1224
return kvm;
1225
1226
out_err_no_debugfs:
1227
kvm_coalesced_mmio_free(kvm);
1228
out_no_coalesced_mmio:
1229
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1230
if (kvm->mmu_notifier.ops)
1231
mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1232
#endif
1233
out_err_no_mmu_notifier:
1234
kvm_disable_virtualization();
1235
out_err_no_disable:
1236
kvm_arch_destroy_vm(kvm);
1237
out_err_no_arch_destroy_vm:
1238
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1239
for (i = 0; i < KVM_NR_BUSES; i++)
1240
kfree(kvm_get_bus_for_destruction(kvm, i));
1241
kvm_free_irq_routing(kvm);
1242
out_err_no_irq_routing:
1243
cleanup_srcu_struct(&kvm->irq_srcu);
1244
out_err_no_irq_srcu:
1245
cleanup_srcu_struct(&kvm->srcu);
1246
out_err_no_srcu:
1247
kvm_arch_free_vm(kvm);
1248
mmdrop(current->mm);
1249
return ERR_PTR(r);
1250
}
1251
1252
static void kvm_destroy_devices(struct kvm *kvm)
1253
{
1254
struct kvm_device *dev, *tmp;
1255
1256
/*
1257
* We do not need to take the kvm->lock here, because nobody else
1258
* has a reference to the struct kvm at this point and therefore
1259
* cannot access the devices list anyhow.
1260
*
1261
* The device list is generally managed as an rculist, but list_del()
1262
* is used intentionally here. If a bug in KVM introduced a reader that
1263
* was not backed by a reference on the kvm struct, the hope is that
1264
* it'd consume the poisoned forward pointer instead of suffering a
1265
* use-after-free, even though this cannot be guaranteed.
1266
*/
1267
list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1268
list_del(&dev->vm_node);
1269
dev->ops->destroy(dev);
1270
}
1271
}
1272
1273
static void kvm_destroy_vm(struct kvm *kvm)
1274
{
1275
int i;
1276
struct mm_struct *mm = kvm->mm;
1277
1278
kvm_destroy_pm_notifier(kvm);
1279
kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1280
kvm_destroy_vm_debugfs(kvm);
1281
mutex_lock(&kvm_lock);
1282
list_del(&kvm->vm_list);
1283
mutex_unlock(&kvm_lock);
1284
kvm_arch_pre_destroy_vm(kvm);
1285
1286
kvm_free_irq_routing(kvm);
1287
for (i = 0; i < KVM_NR_BUSES; i++) {
1288
struct kvm_io_bus *bus = kvm_get_bus_for_destruction(kvm, i);
1289
1290
if (bus)
1291
kvm_io_bus_destroy(bus);
1292
kvm->buses[i] = NULL;
1293
}
1294
kvm_coalesced_mmio_free(kvm);
1295
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1296
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1297
/*
1298
* At this point, pending calls to invalidate_range_start()
1299
* have completed but no more MMU notifiers will run, so
1300
* mn_active_invalidate_count may remain unbalanced.
1301
* No threads can be waiting in kvm_swap_active_memslots() as the
1302
* last reference on KVM has been dropped, but freeing
1303
* memslots would deadlock without this manual intervention.
1304
*
1305
* If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
1306
* notifier between a start() and end(), then there shouldn't be any
1307
* in-progress invalidations.
1308
*/
1309
WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1310
if (kvm->mn_active_invalidate_count)
1311
kvm->mn_active_invalidate_count = 0;
1312
else
1313
WARN_ON(kvm->mmu_invalidate_in_progress);
1314
#else
1315
kvm_flush_shadow_all(kvm);
1316
#endif
1317
kvm_arch_destroy_vm(kvm);
1318
kvm_destroy_devices(kvm);
1319
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1320
kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1321
kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1322
}
1323
cleanup_srcu_struct(&kvm->irq_srcu);
1324
srcu_barrier(&kvm->srcu);
1325
cleanup_srcu_struct(&kvm->srcu);
1326
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1327
xa_destroy(&kvm->mem_attr_array);
1328
#endif
1329
kvm_arch_free_vm(kvm);
1330
preempt_notifier_dec();
1331
kvm_disable_virtualization();
1332
mmdrop(mm);
1333
}
1334
1335
void kvm_get_kvm(struct kvm *kvm)
1336
{
1337
refcount_inc(&kvm->users_count);
1338
}
1339
EXPORT_SYMBOL_GPL(kvm_get_kvm);
1340
1341
/*
1342
* Make sure the vm is not being destroyed; this is a safe version of
1343
* kvm_get_kvm(). Return true if kvm was referenced successfully, false otherwise.
1344
*/
1345
bool kvm_get_kvm_safe(struct kvm *kvm)
1346
{
1347
return refcount_inc_not_zero(&kvm->users_count);
1348
}
1349
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1350
1351
void kvm_put_kvm(struct kvm *kvm)
1352
{
1353
if (refcount_dec_and_test(&kvm->users_count))
1354
kvm_destroy_vm(kvm);
1355
}
1356
EXPORT_SYMBOL_GPL(kvm_put_kvm);
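/*
 * Illustrative sketch (editor's addition, hypothetical caller): taking a
 * temporary reference from a context that can race with VM teardown.
 */
static void example_use_vm(struct kvm *kvm)
{
	if (!kvm_get_kvm_safe(kvm))
		return;		/* the last reference is already gone */
	/* ... safely dereference kvm ... */
	kvm_put_kvm(kvm);
}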
1357
1358
/*
1359
* Used to put a reference that was taken on behalf of an object associated
1360
* with a user-visible file descriptor, e.g. a vcpu or device, if installation
1361
* of the new file descriptor fails and the reference cannot be transferred to
1362
* its final owner. In such cases, the caller is still actively using @kvm and
1363
* will fail miserably if the refcount unexpectedly hits zero.
1364
*/
1365
void kvm_put_kvm_no_destroy(struct kvm *kvm)
1366
{
1367
WARN_ON(refcount_dec_and_test(&kvm->users_count));
1368
}
1369
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_put_kvm_no_destroy);
1370
1371
static int kvm_vm_release(struct inode *inode, struct file *filp)
1372
{
1373
struct kvm *kvm = filp->private_data;
1374
1375
kvm_irqfd_release(kvm);
1376
1377
kvm_put_kvm(kvm);
1378
return 0;
1379
}
1380
1381
int kvm_trylock_all_vcpus(struct kvm *kvm)
1382
{
1383
struct kvm_vcpu *vcpu;
1384
unsigned long i, j;
1385
1386
lockdep_assert_held(&kvm->lock);
1387
1388
kvm_for_each_vcpu(i, vcpu, kvm)
1389
if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
1390
goto out_unlock;
1391
return 0;
1392
1393
out_unlock:
1394
kvm_for_each_vcpu(j, vcpu, kvm) {
1395
if (i == j)
1396
break;
1397
mutex_unlock(&vcpu->mutex);
1398
}
1399
return -EINTR;
1400
}
1401
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_trylock_all_vcpus);
1402
1403
int kvm_lock_all_vcpus(struct kvm *kvm)
1404
{
1405
struct kvm_vcpu *vcpu;
1406
unsigned long i, j;
1407
int r;
1408
1409
lockdep_assert_held(&kvm->lock);
1410
1411
kvm_for_each_vcpu(i, vcpu, kvm) {
1412
r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock);
1413
if (r)
1414
goto out_unlock;
1415
}
1416
return 0;
1417
1418
out_unlock:
1419
kvm_for_each_vcpu(j, vcpu, kvm) {
1420
if (i == j)
1421
break;
1422
mutex_unlock(&vcpu->mutex);
1423
}
1424
return r;
1425
}
1426
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lock_all_vcpus);
1427
1428
void kvm_unlock_all_vcpus(struct kvm *kvm)
1429
{
1430
struct kvm_vcpu *vcpu;
1431
unsigned long i;
1432
1433
lockdep_assert_held(&kvm->lock);
1434
1435
kvm_for_each_vcpu(i, vcpu, kvm)
1436
mutex_unlock(&vcpu->mutex);
1437
}
1438
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_unlock_all_vcpus);
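/*
 * Illustrative sketch (editor's addition, hypothetical caller): the
 * trylock-all/unlock-all pair must be used with kvm->lock held, per the
 * lockdep assertions above. On failure, kvm_trylock_all_vcpus() has already
 * dropped whatever vCPU mutexes it managed to take.
 */
static int example_all_vcpus_op(struct kvm *kvm)
{
	int r;

	mutex_lock(&kvm->lock);
	r = kvm_trylock_all_vcpus(kvm);
	if (!r) {
		/* ... operate with every vcpu->mutex held ... */
		kvm_unlock_all_vcpus(kvm);
	}
	mutex_unlock(&kvm->lock);
	return r;
}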
1439
1440
/*
1441
* Allocation size is twice as large as the actual dirty bitmap size.
1442
* See kvm_vm_ioctl_get_dirty_log() for why this is needed.
1443
*/
1444
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1445
{
1446
unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1447
1448
memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1449
if (!memslot->dirty_bitmap)
1450
return -ENOMEM;
1451
1452
return 0;
1453
}
1454
1455
static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1456
{
1457
struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1458
int node_idx_inactive = active->node_idx ^ 1;
1459
1460
return &kvm->__memslots[as_id][node_idx_inactive];
1461
}
1462
1463
/*
1464
* Helper to get the address space ID when one of memslot pointers may be NULL.
1465
* This also serves as a sanity check that at least one of the pointers is non-NULL,
1466
* and that their address space IDs don't diverge.
1467
*/
1468
static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1469
struct kvm_memory_slot *b)
1470
{
1471
if (WARN_ON_ONCE(!a && !b))
1472
return 0;
1473
1474
if (!a)
1475
return b->as_id;
1476
if (!b)
1477
return a->as_id;
1478
1479
WARN_ON_ONCE(a->as_id != b->as_id);
1480
return a->as_id;
1481
}
1482
1483
static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1484
struct kvm_memory_slot *slot)
1485
{
1486
struct rb_root *gfn_tree = &slots->gfn_tree;
1487
struct rb_node **node, *parent;
1488
int idx = slots->node_idx;
1489
1490
parent = NULL;
1491
for (node = &gfn_tree->rb_node; *node; ) {
1492
struct kvm_memory_slot *tmp;
1493
1494
tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1495
parent = *node;
1496
if (slot->base_gfn < tmp->base_gfn)
1497
node = &(*node)->rb_left;
1498
else if (slot->base_gfn > tmp->base_gfn)
1499
node = &(*node)->rb_right;
1500
else
1501
BUG();
1502
}
1503
1504
rb_link_node(&slot->gfn_node[idx], parent, node);
1505
rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1506
}
1507
1508
static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1509
struct kvm_memory_slot *slot)
1510
{
1511
rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1512
}
1513
1514
static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1515
struct kvm_memory_slot *old,
1516
struct kvm_memory_slot *new)
1517
{
1518
int idx = slots->node_idx;
1519
1520
WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1521
1522
rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1523
&slots->gfn_tree);
1524
}
1525
1526
/*
1527
* Replace @old with @new in the inactive memslots.
1528
*
1529
* With NULL @old this simply adds @new.
1530
* With NULL @new this simply removes @old.
1531
*
1532
* If @new is non-NULL its hva_node[slots_idx] range has to be set
1533
* appropriately.
1534
*/
1535
static void kvm_replace_memslot(struct kvm *kvm,
1536
struct kvm_memory_slot *old,
1537
struct kvm_memory_slot *new)
1538
{
1539
int as_id = kvm_memslots_get_as_id(old, new);
1540
struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1541
int idx = slots->node_idx;
1542
1543
if (old) {
1544
hash_del(&old->id_node[idx]);
1545
interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1546
1547
if ((long)old == atomic_long_read(&slots->last_used_slot))
1548
atomic_long_set(&slots->last_used_slot, (long)new);
1549
1550
if (!new) {
1551
kvm_erase_gfn_node(slots, old);
1552
return;
1553
}
1554
}
1555
1556
/*
1557
* Initialize @new's hva range. Do this even when replacing an @old
1558
* slot, kvm_copy_memslot() deliberately does not touch node data.
1559
*/
1560
new->hva_node[idx].start = new->userspace_addr;
1561
new->hva_node[idx].last = new->userspace_addr +
1562
(new->npages << PAGE_SHIFT) - 1;
1563
1564
/*
1565
* (Re)Add the new memslot. There is no O(1) interval_tree_replace(),
1566
* so hva_node needs to be swapped with remove+insert even though hva can't
1567
* change when replacing an existing slot.
1568
*/
1569
hash_add(slots->id_hash, &new->id_node[idx], new->id);
1570
interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1571
1572
/*
1573
* If the memslot gfn is unchanged, rb_replace_node() can be used to
1574
* switch the node in the gfn tree instead of removing the old and
1575
* inserting the new as two separate operations. Replacement is a
1576
* single O(1) operation versus two O(log(n)) operations for
1577
* remove+insert.
1578
*/
1579
if (old && old->base_gfn == new->base_gfn) {
1580
kvm_replace_gfn_node(slots, old, new);
1581
} else {
1582
if (old)
1583
kvm_erase_gfn_node(slots, old);
1584
kvm_insert_gfn_node(slots, new);
1585
}
1586
}
1587
1588
/*
1589
* Flags that do not access any of the extra space of struct
1590
* kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS
1591
* only allows these.
1592
*/
1593
#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
1594
(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
1595
1596
static int check_memory_region_flags(struct kvm *kvm,
1597
const struct kvm_userspace_memory_region2 *mem)
1598
{
1599
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1600
1601
if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
1602
valid_flags |= KVM_MEM_GUEST_MEMFD;
1603
1604
/* Dirty logging private memory is not currently supported. */
1605
if (mem->flags & KVM_MEM_GUEST_MEMFD)
1606
valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
1607
1608
/*
1609
* GUEST_MEMFD is incompatible with read-only memslots, as writes to
1610
* read-only memslots have emulated MMIO, not page fault, semantics,
1611
* and KVM doesn't allow emulated MMIO for private memory.
1612
*/
1613
if (kvm_arch_has_readonly_mem(kvm) &&
1614
!(mem->flags & KVM_MEM_GUEST_MEMFD))
1615
valid_flags |= KVM_MEM_READONLY;
1616
1617
if (mem->flags & ~valid_flags)
1618
return -EINVAL;
1619
1620
return 0;
1621
}
1622
1623
static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1624
{
1625
struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1626
1627
/* Grab the generation from the active memslots. */
1628
u64 gen = __kvm_memslots(kvm, as_id)->generation;
1629
1630
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1631
slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1632
1633
/*
1634
* Do not store the new memslots while there are invalidations in
1635
* progress, otherwise the locking in invalidate_range_start and
1636
* invalidate_range_end will be unbalanced.
1637
*/
1638
spin_lock(&kvm->mn_invalidate_lock);
1639
prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1640
while (kvm->mn_active_invalidate_count) {
1641
set_current_state(TASK_UNINTERRUPTIBLE);
1642
spin_unlock(&kvm->mn_invalidate_lock);
1643
schedule();
1644
spin_lock(&kvm->mn_invalidate_lock);
1645
}
1646
finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1647
rcu_assign_pointer(kvm->memslots[as_id], slots);
1648
spin_unlock(&kvm->mn_invalidate_lock);
1649
1650
/*
1651
* Acquired in kvm_set_memslot. Must be released before synchronize
1652
* SRCU below in order to avoid deadlock with another thread
1653
* acquiring the slots_arch_lock in an srcu critical section.
1654
*/
1655
mutex_unlock(&kvm->slots_arch_lock);
1656
1657
synchronize_srcu_expedited(&kvm->srcu);
1658
1659
/*
1660
* Increment the new memslot generation a second time, dropping the
1661
* update in-progress flag and incrementing the generation based on
1662
* the number of address spaces. This provides a unique and easily
1663
* identifiable generation number while the memslots are in flux.
1664
*/
1665
gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1666
1667
/*
1668
* Generations must be unique even across address spaces. We do not need
1669
* a global counter for that; instead, the generation space is evenly split
1670
* across address spaces. For example, with two address spaces, address
1671
* space 0 will use generations 0, 2, 4, ... while address space 1 will
1672
* use generations 1, 3, 5, ...
1673
*/
1674
gen += kvm_arch_nr_memslot_as_ids(kvm);
1675
1676
kvm_arch_memslots_updated(kvm, gen);
1677
1678
slots->generation = gen;
1679
}
1680
1681
static int kvm_prepare_memory_region(struct kvm *kvm,
1682
const struct kvm_memory_slot *old,
1683
struct kvm_memory_slot *new,
1684
enum kvm_mr_change change)
1685
{
1686
int r;
1687
1688
/*
1689
* If dirty logging is disabled, nullify the bitmap; the old bitmap
1690
* will be freed on "commit". If logging is enabled in both old and
1691
* new, reuse the existing bitmap. If logging is enabled only in the
1692
* new and KVM isn't using a ring buffer, allocate and initialize a
1693
* new bitmap.
1694
*/
1695
if (change != KVM_MR_DELETE) {
1696
if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1697
new->dirty_bitmap = NULL;
1698
else if (old && old->dirty_bitmap)
1699
new->dirty_bitmap = old->dirty_bitmap;
1700
else if (kvm_use_dirty_bitmap(kvm)) {
1701
r = kvm_alloc_dirty_bitmap(new);
1702
if (r)
1703
return r;
1704
1705
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1706
bitmap_set(new->dirty_bitmap, 0, new->npages);
1707
}
1708
}
1709
1710
r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1711
1712
/* Free the bitmap on failure if it was allocated above. */
1713
if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
1714
kvm_destroy_dirty_bitmap(new);
1715
1716
return r;
1717
}
1718
1719
static void kvm_commit_memory_region(struct kvm *kvm,
1720
struct kvm_memory_slot *old,
1721
const struct kvm_memory_slot *new,
1722
enum kvm_mr_change change)
1723
{
1724
int old_flags = old ? old->flags : 0;
1725
int new_flags = new ? new->flags : 0;
1726
/*
1727
* Update the total number of memslot pages before calling the arch
1728
* hook so that architectures can consume the result directly.
1729
*/
1730
if (change == KVM_MR_DELETE)
1731
kvm->nr_memslot_pages -= old->npages;
1732
else if (change == KVM_MR_CREATE)
1733
kvm->nr_memslot_pages += new->npages;
1734
1735
if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1736
int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1737
atomic_set(&kvm->nr_memslots_dirty_logging,
1738
atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1739
}
1740
1741
kvm_arch_commit_memory_region(kvm, old, new, change);
1742
1743
switch (change) {
1744
case KVM_MR_CREATE:
1745
/* Nothing more to do. */
1746
break;
1747
case KVM_MR_DELETE:
1748
/* Free the old memslot and all its metadata. */
1749
kvm_free_memslot(kvm, old);
1750
break;
1751
case KVM_MR_MOVE:
1752
/*
1753
* Moving a guest_memfd memslot isn't supported, and will never
1754
* be supported.
1755
*/
1756
WARN_ON_ONCE(old->flags & KVM_MEM_GUEST_MEMFD);
1757
fallthrough;
1758
case KVM_MR_FLAGS_ONLY:
1759
/*
1760
* Free the dirty bitmap as needed; the below check encompasses
1761
* both the flags and whether a ring buffer is being used.
1762
*/
1763
if (old->dirty_bitmap && !new->dirty_bitmap)
1764
kvm_destroy_dirty_bitmap(old);
1765
1766
/*
1767
* Unbind the guest_memfd instance as needed; the @new slot has
1768
* already created its own binding. TODO: Drop the WARN when
1769
* dirty logging guest_memfd memslots is supported. Until then,
1770
* flags-only changes on guest_memfd slots should be impossible.
1771
*/
1772
if (WARN_ON_ONCE(old->flags & KVM_MEM_GUEST_MEMFD))
1773
kvm_gmem_unbind(old);
1774
1775
/*
1776
* The final quirk. Free the detached, old slot, but only its
1777
* memory, not any metadata. Metadata, including arch specific
1778
* data, may be reused by @new.
1779
*/
1780
kfree(old);
1781
break;
1782
default:
1783
BUG();
1784
}
1785
}
1786
1787
/*
1788
* Activate @new, which must be installed in the inactive slots by the caller,
1789
* by swapping the active slots and then propagating @new to @old once @old is
1790
* unreachable and can be safely modified.
1791
*
1792
* With NULL @old this simply adds @new to @active (while swapping the sets).
1793
* With NULL @new this simply removes @old from @active and frees it
1794
* (while also swapping the sets).
1795
*/
1796
static void kvm_activate_memslot(struct kvm *kvm,
1797
struct kvm_memory_slot *old,
1798
struct kvm_memory_slot *new)
1799
{
1800
int as_id = kvm_memslots_get_as_id(old, new);
1801
1802
kvm_swap_active_memslots(kvm, as_id);
1803
1804
/* Propagate the new memslot to the now inactive memslots. */
1805
kvm_replace_memslot(kvm, old, new);
1806
}
1807
1808
static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1809
const struct kvm_memory_slot *src)
1810
{
1811
dest->base_gfn = src->base_gfn;
1812
dest->npages = src->npages;
1813
dest->dirty_bitmap = src->dirty_bitmap;
1814
dest->arch = src->arch;
1815
dest->userspace_addr = src->userspace_addr;
1816
dest->flags = src->flags;
1817
dest->id = src->id;
1818
dest->as_id = src->as_id;
1819
}
1820
1821
static void kvm_invalidate_memslot(struct kvm *kvm,
1822
struct kvm_memory_slot *old,
1823
struct kvm_memory_slot *invalid_slot)
1824
{
1825
/*
1826
* Mark the current slot INVALID. As with all memslot modifications,
1827
* this must be done on an unreachable slot to avoid modifying the
1828
* current slot in the active tree.
1829
*/
1830
kvm_copy_memslot(invalid_slot, old);
1831
invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1832
kvm_replace_memslot(kvm, old, invalid_slot);
1833
1834
/*
1835
* Activate the slot that is now marked INVALID, but don't propagate
1836
* the slot to the now inactive slots. The slot is either going to be
1837
* deleted or recreated as a new slot.
1838
*/
1839
kvm_swap_active_memslots(kvm, old->as_id);
1840
1841
/*
1842
* From this point no new shadow pages pointing to a deleted, or moved,
1843
* memslot will be created. Validation of sp->gfn happens in:
1844
* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1845
* - kvm_is_visible_gfn (mmu_check_root)
1846
*/
1847
kvm_arch_flush_shadow_memslot(kvm, old);
1848
kvm_arch_guest_memory_reclaimed(kvm);
1849
1850
/* Was released by kvm_swap_active_memslots(), reacquire. */
1851
mutex_lock(&kvm->slots_arch_lock);
1852
1853
/*
1854
* Copy the arch-specific field of the newly-installed slot back to the
1855
* old slot as the arch data could have changed between releasing
1856
* slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1857
* above. Writers are required to retrieve memslots *after* acquiring
1858
* slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1859
*/
1860
old->arch = invalid_slot->arch;
1861
}
1862
1863
static void kvm_create_memslot(struct kvm *kvm,
1864
struct kvm_memory_slot *new)
1865
{
1866
/* Add the new memslot to the inactive set and activate. */
1867
kvm_replace_memslot(kvm, NULL, new);
1868
kvm_activate_memslot(kvm, NULL, new);
1869
}
1870
1871
static void kvm_delete_memslot(struct kvm *kvm,
1872
struct kvm_memory_slot *old,
1873
struct kvm_memory_slot *invalid_slot)
1874
{
1875
/*
1876
* Remove the old memslot (in the inactive memslots) by passing NULL as
1877
* the "new" slot, and for the invalid version in the active slots.
1878
*/
1879
kvm_replace_memslot(kvm, old, NULL);
1880
kvm_activate_memslot(kvm, invalid_slot, NULL);
1881
}
1882
1883
static void kvm_move_memslot(struct kvm *kvm,
1884
struct kvm_memory_slot *old,
1885
struct kvm_memory_slot *new,
1886
struct kvm_memory_slot *invalid_slot)
1887
{
1888
/*
1889
* Replace the old memslot in the inactive slots, and then swap slots
1890
* and replace the current INVALID with the new as well.
1891
*/
1892
kvm_replace_memslot(kvm, old, new);
1893
kvm_activate_memslot(kvm, invalid_slot, new);
1894
}
1895
1896
static void kvm_update_flags_memslot(struct kvm *kvm,
1897
struct kvm_memory_slot *old,
1898
struct kvm_memory_slot *new)
1899
{
1900
/*
1901
* Similar to the MOVE case, but the slot doesn't need to be zapped as
1902
* an intermediate step. Instead, the old memslot is simply replaced
1903
* with a new, updated copy in both memslot sets.
1904
*/
1905
kvm_replace_memslot(kvm, old, new);
1906
kvm_activate_memslot(kvm, old, new);
1907
}
1908
1909
static int kvm_set_memslot(struct kvm *kvm,
1910
struct kvm_memory_slot *old,
1911
struct kvm_memory_slot *new,
1912
enum kvm_mr_change change)
1913
{
1914
struct kvm_memory_slot *invalid_slot;
1915
int r;
1916
1917
/*
1918
* Released in kvm_swap_active_memslots().
1919
*
1920
* Must be held from before the current memslots are copied until after
1921
* the new memslots are installed with rcu_assign_pointer, then
1922
* released before the synchronize srcu in kvm_swap_active_memslots().
1923
*
1924
* When modifying memslots outside of the slots_lock, must be held
1925
* before reading the pointer to the current memslots until after all
1926
* changes to those memslots are complete.
1927
*
1928
* These rules ensure that installing new memslots does not lose
1929
* changes made to the previous memslots.
1930
*/
1931
mutex_lock(&kvm->slots_arch_lock);
1932
1933
/*
1934
* Invalidate the old slot if it's being deleted or moved. This is
1935
* done prior to actually deleting/moving the memslot to allow vCPUs to
1936
* continue running by ensuring there are no mappings or shadow pages
1937
* for the memslot when it is deleted/moved. Without pre-invalidation
1938
* (and without a lock), a window would exist between effecting the
1939
* delete/move and committing the changes in arch code where KVM or a
1940
* guest could access a non-existent memslot.
1941
*
1942
* Modifications are done on a temporary, unreachable slot. The old
1943
* slot needs to be preserved in case a later step fails and the
1944
* invalidation needs to be reverted.
1945
*/
1946
if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1947
invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1948
if (!invalid_slot) {
1949
mutex_unlock(&kvm->slots_arch_lock);
1950
return -ENOMEM;
1951
}
1952
kvm_invalidate_memslot(kvm, old, invalid_slot);
1953
}
1954
1955
r = kvm_prepare_memory_region(kvm, old, new, change);
1956
if (r) {
1957
/*
1958
* For DELETE/MOVE, revert the above INVALID change. No
1959
* modifications required since the original slot was preserved
1960
* in the inactive slots. Changing the active memslots also
1961
* releases slots_arch_lock.
1962
*/
1963
if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1964
kvm_activate_memslot(kvm, invalid_slot, old);
1965
kfree(invalid_slot);
1966
} else {
1967
mutex_unlock(&kvm->slots_arch_lock);
1968
}
1969
return r;
1970
}
1971
1972
/*
1973
* For DELETE and MOVE, the working slot is now active as the INVALID
1974
* version of the old slot. MOVE is particularly special as it reuses
1975
* the old slot and returns a copy of the old slot (in invalid_slot).
1976
* For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
1977
* old slot is detached but otherwise preserved.
1978
*/
1979
if (change == KVM_MR_CREATE)
1980
kvm_create_memslot(kvm, new);
1981
else if (change == KVM_MR_DELETE)
1982
kvm_delete_memslot(kvm, old, invalid_slot);
1983
else if (change == KVM_MR_MOVE)
1984
kvm_move_memslot(kvm, old, new, invalid_slot);
1985
else if (change == KVM_MR_FLAGS_ONLY)
1986
kvm_update_flags_memslot(kvm, old, new);
1987
else
1988
BUG();
1989
1990
/* Free the temporary INVALID slot used for DELETE and MOVE. */
1991
if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1992
kfree(invalid_slot);
1993
1994
/*
1995
* No need to refresh new->arch, changes after dropping slots_arch_lock
1996
* will directly hit the final, active memslot. Architectures are
1997
* responsible for knowing that new->arch may be stale.
1998
*/
1999
kvm_commit_memory_region(kvm, old, new, change);
2000
2001
return 0;
2002
}
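/*
 * Editor's note (illustrative summary, not part of the upstream file): for a
 * DELETE or MOVE the sequence above works out to:
 *
 *   1. kvm_invalidate_memslot()    - install an INVALID copy of the slot and
 *                                    flush shadow pages; vCPUs keep running
 *   2. kvm_prepare_memory_region() - arch allocations; on failure the
 *                                    original slot is re-activated
 *   3. kvm_delete_memslot() /
 *      kvm_move_memslot()          - swap in the final memslot layout
 *   4. kvm_commit_memory_region()  - arch commit, accounting and freeing
 *
 * CREATE and FLAGS_ONLY skip step 1 and go straight to prepare/install/commit.
 */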
2003
2004
static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
2005
gfn_t start, gfn_t end)
2006
{
2007
struct kvm_memslot_iter iter;
2008
2009
kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
2010
if (iter.slot->id != id)
2011
return true;
2012
}
2013
2014
return false;
2015
}
2016
2017
static int kvm_set_memory_region(struct kvm *kvm,
2018
const struct kvm_userspace_memory_region2 *mem)
2019
{
2020
struct kvm_memory_slot *old, *new;
2021
struct kvm_memslots *slots;
2022
enum kvm_mr_change change;
2023
unsigned long npages;
2024
gfn_t base_gfn;
2025
int as_id, id;
2026
int r;
2027
2028
lockdep_assert_held(&kvm->slots_lock);
2029
2030
r = check_memory_region_flags(kvm, mem);
2031
if (r)
2032
return r;
2033
2034
as_id = mem->slot >> 16;
2035
id = (u16)mem->slot;
2036
2037
/* General sanity checks */
2038
if ((mem->memory_size & (PAGE_SIZE - 1)) ||
2039
(mem->memory_size != (unsigned long)mem->memory_size))
2040
return -EINVAL;
2041
if (mem->guest_phys_addr & (PAGE_SIZE - 1))
2042
return -EINVAL;
2043
/* We can read the guest memory with __xxx_user() later on. */
2044
if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
2045
(mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
2046
!access_ok((void __user *)(unsigned long)mem->userspace_addr,
2047
mem->memory_size))
2048
return -EINVAL;
2049
if (mem->flags & KVM_MEM_GUEST_MEMFD &&
2050
(mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
2051
mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
2052
return -EINVAL;
2053
if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
2054
return -EINVAL;
2055
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
2056
return -EINVAL;
2057
2058
/*
2059
* The size of userspace-defined memory regions is restricted in order
2060
* to play nice with dirty bitmap operations, which are indexed with an
2061
* "unsigned int". KVM's internal memory regions don't support dirty
2062
* logging, and so are exempt.
2063
*/
2064
if (id < KVM_USER_MEM_SLOTS &&
2065
(mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
2066
return -EINVAL;
2067
2068
slots = __kvm_memslots(kvm, as_id);
2069
2070
/*
2071
* Note, the old memslot (and the pointer itself!) may be invalidated
2072
* and/or destroyed by kvm_set_memslot().
2073
*/
2074
old = id_to_memslot(slots, id);
2075
2076
if (!mem->memory_size) {
2077
if (!old || !old->npages)
2078
return -EINVAL;
2079
2080
if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
2081
return -EIO;
2082
2083
return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
2084
}
2085
2086
base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2087
npages = (mem->memory_size >> PAGE_SHIFT);
2088
2089
if (!old || !old->npages) {
2090
change = KVM_MR_CREATE;
2091
2092
/*
2093
* To simplify KVM internals, the total number of pages across
2094
* all memslots must fit in an unsigned long.
2095
*/
2096
if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2097
return -EINVAL;
2098
} else { /* Modify an existing slot. */
2099
/* Private memslots are immutable; they can only be deleted. */
2100
if (mem->flags & KVM_MEM_GUEST_MEMFD)
2101
return -EINVAL;
2102
if ((mem->userspace_addr != old->userspace_addr) ||
2103
(npages != old->npages) ||
2104
((mem->flags ^ old->flags) & (KVM_MEM_READONLY | KVM_MEM_GUEST_MEMFD)))
2105
return -EINVAL;
2106
2107
if (base_gfn != old->base_gfn)
2108
change = KVM_MR_MOVE;
2109
else if (mem->flags != old->flags)
2110
change = KVM_MR_FLAGS_ONLY;
2111
else /* Nothing to change. */
2112
return 0;
2113
}
2114
2115
if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2116
kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2117
return -EEXIST;
2118
2119
/* Allocate a slot that will persist in the memslot. */
2120
new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2121
if (!new)
2122
return -ENOMEM;
2123
2124
new->as_id = as_id;
2125
new->id = id;
2126
new->base_gfn = base_gfn;
2127
new->npages = npages;
2128
new->flags = mem->flags;
2129
new->userspace_addr = mem->userspace_addr;
2130
if (mem->flags & KVM_MEM_GUEST_MEMFD) {
2131
r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
2132
if (r)
2133
goto out;
2134
}
2135
2136
r = kvm_set_memslot(kvm, old, new, change);
2137
if (r)
2138
goto out_unbind;
2139
2140
return 0;
2141
2142
out_unbind:
2143
if (mem->flags & KVM_MEM_GUEST_MEMFD)
2144
kvm_gmem_unbind(new);
2145
out:
2146
kfree(new);
2147
return r;
2148
}
2149
2150
int kvm_set_internal_memslot(struct kvm *kvm,
2151
const struct kvm_userspace_memory_region2 *mem)
2152
{
2153
if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS))
2154
return -EINVAL;
2155
2156
if (WARN_ON_ONCE(mem->flags))
2157
return -EINVAL;
2158
2159
return kvm_set_memory_region(kvm, mem);
2160
}
2161
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_internal_memslot);
2162
2163
static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2164
struct kvm_userspace_memory_region2 *mem)
2165
{
2166
if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2167
return -EINVAL;
2168
2169
guard(mutex)(&kvm->slots_lock);
2170
return kvm_set_memory_region(kvm, mem);
2171
}
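/*
 * Editor's example (illustrative only, not part of the upstream file): a
 * minimal userspace sketch of the ABI serviced by the handler above, using the
 * legacy KVM_SET_USER_MEMORY_REGION layout; KVM_SET_USER_MEMORY_REGION2 adds
 * the guest_memfd fields checked in kvm_set_memory_region().  Error handling
 * is omitted and the addresses/sizes are arbitrary placeholders.
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR);
 *	int vm_fd  = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	void *host = mmap(NULL, 2 << 20, PROT_READ | PROT_WRITE,
 *			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,			// (as_id << 16) | id
 *		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0x100000,		// page aligned
 *		.memory_size     = 2 << 20,		// page aligned
 *		.userspace_addr  = (__u64)host,		// page aligned, access_ok()
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 *
 *	// Deleting the slot later: repeat the call with .memory_size = 0.
 */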
2172
2173
#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2174
/**
2175
* kvm_get_dirty_log - get a snapshot of dirty pages
2176
* @kvm: pointer to kvm instance
2177
* @log: slot id and address to which we copy the log
2178
* @is_dirty: set to '1' if any dirty pages were found
2179
* @memslot: set to the associated memslot, always valid on success
2180
*/
2181
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2182
int *is_dirty, struct kvm_memory_slot **memslot)
2183
{
2184
struct kvm_memslots *slots;
2185
int i, as_id, id;
2186
unsigned long n;
2187
unsigned long any = 0;
2188
2189
/* Dirty ring tracking may be exclusive to dirty log tracking */
2190
if (!kvm_use_dirty_bitmap(kvm))
2191
return -ENXIO;
2192
2193
*memslot = NULL;
2194
*is_dirty = 0;
2195
2196
as_id = log->slot >> 16;
2197
id = (u16)log->slot;
2198
if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2199
return -EINVAL;
2200
2201
slots = __kvm_memslots(kvm, as_id);
2202
*memslot = id_to_memslot(slots, id);
2203
if (!(*memslot) || !(*memslot)->dirty_bitmap)
2204
return -ENOENT;
2205
2206
kvm_arch_sync_dirty_log(kvm, *memslot);
2207
2208
n = kvm_dirty_bitmap_bytes(*memslot);
2209
2210
for (i = 0; !any && i < n/sizeof(long); ++i)
2211
any = (*memslot)->dirty_bitmap[i];
2212
2213
if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2214
return -EFAULT;
2215
2216
if (any)
2217
*is_dirty = 1;
2218
return 0;
2219
}
2220
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dirty_log);
2221
2222
#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2223
/**
2224
* kvm_get_dirty_log_protect - get a snapshot of dirty pages
2225
* and reenable dirty page tracking for the corresponding pages.
2226
* @kvm: pointer to kvm instance
2227
* @log: slot id and address to which we copy the log
2228
*
2229
* We need to keep in mind that VCPU threads can write to the bitmap
2230
* concurrently. So, to avoid losing track of dirty pages we keep the
2231
* following order:
2232
*
2233
* 1. Take a snapshot of the bit and clear it if needed.
2234
* 2. Write protect the corresponding page.
2235
* 3. Copy the snapshot to the userspace.
2236
* 4. Upon return caller flushes TLB's if needed.
2237
*
2238
* Between 2 and 4, the guest may write to the page using the remaining TLB
2239
* entry. This is not a problem because the page is reported dirty using
2240
* the snapshot taken before and step 4 ensures that writes done after
2241
* exiting to userspace will be logged for the next call.
2242
*
2243
*/
2244
static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2245
{
2246
struct kvm_memslots *slots;
2247
struct kvm_memory_slot *memslot;
2248
int i, as_id, id;
2249
unsigned long n;
2250
unsigned long *dirty_bitmap;
2251
unsigned long *dirty_bitmap_buffer;
2252
bool flush;
2253
2254
/* Dirty ring tracking may be exclusive to dirty log tracking */
2255
if (!kvm_use_dirty_bitmap(kvm))
2256
return -ENXIO;
2257
2258
as_id = log->slot >> 16;
2259
id = (u16)log->slot;
2260
if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2261
return -EINVAL;
2262
2263
slots = __kvm_memslots(kvm, as_id);
2264
memslot = id_to_memslot(slots, id);
2265
if (!memslot || !memslot->dirty_bitmap)
2266
return -ENOENT;
2267
2268
dirty_bitmap = memslot->dirty_bitmap;
2269
2270
kvm_arch_sync_dirty_log(kvm, memslot);
2271
2272
n = kvm_dirty_bitmap_bytes(memslot);
2273
flush = false;
2274
if (kvm->manual_dirty_log_protect) {
2275
/*
2276
* Unlike kvm_get_dirty_log, we always leave flush set to false here,
2277
* because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
2278
* is some code duplication between this function and
2279
* kvm_get_dirty_log, but hopefully all architecture
2280
* transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
2281
* can be eliminated.
2282
*/
2283
dirty_bitmap_buffer = dirty_bitmap;
2284
} else {
2285
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2286
memset(dirty_bitmap_buffer, 0, n);
2287
2288
KVM_MMU_LOCK(kvm);
2289
for (i = 0; i < n / sizeof(long); i++) {
2290
unsigned long mask;
2291
gfn_t offset;
2292
2293
if (!dirty_bitmap[i])
2294
continue;
2295
2296
flush = true;
2297
mask = xchg(&dirty_bitmap[i], 0);
2298
dirty_bitmap_buffer[i] = mask;
2299
2300
offset = i * BITS_PER_LONG;
2301
kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2302
offset, mask);
2303
}
2304
KVM_MMU_UNLOCK(kvm);
2305
}
2306
2307
if (flush)
2308
kvm_flush_remote_tlbs_memslot(kvm, memslot);
2309
2310
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2311
return -EFAULT;
2312
return 0;
2313
}
2314
2315
2316
/**
2317
* kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2318
* @kvm: kvm instance
2319
* @log: slot id and address to which we copy the log
2320
*
2321
* Steps 1-4 below provide general overview of dirty page logging. See
2322
* kvm_get_dirty_log_protect() function description for additional details.
2323
*
2324
* We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2325
* always flush the TLB (step 4) even if the previous step failed and the dirty
2326
* bitmap may be corrupt. Regardless of the previous outcome, the KVM logging API
2327
* does not preclude a subsequent dirty log read by user space. Flushing the TLB
2328
* ensures writes will be marked dirty for the next log read.
2329
*
2330
* 1. Take a snapshot of the bit and clear it if needed.
2331
* 2. Write protect the corresponding page.
2332
* 3. Copy the snapshot to the userspace.
2333
* 4. Flush TLB's if needed.
2334
*/
2335
static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2336
struct kvm_dirty_log *log)
2337
{
2338
int r;
2339
2340
mutex_lock(&kvm->slots_lock);
2341
2342
r = kvm_get_dirty_log_protect(kvm, log);
2343
2344
mutex_unlock(&kvm->slots_lock);
2345
return r;
2346
}
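/*
 * Editor's example (illustrative only, not part of the upstream file): how
 * userspace typically consumes KVM_GET_DIRTY_LOG for a slot created with
 * KVM_MEM_LOG_DIRTY_PAGES (reusing vm_fd/region from the sketch after
 * kvm_vm_ioctl_set_memory_region()).  The bitmap holds one bit per page of
 * the memslot; the buffer size below mirrors kvm_dirty_bitmap_bytes().
 *
 *	__u64 npages = region.memory_size / 4096;
 *	unsigned long *bitmap = calloc(1, ((npages + 63) / 64) * 8);
 *	struct kvm_dirty_log dlog = {
 *		.slot = 0,			// (as_id << 16) | id
 *		.dirty_bitmap = bitmap,
 *	};
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &dlog);
 *	// bit N set => guest page (region.guest_phys_addr / 4096) + N was
 *	// written since the previous call (and is write-protected again).
 */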
2347
2348
/**
2349
* kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2350
* and reenable dirty page tracking for the corresponding pages.
2351
* @kvm: pointer to kvm instance
2352
* @log: slot id and address from which to fetch the bitmap of dirty pages
2353
*/
2354
static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2355
struct kvm_clear_dirty_log *log)
2356
{
2357
struct kvm_memslots *slots;
2358
struct kvm_memory_slot *memslot;
2359
int as_id, id;
2360
gfn_t offset;
2361
unsigned long i, n;
2362
unsigned long *dirty_bitmap;
2363
unsigned long *dirty_bitmap_buffer;
2364
bool flush;
2365
2366
/* Dirty ring tracking may be exclusive to dirty log tracking */
2367
if (!kvm_use_dirty_bitmap(kvm))
2368
return -ENXIO;
2369
2370
as_id = log->slot >> 16;
2371
id = (u16)log->slot;
2372
if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2373
return -EINVAL;
2374
2375
if (log->first_page & 63)
2376
return -EINVAL;
2377
2378
slots = __kvm_memslots(kvm, as_id);
2379
memslot = id_to_memslot(slots, id);
2380
if (!memslot || !memslot->dirty_bitmap)
2381
return -ENOENT;
2382
2383
dirty_bitmap = memslot->dirty_bitmap;
2384
2385
n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2386
2387
if (log->first_page > memslot->npages ||
2388
log->num_pages > memslot->npages - log->first_page ||
2389
(log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2390
return -EINVAL;
2391
2392
kvm_arch_sync_dirty_log(kvm, memslot);
2393
2394
flush = false;
2395
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2396
if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2397
return -EFAULT;
2398
2399
KVM_MMU_LOCK(kvm);
2400
for (offset = log->first_page, i = offset / BITS_PER_LONG,
2401
n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2402
i++, offset += BITS_PER_LONG) {
2403
unsigned long mask = *dirty_bitmap_buffer++;
2404
atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2405
if (!mask)
2406
continue;
2407
2408
mask &= atomic_long_fetch_andnot(mask, p);
2409
2410
/*
2411
* mask contains the bits that really have been cleared. This
2412
* never includes any bits beyond the length of the memslot (if
2413
* the length is not aligned to 64 pages), therefore it is not
2414
* a problem if userspace sets them in log->dirty_bitmap.
2415
*/
2416
if (mask) {
2417
flush = true;
2418
kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2419
offset, mask);
2420
}
2421
}
2422
KVM_MMU_UNLOCK(kvm);
2423
2424
if (flush)
2425
kvm_flush_remote_tlbs_memslot(kvm, memslot);
2426
2427
return 0;
2428
}
2429
2430
static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2431
struct kvm_clear_dirty_log *log)
2432
{
2433
int r;
2434
2435
mutex_lock(&kvm->slots_lock);
2436
2437
r = kvm_clear_dirty_log_protect(kvm, log);
2438
2439
mutex_unlock(&kvm->slots_lock);
2440
return r;
2441
}
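/*
 * Editor's example (illustrative only, not part of the upstream file): the
 * manual-protect flow, continuing the previous sketch.  After enabling
 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, KVM_GET_DIRTY_LOG only snapshots the
 * bitmap and userspace re-arms write protection for the ranges it has
 * processed via KVM_CLEAR_DIRTY_LOG.  As checked above, first_page must be
 * 64-aligned and num_pages a multiple of 64 unless the range runs to the end
 * of the memslot.
 *
 *	struct kvm_clear_dirty_log clear = {
 *		.slot = 0,			// (as_id << 16) | id
 *		.first_page = 0,		// must be a multiple of 64
 *		.num_pages = npages,
 *		.dirty_bitmap = bitmap,		// bits to clear, from GET_DIRTY_LOG
 *	};
 *	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
 */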
2442
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2443
2444
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
2445
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
2446
{
2447
if (!kvm || kvm_arch_has_private_mem(kvm))
2448
return KVM_MEMORY_ATTRIBUTE_PRIVATE;
2449
2450
return 0;
2451
}
2452
2453
/*
2454
* Returns true if _all_ gfns in the range [@start, @end) have attributes
2455
* such that the bits in @mask match @attrs.
2456
*/
2457
bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2458
unsigned long mask, unsigned long attrs)
2459
{
2460
XA_STATE(xas, &kvm->mem_attr_array, start);
2461
unsigned long index;
2462
void *entry;
2463
2464
mask &= kvm_supported_mem_attributes(kvm);
2465
if (attrs & ~mask)
2466
return false;
2467
2468
if (end == start + 1)
2469
return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
2470
2471
guard(rcu)();
2472
if (!attrs)
2473
return !xas_find(&xas, end - 1);
2474
2475
for (index = start; index < end; index++) {
2476
do {
2477
entry = xas_next(&xas);
2478
} while (xas_retry(&xas, entry));
2479
2480
if (xas.xa_index != index ||
2481
(xa_to_value(entry) & mask) != attrs)
2482
return false;
2483
}
2484
2485
return true;
2486
}
2487
2488
static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
2489
struct kvm_mmu_notifier_range *range)
2490
{
2491
struct kvm_gfn_range gfn_range;
2492
struct kvm_memory_slot *slot;
2493
struct kvm_memslots *slots;
2494
struct kvm_memslot_iter iter;
2495
bool found_memslot = false;
2496
bool ret = false;
2497
int i;
2498
2499
gfn_range.arg = range->arg;
2500
gfn_range.may_block = range->may_block;
2501
2502
/*
2503
* If/when KVM supports more attributes beyond private vs. shared, this
2504
* _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target
2505
* range already has the desired private vs. shared state (it's unclear
2506
* if that is a net win). For now, KVM reaches this point if and only
2507
* if the private flag is being toggled, i.e. all mappings are in play.
2508
*/
2509
2510
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
2511
slots = __kvm_memslots(kvm, i);
2512
2513
kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
2514
slot = iter.slot;
2515
gfn_range.slot = slot;
2516
2517
gfn_range.start = max(range->start, slot->base_gfn);
2518
gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
2519
if (gfn_range.start >= gfn_range.end)
2520
continue;
2521
2522
if (!found_memslot) {
2523
found_memslot = true;
2524
KVM_MMU_LOCK(kvm);
2525
if (!IS_KVM_NULL_FN(range->on_lock))
2526
range->on_lock(kvm);
2527
}
2528
2529
ret |= range->handler(kvm, &gfn_range);
2530
}
2531
}
2532
2533
if (range->flush_on_ret && ret)
2534
kvm_flush_remote_tlbs(kvm);
2535
2536
if (found_memslot)
2537
KVM_MMU_UNLOCK(kvm);
2538
}
2539
2540
static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
2541
struct kvm_gfn_range *range)
2542
{
2543
/*
2544
* Unconditionally add the range to the invalidation set, regardless of
2545
* whether or not the arch callback actually needs to zap SPTEs. E.g.
2546
* if KVM supports RWX attributes in the future and the attributes are
2547
* going from R=>RW, zapping isn't strictly necessary. Unconditionally
2548
* adding the range allows KVM to require that MMU invalidations add at
2549
* least one range between begin() and end(), e.g. allows KVM to detect
2550
* bugs where the add() is missed. Relaxing the rule *might* be safe,
2551
* but it's not obvious that allowing new mappings while the attributes
2552
* are in flux is desirable or worth the complexity.
2553
*/
2554
kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
2555
2556
return kvm_arch_pre_set_memory_attributes(kvm, range);
2557
}
2558
2559
/* Set @attributes for the gfn range [@start, @end). */
2560
static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2561
unsigned long attributes)
2562
{
2563
struct kvm_mmu_notifier_range pre_set_range = {
2564
.start = start,
2565
.end = end,
2566
.arg.attributes = attributes,
2567
.handler = kvm_pre_set_memory_attributes,
2568
.on_lock = kvm_mmu_invalidate_begin,
2569
.flush_on_ret = true,
2570
.may_block = true,
2571
};
2572
struct kvm_mmu_notifier_range post_set_range = {
2573
.start = start,
2574
.end = end,
2575
.arg.attributes = attributes,
2576
.handler = kvm_arch_post_set_memory_attributes,
2577
.on_lock = kvm_mmu_invalidate_end,
2578
.may_block = true,
2579
};
2580
unsigned long i;
2581
void *entry;
2582
int r = 0;
2583
2584
entry = attributes ? xa_mk_value(attributes) : NULL;
2585
2586
trace_kvm_vm_set_mem_attributes(start, end, attributes);
2587
2588
mutex_lock(&kvm->slots_lock);
2589
2590
/* Nothing to do if the entire range has the desired attributes. */
2591
if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
2592
goto out_unlock;
2593
2594
/*
2595
* Reserve memory ahead of time to avoid having to deal with failures
2596
* partway through setting the new attributes.
2597
*/
2598
for (i = start; i < end; i++) {
2599
r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
2600
if (r)
2601
goto out_unlock;
2602
2603
cond_resched();
2604
}
2605
2606
kvm_handle_gfn_range(kvm, &pre_set_range);
2607
2608
for (i = start; i < end; i++) {
2609
r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
2610
GFP_KERNEL_ACCOUNT));
2611
KVM_BUG_ON(r, kvm);
2612
cond_resched();
2613
}
2614
2615
kvm_handle_gfn_range(kvm, &post_set_range);
2616
2617
out_unlock:
2618
mutex_unlock(&kvm->slots_lock);
2619
2620
return r;
2621
}
2622
static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
2623
struct kvm_memory_attributes *attrs)
2624
{
2625
gfn_t start, end;
2626
2627
/* flags is currently not used. */
2628
if (attrs->flags)
2629
return -EINVAL;
2630
if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
2631
return -EINVAL;
2632
if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
2633
return -EINVAL;
2634
if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
2635
return -EINVAL;
2636
2637
start = attrs->address >> PAGE_SHIFT;
2638
end = (attrs->address + attrs->size) >> PAGE_SHIFT;
2639
2640
/*
2641
* xarray tracks data using "unsigned long", and as a result so does
2642
* KVM. For simplicity, generic attributes are supported only on 64-bit
2643
* architectures.
2644
*/
2645
BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
2646
2647
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
2648
}
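/*
 * Editor's example (illustrative only, not part of the upstream file): the
 * ioctl above is how userspace (e.g. a guest_memfd-based VMM, with vm_fd as in
 * the earlier sketches) flips a GPA range between shared and private.  Address
 * and size must be page aligned; the range below is a placeholder.
 *
 *	struct kvm_memory_attributes attr = {
 *		.address    = 0x100000,
 *		.size       = 2 << 20,
 *		.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,	// 0 to make shared
 *		.flags      = 0,				// must be zero
 *	};
 *	ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attr);
 */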
2649
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
2650
2651
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2652
{
2653
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2654
}
2655
EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_memslot);
2656
2657
struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2658
{
2659
struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2660
u64 gen = slots->generation;
2661
struct kvm_memory_slot *slot;
2662
2663
/*
2664
* This also protects against using a memslot from a different address space,
2665
* since different address spaces have different generation numbers.
2666
*/
2667
if (unlikely(gen != vcpu->last_used_slot_gen)) {
2668
vcpu->last_used_slot = NULL;
2669
vcpu->last_used_slot_gen = gen;
2670
}
2671
2672
slot = try_get_memslot(vcpu->last_used_slot, gfn);
2673
if (slot)
2674
return slot;
2675
2676
/*
2677
* Fall back to searching all memslots. We purposely use
2678
* search_memslots() instead of __gfn_to_memslot() to avoid
2679
* thrashing the VM-wide last_used_slot in kvm_memslots.
2680
*/
2681
slot = search_memslots(slots, gfn, false);
2682
if (slot) {
2683
vcpu->last_used_slot = slot;
2684
return slot;
2685
}
2686
2687
return NULL;
2688
}
2689
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_memslot);
2690
2691
bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2692
{
2693
struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2694
2695
return kvm_is_visible_memslot(memslot);
2696
}
2697
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_visible_gfn);
2698
2699
bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2700
{
2701
struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2702
2703
return kvm_is_visible_memslot(memslot);
2704
}
2705
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_visible_gfn);
2706
2707
unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2708
{
2709
struct vm_area_struct *vma;
2710
unsigned long addr, size;
2711
2712
size = PAGE_SIZE;
2713
2714
addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2715
if (kvm_is_error_hva(addr))
2716
return PAGE_SIZE;
2717
2718
mmap_read_lock(current->mm);
2719
vma = find_vma(current->mm, addr);
2720
if (!vma)
2721
goto out;
2722
2723
size = vma_kernel_pagesize(vma);
2724
2725
out:
2726
mmap_read_unlock(current->mm);
2727
2728
return size;
2729
}
2730
2731
static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2732
{
2733
return slot->flags & KVM_MEM_READONLY;
2734
}
2735
2736
static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2737
gfn_t *nr_pages, bool write)
2738
{
2739
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2740
return KVM_HVA_ERR_BAD;
2741
2742
if (memslot_is_readonly(slot) && write)
2743
return KVM_HVA_ERR_RO_BAD;
2744
2745
if (nr_pages)
2746
*nr_pages = slot->npages - (gfn - slot->base_gfn);
2747
2748
return __gfn_to_hva_memslot(slot, gfn);
2749
}
2750
2751
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2752
gfn_t *nr_pages)
2753
{
2754
return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2755
}
2756
2757
unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2758
gfn_t gfn)
2759
{
2760
return gfn_to_hva_many(slot, gfn, NULL);
2761
}
2762
EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva_memslot);
2763
2764
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2765
{
2766
return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2767
}
2768
EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva);
2769
2770
unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2771
{
2772
return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2773
}
2774
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_hva);
2775
2776
/*
2777
* Return the hva of a @gfn and the R/W attribute if possible.
2778
*
2779
* @slot: the kvm_memory_slot which contains @gfn
2780
* @gfn: the gfn to be translated
2781
* @writable: used to return the read/write attribute of the @slot if the hva
2782
* is valid and @writable is not NULL
2783
*/
2784
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2785
gfn_t gfn, bool *writable)
2786
{
2787
unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2788
2789
if (!kvm_is_error_hva(hva) && writable)
2790
*writable = !memslot_is_readonly(slot);
2791
2792
return hva;
2793
}
2794
2795
unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2796
{
2797
struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2798
2799
return gfn_to_hva_memslot_prot(slot, gfn, writable);
2800
}
2801
2802
unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2803
{
2804
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2805
2806
return gfn_to_hva_memslot_prot(slot, gfn, writable);
2807
}
2808
2809
static bool kvm_is_ad_tracked_page(struct page *page)
2810
{
2811
/*
2812
* Per page-flags.h, pages tagged PG_reserved "should in general not be
2813
* touched (e.g. set dirty) except by its owner".
2814
*/
2815
return !PageReserved(page);
2816
}
2817
2818
static void kvm_set_page_dirty(struct page *page)
2819
{
2820
if (kvm_is_ad_tracked_page(page))
2821
SetPageDirty(page);
2822
}
2823
2824
static void kvm_set_page_accessed(struct page *page)
2825
{
2826
if (kvm_is_ad_tracked_page(page))
2827
mark_page_accessed(page);
2828
}
2829
2830
void kvm_release_page_clean(struct page *page)
2831
{
2832
if (!page)
2833
return;
2834
2835
kvm_set_page_accessed(page);
2836
put_page(page);
2837
}
2838
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_clean);
2839
2840
void kvm_release_page_dirty(struct page *page)
2841
{
2842
if (!page)
2843
return;
2844
2845
kvm_set_page_dirty(page);
2846
kvm_release_page_clean(page);
2847
}
2848
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_dirty);
2849
2850
static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
2851
struct follow_pfnmap_args *map, bool writable)
2852
{
2853
kvm_pfn_t pfn;
2854
2855
WARN_ON_ONCE(!!page == !!map);
2856
2857
if (kfp->map_writable)
2858
*kfp->map_writable = writable;
2859
2860
if (map)
2861
pfn = map->pfn;
2862
else
2863
pfn = page_to_pfn(page);
2864
2865
*kfp->refcounted_page = page;
2866
2867
return pfn;
2868
}
2869
2870
/*
2871
* The fast path to get the writable pfn, which will be stored in @pfn;
2872
* true indicates success, otherwise false is returned.
2873
*/
2874
static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
2875
{
2876
struct page *page;
2877
bool r;
2878
2879
/*
2880
* Try the fast-only path when the caller wants to pin/get the page for
2881
* writing. If the caller only wants to read the page, KVM must go
2882
* down the full, slow path in order to avoid racing an operation that
2883
* breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing
2884
* at the old, read-only page while mm/ points at a new, writable page.
2885
*/
2886
if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable))
2887
return false;
2888
2889
if (kfp->pin)
2890
r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1;
2891
else
2892
r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page);
2893
2894
if (r) {
2895
*pfn = kvm_resolve_pfn(kfp, page, NULL, true);
2896
return true;
2897
}
2898
2899
return false;
2900
}
2901
2902
/*
2903
* The slow path to get the pfn of the specified host virtual address;
2904
* 1 indicates success, -errno is returned if error is detected.
2905
*/
2906
static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
2907
{
2908
/*
2909
* When a VCPU accesses a page that is not mapped into the secondary
2910
* MMU, we lookup the page using GUP to map it, so the guest VCPU can
2911
* make progress. We always want to honor NUMA hinting faults in that
2912
* case, because GUP usage corresponds to memory accesses from the VCPU.
2913
* Otherwise, we'd not trigger NUMA hinting faults once a page is
2914
* mapped into the secondary MMU and gets accessed by a VCPU.
2915
*
2916
* Note that get_user_page_fast_only() and FOLL_WRITE for now
2917
* implicitly honor NUMA hinting faults and don't need this flag.
2918
*/
2919
unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags;
2920
struct page *page, *wpage;
2921
int npages;
2922
2923
if (kfp->pin)
2924
npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags);
2925
else
2926
npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
2927
if (npages != 1)
2928
return npages;
2929
2930
/*
2931
* Pinning is mutually exclusive with opportunistically mapping a read
2932
* fault as writable, as KVM should never pin pages when mapping memory
2933
* into the guest (pinning is only for direct accesses from KVM).
2934
*/
2935
if (WARN_ON_ONCE(kfp->map_writable && kfp->pin))
2936
goto out;
2937
2938
/* map read fault as writable if possible */
2939
if (!(flags & FOLL_WRITE) && kfp->map_writable &&
2940
get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
2941
put_page(page);
2942
page = wpage;
2943
flags |= FOLL_WRITE;
2944
}
2945
2946
out:
2947
*pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
2948
return npages;
2949
}
2950
2951
static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2952
{
2953
if (unlikely(!(vma->vm_flags & VM_READ)))
2954
return false;
2955
2956
if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2957
return false;
2958
2959
return true;
2960
}
2961
2962
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2963
struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn)
2964
{
2965
struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva };
2966
bool write_fault = kfp->flags & FOLL_WRITE;
2967
int r;
2968
2969
/*
2970
* Remapped memory cannot be pinned in any meaningful sense. Bail if
2971
* the caller wants to pin the page, i.e. access the page outside of
2972
* MMU notifier protection, and unsafe mappings are disallowed.
2973
*/
2974
if (kfp->pin && !allow_unsafe_mappings)
2975
return -EINVAL;
2976
2977
r = follow_pfnmap_start(&args);
2978
if (r) {
2979
/*
2980
* get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2981
* not call the fault handler, so do it here.
2982
*/
2983
bool unlocked = false;
2984
r = fixup_user_fault(current->mm, kfp->hva,
2985
(write_fault ? FAULT_FLAG_WRITE : 0),
2986
&unlocked);
2987
if (unlocked)
2988
return -EAGAIN;
2989
if (r)
2990
return r;
2991
2992
r = follow_pfnmap_start(&args);
2993
if (r)
2994
return r;
2995
}
2996
2997
if (write_fault && !args.writable) {
2998
*p_pfn = KVM_PFN_ERR_RO_FAULT;
2999
goto out;
3000
}
3001
3002
*p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable);
3003
out:
3004
follow_pfnmap_end(&args);
3005
return r;
3006
}
3007
3008
kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp)
3009
{
3010
struct vm_area_struct *vma;
3011
kvm_pfn_t pfn;
3012
int npages, r;
3013
3014
might_sleep();
3015
3016
if (WARN_ON_ONCE(!kfp->refcounted_page))
3017
return KVM_PFN_ERR_FAULT;
3018
3019
if (hva_to_pfn_fast(kfp, &pfn))
3020
return pfn;
3021
3022
npages = hva_to_pfn_slow(kfp, &pfn);
3023
if (npages == 1)
3024
return pfn;
3025
if (npages == -EINTR || npages == -EAGAIN)
3026
return KVM_PFN_ERR_SIGPENDING;
3027
if (npages == -EHWPOISON)
3028
return KVM_PFN_ERR_HWPOISON;
3029
3030
mmap_read_lock(current->mm);
3031
retry:
3032
vma = vma_lookup(current->mm, kfp->hva);
3033
3034
if (vma == NULL)
3035
pfn = KVM_PFN_ERR_FAULT;
3036
else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
3037
r = hva_to_pfn_remapped(vma, kfp, &pfn);
3038
if (r == -EAGAIN)
3039
goto retry;
3040
if (r < 0)
3041
pfn = KVM_PFN_ERR_FAULT;
3042
} else {
3043
if ((kfp->flags & FOLL_NOWAIT) &&
3044
vma_is_valid(vma, kfp->flags & FOLL_WRITE))
3045
pfn = KVM_PFN_ERR_NEEDS_IO;
3046
else
3047
pfn = KVM_PFN_ERR_FAULT;
3048
}
3049
mmap_read_unlock(current->mm);
3050
return pfn;
3051
}
3052
3053
static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp)
3054
{
3055
kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL,
3056
kfp->flags & FOLL_WRITE);
3057
3058
if (kfp->hva == KVM_HVA_ERR_RO_BAD)
3059
return KVM_PFN_ERR_RO_FAULT;
3060
3061
if (kvm_is_error_hva(kfp->hva))
3062
return KVM_PFN_NOSLOT;
3063
3064
if (memslot_is_readonly(kfp->slot) && kfp->map_writable) {
3065
*kfp->map_writable = false;
3066
kfp->map_writable = NULL;
3067
}
3068
3069
return hva_to_pfn(kfp);
3070
}
3071
3072
kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
3073
unsigned int foll, bool *writable,
3074
struct page **refcounted_page)
3075
{
3076
struct kvm_follow_pfn kfp = {
3077
.slot = slot,
3078
.gfn = gfn,
3079
.flags = foll,
3080
.map_writable = writable,
3081
.refcounted_page = refcounted_page,
3082
};
3083
3084
if (WARN_ON_ONCE(!writable || !refcounted_page))
3085
return KVM_PFN_ERR_FAULT;
3086
3087
*writable = false;
3088
*refcounted_page = NULL;
3089
3090
return kvm_follow_pfn(&kfp);
3091
}
3092
EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_faultin_pfn);
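/*
 * Editor's sketch (illustrative only, not part of the upstream file): roughly
 * how an architecture fault path consumes __kvm_faultin_pfn().  The memslot
 * lookup, the FOLL flags and the "install translation" step are placeholders;
 * real callers hold the relevant MMU locks and only release the page once the
 * new mapping is visible (or abandoned).
 */
static __maybe_unused kvm_pfn_t example_faultin_pfn(struct kvm_vcpu *vcpu,
						    gfn_t gfn, bool write)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	struct page *refcounted_page;
	bool writable;
	kvm_pfn_t pfn;

	pfn = __kvm_faultin_pfn(slot, gfn, write ? FOLL_WRITE : 0,
				&writable, &refcounted_page);
	if (is_error_noslot_pfn(pfn))
		return pfn;

	/* ... map pfn into the stage-2 / shadow page tables here ... */

	/* Drop the reference taken by the fault-in; dirty the page if written. */
	if (write)
		kvm_release_page_dirty(refcounted_page);
	else
		kvm_release_page_clean(refcounted_page);

	return pfn;
}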
3093
3094
int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
3095
struct page **pages, int nr_pages)
3096
{
3097
unsigned long addr;
3098
gfn_t entry = 0;
3099
3100
addr = gfn_to_hva_many(slot, gfn, &entry);
3101
if (kvm_is_error_hva(addr))
3102
return -1;
3103
3104
if (entry < nr_pages)
3105
return 0;
3106
3107
return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
3108
}
3109
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prefetch_pages);
3110
3111
/*
3112
* Don't use this API unless you are absolutely, positively certain that KVM
3113
* needs to get a struct page, e.g. to pin the page for firmware DMA.
3114
*
3115
* FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate
3116
* its refcount.
3117
*/
3118
struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write)
3119
{
3120
struct page *refcounted_page = NULL;
3121
struct kvm_follow_pfn kfp = {
3122
.slot = gfn_to_memslot(kvm, gfn),
3123
.gfn = gfn,
3124
.flags = write ? FOLL_WRITE : 0,
3125
.refcounted_page = &refcounted_page,
3126
};
3127
3128
(void)kvm_follow_pfn(&kfp);
3129
return refcounted_page;
3130
}
3131
EXPORT_SYMBOL_FOR_KVM_INTERNAL(__gfn_to_page);
3132
3133
int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
3134
bool writable)
3135
{
3136
struct kvm_follow_pfn kfp = {
3137
.slot = gfn_to_memslot(vcpu->kvm, gfn),
3138
.gfn = gfn,
3139
.flags = writable ? FOLL_WRITE : 0,
3140
.refcounted_page = &map->pinned_page,
3141
.pin = true,
3142
};
3143
3144
map->pinned_page = NULL;
3145
map->page = NULL;
3146
map->hva = NULL;
3147
map->gfn = gfn;
3148
map->writable = writable;
3149
3150
map->pfn = kvm_follow_pfn(&kfp);
3151
if (is_error_noslot_pfn(map->pfn))
3152
return -EINVAL;
3153
3154
if (pfn_valid(map->pfn)) {
3155
map->page = pfn_to_page(map->pfn);
3156
map->hva = kmap(map->page);
3157
#ifdef CONFIG_HAS_IOMEM
3158
} else {
3159
map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB);
3160
#endif
3161
}
3162
3163
return map->hva ? 0 : -EFAULT;
3164
}
3165
EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_map);
3166
3167
void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
3168
{
3169
if (!map->hva)
3170
return;
3171
3172
if (map->page)
3173
kunmap(map->page);
3174
#ifdef CONFIG_HAS_IOMEM
3175
else
3176
memunmap(map->hva);
3177
#endif
3178
3179
if (map->writable)
3180
kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
3181
3182
if (map->pinned_page) {
3183
if (map->writable)
3184
kvm_set_page_dirty(map->pinned_page);
3185
kvm_set_page_accessed(map->pinned_page);
3186
unpin_user_page(map->pinned_page);
3187
}
3188
3189
map->hva = NULL;
3190
map->page = NULL;
3191
map->pinned_page = NULL;
3192
}
3193
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_unmap);
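/*
 * Editor's sketch (illustrative only, not part of the upstream file): the
 * pin-map-access-unmap pattern built on __kvm_vcpu_map()/kvm_vcpu_unmap().
 * The gfn and the byte written are arbitrary placeholders; kvm_vcpu_unmap()
 * also marks the page dirty/accessed and unpins it.
 */
static __maybe_unused int example_poke_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_host_map map;
	int r;

	r = __kvm_vcpu_map(vcpu, gfn, &map, true);
	if (r)
		return r;

	/* map.hva stays valid (and the backing page pinned) until unmap. */
	*(u8 *)map.hva = 0xff;

	kvm_vcpu_unmap(vcpu, &map);
	return 0;
}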
3194
3195
static int next_segment(unsigned long len, int offset)
3196
{
3197
if (len > PAGE_SIZE - offset)
3198
return PAGE_SIZE - offset;
3199
else
3200
return len;
3201
}
3202
3203
/* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
3204
static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3205
void *data, int offset, int len)
3206
{
3207
int r;
3208
unsigned long addr;
3209
3210
if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3211
return -EFAULT;
3212
3213
addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3214
if (kvm_is_error_hva(addr))
3215
return -EFAULT;
3216
r = __copy_from_user(data, (void __user *)addr + offset, len);
3217
if (r)
3218
return -EFAULT;
3219
return 0;
3220
}
3221
3222
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3223
int len)
3224
{
3225
struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3226
3227
return __kvm_read_guest_page(slot, gfn, data, offset, len);
3228
}
3229
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_page);
3230
3231
int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3232
int offset, int len)
3233
{
3234
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3235
3236
return __kvm_read_guest_page(slot, gfn, data, offset, len);
3237
}
3238
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_page);
3239
3240
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3241
{
3242
gfn_t gfn = gpa >> PAGE_SHIFT;
3243
int seg;
3244
int offset = offset_in_page(gpa);
3245
int ret;
3246
3247
while ((seg = next_segment(len, offset)) != 0) {
3248
ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3249
if (ret < 0)
3250
return ret;
3251
offset = 0;
3252
len -= seg;
3253
data += seg;
3254
++gfn;
3255
}
3256
return 0;
3257
}
3258
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest);
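/*
 * Editor's sketch (illustrative only, not part of the upstream file):
 * kvm_read_guest() takes a gpa and splits the copy at page boundaries via
 * __kvm_read_guest_page(), so a caller can read a structure that straddles
 * two guest pages in a single call.  The gpa and layout are placeholders.
 */
static __maybe_unused int example_read_guest_desc(struct kvm *kvm, gpa_t gpa,
						  u64 *lo, u64 *hi)
{
	struct { u64 lo, hi; } desc;

	if (kvm_read_guest(kvm, gpa, &desc, sizeof(desc)))
		return -EFAULT;

	*lo = desc.lo;
	*hi = desc.hi;
	return 0;
}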
3259
3260
int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3261
{
3262
gfn_t gfn = gpa >> PAGE_SHIFT;
3263
int seg;
3264
int offset = offset_in_page(gpa);
3265
int ret;
3266
3267
while ((seg = next_segment(len, offset)) != 0) {
3268
ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3269
if (ret < 0)
3270
return ret;
3271
offset = 0;
3272
len -= seg;
3273
data += seg;
3274
++gfn;
3275
}
3276
return 0;
3277
}
3278
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest);
3279
3280
static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3281
void *data, int offset, unsigned long len)
3282
{
3283
int r;
3284
unsigned long addr;
3285
3286
if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3287
return -EFAULT;
3288
3289
addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3290
if (kvm_is_error_hva(addr))
3291
return -EFAULT;
3292
pagefault_disable();
3293
r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3294
pagefault_enable();
3295
if (r)
3296
return -EFAULT;
3297
return 0;
3298
}
3299
3300
int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3301
void *data, unsigned long len)
3302
{
3303
gfn_t gfn = gpa >> PAGE_SHIFT;
3304
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3305
int offset = offset_in_page(gpa);
3306
3307
return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3308
}
3309
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_atomic);
3310
3311
/* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
3312
static int __kvm_write_guest_page(struct kvm *kvm,
3313
struct kvm_memory_slot *memslot, gfn_t gfn,
3314
const void *data, int offset, int len)
3315
{
3316
int r;
3317
unsigned long addr;
3318
3319
if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3320
return -EFAULT;
3321
3322
addr = gfn_to_hva_memslot(memslot, gfn);
3323
if (kvm_is_error_hva(addr))
3324
return -EFAULT;
3325
r = __copy_to_user((void __user *)addr + offset, data, len);
3326
if (r)
3327
return -EFAULT;
3328
mark_page_dirty_in_slot(kvm, memslot, gfn);
3329
return 0;
3330
}
3331
3332
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3333
const void *data, int offset, int len)
3334
{
3335
struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3336
3337
return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3338
}
3339
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_page);
3340
3341
int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3342
const void *data, int offset, int len)
3343
{
3344
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3345
3346
return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3347
}
3348
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest_page);
3349
3350
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3351
unsigned long len)
3352
{
3353
gfn_t gfn = gpa >> PAGE_SHIFT;
3354
int seg;
3355
int offset = offset_in_page(gpa);
3356
int ret;
3357
3358
while ((seg = next_segment(len, offset)) != 0) {
3359
ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3360
if (ret < 0)
3361
return ret;
3362
offset = 0;
3363
len -= seg;
3364
data += seg;
3365
++gfn;
3366
}
3367
return 0;
3368
}
3369
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest);
3370
3371
int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3372
unsigned long len)
3373
{
3374
gfn_t gfn = gpa >> PAGE_SHIFT;
3375
int seg;
3376
int offset = offset_in_page(gpa);
3377
int ret;
3378
3379
while ((seg = next_segment(len, offset)) != 0) {
3380
ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3381
if (ret < 0)
3382
return ret;
3383
offset = 0;
3384
len -= seg;
3385
data += seg;
3386
++gfn;
3387
}
3388
return 0;
3389
}
3390
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest);
3391
3392
static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3393
struct gfn_to_hva_cache *ghc,
3394
gpa_t gpa, unsigned long len)
3395
{
3396
int offset = offset_in_page(gpa);
3397
gfn_t start_gfn = gpa >> PAGE_SHIFT;
3398
gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3399
gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3400
gfn_t nr_pages_avail;
3401
3402
/* Update ghc->generation before performing any error checks. */
3403
ghc->generation = slots->generation;
3404
3405
if (start_gfn > end_gfn) {
3406
ghc->hva = KVM_HVA_ERR_BAD;
3407
return -EINVAL;
3408
}
3409
3410
/*
3411
* If the requested region crosses two memslots, we still
3412
* verify that the entire region is valid here.
3413
*/
3414
for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3415
ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3416
ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3417
&nr_pages_avail);
3418
if (kvm_is_error_hva(ghc->hva))
3419
return -EFAULT;
3420
}
3421
3422
/* Use the slow path for cross page reads and writes. */
3423
if (nr_pages_needed == 1)
3424
ghc->hva += offset;
3425
else
3426
ghc->memslot = NULL;
3427
3428
ghc->gpa = gpa;
3429
ghc->len = len;
3430
return 0;
3431
}
3432
3433
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3434
gpa_t gpa, unsigned long len)
3435
{
3436
struct kvm_memslots *slots = kvm_memslots(kvm);
3437
return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3438
}
3439
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gfn_to_hva_cache_init);
3440
3441
int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3442
void *data, unsigned int offset,
3443
unsigned long len)
3444
{
3445
struct kvm_memslots *slots = kvm_memslots(kvm);
3446
int r;
3447
gpa_t gpa = ghc->gpa + offset;
3448
3449
if (WARN_ON_ONCE(len + offset > ghc->len))
3450
return -EINVAL;
3451
3452
if (slots->generation != ghc->generation) {
3453
if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3454
return -EFAULT;
3455
}
3456
3457
if (kvm_is_error_hva(ghc->hva))
3458
return -EFAULT;
3459
3460
if (unlikely(!ghc->memslot))
3461
return kvm_write_guest(kvm, gpa, data, len);
3462
3463
r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3464
if (r)
3465
return -EFAULT;
3466
mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3467
3468
return 0;
3469
}
3470
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_offset_cached);
3471
3472
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3473
void *data, unsigned long len)
3474
{
3475
return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3476
}
3477
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_cached);
3478
3479
int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3480
void *data, unsigned int offset,
3481
unsigned long len)
3482
{
3483
struct kvm_memslots *slots = kvm_memslots(kvm);
3484
int r;
3485
gpa_t gpa = ghc->gpa + offset;
3486
3487
if (WARN_ON_ONCE(len + offset > ghc->len))
3488
return -EINVAL;
3489
3490
if (slots->generation != ghc->generation) {
3491
if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3492
return -EFAULT;
3493
}
3494
3495
if (kvm_is_error_hva(ghc->hva))
3496
return -EFAULT;
3497
3498
if (unlikely(!ghc->memslot))
3499
return kvm_read_guest(kvm, gpa, data, len);
3500
3501
r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3502
if (r)
3503
return -EFAULT;
3504
3505
return 0;
3506
}
3507
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_offset_cached);
3508
3509
int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3510
void *data, unsigned long len)
3511
{
3512
return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3513
}
3514
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_cached);
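/*
 * Editor's sketch (illustrative only, not part of the upstream file): a
 * gfn_to_hva_cache is meant to be initialized once, when the guest registers
 * a shared area, and then used for frequent accesses without a memslot walk.
 * The gpa and the value being published are placeholders.
 */
static __maybe_unused int example_publish_counter(struct kvm *kvm, gpa_t gpa,
						  u64 value)
{
	struct gfn_to_hva_cache ghc;

	/* Normally done once and the cache kept in arch per-vCPU/VM state. */
	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(value)))
		return -EFAULT;

	/* Hot path: the memslot generation is revalidated internally. */
	return kvm_write_guest_cached(kvm, &ghc, &value, sizeof(value));
}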
3515
3516
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3517
{
3518
const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3519
gfn_t gfn = gpa >> PAGE_SHIFT;
3520
int seg;
3521
int offset = offset_in_page(gpa);
3522
int ret;
3523
3524
while ((seg = next_segment(len, offset)) != 0) {
3525
ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3526
if (ret < 0)
3527
return ret;
3528
offset = 0;
3529
len -= seg;
3530
++gfn;
3531
}
3532
return 0;
3533
}
3534
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_clear_guest);
3535
3536
void mark_page_dirty_in_slot(struct kvm *kvm,
3537
const struct kvm_memory_slot *memslot,
3538
gfn_t gfn)
3539
{
3540
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3541
3542
#ifdef CONFIG_HAVE_KVM_DIRTY_RING
3543
if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3544
return;
3545
3546
WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3547
#endif
3548
3549
if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3550
unsigned long rel_gfn = gfn - memslot->base_gfn;
3551
u32 slot = (memslot->as_id << 16) | memslot->id;
3552
3553
if (kvm->dirty_ring_size && vcpu)
3554
kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3555
else if (memslot->dirty_bitmap)
3556
set_bit_le(rel_gfn, memslot->dirty_bitmap);
3557
}
3558
}
3559
EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty_in_slot);
3560
3561
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3562
{
3563
struct kvm_memory_slot *memslot;
3564
3565
memslot = gfn_to_memslot(kvm, gfn);
3566
mark_page_dirty_in_slot(kvm, memslot, gfn);
3567
}
3568
EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty);
3569
3570
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3571
{
3572
struct kvm_memory_slot *memslot;
3573
3574
memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3575
mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3576
}
3577
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_mark_page_dirty);
3578
3579
void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3580
{
3581
if (!vcpu->sigset_active)
3582
return;
3583
3584
/*
3585
* This does a lockless modification of ->real_blocked, which is fine
3586
* because only current can change ->real_blocked, and all readers of
3587
* ->real_blocked don't care as long as ->real_blocked is always a subset
3588
* of ->blocked.
3589
*/
3590
sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3591
}
3592
3593
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3594
{
3595
if (!vcpu->sigset_active)
3596
return;
3597
3598
sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3599
sigemptyset(&current->real_blocked);
3600
}
3601
3602
static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3603
{
3604
unsigned int old, val, grow, grow_start;
3605
3606
old = val = vcpu->halt_poll_ns;
3607
grow_start = READ_ONCE(halt_poll_ns_grow_start);
3608
grow = READ_ONCE(halt_poll_ns_grow);
3609
if (!grow)
3610
goto out;
3611
3612
val *= grow;
3613
if (val < grow_start)
3614
val = grow_start;
3615
3616
vcpu->halt_poll_ns = val;
3617
out:
3618
trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3619
}
3620
3621
static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3622
{
3623
unsigned int old, val, shrink, grow_start;
3624
3625
old = val = vcpu->halt_poll_ns;
3626
shrink = READ_ONCE(halt_poll_ns_shrink);
3627
grow_start = READ_ONCE(halt_poll_ns_grow_start);
3628
if (shrink == 0)
3629
val = 0;
3630
else
3631
val /= shrink;
3632
3633
if (val < grow_start)
3634
val = 0;
3635
3636
vcpu->halt_poll_ns = val;
3637
trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3638
}
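/*
 * Editor's note (illustrative only, not part of the upstream file): with the
 * default module parameters (halt_poll_ns_grow = 2, halt_poll_ns_grow_start =
 * 10000, halt_poll_ns_shrink = 2), a vCPU's poll window evolves as:
 *
 *   grow:   0 -> 10000 -> 20000 -> 40000 -> ...  (kvm_vcpu_halt() clamps the
 *           value to the max halt poll time)
 *   shrink: 40000 -> 20000 -> 10000 -> 0  (10000 / 2 = 5000 falls below
 *           grow_start, so the window collapses to 0)
 */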
3639
3640
static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3641
{
3642
int ret = -EINTR;
3643
int idx = srcu_read_lock(&vcpu->kvm->srcu);
3644
3645
if (kvm_arch_vcpu_runnable(vcpu))
3646
goto out;
3647
if (kvm_cpu_has_pending_timer(vcpu))
3648
goto out;
3649
if (signal_pending(current))
3650
goto out;
3651
if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3652
goto out;
3653
3654
ret = 0;
3655
out:
3656
srcu_read_unlock(&vcpu->kvm->srcu, idx);
3657
return ret;
3658
}
3659
3660
/*
3661
* Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3662
* pending. This is mostly used when halting a vCPU, but may also be used
3663
* directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3664
*/
3665
bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3666
{
3667
struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3668
bool waited = false;
3669
3670
vcpu->stat.generic.blocking = 1;
3671
3672
preempt_disable();
3673
kvm_arch_vcpu_blocking(vcpu);
3674
prepare_to_rcuwait(wait);
3675
preempt_enable();
3676
3677
for (;;) {
3678
set_current_state(TASK_INTERRUPTIBLE);
3679
3680
if (kvm_vcpu_check_block(vcpu) < 0)
3681
break;
3682
3683
waited = true;
3684
schedule();
3685
}
3686
3687
preempt_disable();
3688
finish_rcuwait(wait);
3689
kvm_arch_vcpu_unblocking(vcpu);
3690
preempt_enable();
3691
3692
vcpu->stat.generic.blocking = 0;
3693
3694
return waited;
3695
}
3696
3697
static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3698
ktime_t end, bool success)
3699
{
3700
struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3701
u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3702
3703
++vcpu->stat.generic.halt_attempted_poll;
3704
3705
if (success) {
3706
++vcpu->stat.generic.halt_successful_poll;
3707
3708
if (!vcpu_valid_wakeup(vcpu))
3709
++vcpu->stat.generic.halt_poll_invalid;
3710
3711
stats->halt_poll_success_ns += poll_ns;
3712
KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3713
} else {
3714
stats->halt_poll_fail_ns += poll_ns;
3715
KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3716
}
3717
}
3718
3719
static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3720
{
3721
struct kvm *kvm = vcpu->kvm;
3722
3723
if (kvm->override_halt_poll_ns) {
3724
/*
3725
* Ensure kvm->max_halt_poll_ns is not read before
3726
* kvm->override_halt_poll_ns.
3727
*
3728
* Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3729
*/
3730
smp_rmb();
3731
return READ_ONCE(kvm->max_halt_poll_ns);
3732
}
3733
3734
return READ_ONCE(halt_poll_ns);
3735
}
3736
3737
/*
 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc. If halt
 * polling is enabled, busy wait for a short time before blocking to avoid the
 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
 * is halted.
 */
3743
void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3744
{
3745
unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3746
bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3747
ktime_t start, cur, poll_end;
3748
bool waited = false;
3749
bool do_halt_poll;
3750
u64 halt_ns;
3751
3752
if (vcpu->halt_poll_ns > max_halt_poll_ns)
3753
vcpu->halt_poll_ns = max_halt_poll_ns;
3754
3755
do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3756
3757
start = cur = poll_end = ktime_get();
3758
if (do_halt_poll) {
3759
ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3760
3761
do {
3762
if (kvm_vcpu_check_block(vcpu) < 0)
3763
goto out;
3764
cpu_relax();
3765
poll_end = cur = ktime_get();
3766
} while (kvm_vcpu_can_poll(cur, stop));
3767
}
3768
3769
waited = kvm_vcpu_block(vcpu);
3770
3771
cur = ktime_get();
3772
if (waited) {
3773
vcpu->stat.generic.halt_wait_ns +=
3774
ktime_to_ns(cur) - ktime_to_ns(poll_end);
3775
KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3776
ktime_to_ns(cur) - ktime_to_ns(poll_end));
3777
}
3778
out:
3779
/* The total time the vCPU was "halted", including polling time. */
3780
halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3781
3782
/*
 * Note, halt-polling is considered successful so long as the vCPU was
 * never actually scheduled out, i.e. even if the wake event arrived
 * after the halt-polling loop itself exited, but before the full wait.
 */
3787
if (do_halt_poll)
3788
update_halt_poll_stats(vcpu, start, poll_end, !waited);
3789
3790
if (halt_poll_allowed) {
3791
/* Recompute the max halt poll time in case it changed. */
3792
max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3793
3794
if (!vcpu_valid_wakeup(vcpu)) {
3795
shrink_halt_poll_ns(vcpu);
3796
} else if (max_halt_poll_ns) {
3797
if (halt_ns <= vcpu->halt_poll_ns)
3798
;
3799
/* we had a long block, shrink polling */
3800
else if (vcpu->halt_poll_ns &&
3801
halt_ns > max_halt_poll_ns)
3802
shrink_halt_poll_ns(vcpu);
3803
/* we had a short halt and our poll time is too small */
3804
else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3805
halt_ns < max_halt_poll_ns)
3806
grow_halt_poll_ns(vcpu);
3807
} else {
3808
vcpu->halt_poll_ns = 0;
3809
}
3810
}
3811
3812
trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
3813
}
3814
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_halt);
3815
3816
bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3817
{
3818
if (__kvm_vcpu_wake_up(vcpu)) {
3819
WRITE_ONCE(vcpu->ready, true);
3820
++vcpu->stat.generic.halt_wakeup;
3821
return true;
3822
}
3823
3824
return false;
3825
}
3826
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_wake_up);
3827
3828
#ifndef CONFIG_S390
3829
/*
3830
* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3831
*/
3832
void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait)
3833
{
3834
int me, cpu;
3835
3836
if (kvm_vcpu_wake_up(vcpu))
3837
return;
3838
3839
me = get_cpu();
3840
/*
3841
* The only state change done outside the vcpu mutex is IN_GUEST_MODE
3842
* to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3843
* kick" check does not need atomic operations if kvm_vcpu_kick is used
3844
* within the vCPU thread itself.
3845
*/
3846
if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3847
if (vcpu->mode == IN_GUEST_MODE)
3848
WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3849
goto out;
3850
}
3851
3852
/*
3853
* Note, the vCPU could get migrated to a different pCPU at any point
3854
* after kvm_arch_vcpu_should_kick(), which could result in sending an
3855
* IPI to the previous pCPU. But, that's ok because the purpose of the
3856
* IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3857
* vCPU also requires it to leave IN_GUEST_MODE.
3858
*/
3859
if (kvm_arch_vcpu_should_kick(vcpu)) {
3860
cpu = READ_ONCE(vcpu->cpu);
3861
if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) {
3862
/*
3863
* Use a reschedule IPI to kick the vCPU if the caller
3864
* doesn't need to wait for a response, as KVM allows
3865
* kicking vCPUs while IRQs are disabled, but using the
3866
* SMP function call framework with IRQs disabled can
3867
* deadlock due to taking cross-CPU locks.
3868
*/
3869
if (wait)
3870
smp_call_function_single(cpu, ack_kick, NULL, wait);
3871
else
3872
smp_send_reschedule(cpu);
3873
}
3874
}
3875
out:
3876
put_cpu();
3877
}
3878
EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_kick);
3879
#endif /* !CONFIG_S390 */
3880
3881
int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3882
{
3883
struct task_struct *task = NULL;
3884
int ret;
3885
3886
if (!read_trylock(&target->pid_lock))
3887
return 0;
3888
3889
if (target->pid)
3890
task = get_pid_task(target->pid, PIDTYPE_PID);
3891
3892
read_unlock(&target->pid_lock);
3893
3894
if (!task)
3895
return 0;
3896
ret = yield_to(task, 1);
3897
put_task_struct(task);
3898
3899
return ret;
3900
}
3901
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_yield_to);
3902
3903
/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * The most eligible candidate to yield to is decided by the following
 * heuristics:
 *
 * (a) A VCPU which has not done a PLE exit or CPU-relax intercept recently
 * (i.e. a likely preempted lock holder), indicated by @in_spin_loop.
 * Set at the beginning and cleared at the end of the interception/PLE handler.
 *
 * (b) A VCPU which has done a PLE exit/CPU-relax intercept but did not get a
 * chance last time (it has most likely become eligible now, since we probably
 * yielded to the lock holder in the last iteration). This is tracked by
 * toggling @dy_eligible each time a VCPU is checked for eligibility.
 *
 * Yielding to a recently PLE-exited/CPU-relax-intercepted VCPU before yielding
 * to a preempted lock holder could result in selecting the wrong VCPU and
 * burning CPU time. Giving priority to a potential lock holder improves lock
 * progress.
 *
 * Since the algorithm is based on heuristics, accessing another VCPU's data
 * without locking does no harm. At worst we try to yield to the same VCPU,
 * fail, and continue with the next VCPU, and so on.
 */
3925
static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3926
{
3927
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3928
bool eligible;
3929
3930
eligible = !vcpu->spin_loop.in_spin_loop ||
3931
vcpu->spin_loop.dy_eligible;
3932
3933
if (vcpu->spin_loop.in_spin_loop)
3934
kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3935
3936
return eligible;
3937
#else
3938
return true;
3939
#endif
3940
}
3941
3942
/*
3943
* Unlike kvm_arch_vcpu_runnable, this function is called outside
3944
* a vcpu_load/vcpu_put pair. However, for most architectures
3945
* kvm_arch_vcpu_runnable does not require vcpu_load.
3946
*/
3947
bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3948
{
3949
return kvm_arch_vcpu_runnable(vcpu);
3950
}
3951
3952
static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3953
{
3954
if (kvm_arch_dy_runnable(vcpu))
3955
return true;
3956
3957
#ifdef CONFIG_KVM_ASYNC_PF
3958
if (!list_empty_careful(&vcpu->async_pf.done))
3959
return true;
3960
#endif
3961
3962
return false;
3963
}
3964
3965
/*
 * By default, simply query the target vCPU's current mode when checking if a
 * vCPU was preempted in kernel mode. All architectures except x86 (or more
 * specifically, except VMX) allow querying whether or not a vCPU is in kernel
 * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
 * directly for cross-vCPU checks is functionally correct and accurate.
 */
3972
bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
3973
{
3974
return kvm_arch_vcpu_in_kernel(vcpu);
3975
}
3976
3977
bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3978
{
3979
return false;
3980
}
3981
3982
void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3983
{
3984
int nr_vcpus, start, i, idx, yielded;
3985
struct kvm *kvm = me->kvm;
3986
struct kvm_vcpu *vcpu;
3987
int try = 3;
3988
3989
nr_vcpus = atomic_read(&kvm->online_vcpus);
3990
if (nr_vcpus < 2)
3991
return;
3992
3993
/* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
3994
smp_rmb();
3995
3996
kvm_vcpu_set_in_spin_loop(me, true);
3997
3998
/*
3999
* The current vCPU ("me") is spinning in kernel mode, i.e. is likely
4000
* waiting for a resource to become available. Attempt to yield to a
4001
* vCPU that is runnable, but not currently running, e.g. because the
4002
* vCPU was preempted by a higher priority task. With luck, the vCPU
4003
* that was preempted is holding a lock or some other resource that the
4004
* current vCPU is waiting to acquire, and yielding to the other vCPU
4005
* will allow it to make forward progress and release the lock (or kick
4006
* the spinning vCPU, etc).
4007
*
4008
* Since KVM has no insight into what exactly the guest is doing,
4009
* approximate a round-robin selection by iterating over all vCPUs,
4010
* starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu,
4011
* iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
4012
*
4013
* Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
4014
* they may all try to yield to the same vCPU(s). But as above, this
4015
* is all best effort due to KVM's lack of visibility into the guest.
4016
*/
4017
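/*
 * For illustration: with 4 online vCPUs and last_boosted_vcpu == 2, the
 * loop below scans vCPU indices 3, 0, 1, 2, skipping "me".
 */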
start = READ_ONCE(kvm->last_boosted_vcpu) + 1;
4018
for (i = 0; i < nr_vcpus; i++) {
4019
idx = (start + i) % nr_vcpus;
4020
if (idx == me->vcpu_idx)
4021
continue;
4022
4023
vcpu = xa_load(&kvm->vcpu_array, idx);
4024
if (!READ_ONCE(vcpu->ready))
4025
continue;
4026
if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
4027
continue;
4028
4029
/*
4030
* Treat the target vCPU as being in-kernel if it has a pending
4031
* interrupt, as the vCPU trying to yield may be spinning
4032
* waiting on IPI delivery, i.e. the target vCPU is in-kernel
4033
* for the purposes of directed yield.
4034
*/
4035
if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
4036
!kvm_arch_dy_has_pending_interrupt(vcpu) &&
4037
!kvm_arch_vcpu_preempted_in_kernel(vcpu))
4038
continue;
4039
4040
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
4041
continue;
4042
4043
yielded = kvm_vcpu_yield_to(vcpu);
4044
if (yielded > 0) {
4045
WRITE_ONCE(kvm->last_boosted_vcpu, idx);
4046
break;
4047
} else if (yielded < 0 && !--try) {
4048
break;
4049
}
4050
}
4051
kvm_vcpu_set_in_spin_loop(me, false);
4052
4053
/* Ensure vcpu is not eligible during next spinloop */
4054
kvm_vcpu_set_dy_eligible(me, false);
4055
}
4056
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_on_spin);
4057
4058
static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
4059
{
4060
#ifdef CONFIG_HAVE_KVM_DIRTY_RING
4061
return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
4062
(pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
4063
kvm->dirty_ring_size / PAGE_SIZE);
4064
#else
4065
return false;
4066
#endif
4067
}
4068
4069
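/*
 * Layout of the vCPU mmap region served by kvm_vcpu_fault(): page offset 0
 * maps the kvm_run structure, KVM_PIO_PAGE_OFFSET maps the x86 PIO scratch
 * page, KVM_COALESCED_MMIO_PAGE_OFFSET maps the coalesced MMIO ring, and
 * pages starting at KVM_DIRTY_LOG_PAGE_OFFSET map the vCPU's dirty ring when
 * it is enabled.
 */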
static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
4070
{
4071
struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
4072
struct page *page;
4073
4074
if (vmf->pgoff == 0)
4075
page = virt_to_page(vcpu->run);
4076
#ifdef CONFIG_X86
4077
else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
4078
page = virt_to_page(vcpu->arch.pio_data);
4079
#endif
4080
#ifdef CONFIG_KVM_MMIO
4081
else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
4082
page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
4083
#endif
4084
else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
4085
page = kvm_dirty_ring_get_page(
4086
&vcpu->dirty_ring,
4087
vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
4088
else
4089
return kvm_arch_vcpu_fault(vcpu, vmf);
4090
get_page(page);
4091
vmf->page = page;
4092
return 0;
4093
}
4094
4095
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
4096
.fault = kvm_vcpu_fault,
4097
};
4098
4099
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
4100
{
4101
struct kvm_vcpu *vcpu = file->private_data;
4102
unsigned long pages = vma_pages(vma);
4103
4104
if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
4105
kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
4106
((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
4107
return -EINVAL;
4108
4109
vma->vm_ops = &kvm_vcpu_vm_ops;
4110
return 0;
4111
}
4112
4113
static int kvm_vcpu_release(struct inode *inode, struct file *filp)
4114
{
4115
struct kvm_vcpu *vcpu = filp->private_data;
4116
4117
kvm_put_kvm(vcpu->kvm);
4118
return 0;
4119
}
4120
4121
static struct file_operations kvm_vcpu_fops = {
4122
.release = kvm_vcpu_release,
4123
.unlocked_ioctl = kvm_vcpu_ioctl,
4124
.mmap = kvm_vcpu_mmap,
4125
.llseek = noop_llseek,
4126
KVM_COMPAT(kvm_vcpu_compat_ioctl),
4127
};
4128
4129
/*
 * Allocates an anonymous inode and file descriptor for the vcpu.
 */
4132
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
4133
{
4134
char name[8 + 1 + ITOA_MAX_LEN + 1];
4135
4136
snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
4137
return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
4138
}
4139
4140
#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
4141
static int vcpu_get_pid(void *data, u64 *val)
4142
{
4143
struct kvm_vcpu *vcpu = data;
4144
4145
read_lock(&vcpu->pid_lock);
4146
*val = pid_nr(vcpu->pid);
4147
read_unlock(&vcpu->pid_lock);
4148
return 0;
4149
}
4150
4151
DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
4152
4153
static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
4154
{
4155
struct dentry *debugfs_dentry;
4156
char dir_name[ITOA_MAX_LEN * 2];
4157
4158
if (!debugfs_initialized())
4159
return;
4160
4161
snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
4162
debugfs_dentry = debugfs_create_dir(dir_name,
4163
vcpu->kvm->debugfs_dentry);
4164
debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
4165
&vcpu_get_pid_fops);
4166
4167
kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
4168
}
4169
#endif
4170
4171
/*
 * Creates a virtual CPU with the requested ID and exposes it to userspace via
 * an anonymous inode file descriptor.
 */
4174
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
4175
{
4176
int r;
4177
struct kvm_vcpu *vcpu;
4178
struct page *page;
4179
4180
/*
4181
* KVM tracks vCPU IDs as 'int', be kind to userspace and reject
4182
* too-large values instead of silently truncating.
4183
*
4184
* Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
4185
* changing the storage type (at the very least, IDs should be tracked
4186
* as unsigned ints).
4187
*/
4188
BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
4189
if (id >= KVM_MAX_VCPU_IDS)
4190
return -EINVAL;
4191
4192
mutex_lock(&kvm->lock);
4193
if (kvm->created_vcpus >= kvm->max_vcpus) {
4194
mutex_unlock(&kvm->lock);
4195
return -EINVAL;
4196
}
4197
4198
r = kvm_arch_vcpu_precreate(kvm, id);
4199
if (r) {
4200
mutex_unlock(&kvm->lock);
4201
return r;
4202
}
4203
4204
kvm->created_vcpus++;
4205
mutex_unlock(&kvm->lock);
4206
4207
vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
4208
if (!vcpu) {
4209
r = -ENOMEM;
4210
goto vcpu_decrement;
4211
}
4212
4213
BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
4214
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
4215
if (!page) {
4216
r = -ENOMEM;
4217
goto vcpu_free;
4218
}
4219
vcpu->run = page_address(page);
4220
4221
kvm_vcpu_init(vcpu, kvm, id);
4222
4223
r = kvm_arch_vcpu_create(vcpu);
4224
if (r)
4225
goto vcpu_free_run_page;
4226
4227
if (kvm->dirty_ring_size) {
4228
r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring,
4229
id, kvm->dirty_ring_size);
4230
if (r)
4231
goto arch_vcpu_destroy;
4232
}
4233
4234
mutex_lock(&kvm->lock);
4235
4236
if (kvm_get_vcpu_by_id(kvm, id)) {
4237
r = -EEXIST;
4238
goto unlock_vcpu_destroy;
4239
}
4240
4241
vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
4242
r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
4243
WARN_ON_ONCE(r == -EBUSY);
4244
if (r)
4245
goto unlock_vcpu_destroy;
4246
4247
/*
4248
* Now it's all set up, let userspace reach it. Grab the vCPU's mutex
4249
* so that userspace can't invoke vCPU ioctl()s until the vCPU is fully
4250
* visible (per online_vcpus), e.g. so that KVM doesn't get tricked
4251
* into a NULL-pointer dereference because KVM thinks the _current_
4252
* vCPU doesn't exist. As a bonus, taking vcpu->mutex ensures lockdep
4253
* knows it's taken *inside* kvm->lock.
4254
*/
4255
mutex_lock(&vcpu->mutex);
4256
kvm_get_kvm(kvm);
4257
r = create_vcpu_fd(vcpu);
4258
if (r < 0)
4259
goto kvm_put_xa_erase;
4260
4261
/*
 * Pairs with smp_rmb() in kvm_get_vcpu(). Store the vcpu pointer
 * before the incremented value of kvm->online_vcpus.
 */
4265
smp_wmb();
4266
atomic_inc(&kvm->online_vcpus);
4267
mutex_unlock(&vcpu->mutex);
4268
4269
mutex_unlock(&kvm->lock);
4270
kvm_arch_vcpu_postcreate(vcpu);
4271
kvm_create_vcpu_debugfs(vcpu);
4272
return r;
4273
4274
kvm_put_xa_erase:
4275
mutex_unlock(&vcpu->mutex);
4276
kvm_put_kvm_no_destroy(kvm);
4277
xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
4278
unlock_vcpu_destroy:
4279
mutex_unlock(&kvm->lock);
4280
kvm_dirty_ring_free(&vcpu->dirty_ring);
4281
arch_vcpu_destroy:
4282
kvm_arch_vcpu_destroy(vcpu);
4283
vcpu_free_run_page:
4284
free_page((unsigned long)vcpu->run);
4285
vcpu_free:
4286
kmem_cache_free(kvm_vcpu_cache, vcpu);
4287
vcpu_decrement:
4288
mutex_lock(&kvm->lock);
4289
kvm->created_vcpus--;
4290
mutex_unlock(&kvm->lock);
4291
return r;
4292
}
4293
4294
static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4295
{
4296
if (sigset) {
4297
sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4298
vcpu->sigset_active = 1;
4299
vcpu->sigset = *sigset;
4300
} else
4301
vcpu->sigset_active = 0;
4302
return 0;
4303
}
4304
4305
static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4306
size_t size, loff_t *offset)
4307
{
4308
struct kvm_vcpu *vcpu = file->private_data;
4309
4310
return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4311
&kvm_vcpu_stats_desc[0], &vcpu->stat,
4312
sizeof(vcpu->stat), user_buffer, size, offset);
4313
}
4314
4315
static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4316
{
4317
struct kvm_vcpu *vcpu = file->private_data;
4318
4319
kvm_put_kvm(vcpu->kvm);
4320
return 0;
4321
}
4322
4323
static const struct file_operations kvm_vcpu_stats_fops = {
4324
.owner = THIS_MODULE,
4325
.read = kvm_vcpu_stats_read,
4326
.release = kvm_vcpu_stats_release,
4327
.llseek = noop_llseek,
4328
};
4329
4330
static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4331
{
4332
int fd;
4333
struct file *file;
4334
char name[15 + ITOA_MAX_LEN + 1];
4335
4336
snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4337
4338
fd = get_unused_fd_flags(O_CLOEXEC);
4339
if (fd < 0)
4340
return fd;
4341
4342
file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu,
4343
O_RDONLY, FMODE_PREAD);
4344
if (IS_ERR(file)) {
4345
put_unused_fd(fd);
4346
return PTR_ERR(file);
4347
}
4348
4349
kvm_get_kvm(vcpu->kvm);
4350
fd_install(fd, file);
4351
4352
return fd;
4353
}
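
/*
 * Illustrative userspace sketch (not part of KVM) of consuming the binary
 * stats fd created above; error handling is omitted:
 *
 *	struct kvm_stats_header hdr;
 *	int stats_fd = ioctl(vcpu_fd, KVM_GET_STATS_FD, NULL);
 *
 *	pread(stats_fd, &hdr, sizeof(hdr), 0);
 *
 * hdr.desc_offset and hdr.num_desc locate the descriptor table, and
 * hdr.data_offset locates the statistics values themselves.
 */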
4354
4355
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
4356
static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
4357
struct kvm_pre_fault_memory *range)
4358
{
4359
int idx;
4360
long r;
4361
u64 full_size;
4362
4363
if (range->flags)
4364
return -EINVAL;
4365
4366
if (!PAGE_ALIGNED(range->gpa) ||
4367
!PAGE_ALIGNED(range->size) ||
4368
range->gpa + range->size <= range->gpa)
4369
return -EINVAL;
4370
4371
vcpu_load(vcpu);
4372
idx = srcu_read_lock(&vcpu->kvm->srcu);
4373
4374
full_size = range->size;
4375
do {
4376
if (signal_pending(current)) {
4377
r = -EINTR;
4378
break;
4379
}
4380
4381
r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
4382
if (WARN_ON_ONCE(r == 0 || r == -EIO))
4383
break;
4384
4385
if (r < 0)
4386
break;
4387
4388
range->size -= r;
4389
range->gpa += r;
4390
cond_resched();
4391
} while (range->size);
4392
4393
srcu_read_unlock(&vcpu->kvm->srcu, idx);
4394
vcpu_put(vcpu);
4395
4396
/* Return success if at least one page was mapped successfully. */
4397
return full_size == range->size ? r : 0;
4398
}
4399
#endif
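
/*
 * Illustrative userspace sketch (not part of KVM) of pre-faulting guest
 * memory with the ioctl handled above; the GPA and size are arbitrary
 * examples, and KVM passes the not-yet-faulted remainder back in @range:
 *
 *	struct kvm_pre_fault_memory range = {
 *		.gpa  = 0x100000,
 *		.size = 0x200000,
 *	};
 *
 *	ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range);
 */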
4400
4401
static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu)
4402
{
4403
struct kvm *kvm = vcpu->kvm;
4404
4405
/*
4406
* In practice, this happy path will always be taken, as a well-behaved
4407
* VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns.
4408
*/
4409
if (likely(vcpu->vcpu_idx < atomic_read(&kvm->online_vcpus)))
4410
return 0;
4411
4412
/*
4413
* Acquire and release the vCPU's mutex to wait for vCPU creation to
4414
* complete (kvm_vm_ioctl_create_vcpu() holds the mutex until the vCPU
4415
* is fully online).
4416
*/
4417
if (mutex_lock_killable(&vcpu->mutex))
4418
return -EINTR;
4419
4420
mutex_unlock(&vcpu->mutex);
4421
4422
if (WARN_ON_ONCE(!kvm_get_vcpu(kvm, vcpu->vcpu_idx)))
4423
return -EIO;
4424
4425
return 0;
4426
}
4427
4428
static long kvm_vcpu_ioctl(struct file *filp,
4429
unsigned int ioctl, unsigned long arg)
4430
{
4431
struct kvm_vcpu *vcpu = filp->private_data;
4432
void __user *argp = (void __user *)arg;
4433
int r;
4434
struct kvm_fpu *fpu = NULL;
4435
struct kvm_sregs *kvm_sregs = NULL;
4436
4437
if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4438
return -EIO;
4439
4440
if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4441
return -EINVAL;
4442
4443
/*
4444
* Wait for the vCPU to be online before handling the ioctl(), as KVM
4445
* assumes the vCPU is reachable via vcpu_array, i.e. may dereference
4446
* a NULL pointer if userspace invokes an ioctl() before KVM is ready.
4447
*/
4448
r = kvm_wait_for_vcpu_online(vcpu);
4449
if (r)
4450
return r;
4451
4452
/*
4453
* Let arch code handle select vCPU ioctls without holding vcpu->mutex,
4454
* e.g. to support ioctls that can run asynchronous to vCPU execution.
4455
*/
4456
r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg);
4457
if (r != -ENOIOCTLCMD)
4458
return r;
4459
4460
if (mutex_lock_killable(&vcpu->mutex))
4461
return -EINTR;
4462
switch (ioctl) {
4463
case KVM_RUN: {
4464
struct pid *oldpid;
4465
r = -EINVAL;
4466
if (arg)
4467
goto out;
4468
4469
/*
 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
 * directly to this vCPU.
 */
4475
oldpid = vcpu->pid;
4476
if (unlikely(oldpid != task_pid(current))) {
4477
/* The thread running this VCPU changed. */
4478
struct pid *newpid;
4479
4480
r = kvm_arch_vcpu_run_pid_change(vcpu);
4481
if (r)
4482
break;
4483
4484
newpid = get_task_pid(current, PIDTYPE_PID);
4485
write_lock(&vcpu->pid_lock);
4486
vcpu->pid = newpid;
4487
write_unlock(&vcpu->pid_lock);
4488
4489
put_pid(oldpid);
4490
}
4491
vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
4492
r = kvm_arch_vcpu_ioctl_run(vcpu);
4493
vcpu->wants_to_run = false;
4494
4495
/*
4496
* FIXME: Remove this hack once all KVM architectures
4497
* support the generic TIF bits, i.e. a dedicated TIF_RSEQ.
4498
*/
4499
rseq_virt_userspace_exit();
4500
4501
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
4502
break;
4503
}
4504
case KVM_GET_REGS: {
4505
struct kvm_regs *kvm_regs;
4506
4507
r = -ENOMEM;
4508
kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
4509
if (!kvm_regs)
4510
goto out;
4511
r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4512
if (r)
4513
goto out_free1;
4514
r = -EFAULT;
4515
if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4516
goto out_free1;
4517
r = 0;
4518
out_free1:
4519
kfree(kvm_regs);
4520
break;
4521
}
4522
case KVM_SET_REGS: {
4523
struct kvm_regs *kvm_regs;
4524
4525
kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4526
if (IS_ERR(kvm_regs)) {
4527
r = PTR_ERR(kvm_regs);
4528
goto out;
4529
}
4530
r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
4531
kfree(kvm_regs);
4532
break;
4533
}
4534
case KVM_GET_SREGS: {
4535
kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
4536
r = -ENOMEM;
4537
if (!kvm_sregs)
4538
goto out;
4539
r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
4540
if (r)
4541
goto out;
4542
r = -EFAULT;
4543
if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
4544
goto out;
4545
r = 0;
4546
break;
4547
}
4548
case KVM_SET_SREGS: {
4549
kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4550
if (IS_ERR(kvm_sregs)) {
4551
r = PTR_ERR(kvm_sregs);
4552
kvm_sregs = NULL;
4553
goto out;
4554
}
4555
r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
4556
break;
4557
}
4558
case KVM_GET_MP_STATE: {
4559
struct kvm_mp_state mp_state;
4560
4561
r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4562
if (r)
4563
goto out;
4564
r = -EFAULT;
4565
if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
4566
goto out;
4567
r = 0;
4568
break;
4569
}
4570
case KVM_SET_MP_STATE: {
4571
struct kvm_mp_state mp_state;
4572
4573
r = -EFAULT;
4574
if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
4575
goto out;
4576
r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
4577
break;
4578
}
4579
case KVM_TRANSLATE: {
4580
struct kvm_translation tr;
4581
4582
r = -EFAULT;
4583
if (copy_from_user(&tr, argp, sizeof(tr)))
4584
goto out;
4585
r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
4586
if (r)
4587
goto out;
4588
r = -EFAULT;
4589
if (copy_to_user(argp, &tr, sizeof(tr)))
4590
goto out;
4591
r = 0;
4592
break;
4593
}
4594
case KVM_SET_GUEST_DEBUG: {
4595
struct kvm_guest_debug dbg;
4596
4597
r = -EFAULT;
4598
if (copy_from_user(&dbg, argp, sizeof(dbg)))
4599
goto out;
4600
r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4601
break;
4602
}
4603
case KVM_SET_SIGNAL_MASK: {
4604
struct kvm_signal_mask __user *sigmask_arg = argp;
4605
struct kvm_signal_mask kvm_sigmask;
4606
sigset_t sigset, *p;
4607
4608
p = NULL;
4609
if (argp) {
4610
r = -EFAULT;
4611
if (copy_from_user(&kvm_sigmask, argp,
4612
sizeof(kvm_sigmask)))
4613
goto out;
4614
r = -EINVAL;
4615
if (kvm_sigmask.len != sizeof(sigset))
4616
goto out;
4617
r = -EFAULT;
4618
if (copy_from_user(&sigset, sigmask_arg->sigset,
4619
sizeof(sigset)))
4620
goto out;
4621
p = &sigset;
4622
}
4623
r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4624
break;
4625
}
4626
case KVM_GET_FPU: {
4627
fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
4628
r = -ENOMEM;
4629
if (!fpu)
4630
goto out;
4631
r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4632
if (r)
4633
goto out;
4634
r = -EFAULT;
4635
if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4636
goto out;
4637
r = 0;
4638
break;
4639
}
4640
case KVM_SET_FPU: {
4641
fpu = memdup_user(argp, sizeof(*fpu));
4642
if (IS_ERR(fpu)) {
4643
r = PTR_ERR(fpu);
4644
fpu = NULL;
4645
goto out;
4646
}
4647
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4648
break;
4649
}
4650
case KVM_GET_STATS_FD: {
4651
r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4652
break;
4653
}
4654
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
4655
case KVM_PRE_FAULT_MEMORY: {
4656
struct kvm_pre_fault_memory range;
4657
4658
r = -EFAULT;
4659
if (copy_from_user(&range, argp, sizeof(range)))
4660
break;
4661
r = kvm_vcpu_pre_fault_memory(vcpu, &range);
4662
/* Pass back leftover range. */
4663
if (copy_to_user(argp, &range, sizeof(range)))
4664
r = -EFAULT;
4665
break;
4666
}
4667
#endif
4668
default:
4669
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4670
}
4671
out:
4672
mutex_unlock(&vcpu->mutex);
4673
kfree(fpu);
4674
kfree(kvm_sregs);
4675
return r;
4676
}
4677
4678
#ifdef CONFIG_KVM_COMPAT
4679
static long kvm_vcpu_compat_ioctl(struct file *filp,
4680
unsigned int ioctl, unsigned long arg)
4681
{
4682
struct kvm_vcpu *vcpu = filp->private_data;
4683
void __user *argp = compat_ptr(arg);
4684
int r;
4685
4686
if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4687
return -EIO;
4688
4689
switch (ioctl) {
4690
case KVM_SET_SIGNAL_MASK: {
4691
struct kvm_signal_mask __user *sigmask_arg = argp;
4692
struct kvm_signal_mask kvm_sigmask;
4693
sigset_t sigset;
4694
4695
if (argp) {
4696
r = -EFAULT;
4697
if (copy_from_user(&kvm_sigmask, argp,
4698
sizeof(kvm_sigmask)))
4699
goto out;
4700
r = -EINVAL;
4701
if (kvm_sigmask.len != sizeof(compat_sigset_t))
4702
goto out;
4703
r = -EFAULT;
4704
if (get_compat_sigset(&sigset,
4705
(compat_sigset_t __user *)sigmask_arg->sigset))
4706
goto out;
4707
r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4708
} else
4709
r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4710
break;
4711
}
4712
default:
4713
r = kvm_vcpu_ioctl(filp, ioctl, arg);
4714
}
4715
4716
out:
4717
return r;
4718
}
4719
#endif
4720
4721
static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4722
{
4723
struct kvm_device *dev = filp->private_data;
4724
4725
if (dev->ops->mmap)
4726
return dev->ops->mmap(dev, vma);
4727
4728
return -ENODEV;
4729
}
4730
4731
static int kvm_device_ioctl_attr(struct kvm_device *dev,
4732
int (*accessor)(struct kvm_device *dev,
4733
struct kvm_device_attr *attr),
4734
unsigned long arg)
4735
{
4736
struct kvm_device_attr attr;
4737
4738
if (!accessor)
4739
return -EPERM;
4740
4741
if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4742
return -EFAULT;
4743
4744
return accessor(dev, &attr);
4745
}
4746
4747
static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4748
unsigned long arg)
4749
{
4750
struct kvm_device *dev = filp->private_data;
4751
4752
if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4753
return -EIO;
4754
4755
switch (ioctl) {
4756
case KVM_SET_DEVICE_ATTR:
4757
return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4758
case KVM_GET_DEVICE_ATTR:
4759
return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4760
case KVM_HAS_DEVICE_ATTR:
4761
return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4762
default:
4763
if (dev->ops->ioctl)
4764
return dev->ops->ioctl(dev, ioctl, arg);
4765
4766
return -ENOTTY;
4767
}
4768
}
4769
4770
static int kvm_device_release(struct inode *inode, struct file *filp)
4771
{
4772
struct kvm_device *dev = filp->private_data;
4773
struct kvm *kvm = dev->kvm;
4774
4775
if (dev->ops->release) {
4776
mutex_lock(&kvm->lock);
4777
list_del_rcu(&dev->vm_node);
4778
synchronize_rcu();
4779
dev->ops->release(dev);
4780
mutex_unlock(&kvm->lock);
4781
}
4782
4783
kvm_put_kvm(kvm);
4784
return 0;
4785
}
4786
4787
static struct file_operations kvm_device_fops = {
4788
.unlocked_ioctl = kvm_device_ioctl,
4789
.release = kvm_device_release,
4790
KVM_COMPAT(kvm_device_ioctl),
4791
.mmap = kvm_device_mmap,
4792
};
4793
4794
struct kvm_device *kvm_device_from_filp(struct file *filp)
4795
{
4796
if (filp->f_op != &kvm_device_fops)
4797
return NULL;
4798
4799
return filp->private_data;
4800
}
4801
4802
static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4803
#ifdef CONFIG_KVM_MPIC
4804
[KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4805
[KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4806
#endif
4807
};
4808
4809
int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4810
{
4811
if (type >= ARRAY_SIZE(kvm_device_ops_table))
4812
return -ENOSPC;
4813
4814
if (kvm_device_ops_table[type] != NULL)
4815
return -EEXIST;
4816
4817
kvm_device_ops_table[type] = ops;
4818
return 0;
4819
}
4820
4821
void kvm_unregister_device_ops(u32 type)
4822
{
4823
if (kvm_device_ops_table[type] != NULL)
4824
kvm_device_ops_table[type] = NULL;
4825
}
4826
4827
static int kvm_ioctl_create_device(struct kvm *kvm,
4828
struct kvm_create_device *cd)
4829
{
4830
const struct kvm_device_ops *ops;
4831
struct kvm_device *dev;
4832
bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4833
int type;
4834
int ret;
4835
4836
if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4837
return -ENODEV;
4838
4839
type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4840
ops = kvm_device_ops_table[type];
4841
if (ops == NULL)
4842
return -ENODEV;
4843
4844
if (test)
4845
return 0;
4846
4847
dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4848
if (!dev)
4849
return -ENOMEM;
4850
4851
dev->ops = ops;
4852
dev->kvm = kvm;
4853
4854
mutex_lock(&kvm->lock);
4855
ret = ops->create(dev, type);
4856
if (ret < 0) {
4857
mutex_unlock(&kvm->lock);
4858
kfree(dev);
4859
return ret;
4860
}
4861
list_add_rcu(&dev->vm_node, &kvm->devices);
4862
mutex_unlock(&kvm->lock);
4863
4864
if (ops->init)
4865
ops->init(dev);
4866
4867
kvm_get_kvm(kvm);
4868
ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4869
if (ret < 0) {
4870
kvm_put_kvm_no_destroy(kvm);
4871
mutex_lock(&kvm->lock);
4872
list_del_rcu(&dev->vm_node);
4873
synchronize_rcu();
4874
if (ops->release)
4875
ops->release(dev);
4876
mutex_unlock(&kvm->lock);
4877
if (ops->destroy)
4878
ops->destroy(dev);
4879
return ret;
4880
}
4881
4882
cd->fd = ret;
4883
return 0;
4884
}
4885
4886
static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4887
{
4888
switch (arg) {
4889
case KVM_CAP_USER_MEMORY:
4890
case KVM_CAP_USER_MEMORY2:
4891
case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4892
case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4893
case KVM_CAP_INTERNAL_ERROR_DATA:
4894
#ifdef CONFIG_HAVE_KVM_MSI
4895
case KVM_CAP_SIGNAL_MSI:
4896
#endif
4897
#ifdef CONFIG_HAVE_KVM_IRQCHIP
4898
case KVM_CAP_IRQFD:
4899
#endif
4900
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4901
case KVM_CAP_CHECK_EXTENSION_VM:
4902
case KVM_CAP_ENABLE_CAP_VM:
4903
case KVM_CAP_HALT_POLL:
4904
return 1;
4905
#ifdef CONFIG_KVM_MMIO
4906
case KVM_CAP_COALESCED_MMIO:
4907
return KVM_COALESCED_MMIO_PAGE_OFFSET;
4908
case KVM_CAP_COALESCED_PIO:
4909
return 1;
4910
#endif
4911
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4912
case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4913
return KVM_DIRTY_LOG_MANUAL_CAPS;
4914
#endif
4915
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4916
case KVM_CAP_IRQ_ROUTING:
4917
return KVM_MAX_IRQ_ROUTES;
4918
#endif
4919
#if KVM_MAX_NR_ADDRESS_SPACES > 1
4920
case KVM_CAP_MULTI_ADDRESS_SPACE:
4921
if (kvm)
4922
return kvm_arch_nr_memslot_as_ids(kvm);
4923
return KVM_MAX_NR_ADDRESS_SPACES;
4924
#endif
4925
case KVM_CAP_NR_MEMSLOTS:
4926
return KVM_USER_MEM_SLOTS;
4927
case KVM_CAP_DIRTY_LOG_RING:
4928
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4929
return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4930
#else
4931
return 0;
4932
#endif
4933
case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4934
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4935
return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4936
#else
4937
return 0;
4938
#endif
4939
#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4940
case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
4941
#endif
4942
case KVM_CAP_BINARY_STATS_FD:
4943
case KVM_CAP_SYSTEM_EVENT_DATA:
4944
case KVM_CAP_DEVICE_CTRL:
4945
return 1;
4946
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
4947
case KVM_CAP_MEMORY_ATTRIBUTES:
4948
return kvm_supported_mem_attributes(kvm);
4949
#endif
4950
#ifdef CONFIG_KVM_GUEST_MEMFD
4951
case KVM_CAP_GUEST_MEMFD:
4952
return 1;
4953
case KVM_CAP_GUEST_MEMFD_FLAGS:
4954
return kvm_gmem_get_supported_flags(kvm);
4955
#endif
4956
default:
4957
break;
4958
}
4959
return kvm_vm_ioctl_check_extension(kvm, arg);
4960
}
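
/*
 * Illustrative userspace sketch (not part of KVM): capabilities are probed
 * with KVM_CHECK_EXTENSION, whose return value is 0 when unsupported and
 * capability-specific (usually 1) otherwise:
 *
 *	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY2) > 0)
 *		use_memory_region2 = true;
 *
 * The same ioctl may also be issued on a VM fd where
 * KVM_CAP_CHECK_EXTENSION_VM is available, so the result can reflect per-VM
 * configuration.
 */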
4961
4962
static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
{
	int r;

	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
		return -EINVAL;

	/* The size must be a power of 2. */
	if (!size || (size & (size - 1)))
		return -EINVAL;

	/* The ring must hold at least the reserved entries and span a full page. */
	if (size < kvm_dirty_ring_get_rsvd_entries(kvm) *
	    sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
		return -EINVAL;

	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
	    sizeof(struct kvm_dirty_gfn))
		return -E2BIG;

	/* The size can only be set once. */
	if (kvm->dirty_ring_size)
		return -EINVAL;

	mutex_lock(&kvm->lock);

	if (kvm->created_vcpus) {
		/* The size cannot be changed after vCPUs have been created. */
		r = -EINVAL;
	} else {
		kvm->dirty_ring_size = size;
		r = 0;
	}

	mutex_unlock(&kvm->lock);
	return r;
}
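
/*
 * Illustrative userspace sketch (not part of KVM): the ring size is supplied
 * through KVM_ENABLE_CAP before any vCPU is created, e.g. for a ring of
 * 65536 entries:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_DIRTY_LOG_RING_ACQ_REL,
 *		.args[0] = 65536 * sizeof(struct kvm_dirty_gfn),
 *	};
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */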
4999
5000
static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
5001
{
5002
unsigned long i;
5003
struct kvm_vcpu *vcpu;
5004
int cleared = 0, r;
5005
5006
if (!kvm->dirty_ring_size)
5007
return -EINVAL;
5008
5009
mutex_lock(&kvm->slots_lock);
5010
5011
kvm_for_each_vcpu(i, vcpu, kvm) {
5012
r = kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring, &cleared);
5013
if (r)
5014
break;
5015
}
5016
5017
mutex_unlock(&kvm->slots_lock);
5018
5019
if (cleared)
5020
kvm_flush_remote_tlbs(kvm);
5021
5022
return cleared;
5023
}
5024
5025
int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5026
struct kvm_enable_cap *cap)
5027
{
5028
return -EINVAL;
5029
}
5030
5031
bool kvm_are_all_memslots_empty(struct kvm *kvm)
5032
{
5033
int i;
5034
5035
lockdep_assert_held(&kvm->slots_lock);
5036
5037
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
5038
if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
5039
return false;
5040
}
5041
5042
return true;
5043
}
5044
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_are_all_memslots_empty);
5045
5046
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
5047
struct kvm_enable_cap *cap)
5048
{
5049
switch (cap->cap) {
5050
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5051
case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
5052
u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
5053
5054
if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
5055
allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
5056
5057
if (cap->flags || (cap->args[0] & ~allowed_options))
5058
return -EINVAL;
5059
kvm->manual_dirty_log_protect = cap->args[0];
5060
return 0;
5061
}
5062
#endif
5063
case KVM_CAP_HALT_POLL: {
5064
if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
5065
return -EINVAL;
5066
5067
kvm->max_halt_poll_ns = cap->args[0];
5068
5069
/*
5070
* Ensure kvm->override_halt_poll_ns does not become visible
5071
* before kvm->max_halt_poll_ns.
5072
*
5073
* Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
5074
*/
5075
smp_wmb();
5076
kvm->override_halt_poll_ns = true;
5077
5078
return 0;
5079
}
5080
case KVM_CAP_DIRTY_LOG_RING:
5081
case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
5082
if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
5083
return -EINVAL;
5084
5085
return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
5086
case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
5087
int r = -EINVAL;
5088
5089
if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
5090
!kvm->dirty_ring_size || cap->flags)
5091
return r;
5092
5093
mutex_lock(&kvm->slots_lock);
5094
5095
/*
5096
* For simplicity, allow enabling ring+bitmap if and only if
5097
* there are no memslots, e.g. to ensure all memslots allocate
5098
* a bitmap after the capability is enabled.
5099
*/
5100
if (kvm_are_all_memslots_empty(kvm)) {
5101
kvm->dirty_ring_with_bitmap = true;
5102
r = 0;
5103
}
5104
5105
mutex_unlock(&kvm->slots_lock);
5106
5107
return r;
5108
}
5109
default:
5110
return kvm_vm_ioctl_enable_cap(kvm, cap);
5111
}
5112
}
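
/*
 * Illustrative userspace sketch (not part of KVM) of overriding the per-VM
 * halt-polling limit handled above; args[0] is the new maximum in
 * nanoseconds, and 0 disables halt polling for this VM:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_HALT_POLL,
 *		.args[0] = 100000,
 *	};
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */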
5113
5114
static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
5115
size_t size, loff_t *offset)
5116
{
5117
struct kvm *kvm = file->private_data;
5118
5119
return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
5120
&kvm_vm_stats_desc[0], &kvm->stat,
5121
sizeof(kvm->stat), user_buffer, size, offset);
5122
}
5123
5124
static int kvm_vm_stats_release(struct inode *inode, struct file *file)
5125
{
5126
struct kvm *kvm = file->private_data;
5127
5128
kvm_put_kvm(kvm);
5129
return 0;
5130
}
5131
5132
static const struct file_operations kvm_vm_stats_fops = {
5133
.owner = THIS_MODULE,
5134
.read = kvm_vm_stats_read,
5135
.release = kvm_vm_stats_release,
5136
.llseek = noop_llseek,
5137
};
5138
5139
static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
5140
{
5141
int fd;
5142
struct file *file;
5143
5144
fd = get_unused_fd_flags(O_CLOEXEC);
5145
if (fd < 0)
5146
return fd;
5147
5148
file = anon_inode_getfile_fmode("kvm-vm-stats",
5149
&kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD);
5150
if (IS_ERR(file)) {
5151
put_unused_fd(fd);
5152
return PTR_ERR(file);
5153
}
5154
5155
kvm_get_kvm(kvm);
5156
fd_install(fd, file);
5157
5158
return fd;
5159
}
5160
5161
#define SANITY_CHECK_MEM_REGION_FIELD(field) \
5162
do { \
5163
BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
5164
offsetof(struct kvm_userspace_memory_region2, field)); \
5165
BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
5166
sizeof_field(struct kvm_userspace_memory_region2, field)); \
5167
} while (0)
5168
5169
static long kvm_vm_ioctl(struct file *filp,
5170
unsigned int ioctl, unsigned long arg)
5171
{
5172
struct kvm *kvm = filp->private_data;
5173
void __user *argp = (void __user *)arg;
5174
int r;
5175
5176
if (kvm->mm != current->mm || kvm->vm_dead)
5177
return -EIO;
5178
switch (ioctl) {
5179
case KVM_CREATE_VCPU:
5180
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
5181
break;
5182
case KVM_ENABLE_CAP: {
5183
struct kvm_enable_cap cap;
5184
5185
r = -EFAULT;
5186
if (copy_from_user(&cap, argp, sizeof(cap)))
5187
goto out;
5188
r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
5189
break;
5190
}
5191
case KVM_SET_USER_MEMORY_REGION2:
5192
case KVM_SET_USER_MEMORY_REGION: {
5193
struct kvm_userspace_memory_region2 mem;
5194
unsigned long size;
5195
5196
if (ioctl == KVM_SET_USER_MEMORY_REGION) {
5197
/*
5198
* Fields beyond struct kvm_userspace_memory_region shouldn't be
5199
* accessed, but avoid leaking kernel memory in case of a bug.
5200
*/
5201
memset(&mem, 0, sizeof(mem));
5202
size = sizeof(struct kvm_userspace_memory_region);
5203
} else {
5204
size = sizeof(struct kvm_userspace_memory_region2);
5205
}
5206
5207
/* Ensure the common parts of the two structs are identical. */
5208
SANITY_CHECK_MEM_REGION_FIELD(slot);
5209
SANITY_CHECK_MEM_REGION_FIELD(flags);
5210
SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
5211
SANITY_CHECK_MEM_REGION_FIELD(memory_size);
5212
SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
5213
5214
r = -EFAULT;
5215
if (copy_from_user(&mem, argp, size))
5216
goto out;
5217
5218
r = -EINVAL;
5219
if (ioctl == KVM_SET_USER_MEMORY_REGION &&
5220
(mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
5221
goto out;
5222
5223
r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
5224
break;
5225
}
5226
case KVM_GET_DIRTY_LOG: {
5227
struct kvm_dirty_log log;
5228
5229
r = -EFAULT;
5230
if (copy_from_user(&log, argp, sizeof(log)))
5231
goto out;
5232
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5233
break;
5234
}
5235
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5236
case KVM_CLEAR_DIRTY_LOG: {
5237
struct kvm_clear_dirty_log log;
5238
5239
r = -EFAULT;
5240
if (copy_from_user(&log, argp, sizeof(log)))
5241
goto out;
5242
r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5243
break;
5244
}
5245
#endif
5246
#ifdef CONFIG_KVM_MMIO
5247
case KVM_REGISTER_COALESCED_MMIO: {
5248
struct kvm_coalesced_mmio_zone zone;
5249
5250
r = -EFAULT;
5251
if (copy_from_user(&zone, argp, sizeof(zone)))
5252
goto out;
5253
r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
5254
break;
5255
}
5256
case KVM_UNREGISTER_COALESCED_MMIO: {
5257
struct kvm_coalesced_mmio_zone zone;
5258
5259
r = -EFAULT;
5260
if (copy_from_user(&zone, argp, sizeof(zone)))
5261
goto out;
5262
r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
5263
break;
5264
}
5265
#endif
5266
case KVM_IRQFD: {
5267
struct kvm_irqfd data;
5268
5269
r = -EFAULT;
5270
if (copy_from_user(&data, argp, sizeof(data)))
5271
goto out;
5272
r = kvm_irqfd(kvm, &data);
5273
break;
5274
}
5275
case KVM_IOEVENTFD: {
5276
struct kvm_ioeventfd data;
5277
5278
r = -EFAULT;
5279
if (copy_from_user(&data, argp, sizeof(data)))
5280
goto out;
5281
r = kvm_ioeventfd(kvm, &data);
5282
break;
5283
}
5284
#ifdef CONFIG_HAVE_KVM_MSI
5285
case KVM_SIGNAL_MSI: {
5286
struct kvm_msi msi;
5287
5288
r = -EFAULT;
5289
if (copy_from_user(&msi, argp, sizeof(msi)))
5290
goto out;
5291
r = kvm_send_userspace_msi(kvm, &msi);
5292
break;
5293
}
5294
#endif
5295
#ifdef __KVM_HAVE_IRQ_LINE
5296
case KVM_IRQ_LINE_STATUS:
5297
case KVM_IRQ_LINE: {
5298
struct kvm_irq_level irq_event;
5299
5300
r = -EFAULT;
5301
if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
5302
goto out;
5303
5304
r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
5305
ioctl == KVM_IRQ_LINE_STATUS);
5306
if (r)
5307
goto out;
5308
5309
r = -EFAULT;
5310
if (ioctl == KVM_IRQ_LINE_STATUS) {
5311
if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
5312
goto out;
5313
}
5314
5315
r = 0;
5316
break;
5317
}
5318
#endif
5319
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
5320
case KVM_SET_GSI_ROUTING: {
5321
struct kvm_irq_routing routing;
5322
struct kvm_irq_routing __user *urouting;
5323
struct kvm_irq_routing_entry *entries = NULL;
5324
5325
r = -EFAULT;
5326
if (copy_from_user(&routing, argp, sizeof(routing)))
5327
goto out;
5328
r = -EINVAL;
5329
if (!kvm_arch_can_set_irq_routing(kvm))
5330
goto out;
5331
if (routing.nr > KVM_MAX_IRQ_ROUTES)
5332
goto out;
5333
if (routing.flags)
5334
goto out;
5335
if (routing.nr) {
5336
urouting = argp;
5337
entries = vmemdup_array_user(urouting->entries,
5338
routing.nr, sizeof(*entries));
5339
if (IS_ERR(entries)) {
5340
r = PTR_ERR(entries);
5341
goto out;
5342
}
5343
}
5344
r = kvm_set_irq_routing(kvm, entries, routing.nr,
5345
routing.flags);
5346
kvfree(entries);
5347
break;
5348
}
5349
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
5350
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
5351
case KVM_SET_MEMORY_ATTRIBUTES: {
5352
struct kvm_memory_attributes attrs;
5353
5354
r = -EFAULT;
5355
if (copy_from_user(&attrs, argp, sizeof(attrs)))
5356
goto out;
5357
5358
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
5359
break;
5360
}
5361
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
5362
case KVM_CREATE_DEVICE: {
5363
struct kvm_create_device cd;
5364
5365
r = -EFAULT;
5366
if (copy_from_user(&cd, argp, sizeof(cd)))
5367
goto out;
5368
5369
r = kvm_ioctl_create_device(kvm, &cd);
5370
if (r)
5371
goto out;
5372
5373
r = -EFAULT;
5374
if (copy_to_user(argp, &cd, sizeof(cd)))
5375
goto out;
5376
5377
r = 0;
5378
break;
5379
}
5380
case KVM_CHECK_EXTENSION:
5381
r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
5382
break;
5383
case KVM_RESET_DIRTY_RINGS:
5384
r = kvm_vm_ioctl_reset_dirty_pages(kvm);
5385
break;
5386
case KVM_GET_STATS_FD:
5387
r = kvm_vm_ioctl_get_stats_fd(kvm);
5388
break;
5389
#ifdef CONFIG_KVM_GUEST_MEMFD
5390
case KVM_CREATE_GUEST_MEMFD: {
5391
struct kvm_create_guest_memfd guest_memfd;
5392
5393
r = -EFAULT;
5394
if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
5395
goto out;
5396
5397
r = kvm_gmem_create(kvm, &guest_memfd);
5398
break;
5399
}
5400
#endif
5401
default:
5402
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
5403
}
5404
out:
5405
return r;
5406
}
5407
5408
#ifdef CONFIG_KVM_COMPAT
5409
struct compat_kvm_dirty_log {
5410
__u32 slot;
5411
__u32 padding1;
5412
union {
5413
compat_uptr_t dirty_bitmap; /* one bit per page */
5414
__u64 padding2;
5415
};
5416
};
5417
5418
struct compat_kvm_clear_dirty_log {
5419
__u32 slot;
5420
__u32 num_pages;
5421
__u64 first_page;
5422
union {
5423
compat_uptr_t dirty_bitmap; /* one bit per page */
5424
__u64 padding2;
5425
};
5426
};
5427
5428
long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5429
unsigned long arg)
5430
{
5431
return -ENOTTY;
5432
}
5433
5434
static long kvm_vm_compat_ioctl(struct file *filp,
5435
unsigned int ioctl, unsigned long arg)
5436
{
5437
struct kvm *kvm = filp->private_data;
5438
int r;
5439
5440
if (kvm->mm != current->mm || kvm->vm_dead)
5441
return -EIO;
5442
5443
r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5444
if (r != -ENOTTY)
5445
return r;
5446
5447
switch (ioctl) {
5448
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5449
case KVM_CLEAR_DIRTY_LOG: {
5450
struct compat_kvm_clear_dirty_log compat_log;
5451
struct kvm_clear_dirty_log log;
5452
5453
if (copy_from_user(&compat_log, (void __user *)arg,
5454
sizeof(compat_log)))
5455
return -EFAULT;
5456
log.slot = compat_log.slot;
5457
log.num_pages = compat_log.num_pages;
5458
log.first_page = compat_log.first_page;
5459
log.padding2 = compat_log.padding2;
5460
log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5461
5462
r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5463
break;
5464
}
5465
#endif
5466
case KVM_GET_DIRTY_LOG: {
5467
struct compat_kvm_dirty_log compat_log;
5468
struct kvm_dirty_log log;
5469
5470
if (copy_from_user(&compat_log, (void __user *)arg,
5471
sizeof(compat_log)))
5472
return -EFAULT;
5473
log.slot = compat_log.slot;
5474
log.padding1 = compat_log.padding1;
5475
log.padding2 = compat_log.padding2;
5476
log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5477
5478
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5479
break;
5480
}
5481
default:
5482
r = kvm_vm_ioctl(filp, ioctl, arg);
5483
}
5484
return r;
5485
}
5486
#endif
5487
5488
static struct file_operations kvm_vm_fops = {
5489
.release = kvm_vm_release,
5490
.unlocked_ioctl = kvm_vm_ioctl,
5491
.llseek = noop_llseek,
5492
KVM_COMPAT(kvm_vm_compat_ioctl),
5493
};
5494
5495
bool file_is_kvm(struct file *file)
5496
{
5497
return file && file->f_op == &kvm_vm_fops;
5498
}
5499
EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
5500
5501
static int kvm_dev_ioctl_create_vm(unsigned long type)
5502
{
5503
char fdname[ITOA_MAX_LEN + 1];
5504
int r, fd;
5505
struct kvm *kvm;
5506
struct file *file;
5507
5508
fd = get_unused_fd_flags(O_CLOEXEC);
5509
if (fd < 0)
5510
return fd;
5511
5512
snprintf(fdname, sizeof(fdname), "%d", fd);
5513
5514
kvm = kvm_create_vm(type, fdname);
5515
if (IS_ERR(kvm)) {
5516
r = PTR_ERR(kvm);
5517
goto put_fd;
5518
}
5519
5520
file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5521
if (IS_ERR(file)) {
5522
r = PTR_ERR(file);
5523
goto put_kvm;
5524
}
5525
5526
/*
5527
* Don't call kvm_put_kvm anymore at this point; file->f_op is
5528
* already set, with ->release() being kvm_vm_release(). In error
5529
* cases it will be called by the final fput(file) and will take
5530
* care of doing kvm_put_kvm(kvm).
5531
*/
5532
kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5533
5534
fd_install(fd, file);
5535
return fd;
5536
5537
put_kvm:
5538
kvm_put_kvm(kvm);
5539
put_fd:
5540
put_unused_fd(fd);
5541
return r;
5542
}
5543
5544
static long kvm_dev_ioctl(struct file *filp,
5545
unsigned int ioctl, unsigned long arg)
5546
{
5547
int r = -EINVAL;
5548
5549
switch (ioctl) {
5550
case KVM_GET_API_VERSION:
5551
if (arg)
5552
goto out;
5553
r = KVM_API_VERSION;
5554
break;
5555
case KVM_CREATE_VM:
5556
r = kvm_dev_ioctl_create_vm(arg);
5557
break;
5558
case KVM_CHECK_EXTENSION:
5559
r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5560
break;
5561
case KVM_GET_VCPU_MMAP_SIZE:
5562
if (arg)
5563
goto out;
5564
r = PAGE_SIZE; /* struct kvm_run */
5565
#ifdef CONFIG_X86
5566
r += PAGE_SIZE; /* pio data page */
5567
#endif
5568
#ifdef CONFIG_KVM_MMIO
5569
r += PAGE_SIZE; /* coalesced mmio ring page */
5570
#endif
5571
break;
5572
default:
5573
return kvm_arch_dev_ioctl(filp, ioctl, arg);
5574
}
5575
out:
5576
return r;
5577
}
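
/*
 * Illustrative userspace sketch (not part of KVM) of mapping a vCPU's kvm_run
 * structure using the size reported by KVM_GET_VCPU_MMAP_SIZE above:
 *
 *	int size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 */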
5578
5579
static struct file_operations kvm_chardev_ops = {
5580
.unlocked_ioctl = kvm_dev_ioctl,
5581
.llseek = noop_llseek,
5582
KVM_COMPAT(kvm_dev_ioctl),
5583
};
5584
5585
static struct miscdevice kvm_dev = {
5586
KVM_MINOR,
5587
"kvm",
5588
&kvm_chardev_ops,
5589
};
5590
5591
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5592
bool enable_virt_at_load = true;
5593
module_param(enable_virt_at_load, bool, 0444);
5594
EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load);
5595
5596
__visible bool kvm_rebooting;
5597
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting);
5598
5599
static DEFINE_PER_CPU(bool, virtualization_enabled);
5600
static DEFINE_MUTEX(kvm_usage_lock);
5601
static int kvm_usage_count;
5602
5603
__weak void kvm_arch_enable_virtualization(void)
5604
{
5605
5606
}
5607
5608
__weak void kvm_arch_disable_virtualization(void)
5609
{
5610
5611
}
5612
5613
static int kvm_enable_virtualization_cpu(void)
5614
{
5615
if (__this_cpu_read(virtualization_enabled))
5616
return 0;
5617
5618
if (kvm_arch_enable_virtualization_cpu()) {
5619
pr_info("kvm: enabling virtualization on CPU%d failed\n",
5620
raw_smp_processor_id());
5621
return -EIO;
5622
}
5623
5624
__this_cpu_write(virtualization_enabled, true);
5625
return 0;
5626
}
5627
5628
static int kvm_online_cpu(unsigned int cpu)
5629
{
5630
/*
5631
* Abort the CPU online process if hardware virtualization cannot
5632
* be enabled. Otherwise running VMs would encounter unrecoverable
5633
* errors when scheduled to this CPU.
5634
*/
5635
return kvm_enable_virtualization_cpu();
5636
}
5637
5638
static void kvm_disable_virtualization_cpu(void *ign)
5639
{
5640
if (!__this_cpu_read(virtualization_enabled))
5641
return;
5642
5643
kvm_arch_disable_virtualization_cpu();
5644
5645
__this_cpu_write(virtualization_enabled, false);
5646
}
5647
5648
static int kvm_offline_cpu(unsigned int cpu)
5649
{
5650
kvm_disable_virtualization_cpu(NULL);
5651
return 0;
5652
}
5653
5654
static void kvm_shutdown(void *data)
5655
{
5656
/*
5657
* Disable hardware virtualization and set kvm_rebooting to indicate
5658
* that KVM has asynchronously disabled hardware virtualization, i.e.
5659
* that relevant errors and exceptions aren't entirely unexpected.
5660
* Some flavors of hardware virtualization need to be disabled before
5661
* transferring control to firmware (to perform shutdown/reboot), e.g.
5662
* on x86, virtualization can block INIT interrupts, which are used by
5663
* firmware to pull APs back under firmware control. Note, this path
5664
* is used for both shutdown and reboot scenarios, i.e. neither name is
5665
* 100% comprehensive.
5666
*/
5667
pr_info("kvm: exiting hardware virtualization\n");
5668
kvm_rebooting = true;
5669
on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
5670
}
5671
5672
static int kvm_suspend(void *data)
5673
{
5674
/*
5675
* Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5676
* callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
5677
* count is stable. Assert that kvm_usage_lock is not held to ensure
5678
* the system isn't suspended while KVM is enabling hardware. Hardware
5679
* enabling can be preempted, but the task cannot be frozen until it has
5680
* dropped all locks (userspace tasks are frozen via a fake signal).
5681
*/
5682
lockdep_assert_not_held(&kvm_usage_lock);
5683
lockdep_assert_irqs_disabled();
5684
5685
kvm_disable_virtualization_cpu(NULL);
5686
return 0;
5687
}
5688
5689
static void kvm_resume(void *data)
5690
{
5691
lockdep_assert_not_held(&kvm_usage_lock);
5692
lockdep_assert_irqs_disabled();
5693
5694
WARN_ON_ONCE(kvm_enable_virtualization_cpu());
5695
}
5696
5697
static const struct syscore_ops kvm_syscore_ops = {
5698
.suspend = kvm_suspend,
5699
.resume = kvm_resume,
5700
.shutdown = kvm_shutdown,
5701
};
5702
5703
static struct syscore kvm_syscore = {
5704
.ops = &kvm_syscore_ops,
5705
};
5706
5707

int kvm_enable_virtualization(void)
{
        int r;

        guard(mutex)(&kvm_usage_lock);

        if (kvm_usage_count++)
                return 0;

        kvm_arch_enable_virtualization();

        r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
                              kvm_online_cpu, kvm_offline_cpu);
        if (r)
                goto err_cpuhp;

        register_syscore(&kvm_syscore);

        /*
         * Undo virtualization enabling and bail if the system is going down.
         * If userspace initiated a forced reboot, e.g. reboot -f, then it's
         * possible for an in-flight operation to enable virtualization after
         * syscore_shutdown() is called, i.e. without kvm_shutdown() being
         * invoked. Note, this relies on system_state being set _before_
         * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
         * or this CPU observes the impending shutdown. Which is why KVM uses
         * a syscore ops hook instead of registering a dedicated reboot
         * notifier (the latter runs before system_state is updated).
         */
        if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
            system_state == SYSTEM_RESTART) {
                r = -EBUSY;
                goto err_rebooting;
        }

        return 0;

err_rebooting:
        unregister_syscore(&kvm_syscore);
        cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
err_cpuhp:
        kvm_arch_disable_virtualization();
        --kvm_usage_count;
        return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization);

void kvm_disable_virtualization(void)
{
        guard(mutex)(&kvm_usage_lock);

        if (--kvm_usage_count)
                return;

        unregister_syscore(&kvm_syscore);
        cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
        kvm_arch_disable_virtualization();
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization);
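
/*
 * Editor's illustrative sketch (not part of upstream KVM): a hypothetical
 * caller that needs hardware virtualization for the lifetime of some work
 * simply pairs the refcounted helpers above.  Only the 0->1 and 1->0
 * transitions of kvm_usage_count install/remove the CPU hotplug state and
 * the syscore hooks.
 */
static int __maybe_unused example_use_virtualization(void)
{
        int r;

        r = kvm_enable_virtualization();
        if (r)
                return r;

        /* ... do work that requires VMX/SVM/etc. to stay enabled ... */

        kvm_disable_virtualization();
        return 0;
}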

static int kvm_init_virtualization(void)
{
        if (enable_virt_at_load)
                return kvm_enable_virtualization();

        return 0;
}

static void kvm_uninit_virtualization(void)
{
        if (enable_virt_at_load)
                kvm_disable_virtualization();
}
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
static int kvm_init_virtualization(void)
{
        return 0;
}

static void kvm_uninit_virtualization(void)
{

}
#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */

static void kvm_iodevice_destructor(struct kvm_io_device *dev)
{
        if (dev->ops->destructor)
                dev->ops->destructor(dev);
}

static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
        int i;

        for (i = 0; i < bus->dev_count; i++) {
                struct kvm_io_device *pos = bus->range[i].dev;

                kvm_iodevice_destructor(pos);
        }
        kfree(bus);
}

static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
                                 const struct kvm_io_range *r2)
{
        gpa_t addr1 = r1->addr;
        gpa_t addr2 = r2->addr;

        if (addr1 < addr2)
                return -1;

        /* If r2->len == 0, match the exact address. If r2->len != 0,
         * accept any overlapping write. Any order is acceptable for
         * overlapping ranges, because kvm_io_bus_get_first_dev ensures
         * we process all of them.
         */
        if (r2->len) {
                addr1 += r1->len;
                addr2 += r2->len;
        }

        if (addr1 > addr2)
                return 1;

        return 0;
}

static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
        return kvm_io_bus_cmp(p1, p2);
}

static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
                                    gpa_t addr, int len)
{
        struct kvm_io_range *range, key;
        int off;

        key = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
        };

        range = bsearch(&key, bus->range, bus->dev_count,
                        sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
        if (range == NULL)
                return -ENOENT;

        off = range - bus->range;

        while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
                off--;

        return off;
}
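
/*
 * Editor's note (added for clarity): with the comparison above, an access
 * {addr, len} and a registered range compare equal when the access lies
 * entirely within the registered range (or, for a zero-length registered
 * range, when the addresses match exactly).  Several registered ranges can
 * therefore compare equal to the same access, which is why
 * kvm_io_bus_get_first_dev() walks backwards from the bsearch() hit to the
 * first matching entry.
 */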

static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
                              struct kvm_io_range *range, const void *val)
{
        int idx;

        idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
        if (idx < 0)
                return -EOPNOTSUPP;

        while (idx < bus->dev_count &&
               kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
                if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
                                        range->len, val))
                        return idx;
                idx++;
        }

        return -EOPNOTSUPP;
}

static struct kvm_io_bus *kvm_get_bus_srcu(struct kvm *kvm, enum kvm_bus idx)
{
        /*
         * Ensure that any updates to kvm_buses[] observed by the previous vCPU
         * machine instruction are also visible to the vCPU machine instruction
         * that triggered this call.
         */
        smp_mb__after_srcu_read_lock();

        return srcu_dereference(kvm->buses[idx], &kvm->srcu);
}

int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val)
{
        struct kvm_io_bus *bus;
        struct kvm_io_range range;
        int r;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
        };

        bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
        if (!bus)
                return -ENOMEM;
        r = __kvm_io_bus_write(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_write);
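
/*
 * Editor's illustrative sketch (not part of upstream KVM): how an emulation
 * path might hand a 4-byte guest store to the in-kernel MMIO bus.  The helper
 * name and the error handling are made up, and the sketch assumes vCPU
 * context with kvm->srcu held for read, as on MMIO emulation paths.  A return
 * value of 0 from kvm_io_bus_write() means some registered device claimed the
 * access.
 */
static int __maybe_unused example_emulate_mmio_store(struct kvm_vcpu *vcpu,
                                                     gpa_t gpa, u32 data)
{
        if (!kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, sizeof(data), &data))
                return 0;

        /* No in-kernel device matched; the access must be completed elsewhere. */
        return -EOPNOTSUPP;
}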

int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
                            gpa_t addr, int len, const void *val, long cookie)
{
        struct kvm_io_bus *bus;
        struct kvm_io_range range;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
        };

        bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
        if (!bus)
                return -ENOMEM;

        /* First try the device referenced by cookie. */
        if ((cookie >= 0) && (cookie < bus->dev_count) &&
            (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
                if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
                                        val))
                        return cookie;

        /*
         * cookie contained garbage; fall back to search and return the
         * correct cookie value.
         */
        return __kvm_io_bus_write(vcpu, bus, &range, val);
}

static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
                             struct kvm_io_range *range, void *val)
{
        int idx;

        idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
        if (idx < 0)
                return -EOPNOTSUPP;

        while (idx < bus->dev_count &&
               kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
                if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
                                       range->len, val))
                        return idx;
                idx++;
        }

        return -EOPNOTSUPP;
}

int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
{
        struct kvm_io_bus *bus;
        struct kvm_io_range range;
        int r;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
        };

        bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
        if (!bus)
                return -ENOMEM;
        r = __kvm_io_bus_read(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_read);

static void __free_bus(struct rcu_head *rcu)
{
        struct kvm_io_bus *bus = container_of(rcu, struct kvm_io_bus, rcu);

        kfree(bus);
}

int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev)
{
        int i;
        struct kvm_io_bus *new_bus, *bus;
        struct kvm_io_range range;

        lockdep_assert_held(&kvm->slots_lock);

        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
                return -ENOMEM;

        /* exclude ioeventfd which is limited by maximum fd */
        if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
                return -ENOSPC;

        new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
                          GFP_KERNEL_ACCOUNT);
        if (!new_bus)
                return -ENOMEM;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
                .dev = dev,
        };

        for (i = 0; i < bus->dev_count; i++)
                if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
                        break;

        memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
        new_bus->dev_count++;
        new_bus->range[i] = range;
        memcpy(new_bus->range + i + 1, bus->range + i,
               (bus->dev_count - i) * sizeof(struct kvm_io_range));
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        call_srcu(&kvm->srcu, &bus->rcu, __free_bus);

        return 0;
}
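
/*
 * Editor's illustrative sketch (not part of upstream KVM): a minimal in-kernel
 * device on the MMIO bus.  The "example" names and the fixed 8-byte window are
 * hypothetical; real users (ioeventfd, coalesced MMIO, the arch-specific
 * devices) follow the same pattern of embedding a struct kvm_io_device,
 * supplying kvm_io_device_ops, and registering while holding kvm->slots_lock.
 */
struct example_io_device {
        struct kvm_io_device dev;
        u64 last_write;
};

static int example_io_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                            gpa_t addr, int len, const void *val)
{
        struct example_io_device *ex =
                container_of(dev, struct example_io_device, dev);

        if (len > sizeof(ex->last_write))
                return -EOPNOTSUPP;     /* let other devices or userspace handle it */

        memcpy(&ex->last_write, val, len);
        return 0;                       /* 0 == handled */
}

static const struct kvm_io_device_ops example_io_ops = {
        .write = example_io_write,
};

static int __maybe_unused example_register_io_device(struct kvm *kvm,
                                                     struct example_io_device *ex,
                                                     gpa_t addr)
{
        int r;

        kvm_iodevice_init(&ex->dev, &example_io_ops);

        mutex_lock(&kvm->slots_lock);
        r = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, addr, 8, &ex->dev);
        mutex_unlock(&kvm->slots_lock);

        return r;
}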

int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev)
{
        int i;
        struct kvm_io_bus *new_bus, *bus;

        lockdep_assert_held(&kvm->slots_lock);

        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
                return 0;

        for (i = 0; i < bus->dev_count; i++) {
                if (bus->range[i].dev == dev) {
                        break;
                }
        }

        if (i == bus->dev_count)
                return 0;

        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
        if (new_bus) {
                memcpy(new_bus, bus, struct_size(bus, range, i));
                new_bus->dev_count--;
                memcpy(new_bus->range + i, bus->range + i + 1,
                       flex_array_size(new_bus, range, new_bus->dev_count - i));
        }

        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);

        /*
         * If NULL bus is installed, destroy the old bus, including all the
         * attached devices. Otherwise, destroy the caller's device only.
         */
        if (!new_bus) {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
                kvm_io_bus_destroy(bus);
                return -ENOMEM;
        }

        kvm_iodevice_destructor(dev);
        kfree(bus);
        return 0;
}

struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr)
{
        struct kvm_io_bus *bus;
        int dev_idx, srcu_idx;
        struct kvm_io_device *iodev = NULL;

        srcu_idx = srcu_read_lock(&kvm->srcu);

        bus = kvm_get_bus_srcu(kvm, bus_idx);
        if (!bus)
                goto out_unlock;

        dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
        if (dev_idx < 0)
                goto out_unlock;

        iodev = bus->range[dev_idx].dev;

out_unlock:
        srcu_read_unlock(&kvm->srcu, srcu_idx);

        return iodev;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_get_dev);

static int kvm_debugfs_open(struct inode *inode, struct file *file,
                            int (*get)(void *, u64 *), int (*set)(void *, u64),
                            const char *fmt)
{
        int ret;
        struct kvm_stat_data *stat_data = inode->i_private;

        /*
         * The debugfs files are a reference to the kvm struct which
         * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe
         * avoids the race between open and the removal of the debugfs directory.
         */
        if (!kvm_get_kvm_safe(stat_data->kvm))
                return -ENOENT;

        ret = simple_attr_open(inode, file, get,
                               kvm_stats_debugfs_mode(stat_data->desc) & 0222
                               ? set : NULL, fmt);
        if (ret)
                kvm_put_kvm(stat_data->kvm);

        return ret;
}

static int kvm_debugfs_release(struct inode *inode, struct file *file)
{
        struct kvm_stat_data *stat_data = inode->i_private;

        simple_attr_release(inode, file);
        kvm_put_kvm(stat_data->kvm);

        return 0;
}

static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
{
        *val = *(u64 *)((void *)(&kvm->stat) + offset);

        return 0;
}

static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
{
        *(u64 *)((void *)(&kvm->stat) + offset) = 0;

        return 0;
}

static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
{
        unsigned long i;
        struct kvm_vcpu *vcpu;

        *val = 0;

        kvm_for_each_vcpu(i, vcpu, kvm)
                *val += *(u64 *)((void *)(&vcpu->stat) + offset);

        return 0;
}

static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
{
        unsigned long i;
        struct kvm_vcpu *vcpu;

        kvm_for_each_vcpu(i, vcpu, kvm)
                *(u64 *)((void *)(&vcpu->stat) + offset) = 0;

        return 0;
}
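
/*
 * Editor's illustrative sketch (not part of upstream KVM): the "offset"
 * arguments above are byte offsets into struct kvm_vm_stat / struct
 * kvm_vcpu_stat, taken from the stats descriptor tables.  Assuming the
 * generic per-VM counters are embedded as usual, reading one specific
 * counter boils down to something like:
 */
static int __maybe_unused example_read_remote_tlb_flush(struct kvm *kvm,
                                                        u64 *val)
{
        return kvm_get_stat_per_vm(kvm,
                                   offsetof(struct kvm_vm_stat,
                                            generic.remote_tlb_flush),
                                   val);
}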

static int kvm_stat_data_get(void *data, u64 *val)
{
        int r = -EFAULT;
        struct kvm_stat_data *stat_data = data;

        switch (stat_data->kind) {
        case KVM_STAT_VM:
                r = kvm_get_stat_per_vm(stat_data->kvm,
                                        stat_data->desc->desc.offset, val);
                break;
        case KVM_STAT_VCPU:
                r = kvm_get_stat_per_vcpu(stat_data->kvm,
                                          stat_data->desc->desc.offset, val);
                break;
        }

        return r;
}

static int kvm_stat_data_clear(void *data, u64 val)
{
        int r = -EFAULT;
        struct kvm_stat_data *stat_data = data;

        if (val)
                return -EINVAL;

        switch (stat_data->kind) {
        case KVM_STAT_VM:
                r = kvm_clear_stat_per_vm(stat_data->kvm,
                                          stat_data->desc->desc.offset);
                break;
        case KVM_STAT_VCPU:
                r = kvm_clear_stat_per_vcpu(stat_data->kvm,
                                            stat_data->desc->desc.offset);
                break;
        }

        return r;
}

static int kvm_stat_data_open(struct inode *inode, struct file *file)
{
        __simple_attr_check_format("%llu\n", 0ull);
        return kvm_debugfs_open(inode, file, kvm_stat_data_get,
                                kvm_stat_data_clear, "%llu\n");
}

static const struct file_operations stat_fops_per_vm = {
        .owner = THIS_MODULE,
        .open = kvm_stat_data_open,
        .release = kvm_debugfs_release,
        .read = simple_attr_read,
        .write = simple_attr_write,
};

static int vm_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;
        u64 tmp_val;

        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_get_stat_per_vm(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
        return 0;
}

static int vm_stat_clear(void *_offset, u64 val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;

        if (val)
                return -EINVAL;

        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_clear_stat_per_vm(kvm, offset);
        }
        mutex_unlock(&kvm_lock);

        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;
        u64 tmp_val;

        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
        return 0;
}

static int vcpu_stat_clear(void *_offset, u64 val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;

        if (val)
                return -EINVAL;

        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_clear_stat_per_vcpu(kvm, offset);
        }
        mutex_unlock(&kvm_lock);

        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
                        "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");

static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
        struct kobj_uevent_env *env;
        unsigned long long created, active;

        if (!kvm_dev.this_device || !kvm)
                return;

        mutex_lock(&kvm_lock);
        if (type == KVM_EVENT_CREATE_VM) {
                kvm_createvm_count++;
                kvm_active_vms++;
        } else if (type == KVM_EVENT_DESTROY_VM) {
                kvm_active_vms--;
        }
        created = kvm_createvm_count;
        active = kvm_active_vms;
        mutex_unlock(&kvm_lock);

        env = kzalloc(sizeof(*env), GFP_KERNEL);
        if (!env)
                return;

        add_uevent_var(env, "CREATED=%llu", created);
        add_uevent_var(env, "COUNT=%llu", active);

        if (type == KVM_EVENT_CREATE_VM) {
                add_uevent_var(env, "EVENT=create");
                kvm->userspace_pid = task_pid_nr(current);
        } else if (type == KVM_EVENT_DESTROY_VM) {
                add_uevent_var(env, "EVENT=destroy");
        }
        add_uevent_var(env, "PID=%d", kvm->userspace_pid);

        if (!IS_ERR(kvm->debugfs_dentry)) {
                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);

                if (p) {
                        tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
                        if (!IS_ERR(tmp))
                                add_uevent_var(env, "STATS_PATH=%s", tmp);
                        kfree(p);
                }
        }
        /* no need for checks, since we are adding at most only 5 keys */
        env->envp[env->envp_idx++] = NULL;
        kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
        kfree(env);
}

static void kvm_init_debug(void)
{
        const struct file_operations *fops;
        const struct _kvm_stats_desc *pdesc;
        int i;

        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);

        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
                pdesc = &kvm_vm_stats_desc[i];
                if (kvm_stats_debugfs_mode(pdesc) & 0222)
                        fops = &vm_stat_fops;
                else
                        fops = &vm_stat_readonly_fops;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm_debugfs_dir,
                                    (void *)(long)pdesc->desc.offset, fops);
        }

        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
                pdesc = &kvm_vcpu_stats_desc[i];
                if (kvm_stats_debugfs_mode(pdesc) & 0222)
                        fops = &vcpu_stat_fops;
                else
                        fops = &vcpu_stat_readonly_fops;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm_debugfs_dir,
                                    (void *)(long)pdesc->desc.offset, fops);
        }
}

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
        return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        WRITE_ONCE(vcpu->preempted, false);
        WRITE_ONCE(vcpu->ready, false);

        __this_cpu_write(kvm_running_vcpu, vcpu);
        kvm_arch_vcpu_load(vcpu, cpu);

        WRITE_ONCE(vcpu->scheduled_out, false);
}

static void kvm_sched_out(struct preempt_notifier *pn,
                          struct task_struct *next)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        WRITE_ONCE(vcpu->scheduled_out, true);

        if (task_is_runnable(current) && vcpu->wants_to_run) {
                WRITE_ONCE(vcpu->preempted, true);
                WRITE_ONCE(vcpu->ready, true);
        }
        kvm_arch_vcpu_put(vcpu);
        __this_cpu_write(kvm_running_vcpu, NULL);
}
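
/*
 * Editor's note (added for clarity): these two hooks back the preempt
 * notifier that each vCPU registers while it is loaded; kvm_preempt_ops is
 * wired up to them in kvm_init() below.  That is how vcpu->preempted,
 * vcpu->ready and the per-CPU kvm_running_vcpu pointer stay accurate across
 * involuntary context switches of the vCPU task.
 */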

/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * We can disable preemption locally around accessing the per-CPU variable,
 * and use the resolved vcpu pointer after enabling preemption again,
 * because even if the current thread is migrated to another CPU, reading
 * the per-CPU value later will give us the same value as we update the
 * per-CPU variable in the preempt notifier handlers.
 */
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
        struct kvm_vcpu *vcpu;

        preempt_disable();
        vcpu = __this_cpu_read(kvm_running_vcpu);
        preempt_enable();

        return vcpu;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_running_vcpu);

/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
        return &kvm_running_vcpu;
}

#ifdef CONFIG_GUEST_PERF_EVENTS
static unsigned int kvm_guest_state(void)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
        unsigned int state;

        if (!kvm_arch_pmi_in_guest(vcpu))
                return 0;

        state = PERF_GUEST_ACTIVE;
        if (!kvm_arch_vcpu_in_kernel(vcpu))
                state |= PERF_GUEST_USER;

        return state;
}

static unsigned long kvm_guest_get_ip(void)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

        /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
        if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
                return 0;

        return kvm_arch_vcpu_get_ip(vcpu);
}

static struct perf_guest_info_callbacks kvm_guest_cbs = {
        .state = kvm_guest_state,
        .get_ip = kvm_guest_get_ip,
        .handle_intel_pt_intr = NULL,
};

void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
{
        kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
}
void kvm_unregister_perf_callbacks(void)
{
        perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
}
#endif
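
/*
 * Editor's illustrative note (not part of upstream KVM): architecture code is
 * expected to register these callbacks once guest PMIs can be attributed and
 * to unregister them on teardown, e.g.:
 *
 *      kvm_register_perf_callbacks(NULL);      // no Intel PT PMI handler
 *      ...
 *      kvm_unregister_perf_callbacks();
 */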

int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
{
        int r;
        int cpu;

        /* A kmem cache lets us meet the alignment requirements of fx_save. */
        if (!vcpu_align)
                vcpu_align = __alignof__(struct kvm_vcpu);
        kvm_vcpu_cache =
                kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
                                           SLAB_ACCOUNT,
                                           offsetof(struct kvm_vcpu, arch),
                                           offsetofend(struct kvm_vcpu, stats_id)
                                           - offsetof(struct kvm_vcpu, arch),
                                           NULL);
        if (!kvm_vcpu_cache)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
                                            GFP_KERNEL, cpu_to_node(cpu))) {
                        r = -ENOMEM;
                        goto err_cpu_kick_mask;
                }
        }

        r = kvm_irqfd_init();
        if (r)
                goto err_irqfd;

        r = kvm_async_pf_init();
        if (r)
                goto err_async_pf;

        kvm_chardev_ops.owner = module;
        kvm_vm_fops.owner = module;
        kvm_vcpu_fops.owner = module;
        kvm_device_fops.owner = module;

        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;

        kvm_init_debug();

        r = kvm_vfio_ops_init();
        if (WARN_ON_ONCE(r))
                goto err_vfio;

        r = kvm_gmem_init(module);
        if (r)
                goto err_gmem;

        r = kvm_init_virtualization();
        if (r)
                goto err_virt;

        /*
         * Registration _must_ be the very last thing done, as this exposes
         * /dev/kvm to userspace, i.e. all infrastructure must be set up!
         */
        r = misc_register(&kvm_dev);
        if (r) {
                pr_err("kvm: misc device register failed\n");
                goto err_register;
        }

        return 0;

err_register:
        kvm_uninit_virtualization();
err_virt:
        kvm_gmem_exit();
err_gmem:
        kvm_vfio_ops_exit();
err_vfio:
        kvm_async_pf_deinit();
err_async_pf:
        kvm_irqfd_exit();
err_irqfd:
err_cpu_kick_mask:
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
        kmem_cache_destroy(kvm_vcpu_cache);
        return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init);
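
/*
 * Editor's illustrative sketch (not part of upstream KVM): an architecture
 * module performs its own hardware/compatibility setup first and calls
 * kvm_init() last, precisely because kvm_init() registers /dev/kvm and
 * thereby exposes KVM to userspace.  A minimal caller with no arch-private
 * vCPU state could look like:
 */
static int __maybe_unused example_arch_module_init(void)
{
        /* Arch-specific hardware setup would normally happen first. */
        return kvm_init(sizeof(struct kvm_vcpu), __alignof__(struct kvm_vcpu),
                        THIS_MODULE);
}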

void kvm_exit(void)
{
        int cpu;

        /*
         * Note, unregistering /dev/kvm doesn't strictly need to come first;
         * fops_get(), a.k.a. try_module_get(), prevents acquiring references
         * to KVM while the module is being stopped.
         */
        misc_deregister(&kvm_dev);

        kvm_uninit_virtualization();

        debugfs_remove_recursive(kvm_debugfs_dir);
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
        kmem_cache_destroy(kvm_vcpu_cache);
        kvm_gmem_exit();
        kvm_vfio_ops_exit();
        kvm_async_pf_deinit();
        kvm_irqfd_exit();
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_exit);