GitHub Repository: awilliam/linux-vfio
Path: blob/master/virt/kvm/kvm_main.c
1
/*
2
* Kernel-based Virtual Machine driver for Linux
3
*
4
* This module enables machines with Intel VT-x extensions to run virtual
5
* machines without emulation or binary translation.
6
*
7
* Copyright (C) 2006 Qumranet, Inc.
8
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
9
*
10
* Authors:
11
* Avi Kivity <[email protected]>
12
* Yaniv Kamay <[email protected]>
13
*
14
* This work is licensed under the terms of the GNU GPL, version 2. See
15
* the COPYING file in the top-level directory.
16
*
17
*/
18
19
#include "iodev.h"
20
21
#include <linux/kvm_host.h>
22
#include <linux/kvm.h>
23
#include <linux/module.h>
24
#include <linux/errno.h>
25
#include <linux/percpu.h>
26
#include <linux/mm.h>
27
#include <linux/miscdevice.h>
28
#include <linux/vmalloc.h>
29
#include <linux/reboot.h>
30
#include <linux/debugfs.h>
31
#include <linux/highmem.h>
32
#include <linux/file.h>
33
#include <linux/syscore_ops.h>
34
#include <linux/cpu.h>
35
#include <linux/sched.h>
36
#include <linux/cpumask.h>
37
#include <linux/smp.h>
38
#include <linux/anon_inodes.h>
39
#include <linux/profile.h>
40
#include <linux/kvm_para.h>
41
#include <linux/pagemap.h>
42
#include <linux/mman.h>
43
#include <linux/swap.h>
44
#include <linux/bitops.h>
45
#include <linux/spinlock.h>
46
#include <linux/compat.h>
47
#include <linux/srcu.h>
48
#include <linux/hugetlb.h>
49
#include <linux/slab.h>
50
51
#include <asm/processor.h>
52
#include <asm/io.h>
53
#include <asm/uaccess.h>
54
#include <asm/pgtable.h>
55
56
#include "coalesced_mmio.h"
57
#include "async_pf.h"
58
59
#define CREATE_TRACE_POINTS
60
#include <trace/events/kvm.h>
61
62
MODULE_AUTHOR("Qumranet");
63
MODULE_LICENSE("GPL");
64
65
/*
66
* Ordering of locks:
67
*
68
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
69
*/
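/*
 * Illustrative sketch (hypothetical caller, not a call site in this file):
 * code that needs more than one of these locks must take them in the order
 * documented above, e.g.
 *
 *	mutex_lock(&kvm->lock);
 *	mutex_lock(&kvm->slots_lock);
 *	mutex_lock(&kvm->irq_lock);
 *	...
 *	mutex_unlock(&kvm->irq_lock);
 *	mutex_unlock(&kvm->slots_lock);
 *	mutex_unlock(&kvm->lock);
 */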
70
71
DEFINE_RAW_SPINLOCK(kvm_lock);
72
LIST_HEAD(vm_list);
73
74
static cpumask_var_t cpus_hardware_enabled;
75
static int kvm_usage_count = 0;
76
static atomic_t hardware_enable_failed;
77
78
struct kmem_cache *kvm_vcpu_cache;
79
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
80
81
static __read_mostly struct preempt_ops kvm_preempt_ops;
82
83
struct dentry *kvm_debugfs_dir;
84
85
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
86
unsigned long arg);
87
static int hardware_enable_all(void);
88
static void hardware_disable_all(void);
89
90
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
91
92
bool kvm_rebooting;
93
EXPORT_SYMBOL_GPL(kvm_rebooting);
94
95
static bool largepages_enabled = true;
96
97
static struct page *hwpoison_page;
98
static pfn_t hwpoison_pfn;
99
100
static struct page *fault_page;
101
static pfn_t fault_pfn;
102
103
inline int kvm_is_mmio_pfn(pfn_t pfn)
104
{
105
if (pfn_valid(pfn)) {
106
int reserved;
107
struct page *tail = pfn_to_page(pfn);
108
struct page *head = compound_trans_head(tail);
109
reserved = PageReserved(head);
110
if (head != tail) {
111
/*
112
* "head" is not a dangling pointer
113
* (compound_trans_head takes care of that)
114
* but the hugepage may have been split
115
* from under us (and we may not hold a
116
* reference count on the head page so it can
117
* be reused before we run PageReferenced), so
118
* we have to check PageTail before returning
119
* what we just read.
120
*/
121
smp_rmb();
122
if (PageTail(tail))
123
return reserved;
124
}
125
return PageReserved(tail);
126
}
127
128
return true;
129
}
130
131
/*
132
* Switches to specified vcpu, until a matching vcpu_put()
133
*/
134
void vcpu_load(struct kvm_vcpu *vcpu)
135
{
136
int cpu;
137
138
mutex_lock(&vcpu->mutex);
139
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
140
/* The thread running this VCPU changed. */
141
struct pid *oldpid = vcpu->pid;
142
struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
143
rcu_assign_pointer(vcpu->pid, newpid);
144
synchronize_rcu();
145
put_pid(oldpid);
146
}
147
cpu = get_cpu();
148
preempt_notifier_register(&vcpu->preempt_notifier);
149
kvm_arch_vcpu_load(vcpu, cpu);
150
put_cpu();
151
}
152
153
void vcpu_put(struct kvm_vcpu *vcpu)
154
{
155
preempt_disable();
156
kvm_arch_vcpu_put(vcpu);
157
preempt_notifier_unregister(&vcpu->preempt_notifier);
158
preempt_enable();
159
mutex_unlock(&vcpu->mutex);
160
}
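/*
 * Usage sketch (assumed caller; it mirrors kvm_vcpu_ioctl() further down in
 * this file): per-vcpu work is bracketed by vcpu_load()/vcpu_put() so the
 * architecture state is resident on the current CPU and preemption is
 * tracked through the preempt notifier:
 *
 *	vcpu_load(vcpu);
 *	r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
 *	vcpu_put(vcpu);
 */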
161
162
static void ack_flush(void *_completed)
163
{
164
}
165
166
static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
167
{
168
int i, cpu, me;
169
cpumask_var_t cpus;
170
bool called = true;
171
struct kvm_vcpu *vcpu;
172
173
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
174
175
me = get_cpu();
176
kvm_for_each_vcpu(i, vcpu, kvm) {
177
kvm_make_request(req, vcpu);
178
cpu = vcpu->cpu;
179
180
/* Set ->requests bit before we read ->mode */
181
smp_mb();
182
183
if (cpus != NULL && cpu != -1 && cpu != me &&
184
kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
185
cpumask_set_cpu(cpu, cpus);
186
}
187
if (unlikely(cpus == NULL))
188
smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
189
else if (!cpumask_empty(cpus))
190
smp_call_function_many(cpus, ack_flush, NULL, 1);
191
else
192
called = false;
193
put_cpu();
194
free_cpumask_var(cpus);
195
return called;
196
}
197
198
void kvm_flush_remote_tlbs(struct kvm *kvm)
199
{
200
int dirty_count = kvm->tlbs_dirty;
201
202
smp_mb();
203
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
204
++kvm->stat.remote_tlb_flush;
205
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
206
}
207
208
void kvm_reload_remote_mmus(struct kvm *kvm)
209
{
210
make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
211
}
212
213
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
214
{
215
struct page *page;
216
int r;
217
218
mutex_init(&vcpu->mutex);
219
vcpu->cpu = -1;
220
vcpu->kvm = kvm;
221
vcpu->vcpu_id = id;
222
vcpu->pid = NULL;
223
init_waitqueue_head(&vcpu->wq);
224
kvm_async_pf_vcpu_init(vcpu);
225
226
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
227
if (!page) {
228
r = -ENOMEM;
229
goto fail;
230
}
231
vcpu->run = page_address(page);
232
233
r = kvm_arch_vcpu_init(vcpu);
234
if (r < 0)
235
goto fail_free_run;
236
return 0;
237
238
fail_free_run:
239
free_page((unsigned long)vcpu->run);
240
fail:
241
return r;
242
}
243
EXPORT_SYMBOL_GPL(kvm_vcpu_init);
244
245
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
246
{
247
put_pid(vcpu->pid);
248
kvm_arch_vcpu_uninit(vcpu);
249
free_page((unsigned long)vcpu->run);
250
}
251
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
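/*
 * Sketch of the expected arch-side pairing (an assumption; the real call
 * sites live in arch code, not in this file): kvm_arch_vcpu_create()
 * allocates its container around struct kvm_vcpu and calls kvm_vcpu_init(),
 * and the arch destroy path calls kvm_vcpu_uninit() before freeing it.
 * "my_vcpu" below is a hypothetical arch structure:
 *
 *	r = kvm_vcpu_init(&my_vcpu->vcpu, kvm, id);
 *	if (r)
 *		goto free;
 *	...
 *	kvm_vcpu_uninit(&my_vcpu->vcpu);
 */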
252
253
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
254
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
255
{
256
return container_of(mn, struct kvm, mmu_notifier);
257
}
258
259
static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
260
struct mm_struct *mm,
261
unsigned long address)
262
{
263
struct kvm *kvm = mmu_notifier_to_kvm(mn);
264
int need_tlb_flush, idx;
265
266
/*
267
* When ->invalidate_page runs, the linux pte has been zapped
268
* already but the page is still allocated until
269
* ->invalidate_page returns. So if we increase the sequence
270
* here the kvm page fault will notice if the spte can't be
271
* established because the page is going to be freed. If
272
* instead the kvm page fault establishes the spte before
273
* ->invalidate_page runs, kvm_unmap_hva will release it
274
* before returning.
275
*
276
* The sequence increase only needs to be seen at spin_unlock
277
* time, and not at spin_lock time.
278
*
279
* Increasing the sequence after the spin_unlock would be
280
* unsafe because the kvm page fault could then establish the
281
* pte after kvm_unmap_hva returned, without noticing the page
282
* is going to be freed.
283
*/
284
idx = srcu_read_lock(&kvm->srcu);
285
spin_lock(&kvm->mmu_lock);
286
kvm->mmu_notifier_seq++;
287
need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
288
spin_unlock(&kvm->mmu_lock);
289
srcu_read_unlock(&kvm->srcu, idx);
290
291
/* we have to flush the TLB before the pages can be freed */
292
if (need_tlb_flush)
293
kvm_flush_remote_tlbs(kvm);
294
295
}
296
297
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
298
struct mm_struct *mm,
299
unsigned long address,
300
pte_t pte)
301
{
302
struct kvm *kvm = mmu_notifier_to_kvm(mn);
303
int idx;
304
305
idx = srcu_read_lock(&kvm->srcu);
306
spin_lock(&kvm->mmu_lock);
307
kvm->mmu_notifier_seq++;
308
kvm_set_spte_hva(kvm, address, pte);
309
spin_unlock(&kvm->mmu_lock);
310
srcu_read_unlock(&kvm->srcu, idx);
311
}
312
313
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
314
struct mm_struct *mm,
315
unsigned long start,
316
unsigned long end)
317
{
318
struct kvm *kvm = mmu_notifier_to_kvm(mn);
319
int need_tlb_flush = 0, idx;
320
321
idx = srcu_read_lock(&kvm->srcu);
322
spin_lock(&kvm->mmu_lock);
323
/*
324
* The count increase must become visible at unlock time as no
325
* spte can be established without taking the mmu_lock and
326
* count is also read inside the mmu_lock critical section.
327
*/
328
kvm->mmu_notifier_count++;
329
for (; start < end; start += PAGE_SIZE)
330
need_tlb_flush |= kvm_unmap_hva(kvm, start);
331
need_tlb_flush |= kvm->tlbs_dirty;
332
spin_unlock(&kvm->mmu_lock);
333
srcu_read_unlock(&kvm->srcu, idx);
334
335
/* we have to flush the TLB before the pages can be freed */
336
if (need_tlb_flush)
337
kvm_flush_remote_tlbs(kvm);
338
}
339
340
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
341
struct mm_struct *mm,
342
unsigned long start,
343
unsigned long end)
344
{
345
struct kvm *kvm = mmu_notifier_to_kvm(mn);
346
347
spin_lock(&kvm->mmu_lock);
348
/*
349
* This sequence increase will notify the kvm page fault that
350
* the page that is going to be mapped in the spte could have
351
* been freed.
352
*/
353
kvm->mmu_notifier_seq++;
354
/*
355
* The above sequence increase must be visible before the
356
* below count decrease but both values are read by the kvm
357
* page fault under mmu_lock spinlock so we don't need to add
358
* an smp_wmb() here in between the two.
359
*/
360
kvm->mmu_notifier_count--;
361
spin_unlock(&kvm->mmu_lock);
362
363
BUG_ON(kvm->mmu_notifier_count < 0);
364
}
365
366
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
367
struct mm_struct *mm,
368
unsigned long address)
369
{
370
struct kvm *kvm = mmu_notifier_to_kvm(mn);
371
int young, idx;
372
373
idx = srcu_read_lock(&kvm->srcu);
374
spin_lock(&kvm->mmu_lock);
375
young = kvm_age_hva(kvm, address);
376
spin_unlock(&kvm->mmu_lock);
377
srcu_read_unlock(&kvm->srcu, idx);
378
379
if (young)
380
kvm_flush_remote_tlbs(kvm);
381
382
return young;
383
}
384
385
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
386
struct mm_struct *mm,
387
unsigned long address)
388
{
389
struct kvm *kvm = mmu_notifier_to_kvm(mn);
390
int young, idx;
391
392
idx = srcu_read_lock(&kvm->srcu);
393
spin_lock(&kvm->mmu_lock);
394
young = kvm_test_age_hva(kvm, address);
395
spin_unlock(&kvm->mmu_lock);
396
srcu_read_unlock(&kvm->srcu, idx);
397
398
return young;
399
}
400
401
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
402
struct mm_struct *mm)
403
{
404
struct kvm *kvm = mmu_notifier_to_kvm(mn);
405
int idx;
406
407
idx = srcu_read_lock(&kvm->srcu);
408
kvm_arch_flush_shadow(kvm);
409
srcu_read_unlock(&kvm->srcu, idx);
410
}
411
412
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
413
.invalidate_page = kvm_mmu_notifier_invalidate_page,
414
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
415
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
416
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
417
.test_young = kvm_mmu_notifier_test_young,
418
.change_pte = kvm_mmu_notifier_change_pte,
419
.release = kvm_mmu_notifier_release,
420
};
421
422
static int kvm_init_mmu_notifier(struct kvm *kvm)
423
{
424
kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
425
return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
426
}
427
428
#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
429
430
static int kvm_init_mmu_notifier(struct kvm *kvm)
431
{
432
return 0;
433
}
434
435
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
436
437
static struct kvm *kvm_create_vm(void)
438
{
439
int r, i;
440
struct kvm *kvm = kvm_arch_alloc_vm();
441
442
if (!kvm)
443
return ERR_PTR(-ENOMEM);
444
445
r = kvm_arch_init_vm(kvm);
446
if (r)
447
goto out_err_nodisable;
448
449
r = hardware_enable_all();
450
if (r)
451
goto out_err_nodisable;
452
453
#ifdef CONFIG_HAVE_KVM_IRQCHIP
454
INIT_HLIST_HEAD(&kvm->mask_notifier_list);
455
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
456
#endif
457
458
r = -ENOMEM;
459
kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
460
if (!kvm->memslots)
461
goto out_err_nosrcu;
462
if (init_srcu_struct(&kvm->srcu))
463
goto out_err_nosrcu;
464
for (i = 0; i < KVM_NR_BUSES; i++) {
465
kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
466
GFP_KERNEL);
467
if (!kvm->buses[i])
468
goto out_err;
469
}
470
471
spin_lock_init(&kvm->mmu_lock);
472
kvm->mm = current->mm;
473
atomic_inc(&kvm->mm->mm_count);
474
kvm_eventfd_init(kvm);
475
mutex_init(&kvm->lock);
476
mutex_init(&kvm->irq_lock);
477
mutex_init(&kvm->slots_lock);
478
atomic_set(&kvm->users_count, 1);
479
480
r = kvm_init_mmu_notifier(kvm);
481
if (r)
482
goto out_err;
483
484
raw_spin_lock(&kvm_lock);
485
list_add(&kvm->vm_list, &vm_list);
486
raw_spin_unlock(&kvm_lock);
487
488
return kvm;
489
490
out_err:
491
cleanup_srcu_struct(&kvm->srcu);
492
out_err_nosrcu:
493
hardware_disable_all();
494
out_err_nodisable:
495
for (i = 0; i < KVM_NR_BUSES; i++)
496
kfree(kvm->buses[i]);
497
kfree(kvm->memslots);
498
kvm_arch_free_vm(kvm);
499
return ERR_PTR(r);
500
}
501
502
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
503
{
504
if (!memslot->dirty_bitmap)
505
return;
506
507
if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
508
vfree(memslot->dirty_bitmap_head);
509
else
510
kfree(memslot->dirty_bitmap_head);
511
512
memslot->dirty_bitmap = NULL;
513
memslot->dirty_bitmap_head = NULL;
514
}
515
516
/*
517
* Free any memory in @free but not in @dont.
518
*/
519
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
520
struct kvm_memory_slot *dont)
521
{
522
int i;
523
524
if (!dont || free->rmap != dont->rmap)
525
vfree(free->rmap);
526
527
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
528
kvm_destroy_dirty_bitmap(free);
529
530
531
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
532
if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
533
vfree(free->lpage_info[i]);
534
free->lpage_info[i] = NULL;
535
}
536
}
537
538
free->npages = 0;
539
free->rmap = NULL;
540
}
541
542
void kvm_free_physmem(struct kvm *kvm)
543
{
544
int i;
545
struct kvm_memslots *slots = kvm->memslots;
546
547
for (i = 0; i < slots->nmemslots; ++i)
548
kvm_free_physmem_slot(&slots->memslots[i], NULL);
549
550
kfree(kvm->memslots);
551
}
552
553
static void kvm_destroy_vm(struct kvm *kvm)
554
{
555
int i;
556
struct mm_struct *mm = kvm->mm;
557
558
kvm_arch_sync_events(kvm);
559
raw_spin_lock(&kvm_lock);
560
list_del(&kvm->vm_list);
561
raw_spin_unlock(&kvm_lock);
562
kvm_free_irq_routing(kvm);
563
for (i = 0; i < KVM_NR_BUSES; i++)
564
kvm_io_bus_destroy(kvm->buses[i]);
565
kvm_coalesced_mmio_free(kvm);
566
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
567
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
568
#else
569
kvm_arch_flush_shadow(kvm);
570
#endif
571
kvm_arch_destroy_vm(kvm);
572
kvm_free_physmem(kvm);
573
cleanup_srcu_struct(&kvm->srcu);
574
kvm_arch_free_vm(kvm);
575
hardware_disable_all();
576
mmdrop(mm);
577
}
578
579
void kvm_get_kvm(struct kvm *kvm)
580
{
581
atomic_inc(&kvm->users_count);
582
}
583
EXPORT_SYMBOL_GPL(kvm_get_kvm);
584
585
void kvm_put_kvm(struct kvm *kvm)
586
{
587
if (atomic_dec_and_test(&kvm->users_count))
588
kvm_destroy_vm(kvm);
589
}
590
EXPORT_SYMBOL_GPL(kvm_put_kvm);
591
592
593
static int kvm_vm_release(struct inode *inode, struct file *filp)
594
{
595
struct kvm *kvm = filp->private_data;
596
597
kvm_irqfd_release(kvm);
598
599
kvm_put_kvm(kvm);
600
return 0;
601
}
602
603
#ifndef CONFIG_S390
604
/*
605
* Allocation size is twice as large as the actual dirty bitmap size.
606
* This makes it possible to do double buffering: see x86's
607
* kvm_vm_ioctl_get_dirty_log().
608
*/
609
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
610
{
611
unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
612
613
if (dirty_bytes > PAGE_SIZE)
614
memslot->dirty_bitmap = vzalloc(dirty_bytes);
615
else
616
memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
617
618
if (!memslot->dirty_bitmap)
619
return -ENOMEM;
620
621
memslot->dirty_bitmap_head = memslot->dirty_bitmap;
622
return 0;
623
}
624
#endif /* !CONFIG_S390 */
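/*
 * Worked example for the allocation above (assuming a 4 KiB PAGE_SIZE):
 * a 1 GiB slot covers 262144 pages, so kvm_dirty_bitmap_bytes() is
 * 262144 / 8 = 32 KiB and the double-buffered allocation is 64 KiB,
 * which exceeds PAGE_SIZE and therefore comes from vzalloc() rather
 * than kzalloc().
 */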
625
626
/*
627
* Allocate some memory and give it an address in the guest physical address
628
* space.
629
*
630
* Discontiguous memory is allowed, mostly for framebuffers.
631
*
632
* Must be called holding mmap_sem for write.
633
*/
634
int __kvm_set_memory_region(struct kvm *kvm,
635
struct kvm_userspace_memory_region *mem,
636
int user_alloc)
637
{
638
int r;
639
gfn_t base_gfn;
640
unsigned long npages;
641
unsigned long i;
642
struct kvm_memory_slot *memslot;
643
struct kvm_memory_slot old, new;
644
struct kvm_memslots *slots, *old_memslots;
645
646
r = -EINVAL;
647
/* General sanity checks */
648
if (mem->memory_size & (PAGE_SIZE - 1))
649
goto out;
650
if (mem->guest_phys_addr & (PAGE_SIZE - 1))
651
goto out;
652
/* We can read the guest memory with __xxx_user() later on. */
653
if (user_alloc &&
654
((mem->userspace_addr & (PAGE_SIZE - 1)) ||
655
!access_ok(VERIFY_WRITE,
656
(void __user *)(unsigned long)mem->userspace_addr,
657
mem->memory_size)))
658
goto out;
659
if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
660
goto out;
661
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
662
goto out;
663
664
memslot = &kvm->memslots->memslots[mem->slot];
665
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
666
npages = mem->memory_size >> PAGE_SHIFT;
667
668
r = -EINVAL;
669
if (npages > KVM_MEM_MAX_NR_PAGES)
670
goto out;
671
672
if (!npages)
673
mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
674
675
new = old = *memslot;
676
677
new.id = mem->slot;
678
new.base_gfn = base_gfn;
679
new.npages = npages;
680
new.flags = mem->flags;
681
682
/* Disallow changing a memory slot's size. */
683
r = -EINVAL;
684
if (npages && old.npages && npages != old.npages)
685
goto out_free;
686
687
/* Check for overlaps */
688
r = -EEXIST;
689
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
690
struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
691
692
if (s == memslot || !s->npages)
693
continue;
694
if (!((base_gfn + npages <= s->base_gfn) ||
695
(base_gfn >= s->base_gfn + s->npages)))
696
goto out_free;
697
}
698
699
/* Free page dirty bitmap if unneeded */
700
if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
701
new.dirty_bitmap = NULL;
702
703
r = -ENOMEM;
704
705
/* Allocate if a slot is being created */
706
#ifndef CONFIG_S390
707
if (npages && !new.rmap) {
708
new.rmap = vzalloc(npages * sizeof(*new.rmap));
709
710
if (!new.rmap)
711
goto out_free;
712
713
new.user_alloc = user_alloc;
714
new.userspace_addr = mem->userspace_addr;
715
}
716
if (!npages)
717
goto skip_lpage;
718
719
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
720
unsigned long ugfn;
721
unsigned long j;
722
int lpages;
723
int level = i + 2;
724
725
/* Avoid unused variable warning if no large pages */
726
(void)level;
727
728
if (new.lpage_info[i])
729
continue;
730
731
lpages = 1 + ((base_gfn + npages - 1)
732
>> KVM_HPAGE_GFN_SHIFT(level));
733
lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
734
735
new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
736
737
if (!new.lpage_info[i])
738
goto out_free;
739
740
if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
741
new.lpage_info[i][0].write_count = 1;
742
if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
743
new.lpage_info[i][lpages - 1].write_count = 1;
744
ugfn = new.userspace_addr >> PAGE_SHIFT;
745
/*
746
* If the gfn and userspace address are not aligned wrt each
747
* other, or if explicitly asked to, disable large page
748
* support for this slot
749
*/
750
if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
751
!largepages_enabled)
752
for (j = 0; j < lpages; ++j)
753
new.lpage_info[i][j].write_count = 1;
754
}
755
756
skip_lpage:
757
758
/* Allocate page dirty bitmap if needed */
759
if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
760
if (kvm_create_dirty_bitmap(&new) < 0)
761
goto out_free;
762
/* destroy any largepage mappings for dirty tracking */
763
}
764
#else /* not defined CONFIG_S390 */
765
new.user_alloc = user_alloc;
766
if (user_alloc)
767
new.userspace_addr = mem->userspace_addr;
768
#endif /* not defined CONFIG_S390 */
769
770
if (!npages) {
771
r = -ENOMEM;
772
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
773
if (!slots)
774
goto out_free;
775
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
776
if (mem->slot >= slots->nmemslots)
777
slots->nmemslots = mem->slot + 1;
778
slots->generation++;
779
slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
780
781
old_memslots = kvm->memslots;
782
rcu_assign_pointer(kvm->memslots, slots);
783
synchronize_srcu_expedited(&kvm->srcu);
784
/* From this point no new shadow pages pointing to a deleted
785
* memslot will be created.
786
*
787
* validation of sp->gfn happens in:
788
* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
789
* - kvm_is_visible_gfn (mmu_check_roots)
790
*/
791
kvm_arch_flush_shadow(kvm);
792
kfree(old_memslots);
793
}
794
795
r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
796
if (r)
797
goto out_free;
798
799
/* map the pages in iommu page table */
800
if (npages) {
801
r = kvm_iommu_map_pages(kvm, &new);
802
if (r)
803
goto out_free;
804
}
805
806
r = -ENOMEM;
807
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
808
if (!slots)
809
goto out_free;
810
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
811
if (mem->slot >= slots->nmemslots)
812
slots->nmemslots = mem->slot + 1;
813
slots->generation++;
814
815
/* actual memory is freed via old in kvm_free_physmem_slot below */
816
if (!npages) {
817
new.rmap = NULL;
818
new.dirty_bitmap = NULL;
819
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
820
new.lpage_info[i] = NULL;
821
}
822
823
slots->memslots[mem->slot] = new;
824
old_memslots = kvm->memslots;
825
rcu_assign_pointer(kvm->memslots, slots);
826
synchronize_srcu_expedited(&kvm->srcu);
827
828
kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
829
830
kvm_free_physmem_slot(&old, &new);
831
kfree(old_memslots);
832
833
return 0;
834
835
out_free:
836
kvm_free_physmem_slot(&new, &old);
837
out:
838
return r;
839
840
}
841
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
842
843
int kvm_set_memory_region(struct kvm *kvm,
844
struct kvm_userspace_memory_region *mem,
845
int user_alloc)
846
{
847
int r;
848
849
mutex_lock(&kvm->slots_lock);
850
r = __kvm_set_memory_region(kvm, mem, user_alloc);
851
mutex_unlock(&kvm->slots_lock);
852
return r;
853
}
854
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
855
856
int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
857
struct
858
kvm_userspace_memory_region *mem,
859
int user_alloc)
860
{
861
if (mem->slot >= KVM_MEMORY_SLOTS)
862
return -EINVAL;
863
return kvm_set_memory_region(kvm, mem, user_alloc);
864
}
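/*
 * Userspace-side sketch (hypothetical VMM code; vm_fd, ram_size and ram_ptr
 * are assumptions): guest RAM is registered through the
 * KVM_SET_USER_MEMORY_REGION ioctl that lands in the handler above:
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0,
 *		.memory_size     = ram_size,
 *		.userspace_addr  = (__u64)(unsigned long)ram_ptr,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 */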
865
866
int kvm_get_dirty_log(struct kvm *kvm,
867
struct kvm_dirty_log *log, int *is_dirty)
868
{
869
struct kvm_memory_slot *memslot;
870
int r, i;
871
unsigned long n;
872
unsigned long any = 0;
873
874
r = -EINVAL;
875
if (log->slot >= KVM_MEMORY_SLOTS)
876
goto out;
877
878
memslot = &kvm->memslots->memslots[log->slot];
879
r = -ENOENT;
880
if (!memslot->dirty_bitmap)
881
goto out;
882
883
n = kvm_dirty_bitmap_bytes(memslot);
884
885
for (i = 0; !any && i < n/sizeof(long); ++i)
886
any = memslot->dirty_bitmap[i];
887
888
r = -EFAULT;
889
if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
890
goto out;
891
892
if (any)
893
*is_dirty = 1;
894
895
r = 0;
896
out:
897
return r;
898
}
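/*
 * Userspace-side sketch (hypothetical; vm_fd and bitmap are assumptions):
 * the dirty log for a slot is fetched with KVM_GET_DIRTY_LOG, which reaches
 * the helper above through the arch kvm_vm_ioctl_get_dirty_log():
 *
 *	struct kvm_dirty_log log = {
 *		.slot         = 0,
 *		.dirty_bitmap = bitmap,	/* kvm_dirty_bitmap_bytes() buffer */
 *	};
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 */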
899
900
void kvm_disable_largepages(void)
901
{
902
largepages_enabled = false;
903
}
904
EXPORT_SYMBOL_GPL(kvm_disable_largepages);
905
906
int is_error_page(struct page *page)
907
{
908
return page == bad_page || page == hwpoison_page || page == fault_page;
909
}
910
EXPORT_SYMBOL_GPL(is_error_page);
911
912
int is_error_pfn(pfn_t pfn)
913
{
914
return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
915
}
916
EXPORT_SYMBOL_GPL(is_error_pfn);
917
918
int is_hwpoison_pfn(pfn_t pfn)
919
{
920
return pfn == hwpoison_pfn;
921
}
922
EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
923
924
int is_fault_pfn(pfn_t pfn)
925
{
926
return pfn == fault_pfn;
927
}
928
EXPORT_SYMBOL_GPL(is_fault_pfn);
929
930
static inline unsigned long bad_hva(void)
931
{
932
return PAGE_OFFSET;
933
}
934
935
int kvm_is_error_hva(unsigned long addr)
936
{
937
return addr == bad_hva();
938
}
939
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
940
941
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
942
gfn_t gfn)
943
{
944
int i;
945
946
for (i = 0; i < slots->nmemslots; ++i) {
947
struct kvm_memory_slot *memslot = &slots->memslots[i];
948
949
if (gfn >= memslot->base_gfn
950
&& gfn < memslot->base_gfn + memslot->npages)
951
return memslot;
952
}
953
return NULL;
954
}
955
956
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
957
{
958
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
959
}
960
EXPORT_SYMBOL_GPL(gfn_to_memslot);
961
962
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
963
{
964
int i;
965
struct kvm_memslots *slots = kvm_memslots(kvm);
966
967
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
968
struct kvm_memory_slot *memslot = &slots->memslots[i];
969
970
if (memslot->flags & KVM_MEMSLOT_INVALID)
971
continue;
972
973
if (gfn >= memslot->base_gfn
974
&& gfn < memslot->base_gfn + memslot->npages)
975
return 1;
976
}
977
return 0;
978
}
979
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
980
981
unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
982
{
983
struct vm_area_struct *vma;
984
unsigned long addr, size;
985
986
size = PAGE_SIZE;
987
988
addr = gfn_to_hva(kvm, gfn);
989
if (kvm_is_error_hva(addr))
990
return PAGE_SIZE;
991
992
down_read(&current->mm->mmap_sem);
993
vma = find_vma(current->mm, addr);
994
if (!vma)
995
goto out;
996
997
size = vma_kernel_pagesize(vma);
998
999
out:
1000
up_read(&current->mm->mmap_sem);
1001
1002
return size;
1003
}
1004
1005
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1006
gfn_t *nr_pages)
1007
{
1008
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1009
return bad_hva();
1010
1011
if (nr_pages)
1012
*nr_pages = slot->npages - (gfn - slot->base_gfn);
1013
1014
return gfn_to_hva_memslot(slot, gfn);
1015
}
1016
1017
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1018
{
1019
return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1020
}
1021
EXPORT_SYMBOL_GPL(gfn_to_hva);
1022
1023
static pfn_t get_fault_pfn(void)
1024
{
1025
get_page(fault_page);
1026
return fault_pfn;
1027
}
1028
1029
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1030
unsigned long start, int write, struct page **page)
1031
{
1032
int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
1033
1034
if (write)
1035
flags |= FOLL_WRITE;
1036
1037
return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1038
}
1039
1040
static inline int check_user_page_hwpoison(unsigned long addr)
1041
{
1042
int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
1043
1044
rc = __get_user_pages(current, current->mm, addr, 1,
1045
flags, NULL, NULL, NULL);
1046
return rc == -EHWPOISON;
1047
}
1048
1049
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
1050
bool *async, bool write_fault, bool *writable)
1051
{
1052
struct page *page[1];
1053
int npages = 0;
1054
pfn_t pfn;
1055
1056
/* we can do it either atomically or asynchronously, not both */
1057
BUG_ON(atomic && async);
1058
1059
BUG_ON(!write_fault && !writable);
1060
1061
if (writable)
1062
*writable = true;
1063
1064
if (atomic || async)
1065
npages = __get_user_pages_fast(addr, 1, 1, page);
1066
1067
if (unlikely(npages != 1) && !atomic) {
1068
might_sleep();
1069
1070
if (writable)
1071
*writable = write_fault;
1072
1073
if (async) {
1074
down_read(&current->mm->mmap_sem);
1075
npages = get_user_page_nowait(current, current->mm,
1076
addr, write_fault, page);
1077
up_read(&current->mm->mmap_sem);
1078
} else
1079
npages = get_user_pages_fast(addr, 1, write_fault,
1080
page);
1081
1082
/* map read fault as writable if possible */
1083
if (unlikely(!write_fault) && npages == 1) {
1084
struct page *wpage[1];
1085
1086
npages = __get_user_pages_fast(addr, 1, 1, wpage);
1087
if (npages == 1) {
1088
*writable = true;
1089
put_page(page[0]);
1090
page[0] = wpage[0];
1091
}
1092
npages = 1;
1093
}
1094
}
1095
1096
if (unlikely(npages != 1)) {
1097
struct vm_area_struct *vma;
1098
1099
if (atomic)
1100
return get_fault_pfn();
1101
1102
down_read(&current->mm->mmap_sem);
1103
if (npages == -EHWPOISON ||
1104
(!async && check_user_page_hwpoison(addr))) {
1105
up_read(&current->mm->mmap_sem);
1106
get_page(hwpoison_page);
1107
return page_to_pfn(hwpoison_page);
1108
}
1109
1110
vma = find_vma_intersection(current->mm, addr, addr+1);
1111
1112
if (vma == NULL)
1113
pfn = get_fault_pfn();
1114
else if ((vma->vm_flags & VM_PFNMAP)) {
1115
pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1116
vma->vm_pgoff;
1117
BUG_ON(!kvm_is_mmio_pfn(pfn));
1118
} else {
1119
if (async && (vma->vm_flags & VM_WRITE))
1120
*async = true;
1121
pfn = get_fault_pfn();
1122
}
1123
up_read(&current->mm->mmap_sem);
1124
} else
1125
pfn = page_to_pfn(page[0]);
1126
1127
return pfn;
1128
}
1129
1130
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
1131
{
1132
return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
1133
}
1134
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1135
1136
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1137
bool write_fault, bool *writable)
1138
{
1139
unsigned long addr;
1140
1141
if (async)
1142
*async = false;
1143
1144
addr = gfn_to_hva(kvm, gfn);
1145
if (kvm_is_error_hva(addr)) {
1146
get_page(bad_page);
1147
return page_to_pfn(bad_page);
1148
}
1149
1150
return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
1151
}
1152
1153
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1154
{
1155
return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1156
}
1157
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1158
1159
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1160
bool write_fault, bool *writable)
1161
{
1162
return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1163
}
1164
EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1165
1166
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1167
{
1168
return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1169
}
1170
EXPORT_SYMBOL_GPL(gfn_to_pfn);
1171
1172
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1173
bool *writable)
1174
{
1175
return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1176
}
1177
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1178
1179
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1180
struct kvm_memory_slot *slot, gfn_t gfn)
1181
{
1182
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1183
return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1184
}
1185
1186
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1187
int nr_pages)
1188
{
1189
unsigned long addr;
1190
gfn_t entry;
1191
1192
addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
1193
if (kvm_is_error_hva(addr))
1194
return -1;
1195
1196
if (entry < nr_pages)
1197
return 0;
1198
1199
return __get_user_pages_fast(addr, nr_pages, 1, pages);
1200
}
1201
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1202
1203
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1204
{
1205
pfn_t pfn;
1206
1207
pfn = gfn_to_pfn(kvm, gfn);
1208
if (!kvm_is_mmio_pfn(pfn))
1209
return pfn_to_page(pfn);
1210
1211
WARN_ON(kvm_is_mmio_pfn(pfn));
1212
1213
get_page(bad_page);
1214
return bad_page;
1215
}
1216
1217
EXPORT_SYMBOL_GPL(gfn_to_page);
1218
1219
void kvm_release_page_clean(struct page *page)
1220
{
1221
kvm_release_pfn_clean(page_to_pfn(page));
1222
}
1223
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1224
1225
void kvm_release_pfn_clean(pfn_t pfn)
1226
{
1227
if (!kvm_is_mmio_pfn(pfn))
1228
put_page(pfn_to_page(pfn));
1229
}
1230
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1231
1232
void kvm_release_page_dirty(struct page *page)
1233
{
1234
kvm_release_pfn_dirty(page_to_pfn(page));
1235
}
1236
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1237
1238
void kvm_release_pfn_dirty(pfn_t pfn)
1239
{
1240
kvm_set_pfn_dirty(pfn);
1241
kvm_release_pfn_clean(pfn);
1242
}
1243
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1244
1245
void kvm_set_page_dirty(struct page *page)
1246
{
1247
kvm_set_pfn_dirty(page_to_pfn(page));
1248
}
1249
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1250
1251
void kvm_set_pfn_dirty(pfn_t pfn)
1252
{
1253
if (!kvm_is_mmio_pfn(pfn)) {
1254
struct page *page = pfn_to_page(pfn);
1255
if (!PageReserved(page))
1256
SetPageDirty(page);
1257
}
1258
}
1259
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1260
1261
void kvm_set_pfn_accessed(pfn_t pfn)
1262
{
1263
if (!kvm_is_mmio_pfn(pfn))
1264
mark_page_accessed(pfn_to_page(pfn));
1265
}
1266
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1267
1268
void kvm_get_pfn(pfn_t pfn)
1269
{
1270
if (!kvm_is_mmio_pfn(pfn))
1271
get_page(pfn_to_page(pfn));
1272
}
1273
EXPORT_SYMBOL_GPL(kvm_get_pfn);
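/*
 * Typical pattern for the helpers above (a sketch, not a call site in this
 * file): translate a gfn, use the page, then drop the reference with the
 * clean or dirty variant depending on whether the memory was written:
 *
 *	pfn_t pfn = gfn_to_pfn(kvm, gfn);
 *	if (is_error_pfn(pfn))
 *		return -EFAULT;
 *	...access the page...
 *	kvm_release_pfn_dirty(pfn);	/* or kvm_release_pfn_clean(pfn) */
 */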
1274
1275
static int next_segment(unsigned long len, int offset)
1276
{
1277
if (len > PAGE_SIZE - offset)
1278
return PAGE_SIZE - offset;
1279
else
1280
return len;
1281
}
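/*
 * Worked example (assuming a 4 KiB page): copying 5000 bytes starting at a
 * page offset of 3000 is split by next_segment() into a 1096-byte chunk that
 * finishes the first page and a 3904-byte chunk in the following page;
 * kvm_read_guest() and kvm_write_guest() below loop in exactly this way.
 */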
1282
1283
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1284
int len)
1285
{
1286
int r;
1287
unsigned long addr;
1288
1289
addr = gfn_to_hva(kvm, gfn);
1290
if (kvm_is_error_hva(addr))
1291
return -EFAULT;
1292
r = __copy_from_user(data, (void __user *)addr + offset, len);
1293
if (r)
1294
return -EFAULT;
1295
return 0;
1296
}
1297
EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1298
1299
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1300
{
1301
gfn_t gfn = gpa >> PAGE_SHIFT;
1302
int seg;
1303
int offset = offset_in_page(gpa);
1304
int ret;
1305
1306
while ((seg = next_segment(len, offset)) != 0) {
1307
ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1308
if (ret < 0)
1309
return ret;
1310
offset = 0;
1311
len -= seg;
1312
data += seg;
1313
++gfn;
1314
}
1315
return 0;
1316
}
1317
EXPORT_SYMBOL_GPL(kvm_read_guest);
1318
1319
int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1320
unsigned long len)
1321
{
1322
int r;
1323
unsigned long addr;
1324
gfn_t gfn = gpa >> PAGE_SHIFT;
1325
int offset = offset_in_page(gpa);
1326
1327
addr = gfn_to_hva(kvm, gfn);
1328
if (kvm_is_error_hva(addr))
1329
return -EFAULT;
1330
pagefault_disable();
1331
r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1332
pagefault_enable();
1333
if (r)
1334
return -EFAULT;
1335
return 0;
1336
}
1337
EXPORT_SYMBOL(kvm_read_guest_atomic);
1338
1339
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1340
int offset, int len)
1341
{
1342
int r;
1343
unsigned long addr;
1344
1345
addr = gfn_to_hva(kvm, gfn);
1346
if (kvm_is_error_hva(addr))
1347
return -EFAULT;
1348
r = copy_to_user((void __user *)addr + offset, data, len);
1349
if (r)
1350
return -EFAULT;
1351
mark_page_dirty(kvm, gfn);
1352
return 0;
1353
}
1354
EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1355
1356
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1357
unsigned long len)
1358
{
1359
gfn_t gfn = gpa >> PAGE_SHIFT;
1360
int seg;
1361
int offset = offset_in_page(gpa);
1362
int ret;
1363
1364
while ((seg = next_segment(len, offset)) != 0) {
1365
ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1366
if (ret < 0)
1367
return ret;
1368
offset = 0;
1369
len -= seg;
1370
data += seg;
1371
++gfn;
1372
}
1373
return 0;
1374
}
1375
1376
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1377
gpa_t gpa)
1378
{
1379
struct kvm_memslots *slots = kvm_memslots(kvm);
1380
int offset = offset_in_page(gpa);
1381
gfn_t gfn = gpa >> PAGE_SHIFT;
1382
1383
ghc->gpa = gpa;
1384
ghc->generation = slots->generation;
1385
ghc->memslot = __gfn_to_memslot(slots, gfn);
1386
ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1387
if (!kvm_is_error_hva(ghc->hva))
1388
ghc->hva += offset;
1389
else
1390
return -EFAULT;
1391
1392
return 0;
1393
}
1394
EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1395
1396
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1397
void *data, unsigned long len)
1398
{
1399
struct kvm_memslots *slots = kvm_memslots(kvm);
1400
int r;
1401
1402
if (slots->generation != ghc->generation)
1403
kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1404
1405
if (kvm_is_error_hva(ghc->hva))
1406
return -EFAULT;
1407
1408
r = copy_to_user((void __user *)ghc->hva, data, len);
1409
if (r)
1410
return -EFAULT;
1411
mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1412
1413
return 0;
1414
}
1415
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
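/*
 * Usage sketch (hypothetical caller state; ghc, gpa and val are
 * assumptions): the cache is initialised once and then reused for repeated
 * writes to the same guest address; the generation check above re-resolves
 * the hva whenever the memslots have changed:
 *
 *	struct gfn_to_hva_cache ghc;
 *	kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa);
 *	...
 *	kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
 */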
1416
1417
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1418
{
1419
return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1420
offset, len);
1421
}
1422
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1423
1424
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1425
{
1426
gfn_t gfn = gpa >> PAGE_SHIFT;
1427
int seg;
1428
int offset = offset_in_page(gpa);
1429
int ret;
1430
1431
while ((seg = next_segment(len, offset)) != 0) {
1432
ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1433
if (ret < 0)
1434
return ret;
1435
offset = 0;
1436
len -= seg;
1437
++gfn;
1438
}
1439
return 0;
1440
}
1441
EXPORT_SYMBOL_GPL(kvm_clear_guest);
1442
1443
void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1444
gfn_t gfn)
1445
{
1446
if (memslot && memslot->dirty_bitmap) {
1447
unsigned long rel_gfn = gfn - memslot->base_gfn;
1448
1449
__set_bit_le(rel_gfn, memslot->dirty_bitmap);
1450
}
1451
}
1452
1453
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1454
{
1455
struct kvm_memory_slot *memslot;
1456
1457
memslot = gfn_to_memslot(kvm, gfn);
1458
mark_page_dirty_in_slot(kvm, memslot, gfn);
1459
}
1460
1461
/*
1462
* The vCPU has executed a HLT instruction with in-kernel mode enabled.
1463
*/
1464
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1465
{
1466
DEFINE_WAIT(wait);
1467
1468
for (;;) {
1469
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1470
1471
if (kvm_arch_vcpu_runnable(vcpu)) {
1472
kvm_make_request(KVM_REQ_UNHALT, vcpu);
1473
break;
1474
}
1475
if (kvm_cpu_has_pending_timer(vcpu))
1476
break;
1477
if (signal_pending(current))
1478
break;
1479
1480
schedule();
1481
}
1482
1483
finish_wait(&vcpu->wq, &wait);
1484
}
1485
1486
void kvm_resched(struct kvm_vcpu *vcpu)
1487
{
1488
if (!need_resched())
1489
return;
1490
cond_resched();
1491
}
1492
EXPORT_SYMBOL_GPL(kvm_resched);
1493
1494
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1495
{
1496
struct kvm *kvm = me->kvm;
1497
struct kvm_vcpu *vcpu;
1498
int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1499
int yielded = 0;
1500
int pass;
1501
int i;
1502
1503
/*
1504
* We boost the priority of a VCPU that is runnable but not
1505
* currently running, because it got preempted by something
1506
* else and called schedule in __vcpu_run. Hopefully that
1507
* VCPU is holding the lock that we need and will release it.
1508
* We approximate round-robin by starting at the last boosted VCPU.
1509
*/
1510
for (pass = 0; pass < 2 && !yielded; pass++) {
1511
kvm_for_each_vcpu(i, vcpu, kvm) {
1512
struct task_struct *task = NULL;
1513
struct pid *pid;
1514
if (!pass && i < last_boosted_vcpu) {
1515
i = last_boosted_vcpu;
1516
continue;
1517
} else if (pass && i > last_boosted_vcpu)
1518
break;
1519
if (vcpu == me)
1520
continue;
1521
if (waitqueue_active(&vcpu->wq))
1522
continue;
1523
rcu_read_lock();
1524
pid = rcu_dereference(vcpu->pid);
1525
if (pid)
1526
task = get_pid_task(vcpu->pid, PIDTYPE_PID);
1527
rcu_read_unlock();
1528
if (!task)
1529
continue;
1530
if (task->flags & PF_VCPU) {
1531
put_task_struct(task);
1532
continue;
1533
}
1534
if (yield_to(task, 1)) {
1535
put_task_struct(task);
1536
kvm->last_boosted_vcpu = i;
1537
yielded = 1;
1538
break;
1539
}
1540
put_task_struct(task);
1541
}
1542
}
1543
}
1544
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1545
1546
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1547
{
1548
struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1549
struct page *page;
1550
1551
if (vmf->pgoff == 0)
1552
page = virt_to_page(vcpu->run);
1553
#ifdef CONFIG_X86
1554
else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
1555
page = virt_to_page(vcpu->arch.pio_data);
1556
#endif
1557
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1558
else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1559
page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1560
#endif
1561
else
1562
return VM_FAULT_SIGBUS;
1563
get_page(page);
1564
vmf->page = page;
1565
return 0;
1566
}
1567
1568
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1569
.fault = kvm_vcpu_fault,
1570
};
1571
1572
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1573
{
1574
vma->vm_ops = &kvm_vcpu_vm_ops;
1575
return 0;
1576
}
1577
1578
static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1579
{
1580
struct kvm_vcpu *vcpu = filp->private_data;
1581
1582
kvm_put_kvm(vcpu->kvm);
1583
return 0;
1584
}
1585
1586
static struct file_operations kvm_vcpu_fops = {
1587
.release = kvm_vcpu_release,
1588
.unlocked_ioctl = kvm_vcpu_ioctl,
1589
.compat_ioctl = kvm_vcpu_ioctl,
1590
.mmap = kvm_vcpu_mmap,
1591
.llseek = noop_llseek,
1592
};
1593
1594
/*
1595
* Allocates an inode for the vcpu.
1596
*/
1597
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1598
{
1599
return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1600
}
1601
1602
/*
1603
* Creates some virtual cpus. Good luck creating more than one.
1604
*/
1605
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1606
{
1607
int r;
1608
struct kvm_vcpu *vcpu, *v;
1609
1610
vcpu = kvm_arch_vcpu_create(kvm, id);
1611
if (IS_ERR(vcpu))
1612
return PTR_ERR(vcpu);
1613
1614
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1615
1616
r = kvm_arch_vcpu_setup(vcpu);
1617
if (r)
1618
return r;
1619
1620
mutex_lock(&kvm->lock);
1621
if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1622
r = -EINVAL;
1623
goto vcpu_destroy;
1624
}
1625
1626
kvm_for_each_vcpu(r, v, kvm)
1627
if (v->vcpu_id == id) {
1628
r = -EEXIST;
1629
goto vcpu_destroy;
1630
}
1631
1632
BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1633
1634
/* Now it's all set up, let userspace reach it */
1635
kvm_get_kvm(kvm);
1636
r = create_vcpu_fd(vcpu);
1637
if (r < 0) {
1638
kvm_put_kvm(kvm);
1639
goto vcpu_destroy;
1640
}
1641
1642
kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1643
smp_wmb();
1644
atomic_inc(&kvm->online_vcpus);
1645
1646
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1647
if (kvm->bsp_vcpu_id == id)
1648
kvm->bsp_vcpu = vcpu;
1649
#endif
1650
mutex_unlock(&kvm->lock);
1651
return r;
1652
1653
vcpu_destroy:
1654
mutex_unlock(&kvm->lock);
1655
kvm_arch_vcpu_destroy(vcpu);
1656
return r;
1657
}
1658
1659
static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1660
{
1661
if (sigset) {
1662
sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1663
vcpu->sigset_active = 1;
1664
vcpu->sigset = *sigset;
1665
} else
1666
vcpu->sigset_active = 0;
1667
return 0;
1668
}
1669
1670
static long kvm_vcpu_ioctl(struct file *filp,
1671
unsigned int ioctl, unsigned long arg)
1672
{
1673
struct kvm_vcpu *vcpu = filp->private_data;
1674
void __user *argp = (void __user *)arg;
1675
int r;
1676
struct kvm_fpu *fpu = NULL;
1677
struct kvm_sregs *kvm_sregs = NULL;
1678
1679
if (vcpu->kvm->mm != current->mm)
1680
return -EIO;
1681
1682
#if defined(CONFIG_S390) || defined(CONFIG_PPC)
1683
/*
1684
* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1685
* so vcpu_load() would break it.
1686
*/
1687
if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1688
return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1689
#endif
1690
1691
1692
vcpu_load(vcpu);
1693
switch (ioctl) {
1694
case KVM_RUN:
1695
r = -EINVAL;
1696
if (arg)
1697
goto out;
1698
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1699
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1700
break;
1701
case KVM_GET_REGS: {
1702
struct kvm_regs *kvm_regs;
1703
1704
r = -ENOMEM;
1705
kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1706
if (!kvm_regs)
1707
goto out;
1708
r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1709
if (r)
1710
goto out_free1;
1711
r = -EFAULT;
1712
if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1713
goto out_free1;
1714
r = 0;
1715
out_free1:
1716
kfree(kvm_regs);
1717
break;
1718
}
1719
case KVM_SET_REGS: {
1720
struct kvm_regs *kvm_regs;
1721
1722
r = -ENOMEM;
1723
kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1724
if (!kvm_regs)
1725
goto out;
1726
r = -EFAULT;
1727
if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
1728
goto out_free2;
1729
r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1730
if (r)
1731
goto out_free2;
1732
r = 0;
1733
out_free2:
1734
kfree(kvm_regs);
1735
break;
1736
}
1737
case KVM_GET_SREGS: {
1738
kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1739
r = -ENOMEM;
1740
if (!kvm_sregs)
1741
goto out;
1742
r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1743
if (r)
1744
goto out;
1745
r = -EFAULT;
1746
if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1747
goto out;
1748
r = 0;
1749
break;
1750
}
1751
case KVM_SET_SREGS: {
1752
kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1753
r = -ENOMEM;
1754
if (!kvm_sregs)
1755
goto out;
1756
r = -EFAULT;
1757
if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
1758
goto out;
1759
r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1760
if (r)
1761
goto out;
1762
r = 0;
1763
break;
1764
}
1765
case KVM_GET_MP_STATE: {
1766
struct kvm_mp_state mp_state;
1767
1768
r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1769
if (r)
1770
goto out;
1771
r = -EFAULT;
1772
if (copy_to_user(argp, &mp_state, sizeof mp_state))
1773
goto out;
1774
r = 0;
1775
break;
1776
}
1777
case KVM_SET_MP_STATE: {
1778
struct kvm_mp_state mp_state;
1779
1780
r = -EFAULT;
1781
if (copy_from_user(&mp_state, argp, sizeof mp_state))
1782
goto out;
1783
r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1784
if (r)
1785
goto out;
1786
r = 0;
1787
break;
1788
}
1789
case KVM_TRANSLATE: {
1790
struct kvm_translation tr;
1791
1792
r = -EFAULT;
1793
if (copy_from_user(&tr, argp, sizeof tr))
1794
goto out;
1795
r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
1796
if (r)
1797
goto out;
1798
r = -EFAULT;
1799
if (copy_to_user(argp, &tr, sizeof tr))
1800
goto out;
1801
r = 0;
1802
break;
1803
}
1804
case KVM_SET_GUEST_DEBUG: {
1805
struct kvm_guest_debug dbg;
1806
1807
r = -EFAULT;
1808
if (copy_from_user(&dbg, argp, sizeof dbg))
1809
goto out;
1810
r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
1811
if (r)
1812
goto out;
1813
r = 0;
1814
break;
1815
}
1816
case KVM_SET_SIGNAL_MASK: {
1817
struct kvm_signal_mask __user *sigmask_arg = argp;
1818
struct kvm_signal_mask kvm_sigmask;
1819
sigset_t sigset, *p;
1820
1821
p = NULL;
1822
if (argp) {
1823
r = -EFAULT;
1824
if (copy_from_user(&kvm_sigmask, argp,
1825
sizeof kvm_sigmask))
1826
goto out;
1827
r = -EINVAL;
1828
if (kvm_sigmask.len != sizeof sigset)
1829
goto out;
1830
r = -EFAULT;
1831
if (copy_from_user(&sigset, sigmask_arg->sigset,
1832
sizeof sigset))
1833
goto out;
1834
p = &sigset;
1835
}
1836
r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1837
break;
1838
}
1839
case KVM_GET_FPU: {
1840
fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1841
r = -ENOMEM;
1842
if (!fpu)
1843
goto out;
1844
r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
1845
if (r)
1846
goto out;
1847
r = -EFAULT;
1848
if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
1849
goto out;
1850
r = 0;
1851
break;
1852
}
1853
case KVM_SET_FPU: {
1854
fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1855
r = -ENOMEM;
1856
if (!fpu)
1857
goto out;
1858
r = -EFAULT;
1859
if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
1860
goto out;
1861
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1862
if (r)
1863
goto out;
1864
r = 0;
1865
break;
1866
}
1867
default:
1868
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1869
}
1870
out:
1871
vcpu_put(vcpu);
1872
kfree(fpu);
1873
kfree(kvm_sregs);
1874
return r;
1875
}
1876
1877
static long kvm_vm_ioctl(struct file *filp,
1878
unsigned int ioctl, unsigned long arg)
1879
{
1880
struct kvm *kvm = filp->private_data;
1881
void __user *argp = (void __user *)arg;
1882
int r;
1883
1884
if (kvm->mm != current->mm)
1885
return -EIO;
1886
switch (ioctl) {
1887
case KVM_CREATE_VCPU:
1888
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1889
if (r < 0)
1890
goto out;
1891
break;
1892
case KVM_SET_USER_MEMORY_REGION: {
1893
struct kvm_userspace_memory_region kvm_userspace_mem;
1894
1895
r = -EFAULT;
1896
if (copy_from_user(&kvm_userspace_mem, argp,
1897
sizeof kvm_userspace_mem))
1898
goto out;
1899
1900
r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1901
if (r)
1902
goto out;
1903
break;
1904
}
1905
case KVM_GET_DIRTY_LOG: {
1906
struct kvm_dirty_log log;
1907
1908
r = -EFAULT;
1909
if (copy_from_user(&log, argp, sizeof log))
1910
goto out;
1911
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1912
if (r)
1913
goto out;
1914
break;
1915
}
1916
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1917
case KVM_REGISTER_COALESCED_MMIO: {
1918
struct kvm_coalesced_mmio_zone zone;
1919
r = -EFAULT;
1920
if (copy_from_user(&zone, argp, sizeof zone))
1921
goto out;
1922
r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
1923
if (r)
1924
goto out;
1925
r = 0;
1926
break;
1927
}
1928
case KVM_UNREGISTER_COALESCED_MMIO: {
1929
struct kvm_coalesced_mmio_zone zone;
1930
r = -EFAULT;
1931
if (copy_from_user(&zone, argp, sizeof zone))
1932
goto out;
1933
r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
1934
if (r)
1935
goto out;
1936
r = 0;
1937
break;
1938
}
1939
#endif
1940
case KVM_IRQFD: {
1941
struct kvm_irqfd data;
1942
1943
r = -EFAULT;
1944
if (copy_from_user(&data, argp, sizeof data))
1945
goto out;
1946
r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
1947
break;
1948
}
1949
case KVM_IOEVENTFD: {
1950
struct kvm_ioeventfd data;
1951
1952
r = -EFAULT;
1953
if (copy_from_user(&data, argp, sizeof data))
1954
goto out;
1955
r = kvm_ioeventfd(kvm, &data);
1956
break;
1957
}
1958
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1959
case KVM_SET_BOOT_CPU_ID:
1960
r = 0;
1961
mutex_lock(&kvm->lock);
1962
if (atomic_read(&kvm->online_vcpus) != 0)
1963
r = -EBUSY;
1964
else
1965
kvm->bsp_vcpu_id = arg;
1966
mutex_unlock(&kvm->lock);
1967
break;
1968
#endif
1969
default:
1970
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1971
if (r == -ENOTTY)
1972
r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
1973
}
1974
out:
1975
return r;
1976
}
1977
1978
#ifdef CONFIG_COMPAT
1979
struct compat_kvm_dirty_log {
1980
__u32 slot;
1981
__u32 padding1;
1982
union {
1983
compat_uptr_t dirty_bitmap; /* one bit per page */
1984
__u64 padding2;
1985
};
1986
};
1987
1988
static long kvm_vm_compat_ioctl(struct file *filp,
1989
unsigned int ioctl, unsigned long arg)
1990
{
1991
struct kvm *kvm = filp->private_data;
1992
int r;
1993
1994
if (kvm->mm != current->mm)
1995
return -EIO;
1996
switch (ioctl) {
1997
case KVM_GET_DIRTY_LOG: {
1998
struct compat_kvm_dirty_log compat_log;
1999
struct kvm_dirty_log log;
2000
2001
r = -EFAULT;
2002
if (copy_from_user(&compat_log, (void __user *)arg,
2003
sizeof(compat_log)))
2004
goto out;
2005
log.slot = compat_log.slot;
2006
log.padding1 = compat_log.padding1;
2007
log.padding2 = compat_log.padding2;
2008
log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2009
2010
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2011
if (r)
2012
goto out;
2013
break;
2014
}
2015
default:
2016
r = kvm_vm_ioctl(filp, ioctl, arg);
2017
}
2018
2019
out:
2020
return r;
2021
}
2022
#endif
2023
2024
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2025
{
2026
struct page *page[1];
2027
unsigned long addr;
2028
int npages;
2029
gfn_t gfn = vmf->pgoff;
2030
struct kvm *kvm = vma->vm_file->private_data;
2031
2032
addr = gfn_to_hva(kvm, gfn);
2033
if (kvm_is_error_hva(addr))
2034
return VM_FAULT_SIGBUS;
2035
2036
npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
2037
NULL);
2038
if (unlikely(npages != 1))
2039
return VM_FAULT_SIGBUS;
2040
2041
vmf->page = page[0];
2042
return 0;
2043
}
2044
2045
static const struct vm_operations_struct kvm_vm_vm_ops = {
2046
.fault = kvm_vm_fault,
2047
};
2048
2049
static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2050
{
2051
vma->vm_ops = &kvm_vm_vm_ops;
2052
return 0;
2053
}
2054
2055
static struct file_operations kvm_vm_fops = {
2056
.release = kvm_vm_release,
2057
.unlocked_ioctl = kvm_vm_ioctl,
2058
#ifdef CONFIG_COMPAT
2059
.compat_ioctl = kvm_vm_compat_ioctl,
2060
#endif
2061
.mmap = kvm_vm_mmap,
2062
.llseek = noop_llseek,
2063
};
2064
2065
static int kvm_dev_ioctl_create_vm(void)
2066
{
2067
int r;
2068
struct kvm *kvm;
2069
2070
kvm = kvm_create_vm();
2071
if (IS_ERR(kvm))
2072
return PTR_ERR(kvm);
2073
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2074
r = kvm_coalesced_mmio_init(kvm);
2075
if (r < 0) {
2076
kvm_put_kvm(kvm);
2077
return r;
2078
}
2079
#endif
2080
r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
2081
if (r < 0)
2082
kvm_put_kvm(kvm);
2083
2084
return r;
2085
}
2086
2087
static long kvm_dev_ioctl_check_extension_generic(long arg)
2088
{
2089
switch (arg) {
2090
case KVM_CAP_USER_MEMORY:
2091
case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2092
case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2093
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2094
case KVM_CAP_SET_BOOT_CPU_ID:
2095
#endif
2096
case KVM_CAP_INTERNAL_ERROR_DATA:
2097
return 1;
2098
#ifdef CONFIG_HAVE_KVM_IRQCHIP
2099
case KVM_CAP_IRQ_ROUTING:
2100
return KVM_MAX_IRQ_ROUTES;
2101
#endif
2102
default:
2103
break;
2104
}
2105
return kvm_dev_ioctl_check_extension(arg);
2106
}
2107
2108
static long kvm_dev_ioctl(struct file *filp,
2109
unsigned int ioctl, unsigned long arg)
2110
{
2111
long r = -EINVAL;
2112
2113
switch (ioctl) {
2114
case KVM_GET_API_VERSION:
2115
r = -EINVAL;
2116
if (arg)
2117
goto out;
2118
r = KVM_API_VERSION;
2119
break;
2120
case KVM_CREATE_VM:
2121
r = -EINVAL;
2122
if (arg)
2123
goto out;
2124
r = kvm_dev_ioctl_create_vm();
2125
break;
2126
case KVM_CHECK_EXTENSION:
2127
r = kvm_dev_ioctl_check_extension_generic(arg);
2128
break;
2129
case KVM_GET_VCPU_MMAP_SIZE:
2130
r = -EINVAL;
2131
if (arg)
2132
goto out;
2133
r = PAGE_SIZE; /* struct kvm_run */
2134
#ifdef CONFIG_X86
2135
r += PAGE_SIZE; /* pio data page */
2136
#endif
2137
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2138
r += PAGE_SIZE; /* coalesced mmio ring page */
2139
#endif
2140
break;
2141
case KVM_TRACE_ENABLE:
2142
case KVM_TRACE_PAUSE:
2143
case KVM_TRACE_DISABLE:
2144
r = -EOPNOTSUPP;
2145
break;
2146
default:
2147
return kvm_arch_dev_ioctl(filp, ioctl, arg);
2148
}
2149
out:
2150
return r;
2151
}
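/*
 * Userspace-side sketch of the /dev/kvm entry points handled above
 * (illustrative only; error handling omitted, the fd names are assumptions):
 *
 *	int kvm_fd  = open("/dev/kvm", O_RDWR);
 *	int version = ioctl(kvm_fd, KVM_GET_API_VERSION, 0);
 *	int vm_fd   = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	long sz     = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *	ioctl(vcpu_fd, KVM_RUN, 0);
 */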
2152
2153
static struct file_operations kvm_chardev_ops = {
2154
.unlocked_ioctl = kvm_dev_ioctl,
2155
.compat_ioctl = kvm_dev_ioctl,
2156
.llseek = noop_llseek,
2157
};
2158
2159
static struct miscdevice kvm_dev = {
2160
KVM_MINOR,
2161
"kvm",
2162
&kvm_chardev_ops,
2163
};
2164
static void hardware_enable_nolock(void *junk)
{
	int cpu = raw_smp_processor_id();
	int r;

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;

	cpumask_set_cpu(cpu, cpus_hardware_enabled);

	r = kvm_arch_hardware_enable(NULL);

	if (r) {
		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
		atomic_inc(&hardware_enable_failed);
		printk(KERN_INFO "kvm: enabling virtualization on "
				 "CPU%d failed\n", cpu);
	}
}

static void hardware_enable(void *junk)
{
	raw_spin_lock(&kvm_lock);
	hardware_enable_nolock(junk);
	raw_spin_unlock(&kvm_lock);
}

static void hardware_disable_nolock(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static void hardware_disable(void *junk)
{
	raw_spin_lock(&kvm_lock);
	hardware_disable_nolock(junk);
	raw_spin_unlock(&kvm_lock);
}

static void hardware_disable_all_nolock(void)
{
	BUG_ON(!kvm_usage_count);

	kvm_usage_count--;
	if (!kvm_usage_count)
		on_each_cpu(hardware_disable_nolock, NULL, 1);
}

static void hardware_disable_all(void)
{
	raw_spin_lock(&kvm_lock);
	hardware_disable_all_nolock();
	raw_spin_unlock(&kvm_lock);
}

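/*
 * kvm_usage_count tracks the number of active users (VMs).  The first
 * user enables virtualization on every online CPU; a failure on any
 * CPU rolls the whole operation back.  The last user disables it again.
 */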
static int hardware_enable_all(void)
{
	int r = 0;

	raw_spin_lock(&kvm_lock);

	kvm_usage_count++;
	if (kvm_usage_count == 1) {
		atomic_set(&hardware_enable_failed, 0);
		on_each_cpu(hardware_enable_nolock, NULL, 1);

		if (atomic_read(&hardware_enable_failed)) {
			hardware_disable_all_nolock();
			r = -EBUSY;
		}
	}

	raw_spin_unlock(&kvm_lock);

	return r;
}

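/*
 * CPU hotplug notifier: while VMs exist, virtualization is switched
 * off on a CPU that is going down and back on on a CPU that is
 * coming up.
 */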
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	if (!kvm_usage_count)
		return NOTIFY_OK;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_STARTING:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		hardware_enable(NULL);
		break;
	}
	return NOTIFY_OK;
}

asmlinkage void kvm_spurious_fault(void)
{
	/* Fault while not rebooting. We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if
	 * in vmx root mode.
	 *
	 * And Intel TXT requires VMX to be off on all CPUs when the
	 * system shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

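/* Release every device attached to an I/O bus, then free the bus itself. */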
static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
	kfree(bus);
}

/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int i;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	int i;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

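/*
 * Bus updates are done copy-on-write: a new bus is built, published
 * with rcu_assign_pointer(), and the old one is freed only after
 * synchronize_srcu_expedited() guarantees no reader still sees it.
 */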
/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			    struct kvm_io_device *dev)
{
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	if (bus->dev_count > NR_IOBUS_DEVS - 1)
		return -ENOSPC;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
	new_bus->devs[new_bus->dev_count++] = dev;
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			      struct kvm_io_device *dev)
{
	int i, r;
	struct kvm_io_bus *new_bus, *bus;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;

	bus = kvm->buses[bus_idx];
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));

	r = -ENOENT;
	for (i = 0; i < new_bus->dev_count; i++)
		if (new_bus->devs[i] == dev) {
			r = 0;
			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
			break;
		}

	if (r) {
		kfree(new_bus);
		return r;
	}

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
	return r;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
};

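/*
 * debugfs statistics: sum the u32 counter at the given offset over all
 * VMs (vm_stat_get) or over every vCPU of every VM (vcpu_stat_get).
 */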
static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

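/*
 * syscore hooks: turn hardware virtualization off across suspend and
 * back on at resume, but only while there are active VMs.
 */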
static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

static void kvm_resume(void)
{
	if (kvm_usage_count) {
		WARN_ON(raw_spin_is_locked(&kvm_lock));
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

struct page *bad_page;
pfn_t bad_pfn;

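/*
 * Preempt-notifier hooks: load the architecture vCPU state when the
 * vCPU task is scheduled in and put it when the task is scheduled out.
 */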
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

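/*
 * Common KVM initialization: set up the architecture, allocate the
 * special pages, register CPU/reboot/syscore notifiers, create the
 * vcpu slab cache and finally expose the /dev/kvm misc device.
 */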
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (hwpoison_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	hwpoison_pfn = page_to_pfn(hwpoison_page);

	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (fault_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	fault_pfn = page_to_pfn(fault_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	if (fault_page)
		__free_page(fault_page);
	if (hwpoison_page)
		__free_page(hwpoison_page);
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

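/*
 * Undo kvm_init(): tear down debugfs, the misc device, the caches and
 * notifiers, and leave hardware virtualization disabled on all CPUs.
 */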
void kvm_exit(void)
{
	kvm_exit_debug();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(hwpoison_page);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);