GitHub Repository: torvalds/linux
Path: blob/master/virt/kvm/guest_memfd.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
	kvm_pfn_t pfn = folio_file_pfn(folio, index);
	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
	if (rc) {
		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
				    index, gfn, pfn, rc);
		return rc;
	}
#endif

	return 0;
}

static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
	folio_mark_uptodate(folio);
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	unsigned long nr_pages, i;
	pgoff_t index;
	int r;

	nr_pages = folio_nr_pages(folio);
	for (i = 0; i < nr_pages; i++)
		clear_highpage(folio_page(folio, i));

	/*
	 * Preparing huge folios should always be safe, since it should
	 * be possible to split them later if needed.
	 *
	 * Right now the folio order is always going to be zero, but the
	 * code is ready for huge folios. The only assumption is that
	 * the base pgoff of memslots is naturally aligned with the
	 * requested page order, ensuring that huge folios can also use
	 * huge page table entries for GPA->HPA mapping.
	 *
	 * The order will be passed when creating the guest_memfd, and
	 * checked when creating memslots.
	 */
	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
	index = gfn - slot->base_gfn + slot->gmem.pgoff;
	index = ALIGN_DOWN(index, 1 << folio_order(folio));
	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
	if (!r)
		kvm_gmem_mark_prepared(folio);

	return r;
}

/*
 * Returns a locked folio on success. The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags. The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	return filemap_grab_folio(inode->i_mapping, index);
}

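/*
 * Zap guest mappings for every memslot bound to [start, end) in this
 * guest_memfd instance: open an MMU invalidation window, unmap the
 * affected gfn ranges, and flush remote TLBs if anything was unmapped.
 * Paired with kvm_gmem_invalidate_end().
 */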
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
				      pgoff_t end)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
			/* guest memfd is relevant to only private mappings. */
			.attr_filter = KVM_FILTER_PRIVATE,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

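/*
 * Close the MMU invalidation window opened by kvm_gmem_invalidate_begin()
 * if any memslot binding intersects [start, end).
 */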
static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
				    pgoff_t end)
{
	struct kvm *kvm = gmem->kvm;

	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

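/*
 * Handle FALLOC_FL_PUNCH_HOLE: invalidate guest mappings for the affected
 * range, truncate the backing folios, then end the invalidation. The
 * offset and len are validated as page-aligned by kvm_gmem_fallocate().
 */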
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;
	struct kvm_gmem *gmem;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

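/*
 * Preallocate folios for [offset, offset + len), which must lie within
 * i_size. Each folio is grabbed and immediately released; the loop can
 * be interrupted by a pending signal.
 */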
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

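/*
 * fallocate() handler: only FALLOC_FL_KEEP_SIZE is supported, optionally
 * combined with FALLOC_FL_PUNCH_HOLE, and offset/len must be page-aligned.
 */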
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

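/*
 * Called when the last reference to the guest_memfd file is put: sever
 * all memslot bindings, zap the SPTEs backed by this file, and drop the
 * reference on the VM taken at creation time.
 */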
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct kvm_gmem *gmem = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot. This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 *
	 * Since .release is called only when the reference count is zero,
	 * after which file_ref_get() and get_file_active() fail,
	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
	 * file_ref_put() provides a full barrier, and get_file_active() the
	 * matching acquire barrier.
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&gmem->bindings, index, slot)
		WRITE_ONCE(slot->gmem.file, NULL);

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file. Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
	kvm_gmem_invalidate_end(gmem, 0, -1ul);

	list_del(&gmem->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&gmem->bindings);
	kfree(gmem);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file.
	 */
	return get_file_active(&slot->gmem.file);
}

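/* Translate a gfn in @slot to its page offset within the guest_memfd file. */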
static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn - slot->base_gfn + slot->gmem.pgoff;
}

static struct file_operations kvm_gmem_fops = {
	.open = generic_file_open,
	.release = kvm_gmem_release,
	.fallocate = kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
	kvm_gmem_fops.owner = module;
}

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

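/*
 * Memory-failure handler: zap any guest mappings of the poisoned folio,
 * but leave the folio in place; see the comment below on deferring the
 * policy decision to userspace.
 */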
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	struct list_head *gmem_list = &mapping->i_private_list;
	struct kvm_gmem *gmem;
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors). If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio = kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}
static const struct inode_operations kvm_gmem_iops = {
	.setattr = kvm_gmem_setattr,
};

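/*
 * Allocate a kvm_gmem instance backed by an anonymous inode and install
 * a new file descriptor for it. The file holds a reference on @kvm,
 * dropped in kvm_gmem_release().
 */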
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	const char *anon_name = "[kvm-gmem]";
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
	if (!gmem) {
		err = -ENOMEM;
		goto err_fd;
	}

	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
					 O_RDWR, NULL);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_gmem;
	}

	file->f_flags |= O_LARGEFILE;

	inode = file->f_inode;
	WARN_ON(file->f_mapping != inode->i_mapping);

	inode->i_private = (void *)(unsigned long)flags;
	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_gmem:
	kfree(gmem);
err_fd:
	put_unused_fd(fd);
	return err;
}

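/*
 * Validate a KVM_CREATE_GUEST_MEMFD request: no unsupported flags and a
 * non-zero, page-aligned size, then create the backing file.
 */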
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;
	u64 valid_flags = 0;

	if (flags & ~valid_flags)
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

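/*
 * Bind @slot to the guest_memfd given by @fd at @offset. Fails if the
 * file is not a guest_memfd belonging to this VM, if the range exceeds
 * the file size, or if any part of the range is already bound.
 */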
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	gmem = file->private_data;
	if (gmem->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
	 * kvm_gmem_bind() must occur on a new memslot. Because the memslot
	 * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
	 */
	WRITE_ONCE(slot->gmem.file, file);
	slot->gmem.pgoff = start;

	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success. The file pins KVM,
	 * not the other way 'round. Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

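/*
 * Drop @slot's binding from its guest_memfd, if the file is still open,
 * and clear the memslot's pointer to the file.
 */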
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;
	struct kvm_gmem *gmem;
	struct file *file;

	/*
	 * Nothing to do if the underlying file was already closed (or is being
	 * closed right now), kvm_gmem_release() invalidates all bindings.
	 */
	file = kvm_gmem_get_file(slot);
	if (!file)
		return;

	gmem = file->private_data;

	filemap_invalidate_lock(file->f_mapping);
	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);

	/*
	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
	 * cannot see this memslot.
	 */
	WRITE_ONCE(slot->gmem.file, NULL);
	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
}

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					pgoff_t index, kvm_pfn_t *pfn,
					bool *is_prepared, int *max_order)
{
	struct file *gmem_file = READ_ONCE(slot->gmem.file);
	struct kvm_gmem *gmem = file->private_data;
	struct folio *folio;

	if (file != gmem_file) {
		WARN_ON_ONCE(gmem_file);
		return ERR_PTR(-EFAULT);
	}

	gmem = file->private_data;
	if (xa_load(&gmem->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&gmem->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	*is_prepared = folio_test_uptodate(folio);
	return folio;
}

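/*
 * Resolve @gfn in @slot to a guest_memfd-backed pfn/page, zeroing and
 * preparing the folio on first use. On success the caller receives a
 * reference on the returned page.
 */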
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
		     int *max_order)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct file *file = kvm_gmem_get_file(slot);
	struct folio *folio;
	bool is_prepared = false;
	int r = 0;

	if (!file)
		return -EFAULT;

	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
	if (IS_ERR(folio)) {
		r = PTR_ERR(folio);
		goto out;
	}

	if (!is_prepared)
		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

	folio_unlock(folio);

	if (!r)
		*page = folio_file_page(folio, index);
	else
		folio_put(folio);

out:
	fput(file);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);

#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
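/*
 * Populate up to @npages pages starting at @start_gfn: for each chunk,
 * look up the backing pfn and invoke @post_populate with the matching
 * portion of @src (NULL if @src is NULL), marking the folio prepared on
 * success. Requires slots_lock; returns the number of pages processed,
 * or a negative error code if none were.
 */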
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct file *file;
	struct kvm_memory_slot *slot;
	void __user *p;

	int ret = 0, max_order;
	long i;

	lockdep_assert_held(&kvm->slots_lock);
	if (npages < 0)
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_can_be_private(slot))
		return -EINVAL;

	file = kvm_gmem_get_file(slot);
	if (!file)
		return -EFAULT;

	filemap_invalidate_lock(file->f_mapping);

	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i += (1 << max_order)) {
		struct folio *folio;
		gfn_t gfn = start_gfn + i;
		pgoff_t index = kvm_gmem_get_index(slot, gfn);
		bool is_prepared = false;
		kvm_pfn_t pfn;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		if (is_prepared) {
			folio_unlock(folio);
			folio_put(folio);
			ret = -EEXIST;
			break;
		}

		folio_unlock(folio);
		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
			(npages - i) < (1 << max_order));

		ret = -EINVAL;
		while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
							KVM_MEMORY_ATTRIBUTE_PRIVATE,
							KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
			if (!max_order)
				goto put_folio_and_exit;
			max_order--;
		}

		p = src ? src + i * PAGE_SIZE : NULL;
		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
		if (!ret)
			kvm_gmem_mark_prepared(folio);

put_folio_and_exit:
		folio_put(folio);
		if (ret)
			break;
	}

	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
#endif