GitHub Repository: torvalds/linux
Path: blob/master/virt/kvm/guest_memfd.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
        struct kvm *kvm;
        struct xarray bindings;
        struct list_head entry;
};

struct gmem_inode {
        struct shared_policy policy;
        struct inode vfs_inode;

        u64 flags;
};

static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
        return container_of(inode, struct gmem_inode, vfs_inode);
}

#define kvm_gmem_for_each_file(f, mapping) \
        list_for_each_entry(f, &(mapping)->i_private_list, entry)

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
        return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

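/*
 * Translate @gfn, which must lie in @slot, to its page index within the
 * guest_memfd file backing the slot.
 */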
static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
        return gfn - slot->base_gfn + slot->gmem.pgoff;
}

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
        kvm_pfn_t pfn = folio_file_pfn(folio, index);
        gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
        int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
        if (rc) {
                pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
                                    index, gfn, pfn, rc);
                return rc;
        }
#endif

        return 0;
}

static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
        folio_mark_uptodate(folio);
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                  gfn_t gfn, struct folio *folio)
{
        unsigned long nr_pages, i;
        pgoff_t index;
        int r;

        nr_pages = folio_nr_pages(folio);
        for (i = 0; i < nr_pages; i++)
                clear_highpage(folio_page(folio, i));

        /*
         * Preparing huge folios should always be safe, since it should
         * be possible to split them later if needed.
         *
         * Right now the folio order is always going to be zero, but the
         * code is ready for huge folios. The only assumption is that
         * the base pgoff of memslots is naturally aligned with the
         * requested page order, ensuring that huge folios can also use
         * huge page table entries for GPA->HPA mapping.
         *
         * The order will be passed when creating the guest_memfd, and
         * checked when creating memslots.
         */
        WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
        index = kvm_gmem_get_index(slot, gfn);
        index = ALIGN_DOWN(index, folio_nr_pages(folio));
        r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
        if (!r)
                kvm_gmem_mark_prepared(folio);

        return r;
}

/*
 * Returns a locked folio on success. The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags. The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
        /* TODO: Support huge pages. */
        struct mempolicy *policy;
        struct folio *folio;

        /*
         * Fast-path: See if folio is already present in mapping to avoid
         * policy_lookup.
         */
        folio = __filemap_get_folio(inode->i_mapping, index,
                                    FGP_LOCK | FGP_ACCESSED, 0);
        if (!IS_ERR(folio))
                return folio;

        policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
        folio = __filemap_get_folio_mpol(inode->i_mapping, index,
                                         FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                         mapping_gfp_mask(inode->i_mapping), policy);
        mpol_cond_put(policy);

        return folio;
}

static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
        if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
                return KVM_FILTER_SHARED;

        return KVM_FILTER_PRIVATE;
}

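/*
 * Walk all memslot bindings that intersect [start, end) and zap the affected
 * GFN ranges. The MMU lock is taken only once the first binding is found,
 * and remote TLBs are flushed only if SPTEs were actually removed.
 */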
static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
                                        pgoff_t end,
                                        enum kvm_gfn_range_filter attr_filter)
{
        bool flush = false, found_memslot = false;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = f->kvm;
        unsigned long index;

        xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
                pgoff_t pgoff = slot->gmem.pgoff;

                struct kvm_gfn_range gfn_range = {
                        .start = slot->base_gfn + max(pgoff, start) - pgoff,
                        .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
                        .slot = slot,
                        .may_block = true,
                        .attr_filter = attr_filter,
                };

                if (!found_memslot) {
                        found_memslot = true;

                        KVM_MMU_LOCK(kvm);
                        kvm_mmu_invalidate_begin(kvm);
                }

                flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
        }

        if (flush)
                kvm_flush_remote_tlbs(kvm);

        if (found_memslot)
                KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
                                      pgoff_t end)
{
        enum kvm_gfn_range_filter attr_filter;
        struct gmem_file *f;

        attr_filter = kvm_gmem_get_invalidate_filter(inode);

        kvm_gmem_for_each_file(f, inode->i_mapping)
                __kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}

static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
                                      pgoff_t end)
{
        struct kvm *kvm = f->kvm;

        if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
                KVM_MMU_LOCK(kvm);
                kvm_mmu_invalidate_end(kvm);
                KVM_MMU_UNLOCK(kvm);
        }
}

static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
                                    pgoff_t end)
{
        struct gmem_file *f;

        kvm_gmem_for_each_file(f, inode->i_mapping)
                __kvm_gmem_invalidate_end(f, start, end);
}

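/*
 * Punch a hole in the guest_memfd: invalidate SPTEs for the affected range,
 * then truncate the backing folios. Bindings are kept stable by holding the
 * invalidate lock across the begin/end pair.
 */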
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        pgoff_t start = offset >> PAGE_SHIFT;
        pgoff_t end = (offset + len) >> PAGE_SHIFT;

        /*
         * Bindings must be stable across invalidation to ensure the start+end
         * are balanced.
         */
        filemap_invalidate_lock(inode->i_mapping);

        kvm_gmem_invalidate_begin(inode, start, end);

        truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

        kvm_gmem_invalidate_end(inode, start, end);

        filemap_invalidate_unlock(inode->i_mapping);

        return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t start, index, end;
        int r;

        /* Dedicated guest is immutable by default. */
        if (offset + len > i_size_read(inode))
                return -EINVAL;

        filemap_invalidate_lock_shared(mapping);

        start = offset >> PAGE_SHIFT;
        end = (offset + len) >> PAGE_SHIFT;

        r = 0;
        for (index = start; index < end; ) {
                struct folio *folio;

                if (signal_pending(current)) {
                        r = -EINTR;
                        break;
                }

                folio = kvm_gmem_get_folio(inode, index);
                if (IS_ERR(folio)) {
                        r = PTR_ERR(folio);
                        break;
                }

                index = folio_next_index(folio);

                folio_unlock(folio);
                folio_put(folio);

                /* 64-bit only, wrapping the index should be impossible. */
                if (WARN_ON_ONCE(!index))
                        break;

                cond_resched();
        }

        filemap_invalidate_unlock_shared(mapping);

        return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
                               loff_t len)
{
        int ret;

        if (!(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
                return -EINVAL;

        if (mode & FALLOC_FL_PUNCH_HOLE)
                ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
        else
                ret = kvm_gmem_allocate(file_inode(file), offset, len);

        if (!ret)
                file_modified(file);
        return ret;
}

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
        struct gmem_file *f = file->private_data;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = f->kvm;
        unsigned long index;

        /*
         * Prevent concurrent attempts to *unbind* a memslot. This is the last
         * reference to the file and thus no new bindings can be created, but
         * dereferencing the slot for existing bindings needs to be protected
         * against memslot updates, specifically so that unbind doesn't race
         * and free the memslot (kvm_gmem_get_file() will return NULL).
         *
         * Since .release is called only when the reference count is zero,
         * after which file_ref_get() and get_file_active() fail,
         * kvm_gmem_get_pfn() cannot be using the file concurrently.
         * file_ref_put() provides a full barrier, and get_file_active() the
         * matching acquire barrier.
         */
        mutex_lock(&kvm->slots_lock);

        filemap_invalidate_lock(inode->i_mapping);

        xa_for_each(&f->bindings, index, slot)
                WRITE_ONCE(slot->gmem.file, NULL);

        /*
         * All in-flight operations are gone and new bindings can be created.
         * Zap all SPTEs pointed at by this file. Do not free the backing
         * memory, as its lifetime is associated with the inode, not the file.
         */
        __kvm_gmem_invalidate_begin(f, 0, -1ul,
                                    kvm_gmem_get_invalidate_filter(inode));
        __kvm_gmem_invalidate_end(f, 0, -1ul);

        list_del(&f->entry);

        filemap_invalidate_unlock(inode->i_mapping);

        mutex_unlock(&kvm->slots_lock);

        xa_destroy(&f->bindings);
        kfree(f);

        kvm_put_kvm(kvm);

        return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
        /*
         * Do not return slot->gmem.file if it has already been closed;
         * there might be some time between the last fput() and when
         * kvm_gmem_release() clears slot->gmem.file.
         */
        return get_file_active(&slot->gmem.file);
}

DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
             kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);

static bool kvm_gmem_supports_mmap(struct inode *inode)
{
        return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

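/*
 * Handle a userspace fault on an mmap()'d guest_memfd. Faults are allowed
 * only within i_size and only if the instance was created with
 * GUEST_MEMFD_FLAG_INIT_SHARED; a newly faulted folio is zeroed and marked
 * prepared (up-to-date) before being mapped into userspace.
 */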
static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        struct folio *folio;
        vm_fault_t ret = VM_FAULT_LOCKED;

        if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
                return VM_FAULT_SIGBUS;

        if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
                return VM_FAULT_SIGBUS;

        folio = kvm_gmem_get_folio(inode, vmf->pgoff);
        if (IS_ERR(folio)) {
                if (PTR_ERR(folio) == -EAGAIN)
                        return VM_FAULT_RETRY;

                return vmf_error(PTR_ERR(folio));
        }

        if (WARN_ON_ONCE(folio_test_large(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_folio;
        }

        if (!folio_test_uptodate(folio)) {
                clear_highpage(folio_page(folio, 0));
                kvm_gmem_mark_prepared(folio);
        }

        vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
        if (ret != VM_FAULT_LOCKED) {
                folio_unlock(folio);
                folio_put(folio);
        }

        return ret;
}

#ifdef CONFIG_NUMA
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
        struct inode *inode = file_inode(vma->vm_file);

        return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
                                             unsigned long addr, pgoff_t *pgoff)
{
        struct inode *inode = file_inode(vma->vm_file);

        *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

        /*
         * Return the memory policy for this index, or NULL if none is set.
         *
         * Returning NULL, e.g. instead of the current task's memory policy, is
         * important for the .get_policy kernel ABI: it indicates that no
         * explicit policy has been set via mbind() for this memory. The caller
         * can then replace NULL with the default memory policy instead of the
         * current task's memory policy.
         */
        return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */

static const struct vm_operations_struct kvm_gmem_vm_ops = {
        .fault = kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
        .get_policy = kvm_gmem_get_policy,
        .set_policy = kvm_gmem_set_policy,
#endif
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (!kvm_gmem_supports_mmap(file_inode(file)))
                return -ENODEV;

        if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
            (VM_SHARED | VM_MAYSHARE)) {
                return -EINVAL;
        }

        vma->vm_ops = &kvm_gmem_vm_ops;

        return 0;
}

static struct file_operations kvm_gmem_fops = {
        .mmap = kvm_gmem_mmap,
        .open = generic_file_open,
        .release = kvm_gmem_release,
        .fallocate = kvm_gmem_fallocate,
};

static int kvm_gmem_migrate_folio(struct address_space *mapping,
                                  struct folio *dst, struct folio *src,
                                  enum migrate_mode mode)
{
        WARN_ON_ONCE(1);
        return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
        pgoff_t start, end;

        filemap_invalidate_lock_shared(mapping);

        start = folio->index;
        end = start + folio_nr_pages(folio);

        kvm_gmem_invalidate_begin(mapping->host, start, end);

        /*
         * Do not truncate the range, what action is taken in response to the
         * error is userspace's decision (assuming the architecture supports
         * gracefully handling memory errors). If/when the guest attempts to
         * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
         * at which point KVM can either terminate the VM or propagate the
         * error to userspace.
         */

        kvm_gmem_invalidate_end(mapping->host, start, end);

        filemap_invalidate_unlock_shared(mapping);

        return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
        struct page *page = folio_page(folio, 0);
        kvm_pfn_t pfn = page_to_pfn(page);
        int order = folio_order(folio);

        kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
        .dirty_folio = noop_dirty_folio,
        .migrate_folio = kvm_gmem_migrate_folio,
        .error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
        .free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                            struct iattr *attr)
{
        return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
        .setattr = kvm_gmem_setattr,
};

bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
        return true;
}

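/*
 * Allocate the fd, gmem_file, inode, and struct file for a new guest_memfd
 * instance on the internal guest_memfd mount, then link the file into the
 * inode's private list so invalidations can find every VM's view.
 */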
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
        static const char *name = "[kvm-gmem]";
        struct gmem_file *f;
        struct inode *inode;
        struct file *file;
        int fd, err;

        fd = get_unused_fd_flags(0);
        if (fd < 0)
                return fd;

        f = kzalloc(sizeof(*f), GFP_KERNEL);
        if (!f) {
                err = -ENOMEM;
                goto err_fd;
        }

        /* __fput() will take care of fops_put(). */
        if (!fops_get(&kvm_gmem_fops)) {
                err = -ENOENT;
                goto err_gmem;
        }

        inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto err_fops;
        }

        inode->i_op = &kvm_gmem_iops;
        inode->i_mapping->a_ops = &kvm_gmem_aops;
        inode->i_mode |= S_IFREG;
        inode->i_size = size;
        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_inaccessible(inode->i_mapping);
        /* Unmovable mappings are supposed to be marked unevictable as well. */
        WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

        GMEM_I(inode)->flags = flags;

        file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                goto err_inode;
        }

        file->f_flags |= O_LARGEFILE;
        file->private_data = f;

        kvm_get_kvm(kvm);
        f->kvm = kvm;
        xa_init(&f->bindings);
        list_add(&f->entry, &inode->i_mapping->i_private_list);

        fd_install(fd, file);
        return fd;

err_inode:
        iput(inode);
err_fops:
        fops_put(&kvm_gmem_fops);
err_gmem:
        kfree(f);
err_fd:
        put_unused_fd(fd);
        return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
        loff_t size = args->size;
        u64 flags = args->flags;

        if (flags & ~kvm_gmem_get_supported_flags(kvm))
                return -EINVAL;

        if (size <= 0 || !PAGE_ALIGNED(size))
                return -EINVAL;

        return __kvm_gmem_create(kvm, size, flags);
}

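/*
 * Bind @slot to the guest_memfd identified by @fd at @offset. The range must
 * lie within the file and must not overlap an existing binding; on success
 * the binding is recorded in the file's xarray so invalidations can find it.
 */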
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
                  unsigned int fd, loff_t offset)
{
        loff_t size = slot->npages << PAGE_SHIFT;
        unsigned long start, end;
        struct gmem_file *f;
        struct inode *inode;
        struct file *file;
        int r = -EINVAL;

        BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

        file = fget(fd);
        if (!file)
                return -EBADF;

        if (file->f_op != &kvm_gmem_fops)
                goto err;

        f = file->private_data;
        if (f->kvm != kvm)
                goto err;

        inode = file_inode(file);

        if (offset < 0 || !PAGE_ALIGNED(offset) ||
            offset + size > i_size_read(inode))
                goto err;

        filemap_invalidate_lock(inode->i_mapping);

        start = offset >> PAGE_SHIFT;
        end = start + slot->npages;

        if (!xa_empty(&f->bindings) &&
            xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
                filemap_invalidate_unlock(inode->i_mapping);
                goto err;
        }

        /*
         * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
         * kvm_gmem_bind() must occur on a new memslot. Because the memslot
         * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
         */
        WRITE_ONCE(slot->gmem.file, file);
        slot->gmem.pgoff = start;
        if (kvm_gmem_supports_mmap(inode))
                slot->flags |= KVM_MEMSLOT_GMEM_ONLY;

        xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
        filemap_invalidate_unlock(inode->i_mapping);

        /*
         * Drop the reference to the file, even on success. The file pins KVM,
         * not the other way 'round. Active bindings are invalidated if the
         * file is closed before memslots are destroyed.
         */
        r = 0;
err:
        fput(file);
        return r;
}

static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
        unsigned long start = slot->gmem.pgoff;
        unsigned long end = start + slot->npages;

        xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

        /*
         * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
         * cannot see this memslot.
         */
        WRITE_ONCE(slot->gmem.file, NULL);
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
        /*
         * Nothing to do if the underlying file was _already_ closed, as
         * kvm_gmem_release() invalidates and nullifies all bindings.
         */
        if (!slot->gmem.file)
                return;

        CLASS(gmem_get_file, file)(slot);

        /*
         * However, if the file is _being_ closed, then the bindings need to be
         * removed as kvm_gmem_release() might not run until after the memslot
         * is freed. Note, modifying the bindings is safe even though the file
         * is dying as kvm_gmem_release() nullifies slot->gmem.file under
         * slots_lock, and only puts its reference to KVM after destroying all
         * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
         * yet destroyed the bindings or freed the gmem_file, and can't do so
         * until the caller drops slots_lock.
         */
        if (!file) {
                __kvm_gmem_unbind(slot, slot->gmem.file->private_data);
                return;
        }

        filemap_invalidate_lock(file->f_mapping);
        __kvm_gmem_unbind(slot, file->private_data);
        filemap_invalidate_unlock(file->f_mapping);
}

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
                                        struct kvm_memory_slot *slot,
                                        pgoff_t index, kvm_pfn_t *pfn,
                                        bool *is_prepared, int *max_order)
{
        struct file *slot_file = READ_ONCE(slot->gmem.file);
        struct gmem_file *f = file->private_data;
        struct folio *folio;

        if (file != slot_file) {
                WARN_ON_ONCE(slot_file);
                return ERR_PTR(-EFAULT);
        }

        if (xa_load(&f->bindings, index) != slot) {
                WARN_ON_ONCE(xa_load(&f->bindings, index));
                return ERR_PTR(-EIO);
        }

        folio = kvm_gmem_get_folio(file_inode(file), index);
        if (IS_ERR(folio))
                return folio;

        if (folio_test_hwpoison(folio)) {
                folio_unlock(folio);
                folio_put(folio);
                return ERR_PTR(-EHWPOISON);
        }

        *pfn = folio_file_pfn(folio, index);
        if (max_order)
                *max_order = 0;

        *is_prepared = folio_test_uptodate(folio);
        return folio;
}

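/*
 * Resolve @gfn in @slot to a pfn and page, preparing (zeroing and, where the
 * arch hook is enabled, arch-initializing) the backing folio on first use.
 * On success the folio's refcount is left elevated; the caller is responsible
 * for releasing @page.
 */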
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
                     int *max_order)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct folio *folio;
        bool is_prepared = false;
        int r = 0;

        CLASS(gmem_get_file, file)(slot);
        if (!file)
                return -EFAULT;

        folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        if (!is_prepared)
                r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

        folio_unlock(folio);

        if (!r)
                *page = folio_file_page(folio, index);
        else
                folio_put(folio);

        return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
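/*
 * Populate up to @npages pages of guest memory starting at @start_gfn,
 * invoking @post_populate on each not-yet-prepared folio (e.g. so the arch
 * can encrypt or measure the initial payload from @src). Returns the number
 * of pages processed, or a negative error if nothing was populated.
 */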
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
                       kvm_gmem_populate_cb post_populate, void *opaque)
{
        struct kvm_memory_slot *slot;
        void __user *p;

        int ret = 0, max_order;
        long i;

        lockdep_assert_held(&kvm->slots_lock);

        if (WARN_ON_ONCE(npages <= 0))
                return -EINVAL;

        slot = gfn_to_memslot(kvm, start_gfn);
        if (!kvm_slot_has_gmem(slot))
                return -EINVAL;

        CLASS(gmem_get_file, file)(slot);
        if (!file)
                return -EFAULT;

        filemap_invalidate_lock(file->f_mapping);

        npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
        for (i = 0; i < npages; i += (1 << max_order)) {
                struct folio *folio;
                gfn_t gfn = start_gfn + i;
                pgoff_t index = kvm_gmem_get_index(slot, gfn);
                bool is_prepared = false;
                kvm_pfn_t pfn;

                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
                if (IS_ERR(folio)) {
                        ret = PTR_ERR(folio);
                        break;
                }

                if (is_prepared) {
                        folio_unlock(folio);
                        folio_put(folio);
                        ret = -EEXIST;
                        break;
                }

                folio_unlock(folio);
                WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
                        (npages - i) < (1 << max_order));

                ret = -EINVAL;
                while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
                                                        KVM_MEMORY_ATTRIBUTE_PRIVATE,
                                                        KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
                        if (!max_order)
                                goto put_folio_and_exit;
                        max_order--;
                }

                p = src ? src + i * PAGE_SIZE : NULL;
                ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
                if (!ret)
                        kvm_gmem_mark_prepared(folio);

put_folio_and_exit:
                folio_put(folio);
                if (ret)
                        break;
        }

        filemap_invalidate_unlock(file->f_mapping);

        return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif

static struct kmem_cache *kvm_gmem_inode_cachep;

static void kvm_gmem_init_inode_once(void *__gi)
{
        struct gmem_inode *gi = __gi;

        /*
         * Note! Don't initialize the inode with anything specific to the
         * guest_memfd instance, or that might be specific to how the inode is
         * used (from the VFS-layer's perspective). This hook is called only
         * during the initial slab allocation, i.e. only fields/state that are
         * idempotent across _all_ use of the inode _object_ can be initialized
         * at this time!
         */
        inode_init_once(&gi->vfs_inode);
}

static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
        struct gmem_inode *gi;

        gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
        if (!gi)
                return NULL;

        mpol_shared_policy_init(&gi->policy, NULL);

        gi->flags = 0;
        return &gi->vfs_inode;
}

static void kvm_gmem_destroy_inode(struct inode *inode)
{
        mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

static void kvm_gmem_free_inode(struct inode *inode)
{
        kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

static const struct super_operations kvm_gmem_super_operations = {
        .statfs = simple_statfs,
        .alloc_inode = kvm_gmem_alloc_inode,
        .destroy_inode = kvm_gmem_destroy_inode,
        .free_inode = kvm_gmem_free_inode,
};

static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx;

        if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
                return -ENOMEM;

        fc->s_iflags |= SB_I_NOEXEC;
        fc->s_iflags |= SB_I_NODEV;
        ctx = fc->fs_private;
        ctx->ops = &kvm_gmem_super_operations;

        return 0;
}

static struct file_system_type kvm_gmem_fs = {
        .name = "guest_memfd",
        .init_fs_context = kvm_gmem_init_fs_context,
        .kill_sb = kill_anon_super,
};

static int kvm_gmem_init_mount(void)
{
        kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

        if (IS_ERR(kvm_gmem_mnt))
                return PTR_ERR(kvm_gmem_mnt);

        kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
        return 0;
}

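/*
 * Module init: create the gmem_inode slab cache and mount the internal
 * guest_memfd pseudo-filesystem that backs all guest_memfd inodes.
 */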
int kvm_gmem_init(struct module *module)
{
        struct kmem_cache_args args = {
                .align = 0,
                .ctor = kvm_gmem_init_inode_once,
        };
        int ret;

        kvm_gmem_fops.owner = module;
        kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
                                                  sizeof(struct gmem_inode),
                                                  &args, SLAB_ACCOUNT);
        if (!kvm_gmem_inode_cachep)
                return -ENOMEM;

        ret = kvm_gmem_init_mount();
        if (ret) {
                kmem_cache_destroy(kvm_gmem_inode_cachep);
                return ret;
        }
        return 0;
}

void kvm_gmem_exit(void)
{
        kern_unmount(kvm_gmem_mnt);
        kvm_gmem_mnt = NULL;
        rcu_barrier();
        kmem_cache_destroy(kvm_gmem_inode_cachep);
}