Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/loongarch/kvm/mmu.c
26436 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2020-2023 Loongson Technology Corporation Limited
4
*/
5
6
#include <linux/highmem.h>
7
#include <linux/hugetlb.h>
8
#include <linux/kvm_host.h>
9
#include <linux/page-flags.h>
10
#include <linux/uaccess.h>
11
#include <asm/mmu_context.h>
12
#include <asm/pgalloc.h>
13
#include <asm/tlb.h>
14
#include <asm/kvm_mmu.h>
15
16
static inline bool kvm_hugepage_capable(struct kvm_memory_slot *slot)
17
{
18
return slot->arch.flags & KVM_MEM_HUGEPAGE_CAPABLE;
19
}
20
21
static inline bool kvm_hugepage_incapable(struct kvm_memory_slot *slot)
22
{
23
return slot->arch.flags & KVM_MEM_HUGEPAGE_INCAPABLE;
24
}
25
26
/*
 * Set up a page table walk context so that a walk starts at the root level
 * of @kvm's GPA page table.
 *
 * The per-level tables (invalid entries and shifts) are borrowed from
 * kvm->arch; the shift and invalid entry for the starting level are cached
 * in the context. ctx->ops, ctx->flag and ctx->list are left for the caller
 * to fill in.
 */
static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx)
{
	/* Back pointer used by callbacks such as kvm_flush_pte() */
	ctx->opaque = kvm;

	/* Per-level lookup tables */
	ctx->pte_shifts = kvm->arch.pte_shifts;
	ctx->invalid_ptes = kvm->arch.invalid_ptes;

	/* Start from the root and cache that level's parameters */
	ctx->level = kvm->arch.root_level;
	ctx->invalid_entry = ctx->invalid_ptes[ctx->level];
	ctx->pgtable_shift = ctx->pte_shifts[ctx->level];
}
36
37
/*
38
* Mark a range of guest physical address space old (all accesses fault) in the
39
* VM's GPA page table to allow detection of commonly used pages.
40
*/
41
static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
42
{
43
if (kvm_pte_young(*pte)) {
44
*pte = kvm_pte_mkold(*pte);
45
return 1;
46
}
47
48
return 0;
49
}
50
51
/*
52
* Mark a range of guest physical address space clean (writes fault) in the VM's
53
* GPA page table to allow dirty page tracking.
54
*/
55
static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
56
{
57
gfn_t offset;
58
kvm_pte_t val;
59
60
val = *pte;
61
/*
62
* For kvm_arch_mmu_enable_log_dirty_pt_masked with mask, start and end
63
* may cross hugepage, for first huge page parameter addr is equal to
64
* start, however for the second huge page addr is base address of
65
* this huge page, rather than start or end address
66
*/
67
if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) {
68
offset = (addr >> PAGE_SHIFT) - ctx->gfn;
69
if (!(BIT(offset) & ctx->mask))
70
return 0;
71
}
72
73
/*
74
* Need not split huge page now, just set write-proect pte bit
75
* Split huge page until next write fault
76
*/
77
if (kvm_pte_dirty(val)) {
78
*pte = kvm_pte_mkclean(val);
79
return 1;
80
}
81
82
return 0;
83
}
84
85
/*
86
* Clear pte entry
87
*/
88
static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
89
{
90
struct kvm *kvm;
91
92
kvm = ctx->opaque;
93
if (ctx->level)
94
kvm->stat.hugepages--;
95
else
96
kvm->stat.pages--;
97
98
*pte = ctx->invalid_entry;
99
100
return 1;
101
}
102
103
/*
104
* kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
105
*
106
* Allocate a blank KVM GPA page directory (PGD) for representing guest physical
107
* to host physical page mappings.
108
*
109
* Returns: Pointer to new KVM GPA page directory.
110
* NULL on allocation failure.
111
*/
112
kvm_pte_t *kvm_pgd_alloc(void)
113
{
114
kvm_pte_t *pgd;
115
116
pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0);
117
if (pgd)
118
pgd_init((void *)pgd);
119
120
return pgd;
121
}
122
123
static void _kvm_pte_init(void *addr, unsigned long val)
124
{
125
unsigned long *p, *end;
126
127
p = (unsigned long *)addr;
128
end = p + PTRS_PER_PTE;
129
do {
130
p[0] = val;
131
p[1] = val;
132
p[2] = val;
133
p[3] = val;
134
p[4] = val;
135
p += 8;
136
p[-3] = val;
137
p[-2] = val;
138
p[-1] = val;
139
} while (p != end);
140
}
141
142
/*
143
* Caller must hold kvm->mm_lock
144
*
145
* Walk the page tables of kvm to find the PTE corresponding to the
146
* address @addr. If page tables don't exist for @addr, they will be created
147
* from the MMU cache if @cache is not NULL.
148
*/
149
static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm,
150
struct kvm_mmu_memory_cache *cache,
151
unsigned long addr, int level)
152
{
153
kvm_ptw_ctx ctx;
154
kvm_pte_t *entry, *child;
155
156
kvm_ptw_prepare(kvm, &ctx);
157
child = kvm->arch.pgd;
158
while (ctx.level > level) {
159
entry = kvm_pgtable_offset(&ctx, child, addr);
160
if (kvm_pte_none(&ctx, entry)) {
161
if (!cache)
162
return NULL;
163
164
child = kvm_mmu_memory_cache_alloc(cache);
165
_kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);
166
smp_wmb(); /* Make pte visible before pmd */
167
kvm_set_pte(entry, __pa(child));
168
} else if (kvm_pte_huge(*entry)) {
169
return entry;
170
} else
171
child = (kvm_pte_t *)__va(PHYSADDR(*entry));
172
kvm_ptw_enter(&ctx);
173
}
174
175
entry = kvm_pgtable_offset(&ctx, child, addr);
176
177
return entry;
178
}
179
180
/*
181
* Page walker for VM shadow mmu at last level
182
* The last level is small pte page or huge pmd page
183
*/
184
static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
185
{
186
int ret;
187
phys_addr_t next, start, size;
188
struct list_head *list;
189
kvm_pte_t *entry, *child;
190
191
ret = 0;
192
start = addr;
193
child = (kvm_pte_t *)__va(PHYSADDR(*dir));
194
entry = kvm_pgtable_offset(ctx, child, addr);
195
do {
196
next = addr + (0x1UL << ctx->pgtable_shift);
197
if (!kvm_pte_present(ctx, entry))
198
continue;
199
200
ret |= ctx->ops(entry, addr, ctx);
201
} while (entry++, addr = next, addr < end);
202
203
if (kvm_need_flush(ctx)) {
204
size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
205
if (start + size == end) {
206
list = (struct list_head *)child;
207
list_add_tail(list, &ctx->list);
208
*dir = ctx->invalid_ptes[ctx->level + 1];
209
}
210
}
211
212
return ret;
213
}
214
215
/*
216
* Page walker for VM shadow mmu at page table dir level
217
*/
218
static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
219
{
220
int ret;
221
phys_addr_t next, start, size;
222
struct list_head *list;
223
kvm_pte_t *entry, *child;
224
225
ret = 0;
226
start = addr;
227
child = (kvm_pte_t *)__va(PHYSADDR(*dir));
228
entry = kvm_pgtable_offset(ctx, child, addr);
229
do {
230
next = kvm_pgtable_addr_end(ctx, addr, end);
231
if (!kvm_pte_present(ctx, entry))
232
continue;
233
234
if (kvm_pte_huge(*entry)) {
235
ret |= ctx->ops(entry, addr, ctx);
236
continue;
237
}
238
239
kvm_ptw_enter(ctx);
240
if (ctx->level == 0)
241
ret |= kvm_ptw_leaf(entry, addr, next, ctx);
242
else
243
ret |= kvm_ptw_dir(entry, addr, next, ctx);
244
kvm_ptw_exit(ctx);
245
} while (entry++, addr = next, addr < end);
246
247
if (kvm_need_flush(ctx)) {
248
size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
249
if (start + size == end) {
250
list = (struct list_head *)child;
251
list_add_tail(list, &ctx->list);
252
*dir = ctx->invalid_ptes[ctx->level + 1];
253
}
254
}
255
256
return ret;
257
}
258
259
/*
260
* Page walker for VM shadow mmu at page root table
261
*/
262
static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
263
{
264
int ret;
265
phys_addr_t next;
266
kvm_pte_t *entry;
267
268
ret = 0;
269
entry = kvm_pgtable_offset(ctx, dir, addr);
270
do {
271
next = kvm_pgtable_addr_end(ctx, addr, end);
272
if (!kvm_pte_present(ctx, entry))
273
continue;
274
275
kvm_ptw_enter(ctx);
276
ret |= kvm_ptw_dir(entry, addr, next, ctx);
277
kvm_ptw_exit(ctx);
278
} while (entry++, addr = next, addr < end);
279
280
return ret;
281
}
282
283
/*
284
* kvm_flush_range() - Flush a range of guest physical addresses.
285
* @kvm: KVM pointer.
286
* @start_gfn: Guest frame number of first page in GPA range to flush.
287
* @end_gfn: Guest frame number of last page in GPA range to flush.
288
* @lock: Whether to hold mmu_lock or not
289
*
290
* Flushes a range of GPA mappings from the GPA page tables.
291
*/
292
static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock)
293
{
294
int ret;
295
kvm_ptw_ctx ctx;
296
struct list_head *pos, *temp;
297
298
ctx.ops = kvm_flush_pte;
299
ctx.flag = _KVM_FLUSH_PGTABLE;
300
kvm_ptw_prepare(kvm, &ctx);
301
INIT_LIST_HEAD(&ctx.list);
302
303
if (lock) {
304
spin_lock(&kvm->mmu_lock);
305
ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
306
end_gfn << PAGE_SHIFT, &ctx);
307
spin_unlock(&kvm->mmu_lock);
308
} else
309
ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
310
end_gfn << PAGE_SHIFT, &ctx);
311
312
/* Flush vpid for each vCPU individually */
313
if (ret)
314
kvm_flush_remote_tlbs(kvm);
315
316
/*
317
* free pte table page after mmu_lock
318
* the pte table page is linked together with ctx.list
319
*/
320
list_for_each_safe(pos, temp, &ctx.list) {
321
list_del(pos);
322
free_page((unsigned long)pos);
323
}
324
}
325
326
/*
327
* kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
328
* @kvm: KVM pointer.
329
* @start_gfn: Guest frame number of first page in GPA range to flush.
330
* @end_gfn: Guest frame number of last page in GPA range to flush.
331
*
332
* Make a range of GPA mappings clean so that guest writes will fault and
333
* trigger dirty page logging.
334
*
335
* The caller must hold the @kvm->mmu_lock spinlock.
336
*
337
* Returns: Whether any GPA mappings were modified, which would require
338
* derived mappings (GVA page tables & TLB enties) to be
339
* invalidated.
340
*/
341
static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
342
{
343
kvm_ptw_ctx ctx;
344
345
ctx.ops = kvm_mkclean_pte;
346
ctx.flag = 0;
347
kvm_ptw_prepare(kvm, &ctx);
348
return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx);
349
}
350
351
/*
352
* kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
353
* @kvm: The KVM pointer
354
* @slot: The memory slot associated with mask
355
* @gfn_offset: The gfn offset in memory slot
356
* @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
357
* slot to be write protected
358
*
359
* Walks bits set in mask write protects the associated pte's. Caller must
360
* acquire @kvm->mmu_lock.
361
*/
362
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
363
struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask)
364
{
365
kvm_ptw_ctx ctx;
366
gfn_t base_gfn = slot->base_gfn + gfn_offset;
367
gfn_t start = base_gfn + __ffs(mask);
368
gfn_t end = base_gfn + __fls(mask) + 1;
369
370
ctx.ops = kvm_mkclean_pte;
371
ctx.flag = _KVM_HAS_PGMASK;
372
ctx.mask = mask;
373
ctx.gfn = base_gfn;
374
kvm_ptw_prepare(kvm, &ctx);
375
376
kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx);
377
}
378
379
int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,
380
struct kvm_memory_slot *new, enum kvm_mr_change change)
381
{
382
gpa_t gpa_start;
383
hva_t hva_start;
384
size_t size, gpa_offset, hva_offset;
385
386
if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE))
387
return 0;
388
/*
389
* Prevent userspace from creating a memory region outside of the
390
* VM GPA address space
391
*/
392
if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> PAGE_SHIFT))
393
return -ENOMEM;
394
395
new->arch.flags = 0;
396
size = new->npages * PAGE_SIZE;
397
gpa_start = new->base_gfn << PAGE_SHIFT;
398
hva_start = new->userspace_addr;
399
if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE)
400
&& IS_ALIGNED(hva_start, PMD_SIZE))
401
new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE;
402
else {
403
/*
404
* Pages belonging to memslots that don't have the same
405
* alignment within a PMD for userspace and GPA cannot be
406
* mapped with PMD entries, because we'll end up mapping
407
* the wrong pages.
408
*
409
* Consider a layout like the following:
410
*
411
* memslot->userspace_addr:
412
* +-----+--------------------+--------------------+---+
413
* |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
414
* +-----+--------------------+--------------------+---+
415
*
416
* memslot->base_gfn << PAGE_SIZE:
417
* +---+--------------------+--------------------+-----+
418
* |abc|def Stage-2 block | Stage-2 block |tvxyz|
419
* +---+--------------------+--------------------+-----+
420
*
421
* If we create those stage-2 blocks, we'll end up with this
422
* incorrect mapping:
423
* d -> f
424
* e -> g
425
* f -> h
426
*/
427
gpa_offset = gpa_start & (PMD_SIZE - 1);
428
hva_offset = hva_start & (PMD_SIZE - 1);
429
if (gpa_offset != hva_offset) {
430
new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
431
} else {
432
if (gpa_offset == 0)
433
gpa_offset = PMD_SIZE;
434
if ((size + gpa_offset) < (PMD_SIZE * 2))
435
new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
436
}
437
}
438
439
return 0;
440
}
441
442
void kvm_arch_commit_memory_region(struct kvm *kvm,
443
struct kvm_memory_slot *old,
444
const struct kvm_memory_slot *new,
445
enum kvm_mr_change change)
446
{
447
int needs_flush;
448
u32 old_flags = old ? old->flags : 0;
449
u32 new_flags = new ? new->flags : 0;
450
bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
451
452
/* Only track memslot flags changed */
453
if (change != KVM_MR_FLAGS_ONLY)
454
return;
455
456
/* Discard dirty page tracking on readonly memslot */
457
if ((old_flags & new_flags) & KVM_MEM_READONLY)
458
return;
459
460
/*
461
* If dirty page logging is enabled, write protect all pages in the slot
462
* ready for dirty logging.
463
*
464
* There is no need to do this in any of the following cases:
465
* CREATE: No dirty mappings will already exist.
466
* MOVE/DELETE: The old mappings will already have been cleaned up by
467
* kvm_arch_flush_shadow_memslot()
468
*/
469
if (!(old_flags & KVM_MEM_LOG_DIRTY_PAGES) && log_dirty_pages) {
470
/*
471
* Initially-all-set does not require write protecting any page
472
* because they're all assumed to be dirty.
473
*/
474
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
475
return;
476
477
spin_lock(&kvm->mmu_lock);
478
/* Write protect GPA page table entries */
479
needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
480
new->base_gfn + new->npages);
481
spin_unlock(&kvm->mmu_lock);
482
if (needs_flush)
483
kvm_flush_remote_tlbs(kvm);
484
}
485
}
486
487
void kvm_arch_flush_shadow_all(struct kvm *kvm)
488
{
489
kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0);
490
}
491
492
void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
493
{
494
/*
495
* The slot has been made invalid (ready for moving or deletion), so we
496
* need to ensure that it can no longer be accessed by any guest vCPUs.
497
*/
498
kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1);
499
}
500
501
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
502
{
503
kvm_ptw_ctx ctx;
504
505
ctx.flag = 0;
506
ctx.ops = kvm_flush_pte;
507
kvm_ptw_prepare(kvm, &ctx);
508
INIT_LIST_HEAD(&ctx.list);
509
510
return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
511
range->end << PAGE_SHIFT, &ctx);
512
}
513
514
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
515
{
516
kvm_ptw_ctx ctx;
517
518
ctx.flag = 0;
519
ctx.ops = kvm_mkold_pte;
520
kvm_ptw_prepare(kvm, &ctx);
521
522
return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
523
range->end << PAGE_SHIFT, &ctx);
524
}
525
526
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
527
{
528
gpa_t gpa = range->start << PAGE_SHIFT;
529
kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
530
531
if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep))
532
return true;
533
534
return false;
535
}
536
537
/*
538
* kvm_map_page_fast() - Fast path GPA fault handler.
539
* @vcpu: vCPU pointer.
540
* @gpa: Guest physical address of fault.
541
* @write: Whether the fault was due to a write.
542
*
543
* Perform fast path GPA fault handling, doing all that can be done without
544
* calling into KVM. This handles marking old pages young (for idle page
545
* tracking), and dirtying of clean pages (for dirty page logging).
546
*
547
* Returns: 0 on success, in which case we can update derived mappings and
548
* resume guest execution.
549
* -EFAULT on failure due to absent GPA mapping or write to
550
* read-only page, in which case KVM must be consulted.
551
*/
552
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
553
{
554
int ret = 0;
555
kvm_pte_t *ptep, changed, new;
556
gfn_t gfn = gpa >> PAGE_SHIFT;
557
struct kvm *kvm = vcpu->kvm;
558
struct kvm_memory_slot *slot;
559
560
spin_lock(&kvm->mmu_lock);
561
562
/* Fast path - just check GPA page table for an existing entry */
563
ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
564
if (!ptep || !kvm_pte_present(NULL, ptep)) {
565
ret = -EFAULT;
566
goto out;
567
}
568
569
/* Track access to pages marked old */
570
new = kvm_pte_mkyoung(*ptep);
571
if (write && !kvm_pte_dirty(new)) {
572
if (!kvm_pte_write(new)) {
573
ret = -EFAULT;
574
goto out;
575
}
576
577
if (kvm_pte_huge(new)) {
578
/*
579
* Do not set write permission when dirty logging is
580
* enabled for HugePages
581
*/
582
slot = gfn_to_memslot(kvm, gfn);
583
if (kvm_slot_dirty_track_enabled(slot)) {
584
ret = -EFAULT;
585
goto out;
586
}
587
}
588
589
/* Track dirtying of writeable pages */
590
new = kvm_pte_mkdirty(new);
591
}
592
593
changed = new ^ (*ptep);
594
if (changed)
595
kvm_set_pte(ptep, new);
596
597
spin_unlock(&kvm->mmu_lock);
598
599
if (kvm_pte_dirty(changed))
600
mark_page_dirty(kvm, gfn);
601
602
return ret;
603
out:
604
spin_unlock(&kvm->mmu_lock);
605
return ret;
606
}
607
608
/*
 * Decide whether a fault at @hva inside @memslot may be served with a huge
 * (PMD sized) mapping.
 *
 * Returns: false for a write fault on a slot with dirty tracking enabled,
 *          true for a slot flagged huge page capable, false for a slot
 *          flagged incapable; otherwise true only if @hva lies within the
 *          PMD aligned interior of the slot's userspace range.
 */
static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
				unsigned long hva, bool write)
{
	hva_t uaddr_start, uaddr_end;
	hva_t block_start, block_end;

	/* Disable dirty logging on HugePages */
	if (kvm_slot_dirty_track_enabled(memslot) && write)
		return false;

	if (kvm_hugepage_capable(memslot))
		return true;

	if (kvm_hugepage_incapable(memslot))
		return false;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + memslot->npages * PAGE_SIZE;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view in kvm_arch_prepare_memory_region() containing pages
	 * 'abcde' and 'xyz', respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check in kvm_arch_prepare_memory_region()) and equally sized.
	 */
	block_start = ALIGN(uaddr_start, PMD_SIZE);
	block_end = ALIGN_DOWN(uaddr_end, PMD_SIZE);

	return (hva >= block_start) && (hva < block_end);
}
640
641
/*
642
* Lookup the mapping level for @gfn in the current mm.
643
*
644
* WARNING! Use of host_pfn_mapping_level() requires the caller and the end
645
* consumer to be tied into KVM's handlers for MMU notifier events!
646
*
647
* There are several ways to safely use this helper:
648
*
649
* - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
650
* consuming it. In this case, mmu_lock doesn't need to be held during the
651
* lookup, but it does need to be held while checking the MMU notifier.
652
*
653
* - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
654
* event for the hva. This can be done by explicit checking the MMU notifier
655
* or by ensuring that KVM already has a valid mapping that covers the hva.
656
*
657
* - Do not use the result to install new mappings, e.g. use the host mapping
658
* level only to decide whether or not to zap an entry. In this case, it's
659
* not required to hold mmu_lock (though it's highly likely the caller will
660
* want to hold mmu_lock anyways, e.g. to modify SPTEs).
661
*
662
* Note! The lookup can still race with modifications to host page tables, but
663
* the above "rules" ensure KVM will not _consume_ the result of the walk if a
664
* race with the primary MMU occurs.
665
*/
666
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
667
const struct kvm_memory_slot *slot)
668
{
669
int level = 0;
670
unsigned long hva;
671
unsigned long flags;
672
pgd_t pgd;
673
p4d_t p4d;
674
pud_t pud;
675
pmd_t pmd;
676
677
/*
678
* Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
679
* is not solely for performance, it's also necessary to avoid the
680
* "writable" check in __gfn_to_hva_many(), which will always fail on
681
* read-only memslots due to gfn_to_hva() assuming writes. Earlier
682
* page fault steps have already verified the guest isn't writing a
683
* read-only memslot.
684
*/
685
hva = __gfn_to_hva_memslot(slot, gfn);
686
687
/*
688
* Disable IRQs to prevent concurrent tear down of host page tables,
689
* e.g. if the primary MMU promotes a P*D to a huge page and then frees
690
* the original page table.
691
*/
692
local_irq_save(flags);
693
694
/*
695
* Read each entry once. As above, a non-leaf entry can be promoted to
696
* a huge page _during_ this walk. Re-reading the entry could send the
697
* walk into the weeks, e.g. p*d_leaf() returns false (sees the old
698
* value) and then p*d_offset() walks into the target huge page instead
699
* of the old page table (sees the new value).
700
*/
701
pgd = pgdp_get(pgd_offset(kvm->mm, hva));
702
if (pgd_none(pgd))
703
goto out;
704
705
p4d = p4dp_get(p4d_offset(&pgd, hva));
706
if (p4d_none(p4d) || !p4d_present(p4d))
707
goto out;
708
709
pud = pudp_get(pud_offset(&p4d, hva));
710
if (pud_none(pud) || !pud_present(pud))
711
goto out;
712
713
pmd = pmdp_get(pmd_offset(&pud, hva));
714
if (pmd_none(pmd) || !pmd_present(pmd))
715
goto out;
716
717
if (kvm_pte_huge(pmd_val(pmd)))
718
level = 1;
719
720
out:
721
local_irq_restore(flags);
722
return level;
723
}
724
725
/*
726
* Split huge page
727
*/
728
static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn)
729
{
730
int i;
731
kvm_pte_t val, *child;
732
struct kvm *kvm = vcpu->kvm;
733
struct kvm_mmu_memory_cache *memcache;
734
735
memcache = &vcpu->arch.mmu_page_cache;
736
child = kvm_mmu_memory_cache_alloc(memcache);
737
val = kvm_pte_mksmall(*ptep);
738
for (i = 0; i < PTRS_PER_PTE; i++) {
739
kvm_set_pte(child + i, val);
740
val += PAGE_SIZE;
741
}
742
743
smp_wmb(); /* Make pte visible before pmd */
744
/* The later kvm_flush_tlb_gpa() will flush hugepage tlb */
745
kvm_set_pte(ptep, __pa(child));
746
747
kvm->stat.hugepages--;
748
kvm->stat.pages += PTRS_PER_PTE;
749
750
return child + (gfn & (PTRS_PER_PTE - 1));
751
}
752
753
/*
754
* kvm_map_page() - Map a guest physical page.
755
* @vcpu: vCPU pointer.
756
* @gpa: Guest physical address of fault.
757
* @write: Whether the fault was due to a write.
758
*
759
* Handle GPA faults by creating a new GPA mapping (or updating an existing
760
* one).
761
*
762
* This takes care of marking pages young or dirty (idle/dirty page tracking),
763
* asking KVM for the corresponding PFN, and creating a mapping in the GPA page
764
* tables. Derived mappings (GVA page tables and TLBs) must be handled by the
765
* caller.
766
*
767
* Returns: 0 on success
768
* -EFAULT if there is no memory region at @gpa or a write was
769
* attempted to a read-only memory region. This is usually handled
770
* as an MMIO access.
771
*/
772
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
773
{
774
bool writeable;
775
int srcu_idx, err, retry_no = 0, level;
776
unsigned long hva, mmu_seq, prot_bits;
777
kvm_pfn_t pfn;
778
kvm_pte_t *ptep, new_pte;
779
gfn_t gfn = gpa >> PAGE_SHIFT;
780
struct kvm *kvm = vcpu->kvm;
781
struct kvm_memory_slot *memslot;
782
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
783
struct page *page;
784
785
/* Try the fast path to handle old / clean pages */
786
srcu_idx = srcu_read_lock(&kvm->srcu);
787
err = kvm_map_page_fast(vcpu, gpa, write);
788
if (!err)
789
goto out;
790
791
memslot = gfn_to_memslot(kvm, gfn);
792
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
793
if (kvm_is_error_hva(hva) || (write && !writeable)) {
794
err = -EFAULT;
795
goto out;
796
}
797
798
/* We need a minimum of cached pages ready for page table creation */
799
err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
800
if (err)
801
goto out;
802
803
retry:
804
/*
805
* Used to check for invalidations in progress, of the pfn that is
806
* returned by pfn_to_pfn_prot below.
807
*/
808
mmu_seq = kvm->mmu_invalidate_seq;
809
/*
810
* Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in
811
* kvm_faultin_pfn() (which calls get_user_pages()), so that we don't
812
* risk the page we get a reference to getting unmapped before we have a
813
* chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
814
*
815
* This smp_rmb() pairs with the effective smp_wmb() of the combination
816
* of the pte_unmap_unlock() after the PTE is zapped, and the
817
* spin_lock() in kvm_mmu_invalidate_invalidate_<page|range_end>() before
818
* mmu_invalidate_seq is incremented.
819
*/
820
smp_rmb();
821
822
/* Slow path - ask KVM core whether we can access this GPA */
823
pfn = kvm_faultin_pfn(vcpu, gfn, write, &writeable, &page);
824
if (is_error_noslot_pfn(pfn)) {
825
err = -EFAULT;
826
goto out;
827
}
828
829
/* Check if an invalidation has taken place since we got pfn */
830
spin_lock(&kvm->mmu_lock);
831
if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
832
/*
833
* This can happen when mappings are changed asynchronously, but
834
* also synchronously if a COW is triggered by
835
* kvm_faultin_pfn().
836
*/
837
spin_unlock(&kvm->mmu_lock);
838
kvm_release_page_unused(page);
839
if (retry_no > 100) {
840
retry_no = 0;
841
schedule();
842
}
843
retry_no++;
844
goto retry;
845
}
846
847
/*
848
* For emulated devices such virtio device, actual cache attribute is
849
* determined by physical machine.
850
* For pass through physical device, it should be uncachable
851
*/
852
prot_bits = _PAGE_PRESENT | __READABLE;
853
if (pfn_valid(pfn))
854
prot_bits |= _CACHE_CC;
855
else
856
prot_bits |= _CACHE_SUC;
857
858
if (writeable) {
859
prot_bits |= _PAGE_WRITE;
860
if (write)
861
prot_bits |= __WRITEABLE;
862
}
863
864
/* Disable dirty logging on HugePages */
865
level = 0;
866
if (fault_supports_huge_mapping(memslot, hva, write)) {
867
/* Check page level about host mmu*/
868
level = host_pfn_mapping_level(kvm, gfn, memslot);
869
if (level == 1) {
870
/*
871
* Check page level about secondary mmu
872
* Disable hugepage if it is normal page on
873
* secondary mmu already
874
*/
875
ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
876
if (ptep && !kvm_pte_huge(*ptep))
877
level = 0;
878
}
879
880
if (level == 1) {
881
gfn = gfn & ~(PTRS_PER_PTE - 1);
882
pfn = pfn & ~(PTRS_PER_PTE - 1);
883
}
884
}
885
886
/* Ensure page tables are allocated */
887
ptep = kvm_populate_gpa(kvm, memcache, gpa, level);
888
new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits));
889
if (level == 1) {
890
new_pte = kvm_pte_mkhuge(new_pte);
891
/*
892
* previous pmd entry is invalid_pte_table
893
* there is invalid tlb with small page
894
* need flush these invalid tlbs for current vcpu
895
*/
896
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
897
++kvm->stat.hugepages;
898
} else if (kvm_pte_huge(*ptep) && write)
899
ptep = kvm_split_huge(vcpu, ptep, gfn);
900
else
901
++kvm->stat.pages;
902
kvm_set_pte(ptep, new_pte);
903
904
kvm_release_faultin_page(kvm, page, false, writeable);
905
spin_unlock(&kvm->mmu_lock);
906
907
if (prot_bits & _PAGE_DIRTY)
908
mark_page_dirty_in_slot(kvm, memslot, gfn);
909
910
out:
911
srcu_read_unlock(&kvm->srcu, srcu_idx);
912
return err;
913
}
914
915
/*
 * Handle a guest memory fault at @gpa: map the page, then request
 * invalidation of the faulting GPA in the TLB when needed.
 *
 * Returns: 0 on success, otherwise the error from kvm_map_page().
 */
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write, int ecode)
{
	int ret = kvm_map_page(vcpu, gpa, write);

	if (ret)
		return ret;

	/*
	 * Invalidate this entry in the TLB, unless it cannot be stale:
	 *
	 * With HW PTW, invalid TLB is not added when page fault. But
	 * for EXCCODE_TLBM exception, stale TLB may exist because of
	 * the last read access.
	 *
	 * With SW PTW, invalid TLB is added in TLB refill exception.
	 */
	if (cpu_has_ptw && (ecode != EXCCODE_TLBM))
		return 0;

	vcpu->arch.flush_gpa = gpa;
	kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu);

	return 0;
}
938
939
/* Arch hook for syncing the dirty log of @memslot; intentionally a no-op. */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
	/* Nothing to do */
}
942
943
/*
 * Flush remote TLBs on behalf of @memslot. The flush is not narrowed to the
 * slot: @memslot is unused and the whole VM is flushed.
 */
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	/* VM-wide flush */
	kvm_flush_remote_tlbs(kvm);
}
948
949