GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/kvm/book3s_64_mmu_radix.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <[email protected]>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include "book3s_hv.h"
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/plpar_wrappers.h>
#include <asm/firmware.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
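/*
 * The array below is indexed by tree level, lowest level first: level 0
 * may use 5 bits (64k pages) or 9 bits (4k pages), levels 1 and 2 use
 * 9 bits, and the root level uses 13 bits, giving a 52-bit address space.
 */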
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int old_pid, old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	if (kvmhv_is_nestedv2())
		return H_UNSUPPORTED;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  (to != NULL) ? __pa(to): 0,
					  (from != NULL) ? __pa(from): 0, n);

	if (eaddr & (0xFFFUL << 52))
		return ret;

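	/*
	 * The top two bits of the effective address select the quadrant:
	 * quadrant 1 is translated as the guest process (current LPID and
	 * PID), quadrant 2 as the guest kernel (current LPID, PID 0).
	 */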
	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
	isync();

	pagefault_disable();
	if (is_load)
		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
	else
		ret = __copy_to_user_inatomic((void __user *)to, from, n);
	pagefault_enable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}

static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid;

	/* This would cause a data segment intr so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested lpid */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;
	else
		pid = kvmppc_get_pid(vcpu);

	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
				 unsigned long n)
{
	long ret;

	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
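	/*
	 * A positive return value is the number of bytes not copied;
	 * zero that uncopied tail.
	 */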
	if (ret > 0)
		memset(to + (n - ret), 0, ret);

	return ret;
}

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
			       unsigned long n)
{
	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}

int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

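	/*
	 * The radix tree size (RTS) field is split across two parts of the
	 * root entry; the tree translates RTS + 31 bits of effective address.
	 */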
	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

	offset = rts + 31;

	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));

		kvm_vcpu_srcu_read_lock(vcpu);
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		kvm_vcpu_srcu_read_unlock(vcpu);
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	kvm_vcpu_srcu_read_lock(vcpu);
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	kvm_vcpu_srcu_read_unlock(vcpu);
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = kvmppc_get_pid(vcpu);
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (kvmppc_get_amr_hv(vcpu) & (1ul << 62))
				gpte->may_read = 0;
			if (kvmppc_get_amr_hv(vcpu) & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			     unsigned int pshift, u64 lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

	psi = shift_to_mmu_psize(pshift);

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
					lpid, rb);
	} else {
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_TLB,
					    psize_to_rpti_pgsize(psi),
					    addr, addr + psize);
	}

	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

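/*
 * Invalidate the partition-scoped page walk cache (PWC) for this LPID so
 * that page table pages which have been unhooked from the tree can be
 * freed safely.
 */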
static void kvmppc_radix_flush_pwc(struct kvm *kvm, u64 lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
					lpid, TLBIEL_INVAL_SET_LPID);
	else
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
					    0, -1UL);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

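/*
 * Atomically clear and set bits in a partition-scoped PTE; returns the
 * old PTE value.
 */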
static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
					     unsigned long clr, unsigned long set,
					     unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
				    pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	pte_t *pte;

	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
	/* pmd_populate() will only reference _pa(pte). */
	kmemleak_ignore(pte);

	return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	pmd_t *pmd;

	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
	/* pud_populate() will only reference _pa(pmd). */
	kmemleak_ignore(pmd);

	return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift,
		      const struct kvm_memory_slot *memslot,
		      u64 lpid)

{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift) { /* 1GB or 2MB page */
		page_size = 1ul << shift;
		if (shift == PMD_SHIFT)
			kvm->stat.num_2M_pages--;
		else if (shift == PUD_SHIFT)
			kvm->stat.num_1G_pages--;
	}

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of page fault path
 * (full == false), valid ptes are generally not expected; however, there
 * is one situation where they arise, which is when dirty page logging is
 * turned off for a memslot while the VM is running. The new memslot
 * becomes visible to page faults before the memslot commit function
 * gets to flush the memslot, which can lead to a 2MB page mapping being
 * installed for a guest physical address where there are already 64kB
 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  u64 lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  u64 lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
						 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
						 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_kernel(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  u64 lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_leaf(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, u64 lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		p4d_t *p4d = p4d_offset(pgd, 0);
		pud_t *pud;

		if (!p4d_present(*p4d))
			continue;
		pud = pud_offset(p4d, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		p4d_clear(p4d);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					      unsigned long gpa, u64 lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page will no longer be cached by the MMU, so it can be freed
	 * without flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					      unsigned long gpa, u64 lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any children pte pages will no longer be cached by the
	 * MMU, so they can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry. RC bits, in the course of cleaning and
 * aging. And the write bit can change, either the access could have been
 * upgraded, or a read fault could happen concurrently with a write fault
 * that sets those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

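/*
 * Insert a PTE into the partition-scoped tree at the given level:
 * level 0 is a small (4k/64k) page, level 1 a 2MB page, level 2 a 1GB page.
 */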
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, u64 lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	p4d = p4d_offset(pgd, gpa);

	pud = NULL;
	if (p4d_present(*p4d))
		pud = pud_offset(p4d, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_leaf(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_invalidate_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (p4d_none(*p4d)) {
		if (!new_pud)
			goto out_unlock;
		p4d_populate(kvm->mm, p4d, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(p4d, gpa);
	if (pud_leaf(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
				     PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
						0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
				     PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
						0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
			     PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}

bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
			     unsigned long gpa, u64 lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;

	if (nested)
		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
	else
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);

	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;
	kvm_pfn_t pfn;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	hva = gfn_to_hva_memslot(memslot, gfn);
	pfn = __kvm_faultin_pfn(memslot, gfn, writing ? FOLL_WRITE : 0,
				&upgrade_write, &page);
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	pte = __pte(0);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!pte_present(pte)) {
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

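	/*
	 * A 2MB or 1GB mapping is usable only if the host PTE is that size
	 * and gpa is congruent to hva modulo the large page size.
	 */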
	/* Get pte level from shift/size */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}

int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu,
				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
				ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return kvmppc_send_page_to_uv(kvm, gfn);

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
					ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
					    gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}

/* Called with kvm->mmu_lock held */
void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		     unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
		return;
	}

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
}

/* Called with kvm->mmu_lock held */
bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		   unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	bool ref = false;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					      gpa, shift);
		/* XXX need to flush tlb here? */
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		ref = true;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			unsigned long gfn)

{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	bool ref = false;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = true;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				      struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep, pte;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ret;

	/*
	 * For performance reasons we don't hold kvm->mmu_lock while walking the
	 * partition scoped table.
	 */
	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
	if (!ptep)
		return 0;

	pte = READ_ONCE(*ptep);
	if (pte_present(pte) && pte_dirty(pte)) {
		spin_lock(&kvm->mmu_lock);
		/*
		 * Recheck the pte again
		 */
		if (pte_val(pte) != pte_val(*ptep)) {
			/*
			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
			 * only find PAGE_SIZE pte entries here. We can continue
			 * to use the pte addr returned by above page table
			 * walk.
			 */
			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
				spin_unlock(&kvm->mmu_lock);
				return 0;
			}
		}

		ret = 1;
		VM_BUG_ON(shift);
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					      gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		spin_unlock(&kvm->mmu_lock);
	}
	return ret;
}

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i, j;
	int npages;

	for (i = 0; i < memslot->npages; i = j) {
		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since huge pages are only used to back the guest at guest
		 * real addresses that are a multiple of their size.
		 * Since we have at most one PTE covering any given guest
		 * real address, if npages > 1 we can skip to i + npages.
		 */
		j = i + 1;
		if (npages) {
			set_dirty_bits(map, i, npages);
			j = i + npages;
		}
	}
	return 0;
}

void kvmppc_radix_flush_memslot(struct kvm *kvm,
				const struct kvm_memory_slot *memslot)
{
	unsigned long n;
	pte_t *ptep;
	unsigned long gpa;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
		kvmppc_uvmem_drop_pages(memslot, kvm, true);

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return;

	gpa = memslot->base_gfn << PAGE_SHIFT;
	spin_lock(&kvm->mmu_lock);
	for (n = memslot->npages; n; --n) {
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
		if (ptep && pte_present(*ptep))
			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
					 kvm->arch.lpid);
		gpa += PAGE_SIZE;
	}
	/*
	 * Increase the mmu notifier sequence number to prevent any page
	 * fault that read the memslot earlier from writing a PTE.
	 */
	kvm->mmu_invalidate_seq++;
	spin_unlock(&kvm->mmu_lock);
}

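/*
 * Each ap_encodings entry packs the base page shift in the low bits and
 * the corresponding AP (actual page size) encoding at bits 31:29.
 */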
static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
				 int psize, int *indexp)
{
	if (!mmu_psize_defs[psize].shift)
		return;
	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
		(mmu_psize_defs[psize].ap << 29);
	++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
	int i;

	if (!radix_enabled())
		return -EINVAL;
	memset(info, 0, sizeof(*info));

	/* 4k page size */
	info->geometries[0].page_shift = 12;
	info->geometries[0].level_bits[0] = 9;
	for (i = 1; i < 4; ++i)
		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
	/* 64k page size */
	info->geometries[1].page_shift = 16;
	for (i = 0; i < 4; ++i)
		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

	i = 0;
	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

	return 0;
}

int kvmppc_init_vm_radix(struct kvm *kvm)
{
	kvm->arch.pgtable = pgd_alloc(kvm->mm);
	if (!kvm->arch.pgtable)
		return -ENOMEM;
	return 0;
}

static void pte_ctor(void *addr)
{
	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

struct debugfs_radix_state {
	struct kvm *kvm;
	struct mutex mutex;
	unsigned long gpa;
	int lpid;
	int chars_left;
	int buf_index;
	char buf[128];
	u8 hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_radix_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
	struct debugfs_radix_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
				  size_t len, loff_t *ppos)
{
	struct debugfs_radix_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long n;
	struct kvm *kvm;
	unsigned long gpa;
	pgd_t *pgt;
	struct kvm_nested_guest *nested;
	pgd_t *pgdp;
	p4d_t p4d, *p4dp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ptep;
	int shift;
	unsigned long pte;

	kvm = p->kvm;
	if (!kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	gpa = p->gpa;
	nested = NULL;
	pgt = NULL;
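	/*
	 * Walk the L1 and all nested shadow page tables, emitting one line
	 * per leaf PTE; p->gpa and p->lpid record where to resume on the
	 * next read.
	 */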
	while (len != 0 && p->lpid >= 0) {
		if (gpa >= RADIX_PGTABLE_RANGE) {
			gpa = 0;
			pgt = NULL;
			if (nested) {
				kvmhv_put_nested(nested);
				nested = NULL;
			}
			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			p->hdr = 0;
			if (p->lpid < 0)
				break;
		}
		if (!pgt) {
			if (p->lpid == 0) {
				pgt = kvm->arch.pgtable;
			} else {
				nested = kvmhv_get_nested(kvm, p->lpid, false);
				if (!nested) {
					gpa = RADIX_PGTABLE_RANGE;
					continue;
				}
				pgt = nested->shadow_pgtable;
			}
		}
		n = 0;
		if (!p->hdr) {
			if (p->lpid > 0)
				n = scnprintf(p->buf, sizeof(p->buf),
					      "\nNested LPID %d: ", p->lpid);
			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
				       "pgdir: %lx\n", (unsigned long)pgt);
			p->hdr = 1;
			goto copy;
		}

		pgdp = pgt + pgd_index(gpa);
		p4dp = p4d_offset(pgdp, gpa);
		p4d = READ_ONCE(*p4dp);
		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
			gpa = (gpa & P4D_MASK) + P4D_SIZE;
			continue;
		}

		pudp = pud_offset(&p4d, gpa);
		pud = READ_ONCE(*pudp);
		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			continue;
		}
		if (pud_val(pud) & _PAGE_PTE) {
			pte = pud_val(pud);
			shift = PUD_SHIFT;
			goto leaf;
		}

		pmdp = pmd_offset(&pud, gpa);
		pmd = READ_ONCE(*pmdp);
		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			continue;
		}
		if (pmd_val(pmd) & _PAGE_PTE) {
			pte = pmd_val(pmd);
			shift = PMD_SHIFT;
			goto leaf;
		}

		ptep = pte_offset_kernel(&pmd, gpa);
		pte = pte_val(READ_ONCE(*ptep));
		if (!(pte & _PAGE_PRESENT)) {
			gpa += PAGE_SIZE;
			continue;
		}
		shift = PAGE_SHIFT;
	leaf:
		n = scnprintf(p->buf, sizeof(p->buf),
			      " %lx: %lx %d\n", gpa, pte, shift);
		gpa += 1ul << shift;
	copy:
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			break;
		}
	}
	p->gpa = gpa;
	if (nested)
		kvmhv_put_nested(nested);

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
				   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
	.owner = THIS_MODULE,
	.open = debugfs_radix_open,
	.release = debugfs_radix_release,
	.read = debugfs_radix_read,
	.write = debugfs_radix_write,
	.llseek = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
	debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
			    &debugfs_radix_fops);
}

int kvmppc_radix_init(void)
{
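	/* Each PTE-level table holds 2^RADIX_PTE_INDEX_SIZE pointer-sized entries */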
	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
	if (!kvm_pte_cache)
		return -ENOMEM;

	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
	if (!kvm_pmd_cache) {
		kmem_cache_destroy(kvm_pte_cache);
		return -ENOMEM;
	}

	return 0;
}

void kvmppc_radix_exit(void)
{
	kmem_cache_destroy(kvm_pte_cache);
	kmem_cache_destroy(kvm_pmd_cache);
}