GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/kvm/book3s_64_mmu_hv.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
*
4
* Copyright 2010 Paul Mackerras, IBM Corp. <[email protected]>
5
*/
6
7
#include <linux/types.h>
8
#include <linux/string.h>
9
#include <linux/kvm.h>
10
#include <linux/kvm_host.h>
11
#include <linux/highmem.h>
12
#include <linux/gfp.h>
13
#include <linux/slab.h>
14
#include <linux/hugetlb.h>
15
#include <linux/vmalloc.h>
16
#include <linux/srcu.h>
17
#include <linux/anon_inodes.h>
18
#include <linux/file.h>
19
#include <linux/debugfs.h>
20
21
#include <asm/kvm_ppc.h>
22
#include <asm/kvm_book3s.h>
23
#include <asm/book3s/64/mmu-hash.h>
24
#include <asm/hvcall.h>
25
#include <asm/synch.h>
26
#include <asm/ppc-opcode.h>
27
#include <asm/cputable.h>
28
#include <asm/pte-walk.h>
29
30
#include "book3s.h"
31
#include "book3s_hv.h"
32
#include "trace_hv.h"
33
34
//#define DEBUG_RESIZE_HPT 1
35
36
#ifdef DEBUG_RESIZE_HPT
37
#define resize_hpt_debug(resize, ...) \
38
do { \
39
printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \
40
printk(__VA_ARGS__); \
41
} while (0)
42
#else
43
#define resize_hpt_debug(resize, ...) \
44
do { } while (0)
45
#endif
46
47
static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
48
long pte_index, unsigned long pteh,
49
unsigned long ptel, unsigned long *pte_idx_ret);
50
51
struct kvm_resize_hpt {
52
/* These fields are read-only after init */
53
struct kvm *kvm;
54
struct work_struct work;
55
u32 order;
56
57
/* These fields protected by kvm->arch.mmu_setup_lock */
58
59
/* Possible values and their usage:
60
* <0 an error occurred during allocation,
61
* -EBUSY allocation is in progress,
62
* 0 allocation made successfully.
63
*/
64
int error;
65
66
/* Private to the work thread, until error != -EBUSY,
67
* then protected by kvm->arch.mmu_setup_lock.
68
*/
69
struct kvm_hpt_info hpt;
70
};
71
72
int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
73
{
74
unsigned long hpt = 0;
75
int cma = 0;
76
struct page *page = NULL;
77
struct revmap_entry *rev;
78
unsigned long npte;
79
80
if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
81
return -EINVAL;
82
83
page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
84
if (page) {
85
hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
86
memset((void *)hpt, 0, (1ul << order));
87
cma = 1;
88
}
89
90
if (!hpt)
91
hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
92
|__GFP_NOWARN, order - PAGE_SHIFT);
93
94
if (!hpt)
95
return -ENOMEM;
96
97
/* HPTEs are 2**4 bytes long */
98
npte = 1ul << (order - 4);
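/*
 * e.g. the minimum order of 18 (a 256 KiB table) gives 2^14 = 16384
 * HPTEs, i.e. 2048 groups of 8.
 */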
99
100
/* Allocate reverse map array */
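/*
 * One revmap_entry per HPTE: it caches the guest's view of the HPTE
 * (guest_rpte) and carries the forw/back indices that chain together
 * all HPTEs mapping the same guest page (the rmap chain).
 */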
101
rev = vmalloc(array_size(npte, sizeof(struct revmap_entry)));
102
if (!rev) {
103
if (cma)
104
kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
105
else
106
free_pages(hpt, order - PAGE_SHIFT);
107
return -ENOMEM;
108
}
109
110
info->order = order;
111
info->virt = hpt;
112
info->cma = cma;
113
info->rev = rev;
114
115
return 0;
116
}
117
118
void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
119
{
120
atomic64_set(&kvm->arch.mmio_update, 0);
121
kvm->arch.hpt = *info;
122
kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
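/*
 * The low-order bits of SDR1 hold HTABSIZE = order - 18,
 * since the smallest supported HPT is 2^18 bytes.
 */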
123
124
pr_debug("KVM guest htab at %lx (order %ld), LPID %llx\n",
125
info->virt, (long)info->order, kvm->arch.lpid);
126
}
127
128
int kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
129
{
130
int err = -EBUSY;
131
struct kvm_hpt_info info;
132
133
mutex_lock(&kvm->arch.mmu_setup_lock);
134
if (kvm->arch.mmu_ready) {
135
kvm->arch.mmu_ready = 0;
136
/* order mmu_ready vs. vcpus_running */
137
smp_mb();
138
if (atomic_read(&kvm->arch.vcpus_running)) {
139
kvm->arch.mmu_ready = 1;
140
goto out;
141
}
142
}
143
if (kvm_is_radix(kvm)) {
144
err = kvmppc_switch_mmu_to_hpt(kvm);
145
if (err)
146
goto out;
147
}
148
149
if (kvm->arch.hpt.order == order) {
150
/* We already have a suitable HPT */
151
152
/* Set the entire HPT to 0, i.e. invalid HPTEs */
153
memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
154
/*
155
* Reset all the reverse-mapping chains for all memslots
156
*/
157
kvmppc_rmap_reset(kvm);
158
err = 0;
159
goto out;
160
}
161
162
if (kvm->arch.hpt.virt) {
163
kvmppc_free_hpt(&kvm->arch.hpt);
164
kvmppc_rmap_reset(kvm);
165
}
166
167
err = kvmppc_allocate_hpt(&info, order);
168
if (err < 0)
169
goto out;
170
kvmppc_set_hpt(kvm, &info);
171
172
out:
173
if (err == 0)
174
/* Ensure that each vcpu will flush its TLB on next entry. */
175
cpumask_setall(&kvm->arch.need_tlb_flush);
176
177
mutex_unlock(&kvm->arch.mmu_setup_lock);
178
return err;
179
}
180
181
void kvmppc_free_hpt(struct kvm_hpt_info *info)
182
{
183
vfree(info->rev);
184
info->rev = NULL;
185
if (info->cma)
186
kvm_free_hpt_cma(virt_to_page((void *)info->virt),
187
1 << (info->order - PAGE_SHIFT));
188
else if (info->virt)
189
free_pages(info->virt, info->order - PAGE_SHIFT);
190
info->virt = 0;
191
info->order = 0;
192
}
193
194
/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
195
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
196
{
197
return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
198
}
199
200
/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
201
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
202
{
203
return (pgsize == 0x10000) ? 0x1000 : 0;
204
}
205
206
void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
207
unsigned long porder)
208
{
209
unsigned long i;
210
unsigned long npages;
211
unsigned long hp_v, hp_r;
212
unsigned long addr, hash;
213
unsigned long psize;
214
unsigned long hp0, hp1;
215
unsigned long idx_ret;
216
long ret;
217
struct kvm *kvm = vcpu->kvm;
218
219
psize = 1ul << porder;
220
npages = memslot->npages >> (porder - PAGE_SHIFT);
221
222
/* VRMA can't be > 1TB */
223
if (npages > 1ul << (40 - porder))
224
npages = 1ul << (40 - porder);
225
/* Can't use more than 1 HPTE per HPTEG */
226
if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
227
npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
228
229
hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
230
HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
231
hp1 = hpte1_pgsize_encoding(psize) |
232
HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
233
234
for (i = 0; i < npages; ++i) {
235
addr = i << porder;
236
/* can't use hpt_hash since va > 64 bits */
237
hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
238
& kvmppc_hpt_mask(&kvm->arch.hpt);
239
/*
240
* We assume that the hash table is empty and no
241
* vcpus are using it at this stage. Since we create
242
* at most one HPTE per HPTEG, we just assume entry 7
243
* is available and use it.
244
*/
245
hash = (hash << 3) + 7;
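/* 8 HPTEs per group, so the global HPTE index is (group << 3) + slot */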
246
hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
247
hp_r = hp1 | addr;
248
ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
249
&idx_ret);
250
if (ret != H_SUCCESS) {
251
pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
252
addr, ret);
253
break;
254
}
255
}
256
}
257
258
int kvmppc_mmu_hv_init(void)
259
{
260
unsigned long nr_lpids;
261
262
if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
263
return -EINVAL;
264
265
if (cpu_has_feature(CPU_FTR_HVMODE)) {
266
if (WARN_ON(mfspr(SPRN_LPID) != 0))
267
return -EINVAL;
268
nr_lpids = 1UL << mmu_lpid_bits;
269
} else {
270
nr_lpids = 1UL << KVM_MAX_NESTED_GUESTS_SHIFT;
271
}
272
273
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
274
/* POWER7 has 10-bit LPIDs, POWER8 has 12-bit LPIDs */
275
if (cpu_has_feature(CPU_FTR_ARCH_207S))
276
WARN_ON(nr_lpids != 1UL << 12);
277
else
278
WARN_ON(nr_lpids != 1UL << 10);
279
280
/*
281
* Reserve the last implemented LPID for use in partition
282
* switching for POWER7 and POWER8.
283
*/
284
nr_lpids -= 1;
285
}
286
287
kvmppc_init_lpid(nr_lpids);
288
289
return 0;
290
}
291
292
static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
293
long pte_index, unsigned long pteh,
294
unsigned long ptel, unsigned long *pte_idx_ret)
295
{
296
long ret;
297
298
preempt_disable();
299
ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
300
kvm->mm->pgd, false, pte_idx_ret);
301
preempt_enable();
302
if (ret == H_TOO_HARD) {
303
/* this can't happen */
304
pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
305
ret = H_RESOURCE; /* or something */
306
}
307
return ret;
308
309
}
310
311
static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
312
gva_t eaddr)
313
{
314
u64 mask;
315
int i;
316
317
for (i = 0; i < vcpu->arch.slb_nr; i++) {
318
if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
319
continue;
320
321
if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
322
mask = ESID_MASK_1T;
323
else
324
mask = ESID_MASK;
325
326
if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
327
return &vcpu->arch.slb[i];
328
}
329
return NULL;
330
}
331
332
static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
333
unsigned long ea)
334
{
335
unsigned long ra_mask;
336
337
ra_mask = kvmppc_actual_pgsz(v, r) - 1;
338
return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
339
}
340
341
static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
342
struct kvmppc_pte *gpte, bool data, bool iswrite)
343
{
344
struct kvm *kvm = vcpu->kvm;
345
struct kvmppc_slb *slbe;
346
unsigned long slb_v;
347
unsigned long pp, key;
348
unsigned long v, orig_v, gr;
349
__be64 *hptep;
350
long int index;
351
int virtmode = __kvmppc_get_msr_hv(vcpu) & (data ? MSR_DR : MSR_IR);
352
353
if (kvm_is_radix(vcpu->kvm))
354
return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);
355
356
/* Get SLB entry */
357
if (virtmode) {
358
slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
359
if (!slbe)
360
return -EINVAL;
361
slb_v = slbe->origv;
362
} else {
363
/* real mode access */
364
slb_v = vcpu->kvm->arch.vrma_slb_v;
365
}
366
367
preempt_disable();
368
/* Find the HPTE in the hash table */
369
index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
370
HPTE_V_VALID | HPTE_V_ABSENT);
371
if (index < 0) {
372
preempt_enable();
373
return -ENOENT;
374
}
375
hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
376
v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
377
if (cpu_has_feature(CPU_FTR_ARCH_300))
378
v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
379
gr = kvm->arch.hpt.rev[index].guest_rpte;
380
381
unlock_hpte(hptep, orig_v);
382
preempt_enable();
383
384
gpte->eaddr = eaddr;
385
gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
386
387
/* Get PP bits and key for permission check */
388
pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
389
key = (__kvmppc_get_msr_hv(vcpu) & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
390
key &= slb_v;
391
392
/* Calculate permissions */
393
gpte->may_read = hpte_read_permission(pp, key);
394
gpte->may_write = hpte_write_permission(pp, key);
395
gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
396
397
/* Storage key permission check for POWER7 */
398
if (data && virtmode) {
399
int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
400
if (amrfield & 1)
401
gpte->may_read = 0;
402
if (amrfield & 2)
403
gpte->may_write = 0;
404
}
405
406
/* Get the guest physical address */
407
gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
408
return 0;
409
}
410
411
/*
412
* Quick test for whether an instruction is a load or a store.
413
* If the instruction is a load or a store, then this will indicate
414
* which it is, at least on server processors. (Embedded processors
415
* have some external PID instructions that don't follow the rule
416
* embodied here.) If the instruction isn't a load or store, then
417
* this doesn't return anything useful.
418
*/
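/*
 * For example, lwz (primary opcode 32) and stw (36) differ exactly in
 * the 0x10000000 bit, and the opcode-31 X-form pairs such as lwzx
 * (extended opcode 23) and stwx (151) differ in the 0x100 bit.
 */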
419
static int instruction_is_store(ppc_inst_t instr)
420
{
421
unsigned int mask;
422
unsigned int suffix;
423
424
mask = 0x10000000;
425
suffix = ppc_inst_val(instr);
426
if (ppc_inst_prefixed(instr))
427
suffix = ppc_inst_suffix(instr);
428
else if ((suffix & 0xfc000000) == 0x7c000000)
429
mask = 0x100; /* major opcode 31 */
430
return (suffix & mask) != 0;
431
}
432
433
int kvmppc_hv_emulate_mmio(struct kvm_vcpu *vcpu,
434
unsigned long gpa, gva_t ea, int is_store)
435
{
436
ppc_inst_t last_inst;
437
bool is_prefixed = !!(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
438
439
/*
440
* Fast path - check if the guest physical address corresponds to a
441
* device on the FAST_MMIO_BUS, if so we can avoid loading the
442
* instruction altogether; then we can just handle it and return.
443
*/
444
if (is_store) {
445
int idx, ret;
446
447
idx = srcu_read_lock(&vcpu->kvm->srcu);
448
ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0,
449
NULL);
450
srcu_read_unlock(&vcpu->kvm->srcu, idx);
451
if (!ret) {
452
kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + (is_prefixed ? 8 : 4));
453
return RESUME_GUEST;
454
}
455
}
456
457
/*
458
* If we fail, we just return to the guest and try executing it again.
459
*/
460
if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
461
EMULATE_DONE)
462
return RESUME_GUEST;
463
464
/*
465
* WARNING: We do not know for sure whether the instruction we just
466
* read from memory is the same one that caused the fault in the first
467
* place.
468
*
469
* If the fault is prefixed but the instruction is not or vice
470
* versa, try again so that we don't advance pc the wrong amount.
471
*/
472
if (ppc_inst_prefixed(last_inst) != is_prefixed)
473
return RESUME_GUEST;
474
475
/*
476
* If the instruction we read is neither a load nor a store,
477
* then it can't access memory, so we don't need to worry about
478
* enforcing access permissions. So, assuming it is a load or
479
* store, we just check that its direction (load or store) is
480
* consistent with the original fault, since that's what we
481
* checked the access permissions against. If there is a mismatch
482
* we just return and retry the instruction.
483
*/
484
485
if (instruction_is_store(last_inst) != !!is_store)
486
return RESUME_GUEST;
487
488
/*
489
* Emulated accesses are emulated by looking at the hash for
490
* translation once, then performing the access later. The
491
* translation could be invalidated in the meantime, at which
492
* point performing the subsequent memory access on the old
493
* physical address could possibly be a security hole for the
494
* guest (but not the host).
495
*
496
* This is less of an issue for MMIO stores since they aren't
497
* globally visible. It could be an issue for MMIO loads to
498
* a certain extent but we'll ignore it for now.
499
*/
500
501
vcpu->arch.paddr_accessed = gpa;
502
vcpu->arch.vaddr_accessed = ea;
503
return kvmppc_emulate_mmio(vcpu);
504
}
505
506
int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
507
unsigned long ea, unsigned long dsisr)
508
{
509
struct kvm *kvm = vcpu->kvm;
510
unsigned long hpte[3], r;
511
unsigned long hnow_v, hnow_r;
512
__be64 *hptep;
513
unsigned long mmu_seq, psize, pte_size;
514
unsigned long gpa_base, gfn_base;
515
unsigned long gpa, gfn, hva, pfn, hpa;
516
struct kvm_memory_slot *memslot;
517
unsigned long *rmap;
518
struct revmap_entry *rev;
519
struct page *page;
520
long index, ret;
521
bool is_ci;
522
bool writing, write_ok;
523
unsigned int shift;
524
unsigned long rcbits;
525
long mmio_update;
526
pte_t pte, *ptep;
527
528
if (kvm_is_radix(kvm))
529
return kvmppc_book3s_radix_page_fault(vcpu, ea, dsisr);
530
531
/*
532
* Real-mode code has already searched the HPT and found the
533
* entry we're interested in. Lock the entry and check that
534
* it hasn't changed. If it has, just return and re-execute the
535
* instruction.
536
*/
537
if (ea != vcpu->arch.pgfault_addr)
538
return RESUME_GUEST;
539
540
if (vcpu->arch.pgfault_cache) {
541
mmio_update = atomic64_read(&kvm->arch.mmio_update);
542
if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
543
r = vcpu->arch.pgfault_cache->rpte;
544
psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
545
r);
546
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
547
gfn_base = gpa_base >> PAGE_SHIFT;
548
gpa = gpa_base | (ea & (psize - 1));
549
return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
550
dsisr & DSISR_ISSTORE);
551
}
552
}
553
index = vcpu->arch.pgfault_index;
554
hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
555
rev = &kvm->arch.hpt.rev[index];
556
preempt_disable();
557
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
558
cpu_relax();
559
hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
560
hpte[1] = be64_to_cpu(hptep[1]);
561
hpte[2] = r = rev->guest_rpte;
562
unlock_hpte(hptep, hpte[0]);
563
preempt_enable();
564
565
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
566
hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
567
hpte[1] = hpte_new_to_old_r(hpte[1]);
568
}
569
if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
570
hpte[1] != vcpu->arch.pgfault_hpte[1])
571
return RESUME_GUEST;
572
573
/* Translate the logical address and get the page */
574
psize = kvmppc_actual_pgsz(hpte[0], r);
575
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
576
gfn_base = gpa_base >> PAGE_SHIFT;
577
gpa = gpa_base | (ea & (psize - 1));
578
gfn = gpa >> PAGE_SHIFT;
579
memslot = gfn_to_memslot(kvm, gfn);
580
581
trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);
582
583
/* No memslot means it's an emulated MMIO region */
584
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
585
return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
586
dsisr & DSISR_ISSTORE);
587
588
/*
589
* This should never happen, because of the slot_is_aligned()
590
* check in kvmppc_do_h_enter().
591
*/
592
if (gfn_base < memslot->base_gfn)
593
return -EFAULT;
594
595
/* used to check for invalidations in progress */
596
mmu_seq = kvm->mmu_invalidate_seq;
597
smp_rmb();
598
599
ret = -EFAULT;
600
page = NULL;
601
writing = (dsisr & DSISR_ISSTORE) != 0;
602
/* If writing != 0, then the HPTE must allow writing for us to get here */
603
write_ok = writing;
604
hva = gfn_to_hva_memslot(memslot, gfn);
605
606
pfn = __kvm_faultin_pfn(memslot, gfn, writing ? FOLL_WRITE : 0,
607
&write_ok, &page);
608
if (is_error_noslot_pfn(pfn))
609
return -EFAULT;
610
611
/*
612
* Read the PTE from the process' radix tree and use that
613
* so we get the shift and attribute bits.
614
*/
615
spin_lock(&kvm->mmu_lock);
616
ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
617
pte = __pte(0);
618
if (ptep)
619
pte = READ_ONCE(*ptep);
620
spin_unlock(&kvm->mmu_lock);
621
/*
622
* If the PTE disappeared temporarily due to a THP
623
* collapse, just return and let the guest try again.
624
*/
625
if (!pte_present(pte)) {
626
if (page)
627
put_page(page);
628
return RESUME_GUEST;
629
}
630
hpa = pte_pfn(pte) << PAGE_SHIFT;
631
pte_size = PAGE_SIZE;
632
if (shift)
633
pte_size = 1ul << shift;
634
is_ci = pte_ci(pte);
635
636
if (psize > pte_size)
637
goto out_put;
638
if (pte_size > psize)
639
hpa |= hva & (pte_size - psize);
640
641
/* Check WIMG vs. the actual page we're accessing */
642
if (!hpte_cache_flags_ok(r, is_ci)) {
643
if (is_ci)
644
goto out_put;
645
/*
646
* Allow guest to map emulated device memory as
647
* uncacheable, but actually make it cacheable.
648
*/
649
r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
650
}
651
652
/*
653
* Set the HPTE to point to hpa.
654
* Since the hpa is at PAGE_SIZE granularity, make sure we
655
* don't mask out lower-order bits if psize < PAGE_SIZE.
656
*/
657
if (psize < PAGE_SIZE)
658
psize = PAGE_SIZE;
659
r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa;
660
if (hpte_is_writable(r) && !write_ok)
661
r = hpte_make_readonly(r);
662
ret = RESUME_GUEST;
663
preempt_disable();
664
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
665
cpu_relax();
666
hnow_v = be64_to_cpu(hptep[0]);
667
hnow_r = be64_to_cpu(hptep[1]);
668
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
669
hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
670
hnow_r = hpte_new_to_old_r(hnow_r);
671
}
672
673
/*
674
* If the HPT is being resized, don't update the HPTE,
675
* instead let the guest retry after the resize operation is complete.
676
* The synchronization for mmu_ready test vs. set is provided
677
* by the HPTE lock.
678
*/
679
if (!kvm->arch.mmu_ready)
680
goto out_unlock;
681
682
if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
683
rev->guest_rpte != hpte[2])
684
/* HPTE has been changed under us; let the guest retry */
685
goto out_unlock;
686
hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
687
688
/* Always put the HPTE in the rmap chain for the page base address */
689
rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
690
lock_rmap(rmap);
691
692
/* Check if we might have been invalidated; let the guest retry if so */
693
ret = RESUME_GUEST;
694
if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) {
695
unlock_rmap(rmap);
696
goto out_unlock;
697
}
698
699
/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
700
rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
701
r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
702
703
if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
704
/* HPTE was previously valid, so we need to invalidate it */
705
unlock_rmap(rmap);
706
hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
707
kvmppc_invalidate_hpte(kvm, hptep, index);
708
/* don't lose previous R and C bits */
709
r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
710
} else {
711
kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
712
}
713
714
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
715
r = hpte_old_to_new_r(hpte[0], r);
716
hpte[0] = hpte_old_to_new_v(hpte[0]);
717
}
718
hptep[1] = cpu_to_be64(r);
719
eieio();
720
__unlock_hpte(hptep, hpte[0]);
721
asm volatile("ptesync" : : : "memory");
722
preempt_enable();
723
if (page && hpte_is_writable(r))
724
set_page_dirty_lock(page);
725
726
out_put:
727
trace_kvm_page_fault_exit(vcpu, hpte, ret);
728
729
if (page)
730
put_page(page);
731
return ret;
732
733
out_unlock:
734
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
735
preempt_enable();
736
goto out_put;
737
}
738
739
void kvmppc_rmap_reset(struct kvm *kvm)
740
{
741
struct kvm_memslots *slots;
742
struct kvm_memory_slot *memslot;
743
int srcu_idx, bkt;
744
745
srcu_idx = srcu_read_lock(&kvm->srcu);
746
slots = kvm_memslots(kvm);
747
kvm_for_each_memslot(memslot, bkt, slots) {
748
/* Mutual exclusion with kvm_unmap_hva_range etc. */
749
spin_lock(&kvm->mmu_lock);
750
/*
751
* This assumes it is acceptable to lose reference and
752
* change bits across a reset.
753
*/
754
memset(memslot->arch.rmap, 0,
755
memslot->npages * sizeof(*memslot->arch.rmap));
756
spin_unlock(&kvm->mmu_lock);
757
}
758
srcu_read_unlock(&kvm->srcu, srcu_idx);
759
}
760
761
/* Must be called with both HPTE and rmap locked */
762
static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
763
struct kvm_memory_slot *memslot,
764
unsigned long *rmapp, unsigned long gfn)
765
{
766
__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
767
struct revmap_entry *rev = kvm->arch.hpt.rev;
768
unsigned long j, h;
769
unsigned long ptel, psize, rcbits;
770
771
j = rev[i].forw;
772
if (j == i) {
773
/* chain is now empty */
774
*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
775
} else {
776
/* remove i from chain */
777
h = rev[i].back;
778
rev[h].forw = j;
779
rev[j].back = h;
780
rev[i].forw = rev[i].back = i;
781
*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
782
}
783
784
/* Now check and modify the HPTE */
785
ptel = rev[i].guest_rpte;
786
psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
787
if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
788
hpte_rpn(ptel, psize) == gfn) {
789
hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
790
kvmppc_invalidate_hpte(kvm, hptep, i);
791
hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
792
/* Harvest R and C */
793
rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
794
*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
795
if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
796
kvmppc_update_dirty_map(memslot, gfn, psize);
797
if (rcbits & ~rev[i].guest_rpte) {
798
rev[i].guest_rpte = ptel | rcbits;
799
note_hpte_modification(kvm, &rev[i]);
800
}
801
}
802
}
803
804
static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
805
unsigned long gfn)
806
{
807
unsigned long i;
808
__be64 *hptep;
809
unsigned long *rmapp;
810
811
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
812
for (;;) {
813
lock_rmap(rmapp);
814
if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
815
unlock_rmap(rmapp);
816
break;
817
}
818
819
/*
820
* To avoid an ABBA deadlock with the HPTE lock bit,
821
* we can't spin on the HPTE lock while holding the
822
* rmap chain lock.
823
*/
824
i = *rmapp & KVMPPC_RMAP_INDEX;
825
hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
826
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
827
/* unlock rmap before spinning on the HPTE lock */
828
unlock_rmap(rmapp);
829
while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
830
cpu_relax();
831
continue;
832
}
833
834
kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
835
unlock_rmap(rmapp);
836
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
837
}
838
}
839
840
bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
841
{
842
gfn_t gfn;
843
844
if (kvm_is_radix(kvm)) {
845
for (gfn = range->start; gfn < range->end; gfn++)
846
kvm_unmap_radix(kvm, range->slot, gfn);
847
} else {
848
for (gfn = range->start; gfn < range->end; gfn++)
849
kvm_unmap_rmapp(kvm, range->slot, gfn);
850
}
851
852
return false;
853
}
854
855
void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
856
struct kvm_memory_slot *memslot)
857
{
858
unsigned long gfn;
859
unsigned long n;
860
unsigned long *rmapp;
861
862
gfn = memslot->base_gfn;
863
rmapp = memslot->arch.rmap;
864
if (kvm_is_radix(kvm)) {
865
kvmppc_radix_flush_memslot(kvm, memslot);
866
return;
867
}
868
869
for (n = memslot->npages; n; --n, ++gfn) {
870
/*
871
* Testing the present bit without locking is OK because
872
* the memslot has been marked invalid already, and hence
873
* no new HPTEs referencing this page can be created,
874
* thus the present bit can't go from 0 to 1.
875
*/
876
if (*rmapp & KVMPPC_RMAP_PRESENT)
877
kvm_unmap_rmapp(kvm, memslot, gfn);
878
++rmapp;
879
}
880
}
881
882
static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
883
unsigned long gfn)
884
{
885
struct revmap_entry *rev = kvm->arch.hpt.rev;
886
unsigned long head, i, j;
887
__be64 *hptep;
888
bool ret = false;
889
unsigned long *rmapp;
890
891
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
892
retry:
893
lock_rmap(rmapp);
894
if (*rmapp & KVMPPC_RMAP_REFERENCED) {
895
*rmapp &= ~KVMPPC_RMAP_REFERENCED;
896
ret = true;
897
}
898
if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
899
unlock_rmap(rmapp);
900
return ret;
901
}
902
903
i = head = *rmapp & KVMPPC_RMAP_INDEX;
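/*
 * Walk the circular chain of HPTEs mapping this page, starting at
 * the head index stored in the rmap entry.
 */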
904
do {
905
hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
906
j = rev[i].forw;
907
908
/* If this HPTE isn't referenced, ignore it */
909
if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
910
continue;
911
912
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
913
/* unlock rmap before spinning on the HPTE lock */
914
unlock_rmap(rmapp);
915
while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
916
cpu_relax();
917
goto retry;
918
}
919
920
/* Now check and modify the HPTE */
921
if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
922
(be64_to_cpu(hptep[1]) & HPTE_R_R)) {
923
kvmppc_clear_ref_hpte(kvm, hptep, i);
924
if (!(rev[i].guest_rpte & HPTE_R_R)) {
925
rev[i].guest_rpte |= HPTE_R_R;
926
note_hpte_modification(kvm, &rev[i]);
927
}
928
ret = true;
929
}
930
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
931
} while ((i = j) != head);
932
933
unlock_rmap(rmapp);
934
return ret;
935
}
936
937
bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
938
{
939
gfn_t gfn;
940
bool ret = false;
941
942
if (kvm_is_radix(kvm)) {
943
for (gfn = range->start; gfn < range->end; gfn++)
944
ret |= kvm_age_radix(kvm, range->slot, gfn);
945
} else {
946
for (gfn = range->start; gfn < range->end; gfn++)
947
ret |= kvm_age_rmapp(kvm, range->slot, gfn);
948
}
949
950
return ret;
951
}
952
953
static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
954
unsigned long gfn)
955
{
956
struct revmap_entry *rev = kvm->arch.hpt.rev;
957
unsigned long head, i, j;
958
unsigned long *hp;
959
bool ret = true;
960
unsigned long *rmapp;
961
962
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
963
if (*rmapp & KVMPPC_RMAP_REFERENCED)
964
return true;
965
966
lock_rmap(rmapp);
967
if (*rmapp & KVMPPC_RMAP_REFERENCED)
968
goto out;
969
970
if (*rmapp & KVMPPC_RMAP_PRESENT) {
971
i = head = *rmapp & KVMPPC_RMAP_INDEX;
972
do {
973
hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
974
j = rev[i].forw;
975
if (be64_to_cpu(hp[1]) & HPTE_R_R)
976
goto out;
977
} while ((i = j) != head);
978
}
979
ret = false;
980
981
out:
982
unlock_rmap(rmapp);
983
return ret;
984
}
985
986
bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
987
{
988
WARN_ON(range->start + 1 != range->end);
989
990
if (kvm_is_radix(kvm))
991
return kvm_test_age_radix(kvm, range->slot, range->start);
992
else
993
return kvm_test_age_rmapp(kvm, range->slot, range->start);
994
}
995
996
static int vcpus_running(struct kvm *kvm)
997
{
998
return atomic_read(&kvm->arch.vcpus_running) != 0;
999
}
1000
1001
/*
1002
* Returns the number of system pages that are dirty.
1003
* This can be more than 1 if we find a huge-page HPTE.
1004
*/
1005
static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
1006
{
1007
struct revmap_entry *rev = kvm->arch.hpt.rev;
1008
unsigned long head, i, j;
1009
unsigned long n;
1010
unsigned long v, r;
1011
__be64 *hptep;
1012
int npages_dirty = 0;
1013
1014
retry:
1015
lock_rmap(rmapp);
1016
if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
1017
unlock_rmap(rmapp);
1018
return npages_dirty;
1019
}
1020
1021
i = head = *rmapp & KVMPPC_RMAP_INDEX;
1022
do {
1023
unsigned long hptep1;
1024
hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
1025
j = rev[i].forw;
1026
1027
/*
1028
* Checking the C (changed) bit here is racy since there
1029
* is no guarantee about when the hardware writes it back.
1030
* If the HPTE is not writable then it is stable since the
1031
* page can't be written to, and we would have done a tlbie
1032
* (which forces the hardware to complete any writeback)
1033
* when making the HPTE read-only.
1034
* If vcpus are running then this call is racy anyway
1035
* since the page could get dirtied subsequently, so we
1036
* expect there to be a further call which would pick up
1037
* any delayed C bit writeback.
1038
* Otherwise we need to do the tlbie even if C==0 in
1039
* order to pick up any delayed writeback of C.
1040
*/
1041
hptep1 = be64_to_cpu(hptep[1]);
1042
if (!(hptep1 & HPTE_R_C) &&
1043
(!hpte_is_writable(hptep1) || vcpus_running(kvm)))
1044
continue;
1045
1046
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
1047
/* unlock rmap before spinning on the HPTE lock */
1048
unlock_rmap(rmapp);
1049
while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
1050
cpu_relax();
1051
goto retry;
1052
}
1053
1054
/* Now check and modify the HPTE */
1055
if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
1056
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
1057
continue;
1058
}
1059
1060
/* need to make it temporarily absent so C is stable */
1061
hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
1062
kvmppc_invalidate_hpte(kvm, hptep, i);
1063
v = be64_to_cpu(hptep[0]);
1064
r = be64_to_cpu(hptep[1]);
1065
if (r & HPTE_R_C) {
1066
hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
1067
if (!(rev[i].guest_rpte & HPTE_R_C)) {
1068
rev[i].guest_rpte |= HPTE_R_C;
1069
note_hpte_modification(kvm, &rev[i]);
1070
}
1071
n = kvmppc_actual_pgsz(v, r);
1072
n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
1073
if (n > npages_dirty)
1074
npages_dirty = n;
1075
eieio();
1076
}
1077
v &= ~HPTE_V_ABSENT;
1078
v |= HPTE_V_VALID;
1079
__unlock_hpte(hptep, v);
1080
} while ((i = j) != head);
1081
1082
unlock_rmap(rmapp);
1083
return npages_dirty;
1084
}
1085
1086
void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
1087
struct kvm_memory_slot *memslot,
1088
unsigned long *map)
1089
{
1090
unsigned long gfn;
1091
1092
if (!vpa->dirty || !vpa->pinned_addr)
1093
return;
1094
gfn = vpa->gpa >> PAGE_SHIFT;
1095
if (gfn < memslot->base_gfn ||
1096
gfn >= memslot->base_gfn + memslot->npages)
1097
return;
1098
1099
vpa->dirty = false;
1100
if (map)
1101
__set_bit_le(gfn - memslot->base_gfn, map);
1102
}
1103
1104
long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
1105
struct kvm_memory_slot *memslot, unsigned long *map)
1106
{
1107
unsigned long i;
1108
unsigned long *rmapp;
1109
1110
preempt_disable();
1111
rmapp = memslot->arch.rmap;
1112
for (i = 0; i < memslot->npages; ++i) {
1113
int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
1114
/*
1115
* Note that if npages > 0 then i must be a multiple of npages,
1116
* since we always put huge-page HPTEs in the rmap chain
1117
* corresponding to their page base address.
1118
*/
1119
if (npages)
1120
set_dirty_bits(map, i, npages);
1121
++rmapp;
1122
}
1123
preempt_enable();
1124
return 0;
1125
}
1126
1127
void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1128
unsigned long *nb_ret)
1129
{
1130
struct kvm_memory_slot *memslot;
1131
unsigned long gfn = gpa >> PAGE_SHIFT;
1132
struct page *page, *pages[1];
1133
int npages;
1134
unsigned long hva, offset;
1135
int srcu_idx;
1136
1137
srcu_idx = srcu_read_lock(&kvm->srcu);
1138
memslot = gfn_to_memslot(kvm, gfn);
1139
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1140
goto err;
1141
hva = gfn_to_hva_memslot(memslot, gfn);
1142
npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
1143
if (npages < 1)
1144
goto err;
1145
page = pages[0];
1146
srcu_read_unlock(&kvm->srcu, srcu_idx);
1147
1148
offset = gpa & (PAGE_SIZE - 1);
1149
if (nb_ret)
1150
*nb_ret = PAGE_SIZE - offset;
1151
return page_address(page) + offset;
1152
1153
err:
1154
srcu_read_unlock(&kvm->srcu, srcu_idx);
1155
return NULL;
1156
}
1157
1158
void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
1159
bool dirty)
1160
{
1161
struct page *page = virt_to_page(va);
1162
struct kvm_memory_slot *memslot;
1163
unsigned long gfn;
1164
int srcu_idx;
1165
1166
put_page(page);
1167
1168
if (!dirty)
1169
return;
1170
1171
/* We need to mark this page dirty in the memslot dirty_bitmap, if any */
1172
gfn = gpa >> PAGE_SHIFT;
1173
srcu_idx = srcu_read_lock(&kvm->srcu);
1174
memslot = gfn_to_memslot(kvm, gfn);
1175
if (memslot && memslot->dirty_bitmap)
1176
set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
1177
srcu_read_unlock(&kvm->srcu, srcu_idx);
1178
}
1179
1180
/*
1181
* HPT resizing
1182
*/
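/*
 * Overview: the prepare ioctl (kvm_vm_ioctl_resize_hpt_prepare below)
 * schedules allocation of the new HPT on a workqueue and returns an
 * estimated completion time in ms while resize->error is still -EBUSY;
 * the commit ioctl then stops the vcpus, rehashes every live HPTE from
 * the old table into the new one and pivots the two tables.
 */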
1183
static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
1184
{
1185
int rc;
1186
1187
rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
1188
if (rc < 0)
1189
return rc;
1190
1191
resize_hpt_debug(resize, "%s(): HPT @ 0x%lx\n", __func__,
1192
resize->hpt.virt);
1193
1194
return 0;
1195
}
1196
1197
static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
1198
unsigned long idx)
1199
{
1200
struct kvm *kvm = resize->kvm;
1201
struct kvm_hpt_info *old = &kvm->arch.hpt;
1202
struct kvm_hpt_info *new = &resize->hpt;
1203
unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
1204
unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
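/* a group of 8 HPTEs is 128 = 2^7 bytes, so an order-N HPT has 2^(N - 7) PTEGs */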
1205
__be64 *hptep, *new_hptep;
1206
unsigned long vpte, rpte, guest_rpte;
1207
int ret;
1208
struct revmap_entry *rev;
1209
unsigned long apsize, avpn, pteg, hash;
1210
unsigned long new_idx, new_pteg, replace_vpte;
1211
int pshift;
1212
1213
hptep = (__be64 *)(old->virt + (idx << 4));
1214
1215
/* Guest is stopped, so new HPTEs can't be added or faulted
1216
* in, only unmapped or altered by host actions. So, it's
1217
* safe to check this before we take the HPTE lock */
1218
vpte = be64_to_cpu(hptep[0]);
1219
if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
1220
return 0; /* nothing to do */
1221
1222
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
1223
cpu_relax();
1224
1225
vpte = be64_to_cpu(hptep[0]);
1226
1227
ret = 0;
1228
if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
1229
/* Nothing to do */
1230
goto out;
1231
1232
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1233
rpte = be64_to_cpu(hptep[1]);
1234
vpte = hpte_new_to_old_v(vpte, rpte);
1235
}
1236
1237
/* Unmap */
1238
rev = &old->rev[idx];
1239
guest_rpte = rev->guest_rpte;
1240
1241
ret = -EIO;
1242
apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
1243
if (!apsize)
1244
goto out;
1245
1246
if (vpte & HPTE_V_VALID) {
1247
unsigned long gfn = hpte_rpn(guest_rpte, apsize);
1248
int srcu_idx = srcu_read_lock(&kvm->srcu);
1249
struct kvm_memory_slot *memslot =
1250
__gfn_to_memslot(kvm_memslots(kvm), gfn);
1251
1252
if (memslot) {
1253
unsigned long *rmapp;
1254
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1255
1256
lock_rmap(rmapp);
1257
kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
1258
unlock_rmap(rmapp);
1259
}
1260
1261
srcu_read_unlock(&kvm->srcu, srcu_idx);
1262
}
1263
1264
/* Reload PTE after unmap */
1265
vpte = be64_to_cpu(hptep[0]);
1266
BUG_ON(vpte & HPTE_V_VALID);
1267
BUG_ON(!(vpte & HPTE_V_ABSENT));
1268
1269
ret = 0;
1270
if (!(vpte & HPTE_V_BOLTED))
1271
goto out;
1272
1273
rpte = be64_to_cpu(hptep[1]);
1274
1275
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1276
vpte = hpte_new_to_old_v(vpte, rpte);
1277
rpte = hpte_new_to_old_r(rpte);
1278
}
1279
1280
pshift = kvmppc_hpte_base_page_shift(vpte, rpte);
1281
avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23);
1282
pteg = idx / HPTES_PER_GROUP;
1283
if (vpte & HPTE_V_SECONDARY)
1284
pteg = ~pteg;
1285
1286
if (!(vpte & HPTE_V_1TB_SEG)) {
1287
unsigned long offset, vsid;
1288
1289
/* We only have 28 - 23 bits of offset in avpn */
1290
offset = (avpn & 0x1f) << 23;
1291
vsid = avpn >> 5;
1292
/* We can find more bits from the pteg value */
1293
if (pshift < 23)
1294
offset |= ((vsid ^ pteg) & old_hash_mask) << pshift;
1295
1296
hash = vsid ^ (offset >> pshift);
1297
} else {
1298
unsigned long offset, vsid;
1299
1300
/* We only have 40 - 23 bits of seg_off in avpn */
1301
offset = (avpn & 0x1ffff) << 23;
1302
vsid = avpn >> 17;
1303
if (pshift < 23)
1304
offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift;
1305
1306
hash = vsid ^ (vsid << 25) ^ (offset >> pshift);
1307
}
1308
1309
new_pteg = hash & new_hash_mask;
1310
if (vpte & HPTE_V_SECONDARY)
1311
new_pteg = ~hash & new_hash_mask;
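/* the secondary PTEG is at the ones-complement of the primary hash */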
1312
1313
new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
1314
new_hptep = (__be64 *)(new->virt + (new_idx << 4));
1315
1316
replace_vpte = be64_to_cpu(new_hptep[0]);
1317
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1318
unsigned long replace_rpte = be64_to_cpu(new_hptep[1]);
1319
replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte);
1320
}
1321
1322
if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1323
BUG_ON(new->order >= old->order);
1324
1325
if (replace_vpte & HPTE_V_BOLTED) {
1326
if (vpte & HPTE_V_BOLTED)
1327
/* Bolted collision, nothing we can do */
1328
ret = -ENOSPC;
1329
/* Discard the new HPTE */
1330
goto out;
1331
}
1332
1333
/* Discard the previous HPTE */
1334
}
1335
1336
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1337
rpte = hpte_old_to_new_r(vpte, rpte);
1338
vpte = hpte_old_to_new_v(vpte);
1339
}
1340
1341
new_hptep[1] = cpu_to_be64(rpte);
1342
new->rev[new_idx].guest_rpte = guest_rpte;
1343
/* No need for a barrier, since new HPT isn't active */
1344
new_hptep[0] = cpu_to_be64(vpte);
1345
unlock_hpte(new_hptep, vpte);
1346
1347
out:
1348
unlock_hpte(hptep, vpte);
1349
return ret;
1350
}
1351
1352
static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
1353
{
1354
struct kvm *kvm = resize->kvm;
1355
unsigned long i;
1356
int rc;
1357
1358
for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
1359
rc = resize_hpt_rehash_hpte(resize, i);
1360
if (rc != 0)
1361
return rc;
1362
}
1363
1364
return 0;
1365
}
1366
1367
static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
1368
{
1369
struct kvm *kvm = resize->kvm;
1370
struct kvm_hpt_info hpt_tmp;
1371
1372
/* Exchange the pending tables in the resize structure with
1373
* the active tables */
1374
1375
resize_hpt_debug(resize, "resize_hpt_pivot()\n");
1376
1377
spin_lock(&kvm->mmu_lock);
1378
asm volatile("ptesync" : : : "memory");
1379
1380
hpt_tmp = kvm->arch.hpt;
1381
kvmppc_set_hpt(kvm, &resize->hpt);
1382
resize->hpt = hpt_tmp;
1383
1384
spin_unlock(&kvm->mmu_lock);
1385
1386
synchronize_srcu_expedited(&kvm->srcu);
1387
1388
if (cpu_has_feature(CPU_FTR_ARCH_300))
1389
kvmppc_setup_partition_table(kvm);
1390
1391
resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
1392
}
1393
1394
static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
1395
{
1396
if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock)))
1397
return;
1398
1399
if (!resize)
1400
return;
1401
1402
if (resize->error != -EBUSY) {
1403
if (resize->hpt.virt)
1404
kvmppc_free_hpt(&resize->hpt);
1405
kfree(resize);
1406
}
1407
1408
if (kvm->arch.resize_hpt == resize)
1409
kvm->arch.resize_hpt = NULL;
1410
}
1411
1412
static void resize_hpt_prepare_work(struct work_struct *work)
1413
{
1414
struct kvm_resize_hpt *resize = container_of(work,
1415
struct kvm_resize_hpt,
1416
work);
1417
struct kvm *kvm = resize->kvm;
1418
int err = 0;
1419
1420
if (WARN_ON(resize->error != -EBUSY))
1421
return;
1422
1423
mutex_lock(&kvm->arch.mmu_setup_lock);
1424
1425
/* Request is still current? */
1426
if (kvm->arch.resize_hpt == resize) {
1427
/* We may request large allocations here:
1428
* don't sleep for a long time while holding kvm->arch.mmu_setup_lock.
1429
*/
1430
mutex_unlock(&kvm->arch.mmu_setup_lock);
1431
1432
resize_hpt_debug(resize, "%s(): order = %d\n", __func__,
1433
resize->order);
1434
1435
err = resize_hpt_allocate(resize);
1436
1437
/* We have a strict assumption about -EBUSY
1438
* when preparing for HPT resize.
1439
*/
1440
if (WARN_ON(err == -EBUSY))
1441
err = -EINPROGRESS;
1442
1443
mutex_lock(&kvm->arch.mmu_setup_lock);
1444
/* It is possible that kvm->arch.resize_hpt != resize
1445
* after we grab kvm->arch.mmu_setup_lock again.
1446
*/
1447
}
1448
1449
resize->error = err;
1450
1451
if (kvm->arch.resize_hpt != resize)
1452
resize_hpt_release(kvm, resize);
1453
1454
mutex_unlock(&kvm->arch.mmu_setup_lock);
1455
}
1456
1457
int kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
1458
struct kvm_ppc_resize_hpt *rhpt)
1459
{
1460
unsigned long flags = rhpt->flags;
1461
unsigned long shift = rhpt->shift;
1462
struct kvm_resize_hpt *resize;
1463
int ret;
1464
1465
if (flags != 0 || kvm_is_radix(kvm))
1466
return -EINVAL;
1467
1468
if (shift && ((shift < 18) || (shift > 46)))
1469
return -EINVAL;
1470
1471
mutex_lock(&kvm->arch.mmu_setup_lock);
1472
1473
resize = kvm->arch.resize_hpt;
1474
1475
if (resize) {
1476
if (resize->order == shift) {
1477
/* Suitable resize in progress? */
1478
ret = resize->error;
1479
if (ret == -EBUSY)
1480
ret = 100; /* estimated time in ms */
1481
else if (ret)
1482
resize_hpt_release(kvm, resize);
1483
1484
goto out;
1485
}
1486
1487
/* not suitable, cancel it */
1488
resize_hpt_release(kvm, resize);
1489
}
1490
1491
ret = 0;
1492
if (!shift)
1493
goto out; /* nothing to do */
1494
1495
/* start new resize */
1496
1497
resize = kzalloc(sizeof(*resize), GFP_KERNEL);
1498
if (!resize) {
1499
ret = -ENOMEM;
1500
goto out;
1501
}
1502
1503
resize->error = -EBUSY;
1504
resize->order = shift;
1505
resize->kvm = kvm;
1506
INIT_WORK(&resize->work, resize_hpt_prepare_work);
1507
kvm->arch.resize_hpt = resize;
1508
1509
schedule_work(&resize->work);
1510
1511
ret = 100; /* estimated time in ms */
1512
1513
out:
1514
mutex_unlock(&kvm->arch.mmu_setup_lock);
1515
return ret;
1516
}
1517
1518
static void resize_hpt_boot_vcpu(void *opaque)
1519
{
1520
/* Nothing to do, just force a KVM exit */
1521
}
1522
1523
int kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
1524
struct kvm_ppc_resize_hpt *rhpt)
1525
{
1526
unsigned long flags = rhpt->flags;
1527
unsigned long shift = rhpt->shift;
1528
struct kvm_resize_hpt *resize;
1529
int ret;
1530
1531
if (flags != 0 || kvm_is_radix(kvm))
1532
return -EINVAL;
1533
1534
if (shift && ((shift < 18) || (shift > 46)))
1535
return -EINVAL;
1536
1537
mutex_lock(&kvm->arch.mmu_setup_lock);
1538
1539
resize = kvm->arch.resize_hpt;
1540
1541
/* This shouldn't be possible */
1542
ret = -EIO;
1543
if (WARN_ON(!kvm->arch.mmu_ready))
1544
goto out_no_hpt;
1545
1546
/* Stop VCPUs from running while we mess with the HPT */
1547
kvm->arch.mmu_ready = 0;
1548
smp_mb();
1549
1550
/* Boot all CPUs out of the guest so they re-read
1551
* mmu_ready */
1552
on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
1553
1554
ret = -ENXIO;
1555
if (!resize || (resize->order != shift))
1556
goto out;
1557
1558
ret = resize->error;
1559
if (ret)
1560
goto out;
1561
1562
ret = resize_hpt_rehash(resize);
1563
if (ret)
1564
goto out;
1565
1566
resize_hpt_pivot(resize);
1567
1568
out:
1569
/* Let VCPUs run again */
1570
kvm->arch.mmu_ready = 1;
1571
smp_mb();
1572
out_no_hpt:
1573
resize_hpt_release(kvm, resize);
1574
mutex_unlock(&kvm->arch.mmu_setup_lock);
1575
return ret;
1576
}
1577
1578
/*
1579
* Functions for reading and writing the hash table via reads and
1580
* writes on a file descriptor.
1581
*
1582
* Reads return the guest view of the hash table, which has to be
1583
* pieced together from the real hash table and the guest_rpte
1584
* values in the revmap array.
1585
*
1586
* On writes, each HPTE written is considered in turn, and if it
1587
* is valid, it is written to the HPT as if an H_ENTER with the
1588
* exact flag set was done. When the invalid count is non-zero
1589
* in the header written to the stream, the kernel will make
1590
* sure that that many HPTEs are invalid, and invalidate them
1591
* if not.
1592
*/
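/*
 * Stream layout: each chunk is a struct kvm_get_htab_header (starting
 * HPTE index plus n_valid and n_invalid counts) followed by n_valid
 * 16-byte HPTE images (HPTE_SIZE below).
 */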
1593
1594
struct kvm_htab_ctx {
1595
unsigned long index;
1596
unsigned long flags;
1597
struct kvm *kvm;
1598
int first_pass;
1599
};
1600
1601
#define HPTE_SIZE (2 * sizeof(unsigned long))
1602
1603
/*
1604
* Returns 1 if this HPT entry has been modified or has pending
1605
* R/C bit changes.
1606
*/
1607
static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
1608
{
1609
unsigned long rcbits_unset;
1610
1611
if (revp->guest_rpte & HPTE_GR_MODIFIED)
1612
return 1;
1613
1614
/* Also need to consider changes in reference and changed bits */
1615
rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1616
if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
1617
(be64_to_cpu(hptp[1]) & rcbits_unset))
1618
return 1;
1619
1620
return 0;
1621
}
1622
1623
static long record_hpte(unsigned long flags, __be64 *hptp,
1624
unsigned long *hpte, struct revmap_entry *revp,
1625
int want_valid, int first_pass)
1626
{
1627
unsigned long v, r, hr;
1628
unsigned long rcbits_unset;
1629
int ok = 1;
1630
int valid, dirty;
1631
1632
/* Unmodified entries are uninteresting except on the first pass */
1633
dirty = hpte_dirty(revp, hptp);
1634
if (!first_pass && !dirty)
1635
return 0;
1636
1637
valid = 0;
1638
if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1639
valid = 1;
1640
if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
1641
!(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
1642
valid = 0;
1643
}
1644
if (valid != want_valid)
1645
return 0;
1646
1647
v = r = 0;
1648
if (valid || dirty) {
1649
/* lock the HPTE so it's stable and read it */
1650
preempt_disable();
1651
while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1652
cpu_relax();
1653
v = be64_to_cpu(hptp[0]);
1654
hr = be64_to_cpu(hptp[1]);
1655
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1656
v = hpte_new_to_old_v(v, hr);
1657
hr = hpte_new_to_old_r(hr);
1658
}
1659
1660
/* re-evaluate valid and dirty from synchronized HPTE value */
1661
valid = !!(v & HPTE_V_VALID);
1662
dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1663
1664
/* Harvest R and C into guest view if necessary */
1665
rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1666
if (valid && (rcbits_unset & hr)) {
1667
revp->guest_rpte |= (hr &
1668
(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
1669
dirty = 1;
1670
}
1671
1672
if (v & HPTE_V_ABSENT) {
1673
v &= ~HPTE_V_ABSENT;
1674
v |= HPTE_V_VALID;
1675
valid = 1;
1676
}
1677
if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
1678
valid = 0;
1679
1680
r = revp->guest_rpte;
1681
/* only clear modified if this is the right sort of entry */
1682
if (valid == want_valid && dirty) {
1683
r &= ~HPTE_GR_MODIFIED;
1684
revp->guest_rpte = r;
1685
}
1686
unlock_hpte(hptp, be64_to_cpu(hptp[0]));
1687
preempt_enable();
1688
if (!(valid == want_valid && (first_pass || dirty)))
1689
ok = 0;
1690
}
1691
hpte[0] = cpu_to_be64(v);
1692
hpte[1] = cpu_to_be64(r);
1693
return ok;
1694
}
1695
1696
static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1697
size_t count, loff_t *ppos)
1698
{
1699
struct kvm_htab_ctx *ctx = file->private_data;
1700
struct kvm *kvm = ctx->kvm;
1701
struct kvm_get_htab_header hdr;
1702
__be64 *hptp;
1703
struct revmap_entry *revp;
1704
unsigned long i, nb, nw;
1705
unsigned long __user *lbuf;
1706
struct kvm_get_htab_header __user *hptr;
1707
unsigned long flags;
1708
int first_pass;
1709
unsigned long hpte[2];
1710
1711
if (!access_ok(buf, count))
1712
return -EFAULT;
1713
if (kvm_is_radix(kvm))
1714
return 0;
1715
1716
first_pass = ctx->first_pass;
1717
flags = ctx->flags;
1718
1719
i = ctx->index;
1720
hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
1721
revp = kvm->arch.hpt.rev + i;
1722
lbuf = (unsigned long __user *)buf;
1723
1724
nb = 0;
1725
while (nb + sizeof(hdr) + HPTE_SIZE < count) {
1726
/* Initialize header */
1727
hptr = (struct kvm_get_htab_header __user *)buf;
1728
hdr.n_valid = 0;
1729
hdr.n_invalid = 0;
1730
nw = nb;
1731
nb += sizeof(hdr);
1732
lbuf = (unsigned long __user *)(buf + sizeof(hdr));
1733
1734
/* Skip uninteresting entries, i.e. clean entries on passes after the first */
1735
if (!first_pass) {
1736
while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1737
!hpte_dirty(revp, hptp)) {
1738
++i;
1739
hptp += 2;
1740
++revp;
1741
}
1742
}
1743
hdr.index = i;
1744
1745
/* Grab a series of valid entries */
1746
while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1747
hdr.n_valid < 0xffff &&
1748
nb + HPTE_SIZE < count &&
1749
record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
1750
/* valid entry, write it out */
1751
++hdr.n_valid;
1752
if (__put_user(hpte[0], lbuf) ||
1753
__put_user(hpte[1], lbuf + 1))
1754
return -EFAULT;
1755
nb += HPTE_SIZE;
1756
lbuf += 2;
1757
++i;
1758
hptp += 2;
1759
++revp;
1760
}
1761
/* Now skip invalid entries while we can */
1762
while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1763
hdr.n_invalid < 0xffff &&
1764
record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
1765
/* found an invalid entry */
1766
++hdr.n_invalid;
1767
++i;
1768
hptp += 2;
1769
++revp;
1770
}
1771
1772
if (hdr.n_valid || hdr.n_invalid) {
1773
/* write back the header */
1774
if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
1775
return -EFAULT;
1776
nw = nb;
1777
buf = (char __user *)lbuf;
1778
} else {
1779
nb = nw;
1780
}
1781
1782
/* Check if we've wrapped around the hash table */
1783
if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
1784
i = 0;
1785
ctx->first_pass = 0;
1786
break;
1787
}
1788
}
1789
1790
ctx->index = i;
1791
1792
return nb;
1793
}
1794
1795
static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1796
size_t count, loff_t *ppos)
1797
{
1798
struct kvm_htab_ctx *ctx = file->private_data;
1799
struct kvm *kvm = ctx->kvm;
1800
struct kvm_get_htab_header hdr;
1801
unsigned long i, j;
1802
unsigned long v, r;
1803
unsigned long __user *lbuf;
1804
__be64 *hptp;
1805
unsigned long tmp[2];
1806
ssize_t nb;
1807
long int err, ret;
1808
int mmu_ready;
1809
int pshift;
1810
1811
if (!access_ok(buf, count))
1812
return -EFAULT;
1813
if (kvm_is_radix(kvm))
1814
return -EINVAL;
1815
1816
/* lock out vcpus from running while we're doing this */
1817
mutex_lock(&kvm->arch.mmu_setup_lock);
1818
mmu_ready = kvm->arch.mmu_ready;
1819
if (mmu_ready) {
1820
kvm->arch.mmu_ready = 0; /* temporarily */
1821
/* order mmu_ready vs. vcpus_running */
1822
smp_mb();
1823
if (atomic_read(&kvm->arch.vcpus_running)) {
1824
kvm->arch.mmu_ready = 1;
1825
mutex_unlock(&kvm->arch.mmu_setup_lock);
1826
return -EBUSY;
1827
}
1828
}
1829
1830
err = 0;
1831
for (nb = 0; nb + sizeof(hdr) <= count; ) {
1832
err = -EFAULT;
1833
if (__copy_from_user(&hdr, buf, sizeof(hdr)))
1834
break;
1835
1836
err = 0;
1837
if (nb + hdr.n_valid * HPTE_SIZE > count)
1838
break;
1839
1840
nb += sizeof(hdr);
1841
buf += sizeof(hdr);
1842
1843
err = -EINVAL;
1844
i = hdr.index;
1845
if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
1846
i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
1847
break;
1848
1849
hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
1850
lbuf = (unsigned long __user *)buf;
1851
for (j = 0; j < hdr.n_valid; ++j) {
1852
__be64 hpte_v;
1853
__be64 hpte_r;
1854
1855
err = -EFAULT;
1856
if (__get_user(hpte_v, lbuf) ||
1857
__get_user(hpte_r, lbuf + 1))
1858
goto out;
1859
v = be64_to_cpu(hpte_v);
1860
r = be64_to_cpu(hpte_r);
1861
err = -EINVAL;
1862
if (!(v & HPTE_V_VALID))
1863
goto out;
1864
pshift = kvmppc_hpte_base_page_shift(v, r);
1865
if (pshift <= 0)
1866
goto out;
1867
lbuf += 2;
1868
nb += HPTE_SIZE;
1869
1870
if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
1871
kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1872
err = -EIO;
1873
ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
1874
tmp);
1875
if (ret != H_SUCCESS) {
1876
pr_err("%s ret %ld i=%ld v=%lx r=%lx\n", __func__, ret, i, v, r);
1877
goto out;
1878
}
1879
if (!mmu_ready && is_vrma_hpte(v)) {
1880
unsigned long senc, lpcr;
1881
1882
senc = slb_pgsize_encoding(1ul << pshift);
1883
kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
1884
(VRMA_VSID << SLB_VSID_SHIFT_1T);
1885
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
1886
lpcr = senc << (LPCR_VRMASD_SH - 4);
1887
kvmppc_update_lpcr(kvm, lpcr,
1888
LPCR_VRMASD);
1889
} else {
1890
kvmppc_setup_partition_table(kvm);
1891
}
1892
mmu_ready = 1;
1893
}
1894
++i;
1895
hptp += 2;
1896
}
1897
1898
for (j = 0; j < hdr.n_invalid; ++j) {
1899
if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
1900
kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1901
++i;
1902
hptp += 2;
1903
}
1904
err = 0;
1905
}
1906
1907
out:
1908
/* Order HPTE updates vs. mmu_ready */
1909
smp_wmb();
1910
kvm->arch.mmu_ready = mmu_ready;
1911
mutex_unlock(&kvm->arch.mmu_setup_lock);
1912
1913
if (err)
1914
return err;
1915
return nb;
1916
}
1917
1918
static int kvm_htab_release(struct inode *inode, struct file *filp)
1919
{
1920
struct kvm_htab_ctx *ctx = filp->private_data;
1921
1922
filp->private_data = NULL;
1923
if (!(ctx->flags & KVM_GET_HTAB_WRITE))
1924
atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
1925
kvm_put_kvm(ctx->kvm);
1926
kfree(ctx);
1927
return 0;
1928
}
1929
1930
static const struct file_operations kvm_htab_fops = {
1931
.read = kvm_htab_read,
1932
.write = kvm_htab_write,
1933
.llseek = default_llseek,
1934
.release = kvm_htab_release,
1935
};
1936
1937
int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1938
{
1939
int ret;
1940
struct kvm_htab_ctx *ctx;
1941
int rwflag;
1942
1943
/* reject flags we don't recognize */
1944
if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
1945
return -EINVAL;
1946
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1947
if (!ctx)
1948
return -ENOMEM;
1949
kvm_get_kvm(kvm);
1950
ctx->kvm = kvm;
1951
ctx->index = ghf->start_index;
1952
ctx->flags = ghf->flags;
1953
ctx->first_pass = 1;
1954
1955
rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1956
ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
1957
if (ret < 0) {
1958
kfree(ctx);
1959
kvm_put_kvm_no_destroy(kvm);
1960
return ret;
1961
}
1962
1963
if (rwflag == O_RDONLY) {
1964
mutex_lock(&kvm->slots_lock);
1965
atomic_inc(&kvm->arch.hpte_mod_interest);
1966
/* make sure kvmppc_do_h_enter etc. see the increment */
1967
synchronize_srcu_expedited(&kvm->srcu);
1968
mutex_unlock(&kvm->slots_lock);
1969
}
1970
1971
return ret;
1972
}
1973
1974
struct debugfs_htab_state {
1975
struct kvm *kvm;
1976
struct mutex mutex;
1977
unsigned long hpt_index;
1978
int chars_left;
1979
int buf_index;
1980
char buf[64];
1981
};
1982
1983
static int debugfs_htab_open(struct inode *inode, struct file *file)
1984
{
1985
struct kvm *kvm = inode->i_private;
1986
struct debugfs_htab_state *p;
1987
1988
p = kzalloc(sizeof(*p), GFP_KERNEL);
1989
if (!p)
1990
return -ENOMEM;
1991
1992
kvm_get_kvm(kvm);
1993
p->kvm = kvm;
1994
mutex_init(&p->mutex);
1995
file->private_data = p;
1996
1997
return nonseekable_open(inode, file);
1998
}
1999
2000
static int debugfs_htab_release(struct inode *inode, struct file *file)
2001
{
2002
struct debugfs_htab_state *p = file->private_data;
2003
2004
kvm_put_kvm(p->kvm);
2005
kfree(p);
2006
return 0;
2007
}
2008
2009
static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
2010
size_t len, loff_t *ppos)
2011
{
2012
struct debugfs_htab_state *p = file->private_data;
2013
ssize_t ret, r;
2014
unsigned long i, n;
2015
unsigned long v, hr, gr;
2016
struct kvm *kvm;
2017
__be64 *hptp;
2018
2019
kvm = p->kvm;
2020
if (kvm_is_radix(kvm))
2021
return 0;
2022
2023
ret = mutex_lock_interruptible(&p->mutex);
2024
if (ret)
2025
return ret;
2026
2027
if (p->chars_left) {
2028
n = p->chars_left;
2029
if (n > len)
2030
n = len;
2031
r = copy_to_user(buf, p->buf + p->buf_index, n);
2032
n -= r;
2033
p->chars_left -= n;
2034
p->buf_index += n;
2035
buf += n;
2036
len -= n;
2037
ret = n;
2038
if (r) {
2039
if (!n)
2040
ret = -EFAULT;
2041
goto out;
2042
}
2043
}
2044
2045
i = p->hpt_index;
2046
hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
2047
for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
2048
++i, hptp += 2) {
2049
if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
2050
continue;
2051
2052
/* lock the HPTE so it's stable and read it */
2053
preempt_disable();
2054
while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
2055
cpu_relax();
2056
v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
2057
hr = be64_to_cpu(hptp[1]);
2058
gr = kvm->arch.hpt.rev[i].guest_rpte;
2059
unlock_hpte(hptp, v);
2060
preempt_enable();
2061
2062
if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
2063
continue;
2064
2065
n = scnprintf(p->buf, sizeof(p->buf),
2066
"%6lx %.16lx %.16lx %.16lx\n",
2067
i, v, hr, gr);
2068
p->chars_left = n;
2069
if (n > len)
2070
n = len;
2071
r = copy_to_user(buf, p->buf, n);
2072
n -= r;
2073
p->chars_left -= n;
2074
p->buf_index = n;
2075
buf += n;
2076
len -= n;
2077
ret += n;
2078
if (r) {
2079
if (!ret)
2080
ret = -EFAULT;
2081
goto out;
2082
}
2083
}
2084
p->hpt_index = i;
2085
2086
out:
2087
mutex_unlock(&p->mutex);
2088
return ret;
2089
}
2090
2091
static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
2092
size_t len, loff_t *ppos)
2093
{
2094
return -EACCES;
2095
}
2096
2097
static const struct file_operations debugfs_htab_fops = {
2098
.owner = THIS_MODULE,
2099
.open = debugfs_htab_open,
2100
.release = debugfs_htab_release,
2101
.read = debugfs_htab_read,
2102
.write = debugfs_htab_write,
2103
.llseek = generic_file_llseek,
2104
};
2105
2106
void kvmppc_mmu_debugfs_init(struct kvm *kvm)
2107
{
2108
debugfs_create_file("htab", 0400, kvm->debugfs_dentry, kvm,
2109
&debugfs_htab_fops);
2110
}
2111
2112
void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
2113
{
2114
struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
2115
2116
vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */
2117
2118
mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
2119
2120
vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
2121
}
2122
2123