GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/kvm/book3s_hv.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <[email protected]>
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Authors:
 *    Paul Mackerras <[email protected]>
 *    Alexander Graf <[email protected]>
 *    Kevin Wolf <[email protected]>
 *
 * Description: KVM functions specific to running on Book 3S
 * processors in hypervisor mode (specifically POWER7 and later).
 *
 * This file is derived from arch/powerpc/kvm/book3s.c,
 * by Alexander Graf <[email protected]>.
 */

#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/sched/signal.h>
#include <linux/sched/stat.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <linux/srcu.h>
#include <linux/miscdevice.h>
#include <linux/debugfs.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/of.h>
#include <linux/irqdomain.h>
#include <linux/smp.h>

#include <asm/ftrace.h>
#include <asm/reg.h>
#include <asm/ppc-opcode.h>
#include <asm/asm-prototypes.h>
#include <asm/archrandom.h>
#include <asm/debug.h>
#include <asm/disassemble.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <linux/uaccess.h>
#include <asm/interrupt.h>
#include <asm/io.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
#include <asm/pmc.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
#include <asm/switch_to.h>
#include <asm/smp.h>
#include <asm/dbell.h>
#include <asm/hmi.h>
#include <asm/pnv-pci.h>
#include <asm/mmu.h>
#include <asm/opal.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/hw_breakpoint.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/ultravisor.h>
#include <asm/dtl.h>
#include <asm/plpar_wrappers.h>

#include <trace/events/ipi.h>

#include "book3s.h"
#include "book3s_hv.h"

#define CREATE_TRACE_POINTS
#include "trace_hv.h"

/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */

/* Used to indicate that a guest page fault needs to be handled */
#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
/* Used to indicate that a guest passthrough interrupt needs to be handled */
#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)

/* Used as a "null" value for timebase values */
#define TB_NIL	(~(u64)0)

static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);

static int dynamic_mt_modes = 6;
module_param(dynamic_mt_modes, int, 0644);
MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
static int target_smt_mode;
module_param(target_smt_mode, int, 0644);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");

static bool one_vm_per_core;
module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");

#ifdef CONFIG_KVM_XICS
static const struct kernel_param_ops module_param_ops = {
	.set = param_set_int,
	.get = param_get_int,
};

module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, 0644);
MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");

module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif

/* If set, guests are allowed to create and control nested guests */
static bool nested = true;
module_param(nested, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");

static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);

/*
 * RWMR values for POWER8. These control the rate at which PURR
 * and SPURR count and should be set according to the number of
 * online threads in the vcore being run.
 */
#define RWMR_RPA_P8_1THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_2THREAD	0x7FFF2908450D8DA9UL
#define RWMR_RPA_P8_3THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_4THREAD	0x199A421245058DA9UL
#define RWMR_RPA_P8_5THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_6THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_7THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_8THREAD	0x164520C62609AECAUL

static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = {
	RWMR_RPA_P8_1THREAD,
	RWMR_RPA_P8_1THREAD,
	RWMR_RPA_P8_2THREAD,
	RWMR_RPA_P8_3THREAD,
	RWMR_RPA_P8_4THREAD,
	RWMR_RPA_P8_5THREAD,
	RWMR_RPA_P8_6THREAD,
	RWMR_RPA_P8_7THREAD,
	RWMR_RPA_P8_8THREAD,
};

static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
		int *ip)
{
	int i = *ip;
	struct kvm_vcpu *vcpu;

	while (++i < MAX_SMT_THREADS) {
		vcpu = READ_ONCE(vc->runnable_threads[i]);
		if (vcpu) {
			*ip = i;
			return vcpu;
		}
	}
	return NULL;
}

/* Used to traverse the list of runnable threads for a given vcore */
#define for_each_runnable_thread(i, vcpu, vc) \
	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )

static bool kvmppc_ipi_thread(int cpu)
{
	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);

	/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
	if (kvmhv_on_pseries())
		return false;

	/* On POWER9 we can use msgsnd to IPI any cpu */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		msg |= get_hard_smp_processor_id(cpu);
		smp_mb();
		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
		return true;
	}

	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
		preempt_disable();
		if (cpu_first_thread_sibling(cpu) ==
		    cpu_first_thread_sibling(smp_processor_id())) {
			msg |= cpu_thread_in_core(cpu);
			smp_mb();
			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
			preempt_enable();
			return true;
		}
		preempt_enable();
	}

#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
	if (cpu >= 0 && cpu < nr_cpu_ids) {
		if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
			xics_wake_cpu(cpu);
			return true;
		}
		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
		return true;
	}
#endif

	return false;
}

static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
{
	int cpu;
	struct rcuwait *waitp;

	/*
	 * rcuwait_wake_up contains smp_mb() which orders prior stores that
	 * create pending work vs below loads of cpu fields. The other side
	 * is the barrier in vcpu run that orders setting the cpu fields vs
	 * testing for pending work.
	 */

	waitp = kvm_arch_vcpu_get_wait(vcpu);
	if (rcuwait_wake_up(waitp))
		++vcpu->stat.generic.halt_wakeup;

	cpu = READ_ONCE(vcpu->arch.thread_cpu);
	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
		return;

	/* CPU points to the first thread of the core */
	cpu = vcpu->cpu;
	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
		smp_send_reschedule(cpu);
}

/*
 * We use the vcpu_load/put functions to measure stolen time.
 *
 * Stolen time is counted as time when either the vcpu is able to
 * run as part of a virtual core, but the task running the vcore
 * is preempted or sleeping, or when the vcpu needs something done
 * in the kernel by the task running the vcpu, but that task is
 * preempted or sleeping. Those two things have to be counted
 * separately, since one of the vcpu tasks will take on the job
 * of running the core, and the other vcpu tasks in the vcore will
 * sleep waiting for it to do that, but that sleep shouldn't count
 * as stolen time.
 *
 * Hence we accumulate stolen time when the vcpu can run as part of
 * a vcore using vc->stolen_tb, and the stolen time when the vcpu
 * needs its task to do other things in the kernel (for example,
 * service a page fault) in busy_stolen. We don't accumulate
 * stolen time for a vcore when it is inactive, or for a vcpu
 * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of
 * a misnomer; it means that the vcpu task is not executing in
 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
 * the kernel. We don't have any way of dividing up that time
 * between time that the vcpu is genuinely stopped, time that
 * the task is actively working on behalf of the vcpu, and time
 * that the task is preempted, so we don't count any of it as
 * stolen.
 *
 * Updates to busy_stolen are protected by arch.tbacct_lock;
 * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
 * lock. The stolen times are measured in units of timebase ticks.
 * (Note that the != TB_NIL checks below are purely defensive;
 * they should never fail.)
 *
 * The POWER9 path is simpler, one vcpu per virtual core so the
 * former case does not exist. If a vcpu is preempted when it is
 * BUSY_IN_HOST and not ceded or otherwise blocked, then accumulate
 * the stolen cycles in busy_stolen. RUNNING is not a preemptible
 * state in the P9 path.
 */

static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb)
{
	unsigned long flags;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	vc->preempt_tb = tb;
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
}

static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb)
{
	unsigned long flags;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	if (vc->preempt_tb != TB_NIL) {
		vc->stolen_tb += tb - vc->preempt_tb;
		vc->preempt_tb = TB_NIL;
	}
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
}
317
318
static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
319
{
320
struct kvmppc_vcore *vc = vcpu->arch.vcore;
321
unsigned long flags;
322
u64 now;
323
324
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
325
if (vcpu->arch.busy_preempt != TB_NIL) {
326
WARN_ON_ONCE(vcpu->arch.state != KVMPPC_VCPU_BUSY_IN_HOST);
327
vc->stolen_tb += mftb() - vcpu->arch.busy_preempt;
328
vcpu->arch.busy_preempt = TB_NIL;
329
}
330
return;
331
}
332
333
now = mftb();
334
335
/*
336
* We can test vc->runner without taking the vcore lock,
337
* because only this task ever sets vc->runner to this
338
* vcpu, and once it is set to this vcpu, only this task
339
* ever sets it to NULL.
340
*/
341
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
342
kvmppc_core_end_stolen(vc, now);
343
344
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
345
if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
346
vcpu->arch.busy_preempt != TB_NIL) {
347
vcpu->arch.busy_stolen += now - vcpu->arch.busy_preempt;
348
vcpu->arch.busy_preempt = TB_NIL;
349
}
350
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
351
}
352
353
static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
354
{
355
struct kvmppc_vcore *vc = vcpu->arch.vcore;
356
unsigned long flags;
357
u64 now;
358
359
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
360
/*
361
* In the P9 path, RUNNABLE is not preemptible
362
* (nor takes host interrupts)
363
*/
364
WARN_ON_ONCE(vcpu->arch.state == KVMPPC_VCPU_RUNNABLE);
365
/*
366
* Account stolen time when preempted while the vcpu task is
367
* running in the kernel (but not in qemu, which is INACTIVE).
368
*/
369
if (task_is_running(current) &&
370
vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
371
vcpu->arch.busy_preempt = mftb();
372
return;
373
}
374
375
now = mftb();
376
377
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
378
kvmppc_core_start_stolen(vc, now);
379
380
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
381
if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
382
vcpu->arch.busy_preempt = now;
383
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
384
}
385
386
static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
387
{
388
vcpu->arch.pvr = pvr;
389
}
390
391
/* Dummy value used in computing PCR value below */
392
#define PCR_ARCH_31 (PCR_ARCH_300 << 1)
393
394
static inline unsigned long map_pcr_to_cap(unsigned long pcr)
395
{
396
unsigned long cap = 0;
397
398
switch (pcr) {
399
case PCR_ARCH_300:
400
cap = H_GUEST_CAP_POWER9;
401
break;
402
case PCR_ARCH_31:
403
if (cpu_has_feature(CPU_FTR_P11_PVR))
404
cap = H_GUEST_CAP_POWER11;
405
else
406
cap = H_GUEST_CAP_POWER10;
407
break;
408
default:
409
break;
410
}
411
412
return cap;
413
}
414
415
static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
416
{
417
unsigned long host_pcr_bit = 0, guest_pcr_bit = 0, cap = 0;
418
struct kvmppc_vcore *vc = vcpu->arch.vcore;
419
420
/* We can (emulate) our own architecture version and anything older */
421
if (cpu_has_feature(CPU_FTR_P11_PVR) || cpu_has_feature(CPU_FTR_ARCH_31))
422
host_pcr_bit = PCR_ARCH_31;
423
else if (cpu_has_feature(CPU_FTR_ARCH_300))
424
host_pcr_bit = PCR_ARCH_300;
425
else if (cpu_has_feature(CPU_FTR_ARCH_207S))
426
host_pcr_bit = PCR_ARCH_207;
427
else if (cpu_has_feature(CPU_FTR_ARCH_206))
428
host_pcr_bit = PCR_ARCH_206;
429
else
430
host_pcr_bit = PCR_ARCH_205;
431
432
/* Determine lowest PCR bit needed to run guest in given PVR level */
433
guest_pcr_bit = host_pcr_bit;
434
if (arch_compat) {
435
switch (arch_compat) {
436
case PVR_ARCH_205:
437
guest_pcr_bit = PCR_ARCH_205;
438
break;
439
case PVR_ARCH_206:
440
case PVR_ARCH_206p:
441
guest_pcr_bit = PCR_ARCH_206;
442
break;
443
case PVR_ARCH_207:
444
guest_pcr_bit = PCR_ARCH_207;
445
break;
446
case PVR_ARCH_300:
447
guest_pcr_bit = PCR_ARCH_300;
448
break;
449
case PVR_ARCH_31:
450
case PVR_ARCH_31_P11:
451
guest_pcr_bit = PCR_ARCH_31;
452
break;
453
default:
454
return -EINVAL;
455
}
456
}
457
458
/* Check requested PCR bits don't exceed our capabilities */
459
if (guest_pcr_bit > host_pcr_bit)
460
return -EINVAL;
461
462
if (kvmhv_on_pseries() && kvmhv_is_nestedv2()) {
463
/*
464
* 'arch_compat == 0' would mean the guest should default to
465
* L1's compatibility. In this case, the guest would pick
466
* host's PCR and evaluate the corresponding capabilities.
467
*/
468
cap = map_pcr_to_cap(guest_pcr_bit);
469
if (!(cap & nested_capabilities))
470
return -EINVAL;
471
}
472
473
spin_lock(&vc->lock);
474
vc->arch_compat = arch_compat;
475
kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LOGICAL_PVR);
476
/*
477
* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
478
* Also set all reserved PCR bits
479
*/
480
vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
481
spin_unlock(&vc->lock);
482
483
return 0;
484
}
485
486
static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
487
{
488
int r;
489
490
pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
491
pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
492
vcpu->arch.regs.nip, vcpu->arch.shregs.msr, vcpu->arch.trap);
493
for (r = 0; r < 16; ++r)
494
pr_err("r%2d = %.16lx r%d = %.16lx\n",
495
r, kvmppc_get_gpr(vcpu, r),
496
r+16, kvmppc_get_gpr(vcpu, r+16));
497
pr_err("ctr = %.16lx lr = %.16lx\n",
498
vcpu->arch.regs.ctr, vcpu->arch.regs.link);
499
pr_err("srr0 = %.16llx srr1 = %.16llx\n",
500
vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
501
pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
502
vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
503
pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
504
vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
505
pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n",
506
vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
507
pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
508
pr_err("fault dar = %.16lx dsisr = %.8x\n",
509
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
510
pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
511
for (r = 0; r < vcpu->arch.slb_max; ++r)
512
pr_err(" ESID = %.16llx VSID = %.16llx\n",
513
vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
514
pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.16lx\n",
515
vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
516
vcpu->arch.last_inst);
517
}
518
519
static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
520
{
521
return kvm_get_vcpu_by_id(kvm, id);
522
}
523
524
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
525
{
526
vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
527
vpa->yield_count = cpu_to_be32(1);
528
}
529
530
static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
531
unsigned long addr, unsigned long len)
532
{
533
/* check address is cacheline aligned */
534
if (addr & (L1_CACHE_BYTES - 1))
535
return -EINVAL;
536
spin_lock(&vcpu->arch.vpa_update_lock);
537
if (v->next_gpa != addr || v->len != len) {
538
v->next_gpa = addr;
539
v->len = addr ? len : 0;
540
v->update_pending = 1;
541
}
542
spin_unlock(&vcpu->arch.vpa_update_lock);
543
return 0;
544
}
545
546
/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
547
struct reg_vpa {
548
u32 dummy;
549
union {
550
__be16 hword;
551
__be32 word;
552
} length;
553
};
554
555
static int vpa_is_registered(struct kvmppc_vpa *vpap)
556
{
557
if (vpap->update_pending)
558
return vpap->next_gpa != 0;
559
return vpap->pinned_addr != NULL;
560
}
561
562
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
563
unsigned long flags,
564
unsigned long vcpuid, unsigned long vpa)
565
{
566
struct kvm *kvm = vcpu->kvm;
567
unsigned long len, nb;
568
void *va;
569
struct kvm_vcpu *tvcpu;
570
int err;
571
int subfunc;
572
struct kvmppc_vpa *vpap;
573
574
tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
575
if (!tvcpu)
576
return H_PARAMETER;
577
578
subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
579
if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
580
subfunc == H_VPA_REG_SLB) {
581
/* Registering new area - address must be cache-line aligned */
582
if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
583
return H_PARAMETER;
584
585
/* convert logical addr to kernel addr and read length */
586
va = kvmppc_pin_guest_page(kvm, vpa, &nb);
587
if (va == NULL)
588
return H_PARAMETER;
589
if (subfunc == H_VPA_REG_VPA)
590
len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
591
else
592
len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
593
kvmppc_unpin_guest_page(kvm, va, vpa, false);
594
595
/* Check length */
596
if (len > nb || len < sizeof(struct reg_vpa))
597
return H_PARAMETER;
598
} else {
599
vpa = 0;
600
len = 0;
601
}
602
603
err = H_PARAMETER;
604
vpap = NULL;
605
spin_lock(&tvcpu->arch.vpa_update_lock);
606
607
switch (subfunc) {
608
case H_VPA_REG_VPA: /* register VPA */
609
/*
610
* The size of our lppaca is 1kB because of the way we align
611
* it for the guest to avoid crossing a 4kB boundary. We only
612
* use 640 bytes of the structure though, so we should accept
613
* clients that set a size of 640.
614
*/
615
BUILD_BUG_ON(sizeof(struct lppaca) != 640);
616
if (len < sizeof(struct lppaca))
617
break;
618
vpap = &tvcpu->arch.vpa;
619
err = 0;
620
break;
621
622
case H_VPA_REG_DTL: /* register DTL */
623
if (len < sizeof(struct dtl_entry))
624
break;
625
len -= len % sizeof(struct dtl_entry);
626
627
/* Check that they have previously registered a VPA */
628
err = H_RESOURCE;
629
if (!vpa_is_registered(&tvcpu->arch.vpa))
630
break;
631
632
vpap = &tvcpu->arch.dtl;
633
err = 0;
634
break;
635
636
case H_VPA_REG_SLB: /* register SLB shadow buffer */
637
/* Check that they have previously registered a VPA */
638
err = H_RESOURCE;
639
if (!vpa_is_registered(&tvcpu->arch.vpa))
640
break;
641
642
vpap = &tvcpu->arch.slb_shadow;
643
err = 0;
644
break;
645
646
case H_VPA_DEREG_VPA: /* deregister VPA */
647
/* Check they don't still have a DTL or SLB buf registered */
648
err = H_RESOURCE;
649
if (vpa_is_registered(&tvcpu->arch.dtl) ||
650
vpa_is_registered(&tvcpu->arch.slb_shadow))
651
break;
652
653
vpap = &tvcpu->arch.vpa;
654
err = 0;
655
break;
656
657
case H_VPA_DEREG_DTL: /* deregister DTL */
658
vpap = &tvcpu->arch.dtl;
659
err = 0;
660
break;
661
662
case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */
663
vpap = &tvcpu->arch.slb_shadow;
664
err = 0;
665
break;
666
}
667
668
if (vpap) {
669
vpap->next_gpa = vpa;
670
vpap->len = len;
671
vpap->update_pending = 1;
672
}
673
674
spin_unlock(&tvcpu->arch.vpa_update_lock);
675
676
return err;
677
}
678
679
static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap,
680
struct kvmppc_vpa *old_vpap)
681
{
682
struct kvm *kvm = vcpu->kvm;
683
void *va;
684
unsigned long nb;
685
unsigned long gpa;
686
687
/*
688
* We need to pin the page pointed to by vpap->next_gpa,
689
* but we can't call kvmppc_pin_guest_page under the lock
690
* as it does get_user_pages() and down_read(). So we
691
* have to drop the lock, pin the page, then get the lock
692
* again and check that a new area didn't get registered
693
* in the meantime.
694
*/
695
for (;;) {
696
gpa = vpap->next_gpa;
697
spin_unlock(&vcpu->arch.vpa_update_lock);
698
va = NULL;
699
nb = 0;
700
if (gpa)
701
va = kvmppc_pin_guest_page(kvm, gpa, &nb);
702
spin_lock(&vcpu->arch.vpa_update_lock);
703
if (gpa == vpap->next_gpa)
704
break;
705
/* sigh... unpin that one and try again */
706
if (va)
707
kvmppc_unpin_guest_page(kvm, va, gpa, false);
708
}
709
710
vpap->update_pending = 0;
711
if (va && nb < vpap->len) {
712
/*
713
* If it's now too short, it must be that userspace
714
* has changed the mappings underlying guest memory,
715
* so unregister the region.
716
*/
717
kvmppc_unpin_guest_page(kvm, va, gpa, false);
718
va = NULL;
719
}
720
*old_vpap = *vpap;
721
722
vpap->gpa = gpa;
723
vpap->pinned_addr = va;
724
vpap->dirty = false;
725
if (va)
726
vpap->pinned_end = va + vpap->len;
727
}
728
729
static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
730
{
731
struct kvm *kvm = vcpu->kvm;
732
struct kvmppc_vpa old_vpa = { 0 };
733
734
if (!(vcpu->arch.vpa.update_pending ||
735
vcpu->arch.slb_shadow.update_pending ||
736
vcpu->arch.dtl.update_pending))
737
return;
738
739
spin_lock(&vcpu->arch.vpa_update_lock);
740
if (vcpu->arch.vpa.update_pending) {
741
kvmppc_update_vpa(vcpu, &vcpu->arch.vpa, &old_vpa);
742
if (old_vpa.pinned_addr) {
743
if (kvmhv_is_nestedv2())
744
kvmhv_nestedv2_set_vpa(vcpu, ~0ull);
745
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
746
old_vpa.dirty);
747
}
748
if (vcpu->arch.vpa.pinned_addr) {
749
init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
750
if (kvmhv_is_nestedv2())
751
kvmhv_nestedv2_set_vpa(vcpu, __pa(vcpu->arch.vpa.pinned_addr));
752
}
753
}
754
if (vcpu->arch.dtl.update_pending) {
755
kvmppc_update_vpa(vcpu, &vcpu->arch.dtl, &old_vpa);
756
if (old_vpa.pinned_addr)
757
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
758
old_vpa.dirty);
759
vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
760
vcpu->arch.dtl_index = 0;
761
}
762
if (vcpu->arch.slb_shadow.update_pending) {
763
kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow, &old_vpa);
764
if (old_vpa.pinned_addr)
765
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
766
old_vpa.dirty);
767
}
768
769
spin_unlock(&vcpu->arch.vpa_update_lock);
770
}
771
772
/*
773
* Return the accumulated stolen time for the vcore up until `now'.
774
* The caller should hold the vcore lock.
775
*/
776
static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
777
{
778
u64 p;
779
unsigned long flags;
780
781
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
782
783
spin_lock_irqsave(&vc->stoltb_lock, flags);
784
p = vc->stolen_tb;
785
if (vc->vcore_state != VCORE_INACTIVE &&
786
vc->preempt_tb != TB_NIL)
787
p += now - vc->preempt_tb;
788
spin_unlock_irqrestore(&vc->stoltb_lock, flags);
789
return p;
790
}
791
792
static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
793
struct lppaca *vpa,
794
unsigned int pcpu, u64 now,
795
unsigned long stolen)
796
{
797
struct dtl_entry *dt;
798
799
dt = vcpu->arch.dtl_ptr;
800
801
if (!dt)
802
return;
803
804
dt->dispatch_reason = 7;
805
dt->preempt_reason = 0;
806
dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid);
807
dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
808
dt->ready_to_enqueue_time = 0;
809
dt->waiting_to_ready_time = 0;
810
dt->timebase = cpu_to_be64(now);
811
dt->fault_addr = 0;
812
dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
813
dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
814
815
++dt;
816
if (dt == vcpu->arch.dtl.pinned_end)
817
dt = vcpu->arch.dtl.pinned_addr;
818
vcpu->arch.dtl_ptr = dt;
819
/* order writing *dt vs. writing vpa->dtl_idx */
820
smp_wmb();
821
vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
822
823
/* vcpu->arch.dtl.dirty is set by the caller */
824
}
825
826
static void kvmppc_update_vpa_dispatch(struct kvm_vcpu *vcpu,
827
struct kvmppc_vcore *vc)
828
{
829
struct lppaca *vpa;
830
unsigned long stolen;
831
unsigned long core_stolen;
832
u64 now;
833
unsigned long flags;
834
835
vpa = vcpu->arch.vpa.pinned_addr;
836
if (!vpa)
837
return;
838
839
now = mftb();
840
841
core_stolen = vcore_stolen_time(vc, now);
842
stolen = core_stolen - vcpu->arch.stolen_logged;
843
vcpu->arch.stolen_logged = core_stolen;
844
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
845
stolen += vcpu->arch.busy_stolen;
846
vcpu->arch.busy_stolen = 0;
847
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
848
849
vpa->enqueue_dispatch_tb = cpu_to_be64(be64_to_cpu(vpa->enqueue_dispatch_tb) + stolen);
850
851
__kvmppc_create_dtl_entry(vcpu, vpa, vc->pcpu, now + kvmppc_get_tb_offset(vcpu), stolen);
852
853
vcpu->arch.vpa.dirty = true;
854
}
855
856
static void kvmppc_update_vpa_dispatch_p9(struct kvm_vcpu *vcpu,
857
struct kvmppc_vcore *vc,
858
u64 now)
859
{
860
struct lppaca *vpa;
861
unsigned long stolen;
862
unsigned long stolen_delta;
863
864
vpa = vcpu->arch.vpa.pinned_addr;
865
if (!vpa)
866
return;
867
868
stolen = vc->stolen_tb;
869
stolen_delta = stolen - vcpu->arch.stolen_logged;
870
vcpu->arch.stolen_logged = stolen;
871
872
vpa->enqueue_dispatch_tb = cpu_to_be64(stolen);
873
874
__kvmppc_create_dtl_entry(vcpu, vpa, vc->pcpu, now, stolen_delta);
875
876
vcpu->arch.vpa.dirty = true;
877
}
878
879
/* See if there is a doorbell interrupt pending for a vcpu */
880
static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
881
{
882
int thr;
883
struct kvmppc_vcore *vc;
884
885
if (vcpu->arch.doorbell_request)
886
return true;
887
if (cpu_has_feature(CPU_FTR_ARCH_300))
888
return false;
889
/*
890
* Ensure that the read of vcore->dpdes comes after the read
891
* of vcpu->doorbell_request. This barrier matches the
892
* smp_wmb() in kvmppc_guest_entry_inject().
893
*/
894
smp_rmb();
895
vc = vcpu->arch.vcore;
896
thr = vcpu->vcpu_id - vc->first_vcpuid;
897
return !!(vc->dpdes & (1 << thr));
898
}
899
900
static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
901
{
902
if (kvmppc_get_arch_compat(vcpu) >= PVR_ARCH_207)
903
return true;
904
if ((!kvmppc_get_arch_compat(vcpu)) &&
905
cpu_has_feature(CPU_FTR_ARCH_207S))
906
return true;
907
return false;
908
}
909
910
static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
911
unsigned long resource, unsigned long value1,
912
unsigned long value2)
913
{
914
switch (resource) {
915
case H_SET_MODE_RESOURCE_SET_CIABR:
916
if (!kvmppc_power8_compatible(vcpu))
917
return H_P2;
918
if (value2)
919
return H_P4;
920
if (mflags)
921
return H_UNSUPPORTED_FLAG_START;
922
/* Guests can't breakpoint the hypervisor */
923
if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
924
return H_P3;
925
kvmppc_set_ciabr_hv(vcpu, value1);
926
return H_SUCCESS;
927
case H_SET_MODE_RESOURCE_SET_DAWR0:
928
if (!kvmppc_power8_compatible(vcpu))
929
return H_P2;
930
if (!ppc_breakpoint_available())
931
return H_P2;
932
if (mflags)
933
return H_UNSUPPORTED_FLAG_START;
934
if (value2 & DABRX_HYP)
935
return H_P4;
936
kvmppc_set_dawr0_hv(vcpu, value1);
937
kvmppc_set_dawrx0_hv(vcpu, value2);
938
return H_SUCCESS;
939
case H_SET_MODE_RESOURCE_SET_DAWR1:
940
if (!kvmppc_power8_compatible(vcpu))
941
return H_P2;
942
if (!ppc_breakpoint_available())
943
return H_P2;
944
if (!cpu_has_feature(CPU_FTR_DAWR1))
945
return H_P2;
946
if (!vcpu->kvm->arch.dawr1_enabled)
947
return H_FUNCTION;
948
if (mflags)
949
return H_UNSUPPORTED_FLAG_START;
950
if (value2 & DABRX_HYP)
951
return H_P4;
952
kvmppc_set_dawr1_hv(vcpu, value1);
953
kvmppc_set_dawrx1_hv(vcpu, value2);
954
return H_SUCCESS;
955
case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
956
/*
957
* KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
958
* Keep this in synch with kvmppc_filter_guest_lpcr_hv.
959
*/
960
if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
961
kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
962
return H_UNSUPPORTED_FLAG_START;
963
return H_TOO_HARD;
964
default:
965
return H_TOO_HARD;
966
}
967
}
968
969
/* Copy guest memory in place - must reside within a single memslot */
970
static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
971
unsigned long len)
972
{
973
struct kvm_memory_slot *to_memslot = NULL;
974
struct kvm_memory_slot *from_memslot = NULL;
975
unsigned long to_addr, from_addr;
976
int r;
977
978
/* Get HPA for from address */
979
from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
980
if (!from_memslot)
981
return -EFAULT;
982
if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
983
<< PAGE_SHIFT))
984
return -EINVAL;
985
from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
986
if (kvm_is_error_hva(from_addr))
987
return -EFAULT;
988
from_addr |= (from & (PAGE_SIZE - 1));
989
990
/* Get HPA for to address */
991
to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
992
if (!to_memslot)
993
return -EFAULT;
994
if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
995
<< PAGE_SHIFT))
996
return -EINVAL;
997
to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
998
if (kvm_is_error_hva(to_addr))
999
return -EFAULT;
1000
to_addr |= (to & (PAGE_SIZE - 1));
1001
1002
/* Perform copy */
1003
r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
1004
len);
1005
if (r)
1006
return -EFAULT;
1007
mark_page_dirty(kvm, to >> PAGE_SHIFT);
1008
return 0;
1009
}
1010
1011
static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
1012
unsigned long dest, unsigned long src)
1013
{
1014
u64 pg_sz = SZ_4K; /* 4K page size */
1015
u64 pg_mask = SZ_4K - 1;
1016
int ret;
1017
1018
/* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
1019
if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
1020
H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
1021
return H_PARAMETER;
1022
1023
/* dest (and src if copy_page flag set) must be page aligned */
1024
if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
1025
return H_PARAMETER;
1026
1027
/* zero and/or copy the page as determined by the flags */
1028
if (flags & H_COPY_PAGE) {
1029
ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
1030
if (ret < 0)
1031
return H_PARAMETER;
1032
} else if (flags & H_ZERO_PAGE) {
1033
ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
1034
if (ret < 0)
1035
return H_PARAMETER;
1036
}
1037
1038
/* We can ignore the remaining flags */
1039
1040
return H_SUCCESS;
1041
}
1042
1043
static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
1044
{
1045
struct kvmppc_vcore *vcore = target->arch.vcore;
1046
1047
/*
1048
* We expect to have been called by the real mode handler
1049
* (kvmppc_rm_h_confer()) which would have directly returned
1050
* H_SUCCESS if the source vcore wasn't idle (e.g. if it may
1051
* have useful work to do and should not confer) so we don't
1052
* recheck that here.
1053
*
1054
* In the case of the P9 single vcpu per vcore case, the real
1055
* mode handler is not called but no other threads are in the
1056
* source vcore.
1057
*/
1058
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
1059
spin_lock(&vcore->lock);
1060
if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
1061
vcore->vcore_state != VCORE_INACTIVE &&
1062
vcore->runner)
1063
target = vcore->runner;
1064
spin_unlock(&vcore->lock);
1065
}
1066
1067
return kvm_vcpu_yield_to(target);
1068
}
1069
1070
static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
1071
{
1072
int yield_count = 0;
1073
struct lppaca *lppaca;
1074
1075
spin_lock(&vcpu->arch.vpa_update_lock);
1076
lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
1077
if (lppaca)
1078
yield_count = be32_to_cpu(lppaca->yield_count);
1079
spin_unlock(&vcpu->arch.vpa_update_lock);
1080
return yield_count;
1081
}
1082
1083
/*
1084
* H_RPT_INVALIDATE hcall handler for nested guests.
1085
*
1086
* Handles only nested process-scoped invalidation requests in L0.
1087
*/
1088
static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
1089
{
1090
unsigned long type = kvmppc_get_gpr(vcpu, 6);
1091
unsigned long pid, pg_sizes, start, end;
1092
1093
/*
1094
* The partition-scoped invalidations aren't handled here in L0.
1095
*/
1096
if (type & H_RPTI_TYPE_NESTED)
1097
return RESUME_HOST;
1098
1099
pid = kvmppc_get_gpr(vcpu, 4);
1100
pg_sizes = kvmppc_get_gpr(vcpu, 7);
1101
start = kvmppc_get_gpr(vcpu, 8);
1102
end = kvmppc_get_gpr(vcpu, 9);
1103
1104
do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
1105
type, pg_sizes, start, end);
1106
1107
kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
1108
return RESUME_GUEST;
1109
}
1110
1111
static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
1112
unsigned long id, unsigned long target,
1113
unsigned long type, unsigned long pg_sizes,
1114
unsigned long start, unsigned long end)
1115
{
1116
if (!kvm_is_radix(vcpu->kvm))
1117
return H_UNSUPPORTED;
1118
1119
if (end < start)
1120
return H_P5;
1121
1122
/*
1123
* Partition-scoped invalidation for nested guests.
1124
*/
1125
if (type & H_RPTI_TYPE_NESTED) {
1126
if (!nesting_enabled(vcpu->kvm))
1127
return H_FUNCTION;
1128
1129
/* Support only cores as target */
1130
if (target != H_RPTI_TARGET_CMMU)
1131
return H_P2;
1132
1133
return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
1134
start, end);
1135
}
1136
1137
/*
1138
* Process-scoped invalidation for L1 guests.
1139
*/
1140
do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
1141
type, pg_sizes, start, end);
1142
return H_SUCCESS;
1143
}
1144
1145
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
1146
{
1147
struct kvm *kvm = vcpu->kvm;
1148
unsigned long req = kvmppc_get_gpr(vcpu, 3);
1149
unsigned long target, ret = H_SUCCESS;
1150
int yield_count;
1151
struct kvm_vcpu *tvcpu;
1152
int idx, rc;
1153
1154
if (req <= MAX_HCALL_OPCODE &&
1155
!test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
1156
return RESUME_HOST;
1157
1158
switch (req) {
1159
case H_REMOVE:
1160
ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
1161
kvmppc_get_gpr(vcpu, 5),
1162
kvmppc_get_gpr(vcpu, 6));
1163
if (ret == H_TOO_HARD)
1164
return RESUME_HOST;
1165
break;
1166
case H_ENTER:
1167
ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
1168
kvmppc_get_gpr(vcpu, 5),
1169
kvmppc_get_gpr(vcpu, 6),
1170
kvmppc_get_gpr(vcpu, 7));
1171
if (ret == H_TOO_HARD)
1172
return RESUME_HOST;
1173
break;
1174
case H_READ:
1175
ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
1176
kvmppc_get_gpr(vcpu, 5));
1177
if (ret == H_TOO_HARD)
1178
return RESUME_HOST;
1179
break;
1180
case H_CLEAR_MOD:
1181
ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
1182
kvmppc_get_gpr(vcpu, 5));
1183
if (ret == H_TOO_HARD)
1184
return RESUME_HOST;
1185
break;
1186
case H_CLEAR_REF:
1187
ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
1188
kvmppc_get_gpr(vcpu, 5));
1189
if (ret == H_TOO_HARD)
1190
return RESUME_HOST;
1191
break;
1192
case H_PROTECT:
1193
ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
1194
kvmppc_get_gpr(vcpu, 5),
1195
kvmppc_get_gpr(vcpu, 6));
1196
if (ret == H_TOO_HARD)
1197
return RESUME_HOST;
1198
break;
1199
case H_BULK_REMOVE:
1200
ret = kvmppc_h_bulk_remove(vcpu);
1201
if (ret == H_TOO_HARD)
1202
return RESUME_HOST;
1203
break;
1204
1205
case H_CEDE:
1206
break;
1207
case H_PROD:
1208
target = kvmppc_get_gpr(vcpu, 4);
1209
tvcpu = kvmppc_find_vcpu(kvm, target);
1210
if (!tvcpu) {
1211
ret = H_PARAMETER;
1212
break;
1213
}
1214
tvcpu->arch.prodded = 1;
1215
smp_mb(); /* This orders prodded store vs ceded load */
1216
if (tvcpu->arch.ceded)
1217
kvmppc_fast_vcpu_kick_hv(tvcpu);
1218
break;
1219
case H_CONFER:
1220
target = kvmppc_get_gpr(vcpu, 4);
1221
if (target == -1)
1222
break;
1223
tvcpu = kvmppc_find_vcpu(kvm, target);
1224
if (!tvcpu) {
1225
ret = H_PARAMETER;
1226
break;
1227
}
1228
yield_count = kvmppc_get_gpr(vcpu, 5);
1229
if (kvmppc_get_yield_count(tvcpu) != yield_count)
1230
break;
1231
kvm_arch_vcpu_yield_to(tvcpu);
1232
break;
1233
case H_REGISTER_VPA:
1234
ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
1235
kvmppc_get_gpr(vcpu, 5),
1236
kvmppc_get_gpr(vcpu, 6));
1237
break;
1238
case H_RTAS:
1239
if (list_empty(&kvm->arch.rtas_tokens))
1240
return RESUME_HOST;
1241
1242
idx = srcu_read_lock(&kvm->srcu);
1243
rc = kvmppc_rtas_hcall(vcpu);
1244
srcu_read_unlock(&kvm->srcu, idx);
1245
1246
if (rc == -ENOENT)
1247
return RESUME_HOST;
1248
else if (rc == 0)
1249
break;
1250
1251
/* Send the error out to userspace via KVM_RUN */
1252
return rc;
1253
case H_LOGICAL_CI_LOAD:
1254
ret = kvmppc_h_logical_ci_load(vcpu);
1255
if (ret == H_TOO_HARD)
1256
return RESUME_HOST;
1257
break;
1258
case H_LOGICAL_CI_STORE:
1259
ret = kvmppc_h_logical_ci_store(vcpu);
1260
if (ret == H_TOO_HARD)
1261
return RESUME_HOST;
1262
break;
1263
case H_SET_MODE:
1264
ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
1265
kvmppc_get_gpr(vcpu, 5),
1266
kvmppc_get_gpr(vcpu, 6),
1267
kvmppc_get_gpr(vcpu, 7));
1268
if (ret == H_TOO_HARD)
1269
return RESUME_HOST;
1270
break;
1271
case H_XIRR:
1272
case H_CPPR:
1273
case H_EOI:
1274
case H_IPI:
1275
case H_IPOLL:
1276
case H_XIRR_X:
1277
if (kvmppc_xics_enabled(vcpu)) {
1278
if (xics_on_xive()) {
1279
ret = H_NOT_AVAILABLE;
1280
return RESUME_GUEST;
1281
}
1282
ret = kvmppc_xics_hcall(vcpu, req);
1283
break;
1284
}
1285
return RESUME_HOST;
1286
case H_SET_DABR:
1287
ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
1288
break;
1289
case H_SET_XDABR:
1290
ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
1291
kvmppc_get_gpr(vcpu, 5));
1292
break;
1293
#ifdef CONFIG_SPAPR_TCE_IOMMU
1294
case H_GET_TCE:
1295
ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1296
kvmppc_get_gpr(vcpu, 5));
1297
if (ret == H_TOO_HARD)
1298
return RESUME_HOST;
1299
break;
1300
case H_PUT_TCE:
1301
ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1302
kvmppc_get_gpr(vcpu, 5),
1303
kvmppc_get_gpr(vcpu, 6));
1304
if (ret == H_TOO_HARD)
1305
return RESUME_HOST;
1306
break;
1307
case H_PUT_TCE_INDIRECT:
1308
ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
1309
kvmppc_get_gpr(vcpu, 5),
1310
kvmppc_get_gpr(vcpu, 6),
1311
kvmppc_get_gpr(vcpu, 7));
1312
if (ret == H_TOO_HARD)
1313
return RESUME_HOST;
1314
break;
1315
case H_STUFF_TCE:
1316
ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1317
kvmppc_get_gpr(vcpu, 5),
1318
kvmppc_get_gpr(vcpu, 6),
1319
kvmppc_get_gpr(vcpu, 7));
1320
if (ret == H_TOO_HARD)
1321
return RESUME_HOST;
1322
break;
1323
#endif
1324
case H_RANDOM: {
1325
unsigned long rand;
1326
1327
if (!arch_get_random_seed_longs(&rand, 1))
1328
ret = H_HARDWARE;
1329
kvmppc_set_gpr(vcpu, 4, rand);
1330
break;
1331
}
1332
case H_RPT_INVALIDATE:
1333
ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
1334
kvmppc_get_gpr(vcpu, 5),
1335
kvmppc_get_gpr(vcpu, 6),
1336
kvmppc_get_gpr(vcpu, 7),
1337
kvmppc_get_gpr(vcpu, 8),
1338
kvmppc_get_gpr(vcpu, 9));
1339
break;
1340
1341
case H_SET_PARTITION_TABLE:
1342
ret = H_FUNCTION;
1343
if (nesting_enabled(kvm))
1344
ret = kvmhv_set_partition_table(vcpu);
1345
break;
1346
case H_ENTER_NESTED:
1347
ret = H_FUNCTION;
1348
if (!nesting_enabled(kvm))
1349
break;
1350
ret = kvmhv_enter_nested_guest(vcpu);
1351
if (ret == H_INTERRUPT) {
1352
kvmppc_set_gpr(vcpu, 3, 0);
1353
vcpu->arch.hcall_needed = 0;
1354
return -EINTR;
1355
} else if (ret == H_TOO_HARD) {
1356
kvmppc_set_gpr(vcpu, 3, 0);
1357
vcpu->arch.hcall_needed = 0;
1358
return RESUME_HOST;
1359
}
1360
break;
1361
case H_TLB_INVALIDATE:
1362
ret = H_FUNCTION;
1363
if (nesting_enabled(kvm))
1364
ret = kvmhv_do_nested_tlbie(vcpu);
1365
break;
1366
case H_COPY_TOFROM_GUEST:
1367
ret = H_FUNCTION;
1368
if (nesting_enabled(kvm))
1369
ret = kvmhv_copy_tofrom_guest_nested(vcpu);
1370
break;
1371
case H_PAGE_INIT:
1372
ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
1373
kvmppc_get_gpr(vcpu, 5),
1374
kvmppc_get_gpr(vcpu, 6));
1375
break;
1376
case H_SVM_PAGE_IN:
1377
ret = H_UNSUPPORTED;
1378
if (kvmppc_get_srr1(vcpu) & MSR_S)
1379
ret = kvmppc_h_svm_page_in(kvm,
1380
kvmppc_get_gpr(vcpu, 4),
1381
kvmppc_get_gpr(vcpu, 5),
1382
kvmppc_get_gpr(vcpu, 6));
1383
break;
1384
case H_SVM_PAGE_OUT:
1385
ret = H_UNSUPPORTED;
1386
if (kvmppc_get_srr1(vcpu) & MSR_S)
1387
ret = kvmppc_h_svm_page_out(kvm,
1388
kvmppc_get_gpr(vcpu, 4),
1389
kvmppc_get_gpr(vcpu, 5),
1390
kvmppc_get_gpr(vcpu, 6));
1391
break;
1392
case H_SVM_INIT_START:
1393
ret = H_UNSUPPORTED;
1394
if (kvmppc_get_srr1(vcpu) & MSR_S)
1395
ret = kvmppc_h_svm_init_start(kvm);
1396
break;
1397
case H_SVM_INIT_DONE:
1398
ret = H_UNSUPPORTED;
1399
if (kvmppc_get_srr1(vcpu) & MSR_S)
1400
ret = kvmppc_h_svm_init_done(kvm);
1401
break;
1402
case H_SVM_INIT_ABORT:
1403
/*
1404
* Even if that call is made by the Ultravisor, the SSR1 value
1405
* is the guest context one, with the secure bit clear as it has
1406
* not yet been secured. So we can't check it here.
1407
* Instead the kvm->arch.secure_guest flag is checked inside
1408
* kvmppc_h_svm_init_abort().
1409
*/
1410
ret = kvmppc_h_svm_init_abort(kvm);
1411
break;
1412
1413
default:
1414
return RESUME_HOST;
1415
}
1416
WARN_ON_ONCE(ret == H_TOO_HARD);
1417
kvmppc_set_gpr(vcpu, 3, ret);
1418
vcpu->arch.hcall_needed = 0;
1419
return RESUME_GUEST;
1420
}
1421
1422
/*
1423
* Handle H_CEDE in the P9 path where we don't call the real-mode hcall
1424
* handlers in book3s_hv_rmhandlers.S.
1425
*
1426
* This has to be done early, not in kvmppc_pseries_do_hcall(), so
1427
* that the cede logic in kvmppc_run_single_vcpu() works properly.
1428
*/
1429
static void kvmppc_cede(struct kvm_vcpu *vcpu)
1430
{
1431
__kvmppc_set_msr_hv(vcpu, __kvmppc_get_msr_hv(vcpu) | MSR_EE);
1432
vcpu->arch.ceded = 1;
1433
smp_mb();
1434
if (vcpu->arch.prodded) {
1435
vcpu->arch.prodded = 0;
1436
smp_mb();
1437
vcpu->arch.ceded = 0;
1438
}
1439
}
1440
1441
static int kvmppc_hcall_impl_hv(unsigned long cmd)
1442
{
1443
switch (cmd) {
1444
case H_CEDE:
1445
case H_PROD:
1446
case H_CONFER:
1447
case H_REGISTER_VPA:
1448
case H_SET_MODE:
1449
#ifdef CONFIG_SPAPR_TCE_IOMMU
1450
case H_GET_TCE:
1451
case H_PUT_TCE:
1452
case H_PUT_TCE_INDIRECT:
1453
case H_STUFF_TCE:
1454
#endif
1455
case H_LOGICAL_CI_LOAD:
1456
case H_LOGICAL_CI_STORE:
1457
#ifdef CONFIG_KVM_XICS
1458
case H_XIRR:
1459
case H_CPPR:
1460
case H_EOI:
1461
case H_IPI:
1462
case H_IPOLL:
1463
case H_XIRR_X:
1464
#endif
1465
case H_PAGE_INIT:
1466
case H_RPT_INVALIDATE:
1467
return 1;
1468
}
1469
1470
/* See if it's in the real-mode table */
1471
return kvmppc_hcall_impl_hv_realmode(cmd);
1472
}
1473
1474
static int kvmppc_emulate_debug_inst(struct kvm_vcpu *vcpu)
1475
{
1476
ppc_inst_t last_inst;
1477
1478
if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
1479
EMULATE_DONE) {
1480
/*
1481
* Fetch failed, so return to guest and
1482
* try executing it again.
1483
*/
1484
return RESUME_GUEST;
1485
}
1486
1487
if (ppc_inst_val(last_inst) == KVMPPC_INST_SW_BREAKPOINT) {
1488
vcpu->run->exit_reason = KVM_EXIT_DEBUG;
1489
vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
1490
return RESUME_HOST;
1491
} else {
1492
kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
1493
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1494
return RESUME_GUEST;
1495
}
1496
}
1497
1498
static void do_nothing(void *x)
1499
{
1500
}
1501
1502
static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
1503
{
1504
int thr, cpu, pcpu, nthreads;
1505
struct kvm_vcpu *v;
1506
unsigned long dpdes;
1507
1508
nthreads = vcpu->kvm->arch.emul_smt_mode;
1509
dpdes = 0;
1510
cpu = vcpu->vcpu_id & ~(nthreads - 1);
1511
for (thr = 0; thr < nthreads; ++thr, ++cpu) {
1512
v = kvmppc_find_vcpu(vcpu->kvm, cpu);
1513
if (!v)
1514
continue;
1515
/*
1516
* If the vcpu is currently running on a physical cpu thread,
1517
* interrupt it in order to pull it out of the guest briefly,
1518
* which will update its vcore->dpdes value.
1519
*/
1520
pcpu = READ_ONCE(v->cpu);
1521
if (pcpu >= 0)
1522
smp_call_function_single(pcpu, do_nothing, NULL, 1);
1523
if (kvmppc_doorbell_pending(v))
1524
dpdes |= 1 << thr;
1525
}
1526
return dpdes;
1527
}
1528
1529
/*
1530
* On POWER9, emulate doorbell-related instructions in order to
1531
* give the guest the illusion of running on a multi-threaded core.
1532
* The instructions emulated are msgsndp, msgclrp, mfspr TIR,
1533
* and mfspr DPDES.
1534
*/
1535
static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
1536
{
1537
u32 inst, rb, thr;
1538
unsigned long arg;
1539
struct kvm *kvm = vcpu->kvm;
1540
struct kvm_vcpu *tvcpu;
1541
ppc_inst_t pinst;
1542
1543
if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst) != EMULATE_DONE)
1544
return RESUME_GUEST;
1545
inst = ppc_inst_val(pinst);
1546
if (get_op(inst) != 31)
1547
return EMULATE_FAIL;
1548
rb = get_rb(inst);
1549
thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
1550
switch (get_xop(inst)) {
1551
case OP_31_XOP_MSGSNDP:
1552
arg = kvmppc_get_gpr(vcpu, rb);
1553
if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
1554
break;
1555
arg &= 0x7f;
1556
if (arg >= kvm->arch.emul_smt_mode)
1557
break;
1558
tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
1559
if (!tvcpu)
1560
break;
1561
if (!tvcpu->arch.doorbell_request) {
1562
tvcpu->arch.doorbell_request = 1;
1563
kvmppc_fast_vcpu_kick_hv(tvcpu);
1564
}
1565
break;
1566
case OP_31_XOP_MSGCLRP:
1567
arg = kvmppc_get_gpr(vcpu, rb);
1568
if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
1569
break;
1570
vcpu->arch.vcore->dpdes = 0;
1571
vcpu->arch.doorbell_request = 0;
1572
break;
1573
case OP_31_XOP_MFSPR:
1574
switch (get_sprn(inst)) {
1575
case SPRN_TIR:
1576
arg = thr;
1577
break;
1578
case SPRN_DPDES:
1579
arg = kvmppc_read_dpdes(vcpu);
1580
break;
1581
default:
1582
return EMULATE_FAIL;
1583
}
1584
kvmppc_set_gpr(vcpu, get_rt(inst), arg);
1585
break;
1586
default:
1587
return EMULATE_FAIL;
1588
}
1589
kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
1590
return RESUME_GUEST;
1591
}
1592
1593
/*
1594
* If the lppaca had pmcregs_in_use clear when we exited the guest, then
1595
* HFSCR_PM is cleared for next entry. If the guest then tries to access
1596
* the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM
1597
* back in the guest HFSCR will cause the next entry to load the PMU SPRs and
1598
* allow the guest access to continue.
1599
*/
1600
static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
1601
{
1602
if (!(vcpu->arch.hfscr_permitted & HFSCR_PM))
1603
return EMULATE_FAIL;
1604
1605
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_PM);
1606
1607
return RESUME_GUEST;
1608
}
1609
1610
static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu)
1611
{
1612
if (!(vcpu->arch.hfscr_permitted & HFSCR_EBB))
1613
return EMULATE_FAIL;
1614
1615
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_EBB);
1616
1617
return RESUME_GUEST;
1618
}
1619
1620
static int kvmppc_tm_unavailable(struct kvm_vcpu *vcpu)
1621
{
1622
if (!(vcpu->arch.hfscr_permitted & HFSCR_TM))
1623
return EMULATE_FAIL;
1624
1625
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_TM);
1626
1627
return RESUME_GUEST;
1628
}
1629
1630
static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
1631
struct task_struct *tsk)
1632
{
1633
struct kvm_run *run = vcpu->run;
1634
int r = RESUME_HOST;
1635
1636
vcpu->stat.sum_exits++;
1637
1638
/*
1639
* This can happen if an interrupt occurs in the last stages
1640
* of guest entry or the first stages of guest exit (i.e. after
1641
* setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
1642
* and before setting it to KVM_GUEST_MODE_HOST_HV).
1643
* That can happen due to a bug, or due to a machine check
1644
* occurring at just the wrong time.
1645
*/
1646
if (!kvmhv_is_nestedv2() && (__kvmppc_get_msr_hv(vcpu) & MSR_HV)) {
1647
printk(KERN_EMERG "KVM trap in HV mode!\n");
1648
printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
1649
vcpu->arch.trap, kvmppc_get_pc(vcpu),
1650
vcpu->arch.shregs.msr);
1651
kvmppc_dump_regs(vcpu);
1652
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
1653
run->hw.hardware_exit_reason = vcpu->arch.trap;
1654
return RESUME_HOST;
1655
}
1656
run->exit_reason = KVM_EXIT_UNKNOWN;
1657
run->ready_for_interrupt_injection = 1;
1658
switch (vcpu->arch.trap) {
1659
/* We're good on these - the host merely wanted to get our attention */
1660
case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
1661
WARN_ON_ONCE(1); /* Should never happen */
1662
vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
1663
fallthrough;
1664
case BOOK3S_INTERRUPT_HV_DECREMENTER:
1665
vcpu->stat.dec_exits++;
1666
r = RESUME_GUEST;
1667
break;
1668
case BOOK3S_INTERRUPT_EXTERNAL:
1669
case BOOK3S_INTERRUPT_H_DOORBELL:
1670
case BOOK3S_INTERRUPT_H_VIRT:
1671
vcpu->stat.ext_intr_exits++;
1672
r = RESUME_GUEST;
1673
break;
1674
/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
1675
case BOOK3S_INTERRUPT_HMI:
1676
case BOOK3S_INTERRUPT_PERFMON:
1677
case BOOK3S_INTERRUPT_SYSTEM_RESET:
1678
r = RESUME_GUEST;
1679
break;
1680
case BOOK3S_INTERRUPT_MACHINE_CHECK: {
1681
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
1682
DEFAULT_RATELIMIT_BURST);
1683
/*
1684
* Print the MCE event to host console. Ratelimit so the guest
1685
* can't flood the host log.
1686
*/
1687
if (__ratelimit(&rs))
1688
machine_check_print_event_info(&vcpu->arch.mce_evt,false, true);
1689
1690
/*
1691
* If the guest can do FWNMI, exit to userspace so it can
1692
* deliver a FWNMI to the guest.
1693
* Otherwise we synthesize a machine check for the guest
1694
* so that it knows that the machine check occurred.
1695
*/
1696
if (!vcpu->kvm->arch.fwnmi_enabled) {
1697
ulong flags = (__kvmppc_get_msr_hv(vcpu) & 0x083c0000) |
1698
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
1699
kvmppc_core_queue_machine_check(vcpu, flags);
1700
r = RESUME_GUEST;
1701
break;
1702
}
1703
1704
/* Exit to guest with KVM_EXIT_NMI as exit reason */
1705
run->exit_reason = KVM_EXIT_NMI;
1706
run->hw.hardware_exit_reason = vcpu->arch.trap;
1707
/* Clear out the old NMI status from run->flags */
1708
run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
1709
/* Now set the NMI status */
1710
if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
1711
run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
1712
else
1713
run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
1714
1715
r = RESUME_HOST;
1716
break;
1717
}
1718
case BOOK3S_INTERRUPT_PROGRAM:
1719
{
1720
ulong flags;
1721
/*
1722
* Normally program interrupts are delivered directly
1723
* to the guest by the hardware, but we can get here
1724
* as a result of a hypervisor emulation interrupt
1725
* (e40) getting turned into a 700 by BML RTAS.
1726
*/
1727
flags = (__kvmppc_get_msr_hv(vcpu) & 0x1f0000ull) |
1728
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
1729
kvmppc_core_queue_program(vcpu, flags);
1730
r = RESUME_GUEST;
1731
break;
1732
}
1733
case BOOK3S_INTERRUPT_SYSCALL:
1734
{
1735
int i;
1736
1737
if (!kvmhv_is_nestedv2() && unlikely(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
1738
/*
1739
* Guest userspace executed sc 1. This can only be
1740
* reached by the P9 path because the old path
1741
* handles this case in realmode hcall handlers.
1742
*/
1743
if (!kvmhv_vcpu_is_radix(vcpu)) {
1744
/*
1745
* A guest could be running PR KVM, so this
1746
* may be a PR KVM hcall. It must be reflected
1747
* to the guest kernel as a sc interrupt.
1748
*/
1749
kvmppc_core_queue_syscall(vcpu);
1750
} else {
1751
/*
1752
* Radix guests can not run PR KVM or nested HV
1753
* hash guests which might run PR KVM, so this
1754
* is always a privilege fault. Send a program
1755
* check to guest kernel.
1756
*/
1757
kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
1758
}
1759
r = RESUME_GUEST;
1760
break;
1761
}
1762
1763
/*
1764
* hcall - gather args and set exit_reason. This will next be
1765
* handled by kvmppc_pseries_do_hcall which may be able to deal
1766
* with it and resume guest, or may punt to userspace.
1767
*/
1768
run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
1769
for (i = 0; i < 9; ++i)
1770
run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
1771
run->exit_reason = KVM_EXIT_PAPR_HCALL;
1772
vcpu->arch.hcall_needed = 1;
1773
r = RESUME_HOST;
1774
break;
1775
}
1776
/*
1777
* We get these next two if the guest accesses a page which it thinks
1778
* it has mapped but which is not actually present, either because
1779
* it is for an emulated I/O device or because the corresponding
1780
* host page has been paged out.
1781
*
1782
* Any other HDSI/HISI interrupts have been handled already for P7/8
1783
* guests. For POWER9 hash guests not using rmhandlers, basic hash
1784
* fault handling is done here.
1785
*/
1786
case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
1787
unsigned long vsid;
1788
long err;
1789
1790
if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
1791
unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) {
1792
r = RESUME_GUEST; /* Just retry if it's the canary */
1793
break;
1794
}
1795
1796
if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
1797
/*
1798
* Radix doesn't require anything, and pre-ISAv3.0 hash
1799
* already attempted to handle this in rmhandlers. The
1800
* hash fault handling below is v3 only (it uses ASDR
1801
* via fault_gpa).
1802
*/
1803
r = RESUME_PAGE_FAULT;
1804
break;
1805
}
1806
1807
if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
1808
kvmppc_core_queue_data_storage(vcpu,
1809
kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
1810
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
1811
r = RESUME_GUEST;
1812
break;
1813
}
1814
1815
if (!(__kvmppc_get_msr_hv(vcpu) & MSR_DR))
1816
vsid = vcpu->kvm->arch.vrma_slb_v;
1817
else
1818
vsid = vcpu->arch.fault_gpa;
1819
1820
err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
1821
vsid, vcpu->arch.fault_dsisr, true);
1822
if (err == 0) {
1823
r = RESUME_GUEST;
1824
} else if (err == -1 || err == -2) {
1825
r = RESUME_PAGE_FAULT;
1826
} else {
1827
kvmppc_core_queue_data_storage(vcpu,
1828
kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
1829
vcpu->arch.fault_dar, err);
1830
r = RESUME_GUEST;
1831
}
1832
break;
1833
}
1834
case BOOK3S_INTERRUPT_H_INST_STORAGE: {
1835
unsigned long vsid;
1836
long err;
1837
1838
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1839
vcpu->arch.fault_dsisr = __kvmppc_get_msr_hv(vcpu) &
1840
DSISR_SRR1_MATCH_64S;
1841
if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
1842
/*
1843
* Radix doesn't require anything, and pre-ISAv3.0 hash
1844
* already attempted to handle this in rmhandlers. The
1845
* hash fault handling below is v3 only (it uses ASDR
1846
* via fault_gpa).
1847
*/
1848
if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
1849
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
1850
r = RESUME_PAGE_FAULT;
1851
break;
1852
}
1853
1854
if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
1855
kvmppc_core_queue_inst_storage(vcpu,
1856
vcpu->arch.fault_dsisr |
1857
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1858
r = RESUME_GUEST;
1859
break;
1860
}
1861
1862
if (!(__kvmppc_get_msr_hv(vcpu) & MSR_IR))
1863
vsid = vcpu->kvm->arch.vrma_slb_v;
1864
else
1865
vsid = vcpu->arch.fault_gpa;
1866
1867
err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
1868
vsid, vcpu->arch.fault_dsisr, false);
1869
if (err == 0) {
1870
r = RESUME_GUEST;
1871
} else if (err == -1) {
1872
r = RESUME_PAGE_FAULT;
1873
} else {
1874
kvmppc_core_queue_inst_storage(vcpu,
1875
err | (kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1876
r = RESUME_GUEST;
1877
}
1878
break;
1879
}

	/*
	 * This occurs if the guest executes an illegal instruction.
	 * If the guest debug is disabled, generate a program interrupt
	 * to the guest. If guest debug is enabled, we need to check
	 * whether the instruction is a software breakpoint instruction.
	 * Accordingly return to Guest or Host.
	 */
	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
		if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
			vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
				swab32(vcpu->arch.emul_inst) :
				vcpu->arch.emul_inst;
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
			r = kvmppc_emulate_debug_inst(vcpu);
		} else {
			kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
				(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
			r = RESUME_GUEST;
		}
		break;

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
		/*
		 * This occurs for various TM-related instructions that
		 * we need to emulate on POWER9 DD2.2. We have already
		 * handled the cases where the guest was in real-suspend
		 * mode and was transitioning to transactional state.
		 */
		r = kvmhv_p9_tm_emulation(vcpu);
		if (r != -1)
			break;
		fallthrough; /* go to facility unavailable handler */
#endif

	/*
	 * This occurs if the guest (kernel or userspace) does something that
	 * is prohibited by HFSCR.
	 * On POWER9, this could be a doorbell instruction that we need
	 * to emulate.
	 * Otherwise, we just generate a program interrupt to the guest.
	 */
	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
		u64 cause = kvmppc_get_hfscr_hv(vcpu) >> 56;

		r = EMULATE_FAIL;
		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			switch (cause) {
			case FSCR_MSGP_LG:
				r = kvmppc_emulate_doorbell_instr(vcpu);
				break;
			case FSCR_PM_LG:
				r = kvmppc_pmu_unavailable(vcpu);
				break;
			case FSCR_EBB_LG:
				r = kvmppc_ebb_unavailable(vcpu);
				break;
			case FSCR_TM_LG:
				r = kvmppc_tm_unavailable(vcpu);
				break;
			default:
				break;
			}
		}
		if (r == EMULATE_FAIL) {
			kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
				(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
			r = RESUME_GUEST;
		}
		break;
	}

	case BOOK3S_INTERRUPT_HV_RM_HARD:
		r = RESUME_PASSTHROUGH;
		break;
	default:
		kvmppc_dump_regs(vcpu);
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			__kvmppc_get_msr_hv(vcpu));
		run->hw.hardware_exit_reason = vcpu->arch.trap;
		r = RESUME_HOST;
		break;
	}

	return r;
}

static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
{
	int r;
	int srcu_idx;

	vcpu->stat.sum_exits++;

	/*
	 * This can happen if an interrupt occurs in the last stages
	 * of guest entry or the first stages of guest exit (i.e. after
	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
	 * That can happen due to a bug, or due to a machine check
	 * occurring at just the wrong time.
	 */
	if (__kvmppc_get_msr_hv(vcpu) & MSR_HV) {
		pr_emerg("KVM trap in HV mode while nested!\n");
		pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			 vcpu->arch.trap, kvmppc_get_pc(vcpu),
			 __kvmppc_get_msr_hv(vcpu));
		kvmppc_dump_regs(vcpu);
		return RESUME_HOST;
	}
	switch (vcpu->arch.trap) {
	/* We're good on these - the host merely wanted to get our attention */
	case BOOK3S_INTERRUPT_HV_DECREMENTER:
		vcpu->stat.dec_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_EXTERNAL:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_HOST;
		break;
	case BOOK3S_INTERRUPT_H_DOORBELL:
	case BOOK3S_INTERRUPT_H_VIRT:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_GUEST;
		break;
	/* These need to go to the nested HV */
	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
		vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
		vcpu->stat.dec_exits++;
		r = RESUME_HOST;
		break;
	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest. */
	case BOOK3S_INTERRUPT_HMI:
	case BOOK3S_INTERRUPT_PERFMON:
	case BOOK3S_INTERRUPT_SYSTEM_RESET:
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_MACHINE_CHECK:
	{
		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);
		/* Pass the machine check to the L1 guest */
		r = RESUME_HOST;
		/* Print the MCE event to host console. */
		if (__ratelimit(&rs))
			machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
		break;
	}
	/*
	 * We get these next two if the guest accesses a page which it thinks
	 * it has mapped but which is not actually present, either because
	 * it is for an emulated I/O device or because the corresponding
	 * host page has been paged out.
	 */
	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = kvmhv_nested_page_fault(vcpu);
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
		break;
	case BOOK3S_INTERRUPT_H_INST_STORAGE:
		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
			DSISR_SRR1_MATCH_64S;
		if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = kvmhv_nested_page_fault(vcpu);
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
		break;

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
		/*
		 * This occurs for various TM-related instructions that
		 * we need to emulate on POWER9 DD2.2. We have already
		 * handled the cases where the guest was in real-suspend
		 * mode and was transitioning to transactional state.
		 */
		r = kvmhv_p9_tm_emulation(vcpu);
		if (r != -1)
			break;
		fallthrough; /* go to facility unavailable handler */
#endif

	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
		r = RESUME_HOST;
		break;

	case BOOK3S_INTERRUPT_HV_RM_HARD:
		vcpu->arch.trap = 0;
		r = RESUME_GUEST;
		if (!xics_on_xive())
			kvmppc_xics_rm_complete(vcpu, 0);
		break;
	case BOOK3S_INTERRUPT_SYSCALL:
	{
		unsigned long req = kvmppc_get_gpr(vcpu, 3);

		/*
		 * The H_RPT_INVALIDATE hcalls issued by nested
		 * guests for process-scoped invalidations when
		 * GTSE=0, are handled here in L0.
		 */
		if (req == H_RPT_INVALIDATE) {
			r = kvmppc_nested_h_rpt_invalidate(vcpu);
			break;
		}

		r = RESUME_HOST;
		break;
	}
	default:
		r = RESUME_HOST;
		break;
	}

	return r;
}

static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int i;

	memset(sregs, 0, sizeof(struct kvm_sregs));
	sregs->pvr = vcpu->arch.pvr;
	for (i = 0; i < vcpu->arch.slb_max; i++) {
		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
	}

	return 0;
}

static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int i, j;

	/* Only accept the same PVR as the host's, since we can't spoof it */
	if (sregs->pvr != vcpu->arch.pvr)
		return -EINVAL;

	j = 0;
	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
			++j;
		}
	}
	vcpu->arch.slb_max = j;

	return 0;
}

/*
 * Enforce limits on guest LPCR values based on hardware availability,
 * guest configuration, and possibly hypervisor support and security
 * concerns.
 */
unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr)
{
	/* LPCR_TC only applies to HPT guests */
	if (kvm_is_radix(kvm))
		lpcr &= ~LPCR_TC;

	/* On POWER8 and above, userspace can modify AIL */
	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
		lpcr &= ~LPCR_AIL;
	if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
		lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
	/*
	 * On some POWER9s we force AIL off for radix guests to prevent
	 * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
	 * guest, which can result in Q0 translations with LPID=0 PID=PIDR to
	 * be cached, which the host TLB management does not expect.
	 */
	if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
		lpcr &= ~LPCR_AIL;

	/*
	 * On POWER9, allow userspace to enable large decrementer for the
	 * guest, whether or not the host has it enabled.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		lpcr &= ~LPCR_LD;

	return lpcr;
}

static void verify_lpcr(struct kvm *kvm, unsigned long lpcr)
{
	if (lpcr != kvmppc_filter_lpcr_hv(kvm, lpcr)) {
		WARN_ONCE(1, "lpcr 0x%lx differs from filtered 0x%lx\n",
			  lpcr, kvmppc_filter_lpcr_hv(kvm, lpcr));
	}
}

static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
		bool preserve_top32)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	u64 mask;

	spin_lock(&vc->lock);

	/*
	 * Userspace can only modify
	 * DPFD (default prefetch depth), ILE (interrupt little-endian),
	 * TC (translation control), AIL (alternate interrupt location),
	 * LD (large decrementer).
	 * These are subject to restrictions from kvmppc_filter_lpcr_hv().
	 */
	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD;

	/* Broken 32-bit version of LPCR must not clear top bits */
	if (preserve_top32)
		mask &= 0xFFFFFFFF;

	new_lpcr = kvmppc_filter_lpcr_hv(kvm,
			(vc->lpcr & ~mask) | (new_lpcr & mask));

	/*
	 * If ILE (interrupt little-endian) has changed, update the
	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
	 */
	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
		struct kvm_vcpu *vcpu;
		unsigned long i;

		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (vcpu->arch.vcore != vc)
				continue;
			if (new_lpcr & LPCR_ILE)
				vcpu->arch.intr_msr |= MSR_LE;
			else
				vcpu->arch.intr_msr &= ~MSR_LE;
		}
	}

	vc->lpcr = new_lpcr;
	kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LPCR);

	spin_unlock(&vc->lock);
}
2229
2230
static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
2231
union kvmppc_one_reg *val)
2232
{
2233
int r = 0;
2234
long int i;
2235
2236
switch (id) {
2237
case KVM_REG_PPC_DEBUG_INST:
2238
*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
2239
break;
2240
case KVM_REG_PPC_HIOR:
2241
*val = get_reg_val(id, 0);
2242
break;
2243
case KVM_REG_PPC_DABR:
2244
*val = get_reg_val(id, vcpu->arch.dabr);
2245
break;
2246
case KVM_REG_PPC_DABRX:
2247
*val = get_reg_val(id, vcpu->arch.dabrx);
2248
break;
2249
case KVM_REG_PPC_DSCR:
2250
*val = get_reg_val(id, kvmppc_get_dscr_hv(vcpu));
2251
break;
2252
case KVM_REG_PPC_PURR:
2253
*val = get_reg_val(id, kvmppc_get_purr_hv(vcpu));
2254
break;
2255
case KVM_REG_PPC_SPURR:
2256
*val = get_reg_val(id, kvmppc_get_spurr_hv(vcpu));
2257
break;
2258
case KVM_REG_PPC_AMR:
2259
*val = get_reg_val(id, kvmppc_get_amr_hv(vcpu));
2260
break;
2261
case KVM_REG_PPC_UAMOR:
2262
*val = get_reg_val(id, kvmppc_get_uamor_hv(vcpu));
2263
break;
2264
case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
2265
i = id - KVM_REG_PPC_MMCR0;
2266
*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, i));
2267
break;
2268
case KVM_REG_PPC_MMCR2:
2269
*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, 2));
2270
break;
2271
case KVM_REG_PPC_MMCRA:
2272
*val = get_reg_val(id, kvmppc_get_mmcra_hv(vcpu));
2273
break;
2274
case KVM_REG_PPC_MMCRS:
2275
*val = get_reg_val(id, vcpu->arch.mmcrs);
2276
break;
2277
case KVM_REG_PPC_MMCR3:
2278
*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, 3));
2279
break;
2280
case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
2281
i = id - KVM_REG_PPC_PMC1;
2282
*val = get_reg_val(id, kvmppc_get_pmc_hv(vcpu, i));
2283
break;
2284
case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
2285
i = id - KVM_REG_PPC_SPMC1;
2286
*val = get_reg_val(id, vcpu->arch.spmc[i]);
2287
break;
2288
case KVM_REG_PPC_SIAR:
2289
*val = get_reg_val(id, kvmppc_get_siar_hv(vcpu));
2290
break;
2291
case KVM_REG_PPC_SDAR:
2292
*val = get_reg_val(id, kvmppc_get_sdar_hv(vcpu));
2293
break;
2294
case KVM_REG_PPC_SIER:
2295
*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 0));
2296
break;
2297
case KVM_REG_PPC_SIER2:
2298
*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 1));
2299
break;
2300
case KVM_REG_PPC_SIER3:
2301
*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 2));
2302
break;
2303
case KVM_REG_PPC_IAMR:
2304
*val = get_reg_val(id, kvmppc_get_iamr_hv(vcpu));
2305
break;
2306
case KVM_REG_PPC_PSPB:
2307
*val = get_reg_val(id, kvmppc_get_pspb_hv(vcpu));
2308
break;
2309
case KVM_REG_PPC_DPDES:
2310
/*
2311
* On POWER9, where we are emulating msgsndp etc.,
2312
* we return 1 bit for each vcpu, which can come from
2313
* either vcore->dpdes or doorbell_request.
2314
* On POWER8, doorbell_request is 0.
2315
*/
2316
if (cpu_has_feature(CPU_FTR_ARCH_300))
2317
*val = get_reg_val(id, vcpu->arch.doorbell_request);
2318
else
2319
*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
2320
break;
2321
case KVM_REG_PPC_VTB:
2322
*val = get_reg_val(id, kvmppc_get_vtb(vcpu));
2323
break;
2324
case KVM_REG_PPC_DAWR:
2325
*val = get_reg_val(id, kvmppc_get_dawr0_hv(vcpu));
2326
break;
2327
case KVM_REG_PPC_DAWRX:
2328
*val = get_reg_val(id, kvmppc_get_dawrx0_hv(vcpu));
2329
break;
2330
case KVM_REG_PPC_DAWR1:
2331
*val = get_reg_val(id, kvmppc_get_dawr1_hv(vcpu));
2332
break;
2333
case KVM_REG_PPC_DAWRX1:
2334
*val = get_reg_val(id, kvmppc_get_dawrx1_hv(vcpu));
2335
break;
2336
case KVM_REG_PPC_DEXCR:
2337
*val = get_reg_val(id, kvmppc_get_dexcr_hv(vcpu));
2338
break;
2339
case KVM_REG_PPC_HASHKEYR:
2340
*val = get_reg_val(id, kvmppc_get_hashkeyr_hv(vcpu));
2341
break;
2342
case KVM_REG_PPC_HASHPKEYR:
2343
*val = get_reg_val(id, kvmppc_get_hashpkeyr_hv(vcpu));
2344
break;
2345
case KVM_REG_PPC_CIABR:
2346
*val = get_reg_val(id, kvmppc_get_ciabr_hv(vcpu));
2347
break;
2348
case KVM_REG_PPC_CSIGR:
2349
*val = get_reg_val(id, vcpu->arch.csigr);
2350
break;
2351
case KVM_REG_PPC_TACR:
2352
*val = get_reg_val(id, vcpu->arch.tacr);
2353
break;
2354
case KVM_REG_PPC_TCSCR:
2355
*val = get_reg_val(id, vcpu->arch.tcscr);
2356
break;
2357
case KVM_REG_PPC_PID:
2358
*val = get_reg_val(id, kvmppc_get_pid(vcpu));
2359
break;
2360
case KVM_REG_PPC_ACOP:
2361
*val = get_reg_val(id, vcpu->arch.acop);
2362
break;
2363
case KVM_REG_PPC_WORT:
2364
*val = get_reg_val(id, kvmppc_get_wort_hv(vcpu));
2365
break;
2366
case KVM_REG_PPC_TIDR:
2367
*val = get_reg_val(id, vcpu->arch.tid);
2368
break;
2369
case KVM_REG_PPC_PSSCR:
2370
*val = get_reg_val(id, vcpu->arch.psscr);
2371
break;
2372
case KVM_REG_PPC_VPA_ADDR:
2373
spin_lock(&vcpu->arch.vpa_update_lock);
2374
*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
2375
spin_unlock(&vcpu->arch.vpa_update_lock);
2376
break;
2377
case KVM_REG_PPC_VPA_SLB:
2378
spin_lock(&vcpu->arch.vpa_update_lock);
2379
val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
2380
val->vpaval.length = vcpu->arch.slb_shadow.len;
2381
spin_unlock(&vcpu->arch.vpa_update_lock);
2382
break;
2383
case KVM_REG_PPC_VPA_DTL:
2384
spin_lock(&vcpu->arch.vpa_update_lock);
2385
val->vpaval.addr = vcpu->arch.dtl.next_gpa;
2386
val->vpaval.length = vcpu->arch.dtl.len;
2387
spin_unlock(&vcpu->arch.vpa_update_lock);
2388
break;
2389
case KVM_REG_PPC_TB_OFFSET:
2390
*val = get_reg_val(id, kvmppc_get_tb_offset(vcpu));
2391
break;
2392
case KVM_REG_PPC_LPCR:
2393
case KVM_REG_PPC_LPCR_64:
2394
*val = get_reg_val(id, kvmppc_get_lpcr(vcpu));
2395
break;
2396
case KVM_REG_PPC_PPR:
2397
*val = get_reg_val(id, kvmppc_get_ppr_hv(vcpu));
2398
break;
2399
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2400
case KVM_REG_PPC_TFHAR:
2401
*val = get_reg_val(id, vcpu->arch.tfhar);
2402
break;
2403
case KVM_REG_PPC_TFIAR:
2404
*val = get_reg_val(id, vcpu->arch.tfiar);
2405
break;
2406
case KVM_REG_PPC_TEXASR:
2407
*val = get_reg_val(id, vcpu->arch.texasr);
2408
break;
2409
case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
2410
i = id - KVM_REG_PPC_TM_GPR0;
2411
*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
2412
break;
2413
case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
2414
{
2415
int j;
2416
i = id - KVM_REG_PPC_TM_VSR0;
2417
if (i < 32)
2418
for (j = 0; j < TS_FPRWIDTH; j++)
2419
val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
2420
else {
2421
if (cpu_has_feature(CPU_FTR_ALTIVEC))
2422
val->vval = vcpu->arch.vr_tm.vr[i-32];
2423
else
2424
r = -ENXIO;
2425
}
2426
break;
2427
}
2428
case KVM_REG_PPC_TM_CR:
2429
*val = get_reg_val(id, vcpu->arch.cr_tm);
2430
break;
2431
case KVM_REG_PPC_TM_XER:
2432
*val = get_reg_val(id, vcpu->arch.xer_tm);
2433
break;
2434
case KVM_REG_PPC_TM_LR:
2435
*val = get_reg_val(id, vcpu->arch.lr_tm);
2436
break;
2437
case KVM_REG_PPC_TM_CTR:
2438
*val = get_reg_val(id, vcpu->arch.ctr_tm);
2439
break;
2440
case KVM_REG_PPC_TM_FPSCR:
2441
*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
2442
break;
2443
case KVM_REG_PPC_TM_AMR:
2444
*val = get_reg_val(id, vcpu->arch.amr_tm);
2445
break;
2446
case KVM_REG_PPC_TM_PPR:
2447
*val = get_reg_val(id, vcpu->arch.ppr_tm);
2448
break;
2449
case KVM_REG_PPC_TM_VRSAVE:
2450
*val = get_reg_val(id, vcpu->arch.vrsave_tm);
2451
break;
2452
case KVM_REG_PPC_TM_VSCR:
2453
if (cpu_has_feature(CPU_FTR_ALTIVEC))
2454
*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
2455
else
2456
r = -ENXIO;
2457
break;
2458
case KVM_REG_PPC_TM_DSCR:
2459
*val = get_reg_val(id, vcpu->arch.dscr_tm);
2460
break;
2461
case KVM_REG_PPC_TM_TAR:
2462
*val = get_reg_val(id, vcpu->arch.tar_tm);
2463
break;
2464
#endif
2465
case KVM_REG_PPC_ARCH_COMPAT:
2466
*val = get_reg_val(id, kvmppc_get_arch_compat(vcpu));
2467
break;
2468
case KVM_REG_PPC_DEC_EXPIRY:
2469
*val = get_reg_val(id, kvmppc_get_dec_expires(vcpu));
2470
break;
2471
case KVM_REG_PPC_ONLINE:
2472
*val = get_reg_val(id, vcpu->arch.online);
2473
break;
2474
case KVM_REG_PPC_PTCR:
2475
*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
2476
break;
2477
case KVM_REG_PPC_FSCR:
2478
*val = get_reg_val(id, kvmppc_get_fscr_hv(vcpu));
2479
break;
2480
default:
2481
r = -EINVAL;
2482
break;
2483
}
2484
2485
return r;
2486
}
2487
2488
static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
2489
union kvmppc_one_reg *val)
2490
{
2491
int r = 0;
2492
long int i;
2493
unsigned long addr, len;
2494
2495
switch (id) {
2496
case KVM_REG_PPC_HIOR:
2497
/* Only allow this to be set to zero */
2498
if (set_reg_val(id, *val))
2499
r = -EINVAL;
2500
break;
2501
case KVM_REG_PPC_DABR:
2502
vcpu->arch.dabr = set_reg_val(id, *val);
2503
break;
2504
case KVM_REG_PPC_DABRX:
2505
vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
2506
break;
2507
case KVM_REG_PPC_DSCR:
2508
kvmppc_set_dscr_hv(vcpu, set_reg_val(id, *val));
2509
break;
2510
case KVM_REG_PPC_PURR:
2511
kvmppc_set_purr_hv(vcpu, set_reg_val(id, *val));
2512
break;
2513
case KVM_REG_PPC_SPURR:
2514
kvmppc_set_spurr_hv(vcpu, set_reg_val(id, *val));
2515
break;
2516
case KVM_REG_PPC_AMR:
2517
kvmppc_set_amr_hv(vcpu, set_reg_val(id, *val));
2518
break;
2519
case KVM_REG_PPC_UAMOR:
2520
kvmppc_set_uamor_hv(vcpu, set_reg_val(id, *val));
2521
break;
2522
case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
2523
i = id - KVM_REG_PPC_MMCR0;
2524
kvmppc_set_mmcr_hv(vcpu, i, set_reg_val(id, *val));
2525
break;
2526
case KVM_REG_PPC_MMCR2:
2527
kvmppc_set_mmcr_hv(vcpu, 2, set_reg_val(id, *val));
2528
break;
2529
case KVM_REG_PPC_MMCRA:
2530
kvmppc_set_mmcra_hv(vcpu, set_reg_val(id, *val));
2531
break;
2532
case KVM_REG_PPC_MMCRS:
2533
vcpu->arch.mmcrs = set_reg_val(id, *val);
2534
break;
2535
case KVM_REG_PPC_MMCR3:
2536
kvmppc_set_mmcr_hv(vcpu, 3, set_reg_val(id, *val));
2537
break;
2538
case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
2539
i = id - KVM_REG_PPC_PMC1;
2540
kvmppc_set_pmc_hv(vcpu, i, set_reg_val(id, *val));
2541
break;
2542
case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
2543
i = id - KVM_REG_PPC_SPMC1;
2544
vcpu->arch.spmc[i] = set_reg_val(id, *val);
2545
break;
2546
case KVM_REG_PPC_SIAR:
2547
kvmppc_set_siar_hv(vcpu, set_reg_val(id, *val));
2548
break;
2549
case KVM_REG_PPC_SDAR:
2550
kvmppc_set_sdar_hv(vcpu, set_reg_val(id, *val));
2551
break;
2552
case KVM_REG_PPC_SIER:
2553
kvmppc_set_sier_hv(vcpu, 0, set_reg_val(id, *val));
2554
break;
2555
case KVM_REG_PPC_SIER2:
2556
kvmppc_set_sier_hv(vcpu, 1, set_reg_val(id, *val));
2557
break;
2558
case KVM_REG_PPC_SIER3:
2559
kvmppc_set_sier_hv(vcpu, 2, set_reg_val(id, *val));
2560
break;
2561
case KVM_REG_PPC_IAMR:
2562
kvmppc_set_iamr_hv(vcpu, set_reg_val(id, *val));
2563
break;
2564
case KVM_REG_PPC_PSPB:
2565
kvmppc_set_pspb_hv(vcpu, set_reg_val(id, *val));
2566
break;
2567
case KVM_REG_PPC_DPDES:
2568
if (cpu_has_feature(CPU_FTR_ARCH_300))
2569
vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
2570
else
2571
vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
2572
break;
2573
case KVM_REG_PPC_VTB:
2574
kvmppc_set_vtb(vcpu, set_reg_val(id, *val));
2575
break;
2576
case KVM_REG_PPC_DAWR:
2577
kvmppc_set_dawr0_hv(vcpu, set_reg_val(id, *val));
2578
break;
2579
case KVM_REG_PPC_DAWRX:
2580
kvmppc_set_dawrx0_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
2581
break;
2582
case KVM_REG_PPC_DAWR1:
2583
kvmppc_set_dawr1_hv(vcpu, set_reg_val(id, *val));
2584
break;
2585
case KVM_REG_PPC_DAWRX1:
2586
kvmppc_set_dawrx1_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
2587
break;
2588
case KVM_REG_PPC_DEXCR:
2589
kvmppc_set_dexcr_hv(vcpu, set_reg_val(id, *val));
2590
break;
2591
case KVM_REG_PPC_HASHKEYR:
2592
kvmppc_set_hashkeyr_hv(vcpu, set_reg_val(id, *val));
2593
break;
2594
case KVM_REG_PPC_HASHPKEYR:
2595
kvmppc_set_hashpkeyr_hv(vcpu, set_reg_val(id, *val));
2596
break;
2597
case KVM_REG_PPC_CIABR:
2598
kvmppc_set_ciabr_hv(vcpu, set_reg_val(id, *val));
2599
/* Don't allow setting breakpoints in hypervisor code */
2600
if ((kvmppc_get_ciabr_hv(vcpu) & CIABR_PRIV) == CIABR_PRIV_HYPER)
2601
kvmppc_set_ciabr_hv(vcpu, kvmppc_get_ciabr_hv(vcpu) & ~CIABR_PRIV);
2602
break;
2603
case KVM_REG_PPC_CSIGR:
2604
vcpu->arch.csigr = set_reg_val(id, *val);
2605
break;
2606
case KVM_REG_PPC_TACR:
2607
vcpu->arch.tacr = set_reg_val(id, *val);
2608
break;
2609
case KVM_REG_PPC_TCSCR:
2610
vcpu->arch.tcscr = set_reg_val(id, *val);
2611
break;
2612
case KVM_REG_PPC_PID:
2613
kvmppc_set_pid(vcpu, set_reg_val(id, *val));
2614
break;
2615
case KVM_REG_PPC_ACOP:
2616
vcpu->arch.acop = set_reg_val(id, *val);
2617
break;
2618
case KVM_REG_PPC_WORT:
2619
kvmppc_set_wort_hv(vcpu, set_reg_val(id, *val));
2620
break;
2621
case KVM_REG_PPC_TIDR:
2622
vcpu->arch.tid = set_reg_val(id, *val);
2623
break;
2624
case KVM_REG_PPC_PSSCR:
2625
vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
2626
break;
2627
case KVM_REG_PPC_VPA_ADDR:
2628
addr = set_reg_val(id, *val);
2629
r = -EINVAL;
2630
if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
2631
vcpu->arch.dtl.next_gpa))
2632
break;
2633
r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
2634
break;
2635
case KVM_REG_PPC_VPA_SLB:
2636
addr = val->vpaval.addr;
2637
len = val->vpaval.length;
2638
r = -EINVAL;
2639
if (addr && !vcpu->arch.vpa.next_gpa)
2640
break;
2641
r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
2642
break;
2643
case KVM_REG_PPC_VPA_DTL:
2644
addr = val->vpaval.addr;
2645
len = val->vpaval.length;
2646
r = -EINVAL;
2647
if (addr && (len < sizeof(struct dtl_entry) ||
2648
!vcpu->arch.vpa.next_gpa))
2649
break;
2650
len -= len % sizeof(struct dtl_entry);
2651
r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
2652
break;
2653
case KVM_REG_PPC_TB_OFFSET:
2654
{
2655
/* round up to multiple of 2^24 */
2656
u64 tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24);
2657
2658
/*
2659
* Now that we know the timebase offset, update the
2660
* decrementer expiry with a guest timebase value. If
2661
* the userspace does not set DEC_EXPIRY, this ensures
2662
* a migrated vcpu at least starts with an expired
2663
* decrementer, which is better than a large one that
2664
* causes a hang.
2665
*/
2666
kvmppc_set_tb_offset(vcpu, tb_offset);
2667
if (!kvmppc_get_dec_expires(vcpu) && tb_offset)
2668
kvmppc_set_dec_expires(vcpu, get_tb() + tb_offset);
2669
2670
kvmppc_set_tb_offset(vcpu, tb_offset);
2671
break;
2672
}
2673
case KVM_REG_PPC_LPCR:
2674
kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
2675
break;
2676
case KVM_REG_PPC_LPCR_64:
2677
kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
2678
break;
2679
case KVM_REG_PPC_PPR:
2680
kvmppc_set_ppr_hv(vcpu, set_reg_val(id, *val));
2681
break;
2682
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2683
case KVM_REG_PPC_TFHAR:
2684
vcpu->arch.tfhar = set_reg_val(id, *val);
2685
break;
2686
case KVM_REG_PPC_TFIAR:
2687
vcpu->arch.tfiar = set_reg_val(id, *val);
2688
break;
2689
case KVM_REG_PPC_TEXASR:
2690
vcpu->arch.texasr = set_reg_val(id, *val);
2691
break;
2692
case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
2693
i = id - KVM_REG_PPC_TM_GPR0;
2694
vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
2695
break;
2696
case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
2697
{
2698
int j;
2699
i = id - KVM_REG_PPC_TM_VSR0;
2700
if (i < 32)
2701
for (j = 0; j < TS_FPRWIDTH; j++)
2702
vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
2703
else
2704
if (cpu_has_feature(CPU_FTR_ALTIVEC))
2705
vcpu->arch.vr_tm.vr[i-32] = val->vval;
2706
else
2707
r = -ENXIO;
2708
break;
2709
}
2710
case KVM_REG_PPC_TM_CR:
2711
vcpu->arch.cr_tm = set_reg_val(id, *val);
2712
break;
2713
case KVM_REG_PPC_TM_XER:
2714
vcpu->arch.xer_tm = set_reg_val(id, *val);
2715
break;
2716
case KVM_REG_PPC_TM_LR:
2717
vcpu->arch.lr_tm = set_reg_val(id, *val);
2718
break;
2719
case KVM_REG_PPC_TM_CTR:
2720
vcpu->arch.ctr_tm = set_reg_val(id, *val);
2721
break;
2722
case KVM_REG_PPC_TM_FPSCR:
2723
vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
2724
break;
2725
case KVM_REG_PPC_TM_AMR:
2726
vcpu->arch.amr_tm = set_reg_val(id, *val);
2727
break;
2728
case KVM_REG_PPC_TM_PPR:
2729
vcpu->arch.ppr_tm = set_reg_val(id, *val);
2730
break;
2731
case KVM_REG_PPC_TM_VRSAVE:
2732
vcpu->arch.vrsave_tm = set_reg_val(id, *val);
2733
break;
2734
case KVM_REG_PPC_TM_VSCR:
2735
if (cpu_has_feature(CPU_FTR_ALTIVEC))
2736
vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
2737
else
2738
r = -ENXIO;
2739
break;
2740
case KVM_REG_PPC_TM_DSCR:
2741
vcpu->arch.dscr_tm = set_reg_val(id, *val);
2742
break;
2743
case KVM_REG_PPC_TM_TAR:
2744
vcpu->arch.tar_tm = set_reg_val(id, *val);
2745
break;
2746
#endif
2747
case KVM_REG_PPC_ARCH_COMPAT:
2748
r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
2749
break;
2750
case KVM_REG_PPC_DEC_EXPIRY:
2751
kvmppc_set_dec_expires(vcpu, set_reg_val(id, *val));
2752
break;
2753
case KVM_REG_PPC_ONLINE:
2754
i = set_reg_val(id, *val);
2755
if (i && !vcpu->arch.online)
2756
atomic_inc(&vcpu->arch.vcore->online_count);
2757
else if (!i && vcpu->arch.online)
2758
atomic_dec(&vcpu->arch.vcore->online_count);
2759
vcpu->arch.online = i;
2760
break;
2761
case KVM_REG_PPC_PTCR:
2762
vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
2763
break;
2764
case KVM_REG_PPC_FSCR:
2765
kvmppc_set_fscr_hv(vcpu, set_reg_val(id, *val));
2766
break;
2767
default:
2768
r = -EINVAL;
2769
break;
2770
}
2771
2772
return r;
2773
}

/*
 * On POWER9, threads are independent and can be in different partitions.
 * Therefore we consider each thread to be a subcore.
 * There is a restriction that all threads have to be in the same
 * MMU mode (radix or HPT), unfortunately, but since we only support
 * HPT guests on a HPT host so far, that isn't an impediment yet.
 */
static int threads_per_vcore(struct kvm *kvm)
{
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		return 1;
	return threads_per_subcore;
}
2788
2789
static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
2790
{
2791
struct kvmppc_vcore *vcore;
2792
2793
vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
2794
2795
if (vcore == NULL)
2796
return NULL;
2797
2798
spin_lock_init(&vcore->lock);
2799
spin_lock_init(&vcore->stoltb_lock);
2800
rcuwait_init(&vcore->wait);
2801
vcore->preempt_tb = TB_NIL;
2802
vcore->lpcr = kvm->arch.lpcr;
2803
vcore->first_vcpuid = id;
2804
vcore->kvm = kvm;
2805
INIT_LIST_HEAD(&vcore->preempt_list);
2806
2807
return vcore;
2808
}
2809
2810
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
2811
static struct debugfs_timings_element {
2812
const char *name;
2813
size_t offset;
2814
} timings[] = {
2815
#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING
2816
{"vcpu_entry", offsetof(struct kvm_vcpu, arch.vcpu_entry)},
2817
{"guest_entry", offsetof(struct kvm_vcpu, arch.guest_entry)},
2818
{"in_guest", offsetof(struct kvm_vcpu, arch.in_guest)},
2819
{"guest_exit", offsetof(struct kvm_vcpu, arch.guest_exit)},
2820
{"vcpu_exit", offsetof(struct kvm_vcpu, arch.vcpu_exit)},
2821
{"hypercall", offsetof(struct kvm_vcpu, arch.hcall)},
2822
{"page_fault", offsetof(struct kvm_vcpu, arch.pg_fault)},
2823
#else
2824
{"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)},
2825
{"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)},
2826
{"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)},
2827
{"guest", offsetof(struct kvm_vcpu, arch.guest_time)},
2828
{"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
2829
#endif
2830
};
2831
2832
#define N_TIMINGS (ARRAY_SIZE(timings))
2833
2834
struct debugfs_timings_state {
2835
struct kvm_vcpu *vcpu;
2836
unsigned int buflen;
2837
char buf[N_TIMINGS * 100];
2838
};
2839
2840
static int debugfs_timings_open(struct inode *inode, struct file *file)
2841
{
2842
struct kvm_vcpu *vcpu = inode->i_private;
2843
struct debugfs_timings_state *p;
2844
2845
p = kzalloc(sizeof(*p), GFP_KERNEL);
2846
if (!p)
2847
return -ENOMEM;
2848
2849
kvm_get_kvm(vcpu->kvm);
2850
p->vcpu = vcpu;
2851
file->private_data = p;
2852
2853
return nonseekable_open(inode, file);
2854
}
2855
2856
static int debugfs_timings_release(struct inode *inode, struct file *file)
2857
{
2858
struct debugfs_timings_state *p = file->private_data;
2859
2860
kvm_put_kvm(p->vcpu->kvm);
2861
kfree(p);
2862
return 0;
2863
}
2864
2865
static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
2866
size_t len, loff_t *ppos)
2867
{
2868
struct debugfs_timings_state *p = file->private_data;
2869
struct kvm_vcpu *vcpu = p->vcpu;
2870
char *s, *buf_end;
2871
struct kvmhv_tb_accumulator tb;
2872
u64 count;
2873
loff_t pos;
2874
ssize_t n;
2875
int i, loops;
2876
bool ok;
2877
2878
if (!p->buflen) {
2879
s = p->buf;
2880
buf_end = s + sizeof(p->buf);
2881
for (i = 0; i < N_TIMINGS; ++i) {
2882
struct kvmhv_tb_accumulator *acc;
2883
2884
acc = (struct kvmhv_tb_accumulator *)
2885
((unsigned long)vcpu + timings[i].offset);
2886
ok = false;
2887
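/*
 * Lockless snapshot of the accumulator: an even seqcount means no update
 * is in flight, so copy it and accept the copy only if seqcount is still
 * unchanged afterwards; otherwise retry.
 */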
for (loops = 0; loops < 1000; ++loops) {
2888
count = acc->seqcount;
2889
if (!(count & 1)) {
2890
smp_rmb();
2891
tb = *acc;
2892
smp_rmb();
2893
if (count == acc->seqcount) {
2894
ok = true;
2895
break;
2896
}
2897
}
2898
udelay(1);
2899
}
2900
if (!ok)
2901
snprintf(s, buf_end - s, "%s: stuck\n",
2902
timings[i].name);
2903
else
2904
snprintf(s, buf_end - s,
2905
"%s: %llu %llu %llu %llu\n",
2906
timings[i].name, count / 2,
2907
tb_to_ns(tb.tb_total),
2908
tb_to_ns(tb.tb_min),
2909
tb_to_ns(tb.tb_max));
2910
s += strlen(s);
2911
}
2912
p->buflen = s - p->buf;
2913
}
2914
2915
pos = *ppos;
2916
if (pos >= p->buflen)
2917
return 0;
2918
if (len > p->buflen - pos)
2919
len = p->buflen - pos;
2920
n = copy_to_user(buf, p->buf + pos, len);
2921
if (n) {
2922
if (n == len)
2923
return -EFAULT;
2924
len -= n;
2925
}
2926
*ppos = pos + len;
2927
return len;
2928
}
2929
2930
static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
2931
size_t len, loff_t *ppos)
2932
{
2933
return -EACCES;
2934
}
2935
2936
static const struct file_operations debugfs_timings_ops = {
2937
.owner = THIS_MODULE,
2938
.open = debugfs_timings_open,
2939
.release = debugfs_timings_release,
2940
.read = debugfs_timings_read,
2941
.write = debugfs_timings_write,
2942
.llseek = generic_file_llseek,
2943
};
2944
2945
/* Create a debugfs directory for the vcpu */
2946
static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
2947
{
2948
if (cpu_has_feature(CPU_FTR_ARCH_300) == IS_ENABLED(CONFIG_KVM_BOOK3S_HV_P9_TIMING))
2949
debugfs_create_file("timings", 0444, debugfs_dentry, vcpu,
2950
&debugfs_timings_ops);
2951
return 0;
2952
}
2953
2954
#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
2955
static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
2956
{
2957
return 0;
2958
}
2959
#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
2960
2961
static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
2962
{
2963
int err;
2964
int core;
2965
struct kvmppc_vcore *vcore;
2966
struct kvm *kvm;
2967
unsigned int id;
2968
2969
kvm = vcpu->kvm;
2970
id = vcpu->vcpu_id;
2971
2972
vcpu->arch.shared = &vcpu->arch.shregs;
2973
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
2974
/*
2975
* The shared struct is never shared on HV,
2976
* so we can always use host endianness
2977
*/
2978
#ifdef __BIG_ENDIAN__
2979
vcpu->arch.shared_big_endian = true;
2980
#else
2981
vcpu->arch.shared_big_endian = false;
2982
#endif
2983
#endif
2984
2985
if (kvmhv_is_nestedv2()) {
2986
err = kvmhv_nestedv2_vcpu_create(vcpu, &vcpu->arch.nestedv2_io);
2987
if (err < 0)
2988
return err;
2989
}
2990
2991
kvmppc_set_mmcr_hv(vcpu, 0, MMCR0_FC);
2992
if (cpu_has_feature(CPU_FTR_ARCH_31)) {
2993
kvmppc_set_mmcr_hv(vcpu, 0, kvmppc_get_mmcr_hv(vcpu, 0) | MMCR0_PMCCEXT);
2994
kvmppc_set_mmcra_hv(vcpu, MMCRA_BHRB_DISABLE);
2995
}
2996
2997
kvmppc_set_ctrl_hv(vcpu, CTRL_RUNLATCH);
2998
/* default to host PVR, since we can't spoof it */
2999
kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
3000
spin_lock_init(&vcpu->arch.vpa_update_lock);
3001
spin_lock_init(&vcpu->arch.tbacct_lock);
3002
vcpu->arch.busy_preempt = TB_NIL;
3003
__kvmppc_set_msr_hv(vcpu, MSR_ME);
3004
vcpu->arch.intr_msr = MSR_SF | MSR_ME;
3005
3006
/*
3007
* Set the default HFSCR for the guest from the host value.
3008
* This value is only used on POWER9 and later.
3009
* On >= POWER9, we want to virtualize the doorbell facility, so we
3010
* don't set the HFSCR_MSGP bit, and that causes those instructions
3011
* to trap and then we emulate them.
3012
*/
3013
kvmppc_set_hfscr_hv(vcpu, HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
3014
HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP);
3015
3016
/* On POWER10 and later, allow prefixed instructions */
3017
if (cpu_has_feature(CPU_FTR_ARCH_31))
3018
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_PREFIX);
3019
3020
if (cpu_has_feature(CPU_FTR_HVMODE)) {
3021
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & mfspr(SPRN_HFSCR));
3022
3023
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
3024
if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3025
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_TM);
3026
#endif
3027
}
3028
if (cpu_has_feature(CPU_FTR_TM_COMP))
3029
vcpu->arch.hfscr |= HFSCR_TM;
3030
3031
vcpu->arch.hfscr_permitted = kvmppc_get_hfscr_hv(vcpu);
3032
3033
/*
3034
* PM, EBB, TM are demand-faulted so start with it clear.
3035
*/
3036
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & ~(HFSCR_PM | HFSCR_EBB | HFSCR_TM));
3037
3038
kvmppc_mmu_book3s_hv_init(vcpu);
3039
3040
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
3041
3042
init_waitqueue_head(&vcpu->arch.cpu_run);
3043
3044
mutex_lock(&kvm->lock);
3045
vcore = NULL;
3046
err = -EINVAL;
3047
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
3048
if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) {
3049
pr_devel("KVM: VCPU ID too high\n");
3050
core = KVM_MAX_VCORES;
3051
} else {
3052
BUG_ON(kvm->arch.smt_mode != 1);
3053
core = kvmppc_pack_vcpu_id(kvm, id);
3054
}
3055
} else {
3056
core = id / kvm->arch.smt_mode;
3057
}
3058
if (core < KVM_MAX_VCORES) {
3059
vcore = kvm->arch.vcores[core];
3060
if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) {
3061
pr_devel("KVM: collision on id %u", id);
3062
vcore = NULL;
3063
} else if (!vcore) {
3064
/*
3065
* Take mmu_setup_lock for mutual exclusion
3066
* with kvmppc_update_lpcr().
3067
*/
3068
err = -ENOMEM;
3069
vcore = kvmppc_vcore_create(kvm,
3070
id & ~(kvm->arch.smt_mode - 1));
3071
mutex_lock(&kvm->arch.mmu_setup_lock);
3072
kvm->arch.vcores[core] = vcore;
3073
kvm->arch.online_vcores++;
3074
mutex_unlock(&kvm->arch.mmu_setup_lock);
3075
}
3076
}
3077
mutex_unlock(&kvm->lock);
3078
3079
if (!vcore)
3080
return err;
3081
3082
spin_lock(&vcore->lock);
3083
++vcore->num_threads;
3084
spin_unlock(&vcore->lock);
3085
vcpu->arch.vcore = vcore;
3086
vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
3087
vcpu->arch.thread_cpu = -1;
3088
vcpu->arch.prev_cpu = -1;
3089
3090
vcpu->arch.cpu_type = KVM_CPU_3S_64;
3091
kvmppc_sanity_check(vcpu);
3092
3093
return 0;
3094
}
3095
3096
static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
3097
unsigned long flags)
3098
{
3099
int err;
3100
int esmt = 0;
3101
3102
if (flags)
3103
return -EINVAL;
3104
if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
3105
return -EINVAL;
3106
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
3107
/*
3108
* On POWER8 (or POWER7), the threading mode is "strict",
3109
* so we pack smt_mode vcpus per vcore.
3110
*/
3111
if (smt_mode > threads_per_subcore)
3112
return -EINVAL;
3113
} else {
3114
/*
3115
* On POWER9, the threading mode is "loose",
3116
* so each vcpu gets its own vcore.
3117
*/
3118
esmt = smt_mode;
3119
smt_mode = 1;
3120
}
3121
mutex_lock(&kvm->lock);
3122
err = -EBUSY;
3123
if (!kvm->arch.online_vcores) {
3124
kvm->arch.smt_mode = smt_mode;
3125
kvm->arch.emul_smt_mode = esmt;
3126
err = 0;
3127
}
3128
mutex_unlock(&kvm->lock);
3129
3130
return err;
3131
}
3132
3133
static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
3134
{
3135
if (vpa->pinned_addr)
3136
kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
3137
vpa->dirty);
3138
}
3139
3140
static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
3141
{
3142
spin_lock(&vcpu->arch.vpa_update_lock);
3143
unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
3144
unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
3145
unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
3146
spin_unlock(&vcpu->arch.vpa_update_lock);
3147
if (kvmhv_is_nestedv2())
3148
kvmhv_nestedv2_vcpu_free(vcpu, &vcpu->arch.nestedv2_io);
3149
}
3150
3151
static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
3152
{
3153
/* Indicate we want to get back into the guest */
3154
return 1;
3155
}

static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
{
	unsigned long dec_nsec, now;

	now = get_tb();
	if (now > kvmppc_dec_expires_host_tb(vcpu)) {
		/* decrementer has already gone negative */
		kvmppc_core_queue_dec(vcpu);
		kvmppc_core_prepare_to_enter(vcpu);
		return;
	}
	dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
	hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
	vcpu->arch.timer_running = 1;
}
3172
3173
extern int __kvmppc_vcore_entry(void);
3174
3175
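/*
 * Account the time this vcpu has been runnable as busy/stolen time and
 * remove it from the vcore's runnable_threads array.
 */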
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
3176
struct kvm_vcpu *vcpu, u64 tb)
3177
{
3178
u64 now;
3179
3180
if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
3181
return;
3182
spin_lock_irq(&vcpu->arch.tbacct_lock);
3183
now = tb;
3184
vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
3185
vcpu->arch.stolen_logged;
3186
vcpu->arch.busy_preempt = now;
3187
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
3188
spin_unlock_irq(&vcpu->arch.tbacct_lock);
3189
--vc->n_runnable;
3190
WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
3191
}
3192
3193
static int kvmppc_grab_hwthread(int cpu)
3194
{
3195
struct paca_struct *tpaca;
3196
long timeout = 10000;
3197
3198
tpaca = paca_ptrs[cpu];
3199
3200
/* Ensure the thread won't go into the kernel if it wakes */
3201
tpaca->kvm_hstate.kvm_vcpu = NULL;
3202
tpaca->kvm_hstate.kvm_vcore = NULL;
3203
tpaca->kvm_hstate.napping = 0;
3204
smp_wmb();
3205
tpaca->kvm_hstate.hwthread_req = 1;
3206
3207
/*
3208
* If the thread is already executing in the kernel (e.g. handling
3209
* a stray interrupt), wait for it to get back to nap mode.
3210
* The smp_mb() is to ensure that our setting of hwthread_req
3211
* is visible before we look at hwthread_state, so if this
3212
* races with the code at system_reset_pSeries and the thread
3213
* misses our setting of hwthread_req, we are sure to see its
3214
* setting of hwthread_state, and vice versa.
3215
*/
3216
smp_mb();
3217
while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
3218
if (--timeout <= 0) {
3219
pr_err("KVM: couldn't grab cpu %d\n", cpu);
3220
return -EBUSY;
3221
}
3222
udelay(1);
3223
}
3224
return 0;
3225
}
3226
3227
static void kvmppc_release_hwthread(int cpu)
3228
{
3229
struct paca_struct *tpaca;
3230
3231
tpaca = paca_ptrs[cpu];
3232
tpaca->kvm_hstate.hwthread_req = 0;
3233
tpaca->kvm_hstate.kvm_vcpu = NULL;
3234
tpaca->kvm_hstate.kvm_vcore = NULL;
3235
tpaca->kvm_hstate.kvm_split_mode = NULL;
3236
}
3237
3238
static DEFINE_PER_CPU(struct kvm *, cpu_in_guest);
3239
3240
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
3241
{
3242
struct kvm_nested_guest *nested = vcpu->arch.nested;
3243
cpumask_t *need_tlb_flush;
3244
int i;
3245
3246
if (nested)
3247
need_tlb_flush = &nested->need_tlb_flush;
3248
else
3249
need_tlb_flush = &kvm->arch.need_tlb_flush;
3250
3251
cpu = cpu_first_tlb_thread_sibling(cpu);
3252
for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
3253
i += cpu_tlb_thread_sibling_step())
3254
cpumask_set_cpu(i, need_tlb_flush);
3255
3256
/*
3257
* Make sure setting of bit in need_tlb_flush precedes testing of
3258
* cpu_in_guest. The matching barrier on the other side is hwsync
3259
* when switching to guest MMU mode, which happens between
3260
* cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
3261
* being tested.
3262
*/
3263
smp_mb();
3264
3265
for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
3266
i += cpu_tlb_thread_sibling_step()) {
3267
struct kvm *running = *per_cpu_ptr(&cpu_in_guest, i);
3268
3269
if (running == kvm)
3270
smp_call_function_single(i, do_nothing, NULL, 1);
3271
}
3272
}
3273
3274
static void do_migrate_away_vcpu(void *arg)
3275
{
3276
struct kvm_vcpu *vcpu = arg;
3277
struct kvm *kvm = vcpu->kvm;
3278
3279
/*
3280
* If the guest has GTSE, it may execute tlbie, so do a eieio; tlbsync;
3281
* ptesync sequence on the old CPU before migrating to a new one, in
3282
* case we interrupted the guest between a tlbie ; eieio ;
3283
* tlbsync; ptesync sequence.
3284
*
3285
* Otherwise, ptesync is sufficient for ordering tlbiel sequences.
3286
*/
3287
if (kvm->arch.lpcr & LPCR_GTSE)
3288
asm volatile("eieio; tlbsync; ptesync");
3289
else
3290
asm volatile("ptesync");
3291
}
3292
3293
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
3294
{
3295
struct kvm_nested_guest *nested = vcpu->arch.nested;
3296
struct kvm *kvm = vcpu->kvm;
3297
int prev_cpu;
3298
3299
if (!cpu_has_feature(CPU_FTR_HVMODE))
3300
return;
3301
3302
if (nested)
3303
prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
3304
else
3305
prev_cpu = vcpu->arch.prev_cpu;
3306
3307
/*
3308
* With radix, the guest can do TLB invalidations itself,
3309
* and it could choose to use the local form (tlbiel) if
3310
* it is invalidating a translation that has only ever been
3311
* used on one vcpu. However, that doesn't mean it has
3312
* only ever been used on one physical cpu, since vcpus
3313
* can move around between pcpus. To cope with this, when
3314
* a vcpu moves from one pcpu to another, we need to tell
3315
* any vcpus running on the same core as this vcpu previously
3316
* ran to flush the TLB.
3317
*/
3318
if (prev_cpu != pcpu) {
3319
if (prev_cpu >= 0) {
3320
if (cpu_first_tlb_thread_sibling(prev_cpu) !=
3321
cpu_first_tlb_thread_sibling(pcpu))
3322
radix_flush_cpu(kvm, prev_cpu, vcpu);
3323
3324
smp_call_function_single(prev_cpu,
3325
do_migrate_away_vcpu, vcpu, 1);
3326
}
3327
if (nested)
3328
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
3329
else
3330
vcpu->arch.prev_cpu = pcpu;
3331
}
3332
}
3333
3334
static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
3335
{
3336
int cpu;
3337
struct paca_struct *tpaca;
3338
3339
cpu = vc->pcpu;
3340
if (vcpu) {
3341
if (vcpu->arch.timer_running) {
3342
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
3343
vcpu->arch.timer_running = 0;
3344
}
3345
cpu += vcpu->arch.ptid;
3346
vcpu->cpu = vc->pcpu;
3347
vcpu->arch.thread_cpu = cpu;
3348
}
3349
tpaca = paca_ptrs[cpu];
3350
tpaca->kvm_hstate.kvm_vcpu = vcpu;
3351
tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
3352
tpaca->kvm_hstate.fake_suspend = 0;
3353
/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
3354
smp_wmb();
3355
tpaca->kvm_hstate.kvm_vcore = vc;
3356
if (cpu != smp_processor_id())
3357
kvmppc_ipi_thread(cpu);
3358
}
3359
3360
static void kvmppc_wait_for_nap(int n_threads)
3361
{
3362
int cpu = smp_processor_id();
3363
int i, loops;
3364
3365
if (n_threads <= 1)
3366
return;
3367
for (loops = 0; loops < 1000000; ++loops) {
3368
/*
3369
* Check if all threads are finished.
3370
* We set the vcore pointer when starting a thread
3371
* and the thread clears it when finished, so we look
3372
* for any threads that still have a non-NULL vcore ptr.
3373
*/
3374
for (i = 1; i < n_threads; ++i)
3375
if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
3376
break;
3377
if (i == n_threads) {
3378
HMT_medium();
3379
return;
3380
}
3381
HMT_low();
3382
}
3383
HMT_medium();
3384
for (i = 1; i < n_threads; ++i)
3385
if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
3386
pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
3387
}
3388
3389
/*
3390
* Check that we are on thread 0 and that any other threads in
3391
* this core are off-line. Then grab the threads so they can't
3392
* enter the kernel.
3393
*/
3394
static int on_primary_thread(void)
3395
{
3396
int cpu = smp_processor_id();
3397
int thr;
3398
3399
/* Are we on a primary subcore? */
3400
if (cpu_thread_in_subcore(cpu))
3401
return 0;
3402
3403
thr = 0;
3404
while (++thr < threads_per_subcore)
3405
if (cpu_online(cpu + thr))
3406
return 0;
3407
3408
/* Grab all hw threads so they can't go into the kernel */
3409
for (thr = 1; thr < threads_per_subcore; ++thr) {
3410
if (kvmppc_grab_hwthread(cpu + thr)) {
3411
/* Couldn't grab one; let the others go */
3412
do {
3413
kvmppc_release_hwthread(cpu + thr);
3414
} while (--thr > 0);
3415
return 0;
3416
}
3417
}
3418
return 1;
3419
}
3420
3421
/*
3422
* A list of virtual cores for each physical CPU.
3423
* These are vcores that could run but their runner VCPU tasks are
3424
* (or may be) preempted.
3425
*/
3426
struct preempted_vcore_list {
3427
struct list_head list;
3428
spinlock_t lock;
3429
};
3430
3431
static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
3432
3433
static void init_vcore_lists(void)
3434
{
3435
int cpu;
3436
3437
for_each_possible_cpu(cpu) {
3438
struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
3439
spin_lock_init(&lp->lock);
3440
INIT_LIST_HEAD(&lp->list);
3441
}
3442
}
3443
3444
static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
3445
{
3446
struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
3447
3448
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
3449
3450
vc->vcore_state = VCORE_PREEMPT;
3451
vc->pcpu = smp_processor_id();
3452
if (vc->num_threads < threads_per_vcore(vc->kvm)) {
3453
spin_lock(&lp->lock);
3454
list_add_tail(&vc->preempt_list, &lp->list);
3455
spin_unlock(&lp->lock);
3456
}
3457
3458
/* Start accumulating stolen time */
3459
kvmppc_core_start_stolen(vc, mftb());
3460
}
3461
3462
static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
3463
{
3464
struct preempted_vcore_list *lp;
3465
3466
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
3467
3468
kvmppc_core_end_stolen(vc, mftb());
3469
if (!list_empty(&vc->preempt_list)) {
3470
lp = &per_cpu(preempted_vcores, vc->pcpu);
3471
spin_lock(&lp->lock);
3472
list_del_init(&vc->preempt_list);
3473
spin_unlock(&lp->lock);
3474
}
3475
vc->vcore_state = VCORE_INACTIVE;
3476
}
3477
3478
/*
3479
* This stores information about the virtual cores currently
3480
* assigned to a physical core.
3481
*/
3482
struct core_info {
3483
int n_subcores;
3484
int max_subcore_threads;
3485
int total_threads;
3486
int subcore_threads[MAX_SUBCORES];
3487
struct kvmppc_vcore *vc[MAX_SUBCORES];
3488
};
3489
3490
/*
3491
* This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
3492
* respectively in 2-way micro-threading (split-core) mode on POWER8.
3493
*/
3494
static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
3495
3496
static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
3497
{
3498
memset(cip, 0, sizeof(*cip));
3499
cip->n_subcores = 1;
3500
cip->max_subcore_threads = vc->num_threads;
3501
cip->total_threads = vc->num_threads;
3502
cip->subcore_threads[0] = vc->num_threads;
3503
cip->vc[0] = vc;
3504
}
3505
3506
static bool subcore_config_ok(int n_subcores, int n_threads)
3507
{
3508
/*
3509
* POWER9 "SMT4" cores are permanently in what is effectively a 4-way
3510
* split-core mode, with one thread per subcore.
3511
*/
3512
if (cpu_has_feature(CPU_FTR_ARCH_300))
3513
return n_subcores <= 4 && n_threads == 1;
3514
3515
/* On POWER8, can only dynamically split if unsplit to begin with */
3516
if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
3517
return false;
3518
if (n_subcores > MAX_SUBCORES)
3519
return false;
3520
if (n_subcores > 1) {
3521
if (!(dynamic_mt_modes & 2))
3522
n_subcores = 4;
3523
if (n_subcores > 2 && !(dynamic_mt_modes & 4))
3524
return false;
3525
}
3526
3527
return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
3528
}
3529
3530
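/* Reset the per-run entry/exit bookkeeping before this vcore enters the guest */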
static void init_vcore_to_run(struct kvmppc_vcore *vc)
3531
{
3532
vc->entry_exit_map = 0;
3533
vc->in_guest = 0;
3534
vc->napping_threads = 0;
3535
vc->conferring_threads = 0;
3536
vc->tb_offset_applied = 0;
3537
}
3538
3539
static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
3540
{
3541
int n_threads = vc->num_threads;
3542
int sub;
3543
3544
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
3545
return false;
3546
3547
/* In one_vm_per_core mode, require all vcores to be from the same vm */
3548
if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
3549
return false;
3550
3551
if (n_threads < cip->max_subcore_threads)
3552
n_threads = cip->max_subcore_threads;
3553
if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
3554
return false;
3555
cip->max_subcore_threads = n_threads;
3556
3557
sub = cip->n_subcores;
3558
++cip->n_subcores;
3559
cip->total_threads += vc->num_threads;
3560
cip->subcore_threads[sub] = vc->num_threads;
3561
cip->vc[sub] = vc;
3562
init_vcore_to_run(vc);
3563
list_del_init(&vc->preempt_list);
3564
3565
return true;
3566
}
3567
3568
/*
3569
* Work out whether it is possible to piggyback the execution of
3570
* vcore *pvc onto the execution of the other vcores described in *cip.
3571
*/
3572
static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
3573
int target_threads)
3574
{
3575
if (cip->total_threads + pvc->num_threads > target_threads)
3576
return false;
3577
3578
return can_dynamic_split(pvc, cip);
3579
}
3580
3581
static void prepare_threads(struct kvmppc_vcore *vc)
3582
{
3583
int i;
3584
struct kvm_vcpu *vcpu;
3585
3586
for_each_runnable_thread(i, vcpu, vc) {
3587
if (signal_pending(vcpu->arch.run_task))
3588
vcpu->arch.ret = -EINTR;
3589
else if (vcpu->arch.vpa.update_pending ||
3590
vcpu->arch.slb_shadow.update_pending ||
3591
vcpu->arch.dtl.update_pending)
3592
vcpu->arch.ret = RESUME_GUEST;
3593
else
3594
continue;
3595
kvmppc_remove_runnable(vc, vcpu, mftb());
3596
wake_up(&vcpu->arch.cpu_run);
3597
}
3598
}
3599
3600
static void collect_piggybacks(struct core_info *cip, int target_threads)
3601
{
3602
struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
3603
struct kvmppc_vcore *pvc, *vcnext;
3604
3605
spin_lock(&lp->lock);
3606
list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
3607
if (!spin_trylock(&pvc->lock))
3608
continue;
3609
prepare_threads(pvc);
3610
if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
3611
list_del_init(&pvc->preempt_list);
3612
if (pvc->runner == NULL) {
3613
pvc->vcore_state = VCORE_INACTIVE;
3614
kvmppc_core_end_stolen(pvc, mftb());
3615
}
3616
spin_unlock(&pvc->lock);
3617
continue;
3618
}
3619
if (!can_piggyback(pvc, cip, target_threads)) {
3620
spin_unlock(&pvc->lock);
3621
continue;
3622
}
3623
kvmppc_core_end_stolen(pvc, mftb());
3624
pvc->vcore_state = VCORE_PIGGYBACK;
3625
if (cip->total_threads >= target_threads)
3626
break;
3627
}
3628
spin_unlock(&lp->lock);
3629
}
3630
3631
static bool recheck_signals_and_mmu(struct core_info *cip)
3632
{
3633
int sub, i;
3634
struct kvm_vcpu *vcpu;
3635
struct kvmppc_vcore *vc;
3636
3637
for (sub = 0; sub < cip->n_subcores; ++sub) {
3638
vc = cip->vc[sub];
3639
if (!vc->kvm->arch.mmu_ready)
3640
return true;
3641
for_each_runnable_thread(i, vcpu, vc)
3642
if (signal_pending(vcpu->arch.run_task))
3643
return true;
3644
}
3645
return false;
3646
}
3647
3648
static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
3649
{
3650
int still_running = 0, i;
3651
u64 now;
3652
long ret;
3653
struct kvm_vcpu *vcpu;
3654
3655
spin_lock(&vc->lock);
3656
now = get_tb();
3657
for_each_runnable_thread(i, vcpu, vc) {
3658
/*
3659
* It's safe to unlock the vcore in the loop here, because
3660
* for_each_runnable_thread() is safe against removal of
3661
* the vcpu, and the vcore state is VCORE_EXITING here,
3662
* so any vcpus becoming runnable will have their arch.trap
3663
* set to zero and can't actually run in the guest.
3664
*/
3665
spin_unlock(&vc->lock);
3666
/* cancel pending dec exception if dec is positive */
3667
if (now < kvmppc_dec_expires_host_tb(vcpu) &&
3668
kvmppc_core_pending_dec(vcpu))
3669
kvmppc_core_dequeue_dec(vcpu);
3670
3671
trace_kvm_guest_exit(vcpu);
3672
3673
ret = RESUME_GUEST;
3674
if (vcpu->arch.trap)
3675
ret = kvmppc_handle_exit_hv(vcpu,
3676
vcpu->arch.run_task);
3677
3678
vcpu->arch.ret = ret;
3679
vcpu->arch.trap = 0;
3680
3681
spin_lock(&vc->lock);
3682
if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
3683
if (vcpu->arch.pending_exceptions)
3684
kvmppc_core_prepare_to_enter(vcpu);
3685
if (vcpu->arch.ceded)
3686
kvmppc_set_timer(vcpu);
3687
else
3688
++still_running;
3689
} else {
3690
kvmppc_remove_runnable(vc, vcpu, mftb());
3691
wake_up(&vcpu->arch.cpu_run);
3692
}
3693
}
3694
if (!is_master) {
3695
if (still_running > 0) {
3696
kvmppc_vcore_preempt(vc);
3697
} else if (vc->runner) {
3698
vc->vcore_state = VCORE_PREEMPT;
3699
kvmppc_core_start_stolen(vc, mftb());
3700
} else {
3701
vc->vcore_state = VCORE_INACTIVE;
3702
}
3703
if (vc->n_runnable > 0 && vc->runner == NULL) {
3704
/* make sure there's a candidate runner awake */
3705
i = -1;
3706
vcpu = next_runnable_thread(vc, &i);
3707
wake_up(&vcpu->arch.cpu_run);
3708
}
3709
}
3710
spin_unlock(&vc->lock);
3711
}
3712
3713
/*
3714
* Clear core from the list of active host cores as we are about to
3715
* enter the guest. Only do this if it is the primary thread of the
3716
* core (not if a subcore) that is entering the guest.
3717
*/
3718
static inline int kvmppc_clear_host_core(unsigned int cpu)
3719
{
3720
int core;
3721
3722
if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
3723
return 0;
3724
/*
3725
* Memory barrier can be omitted here as we will do a smp_wmb()
3726
* later in kvmppc_start_thread and we need ensure that state is
3727
* visible to other CPUs only after we enter guest.
3728
*/
3729
core = cpu >> threads_shift;
3730
kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
3731
return 0;
3732
}
3733
3734
/*
3735
* Advertise this core as an active host core since we exited the guest
3736
* Only need to do this if it is the primary thread of the core that is
3737
* exiting.
3738
*/
3739
static inline int kvmppc_set_host_core(unsigned int cpu)
3740
{
3741
int core;
3742
3743
if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
3744
return 0;
3745
3746
/*
3747
* Memory barrier can be omitted here because we do a spin_unlock
3748
* immediately after this which provides the memory barrier.
3749
*/
3750
core = cpu >> threads_shift;
3751
kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
3752
return 0;
3753
}
3754
3755
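/*
 * Record in the PACA which host interrupt caused the exit from the guest,
 * so that it is replayed once interrupts are re-enabled; a system reset
 * is replayed immediately.
 */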
static void set_irq_happened(int trap)
3756
{
3757
switch (trap) {
3758
case BOOK3S_INTERRUPT_EXTERNAL:
3759
local_paca->irq_happened |= PACA_IRQ_EE;
3760
break;
3761
case BOOK3S_INTERRUPT_H_DOORBELL:
3762
local_paca->irq_happened |= PACA_IRQ_DBELL;
3763
break;
3764
case BOOK3S_INTERRUPT_HMI:
3765
local_paca->irq_happened |= PACA_IRQ_HMI;
3766
break;
3767
case BOOK3S_INTERRUPT_SYSTEM_RESET:
3768
replay_system_reset();
3769
break;
3770
}
3771
}
3772
3773
/*
3774
* Run a set of guest threads on a physical core.
3775
* Called with vc->lock held.
3776
*/
3777
static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3778
{
3779
struct kvm_vcpu *vcpu;
3780
int i;
3781
int srcu_idx;
3782
struct core_info core_info;
3783
struct kvmppc_vcore *pvc;
3784
struct kvm_split_mode split_info, *sip;
3785
int split, subcore_size, active;
3786
int sub;
3787
bool thr0_done;
3788
unsigned long cmd_bit, stat_bit;
3789
int pcpu, thr;
3790
int target_threads;
3791
int controlled_threads;
3792
int trap;
3793
bool is_power8;
3794
3795
if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
3796
return;
3797
3798
/*
3799
* Remove from the list any threads that have a signal pending
3800
* or need a VPA update done
3801
*/
3802
prepare_threads(vc);
3803
3804
/* if the runner is no longer runnable, let the caller pick a new one */
3805
if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
3806
return;
3807
3808
/*
3809
* Initialize *vc.
3810
*/
3811
init_vcore_to_run(vc);
3812
vc->preempt_tb = TB_NIL;
3813
3814
/*
3815
* Number of threads that we will be controlling: the same as
3816
* the number of threads per subcore, except on POWER9,
3817
* where it's 1 because the threads are (mostly) independent.
3818
*/
3819
controlled_threads = threads_per_vcore(vc->kvm);
3820
3821
/*
3822
* Make sure we are running on primary threads, and that secondary
3823
* threads are offline. Also check if the number of threads in this
3824
* guest is greater than the current system threads per guest.
3825
*/
3826
if ((controlled_threads > 1) &&
3827
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
3828
for_each_runnable_thread(i, vcpu, vc) {
3829
vcpu->arch.ret = -EBUSY;
3830
kvmppc_remove_runnable(vc, vcpu, mftb());
3831
wake_up(&vcpu->arch.cpu_run);
3832
}
3833
goto out;
3834
}
3835
3836
/*
3837
* See if we could run any other vcores on the physical core
3838
* along with this one.
3839
*/
3840
init_core_info(&core_info, vc);
3841
pcpu = smp_processor_id();
3842
target_threads = controlled_threads;
3843
if (target_smt_mode && target_smt_mode < target_threads)
3844
target_threads = target_smt_mode;
3845
if (vc->num_threads < target_threads)
3846
collect_piggybacks(&core_info, target_threads);
3847
3848
/*
3849
* Hard-disable interrupts, and check resched flag and signals.
3850
* If we need to reschedule or deliver a signal, clean up
3851
* and return without going into the guest(s).
3852
* If the mmu_ready flag has been cleared, don't go into the
3853
* guest because that means a HPT resize operation is in progress.
3854
*/
3855
local_irq_disable();
3856
hard_irq_disable();
3857
if (lazy_irq_pending() || need_resched() ||
3858
recheck_signals_and_mmu(&core_info)) {
3859
local_irq_enable();
3860
vc->vcore_state = VCORE_INACTIVE;
3861
/* Unlock all except the primary vcore */
3862
for (sub = 1; sub < core_info.n_subcores; ++sub) {
3863
pvc = core_info.vc[sub];
3864
/* Put back on to the preempted vcores list */
3865
kvmppc_vcore_preempt(pvc);
3866
spin_unlock(&pvc->lock);
3867
}
3868
for (i = 0; i < controlled_threads; ++i)
3869
kvmppc_release_hwthread(pcpu + i);
3870
return;
3871
}
3872
3873
kvmppc_clear_host_core(pcpu);
3874
3875
/* Decide on micro-threading (split-core) mode */
3876
subcore_size = threads_per_subcore;
3877
cmd_bit = stat_bit = 0;
3878
split = core_info.n_subcores;
3879
sip = NULL;
3880
is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);
3881
3882
if (split > 1) {
3883
sip = &split_info;
3884
memset(&split_info, 0, sizeof(split_info));
3885
for (sub = 0; sub < core_info.n_subcores; ++sub)
3886
split_info.vc[sub] = core_info.vc[sub];
3887
3888
if (is_power8) {
3889
if (split == 2 && (dynamic_mt_modes & 2)) {
3890
cmd_bit = HID0_POWER8_1TO2LPAR;
3891
stat_bit = HID0_POWER8_2LPARMODE;
3892
} else {
3893
split = 4;
3894
cmd_bit = HID0_POWER8_1TO4LPAR;
3895
stat_bit = HID0_POWER8_4LPARMODE;
3896
}
3897
subcore_size = MAX_SMT_THREADS / split;
3898
split_info.rpr = mfspr(SPRN_RPR);
3899
split_info.pmmar = mfspr(SPRN_PMMAR);
3900
split_info.ldbar = mfspr(SPRN_LDBAR);
3901
split_info.subcore_size = subcore_size;
3902
} else {
3903
split_info.subcore_size = 1;
3904
}
3905
3906
/* order writes to split_info before kvm_split_mode pointer */
3907
smp_wmb();
3908
}
3909
3910
for (thr = 0; thr < controlled_threads; ++thr) {
3911
struct paca_struct *paca = paca_ptrs[pcpu + thr];
3912
3913
paca->kvm_hstate.napping = 0;
3914
paca->kvm_hstate.kvm_split_mode = sip;
3915
}
3916
3917
/* Initiate micro-threading (split-core) on POWER8 if required */
3918
if (cmd_bit) {
3919
unsigned long hid0 = mfspr(SPRN_HID0);
3920
3921
hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
3922
mb();
3923
mtspr(SPRN_HID0, hid0);
3924
isync();
3925
for (;;) {
3926
hid0 = mfspr(SPRN_HID0);
3927
if (hid0 & stat_bit)
3928
break;
3929
cpu_relax();
3930
}
3931
}
3932
3933
/*
3934
* On POWER8, set RWMR register.
3935
* Since it only affects PURR and SPURR, it doesn't affect
3936
* the host, so we don't save/restore the host value.
3937
*/
3938
if (is_power8) {
3939
unsigned long rwmr_val = RWMR_RPA_P8_8THREAD;
3940
int n_online = atomic_read(&vc->online_count);
3941
3942
/*
3943
* Use the 8-thread value if we're doing split-core
3944
* or if the vcore's online count looks bogus.
3945
*/
3946
if (split == 1 && threads_per_subcore == MAX_SMT_THREADS &&
3947
n_online >= 1 && n_online <= MAX_SMT_THREADS)
3948
rwmr_val = p8_rwmr_values[n_online];
3949
mtspr(SPRN_RWMR, rwmr_val);
3950
}
3951
3952
/* Start all the threads */
3953
active = 0;
3954
for (sub = 0; sub < core_info.n_subcores; ++sub) {
3955
thr = is_power8 ? subcore_thread_map[sub] : sub;
3956
thr0_done = false;
3957
active |= 1 << thr;
3958
pvc = core_info.vc[sub];
3959
pvc->pcpu = pcpu + thr;
3960
for_each_runnable_thread(i, vcpu, pvc) {
3961
/*
3962
* XXX: is kvmppc_start_thread called too late here?
3963
* It updates vcpu->cpu and vcpu->arch.thread_cpu
3964
* which are used by kvmppc_fast_vcpu_kick_hv(), but
3965
* kick is called after new exceptions become available
3966
* and exceptions are checked earlier than here, by
3967
* kvmppc_core_prepare_to_enter.
3968
*/
3969
kvmppc_start_thread(vcpu, pvc);
3970
kvmppc_update_vpa_dispatch(vcpu, pvc);
3971
trace_kvm_guest_enter(vcpu);
3972
if (!vcpu->arch.ptid)
3973
thr0_done = true;
3974
active |= 1 << (thr + vcpu->arch.ptid);
3975
}
3976
/*
3977
* We need to start the first thread of each subcore
3978
* even if it doesn't have a vcpu.
3979
*/
3980
if (!thr0_done)
3981
kvmppc_start_thread(NULL, pvc);
3982
}
3983
3984
/*
3985
* Ensure that split_info.do_nap is set after setting
3986
* the vcore pointer in the PACA of the secondaries.
3987
*/
3988
smp_mb();
3989
3990
/*
3991
* When doing micro-threading, poke the inactive threads as well.
3992
* This gets them to the nap instruction after kvm_do_nap,
3993
* which reduces the time taken to unsplit later.
3994
*/
3995
if (cmd_bit) {
3996
split_info.do_nap = 1; /* ask secondaries to nap when done */
3997
for (thr = 1; thr < threads_per_subcore; ++thr)
3998
if (!(active & (1 << thr)))
3999
kvmppc_ipi_thread(pcpu + thr);
4000
}
4001
4002
vc->vcore_state = VCORE_RUNNING;
4003
preempt_disable();
4004
4005
trace_kvmppc_run_core(vc, 0);
4006
4007
for (sub = 0; sub < core_info.n_subcores; ++sub)
4008
spin_unlock(&core_info.vc[sub]->lock);
4009
4010
guest_timing_enter_irqoff();
4011
4012
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
4013
4014
guest_state_enter_irqoff();
4015
this_cpu_disable_ftrace();
4016
4017
trap = __kvmppc_vcore_entry();
4018
4019
this_cpu_enable_ftrace();
4020
guest_state_exit_irqoff();
4021
4022
srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
4023
4024
set_irq_happened(trap);
4025
4026
spin_lock(&vc->lock);
4027
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
4028
vc->vcore_state = VCORE_EXITING;
4029
4030
/* wait for secondary threads to finish writing their state to memory */
4031
kvmppc_wait_for_nap(controlled_threads);
4032
4033
/* Return to whole-core mode if we split the core earlier */
4034
if (cmd_bit) {
4035
unsigned long hid0 = mfspr(SPRN_HID0);
4036
4037
hid0 &= ~HID0_POWER8_DYNLPARDIS;
4038
stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
4039
mb();
4040
mtspr(SPRN_HID0, hid0);
4041
isync();
4042
for (;;) {
4043
hid0 = mfspr(SPRN_HID0);
4044
if (!(hid0 & stat_bit))
4045
break;
4046
cpu_relax();
4047
}
4048
split_info.do_nap = 0;
4049
}
4050
4051
kvmppc_set_host_core(pcpu);
4052
4053
if (!vtime_accounting_enabled_this_cpu()) {
4054
local_irq_enable();
4055
/*
4056
* Service IRQs here before guest_timing_exit_irqoff() so any
4057
* ticks that occurred while running the guest are accounted to
4058
* the guest. If vtime accounting is enabled, accounting uses
4059
* TB rather than ticks, so it can be done without enabling
4060
* interrupts here, which has the problem that it accounts
4061
* interrupt processing overhead to the host.
4062
*/
4063
local_irq_disable();
4064
}
4065
guest_timing_exit_irqoff();
4066
4067
local_irq_enable();
4068
4069
/* Let secondaries go back to the offline loop */
4070
for (i = 0; i < controlled_threads; ++i) {
4071
kvmppc_release_hwthread(pcpu + i);
4072
if (sip && sip->napped[i])
4073
kvmppc_ipi_thread(pcpu + i);
4074
}
4075
4076
spin_unlock(&vc->lock);
4077
4078
/* make sure updates to secondary vcpu structs are visible now */
4079
smp_mb();
4080
4081
preempt_enable();
4082
4083
for (sub = 0; sub < core_info.n_subcores; ++sub) {
4084
pvc = core_info.vc[sub];
4085
post_guest_process(pvc, pvc == vc);
4086
}
4087
4088
spin_lock(&vc->lock);
4089
4090
out:
4091
vc->vcore_state = VCORE_INACTIVE;
4092
trace_kvmppc_run_core(vc, 1);
4093
}
4094
4095
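/* Hypercalls that are handled by the XICS interrupt controller emulation. */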
static inline bool hcall_is_xics(unsigned long req)
4096
{
4097
return req == H_EOI || req == H_CPPR || req == H_IPI ||
4098
req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
4099
}
4100
4101
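/*
 * Bump the yield count in the vcpu's VPA (if one is registered) and mark
 * the VPA dirty; called before entering and after leaving the guest.
 */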
static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu)
4102
{
4103
struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
4104
if (lp) {
4105
u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
4106
lp->yield_count = cpu_to_be32(yield_count);
4107
vcpu->arch.vpa.dirty = 1;
4108
}
4109
}
4110
4111
/* Helper functions for reading L2's stats from L1's VPA */
4112
#ifdef CONFIG_PPC_PSERIES
4113
static DEFINE_PER_CPU(u64, l1_to_l2_cs);
4114
static DEFINE_PER_CPU(u64, l2_to_l1_cs);
4115
static DEFINE_PER_CPU(u64, l2_runtime_agg);
4116
4117
int kvmhv_get_l2_counters_status(void)
4118
{
4119
return firmware_has_feature(FW_FEATURE_LPAR) &&
4120
get_lppaca()->l2_counters_enable;
4121
}
4122
4123
void kvmhv_set_l2_counters_status(int cpu, bool status)
4124
{
4125
if (!firmware_has_feature(FW_FEATURE_LPAR))
4126
return;
4127
if (status)
4128
lppaca_of(cpu).l2_counters_enable = 1;
4129
else
4130
lppaca_of(cpu).l2_counters_enable = 0;
4131
}
4132
EXPORT_SYMBOL(kvmhv_set_l2_counters_status);
4133
4134
int kvmhv_counters_tracepoint_regfunc(void)
4135
{
4136
int cpu;
4137
4138
for_each_present_cpu(cpu) {
4139
kvmhv_set_l2_counters_status(cpu, true);
4140
}
4141
return 0;
4142
}
4143
4144
void kvmhv_counters_tracepoint_unregfunc(void)
4145
{
4146
int cpu;
4147
4148
for_each_present_cpu(cpu) {
4149
kvmhv_set_l2_counters_status(cpu, false);
4150
}
4151
}
4152
4153
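/*
 * Emit the vcpu-stats tracepoint with the L1<->L2 context-switch times and
 * L2 runtime accumulated since the last call on this CPU, then remember
 * the new totals in the per-CPU counters and in the vcpu.
 */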
static void do_trace_nested_cs_time(struct kvm_vcpu *vcpu)
4154
{
4155
struct lppaca *lp = get_lppaca();
4156
u64 l1_to_l2_ns, l2_to_l1_ns, l2_runtime_ns;
4157
u64 *l1_to_l2_cs_ptr = this_cpu_ptr(&l1_to_l2_cs);
4158
u64 *l2_to_l1_cs_ptr = this_cpu_ptr(&l2_to_l1_cs);
4159
u64 *l2_runtime_agg_ptr = this_cpu_ptr(&l2_runtime_agg);
4160
4161
l1_to_l2_ns = tb_to_ns(be64_to_cpu(lp->l1_to_l2_cs_tb));
4162
l2_to_l1_ns = tb_to_ns(be64_to_cpu(lp->l2_to_l1_cs_tb));
4163
l2_runtime_ns = tb_to_ns(be64_to_cpu(lp->l2_runtime_tb));
4164
trace_kvmppc_vcpu_stats(vcpu, l1_to_l2_ns - *l1_to_l2_cs_ptr,
4165
l2_to_l1_ns - *l2_to_l1_cs_ptr,
4166
l2_runtime_ns - *l2_runtime_agg_ptr);
4167
*l1_to_l2_cs_ptr = l1_to_l2_ns;
4168
*l2_to_l1_cs_ptr = l2_to_l1_ns;
4169
*l2_runtime_agg_ptr = l2_runtime_ns;
4170
vcpu->arch.l1_to_l2_cs = l1_to_l2_ns;
4171
vcpu->arch.l2_to_l1_cs = l2_to_l1_ns;
4172
vcpu->arch.l2_runtime_agg = l2_runtime_ns;
4173
}
4174
4175
u64 kvmhv_get_l1_to_l2_cs_time(void)
4176
{
4177
return tb_to_ns(be64_to_cpu(get_lppaca()->l1_to_l2_cs_tb));
4178
}
4179
EXPORT_SYMBOL(kvmhv_get_l1_to_l2_cs_time);
4180
4181
u64 kvmhv_get_l2_to_l1_cs_time(void)
4182
{
4183
return tb_to_ns(be64_to_cpu(get_lppaca()->l2_to_l1_cs_tb));
4184
}
4185
EXPORT_SYMBOL(kvmhv_get_l2_to_l1_cs_time);
4186
4187
u64 kvmhv_get_l2_runtime_agg(void)
4188
{
4189
return tb_to_ns(be64_to_cpu(get_lppaca()->l2_runtime_tb));
4190
}
4191
EXPORT_SYMBOL(kvmhv_get_l2_runtime_agg);
4192
4193
u64 kvmhv_get_l1_to_l2_cs_time_vcpu(void)
4194
{
4195
struct kvm_vcpu *vcpu;
4196
struct kvm_vcpu_arch *arch;
4197
4198
vcpu = local_paca->kvm_hstate.kvm_vcpu;
4199
if (vcpu) {
4200
arch = &vcpu->arch;
4201
return arch->l1_to_l2_cs;
4202
} else {
4203
return 0;
4204
}
4205
}
4206
EXPORT_SYMBOL(kvmhv_get_l1_to_l2_cs_time_vcpu);
4207
4208
u64 kvmhv_get_l2_to_l1_cs_time_vcpu(void)
4209
{
4210
struct kvm_vcpu *vcpu;
4211
struct kvm_vcpu_arch *arch;
4212
4213
vcpu = local_paca->kvm_hstate.kvm_vcpu;
4214
if (vcpu) {
4215
arch = &vcpu->arch;
4216
return arch->l2_to_l1_cs;
4217
} else {
4218
return 0;
4219
}
4220
}
4221
EXPORT_SYMBOL(kvmhv_get_l2_to_l1_cs_time_vcpu);
4222
4223
u64 kvmhv_get_l2_runtime_agg_vcpu(void)
4224
{
4225
struct kvm_vcpu *vcpu;
4226
struct kvm_vcpu_arch *arch;
4227
4228
vcpu = local_paca->kvm_hstate.kvm_vcpu;
4229
if (vcpu) {
4230
arch = &vcpu->arch;
4231
return arch->l2_runtime_agg;
4232
} else {
4233
return 0;
4234
}
4235
}
4236
EXPORT_SYMBOL(kvmhv_get_l2_runtime_agg_vcpu);
4237
4238
#else
4239
int kvmhv_get_l2_counters_status(void)
4240
{
4241
return 0;
4242
}
4243
4244
static void do_trace_nested_cs_time(struct kvm_vcpu *vcpu)
4245
{
4246
}
4247
#endif
4248
4249
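/*
 * Enter an L2 guest via the nested-v2 (guest state buffer) interface:
 * flush any dirty vcpu state to the L0 hypervisor, issue the guest
 * run-vcpu hcall, then parse the output buffer and rearm the host
 * decrementer.  Returns the trap that caused the exit, 0 if the guest
 * was not entered, or -EINVAL on hcall or buffer errors.
 */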
static int kvmhv_vcpu_entry_nestedv2(struct kvm_vcpu *vcpu, u64 time_limit,
4250
unsigned long lpcr, u64 *tb)
4251
{
4252
struct kvmhv_nestedv2_io *io;
4253
unsigned long msr, i;
4254
int trap;
4255
long rc;
4256
4257
if (vcpu->arch.doorbell_request) {
4258
vcpu->arch.doorbell_request = 0;
4259
kvmppc_set_dpdes(vcpu, 1);
4260
}
4261
4262
io = &vcpu->arch.nestedv2_io;
4263
4264
msr = mfmsr();
4265
kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
4266
if (lazy_irq_pending())
4267
return 0;
4268
4269
rc = kvmhv_nestedv2_flush_vcpu(vcpu, time_limit);
4270
if (rc < 0)
4271
return -EINVAL;
4272
4273
kvmppc_gse_put_u64(io->vcpu_run_input, KVMPPC_GSID_LPCR, lpcr);
4274
4275
accumulate_time(vcpu, &vcpu->arch.in_guest);
4276
rc = plpar_guest_run_vcpu(0, vcpu->kvm->arch.lpid, vcpu->vcpu_id,
4277
&trap, &i);
4278
4279
if (rc != H_SUCCESS) {
4280
pr_err("KVM Guest Run VCPU hcall failed\n");
4281
if (rc == H_INVALID_ELEMENT_ID)
4282
pr_err("KVM: Guest Run VCPU invalid element id at %ld\n", i);
4283
else if (rc == H_INVALID_ELEMENT_SIZE)
4284
pr_err("KVM: Guest Run VCPU invalid element size at %ld\n", i);
4285
else if (rc == H_INVALID_ELEMENT_VALUE)
4286
pr_err("KVM: Guest Run VCPU invalid element value at %ld\n", i);
4287
return -EINVAL;
4288
}
4289
accumulate_time(vcpu, &vcpu->arch.guest_exit);
4290
4291
*tb = mftb();
4292
kvmppc_gsm_reset(io->vcpu_message);
4293
kvmppc_gsm_reset(io->vcore_message);
4294
kvmppc_gsbm_zero(&io->valids);
4295
4296
rc = kvmhv_nestedv2_parse_output(vcpu);
4297
if (rc < 0)
4298
return -EINVAL;
4299
4300
timer_rearm_host_dec(*tb);
4301
4302
/* Record context switch and guest_run_time data */
4303
if (kvmhv_get_l2_counters_status())
4304
do_trace_nested_cs_time(vcpu);
4305
4306
return trap;
4307
}
4308
4309
/* call our hypervisor to load up HV regs and go */
4310
static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb)
4311
{
4312
unsigned long host_psscr;
4313
unsigned long msr;
4314
struct hv_guest_state hvregs;
4315
struct p9_host_os_sprs host_os_sprs;
4316
s64 dec;
4317
int trap;
4318
4319
msr = mfmsr();
4320
4321
save_p9_host_os_sprs(&host_os_sprs);
4322
4323
/*
4324
* We need to save and restore the guest visible part of the
4325
* psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
4326
* doesn't do this for us. Note this is only required on pseries since
4327
* this is done in kvmhv_vcpu_entry_p9() below otherwise.
4328
*/
4329
host_psscr = mfspr(SPRN_PSSCR_PR);
4330
4331
kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
4332
if (lazy_irq_pending())
4333
return 0;
4334
4335
if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
4336
msr = mfmsr(); /* TM restore can update msr */
4337
4338
if (vcpu->arch.psscr != host_psscr)
4339
mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
4340
4341
kvmhv_save_hv_regs(vcpu, &hvregs);
4342
hvregs.lpcr = lpcr;
4343
hvregs.amor = ~0;
4344
vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
4345
hvregs.version = HV_GUEST_STATE_VERSION;
4346
if (vcpu->arch.nested) {
4347
hvregs.lpid = vcpu->arch.nested->shadow_lpid;
4348
hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
4349
} else {
4350
hvregs.lpid = vcpu->kvm->arch.lpid;
4351
hvregs.vcpu_token = vcpu->vcpu_id;
4352
}
4353
hvregs.hdec_expiry = time_limit;
4354
4355
/*
4356
* hvregs has the doorbell status, so zero it here, which
4357
* enables us to receive doorbells when H_ENTER_NESTED is
4358
* in progress for this vCPU
4359
*/
4360
4361
if (vcpu->arch.doorbell_request)
4362
vcpu->arch.doorbell_request = 0;
4363
4364
/*
4365
* When setting DEC, we must always deal with irq_work_raise
4366
* via NMI vs setting DEC. The problem occurs right as we
4367
* switch into guest mode if a NMI hits and sets pending work
4368
* and sets DEC, then that will apply to the guest and not
4369
* bring us back to the host.
4370
*
4371
* irq_work_raise could check a flag (or possibly LPCR[HDICE]
4372
* for example) and set HDEC to 1? That wouldn't solve the
4373
* nested hv case which needs to abort the hcall or zero the
4374
* time limit.
4375
*
4376
* XXX: Another day's problem.
4377
*/
4378
mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb);
4379
4380
mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
4381
mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
4382
switch_pmu_to_guest(vcpu, &host_os_sprs);
4383
accumulate_time(vcpu, &vcpu->arch.in_guest);
4384
trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
4385
__pa(&vcpu->arch.regs));
4386
accumulate_time(vcpu, &vcpu->arch.guest_exit);
4387
kvmhv_restore_hv_return_state(vcpu, &hvregs);
4388
switch_pmu_to_host(vcpu, &host_os_sprs);
4389
vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
4390
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
4391
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
4392
vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
4393
4394
store_vcpu_state(vcpu);
4395
4396
dec = mfspr(SPRN_DEC);
4397
if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
4398
dec = (s32) dec;
4399
*tb = mftb();
4400
vcpu->arch.dec_expires = dec + (*tb + kvmppc_get_tb_offset(vcpu));
4401
4402
timer_rearm_host_dec(*tb);
4403
4404
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
4405
if (vcpu->arch.psscr != host_psscr)
4406
mtspr(SPRN_PSSCR_PR, host_psscr);
4407
4408
return trap;
4409
}
4410
4411
/*
4412
* Guest entry for POWER9 and later CPUs.
4413
*/
4414
static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
4415
unsigned long lpcr, u64 *tb)
4416
{
4417
struct kvm *kvm = vcpu->kvm;
4418
struct kvm_nested_guest *nested = vcpu->arch.nested;
4419
u64 next_timer;
4420
int trap;
4421
4422
next_timer = timer_get_next_tb();
4423
if (*tb >= next_timer)
4424
return BOOK3S_INTERRUPT_HV_DECREMENTER;
4425
if (next_timer < time_limit)
4426
time_limit = next_timer;
4427
else if (*tb >= time_limit) /* nested time limit */
4428
return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
4429
4430
vcpu->arch.ceded = 0;
4431
4432
vcpu_vpa_increment_dispatch(vcpu);
4433
4434
if (kvmhv_on_pseries()) {
4435
if (kvmhv_is_nestedv1())
4436
trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
4437
else
4438
trap = kvmhv_vcpu_entry_nestedv2(vcpu, time_limit, lpcr, tb);
4439
4440
/* H_CEDE has to be handled now, not later */
4441
if (trap == BOOK3S_INTERRUPT_SYSCALL && !nested &&
4442
kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
4443
kvmppc_cede(vcpu);
4444
kvmppc_set_gpr(vcpu, 3, 0);
4445
trap = 0;
4446
}
4447
4448
} else if (nested) {
4449
__this_cpu_write(cpu_in_guest, kvm);
4450
trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
4451
__this_cpu_write(cpu_in_guest, NULL);
4452
4453
} else {
4454
kvmppc_xive_push_vcpu(vcpu);
4455
4456
__this_cpu_write(cpu_in_guest, kvm);
4457
trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
4458
__this_cpu_write(cpu_in_guest, NULL);
4459
4460
if (trap == BOOK3S_INTERRUPT_SYSCALL &&
4461
!(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
4462
unsigned long req = kvmppc_get_gpr(vcpu, 3);
4463
4464
/*
4465
* XIVE rearm and XICS hcalls must be handled
4466
* before xive context is pulled (is this
4467
* true?)
4468
*/
4469
if (req == H_CEDE) {
4470
/* H_CEDE has to be handled now */
4471
kvmppc_cede(vcpu);
4472
if (!kvmppc_xive_rearm_escalation(vcpu)) {
4473
/*
4474
* Pending escalation so abort
4475
* the cede.
4476
*/
4477
vcpu->arch.ceded = 0;
4478
}
4479
kvmppc_set_gpr(vcpu, 3, 0);
4480
trap = 0;
4481
4482
} else if (req == H_ENTER_NESTED) {
4483
/*
4484
* L2 should not run with the L1
4485
* context so rearm and pull it.
4486
*/
4487
if (!kvmppc_xive_rearm_escalation(vcpu)) {
4488
/*
4489
* Pending escalation so abort
4490
* H_ENTER_NESTED.
4491
*/
4492
kvmppc_set_gpr(vcpu, 3, 0);
4493
trap = 0;
4494
}
4495
4496
} else if (hcall_is_xics(req)) {
4497
int ret;
4498
4499
ret = kvmppc_xive_xics_hcall(vcpu, req);
4500
if (ret != H_TOO_HARD) {
4501
kvmppc_set_gpr(vcpu, 3, ret);
4502
trap = 0;
4503
}
4504
}
4505
}
4506
kvmppc_xive_pull_vcpu(vcpu);
4507
4508
if (kvm_is_radix(kvm))
4509
vcpu->arch.slb_max = 0;
4510
}
4511
4512
vcpu_vpa_increment_dispatch(vcpu);
4513
4514
return trap;
4515
}
4516
4517
/*
4518
* Wait for some other vcpu thread to execute us, and
4519
* wake us up when we need to handle something in the host.
4520
*/
4521
static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
4522
struct kvm_vcpu *vcpu, int wait_state)
4523
{
4524
DEFINE_WAIT(wait);
4525
4526
prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
4527
if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
4528
spin_unlock(&vc->lock);
4529
schedule();
4530
spin_lock(&vc->lock);
4531
}
4532
finish_wait(&vcpu->arch.cpu_run, &wait);
4533
}
4534
4535
static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
4536
{
4537
if (!halt_poll_ns_grow)
4538
return;
4539
4540
vc->halt_poll_ns *= halt_poll_ns_grow;
4541
if (vc->halt_poll_ns < halt_poll_ns_grow_start)
4542
vc->halt_poll_ns = halt_poll_ns_grow_start;
4543
}
4544
4545
static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
4546
{
4547
if (halt_poll_ns_shrink == 0)
4548
vc->halt_poll_ns = 0;
4549
else
4550
vc->halt_poll_ns /= halt_poll_ns_shrink;
4551
}
4552
4553
#ifdef CONFIG_KVM_XICS
4554
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
4555
{
4556
if (!xics_on_xive())
4557
return false;
4558
return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
4559
vcpu->arch.xive_saved_state.cppr;
4560
}
4561
#else
4562
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
4563
{
4564
return false;
4565
}
4566
#endif /* CONFIG_KVM_XICS */
4567
4568
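/*
 * Does this vcpu have a reason to wake up: a pending exception, a prod,
 * a pending doorbell, or a pending XIVE interrupt?
 */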
static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
4569
{
4570
if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
4571
kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
4572
return true;
4573
4574
return false;
4575
}
4576
4577
static bool kvmppc_vcpu_check_block(struct kvm_vcpu *vcpu)
4578
{
4579
if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
4580
return true;
4581
return false;
4582
}
4583
4584
/*
4585
* Check to see if any of the runnable vcpus on the vcore have pending
4586
* exceptions or are no longer ceded
4587
*/
4588
static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
4589
{
4590
struct kvm_vcpu *vcpu;
4591
int i;
4592
4593
for_each_runnable_thread(i, vcpu, vc) {
4594
if (kvmppc_vcpu_check_block(vcpu))
4595
return 1;
4596
}
4597
4598
return 0;
4599
}
4600
4601
/*
4602
* All the vcpus in this vcore are idle, so wait for a decrementer
4603
* or external interrupt to one of the vcpus. vc->lock is held.
4604
*/
4605
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
4606
{
4607
ktime_t cur, start_poll, start_wait;
4608
int do_sleep = 1;
4609
u64 block_ns;
4610
4611
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
4612
4613
/* Poll for pending exceptions and ceded state */
4614
cur = start_poll = ktime_get();
4615
if (vc->halt_poll_ns) {
4616
ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
4617
++vc->runner->stat.generic.halt_attempted_poll;
4618
4619
vc->vcore_state = VCORE_POLLING;
4620
spin_unlock(&vc->lock);
4621
4622
do {
4623
if (kvmppc_vcore_check_block(vc)) {
4624
do_sleep = 0;
4625
break;
4626
}
4627
cur = ktime_get();
4628
} while (kvm_vcpu_can_poll(cur, stop));
4629
4630
spin_lock(&vc->lock);
4631
vc->vcore_state = VCORE_INACTIVE;
4632
4633
if (!do_sleep) {
4634
++vc->runner->stat.generic.halt_successful_poll;
4635
goto out;
4636
}
4637
}
4638
4639
prepare_to_rcuwait(&vc->wait);
4640
set_current_state(TASK_INTERRUPTIBLE);
4641
if (kvmppc_vcore_check_block(vc)) {
4642
finish_rcuwait(&vc->wait);
4643
do_sleep = 0;
4644
/* If we polled, count this as a successful poll */
4645
if (vc->halt_poll_ns)
4646
++vc->runner->stat.generic.halt_successful_poll;
4647
goto out;
4648
}
4649
4650
start_wait = ktime_get();
4651
4652
vc->vcore_state = VCORE_SLEEPING;
4653
trace_kvmppc_vcore_blocked(vc->runner, 0);
4654
spin_unlock(&vc->lock);
4655
schedule();
4656
finish_rcuwait(&vc->wait);
4657
spin_lock(&vc->lock);
4658
vc->vcore_state = VCORE_INACTIVE;
4659
trace_kvmppc_vcore_blocked(vc->runner, 1);
4660
++vc->runner->stat.halt_successful_wait;
4661
4662
cur = ktime_get();
4663
4664
out:
4665
block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
4666
4667
/* Attribute wait time */
4668
if (do_sleep) {
4669
vc->runner->stat.generic.halt_wait_ns +=
4670
ktime_to_ns(cur) - ktime_to_ns(start_wait);
4671
KVM_STATS_LOG_HIST_UPDATE(
4672
vc->runner->stat.generic.halt_wait_hist,
4673
ktime_to_ns(cur) - ktime_to_ns(start_wait));
4674
/* Attribute failed poll time */
4675
if (vc->halt_poll_ns) {
4676
vc->runner->stat.generic.halt_poll_fail_ns +=
4677
ktime_to_ns(start_wait) -
4678
ktime_to_ns(start_poll);
4679
KVM_STATS_LOG_HIST_UPDATE(
4680
vc->runner->stat.generic.halt_poll_fail_hist,
4681
ktime_to_ns(start_wait) -
4682
ktime_to_ns(start_poll));
4683
}
4684
} else {
4685
/* Attribute successful poll time */
4686
if (vc->halt_poll_ns) {
4687
vc->runner->stat.generic.halt_poll_success_ns +=
4688
ktime_to_ns(cur) -
4689
ktime_to_ns(start_poll);
4690
KVM_STATS_LOG_HIST_UPDATE(
4691
vc->runner->stat.generic.halt_poll_success_hist,
4692
ktime_to_ns(cur) - ktime_to_ns(start_poll));
4693
}
4694
}
4695
4696
/* Adjust poll time */
4697
if (halt_poll_ns) {
4698
if (block_ns <= vc->halt_poll_ns)
4699
;
4700
/* We slept and blocked for longer than the max halt time */
4701
else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
4702
shrink_halt_poll_ns(vc);
4703
/* We slept and our poll time is too small */
4704
else if (vc->halt_poll_ns < halt_poll_ns &&
4705
block_ns < halt_poll_ns)
4706
grow_halt_poll_ns(vc);
4707
if (vc->halt_poll_ns > halt_poll_ns)
4708
vc->halt_poll_ns = halt_poll_ns;
4709
} else
4710
vc->halt_poll_ns = 0;
4711
4712
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
4713
}
4714
4715
/*
4716
* This never fails for a radix guest, as none of the operations it does
4717
* for a radix guest can fail or have a way to report failure.
4718
*/
4719
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
4720
{
4721
int r = 0;
4722
struct kvm *kvm = vcpu->kvm;
4723
4724
mutex_lock(&kvm->arch.mmu_setup_lock);
4725
if (!kvm->arch.mmu_ready) {
4726
if (!kvm_is_radix(kvm))
4727
r = kvmppc_hv_setup_htab_rma(vcpu);
4728
if (!r) {
4729
if (cpu_has_feature(CPU_FTR_ARCH_300))
4730
kvmppc_setup_partition_table(kvm);
4731
kvm->arch.mmu_ready = 1;
4732
}
4733
}
4734
mutex_unlock(&kvm->arch.mmu_setup_lock);
4735
return r;
4736
}
4737
4738
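/*
 * Run loop used when guests are run a whole virtual core at a time
 * (i.e. on hosts without the ARCH_300 / POWER9 feature): add the vcpu to
 * its virtual core as runnable, then either join a vcore that is already
 * running or act as the runner that drives kvmppc_run_core(), until the
 * vcpu stops being runnable or a signal arrives.
 */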
static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
4739
{
4740
struct kvm_run *run = vcpu->run;
4741
int n_ceded, i, r;
4742
struct kvmppc_vcore *vc;
4743
struct kvm_vcpu *v;
4744
4745
trace_kvmppc_run_vcpu_enter(vcpu);
4746
4747
run->exit_reason = 0;
4748
vcpu->arch.ret = RESUME_GUEST;
4749
vcpu->arch.trap = 0;
4750
kvmppc_update_vpas(vcpu);
4751
4752
/*
4753
* Synchronize with other threads in this virtual core
4754
*/
4755
vc = vcpu->arch.vcore;
4756
spin_lock(&vc->lock);
4757
vcpu->arch.ceded = 0;
4758
vcpu->arch.run_task = current;
4759
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
4760
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4761
vcpu->arch.busy_preempt = TB_NIL;
4762
WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
4763
++vc->n_runnable;
4764
4765
/*
4766
* This happens the first time this is called for a vcpu.
4767
* If the vcore is already running, we may be able to start
4768
* this thread straight away and have it join in.
4769
*/
4770
if (!signal_pending(current)) {
4771
if ((vc->vcore_state == VCORE_PIGGYBACK ||
4772
vc->vcore_state == VCORE_RUNNING) &&
4773
!VCORE_IS_EXITING(vc)) {
4774
kvmppc_update_vpa_dispatch(vcpu, vc);
4775
kvmppc_start_thread(vcpu, vc);
4776
trace_kvm_guest_enter(vcpu);
4777
} else if (vc->vcore_state == VCORE_SLEEPING) {
4778
rcuwait_wake_up(&vc->wait);
4779
}
4780
4781
}
4782
4783
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
4784
!signal_pending(current)) {
4785
/* See if the MMU is ready to go */
4786
if (!vcpu->kvm->arch.mmu_ready) {
4787
spin_unlock(&vc->lock);
4788
r = kvmhv_setup_mmu(vcpu);
4789
spin_lock(&vc->lock);
4790
if (r) {
4791
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4792
run->fail_entry.
4793
hardware_entry_failure_reason = 0;
4794
vcpu->arch.ret = r;
4795
break;
4796
}
4797
}
4798
4799
if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
4800
kvmppc_vcore_end_preempt(vc);
4801
4802
if (vc->vcore_state != VCORE_INACTIVE) {
4803
kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
4804
continue;
4805
}
4806
for_each_runnable_thread(i, v, vc) {
4807
kvmppc_core_prepare_to_enter(v);
4808
if (signal_pending(v->arch.run_task)) {
4809
kvmppc_remove_runnable(vc, v, mftb());
4810
v->stat.signal_exits++;
4811
v->run->exit_reason = KVM_EXIT_INTR;
4812
v->arch.ret = -EINTR;
4813
wake_up(&v->arch.cpu_run);
4814
}
4815
}
4816
if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
4817
break;
4818
n_ceded = 0;
4819
for_each_runnable_thread(i, v, vc) {
4820
if (!kvmppc_vcpu_woken(v))
4821
n_ceded += v->arch.ceded;
4822
else
4823
v->arch.ceded = 0;
4824
}
4825
vc->runner = vcpu;
4826
if (n_ceded == vc->n_runnable) {
4827
kvmppc_vcore_blocked(vc);
4828
} else if (need_resched()) {
4829
kvmppc_vcore_preempt(vc);
4830
/* Let something else run */
4831
cond_resched_lock(&vc->lock);
4832
if (vc->vcore_state == VCORE_PREEMPT)
4833
kvmppc_vcore_end_preempt(vc);
4834
} else {
4835
kvmppc_run_core(vc);
4836
}
4837
vc->runner = NULL;
4838
}
4839
4840
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
4841
(vc->vcore_state == VCORE_RUNNING ||
4842
vc->vcore_state == VCORE_EXITING ||
4843
vc->vcore_state == VCORE_PIGGYBACK))
4844
kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
4845
4846
if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
4847
kvmppc_vcore_end_preempt(vc);
4848
4849
if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
4850
kvmppc_remove_runnable(vc, vcpu, mftb());
4851
vcpu->stat.signal_exits++;
4852
run->exit_reason = KVM_EXIT_INTR;
4853
vcpu->arch.ret = -EINTR;
4854
}
4855
4856
if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
4857
/* Wake up some vcpu to run the core */
4858
i = -1;
4859
v = next_runnable_thread(vc, &i);
4860
wake_up(&v->arch.cpu_run);
4861
}
4862
4863
trace_kvmppc_run_vcpu_exit(vcpu);
4864
spin_unlock(&vc->lock);
4865
return vcpu->arch.ret;
4866
}
4867
4868
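/*
 * Run a single vcpu on its own hardware thread, as is done on POWER9 and
 * later where vcpus do not have to be gathered into a whole virtual core:
 * set up the vcpu and LPCR, enter the guest via kvmhv_p9_guest_entry(),
 * handle the resulting exit, and block if the vcpu ceded with nothing
 * pending.
 */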
int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
4869
unsigned long lpcr)
4870
{
4871
struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
4872
struct kvm_run *run = vcpu->run;
4873
int trap, r, pcpu;
4874
int srcu_idx;
4875
struct kvmppc_vcore *vc;
4876
struct kvm *kvm = vcpu->kvm;
4877
struct kvm_nested_guest *nested = vcpu->arch.nested;
4878
unsigned long flags;
4879
u64 tb;
4880
4881
trace_kvmppc_run_vcpu_enter(vcpu);
4882
4883
run->exit_reason = 0;
4884
vcpu->arch.ret = RESUME_GUEST;
4885
vcpu->arch.trap = 0;
4886
4887
vc = vcpu->arch.vcore;
4888
vcpu->arch.ceded = 0;
4889
vcpu->arch.run_task = current;
4890
vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
4891
4892
/* See if the MMU is ready to go */
4893
if (unlikely(!kvm->arch.mmu_ready)) {
4894
r = kvmhv_setup_mmu(vcpu);
4895
if (r) {
4896
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4897
run->fail_entry.hardware_entry_failure_reason = 0;
4898
vcpu->arch.ret = r;
4899
return r;
4900
}
4901
}
4902
4903
if (need_resched())
4904
cond_resched();
4905
4906
kvmppc_update_vpas(vcpu);
4907
4908
preempt_disable();
4909
pcpu = smp_processor_id();
4910
if (kvm_is_radix(kvm))
4911
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
4912
4913
/* flags save not required, but irq_pmu has no disable/enable API */
4914
powerpc_local_irq_pmu_save(flags);
4915
4916
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4917
4918
if (signal_pending(current))
4919
goto sigpend;
4920
if (need_resched() || !kvm->arch.mmu_ready)
4921
goto out;
4922
4923
vcpu->cpu = pcpu;
4924
vcpu->arch.thread_cpu = pcpu;
4925
vc->pcpu = pcpu;
4926
local_paca->kvm_hstate.kvm_vcpu = vcpu;
4927
local_paca->kvm_hstate.ptid = 0;
4928
local_paca->kvm_hstate.fake_suspend = 0;
4929
4930
/*
4931
* Orders set cpu/thread_cpu vs testing for pending interrupts and
4932
* doorbells below. The other side is when these fields are set vs
4933
* kvmppc_fast_vcpu_kick_hv reading the cpu/thread_cpu fields to
4934
* kick a vCPU to notice the pending interrupt.
4935
*/
4936
smp_mb();
4937
4938
if (!nested) {
4939
kvmppc_core_prepare_to_enter(vcpu);
4940
if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
4941
&vcpu->arch.pending_exceptions) ||
4942
xive_interrupt_pending(vcpu)) {
4943
/*
4944
* For nested HV, don't synthesize the interrupt but always pass MER;
4945
* the L0 will be able to optimise that more
4946
* effectively than manipulating registers directly.
4947
*/
4948
if (!kvmhv_on_pseries() && (__kvmppc_get_msr_hv(vcpu) & MSR_EE))
4949
kvmppc_inject_interrupt_hv(vcpu,
4950
BOOK3S_INTERRUPT_EXTERNAL, 0);
4951
else
4952
lpcr |= LPCR_MER;
4953
} else {
4954
/*
4955
* L1's copy of L2's LPCR (vcpu->arch.vcore->lpcr) can get its MER bit
4956
* unexpectedly set - e.g. during NMI handling when all register
4957
* states are synchronized from L0 to L1. L1 needs to inform L0 about
4958
* MER=1 only when there are pending external interrupts.
4959
* In the above if check, MER bit is set if there are pending
4960
* external interrupts. Hence, explicitly mask off MER bit
4961
* here as otherwise it may generate spurious interrupts in L2 KVM
4962
* causing an endless loop, which results in L2 guest getting hung.
4963
*/
4964
lpcr &= ~LPCR_MER;
4965
}
4966
} else if (vcpu->arch.pending_exceptions ||
4967
xive_interrupt_pending(vcpu)) {
4968
vcpu->arch.ret = RESUME_HOST;
4969
goto out;
4970
}
4971
4972
if (vcpu->arch.timer_running) {
4973
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
4974
vcpu->arch.timer_running = 0;
4975
}
4976
4977
tb = mftb();
4978
4979
kvmppc_update_vpa_dispatch_p9(vcpu, vc, tb + kvmppc_get_tb_offset(vcpu));
4980
4981
trace_kvm_guest_enter(vcpu);
4982
4983
guest_timing_enter_irqoff();
4984
4985
srcu_idx = srcu_read_lock(&kvm->srcu);
4986
4987
guest_state_enter_irqoff();
4988
this_cpu_disable_ftrace();
4989
4990
trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, &tb);
4991
vcpu->arch.trap = trap;
4992
4993
this_cpu_enable_ftrace();
4994
guest_state_exit_irqoff();
4995
4996
srcu_read_unlock(&kvm->srcu, srcu_idx);
4997
4998
set_irq_happened(trap);
4999
5000
vcpu->cpu = -1;
5001
vcpu->arch.thread_cpu = -1;
5002
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
5003
5004
if (!vtime_accounting_enabled_this_cpu()) {
5005
powerpc_local_irq_pmu_restore(flags);
5006
/*
5007
* Service IRQs here before guest_timing_exit_irqoff() so any
5008
* ticks that occurred while running the guest are accounted to
5009
* the guest. If vtime accounting is enabled, accounting uses
5010
* TB rather than ticks, so it can be done without enabling
5011
* interrupts here, which has the problem that it accounts
5012
* interrupt processing overhead to the host.
5013
*/
5014
powerpc_local_irq_pmu_save(flags);
5015
}
5016
guest_timing_exit_irqoff();
5017
5018
powerpc_local_irq_pmu_restore(flags);
5019
5020
preempt_enable();
5021
5022
/*
5023
* cancel pending decrementer exception if DEC is now positive, or if
5024
* entering a nested guest in which case the decrementer is now owned
5025
* by L2 and the L1 decrementer is provided in hdec_expires
5026
*/
5027
if (kvmppc_core_pending_dec(vcpu) &&
5028
((tb < kvmppc_dec_expires_host_tb(vcpu)) ||
5029
(trap == BOOK3S_INTERRUPT_SYSCALL &&
5030
kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
5031
kvmppc_core_dequeue_dec(vcpu);
5032
5033
trace_kvm_guest_exit(vcpu);
5034
r = RESUME_GUEST;
5035
if (trap) {
5036
if (!nested)
5037
r = kvmppc_handle_exit_hv(vcpu, current);
5038
else
5039
r = kvmppc_handle_nested_exit(vcpu);
5040
}
5041
vcpu->arch.ret = r;
5042
5043
if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
5044
kvmppc_set_timer(vcpu);
5045
5046
prepare_to_rcuwait(wait);
5047
for (;;) {
5048
set_current_state(TASK_INTERRUPTIBLE);
5049
if (signal_pending(current)) {
5050
vcpu->stat.signal_exits++;
5051
run->exit_reason = KVM_EXIT_INTR;
5052
vcpu->arch.ret = -EINTR;
5053
break;
5054
}
5055
5056
if (kvmppc_vcpu_check_block(vcpu))
5057
break;
5058
5059
trace_kvmppc_vcore_blocked(vcpu, 0);
5060
schedule();
5061
trace_kvmppc_vcore_blocked(vcpu, 1);
5062
}
5063
finish_rcuwait(wait);
5064
}
5065
vcpu->arch.ceded = 0;
5066
5067
done:
5068
trace_kvmppc_run_vcpu_exit(vcpu);
5069
5070
return vcpu->arch.ret;
5071
5072
sigpend:
5073
vcpu->stat.signal_exits++;
5074
run->exit_reason = KVM_EXIT_INTR;
5075
vcpu->arch.ret = -EINTR;
5076
out:
5077
vcpu->cpu = -1;
5078
vcpu->arch.thread_cpu = -1;
5079
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
5080
powerpc_local_irq_pmu_restore(flags);
5081
preempt_enable();
5082
goto done;
5083
}
5084
5085
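/*
 * Top-level entry point for running an HV guest vcpu: check the entry
 * conditions, enable the FP/VEC/VSX/TM facilities the guest may use, then
 * loop entering the guest and handling hcalls, page faults and
 * passthrough interrupts until the exit has to be delivered to userspace.
 */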
static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
5086
{
5087
struct kvm_run *run = vcpu->run;
5088
int r;
5089
int srcu_idx;
5090
struct kvm *kvm;
5091
unsigned long msr;
5092
5093
start_timing(vcpu, &vcpu->arch.vcpu_entry);
5094
5095
if (!vcpu->arch.sane) {
5096
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5097
return -EINVAL;
5098
}
5099
5100
/* No need to go into the guest when all we'll do is come back out */
5101
if (signal_pending(current)) {
5102
run->exit_reason = KVM_EXIT_INTR;
5103
return -EINTR;
5104
}
5105
5106
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
5107
/*
5108
* Don't allow entry with a suspended transaction, because
5109
* the guest entry/exit code will lose it.
5110
*/
5111
if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
5112
(current->thread.regs->msr & MSR_TM)) {
5113
if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
5114
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
5115
run->fail_entry.hardware_entry_failure_reason = 0;
5116
return -EINVAL;
5117
}
5118
}
5119
#endif
5120
5121
/*
5122
* Force online to 1 for the sake of old userspace which doesn't
5123
* set it.
5124
*/
5125
if (!vcpu->arch.online) {
5126
atomic_inc(&vcpu->arch.vcore->online_count);
5127
vcpu->arch.online = 1;
5128
}
5129
5130
kvmppc_core_prepare_to_enter(vcpu);
5131
5132
kvm = vcpu->kvm;
5133
atomic_inc(&kvm->arch.vcpus_running);
5134
/* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
5135
smp_mb();
5136
5137
msr = 0;
5138
if (IS_ENABLED(CONFIG_PPC_FPU))
5139
msr |= MSR_FP;
5140
if (cpu_has_feature(CPU_FTR_ALTIVEC))
5141
msr |= MSR_VEC;
5142
if (cpu_has_feature(CPU_FTR_VSX))
5143
msr |= MSR_VSX;
5144
if ((cpu_has_feature(CPU_FTR_TM) ||
5145
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
5146
(kvmppc_get_hfscr_hv(vcpu) & HFSCR_TM))
5147
msr |= MSR_TM;
5148
msr = msr_check_and_set(msr);
5149
5150
kvmppc_save_user_regs();
5151
5152
kvmppc_save_current_sprs();
5153
5154
if (!cpu_has_feature(CPU_FTR_ARCH_300))
5155
vcpu->arch.waitp = &vcpu->arch.vcore->wait;
5156
vcpu->arch.pgdir = kvm->mm->pgd;
5157
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
5158
5159
do {
5160
accumulate_time(vcpu, &vcpu->arch.guest_entry);
5161
if (cpu_has_feature(CPU_FTR_ARCH_300))
5162
r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
5163
vcpu->arch.vcore->lpcr);
5164
else
5165
r = kvmppc_run_vcpu(vcpu);
5166
5167
if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
5168
accumulate_time(vcpu, &vcpu->arch.hcall);
5169
5170
if (!kvmhv_is_nestedv2() && WARN_ON_ONCE(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
5171
/*
5172
* These should have been caught and reflected
5173
* into the guest by now. Final sanity check:
5174
* don't allow userspace to execute hcalls in
5175
* the hypervisor.
5176
*/
5177
r = RESUME_GUEST;
5178
continue;
5179
}
5180
trace_kvm_hcall_enter(vcpu);
5181
r = kvmppc_pseries_do_hcall(vcpu);
5182
trace_kvm_hcall_exit(vcpu, r);
5183
kvmppc_core_prepare_to_enter(vcpu);
5184
} else if (r == RESUME_PAGE_FAULT) {
5185
accumulate_time(vcpu, &vcpu->arch.pg_fault);
5186
srcu_idx = srcu_read_lock(&kvm->srcu);
5187
r = kvmppc_book3s_hv_page_fault(vcpu,
5188
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
5189
srcu_read_unlock(&kvm->srcu, srcu_idx);
5190
} else if (r == RESUME_PASSTHROUGH) {
5191
if (WARN_ON(xics_on_xive()))
5192
r = H_SUCCESS;
5193
else
5194
r = kvmppc_xics_rm_complete(vcpu, 0);
5195
}
5196
} while (is_kvmppc_resume_guest(r));
5197
accumulate_time(vcpu, &vcpu->arch.vcpu_exit);
5198
5199
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
5200
atomic_dec(&kvm->arch.vcpus_running);
5201
5202
srr_regs_clobbered();
5203
5204
end_timing(vcpu);
5205
5206
return r;
5207
}
5208
5209
static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
5210
int shift, int sllp)
5211
{
5212
(*sps)->page_shift = shift;
5213
(*sps)->slb_enc = sllp;
5214
(*sps)->enc[0].page_shift = shift;
5215
(*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
5216
/*
5217
* Add 16MB MPSS support (may get filtered out by userspace)
5218
*/
5219
if (shift != 24) {
5220
int penc = kvmppc_pgsize_lp_encoding(shift, 24);
5221
if (penc != -1) {
5222
(*sps)->enc[1].page_shift = 24;
5223
(*sps)->enc[1].pte_enc = penc;
5224
}
5225
}
5226
(*sps)++;
5227
}
5228
5229
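/*
 * Report the hash-MMU characteristics we support to userspace: storage
 * keys, 1T segments, the SLB size and the supported base page sizes
 * (4k, 64k, 16M).
 */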
static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
5230
struct kvm_ppc_smmu_info *info)
5231
{
5232
struct kvm_ppc_one_seg_page_size *sps;
5233
5234
/*
5235
* POWER7, POWER8 and POWER9 all support 32 storage keys for data.
5236
* POWER7 doesn't support keys for instruction accesses,
5237
* POWER8 and POWER9 do.
5238
*/
5239
info->data_keys = 32;
5240
info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
5241
5242
/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
5243
info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
5244
info->slb_size = 32;
5245
5246
/* We only support these sizes for now, and no multi-size segments */
5247
sps = &info->sps[0];
5248
kvmppc_add_seg_page_size(&sps, 12, 0);
5249
kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
5250
kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
5251
5252
/* If running as a nested hypervisor, we don't support HPT guests */
5253
if (kvmhv_on_pseries())
5254
info->flags |= KVM_PPC_NO_HASH;
5255
5256
return 0;
5257
}
5258
5259
/*
5260
* Get (and clear) the dirty memory log for a memory slot.
5261
*/
5262
static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
5263
struct kvm_dirty_log *log)
5264
{
5265
struct kvm_memslots *slots;
5266
struct kvm_memory_slot *memslot;
5267
int r;
5268
unsigned long n, i;
5269
unsigned long *buf, *p;
5270
struct kvm_vcpu *vcpu;
5271
5272
mutex_lock(&kvm->slots_lock);
5273
5274
r = -EINVAL;
5275
if (log->slot >= KVM_USER_MEM_SLOTS)
5276
goto out;
5277
5278
slots = kvm_memslots(kvm);
5279
memslot = id_to_memslot(slots, log->slot);
5280
r = -ENOENT;
5281
if (!memslot || !memslot->dirty_bitmap)
5282
goto out;
5283
5284
/*
5285
* Use second half of bitmap area because both HPT and radix
5286
* accumulate bits in the first half.
5287
*/
5288
n = kvm_dirty_bitmap_bytes(memslot);
5289
buf = memslot->dirty_bitmap + n / sizeof(long);
5290
memset(buf, 0, n);
5291
5292
if (kvm_is_radix(kvm))
5293
r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
5294
else
5295
r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
5296
if (r)
5297
goto out;
5298
5299
/*
5300
* We accumulate dirty bits in the first half of the
5301
* memslot's dirty_bitmap area, for when pages are paged
5302
* out or modified by the host directly. Pick up these
5303
* bits and add them to the map.
5304
*/
5305
p = memslot->dirty_bitmap;
5306
for (i = 0; i < n / sizeof(long); ++i)
5307
buf[i] |= xchg(&p[i], 0);
5308
5309
/* Harvest dirty bits from VPA and DTL updates */
5310
/* Note: we never modify the SLB shadow buffer areas */
5311
kvm_for_each_vcpu(i, vcpu, kvm) {
5312
spin_lock(&vcpu->arch.vpa_update_lock);
5313
kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
5314
kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
5315
spin_unlock(&vcpu->arch.vpa_update_lock);
5316
}
5317
5318
r = -EFAULT;
5319
if (copy_to_user(log->dirty_bitmap, buf, n))
5320
goto out;
5321
5322
r = 0;
5323
out:
5324
mutex_unlock(&kvm->slots_lock);
5325
return r;
5326
}
5327
5328
static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
5329
{
5330
vfree(slot->arch.rmap);
5331
slot->arch.rmap = NULL;
5332
}
5333
5334
static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
5335
const struct kvm_memory_slot *old,
5336
struct kvm_memory_slot *new,
5337
enum kvm_mr_change change)
5338
{
5339
if (change == KVM_MR_CREATE) {
5340
unsigned long size = array_size(new->npages, sizeof(*new->arch.rmap));
5341
5342
if ((size >> PAGE_SHIFT) > totalram_pages())
5343
return -ENOMEM;
5344
5345
new->arch.rmap = vzalloc(size);
5346
if (!new->arch.rmap)
5347
return -ENOMEM;
5348
} else if (change != KVM_MR_DELETE) {
5349
new->arch.rmap = old->arch.rmap;
5350
}
5351
5352
return 0;
5353
}
5354
5355
static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
5356
struct kvm_memory_slot *old,
5357
const struct kvm_memory_slot *new,
5358
enum kvm_mr_change change)
5359
{
5360
/*
5361
* If we are creating or modifying a memslot, it might make
5362
* some address that was previously cached as emulated
5363
* MMIO be no longer emulated MMIO, so invalidate
5364
* all the caches of emulated MMIO translations.
5365
*/
5366
if (change != KVM_MR_DELETE)
5367
atomic64_inc(&kvm->arch.mmio_update);
5368
5369
/*
5370
* For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
5371
* have already called kvm_arch_flush_shadow_memslot() to
5372
* flush shadow mappings. For KVM_MR_CREATE we have no
5373
* previous mappings. So the only case to handle is
5374
* KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
5375
* has been changed.
5376
* For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
5377
* to get rid of any THP PTEs in the partition-scoped page tables
5378
* so we can track dirtiness at the page level; we flush when
5379
* clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
5380
* using THP PTEs.
5381
*/
5382
if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
5383
((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
5384
kvmppc_radix_flush_memslot(kvm, old);
5385
/*
5386
* If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
5387
*/
5388
if (!kvm->arch.secure_guest)
5389
return;
5390
5391
switch (change) {
5392
case KVM_MR_CREATE:
5393
/*
5394
* @TODO kvmppc_uvmem_memslot_create() can fail and
5395
* return error. Fix this.
5396
*/
5397
kvmppc_uvmem_memslot_create(kvm, new);
5398
break;
5399
case KVM_MR_DELETE:
5400
kvmppc_uvmem_memslot_delete(kvm, old);
5401
break;
5402
default:
5403
/* TODO: Handle KVM_MR_MOVE */
5404
break;
5405
}
5406
}
5407
5408
/*
5409
* Update LPCR values in kvm->arch and in vcores.
5410
* Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
5411
* of kvm->arch.lpcr update).
5412
*/
5413
void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
5414
{
5415
long int i;
5416
u32 cores_done = 0;
5417
5418
if ((kvm->arch.lpcr & mask) == lpcr)
5419
return;
5420
5421
kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
5422
5423
for (i = 0; i < KVM_MAX_VCORES; ++i) {
5424
struct kvmppc_vcore *vc = kvm->arch.vcores[i];
5425
if (!vc)
5426
continue;
5427
5428
spin_lock(&vc->lock);
5429
vc->lpcr = (vc->lpcr & ~mask) | lpcr;
5430
verify_lpcr(kvm, vc->lpcr);
5431
spin_unlock(&vc->lock);
5432
if (++cores_done >= kvm->arch.online_vcores)
5433
break;
5434
}
5435
5436
if (kvmhv_is_nestedv2()) {
5437
struct kvm_vcpu *vcpu;
5438
5439
kvm_for_each_vcpu(i, vcpu, kvm) {
5440
kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LPCR);
5441
}
5442
}
5443
}
5444
5445
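/*
 * Write this guest's partition-table entry: for an HPT guest, dword 0
 * holds the VRMA page-size encoding and the HTAB origin and size, while
 * for a radix guest it points to the partition-scoped page table; dword 1
 * is the process-table pointer.
 */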
void kvmppc_setup_partition_table(struct kvm *kvm)
5446
{
5447
unsigned long dw0, dw1;
5448
5449
if (!kvm_is_radix(kvm)) {
5450
/* PS field - page size for VRMA */
5451
dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
5452
((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
5453
/* HTABSIZE and HTABORG fields */
5454
dw0 |= kvm->arch.sdr1;
5455
5456
/* Second dword as set by userspace */
5457
dw1 = kvm->arch.process_table;
5458
} else {
5459
dw0 = PATB_HR | radix__get_tree_size() |
5460
__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
5461
dw1 = PATB_GR | kvm->arch.process_table;
5462
}
5463
kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
5464
}
5465
5466
/*
5467
* Set up HPT (hashed page table) and RMA (real-mode area).
5468
* Must be called with kvm->arch.mmu_setup_lock held.
5469
*/
5470
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
5471
{
5472
int err = 0;
5473
struct kvm *kvm = vcpu->kvm;
5474
unsigned long hva;
5475
struct kvm_memory_slot *memslot;
5476
struct vm_area_struct *vma;
5477
unsigned long lpcr = 0, senc;
5478
unsigned long psize, porder;
5479
int srcu_idx;
5480
5481
/* Allocate hashed page table (if not done already) and reset it */
5482
if (!kvm->arch.hpt.virt) {
5483
int order = KVM_DEFAULT_HPT_ORDER;
5484
struct kvm_hpt_info info;
5485
5486
err = kvmppc_allocate_hpt(&info, order);
5487
/* If we get here, it means userspace didn't specify a
5488
* size explicitly. So, try successively smaller
5489
* sizes if the default failed. */
5490
while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
5491
err = kvmppc_allocate_hpt(&info, order);
5492
5493
if (err < 0) {
5494
pr_err("KVM: Couldn't alloc HPT\n");
5495
goto out;
5496
}
5497
5498
kvmppc_set_hpt(kvm, &info);
5499
}
5500
5501
/* Look up the memslot for guest physical address 0 */
5502
srcu_idx = srcu_read_lock(&kvm->srcu);
5503
memslot = gfn_to_memslot(kvm, 0);
5504
5505
/* We must have some memory at 0 by now */
5506
err = -EINVAL;
5507
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
5508
goto out_srcu;
5509
5510
/* Look up the VMA for the start of this memory slot */
5511
hva = memslot->userspace_addr;
5512
mmap_read_lock(kvm->mm);
5513
vma = vma_lookup(kvm->mm, hva);
5514
if (!vma || (vma->vm_flags & VM_IO))
5515
goto up_out;
5516
5517
psize = vma_kernel_pagesize(vma);
5518
5519
mmap_read_unlock(kvm->mm);
5520
5521
/* We can handle 4k, 64k or 16M pages in the VRMA */
5522
if (psize >= 0x1000000)
5523
psize = 0x1000000;
5524
else if (psize >= 0x10000)
5525
psize = 0x10000;
5526
else
5527
psize = 0x1000;
5528
porder = __ilog2(psize);
5529
5530
senc = slb_pgsize_encoding(psize);
5531
kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
5532
(VRMA_VSID << SLB_VSID_SHIFT_1T);
5533
/* Create HPTEs in the hash page table for the VRMA */
5534
kvmppc_map_vrma(vcpu, memslot, porder);
5535
5536
/* Update VRMASD field in the LPCR */
5537
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
5538
/* the -4 is to account for senc values starting at 0x10 */
5539
lpcr = senc << (LPCR_VRMASD_SH - 4);
5540
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
5541
}
5542
5543
/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
5544
smp_wmb();
5545
err = 0;
5546
out_srcu:
5547
srcu_read_unlock(&kvm->srcu, srcu_idx);
5548
out:
5549
return err;
5550
5551
up_out:
5552
mmap_read_unlock(kvm->mm);
5553
goto out_srcu;
5554
}
5555
5556
/*
5557
* Must be called with kvm->arch.mmu_setup_lock held and
5558
* mmu_ready = 0 and no vcpus running.
5559
*/
5560
int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
5561
{
5562
unsigned long lpcr, lpcr_mask;
5563
5564
if (nesting_enabled(kvm))
5565
kvmhv_release_all_nested(kvm);
5566
kvmppc_rmap_reset(kvm);
5567
kvm->arch.process_table = 0;
5568
/* Mutual exclusion with kvm_unmap_gfn_range etc. */
5569
spin_lock(&kvm->mmu_lock);
5570
kvm->arch.radix = 0;
5571
spin_unlock(&kvm->mmu_lock);
5572
kvmppc_free_radix(kvm);
5573
5574
lpcr = LPCR_VPM1;
5575
lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
5576
if (cpu_has_feature(CPU_FTR_ARCH_31))
5577
lpcr_mask |= LPCR_HAIL;
5578
kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
5579
5580
return 0;
5581
}
5582
5583
/*
5584
* Must be called with kvm->arch.mmu_setup_lock held and
5585
* mmu_ready = 0 and no vcpus running.
5586
*/
5587
int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
5588
{
5589
unsigned long lpcr, lpcr_mask;
5590
int err;
5591
5592
err = kvmppc_init_vm_radix(kvm);
5593
if (err)
5594
return err;
5595
kvmppc_rmap_reset(kvm);
5596
/* Mutual exclusion with kvm_unmap_gfn_range etc. */
5597
spin_lock(&kvm->mmu_lock);
5598
kvm->arch.radix = 1;
5599
spin_unlock(&kvm->mmu_lock);
5600
kvmppc_free_hpt(&kvm->arch.hpt);
5601
5602
lpcr = LPCR_UPRT | LPCR_GTSE | LPCR_HR;
5603
lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
5604
if (cpu_has_feature(CPU_FTR_ARCH_31)) {
5605
lpcr_mask |= LPCR_HAIL;
5606
if (cpu_has_feature(CPU_FTR_HVMODE) &&
5607
(kvm->arch.host_lpcr & LPCR_HAIL))
5608
lpcr |= LPCR_HAIL;
5609
}
5610
kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
5611
5612
return 0;
5613
}
5614
5615
#ifdef CONFIG_KVM_XICS
5616
/*
5617
* Allocate a per-core structure for managing state about which cores are
5618
* running in the host versus the guest and for exchanging data between
5619
* real mode KVM and CPUs running in the host.
5620
* This is only done for the first VM.
5621
* The allocated structure stays even if all VMs have stopped.
5622
* It is only freed when the kvm-hv module is unloaded.
5623
* It's OK for this routine to fail; we just don't support host
5624
* core operations like redirecting H_IPI wakeups.
5625
*/
5626
void kvmppc_alloc_host_rm_ops(void)
5627
{
5628
struct kvmppc_host_rm_ops *ops;
5629
unsigned long l_ops;
5630
int cpu, core;
5631
int size;
5632
5633
if (cpu_has_feature(CPU_FTR_ARCH_300))
5634
return;
5635
5636
/* Not the first time here ? */
5637
if (kvmppc_host_rm_ops_hv != NULL)
5638
return;
5639
5640
ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
5641
if (!ops)
5642
return;
5643
5644
size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
5645
ops->rm_core = kzalloc(size, GFP_KERNEL);
5646
5647
if (!ops->rm_core) {
5648
kfree(ops);
5649
return;
5650
}
5651
5652
cpus_read_lock();
5653
5654
for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
5655
if (!cpu_online(cpu))
5656
continue;
5657
5658
core = cpu >> threads_shift;
5659
ops->rm_core[core].rm_state.in_host = 1;
5660
}
5661
5662
ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
5663
5664
/*
5665
* Make the contents of the kvmppc_host_rm_ops structure visible
5666
* to other CPUs before we assign it to the global variable.
5667
* Do an atomic assignment (no locks used here), but if someone
5668
* beats us to it, just free our copy and return.
5669
*/
5670
smp_wmb();
5671
l_ops = (unsigned long) ops;
5672
5673
if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
5674
cpus_read_unlock();
5675
kfree(ops->rm_core);
5676
kfree(ops);
5677
return;
5678
}
5679
5680
cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
5681
"ppc/kvm_book3s:prepare",
5682
kvmppc_set_host_core,
5683
kvmppc_clear_host_core);
5684
cpus_read_unlock();
5685
}
5686
5687
void kvmppc_free_host_rm_ops(void)
{
	if (kvmppc_host_rm_ops_hv) {
		cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
		kfree(kvmppc_host_rm_ops_hv->rm_core);
		kfree(kvmppc_host_rm_ops_hv);
		kvmppc_host_rm_ops_hv = NULL;
	}
}
#endif

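/*
 * Set up a new HV-mode VM: allocate the logical partition ID (or, for a
 * nestedv2 guest hypervisor, obtain one from the host via
 * plpar_guest_create()), build the initial LPCR value and MMU mode
 * (radix if the host runs radix), work out the TLB geometry, and pick
 * the default SMT mode for this processor generation.
 */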
static int kvmppc_core_init_vm_hv(struct kvm *kvm)
{
	unsigned long lpcr, lpid;
	int ret;

	mutex_init(&kvm->arch.uvmem_lock);
	INIT_LIST_HEAD(&kvm->arch.uvmem_pfns);
	mutex_init(&kvm->arch.mmu_setup_lock);

	/* Allocate the guest's logical partition ID */

	if (!kvmhv_is_nestedv2()) {
		lpid = kvmppc_alloc_lpid();
		if ((long)lpid < 0)
			return -ENOMEM;
		kvm->arch.lpid = lpid;
	}

	kvmppc_alloc_host_rm_ops();

	kvmhv_vm_nested_init(kvm);

	if (kvmhv_is_nestedv2()) {
		long rc;
		unsigned long guest_id;

		rc = plpar_guest_create(0, &guest_id);

		if (rc != H_SUCCESS)
			pr_err("KVM: Create Guest hcall failed, rc=%ld\n", rc);

		switch (rc) {
		case H_PARAMETER:
		case H_FUNCTION:
		case H_STATE:
			return -EINVAL;
		case H_NOT_ENOUGH_RESOURCES:
		case H_ABORTED:
			return -ENOMEM;
		case H_AUTHORITY:
			return -EPERM;
		case H_NOT_AVAILABLE:
			return -EBUSY;
		}
		kvm->arch.lpid = guest_id;
	}

	/*
	 * Since we don't flush the TLB when tearing down a VM,
	 * and this lpid might have previously been used,
	 * make sure we flush on each core before running the new VM.
	 * On POWER9, the tlbie in mmu_partition_table_set_entry()
	 * does this flush for us.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		cpumask_setall(&kvm->arch.need_tlb_flush);

	/* Start out with the default set of hcalls enabled */
	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
	       sizeof(kvm->arch.enabled_hcalls));

	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);

	/* Init LPCR for virtual RMA mode */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		kvm->arch.host_lpid = mfspr(SPRN_LPID);
		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
		lpcr &= LPCR_PECE | LPCR_LPES;
	} else {
		/*
		 * The L2 LPES mode will be set by the L0 according to whether
		 * or not it needs to take external interrupts in HV mode.
		 */
		lpcr = 0;
	}
	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
		LPCR_VPM0 | LPCR_VPM1;
	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
		(VRMA_VSID << SLB_VSID_SHIFT_1T);
	/* On POWER8 turn on online bit to enable PURR/SPURR */
	if (cpu_has_feature(CPU_FTR_ARCH_207S))
		lpcr |= LPCR_ONL;
	/*
	 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed).
	 * Set HVICE bit to enable hypervisor virtualization interrupts.
	 * Set HEIC to prevent OS interrupts from going to the hypervisor
	 * (should be unnecessary, but better safe than sorry in case we
	 * re-enable EE in HV mode with this LPCR still set).
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		lpcr &= ~LPCR_VPM0;
		lpcr |= LPCR_HVICE | LPCR_HEIC;

		/*
		 * If xive is enabled, we route 0x500 interrupts directly
		 * to the guest.
		 */
		if (xics_on_xive())
			lpcr |= LPCR_LPES;
	}

	/*
	 * If the host uses radix, the guest starts out as radix.
	 */
	if (radix_enabled()) {
		kvm->arch.radix = 1;
		kvm->arch.mmu_ready = 1;
		lpcr &= ~LPCR_VPM1;
		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
		if (cpu_has_feature(CPU_FTR_HVMODE) &&
		    cpu_has_feature(CPU_FTR_ARCH_31) &&
		    (kvm->arch.host_lpcr & LPCR_HAIL))
			lpcr |= LPCR_HAIL;
		ret = kvmppc_init_vm_radix(kvm);
		if (ret) {
			if (kvmhv_is_nestedv2())
				plpar_guest_delete(0, kvm->arch.lpid);
			else
				kvmppc_free_lpid(kvm->arch.lpid);
			return ret;
		}
		kvmppc_setup_partition_table(kvm);
	}

	verify_lpcr(kvm, lpcr);
	kvm->arch.lpcr = lpcr;

	/* Initialization for future HPT resizes */
	kvm->arch.resize_hpt = NULL;

	/*
	 * Work out how many sets the TLB has, for the use of
	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
		/*
		 * P10 will flush all the congruence classes with a single tlbiel
		 */
		kvm->arch.tlb_sets = 1;
	} else if (radix_enabled())
		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
	else if (cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
	else
		kvm->arch.tlb_sets = POWER7_TLB_SETS;		/* 128 */

	/*
	 * Track that we now have an HV mode VM active. This blocks secondary
	 * CPU threads from coming online.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm_hv_vm_activated();

	/*
	 * Initialize smt_mode depending on processor.
	 * POWER8 and earlier have to use "strict" threading, where
	 * all vCPUs in a vcore have to run on the same (sub)core,
	 * whereas on POWER9 the threads can each run a different
	 * guest.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.smt_mode = threads_per_subcore;
	else
		kvm->arch.smt_mode = 1;
	kvm->arch.emul_smt_mode = 1;

	return 0;
}

static int kvmppc_arch_create_vm_debugfs_hv(struct kvm *kvm)
{
	kvmppc_mmu_debugfs_init(kvm);
	if (radix_enabled())
		kvmhv_radix_debugfs_init(kvm);
	return 0;
}

static void kvmppc_free_vcores(struct kvm *kvm)
{
	long int i;

	for (i = 0; i < KVM_MAX_VCORES; ++i)
		kfree(kvm->arch.vcores[i]);
	kvm->arch.online_vcores = 0;
}

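/*
 * Tear down an HV-mode VM: free the virtual cores and the guest MMU state
 * (radix page tables or HPT), release any nested-guest and secure-guest
 * state, clear the partition-table entry and give back the LPID.
 */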
static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm_hv_vm_deactivated();

	kvmppc_free_vcores(kvm);

	if (kvm_is_radix(kvm))
		kvmppc_free_radix(kvm);
	else
		kvmppc_free_hpt(&kvm->arch.hpt);

	/* Perform global invalidation and return lpid to the pool */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		if (nesting_enabled(kvm))
			kvmhv_release_all_nested(kvm);
		kvm->arch.process_table = 0;
		if (kvm->arch.secure_guest)
			uv_svm_terminate(kvm->arch.lpid);
		if (!kvmhv_is_nestedv2())
			kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
	}

	if (kvmhv_is_nestedv2()) {
		kvmhv_flush_lpid(kvm->arch.lpid);
		plpar_guest_delete(0, kvm->arch.lpid);
	} else {
		kvmppc_free_lpid(kvm->arch.lpid);
	}

	kvmppc_free_pimap(kvm);
}

/* We don't need to emulate any privileged instructions or dcbz */
static int kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu,
				     unsigned int inst, int *advance)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
					ulong spr_val)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
					ulong *spr_val)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_check_processor_compat_hv(void)
{
	if (cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_ARCH_206))
		return 0;

	/* POWER9 in radix mode is capable of being a nested hypervisor. */
	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
		return 0;

	return -EIO;
}

#ifdef CONFIG_KVM_XICS

void kvmppc_free_pimap(struct kvm *kvm)
{
	kfree(kvm->arch.pimap);
}

static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
{
	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
}

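/*
 * Map a host interrupt onto a guest interrupt source for passthrough:
 * record the (guest GSI, host IRQ) pair in the per-VM pimap so the
 * real-mode code can EOI it, and tell XIVE or XICS about the mapping.
 */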
static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
{
	struct irq_desc *desc;
	struct kvmppc_irq_map *irq_map;
	struct kvmppc_passthru_irqmap *pimap;
	struct irq_chip *chip;
	int i, rc = 0;
	struct irq_data *host_data;

	if (!kvm_irq_bypass)
		return 1;

	desc = irq_to_desc(host_irq);
	if (!desc)
		return -EIO;

	mutex_lock(&kvm->lock);

	pimap = kvm->arch.pimap;
	if (pimap == NULL) {
		/* First call, allocate structure to hold IRQ map */
		pimap = kvmppc_alloc_pimap();
		if (pimap == NULL) {
			mutex_unlock(&kvm->lock);
			return -ENOMEM;
		}
		kvm->arch.pimap = pimap;
	}

	/*
	 * For now, we only support interrupts for which the EOI operation
	 * is an OPAL call followed by a write to XIRR, since that's
	 * what our real-mode EOI code does, or a XIVE interrupt
	 */
	chip = irq_data_get_irq_chip(&desc->irq_data);
	if (!chip || !is_pnv_opal_msi(chip)) {
		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
			host_irq, guest_gsi);
		mutex_unlock(&kvm->lock);
		return -ENOENT;
	}

	/*
	 * See if we already have an entry for this guest IRQ number.
	 * If it's mapped to a hardware IRQ number, that's an error,
	 * otherwise re-use this entry.
	 */
	for (i = 0; i < pimap->n_mapped; i++) {
		if (guest_gsi == pimap->mapped[i].v_hwirq) {
			if (pimap->mapped[i].r_hwirq) {
				mutex_unlock(&kvm->lock);
				return -EINVAL;
			}
			break;
		}
	}

	if (i == KVMPPC_PIRQ_MAPPED) {
		mutex_unlock(&kvm->lock);
		return -EAGAIN;		/* table is full */
	}

	irq_map = &pimap->mapped[i];

	irq_map->v_hwirq = guest_gsi;
	irq_map->desc = desc;

	/*
	 * Order the above two stores before the next to serialize with
	 * the KVM real mode handler.
	 */
	smp_wmb();

	/*
	 * The 'host_irq' number is mapped in the PCI-MSI domain but
	 * the underlying calls, which will EOI the interrupt in real
	 * mode, need an HW IRQ number mapped in the XICS IRQ domain.
	 */
	host_data = irq_domain_get_irq_data(irq_get_default_domain(), host_irq);
	irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);

	if (i == pimap->n_mapped)
		pimap->n_mapped++;

	if (xics_on_xive())
		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
	else
		kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
	if (rc)
		irq_map->r_hwirq = 0;

	mutex_unlock(&kvm->lock);

	return 0;
}

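/*
 * Undo a passthrough mapping set up by kvmppc_set_passthru_irq(): tell
 * XIVE or XICS that the guest GSI is no longer mapped and invalidate the
 * pimap entry (the pimap itself is only freed when the VM is destroyed).
 */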
static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
{
	struct irq_desc *desc;
	struct kvmppc_passthru_irqmap *pimap;
	int i, rc = 0;

	if (!kvm_irq_bypass)
		return 0;

	desc = irq_to_desc(host_irq);
	if (!desc)
		return -EIO;

	mutex_lock(&kvm->lock);
	if (!kvm->arch.pimap)
		goto unlock;

	pimap = kvm->arch.pimap;

	for (i = 0; i < pimap->n_mapped; i++) {
		if (guest_gsi == pimap->mapped[i].v_hwirq)
			break;
	}

	if (i == pimap->n_mapped) {
		mutex_unlock(&kvm->lock);
		return -ENODEV;
	}

	if (xics_on_xive())
		rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
	else
		kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);

	/* invalidate the entry (what to do on error from the above ?) */
	pimap->mapped[i].r_hwirq = 0;

	/*
	 * We don't free this structure even when the count goes to
	 * zero. The structure is freed when we destroy the VM.
	 */
unlock:
	mutex_unlock(&kvm->lock);
	return rc;
}

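/*
 * irqbypass callbacks: wire an irqfd's producer (the host IRQ) to the
 * guest GSI when a producer is registered, and fall back to the normal
 * host interrupt path when it is unregistered.
 */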
static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
					     struct irq_bypass_producer *prod)
{
	int ret = 0;
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	irqfd->producer = prod;

	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
	if (ret)
		pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
			prod->irq, irqfd->gsi, ret);

	return ret;
}

static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
					      struct irq_bypass_producer *prod)
{
	int ret;
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	irqfd->producer = NULL;

	/*
	 * When the producer of the consumer is unregistered, we change back
	 * to the default external interrupt handling mode: KVM real mode
	 * will switch back to the host.
	 */
	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
	if (ret)
		pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
			prod->irq, irqfd->gsi, ret);
}
#endif

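/*
 * VM-scope ioctls specific to HV KVM: HPT allocation, the file descriptor
 * for reading the HPT, and HPT resizing.
 */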
static int kvm_arch_vm_ioctl_hv(struct file *filp,
				unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm __maybe_unused = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	switch (ioctl) {

	case KVM_PPC_ALLOCATE_HTAB: {
		u32 htab_order;

		/* If we're a nested hypervisor, we currently only support radix */
		if (kvmhv_on_pseries()) {
			r = -EOPNOTSUPP;
			break;
		}

		r = -EFAULT;
		if (get_user(htab_order, (u32 __user *)argp))
			break;
		r = kvmppc_alloc_reset_hpt(kvm, htab_order);
		if (r)
			break;
		r = 0;
		break;
	}

	case KVM_PPC_GET_HTAB_FD: {
		struct kvm_get_htab_fd ghf;

		r = -EFAULT;
		if (copy_from_user(&ghf, argp, sizeof(ghf)))
			break;
		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
		break;
	}

	case KVM_PPC_RESIZE_HPT_PREPARE: {
		struct kvm_ppc_resize_hpt rhpt;

		r = -EFAULT;
		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
			break;

		r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
		break;
	}

	case KVM_PPC_RESIZE_HPT_COMMIT: {
		struct kvm_ppc_resize_hpt rhpt;

		r = -EFAULT;
		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
			break;

		r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
		break;
	}

	default:
		r = -ENOTTY;
	}

	return r;
}

/*
 * List of hcall numbers to enable by default.
 * For compatibility with old userspace, we enable by default
 * all hcalls that were implemented before the hcall-enabling
 * facility was added. Note this list should not include H_RTAS.
 */
static unsigned int default_hcall_list[] = {
	H_REMOVE,
	H_ENTER,
	H_READ,
	H_PROTECT,
	H_BULK_REMOVE,
#ifdef CONFIG_SPAPR_TCE_IOMMU
	H_GET_TCE,
	H_PUT_TCE,
#endif
	H_SET_DABR,
	H_SET_XDABR,
	H_CEDE,
	H_PROD,
	H_CONFER,
	H_REGISTER_VPA,
#ifdef CONFIG_KVM_XICS
	H_EOI,
	H_CPPR,
	H_IPI,
	H_IPOLL,
	H_XIRR,
	H_XIRR_X,
#endif
	0
};

static void init_default_hcalls(void)
{
	int i;
	unsigned int hcall;

	for (i = 0; default_hcall_list[i]; ++i) {
		hcall = default_hcall_list[i];
		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
		__set_bit(hcall / 4, default_enabled_hcalls);
	}
}

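/*
 * Validate and apply an MMU configuration request from userspace
 * (struct kvm_ppc_mmuv3_cfg): switch the guest between HPT and radix if
 * needed, install the new process table, and update LPCR[GTSE].
 */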
static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{
	unsigned long lpcr;
	int radix;
	int err;

	/* If not on a POWER9, reject it */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return -ENODEV;

	/* If any unknown flags set, reject it */
	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
		return -EINVAL;

	/* GR (guest radix) bit in process_table field must match */
	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
	if (!!(cfg->process_table & PATB_GR) != radix)
		return -EINVAL;

	/* Process table size field must be reasonable, i.e. <= 24 */
	if ((cfg->process_table & PRTS_MASK) > 24)
		return -EINVAL;

	/* We can change a guest to/from radix now, if the host is radix */
	if (radix && !radix_enabled())
		return -EINVAL;

	/* If we're a nested hypervisor, we currently only support radix */
	if (kvmhv_on_pseries() && !radix)
		return -EINVAL;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	if (radix != kvm_is_radix(kvm)) {
		if (kvm->arch.mmu_ready) {
			kvm->arch.mmu_ready = 0;
			/* order mmu_ready vs. vcpus_running */
			smp_mb();
			if (atomic_read(&kvm->arch.vcpus_running)) {
				kvm->arch.mmu_ready = 1;
				err = -EBUSY;
				goto out_unlock;
			}
		}
		if (radix)
			err = kvmppc_switch_mmu_to_radix(kvm);
		else
			err = kvmppc_switch_mmu_to_hpt(kvm);
		if (err)
			goto out_unlock;
	}

	kvm->arch.process_table = cfg->process_table;
	kvmppc_setup_partition_table(kvm);

	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
	err = 0;

out_unlock:
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return err;
}

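/*
 * Enable (or test for) the nested-HV capability: only possible on a
 * radix POWER9 or later host with the "nested" module parameter set,
 * and not when we are already running as a nestedv2 guest hypervisor.
 */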
static int kvmhv_enable_nested(struct kvm *kvm)
{
	if (!nested)
		return -EPERM;
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return -ENODEV;
	if (!radix_enabled())
		return -ENODEV;
	if (kvmhv_is_nestedv2())
		return -ENODEV;

	/* kvm == NULL means the caller is testing if the capability exists */
	if (kvm)
		kvm->arch.nested_enable = true;
	return 0;
}

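/*
 * Read from / write to a guest effective address on behalf of the host,
 * using the radix quadrant access helpers; for now this is the only way
 * nested guest memory can be reached.
 */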
static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
				 int size)
{
	int rc = -EINVAL;

	if (kvmhv_vcpu_is_radix(vcpu)) {
		rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);

		if (rc > 0)
			rc = -EINVAL;
	}

	/* For now quadrants are the only way to access nested guest memory */
	if (rc && vcpu->arch.nested)
		rc = -EAGAIN;

	return rc;
}

static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
				int size)
{
	int rc = -EINVAL;

	if (kvmhv_vcpu_is_radix(vcpu)) {
		rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);

		if (rc > 0)
			rc = -EINVAL;
	}

	/* For now quadrants are the only way to access nested guest memory */
	if (rc && vcpu->arch.nested)
		rc = -EAGAIN;

	return rc;
}

static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa)
{
	unpin_vpa(kvm, vpa);
	vpa->gpa = 0;
	vpa->pinned_addr = NULL;
	vpa->dirty = false;
	vpa->update_pending = 0;
}

/*
 * Enable a guest to become a secure VM, or test whether
 * that could be enabled.
 * Called when the KVM_CAP_PPC_SECURE_GUEST capability is
 * tested (kvm == NULL) or enabled (kvm != NULL).
 */
static int kvmhv_enable_svm(struct kvm *kvm)
{
	if (!kvmppc_uvmem_available())
		return -EINVAL;
	if (kvm)
		kvm->arch.svm_enabled = 1;
	return 0;
}

/*
 * IOCTL handler to turn off secure mode of guest
 *
 * - Release all device pages
 * - Issue ucall to terminate the guest on the UV side
 * - Unpin the VPA pages.
 * - Reinit the partition scoped page tables
 */
static int kvmhv_svm_off(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int mmu_was_ready;
	int srcu_idx;
	int ret = 0;
	unsigned long i;

	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
		return ret;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	mmu_was_ready = kvm->arch.mmu_ready;
	if (kvm->arch.mmu_ready) {
		kvm->arch.mmu_ready = 0;
		/* order mmu_ready vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.mmu_ready = 1;
			ret = -EBUSY;
			goto out;
		}
	}

	srcu_idx = srcu_read_lock(&kvm->srcu);
	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		struct kvm_memory_slot *memslot;
		struct kvm_memslots *slots = __kvm_memslots(kvm, i);
		int bkt;

		if (!slots)
			continue;

		kvm_for_each_memslot(memslot, bkt, slots) {
			kvmppc_uvmem_drop_pages(memslot, kvm, true);
			uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
		}
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	ret = uv_svm_terminate(kvm->arch.lpid);
	if (ret != U_SUCCESS) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * When a secure guest is reset, all the guest pages are sent
	 * to UV via UV_PAGE_IN before the non-boot vcpus get a
	 * chance to run and unpin their VPA pages. Unpinning of all
	 * VPA pages is done here explicitly so that VPA pages
	 * can be migrated to the secure side.
	 *
	 * This is required for the secure SMP guest to reboot
	 * correctly.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		spin_lock(&vcpu->arch.vpa_update_lock);
		unpin_vpa_reset(kvm, &vcpu->arch.dtl);
		unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow);
		unpin_vpa_reset(kvm, &vcpu->arch.vpa);
		spin_unlock(&vcpu->arch.vpa_update_lock);
	}

	kvmppc_setup_partition_table(kvm);
	kvm->arch.secure_guest = 0;
	kvm->arch.mmu_ready = mmu_was_ready;
out:
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return ret;
}

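/*
 * Enable (or test for) the second DAWR/DAWRX watchpoint pair; only
 * available on CPUs with the DAWR1 feature.
 */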
static int kvmhv_enable_dawr1(struct kvm *kvm)
{
	if (!cpu_has_feature(CPU_FTR_DAWR1))
		return -ENODEV;

	/* kvm == NULL means the caller is testing if the capability exists */
	if (kvm)
		kvm->arch.dawr1_enabled = true;
	return 0;
}

static bool kvmppc_hash_v3_possible(void)
{
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return false;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return false;

	/*
	 * POWER9 chips before version 2.02 can't have some threads in
	 * HPT mode and some in radix mode on the same core.
	 */
	if (radix_enabled()) {
		unsigned int pvr = mfspr(SPRN_PVR);
		if ((pvr >> 16) == PVR_POWER9 &&
		    (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
		     ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
			return false;
	}

	return true;
}

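/* The ops table handed to the generic Book3S code for HV-mode guests. */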
static struct kvmppc_ops kvm_ops_hv = {
	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
	.get_one_reg = kvmppc_get_one_reg_hv,
	.set_one_reg = kvmppc_set_one_reg_hv,
	.vcpu_load = kvmppc_core_vcpu_load_hv,
	.vcpu_put = kvmppc_core_vcpu_put_hv,
	.inject_interrupt = kvmppc_inject_interrupt_hv,
	.set_msr = kvmppc_set_msr_hv,
	.vcpu_run = kvmppc_vcpu_run_hv,
	.vcpu_create = kvmppc_core_vcpu_create_hv,
	.vcpu_free = kvmppc_core_vcpu_free_hv,
	.check_requests = kvmppc_core_check_requests_hv,
	.get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv,
	.flush_memslot = kvmppc_core_flush_memslot_hv,
	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
	.commit_memory_region = kvmppc_core_commit_memory_region_hv,
	.unmap_gfn_range = kvm_unmap_gfn_range_hv,
	.age_gfn = kvm_age_gfn_hv,
	.test_age_gfn = kvm_test_age_gfn_hv,
	.free_memslot = kvmppc_core_free_memslot_hv,
	.init_vm = kvmppc_core_init_vm_hv,
	.destroy_vm = kvmppc_core_destroy_vm_hv,
	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
	.emulate_op = kvmppc_core_emulate_op_hv,
	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
	.arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
	.hcall_implemented = kvmppc_hcall_impl_hv,
	.configure_mmu = kvmhv_configure_mmu,
	.get_rmmu_info = kvmhv_get_rmmu_info,
	.set_smt_mode = kvmhv_set_smt_mode,
	.enable_nested = kvmhv_enable_nested,
	.load_from_eaddr = kvmhv_load_from_eaddr,
	.store_to_eaddr = kvmhv_store_to_eaddr,
	.enable_svm = kvmhv_enable_svm,
	.svm_off = kvmhv_svm_off,
	.enable_dawr1 = kvmhv_enable_dawr1,
	.hash_v3_possible = kvmppc_hash_v3_possible,
	.create_vcpu_debugfs = kvmppc_arch_create_vcpu_debugfs_hv,
	.create_vm_debugfs = kvmppc_arch_create_vm_debugfs_hv,
};

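/*
 * Allocate one sibling_subcore_state per core and point the paca of every
 * thread in that core at it, giving the threads a shared structure to
 * coordinate through when the core is split into subcores.
 */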
static int kvm_init_subcore_bitmap(void)
{
	int i, j;
	int nr_cores = cpu_nr_cores();
	struct sibling_subcore_state *sibling_subcore_state;

	for (i = 0; i < nr_cores; i++) {
		int first_cpu = i * threads_per_core;
		int node = cpu_to_node(first_cpu);

		/* Ignore if it is already allocated. */
		if (paca_ptrs[first_cpu]->sibling_subcore_state)
			continue;

		sibling_subcore_state =
			kzalloc_node(sizeof(struct sibling_subcore_state),
				     GFP_KERNEL, node);
		if (!sibling_subcore_state)
			return -ENOMEM;

		for (j = 0; j < threads_per_core; j++) {
			int cpu = first_cpu + j;

			paca_ptrs[cpu]->sibling_subcore_state =
						sibling_subcore_state;
		}
	}
	return 0;
}

static int kvmppc_radix_possible(void)
{
	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
}

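/*
 * Module init: check that the CPU can host HV guests, set up nested and
 * radix support, the subcore state (pre-POWER9), XICS access, the default
 * hcall list and, where possible, IRQ bypass, then register kvm_ops_hv
 * with the generic PPC KVM code.
 */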
static int kvmppc_book3s_init_hv(void)
{
	int r;

	if (!tlbie_capable) {
		pr_err("KVM-HV: Host does not support TLBIE\n");
		return -ENODEV;
	}

	/*
	 * FIXME!! Do we need to check on all cpus ?
	 */
	r = kvmppc_core_check_processor_compat_hv();
	if (r < 0)
		return -ENODEV;

	r = kvmhv_nested_init();
	if (r)
		return r;

	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
		r = kvm_init_subcore_bitmap();
		if (r)
			goto err;
	}

	/*
	 * We need a way of accessing the XICS interrupt controller,
	 * either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or
	 * indirectly, via OPAL.
	 */
#ifdef CONFIG_SMP
	if (!xics_on_xive() && !kvmhv_on_pseries() &&
	    !local_paca->kvm_hstate.xics_phys) {
		struct device_node *np;

		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
		if (!np) {
			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
			r = -ENODEV;
			goto err;
		}
		/* presence of intc confirmed - node can be dropped again */
		of_node_put(np);
	}
#endif

	init_default_hcalls();

	init_vcore_lists();

	r = kvmppc_mmu_hv_init();
	if (r)
		goto err;

	if (kvmppc_radix_possible()) {
		r = kvmppc_radix_init();
		if (r)
			goto err;
	}

	r = kvmppc_uvmem_init();
	if (r < 0) {
		pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
		return r;
	}

#if defined(CONFIG_KVM_XICS)
	/*
	 * IRQ bypass is supported only for interrupts whose EOI operations are
	 * handled via OPAL calls. Therefore, register IRQ bypass handlers
	 * exclusively for PowerNV KVM when booted with 'xive=off', indicating
	 * the use of the emulated XICS interrupt controller.
	 */
	if (!kvmhv_on_pseries()) {
		pr_info("KVM-HV: Enabling IRQ bypass\n");
		kvm_ops_hv.irq_bypass_add_producer =
				kvmppc_irq_bypass_add_producer_hv;
		kvm_ops_hv.irq_bypass_del_producer =
				kvmppc_irq_bypass_del_producer_hv;
	}
#endif

	kvm_ops_hv.owner = THIS_MODULE;
	kvmppc_hv_ops = &kvm_ops_hv;

	return 0;

err:
	kvmhv_nested_exit();
	kvmppc_radix_exit();

	return r;
}

static void kvmppc_book3s_exit_hv(void)
{
	kvmppc_uvmem_free();
	kvmppc_free_host_rm_ops();
	if (kvmppc_radix_possible())
		kvmppc_radix_exit();
	kvmppc_hv_ops = NULL;
	kvmhv_nested_exit();
}

module_init(kvmppc_book3s_init_hv);
module_exit(kvmppc_book3s_exit_hv);
MODULE_DESCRIPTION("KVM on Book3S (POWER8 and later) in hypervisor mode");
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");