GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/kvm/book3s_hv.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <[email protected]>
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Authors:
 *    Paul Mackerras <[email protected]>
 *    Alexander Graf <[email protected]>
 *    Kevin Wolf <[email protected]>
 *
 * Description: KVM functions specific to running on Book 3S
 * processors in hypervisor mode (specifically POWER7 and later).
 *
 * This file is derived from arch/powerpc/kvm/book3s.c,
 * by Alexander Graf <[email protected]>.
 */

#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/sched/signal.h>
#include <linux/sched/stat.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <linux/srcu.h>
#include <linux/miscdevice.h>
#include <linux/debugfs.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/of.h>
#include <linux/irqdomain.h>
#include <linux/smp.h>

#include <asm/ftrace.h>
#include <asm/reg.h>
#include <asm/ppc-opcode.h>
#include <asm/asm-prototypes.h>
#include <asm/archrandom.h>
#include <asm/debug.h>
#include <asm/disassemble.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <linux/uaccess.h>
#include <asm/interrupt.h>
#include <asm/io.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
#include <asm/pmc.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
#include <asm/switch_to.h>
#include <asm/smp.h>
#include <asm/dbell.h>
#include <asm/hmi.h>
#include <asm/pnv-pci.h>
#include <asm/mmu.h>
#include <asm/opal.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/hw_breakpoint.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/ultravisor.h>
#include <asm/dtl.h>
#include <asm/plpar_wrappers.h>

#include <trace/events/ipi.h>

#include "book3s.h"
#include "book3s_hv.h"

#define CREATE_TRACE_POINTS
#include "trace_hv.h"

/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */

/* Used to indicate that a guest page fault needs to be handled */
#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
/* Used to indicate that a guest passthrough interrupt needs to be handled */
#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)

/* Used as a "null" value for timebase values */
#define TB_NIL	(~(u64)0)

static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);

static int dynamic_mt_modes = 6;
module_param(dynamic_mt_modes, int, 0644);
MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
static int target_smt_mode;
module_param(target_smt_mode, int, 0644);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");

static bool one_vm_per_core;
module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");

#ifdef CONFIG_KVM_XICS
static const struct kernel_param_ops module_param_ops = {
	.set = param_set_int,
	.get = param_get_int,
};

module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, 0644);
MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");

module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif

/* If set, guests are allowed to create and control nested guests */
static bool nested = true;
module_param(nested, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");

static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);

/*
 * RWMR values for POWER8. These control the rate at which PURR
 * and SPURR count and should be set according to the number of
 * online threads in the vcore being run.
 */
#define RWMR_RPA_P8_1THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_2THREAD	0x7FFF2908450D8DA9UL
#define RWMR_RPA_P8_3THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_4THREAD	0x199A421245058DA9UL
#define RWMR_RPA_P8_5THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_6THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_7THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_8THREAD	0x164520C62609AECAUL

static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = {
	RWMR_RPA_P8_1THREAD,
	RWMR_RPA_P8_1THREAD,
	RWMR_RPA_P8_2THREAD,
	RWMR_RPA_P8_3THREAD,
	RWMR_RPA_P8_4THREAD,
	RWMR_RPA_P8_5THREAD,
	RWMR_RPA_P8_6THREAD,
	RWMR_RPA_P8_7THREAD,
	RWMR_RPA_P8_8THREAD,
};

static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
		int *ip)
{
	int i = *ip;
	struct kvm_vcpu *vcpu;

	while (++i < MAX_SMT_THREADS) {
		vcpu = READ_ONCE(vc->runnable_threads[i]);
		if (vcpu) {
			*ip = i;
			return vcpu;
		}
	}
	return NULL;
}

/* Used to traverse the list of runnable threads for a given vcore */
#define for_each_runnable_thread(i, vcpu, vc) \
	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )

static bool kvmppc_ipi_thread(int cpu)
{
	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);

	/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
	if (kvmhv_on_pseries())
		return false;

	/* On POWER9 we can use msgsnd to IPI any cpu */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		msg |= get_hard_smp_processor_id(cpu);
		smp_mb();
		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
		return true;
	}

	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
		preempt_disable();
		if (cpu_first_thread_sibling(cpu) ==
		    cpu_first_thread_sibling(smp_processor_id())) {
			msg |= cpu_thread_in_core(cpu);
			smp_mb();
			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
			preempt_enable();
			return true;
		}
		preempt_enable();
	}

#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
	if (cpu >= 0 && cpu < nr_cpu_ids) {
		if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
			xics_wake_cpu(cpu);
			return true;
		}
		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
		return true;
	}
#endif

	return false;
}

static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
{
	int cpu;
	struct rcuwait *waitp;

	/*
	 * rcuwait_wake_up contains smp_mb() which orders prior stores that
	 * create pending work vs below loads of cpu fields. The other side
	 * is the barrier in vcpu run that orders setting the cpu fields vs
	 * testing for pending work.
	 */

	waitp = kvm_arch_vcpu_get_wait(vcpu);
	if (rcuwait_wake_up(waitp))
		++vcpu->stat.generic.halt_wakeup;

	cpu = READ_ONCE(vcpu->arch.thread_cpu);
	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
		return;

	/* CPU points to the first thread of the core */
	cpu = vcpu->cpu;
	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
		smp_send_reschedule(cpu);
}

/*
 * We use the vcpu_load/put functions to measure stolen time.
 *
 * Stolen time is counted as time when either the vcpu is able to
 * run as part of a virtual core, but the task running the vcore
 * is preempted or sleeping, or when the vcpu needs something done
 * in the kernel by the task running the vcpu, but that task is
 * preempted or sleeping. Those two things have to be counted
 * separately, since one of the vcpu tasks will take on the job
 * of running the core, and the other vcpu tasks in the vcore will
 * sleep waiting for it to do that, but that sleep shouldn't count
 * as stolen time.
 *
 * Hence we accumulate stolen time when the vcpu can run as part of
 * a vcore using vc->stolen_tb, and the stolen time when the vcpu
 * needs its task to do other things in the kernel (for example,
 * service a page fault) in busy_stolen. We don't accumulate
 * stolen time for a vcore when it is inactive, or for a vcpu
 * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of
 * a misnomer; it means that the vcpu task is not executing in
 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
 * the kernel. We don't have any way of dividing up that time
 * between time that the vcpu is genuinely stopped, time that
 * the task is actively working on behalf of the vcpu, and time
 * that the task is preempted, so we don't count any of it as
 * stolen.
 *
 * Updates to busy_stolen are protected by arch.tbacct_lock;
 * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
 * lock. The stolen times are measured in units of timebase ticks.
 * (Note that the != TB_NIL checks below are purely defensive;
 * they should never fail.)
 *
 * The POWER9 path is simpler, one vcpu per virtual core so the
 * former case does not exist. If a vcpu is preempted when it is
 * BUSY_IN_HOST and not ceded or otherwise blocked, then accumulate
 * the stolen cycles in busy_stolen. RUNNING is not a preemptible
 * state in the P9 path.
 */

static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb)
{
	unsigned long flags;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	vc->preempt_tb = tb;
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
}

static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb)
{
	unsigned long flags;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	if (vc->preempt_tb != TB_NIL) {
		vc->stolen_tb += tb - vc->preempt_tb;
		vc->preempt_tb = TB_NIL;
	}
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
}
317
318
static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
319
{
320
struct kvmppc_vcore *vc = vcpu->arch.vcore;
321
unsigned long flags;
322
u64 now;
323
324
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
325
if (vcpu->arch.busy_preempt != TB_NIL) {
326
WARN_ON_ONCE(vcpu->arch.state != KVMPPC_VCPU_BUSY_IN_HOST);
327
vc->stolen_tb += mftb() - vcpu->arch.busy_preempt;
328
vcpu->arch.busy_preempt = TB_NIL;
329
}
330
return;
331
}
332
333
now = mftb();
334
335
/*
336
* We can test vc->runner without taking the vcore lock,
337
* because only this task ever sets vc->runner to this
338
* vcpu, and once it is set to this vcpu, only this task
339
* ever sets it to NULL.
340
*/
341
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
342
kvmppc_core_end_stolen(vc, now);
343
344
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
345
if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
346
vcpu->arch.busy_preempt != TB_NIL) {
347
vcpu->arch.busy_stolen += now - vcpu->arch.busy_preempt;
348
vcpu->arch.busy_preempt = TB_NIL;
349
}
350
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
351
}
352
353
static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
354
{
355
struct kvmppc_vcore *vc = vcpu->arch.vcore;
356
unsigned long flags;
357
u64 now;
358
359
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
360
/*
361
* In the P9 path, RUNNABLE is not preemptible
362
* (nor takes host interrupts)
363
*/
364
WARN_ON_ONCE(vcpu->arch.state == KVMPPC_VCPU_RUNNABLE);
365
/*
366
* Account stolen time when preempted while the vcpu task is
367
* running in the kernel (but not in qemu, which is INACTIVE).
368
*/
369
if (task_is_running(current) &&
370
vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
371
vcpu->arch.busy_preempt = mftb();
372
return;
373
}
374
375
now = mftb();
376
377
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
378
kvmppc_core_start_stolen(vc, now);
379
380
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
381
if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
382
vcpu->arch.busy_preempt = now;
383
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
384
}
385
386
static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
387
{
388
vcpu->arch.pvr = pvr;
389
}
390
391
/* Dummy value used in computing PCR value below */
392
#define PCR_ARCH_31 (PCR_ARCH_300 << 1)
393
394
static inline unsigned long map_pcr_to_cap(unsigned long pcr)
395
{
396
unsigned long cap = 0;
397
398
switch (pcr) {
399
case PCR_ARCH_300:
400
cap = H_GUEST_CAP_POWER9;
401
break;
402
case PCR_ARCH_31:
403
if (cpu_has_feature(CPU_FTR_P11_PVR))
404
cap = H_GUEST_CAP_POWER11;
405
else
406
cap = H_GUEST_CAP_POWER10;
407
break;
408
default:
409
break;
410
}
411
412
return cap;
413
}
414
415
static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
416
{
417
unsigned long host_pcr_bit = 0, guest_pcr_bit = 0, cap = 0;
418
struct kvmppc_vcore *vc = vcpu->arch.vcore;
419
420
/* We can (emulate) our own architecture version and anything older */
421
if (cpu_has_feature(CPU_FTR_P11_PVR) || cpu_has_feature(CPU_FTR_ARCH_31))
422
host_pcr_bit = PCR_ARCH_31;
423
else if (cpu_has_feature(CPU_FTR_ARCH_300))
424
host_pcr_bit = PCR_ARCH_300;
425
else if (cpu_has_feature(CPU_FTR_ARCH_207S))
426
host_pcr_bit = PCR_ARCH_207;
427
else if (cpu_has_feature(CPU_FTR_ARCH_206))
428
host_pcr_bit = PCR_ARCH_206;
429
else
430
host_pcr_bit = PCR_ARCH_205;
431
432
/* Determine lowest PCR bit needed to run guest in given PVR level */
433
guest_pcr_bit = host_pcr_bit;
434
if (arch_compat) {
435
switch (arch_compat) {
436
case PVR_ARCH_205:
437
guest_pcr_bit = PCR_ARCH_205;
438
break;
439
case PVR_ARCH_206:
440
case PVR_ARCH_206p:
441
guest_pcr_bit = PCR_ARCH_206;
442
break;
443
case PVR_ARCH_207:
444
guest_pcr_bit = PCR_ARCH_207;
445
break;
446
case PVR_ARCH_300:
447
guest_pcr_bit = PCR_ARCH_300;
448
break;
449
case PVR_ARCH_31:
450
case PVR_ARCH_31_P11:
451
guest_pcr_bit = PCR_ARCH_31;
452
break;
453
default:
454
return -EINVAL;
455
}
456
}
457
458
/* Check requested PCR bits don't exceed our capabilities */
459
if (guest_pcr_bit > host_pcr_bit)
460
return -EINVAL;
461
462
if (kvmhv_on_pseries() && kvmhv_is_nestedv2()) {
463
/*
464
* 'arch_compat == 0' would mean the guest should default to
465
* L1's compatibility. In this case, the guest would pick
466
* host's PCR and evaluate the corresponding capabilities.
467
*/
468
cap = map_pcr_to_cap(guest_pcr_bit);
469
if (!(cap & nested_capabilities))
470
return -EINVAL;
471
}
472
473
spin_lock(&vc->lock);
474
vc->arch_compat = arch_compat;
475
kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LOGICAL_PVR);
476
/*
477
* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
478
* Also set all reserved PCR bits
479
*/
480
vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
481
spin_unlock(&vc->lock);
482
483
return 0;
484
}
485
486
static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
487
{
488
int r;
489
490
pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
491
pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
492
vcpu->arch.regs.nip, vcpu->arch.shregs.msr, vcpu->arch.trap);
493
for (r = 0; r < 16; ++r)
494
pr_err("r%2d = %.16lx r%d = %.16lx\n",
495
r, kvmppc_get_gpr(vcpu, r),
496
r+16, kvmppc_get_gpr(vcpu, r+16));
497
pr_err("ctr = %.16lx lr = %.16lx\n",
498
vcpu->arch.regs.ctr, vcpu->arch.regs.link);
499
pr_err("srr0 = %.16llx srr1 = %.16llx\n",
500
vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
501
pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
502
vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
503
pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
504
vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
505
pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n",
506
vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
507
pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
508
pr_err("fault dar = %.16lx dsisr = %.8x\n",
509
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
510
pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
511
for (r = 0; r < vcpu->arch.slb_max; ++r)
512
pr_err(" ESID = %.16llx VSID = %.16llx\n",
513
vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
514
pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.16lx\n",
515
vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
516
vcpu->arch.last_inst);
517
}
518
519
static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
520
{
521
return kvm_get_vcpu_by_id(kvm, id);
522
}
523
524
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
525
{
526
vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
527
vpa->yield_count = cpu_to_be32(1);
528
}
529
530
static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
531
unsigned long addr, unsigned long len)
532
{
533
/* check address is cacheline aligned */
534
if (addr & (L1_CACHE_BYTES - 1))
535
return -EINVAL;
536
spin_lock(&vcpu->arch.vpa_update_lock);
537
if (v->next_gpa != addr || v->len != len) {
538
v->next_gpa = addr;
539
v->len = addr ? len : 0;
540
v->update_pending = 1;
541
}
542
spin_unlock(&vcpu->arch.vpa_update_lock);
543
return 0;
544
}
545
546
/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
547
struct reg_vpa {
548
u32 dummy;
549
union {
550
__be16 hword;
551
__be32 word;
552
} length;
553
};
554
555
static int vpa_is_registered(struct kvmppc_vpa *vpap)
556
{
557
if (vpap->update_pending)
558
return vpap->next_gpa != 0;
559
return vpap->pinned_addr != NULL;
560
}
561
562
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
563
unsigned long flags,
564
unsigned long vcpuid, unsigned long vpa)
565
{
566
struct kvm *kvm = vcpu->kvm;
567
unsigned long len, nb;
568
void *va;
569
struct kvm_vcpu *tvcpu;
570
int err;
571
int subfunc;
572
struct kvmppc_vpa *vpap;
573
574
tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
575
if (!tvcpu)
576
return H_PARAMETER;
577
578
subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
579
if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
580
subfunc == H_VPA_REG_SLB) {
581
/* Registering new area - address must be cache-line aligned */
582
if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
583
return H_PARAMETER;
584
585
/* convert logical addr to kernel addr and read length */
586
va = kvmppc_pin_guest_page(kvm, vpa, &nb);
587
if (va == NULL)
588
return H_PARAMETER;
589
if (subfunc == H_VPA_REG_VPA)
590
len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
591
else
592
len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
593
kvmppc_unpin_guest_page(kvm, va, vpa, false);
594
595
/* Check length */
596
if (len > nb || len < sizeof(struct reg_vpa))
597
return H_PARAMETER;
598
} else {
599
vpa = 0;
600
len = 0;
601
}
602
603
err = H_PARAMETER;
604
vpap = NULL;
605
spin_lock(&tvcpu->arch.vpa_update_lock);
606
607
switch (subfunc) {
608
case H_VPA_REG_VPA: /* register VPA */
609
/*
610
* The size of our lppaca is 1kB because of the way we align
611
* it for the guest to avoid crossing a 4kB boundary. We only
612
* use 640 bytes of the structure though, so we should accept
613
* clients that set a size of 640.
614
*/
615
BUILD_BUG_ON(sizeof(struct lppaca) != 640);
616
if (len < sizeof(struct lppaca))
617
break;
618
vpap = &tvcpu->arch.vpa;
619
err = 0;
620
break;
621
622
case H_VPA_REG_DTL: /* register DTL */
623
if (len < sizeof(struct dtl_entry))
624
break;
625
len -= len % sizeof(struct dtl_entry);
626
627
/* Check that they have previously registered a VPA */
628
err = H_RESOURCE;
629
if (!vpa_is_registered(&tvcpu->arch.vpa))
630
break;
631
632
vpap = &tvcpu->arch.dtl;
633
err = 0;
634
break;
635
636
case H_VPA_REG_SLB: /* register SLB shadow buffer */
637
/* Check that they have previously registered a VPA */
638
err = H_RESOURCE;
639
if (!vpa_is_registered(&tvcpu->arch.vpa))
640
break;
641
642
vpap = &tvcpu->arch.slb_shadow;
643
err = 0;
644
break;
645
646
case H_VPA_DEREG_VPA: /* deregister VPA */
647
/* Check they don't still have a DTL or SLB buf registered */
648
err = H_RESOURCE;
649
if (vpa_is_registered(&tvcpu->arch.dtl) ||
650
vpa_is_registered(&tvcpu->arch.slb_shadow))
651
break;
652
653
vpap = &tvcpu->arch.vpa;
654
err = 0;
655
break;
656
657
case H_VPA_DEREG_DTL: /* deregister DTL */
658
vpap = &tvcpu->arch.dtl;
659
err = 0;
660
break;
661
662
case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */
663
vpap = &tvcpu->arch.slb_shadow;
664
err = 0;
665
break;
666
}
667
668
if (vpap) {
669
vpap->next_gpa = vpa;
670
vpap->len = len;
671
vpap->update_pending = 1;
672
}
673
674
spin_unlock(&tvcpu->arch.vpa_update_lock);
675
676
return err;
677
}
678
679
static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap,
680
struct kvmppc_vpa *old_vpap)
681
{
682
struct kvm *kvm = vcpu->kvm;
683
void *va;
684
unsigned long nb;
685
unsigned long gpa;
686
687
/*
688
* We need to pin the page pointed to by vpap->next_gpa,
689
* but we can't call kvmppc_pin_guest_page under the lock
690
* as it does get_user_pages() and down_read(). So we
691
* have to drop the lock, pin the page, then get the lock
692
* again and check that a new area didn't get registered
693
* in the meantime.
694
*/
695
for (;;) {
696
gpa = vpap->next_gpa;
697
spin_unlock(&vcpu->arch.vpa_update_lock);
698
va = NULL;
699
nb = 0;
700
if (gpa)
701
va = kvmppc_pin_guest_page(kvm, gpa, &nb);
702
spin_lock(&vcpu->arch.vpa_update_lock);
703
if (gpa == vpap->next_gpa)
704
break;
705
/* sigh... unpin that one and try again */
706
if (va)
707
kvmppc_unpin_guest_page(kvm, va, gpa, false);
708
}
709
710
vpap->update_pending = 0;
711
if (va && nb < vpap->len) {
712
/*
713
* If it's now too short, it must be that userspace
714
* has changed the mappings underlying guest memory,
715
* so unregister the region.
716
*/
717
kvmppc_unpin_guest_page(kvm, va, gpa, false);
718
va = NULL;
719
}
720
*old_vpap = *vpap;
721
722
vpap->gpa = gpa;
723
vpap->pinned_addr = va;
724
vpap->dirty = false;
725
if (va)
726
vpap->pinned_end = va + vpap->len;
727
}
728
729
static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
730
{
731
struct kvm *kvm = vcpu->kvm;
732
struct kvmppc_vpa old_vpa = { 0 };
733
734
if (!(vcpu->arch.vpa.update_pending ||
735
vcpu->arch.slb_shadow.update_pending ||
736
vcpu->arch.dtl.update_pending))
737
return;
738
739
spin_lock(&vcpu->arch.vpa_update_lock);
740
if (vcpu->arch.vpa.update_pending) {
741
kvmppc_update_vpa(vcpu, &vcpu->arch.vpa, &old_vpa);
742
if (old_vpa.pinned_addr) {
743
if (kvmhv_is_nestedv2())
744
kvmhv_nestedv2_set_vpa(vcpu, ~0ull);
745
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
746
old_vpa.dirty);
747
}
748
if (vcpu->arch.vpa.pinned_addr) {
749
init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
750
if (kvmhv_is_nestedv2())
751
kvmhv_nestedv2_set_vpa(vcpu, __pa(vcpu->arch.vpa.pinned_addr));
752
}
753
}
754
if (vcpu->arch.dtl.update_pending) {
755
kvmppc_update_vpa(vcpu, &vcpu->arch.dtl, &old_vpa);
756
if (old_vpa.pinned_addr)
757
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
758
old_vpa.dirty);
759
vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
760
vcpu->arch.dtl_index = 0;
761
}
762
if (vcpu->arch.slb_shadow.update_pending) {
763
kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow, &old_vpa);
764
if (old_vpa.pinned_addr)
765
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
766
old_vpa.dirty);
767
}
768
769
spin_unlock(&vcpu->arch.vpa_update_lock);
770
}
771
772
/*
773
* Return the accumulated stolen time for the vcore up until `now'.
774
* The caller should hold the vcore lock.
775
*/
776
static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
777
{
778
u64 p;
779
unsigned long flags;
780
781
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
782
783
spin_lock_irqsave(&vc->stoltb_lock, flags);
784
p = vc->stolen_tb;
785
if (vc->vcore_state != VCORE_INACTIVE &&
786
vc->preempt_tb != TB_NIL)
787
p += now - vc->preempt_tb;
788
spin_unlock_irqrestore(&vc->stoltb_lock, flags);
789
return p;
790
}
791
792
static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
793
struct lppaca *vpa,
794
unsigned int pcpu, u64 now,
795
unsigned long stolen)
796
{
797
struct dtl_entry *dt;
798
799
dt = vcpu->arch.dtl_ptr;
800
801
if (!dt)
802
return;
803
804
dt->dispatch_reason = 7;
805
dt->preempt_reason = 0;
806
dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid);
807
dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
808
dt->ready_to_enqueue_time = 0;
809
dt->waiting_to_ready_time = 0;
810
dt->timebase = cpu_to_be64(now);
811
dt->fault_addr = 0;
812
dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
813
dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
814
815
++dt;
816
if (dt == vcpu->arch.dtl.pinned_end)
817
dt = vcpu->arch.dtl.pinned_addr;
818
vcpu->arch.dtl_ptr = dt;
819
/* order writing *dt vs. writing vpa->dtl_idx */
820
smp_wmb();
821
vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
822
823
/* vcpu->arch.dtl.dirty is set by the caller */
824
}
825
826
static void kvmppc_update_vpa_dispatch(struct kvm_vcpu *vcpu,
827
struct kvmppc_vcore *vc)
828
{
829
struct lppaca *vpa;
830
unsigned long stolen;
831
unsigned long core_stolen;
832
u64 now;
833
unsigned long flags;
834
835
vpa = vcpu->arch.vpa.pinned_addr;
836
if (!vpa)
837
return;
838
839
now = mftb();
840
841
core_stolen = vcore_stolen_time(vc, now);
842
stolen = core_stolen - vcpu->arch.stolen_logged;
843
vcpu->arch.stolen_logged = core_stolen;
844
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
845
stolen += vcpu->arch.busy_stolen;
846
vcpu->arch.busy_stolen = 0;
847
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
848
849
vpa->enqueue_dispatch_tb = cpu_to_be64(be64_to_cpu(vpa->enqueue_dispatch_tb) + stolen);
850
851
__kvmppc_create_dtl_entry(vcpu, vpa, vc->pcpu, now + kvmppc_get_tb_offset(vcpu), stolen);
852
853
vcpu->arch.vpa.dirty = true;
854
}
855
856
static void kvmppc_update_vpa_dispatch_p9(struct kvm_vcpu *vcpu,
857
struct kvmppc_vcore *vc,
858
u64 now)
859
{
860
struct lppaca *vpa;
861
unsigned long stolen;
862
unsigned long stolen_delta;
863
864
vpa = vcpu->arch.vpa.pinned_addr;
865
if (!vpa)
866
return;
867
868
stolen = vc->stolen_tb;
869
stolen_delta = stolen - vcpu->arch.stolen_logged;
870
vcpu->arch.stolen_logged = stolen;
871
872
vpa->enqueue_dispatch_tb = cpu_to_be64(stolen);
873
874
__kvmppc_create_dtl_entry(vcpu, vpa, vc->pcpu, now, stolen_delta);
875
876
vcpu->arch.vpa.dirty = true;
877
}
878
879
/* See if there is a doorbell interrupt pending for a vcpu */
880
static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
881
{
882
int thr;
883
struct kvmppc_vcore *vc;
884
885
if (vcpu->arch.doorbell_request)
886
return true;
887
if (cpu_has_feature(CPU_FTR_ARCH_300))
888
return false;
889
/*
890
* Ensure that the read of vcore->dpdes comes after the read
891
* of vcpu->doorbell_request. This barrier matches the
892
* smp_wmb() in kvmppc_guest_entry_inject().
893
*/
894
smp_rmb();
895
vc = vcpu->arch.vcore;
896
thr = vcpu->vcpu_id - vc->first_vcpuid;
897
return !!(vc->dpdes & (1 << thr));
898
}
899
900
static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
901
{
902
if (kvmppc_get_arch_compat(vcpu) >= PVR_ARCH_207)
903
return true;
904
if ((!kvmppc_get_arch_compat(vcpu)) &&
905
cpu_has_feature(CPU_FTR_ARCH_207S))
906
return true;
907
return false;
908
}
909
910
static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
911
unsigned long resource, unsigned long value1,
912
unsigned long value2)
913
{
914
switch (resource) {
915
case H_SET_MODE_RESOURCE_SET_CIABR:
916
if (!kvmppc_power8_compatible(vcpu))
917
return H_P2;
918
if (value2)
919
return H_P4;
920
if (mflags)
921
return H_UNSUPPORTED_FLAG_START;
922
/* Guests can't breakpoint the hypervisor */
923
if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
924
return H_P3;
925
kvmppc_set_ciabr_hv(vcpu, value1);
926
return H_SUCCESS;
927
case H_SET_MODE_RESOURCE_SET_DAWR0:
928
if (!kvmppc_power8_compatible(vcpu))
929
return H_P2;
930
if (!ppc_breakpoint_available())
931
return H_P2;
932
if (mflags)
933
return H_UNSUPPORTED_FLAG_START;
934
if (value2 & DABRX_HYP)
935
return H_P4;
936
kvmppc_set_dawr0_hv(vcpu, value1);
937
kvmppc_set_dawrx0_hv(vcpu, value2);
938
return H_SUCCESS;
939
case H_SET_MODE_RESOURCE_SET_DAWR1:
940
if (!kvmppc_power8_compatible(vcpu))
941
return H_P2;
942
if (!ppc_breakpoint_available())
943
return H_P2;
944
if (!cpu_has_feature(CPU_FTR_DAWR1))
945
return H_P2;
946
if (!vcpu->kvm->arch.dawr1_enabled)
947
return H_FUNCTION;
948
if (mflags)
949
return H_UNSUPPORTED_FLAG_START;
950
if (value2 & DABRX_HYP)
951
return H_P4;
952
kvmppc_set_dawr1_hv(vcpu, value1);
953
kvmppc_set_dawrx1_hv(vcpu, value2);
954
return H_SUCCESS;
955
case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
956
/*
957
* KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
958
* Keep this in synch with kvmppc_filter_guest_lpcr_hv.
959
*/
960
if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
961
kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
962
return H_UNSUPPORTED_FLAG_START;
963
return H_TOO_HARD;
964
default:
965
return H_TOO_HARD;
966
}
967
}
968
969
/* Copy guest memory in place - must reside within a single memslot */
970
static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
971
unsigned long len)
972
{
973
struct kvm_memory_slot *to_memslot = NULL;
974
struct kvm_memory_slot *from_memslot = NULL;
975
unsigned long to_addr, from_addr;
976
int r;
977
978
/* Get HPA for from address */
979
from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
980
if (!from_memslot)
981
return -EFAULT;
982
if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
983
<< PAGE_SHIFT))
984
return -EINVAL;
985
from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
986
if (kvm_is_error_hva(from_addr))
987
return -EFAULT;
988
from_addr |= (from & (PAGE_SIZE - 1));
989
990
/* Get HPA for to address */
991
to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
992
if (!to_memslot)
993
return -EFAULT;
994
if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
995
<< PAGE_SHIFT))
996
return -EINVAL;
997
to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
998
if (kvm_is_error_hva(to_addr))
999
return -EFAULT;
1000
to_addr |= (to & (PAGE_SIZE - 1));
1001
1002
/* Perform copy */
1003
r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
1004
len);
1005
if (r)
1006
return -EFAULT;
1007
mark_page_dirty(kvm, to >> PAGE_SHIFT);
1008
return 0;
1009
}
1010
1011
static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
1012
unsigned long dest, unsigned long src)
1013
{
1014
u64 pg_sz = SZ_4K; /* 4K page size */
1015
u64 pg_mask = SZ_4K - 1;
1016
int ret;
1017
1018
/* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
1019
if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
1020
H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
1021
return H_PARAMETER;
1022
1023
/* dest (and src if copy_page flag set) must be page aligned */
1024
if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
1025
return H_PARAMETER;
1026
1027
/* zero and/or copy the page as determined by the flags */
1028
if (flags & H_COPY_PAGE) {
1029
ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
1030
if (ret < 0)
1031
return H_PARAMETER;
1032
} else if (flags & H_ZERO_PAGE) {
1033
ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
1034
if (ret < 0)
1035
return H_PARAMETER;
1036
}
1037
1038
/* We can ignore the remaining flags */
1039
1040
return H_SUCCESS;
1041
}
1042
1043
static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
1044
{
1045
struct kvmppc_vcore *vcore = target->arch.vcore;
1046
1047
/*
1048
* We expect to have been called by the real mode handler
1049
* (kvmppc_rm_h_confer()) which would have directly returned
1050
* H_SUCCESS if the source vcore wasn't idle (e.g. if it may
1051
* have useful work to do and should not confer) so we don't
1052
* recheck that here.
1053
*
1054
* In the case of the P9 single vcpu per vcore case, the real
1055
* mode handler is not called but no other threads are in the
1056
* source vcore.
1057
*/
1058
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
1059
spin_lock(&vcore->lock);
1060
if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
1061
vcore->vcore_state != VCORE_INACTIVE &&
1062
vcore->runner)
1063
target = vcore->runner;
1064
spin_unlock(&vcore->lock);
1065
}
1066
1067
return kvm_vcpu_yield_to(target);
1068
}
1069
1070
static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
1071
{
1072
int yield_count = 0;
1073
struct lppaca *lppaca;
1074
1075
spin_lock(&vcpu->arch.vpa_update_lock);
1076
lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
1077
if (lppaca)
1078
yield_count = be32_to_cpu(lppaca->yield_count);
1079
spin_unlock(&vcpu->arch.vpa_update_lock);
1080
return yield_count;
1081
}
1082
1083
/*
1084
* H_RPT_INVALIDATE hcall handler for nested guests.
1085
*
1086
* Handles only nested process-scoped invalidation requests in L0.
1087
*/
1088
static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
1089
{
1090
unsigned long type = kvmppc_get_gpr(vcpu, 6);
1091
unsigned long pid, pg_sizes, start, end;
1092
1093
/*
1094
* The partition-scoped invalidations aren't handled here in L0.
1095
*/
1096
if (type & H_RPTI_TYPE_NESTED)
1097
return RESUME_HOST;
1098
1099
pid = kvmppc_get_gpr(vcpu, 4);
1100
pg_sizes = kvmppc_get_gpr(vcpu, 7);
1101
start = kvmppc_get_gpr(vcpu, 8);
1102
end = kvmppc_get_gpr(vcpu, 9);
1103
1104
do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
1105
type, pg_sizes, start, end);
1106
1107
kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
1108
return RESUME_GUEST;
1109
}
1110
1111
static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
1112
unsigned long id, unsigned long target,
1113
unsigned long type, unsigned long pg_sizes,
1114
unsigned long start, unsigned long end)
1115
{
1116
if (!kvm_is_radix(vcpu->kvm))
1117
return H_UNSUPPORTED;
1118
1119
if (end < start)
1120
return H_P5;
1121
1122
/*
1123
* Partition-scoped invalidation for nested guests.
1124
*/
1125
if (type & H_RPTI_TYPE_NESTED) {
1126
if (!nesting_enabled(vcpu->kvm))
1127
return H_FUNCTION;
1128
1129
/* Support only cores as target */
1130
if (target != H_RPTI_TARGET_CMMU)
1131
return H_P2;
1132
1133
return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
1134
start, end);
1135
}
1136
1137
/*
1138
* Process-scoped invalidation for L1 guests.
1139
*/
1140
do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
1141
type, pg_sizes, start, end);
1142
return H_SUCCESS;
1143
}
1144
1145
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
1146
{
1147
struct kvm *kvm = vcpu->kvm;
1148
unsigned long req = kvmppc_get_gpr(vcpu, 3);
1149
unsigned long target, ret = H_SUCCESS;
1150
int yield_count;
1151
struct kvm_vcpu *tvcpu;
1152
int idx, rc;
1153
1154
if (req <= MAX_HCALL_OPCODE &&
1155
!test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
1156
return RESUME_HOST;
1157
1158
switch (req) {
1159
case H_REMOVE:
1160
ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
1161
kvmppc_get_gpr(vcpu, 5),
1162
kvmppc_get_gpr(vcpu, 6));
1163
if (ret == H_TOO_HARD)
1164
return RESUME_HOST;
1165
break;
1166
case H_ENTER:
1167
ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
1168
kvmppc_get_gpr(vcpu, 5),
1169
kvmppc_get_gpr(vcpu, 6),
1170
kvmppc_get_gpr(vcpu, 7));
1171
if (ret == H_TOO_HARD)
1172
return RESUME_HOST;
1173
break;
1174
case H_READ:
1175
ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
1176
kvmppc_get_gpr(vcpu, 5));
1177
if (ret == H_TOO_HARD)
1178
return RESUME_HOST;
1179
break;
1180
case H_CLEAR_MOD:
1181
ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
1182
kvmppc_get_gpr(vcpu, 5));
1183
if (ret == H_TOO_HARD)
1184
return RESUME_HOST;
1185
break;
1186
case H_CLEAR_REF:
1187
ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
1188
kvmppc_get_gpr(vcpu, 5));
1189
if (ret == H_TOO_HARD)
1190
return RESUME_HOST;
1191
break;
1192
case H_PROTECT:
1193
ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
1194
kvmppc_get_gpr(vcpu, 5),
1195
kvmppc_get_gpr(vcpu, 6));
1196
if (ret == H_TOO_HARD)
1197
return RESUME_HOST;
1198
break;
1199
case H_BULK_REMOVE:
1200
ret = kvmppc_h_bulk_remove(vcpu);
1201
if (ret == H_TOO_HARD)
1202
return RESUME_HOST;
1203
break;
1204
1205
case H_CEDE:
1206
break;
1207
case H_PROD:
1208
target = kvmppc_get_gpr(vcpu, 4);
1209
tvcpu = kvmppc_find_vcpu(kvm, target);
1210
if (!tvcpu) {
1211
ret = H_PARAMETER;
1212
break;
1213
}
1214
tvcpu->arch.prodded = 1;
1215
smp_mb(); /* This orders prodded store vs ceded load */
1216
if (tvcpu->arch.ceded)
1217
kvmppc_fast_vcpu_kick_hv(tvcpu);
1218
break;
1219
case H_CONFER:
1220
target = kvmppc_get_gpr(vcpu, 4);
1221
if (target == -1)
1222
break;
1223
tvcpu = kvmppc_find_vcpu(kvm, target);
1224
if (!tvcpu) {
1225
ret = H_PARAMETER;
1226
break;
1227
}
1228
yield_count = kvmppc_get_gpr(vcpu, 5);
1229
if (kvmppc_get_yield_count(tvcpu) != yield_count)
1230
break;
1231
kvm_arch_vcpu_yield_to(tvcpu);
1232
break;
1233
case H_REGISTER_VPA:
1234
ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
1235
kvmppc_get_gpr(vcpu, 5),
1236
kvmppc_get_gpr(vcpu, 6));
1237
break;
1238
case H_RTAS:
1239
if (list_empty(&kvm->arch.rtas_tokens))
1240
return RESUME_HOST;
1241
1242
idx = srcu_read_lock(&kvm->srcu);
1243
rc = kvmppc_rtas_hcall(vcpu);
1244
srcu_read_unlock(&kvm->srcu, idx);
1245
1246
if (rc == -ENOENT)
1247
return RESUME_HOST;
1248
else if (rc == 0)
1249
break;
1250
1251
/* Send the error out to userspace via KVM_RUN */
1252
return rc;
1253
case H_LOGICAL_CI_LOAD:
1254
ret = kvmppc_h_logical_ci_load(vcpu);
1255
if (ret == H_TOO_HARD)
1256
return RESUME_HOST;
1257
break;
1258
case H_LOGICAL_CI_STORE:
1259
ret = kvmppc_h_logical_ci_store(vcpu);
1260
if (ret == H_TOO_HARD)
1261
return RESUME_HOST;
1262
break;
1263
case H_SET_MODE:
1264
ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
1265
kvmppc_get_gpr(vcpu, 5),
1266
kvmppc_get_gpr(vcpu, 6),
1267
kvmppc_get_gpr(vcpu, 7));
1268
if (ret == H_TOO_HARD)
1269
return RESUME_HOST;
1270
break;
1271
case H_XIRR:
1272
case H_CPPR:
1273
case H_EOI:
1274
case H_IPI:
1275
case H_IPOLL:
1276
case H_XIRR_X:
1277
if (kvmppc_xics_enabled(vcpu)) {
1278
if (xics_on_xive()) {
1279
ret = H_NOT_AVAILABLE;
1280
return RESUME_GUEST;
1281
}
1282
ret = kvmppc_xics_hcall(vcpu, req);
1283
break;
1284
}
1285
return RESUME_HOST;
1286
case H_SET_DABR:
1287
ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
1288
break;
1289
case H_SET_XDABR:
1290
ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
1291
kvmppc_get_gpr(vcpu, 5));
1292
break;
1293
#ifdef CONFIG_SPAPR_TCE_IOMMU
1294
case H_GET_TCE:
1295
ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1296
kvmppc_get_gpr(vcpu, 5));
1297
if (ret == H_TOO_HARD)
1298
return RESUME_HOST;
1299
break;
1300
case H_PUT_TCE:
1301
ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1302
kvmppc_get_gpr(vcpu, 5),
1303
kvmppc_get_gpr(vcpu, 6));
1304
if (ret == H_TOO_HARD)
1305
return RESUME_HOST;
1306
break;
1307
case H_PUT_TCE_INDIRECT:
1308
ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
1309
kvmppc_get_gpr(vcpu, 5),
1310
kvmppc_get_gpr(vcpu, 6),
1311
kvmppc_get_gpr(vcpu, 7));
1312
if (ret == H_TOO_HARD)
1313
return RESUME_HOST;
1314
break;
1315
case H_STUFF_TCE:
1316
ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1317
kvmppc_get_gpr(vcpu, 5),
1318
kvmppc_get_gpr(vcpu, 6),
1319
kvmppc_get_gpr(vcpu, 7));
1320
if (ret == H_TOO_HARD)
1321
return RESUME_HOST;
1322
break;
1323
#endif
1324
case H_RANDOM: {
1325
unsigned long rand;
1326
1327
if (!arch_get_random_seed_longs(&rand, 1))
1328
ret = H_HARDWARE;
1329
kvmppc_set_gpr(vcpu, 4, rand);
1330
break;
1331
}
1332
case H_RPT_INVALIDATE:
1333
ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
1334
kvmppc_get_gpr(vcpu, 5),
1335
kvmppc_get_gpr(vcpu, 6),
1336
kvmppc_get_gpr(vcpu, 7),
1337
kvmppc_get_gpr(vcpu, 8),
1338
kvmppc_get_gpr(vcpu, 9));
1339
break;
1340
1341
case H_SET_PARTITION_TABLE:
1342
ret = H_FUNCTION;
1343
if (nesting_enabled(kvm))
1344
ret = kvmhv_set_partition_table(vcpu);
1345
break;
1346
case H_ENTER_NESTED:
1347
ret = H_FUNCTION;
1348
if (!nesting_enabled(kvm))
1349
break;
1350
ret = kvmhv_enter_nested_guest(vcpu);
1351
if (ret == H_INTERRUPT) {
1352
kvmppc_set_gpr(vcpu, 3, 0);
1353
vcpu->arch.hcall_needed = 0;
1354
return -EINTR;
1355
} else if (ret == H_TOO_HARD) {
1356
kvmppc_set_gpr(vcpu, 3, 0);
1357
vcpu->arch.hcall_needed = 0;
1358
return RESUME_HOST;
1359
}
1360
break;
1361
case H_TLB_INVALIDATE:
1362
ret = H_FUNCTION;
1363
if (nesting_enabled(kvm))
1364
ret = kvmhv_do_nested_tlbie(vcpu);
1365
break;
1366
case H_COPY_TOFROM_GUEST:
1367
ret = H_FUNCTION;
1368
if (nesting_enabled(kvm))
1369
ret = kvmhv_copy_tofrom_guest_nested(vcpu);
1370
break;
1371
case H_PAGE_INIT:
1372
ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
1373
kvmppc_get_gpr(vcpu, 5),
1374
kvmppc_get_gpr(vcpu, 6));
1375
break;
1376
case H_SVM_PAGE_IN:
1377
ret = H_UNSUPPORTED;
1378
if (kvmppc_get_srr1(vcpu) & MSR_S)
1379
ret = kvmppc_h_svm_page_in(kvm,
1380
kvmppc_get_gpr(vcpu, 4),
1381
kvmppc_get_gpr(vcpu, 5),
1382
kvmppc_get_gpr(vcpu, 6));
1383
break;
1384
case H_SVM_PAGE_OUT:
1385
ret = H_UNSUPPORTED;
1386
if (kvmppc_get_srr1(vcpu) & MSR_S)
1387
ret = kvmppc_h_svm_page_out(kvm,
1388
kvmppc_get_gpr(vcpu, 4),
1389
kvmppc_get_gpr(vcpu, 5),
1390
kvmppc_get_gpr(vcpu, 6));
1391
break;
1392
case H_SVM_INIT_START:
1393
ret = H_UNSUPPORTED;
1394
if (kvmppc_get_srr1(vcpu) & MSR_S)
1395
ret = kvmppc_h_svm_init_start(kvm);
1396
break;
1397
case H_SVM_INIT_DONE:
1398
ret = H_UNSUPPORTED;
1399
if (kvmppc_get_srr1(vcpu) & MSR_S)
1400
ret = kvmppc_h_svm_init_done(kvm);
1401
break;
1402
case H_SVM_INIT_ABORT:
1403
/*
1404
* Even if that call is made by the Ultravisor, the SSR1 value
1405
* is the guest context one, with the secure bit clear as it has
1406
* not yet been secured. So we can't check it here.
1407
* Instead the kvm->arch.secure_guest flag is checked inside
1408
* kvmppc_h_svm_init_abort().
1409
*/
1410
ret = kvmppc_h_svm_init_abort(kvm);
1411
break;
1412
1413
default:
1414
return RESUME_HOST;
1415
}
1416
WARN_ON_ONCE(ret == H_TOO_HARD);
1417
kvmppc_set_gpr(vcpu, 3, ret);
1418
vcpu->arch.hcall_needed = 0;
1419
return RESUME_GUEST;
1420
}
1421
1422
/*
1423
* Handle H_CEDE in the P9 path where we don't call the real-mode hcall
1424
* handlers in book3s_hv_rmhandlers.S.
1425
*
1426
* This has to be done early, not in kvmppc_pseries_do_hcall(), so
1427
* that the cede logic in kvmppc_run_single_vcpu() works properly.
1428
*/
1429
static void kvmppc_cede(struct kvm_vcpu *vcpu)
1430
{
1431
__kvmppc_set_msr_hv(vcpu, __kvmppc_get_msr_hv(vcpu) | MSR_EE);
1432
vcpu->arch.ceded = 1;
1433
smp_mb();
1434
if (vcpu->arch.prodded) {
1435
vcpu->arch.prodded = 0;
1436
smp_mb();
1437
vcpu->arch.ceded = 0;
1438
}
1439
}
1440
1441
static int kvmppc_hcall_impl_hv(unsigned long cmd)
1442
{
1443
switch (cmd) {
1444
case H_CEDE:
1445
case H_PROD:
1446
case H_CONFER:
1447
case H_REGISTER_VPA:
1448
case H_SET_MODE:
1449
#ifdef CONFIG_SPAPR_TCE_IOMMU
1450
case H_GET_TCE:
1451
case H_PUT_TCE:
1452
case H_PUT_TCE_INDIRECT:
1453
case H_STUFF_TCE:
1454
#endif
1455
case H_LOGICAL_CI_LOAD:
1456
case H_LOGICAL_CI_STORE:
1457
#ifdef CONFIG_KVM_XICS
1458
case H_XIRR:
1459
case H_CPPR:
1460
case H_EOI:
1461
case H_IPI:
1462
case H_IPOLL:
1463
case H_XIRR_X:
1464
#endif
1465
case H_PAGE_INIT:
1466
case H_RPT_INVALIDATE:
1467
return 1;
1468
}
1469
1470
/* See if it's in the real-mode table */
1471
return kvmppc_hcall_impl_hv_realmode(cmd);
1472
}
1473
1474
static int kvmppc_emulate_debug_inst(struct kvm_vcpu *vcpu)
1475
{
1476
ppc_inst_t last_inst;
1477
1478
if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
1479
EMULATE_DONE) {
1480
/*
1481
* Fetch failed, so return to guest and
1482
* try executing it again.
1483
*/
1484
return RESUME_GUEST;
1485
}
1486
1487
if (ppc_inst_val(last_inst) == KVMPPC_INST_SW_BREAKPOINT) {
1488
vcpu->run->exit_reason = KVM_EXIT_DEBUG;
1489
vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
1490
return RESUME_HOST;
1491
} else {
1492
kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
1493
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1494
return RESUME_GUEST;
1495
}
1496
}
1497
1498
static void do_nothing(void *x)
1499
{
1500
}
1501
1502
static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
1503
{
1504
int thr, cpu, pcpu, nthreads;
1505
struct kvm_vcpu *v;
1506
unsigned long dpdes;
1507
1508
nthreads = vcpu->kvm->arch.emul_smt_mode;
1509
dpdes = 0;
1510
cpu = vcpu->vcpu_id & ~(nthreads - 1);
1511
for (thr = 0; thr < nthreads; ++thr, ++cpu) {
1512
v = kvmppc_find_vcpu(vcpu->kvm, cpu);
1513
if (!v)
1514
continue;
1515
/*
1516
* If the vcpu is currently running on a physical cpu thread,
1517
* interrupt it in order to pull it out of the guest briefly,
1518
* which will update its vcore->dpdes value.
1519
*/
1520
pcpu = READ_ONCE(v->cpu);
1521
if (pcpu >= 0)
1522
smp_call_function_single(pcpu, do_nothing, NULL, 1);
1523
if (kvmppc_doorbell_pending(v))
1524
dpdes |= 1 << thr;
1525
}
1526
return dpdes;
1527
}
1528
1529
/*
1530
* On POWER9, emulate doorbell-related instructions in order to
1531
* give the guest the illusion of running on a multi-threaded core.
1532
* The instructions emulated are msgsndp, msgclrp, mfspr TIR,
1533
* and mfspr DPDES.
1534
*/
1535
static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
1536
{
1537
u32 inst, rb, thr;
1538
unsigned long arg;
1539
struct kvm *kvm = vcpu->kvm;
1540
struct kvm_vcpu *tvcpu;
1541
ppc_inst_t pinst;
1542
1543
if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst) != EMULATE_DONE)
1544
return RESUME_GUEST;
1545
inst = ppc_inst_val(pinst);
1546
if (get_op(inst) != 31)
1547
return EMULATE_FAIL;
1548
rb = get_rb(inst);
1549
thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
1550
switch (get_xop(inst)) {
1551
case OP_31_XOP_MSGSNDP:
1552
arg = kvmppc_get_gpr(vcpu, rb);
1553
if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
1554
break;
1555
arg &= 0x7f;
1556
if (arg >= kvm->arch.emul_smt_mode)
1557
break;
1558
tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
1559
if (!tvcpu)
1560
break;
1561
if (!tvcpu->arch.doorbell_request) {
1562
tvcpu->arch.doorbell_request = 1;
1563
kvmppc_fast_vcpu_kick_hv(tvcpu);
1564
}
1565
break;
1566
case OP_31_XOP_MSGCLRP:
1567
arg = kvmppc_get_gpr(vcpu, rb);
1568
if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
1569
break;
1570
vcpu->arch.vcore->dpdes = 0;
1571
vcpu->arch.doorbell_request = 0;
1572
break;
1573
case OP_31_XOP_MFSPR:
1574
switch (get_sprn(inst)) {
1575
case SPRN_TIR:
1576
arg = thr;
1577
break;
1578
case SPRN_DPDES:
1579
arg = kvmppc_read_dpdes(vcpu);
1580
break;
1581
default:
1582
return EMULATE_FAIL;
1583
}
1584
kvmppc_set_gpr(vcpu, get_rt(inst), arg);
1585
break;
1586
default:
1587
return EMULATE_FAIL;
1588
}
1589
kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
1590
return RESUME_GUEST;
1591
}
1592
1593
/*
1594
* If the lppaca had pmcregs_in_use clear when we exited the guest, then
1595
* HFSCR_PM is cleared for next entry. If the guest then tries to access
1596
* the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM
1597
* back in the guest HFSCR will cause the next entry to load the PMU SPRs and
1598
* allow the guest access to continue.
1599
*/
1600
static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
1601
{
1602
if (!(vcpu->arch.hfscr_permitted & HFSCR_PM))
1603
return EMULATE_FAIL;
1604
1605
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_PM);
1606
1607
return RESUME_GUEST;
1608
}
1609
1610
static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu)
1611
{
1612
if (!(vcpu->arch.hfscr_permitted & HFSCR_EBB))
1613
return EMULATE_FAIL;
1614
1615
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_EBB);
1616
1617
return RESUME_GUEST;
1618
}
1619
1620
static int kvmppc_tm_unavailable(struct kvm_vcpu *vcpu)
1621
{
1622
if (!(vcpu->arch.hfscr_permitted & HFSCR_TM))
1623
return EMULATE_FAIL;
1624
1625
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_TM);
1626
1627
return RESUME_GUEST;
1628
}
1629
1630
static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
1631
struct task_struct *tsk)
1632
{
1633
struct kvm_run *run = vcpu->run;
1634
int r = RESUME_HOST;
1635
1636
vcpu->stat.sum_exits++;
1637
1638
/*
1639
* This can happen if an interrupt occurs in the last stages
1640
* of guest entry or the first stages of guest exit (i.e. after
1641
* setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
1642
* and before setting it to KVM_GUEST_MODE_HOST_HV).
1643
* That can happen due to a bug, or due to a machine check
1644
* occurring at just the wrong time.
1645
*/
1646
if (!kvmhv_is_nestedv2() && (__kvmppc_get_msr_hv(vcpu) & MSR_HV)) {
1647
printk(KERN_EMERG "KVM trap in HV mode!\n");
1648
printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
1649
vcpu->arch.trap, kvmppc_get_pc(vcpu),
1650
vcpu->arch.shregs.msr);
1651
kvmppc_dump_regs(vcpu);
1652
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
1653
run->hw.hardware_exit_reason = vcpu->arch.trap;
1654
return RESUME_HOST;
1655
}
1656
run->exit_reason = KVM_EXIT_UNKNOWN;
1657
run->ready_for_interrupt_injection = 1;
1658
switch (vcpu->arch.trap) {
1659
/* We're good on these - the host merely wanted to get our attention */
1660
case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
1661
WARN_ON_ONCE(1); /* Should never happen */
1662
vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
1663
fallthrough;
1664
case BOOK3S_INTERRUPT_HV_DECREMENTER:
1665
vcpu->stat.dec_exits++;
1666
r = RESUME_GUEST;
1667
break;
1668
case BOOK3S_INTERRUPT_EXTERNAL:
1669
case BOOK3S_INTERRUPT_H_DOORBELL:
1670
case BOOK3S_INTERRUPT_H_VIRT:
1671
vcpu->stat.ext_intr_exits++;
1672
r = RESUME_GUEST;
1673
break;
1674
/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
1675
case BOOK3S_INTERRUPT_HMI:
1676
case BOOK3S_INTERRUPT_PERFMON:
1677
case BOOK3S_INTERRUPT_SYSTEM_RESET:
1678
r = RESUME_GUEST;
1679
break;
1680
case BOOK3S_INTERRUPT_MACHINE_CHECK: {
1681
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
1682
DEFAULT_RATELIMIT_BURST);
1683
/*
1684
* Print the MCE event to host console. Ratelimit so the guest
1685
* can't flood the host log.
1686
*/
1687
if (__ratelimit(&rs))
1688
machine_check_print_event_info(&vcpu->arch.mce_evt,false, true);
1689
1690
/*
1691
* If the guest can do FWNMI, exit to userspace so it can
1692
* deliver a FWNMI to the guest.
1693
* Otherwise we synthesize a machine check for the guest
1694
* so that it knows that the machine check occurred.
1695
*/
1696
if (!vcpu->kvm->arch.fwnmi_enabled) {
1697
ulong flags = (__kvmppc_get_msr_hv(vcpu) & 0x083c0000) |
1698
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
1699
kvmppc_core_queue_machine_check(vcpu, flags);
1700
r = RESUME_GUEST;
1701
break;
1702
}
1703
1704
/* Exit to guest with KVM_EXIT_NMI as exit reason */
1705
run->exit_reason = KVM_EXIT_NMI;
1706
run->hw.hardware_exit_reason = vcpu->arch.trap;
1707
/* Clear out the old NMI status from run->flags */
1708
run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
1709
/* Now set the NMI status */
1710
if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
1711
run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
1712
else
1713
run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
1714
1715
r = RESUME_HOST;
1716
break;
1717
}
1718
case BOOK3S_INTERRUPT_PROGRAM:
1719
{
1720
ulong flags;
1721
/*
1722
* Normally program interrupts are delivered directly
1723
* to the guest by the hardware, but we can get here
1724
* as a result of a hypervisor emulation interrupt
1725
* (e40) getting turned into a 700 by BML RTAS.
1726
*/
1727
flags = (__kvmppc_get_msr_hv(vcpu) & 0x1f0000ull) |
1728
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
1729
kvmppc_core_queue_program(vcpu, flags);
1730
r = RESUME_GUEST;
1731
break;
1732
}
1733
case BOOK3S_INTERRUPT_SYSCALL:
1734
{
1735
int i;
1736
1737
if (!kvmhv_is_nestedv2() && unlikely(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
1738
/*
1739
* Guest userspace executed sc 1. This can only be
1740
* reached by the P9 path because the old path
1741
* handles this case in realmode hcall handlers.
1742
*/
1743
if (!kvmhv_vcpu_is_radix(vcpu)) {
1744
/*
1745
* A guest could be running PR KVM, so this
1746
* may be a PR KVM hcall. It must be reflected
1747
* to the guest kernel as a sc interrupt.
1748
*/
1749
kvmppc_core_queue_syscall(vcpu);
1750
} else {
1751
/*
1752
* Radix guests can not run PR KVM or nested HV
1753
* hash guests which might run PR KVM, so this
1754
* is always a privilege fault. Send a program
1755
* check to guest kernel.
1756
*/
1757
kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
1758
}
1759
r = RESUME_GUEST;
1760
break;
1761
}
1762
1763
/*
1764
* hcall - gather args and set exit_reason. This will next be
1765
* handled by kvmppc_pseries_do_hcall which may be able to deal
1766
* with it and resume guest, or may punt to userspace.
1767
*/
1768
run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
1769
for (i = 0; i < 9; ++i)
1770
run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
1771
run->exit_reason = KVM_EXIT_PAPR_HCALL;
1772
vcpu->arch.hcall_needed = 1;
1773
r = RESUME_HOST;
1774
break;
1775
}
1776
/*
1777
* We get these next two if the guest accesses a page which it thinks
1778
* it has mapped but which is not actually present, either because
1779
* it is for an emulated I/O device or because the corresponding
1780
* host page has been paged out.
1781
*
1782
* Any other HDSI/HISI interrupts have been handled already for P7/8
1783
* guests. For POWER9 hash guests not using rmhandlers, basic hash
1784
* fault handling is done here.
1785
*/
1786
case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
1787
unsigned long vsid;
1788
long err;
1789
1790
if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
1791
unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) {
1792
r = RESUME_GUEST; /* Just retry if it's the canary */
1793
break;
1794
}
1795
1796
if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
1797
/*
1798
* Radix doesn't require anything, and pre-ISAv3.0 hash
1799
* already attempted to handle this in rmhandlers. The
1800
* hash fault handling below is v3 only (it uses ASDR
1801
* via fault_gpa).
1802
*/
1803
r = RESUME_PAGE_FAULT;
1804
break;
1805
}
1806
1807
if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
1808
kvmppc_core_queue_data_storage(vcpu,
1809
kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
1810
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
1811
r = RESUME_GUEST;
1812
break;
1813
}
1814
1815
if (!(__kvmppc_get_msr_hv(vcpu) & MSR_DR))
1816
vsid = vcpu->kvm->arch.vrma_slb_v;
1817
else
1818
vsid = vcpu->arch.fault_gpa;
1819
1820
err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
1821
vsid, vcpu->arch.fault_dsisr, true);
1822
if (err == 0) {
1823
r = RESUME_GUEST;
1824
} else if (err == -1 || err == -2) {
1825
r = RESUME_PAGE_FAULT;
1826
} else {
1827
kvmppc_core_queue_data_storage(vcpu,
1828
kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
1829
vcpu->arch.fault_dar, err);
1830
r = RESUME_GUEST;
1831
}
1832
break;
1833
}
1834
case BOOK3S_INTERRUPT_H_INST_STORAGE: {
1835
unsigned long vsid;
1836
long err;
1837
1838
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1839
vcpu->arch.fault_dsisr = __kvmppc_get_msr_hv(vcpu) &
1840
DSISR_SRR1_MATCH_64S;
1841
if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
1842
/*
1843
* Radix doesn't require anything, and pre-ISAv3.0 hash
1844
* already attempted to handle this in rmhandlers. The
1845
* hash fault handling below is v3 only (it uses ASDR
1846
* via fault_gpa).
1847
*/
1848
if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
1849
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
1850
r = RESUME_PAGE_FAULT;
1851
break;
1852
}
1853
1854
if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
1855
kvmppc_core_queue_inst_storage(vcpu,
1856
vcpu->arch.fault_dsisr |
1857
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1858
r = RESUME_GUEST;
1859
break;
1860
}
1861
1862
if (!(__kvmppc_get_msr_hv(vcpu) & MSR_IR))
1863
vsid = vcpu->kvm->arch.vrma_slb_v;
1864
else
1865
vsid = vcpu->arch.fault_gpa;
1866
1867
err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
1868
vsid, vcpu->arch.fault_dsisr, false);
1869
if (err == 0) {
1870
r = RESUME_GUEST;
1871
} else if (err == -1) {
1872
r = RESUME_PAGE_FAULT;
1873
} else {
1874
kvmppc_core_queue_inst_storage(vcpu,
1875
err | (kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1876
r = RESUME_GUEST;
1877
}
1878
break;
1879
}

	/*
	 * This occurs if the guest executes an illegal instruction.
	 * If the guest debug is disabled, generate a program interrupt
	 * to the guest. If guest debug is enabled, we need to check
	 * whether the instruction is a software breakpoint instruction.
	 * Accordingly return to Guest or Host.
	 */
	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
		if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
			vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
				swab32(vcpu->arch.emul_inst) :
				vcpu->arch.emul_inst;
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
			r = kvmppc_emulate_debug_inst(vcpu);
		} else {
			kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
				(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
			r = RESUME_GUEST;
		}
		break;

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
		/*
		 * This occurs for various TM-related instructions that
		 * we need to emulate on POWER9 DD2.2. We have already
		 * handled the cases where the guest was in real-suspend
		 * mode and was transitioning to transactional state.
		 */
		r = kvmhv_p9_tm_emulation(vcpu);
		if (r != -1)
			break;
		fallthrough; /* go to facility unavailable handler */
#endif

	/*
	 * This occurs if the guest (kernel or userspace) does something that
	 * is prohibited by HFSCR.
	 * On POWER9, this could be a doorbell instruction that we need
	 * to emulate.
	 * Otherwise, we just generate a program interrupt to the guest.
	 */
	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
		u64 cause = kvmppc_get_hfscr_hv(vcpu) >> 56;

		r = EMULATE_FAIL;
		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			switch (cause) {
			case FSCR_MSGP_LG:
				r = kvmppc_emulate_doorbell_instr(vcpu);
				break;
			case FSCR_PM_LG:
				r = kvmppc_pmu_unavailable(vcpu);
				break;
			case FSCR_EBB_LG:
				r = kvmppc_ebb_unavailable(vcpu);
				break;
			case FSCR_TM_LG:
				r = kvmppc_tm_unavailable(vcpu);
				break;
			default:
				break;
			}
		}
		if (r == EMULATE_FAIL) {
			kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
				(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
			r = RESUME_GUEST;
		}
		break;
	}

	case BOOK3S_INTERRUPT_HV_RM_HARD:
		r = RESUME_PASSTHROUGH;
		break;
	default:
		kvmppc_dump_regs(vcpu);
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			__kvmppc_get_msr_hv(vcpu));
		run->hw.hardware_exit_reason = vcpu->arch.trap;
		r = RESUME_HOST;
		break;
	}

	return r;
}

static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
{
	int r;
	int srcu_idx;

	vcpu->stat.sum_exits++;

	/*
	 * This can happen if an interrupt occurs in the last stages
	 * of guest entry or the first stages of guest exit (i.e. after
	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
	 * That can happen due to a bug, or due to a machine check
	 * occurring at just the wrong time.
	 */
	if (__kvmppc_get_msr_hv(vcpu) & MSR_HV) {
		pr_emerg("KVM trap in HV mode while nested!\n");
		pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			 vcpu->arch.trap, kvmppc_get_pc(vcpu),
			 __kvmppc_get_msr_hv(vcpu));
		kvmppc_dump_regs(vcpu);
		return RESUME_HOST;
	}
	switch (vcpu->arch.trap) {
	/* We're good on these - the host merely wanted to get our attention */
	case BOOK3S_INTERRUPT_HV_DECREMENTER:
		vcpu->stat.dec_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_EXTERNAL:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_HOST;
		break;
	case BOOK3S_INTERRUPT_H_DOORBELL:
	case BOOK3S_INTERRUPT_H_VIRT:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_GUEST;
		break;
	/* These need to go to the nested HV */
	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
		vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
		vcpu->stat.dec_exits++;
		r = RESUME_HOST;
		break;
	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest. */
	case BOOK3S_INTERRUPT_HMI:
	case BOOK3S_INTERRUPT_PERFMON:
	case BOOK3S_INTERRUPT_SYSTEM_RESET:
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_MACHINE_CHECK:
	{
		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);
		/* Pass the machine check to the L1 guest */
		r = RESUME_HOST;
		/* Print the MCE event to host console. */
		if (__ratelimit(&rs))
			machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
		break;
	}
	/*
	 * We get these next two if the guest accesses a page which it thinks
	 * it has mapped but which is not actually present, either because
	 * it is for an emulated I/O device or because the corresponding
	 * host page has been paged out.
	 */
	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = kvmhv_nested_page_fault(vcpu);
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
		break;
	case BOOK3S_INTERRUPT_H_INST_STORAGE:
		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
			DSISR_SRR1_MATCH_64S;
		if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = kvmhv_nested_page_fault(vcpu);
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
		break;

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
		/*
		 * This occurs for various TM-related instructions that
		 * we need to emulate on POWER9 DD2.2. We have already
		 * handled the cases where the guest was in real-suspend
		 * mode and was transitioning to transactional state.
		 */
		r = kvmhv_p9_tm_emulation(vcpu);
		if (r != -1)
			break;
		fallthrough; /* go to facility unavailable handler */
#endif

	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
		r = RESUME_HOST;
		break;

	case BOOK3S_INTERRUPT_HV_RM_HARD:
		vcpu->arch.trap = 0;
		r = RESUME_GUEST;
		if (!xics_on_xive())
			kvmppc_xics_rm_complete(vcpu, 0);
		break;
	case BOOK3S_INTERRUPT_SYSCALL:
	{
		unsigned long req = kvmppc_get_gpr(vcpu, 3);

		/*
		 * The H_RPT_INVALIDATE hcalls issued by nested
		 * guests for process-scoped invalidations when
		 * GTSE=0, are handled here in L0.
		 */
		if (req == H_RPT_INVALIDATE) {
			r = kvmppc_nested_h_rpt_invalidate(vcpu);
			break;
		}

		r = RESUME_HOST;
		break;
	}
	default:
		r = RESUME_HOST;
		break;
	}

	return r;
}

static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int i;

	memset(sregs, 0, sizeof(struct kvm_sregs));
	sregs->pvr = vcpu->arch.pvr;
	for (i = 0; i < vcpu->arch.slb_max; i++) {
		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
	}

	return 0;
}

static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int i, j;

	/* Only accept the same PVR as the host's, since we can't spoof it */
	if (sregs->pvr != vcpu->arch.pvr)
		return -EINVAL;

	j = 0;
	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
			++j;
		}
	}
	vcpu->arch.slb_max = j;

	return 0;
}

/*
 * Enforce limits on guest LPCR values based on hardware availability,
 * guest configuration, and possibly hypervisor support and security
 * concerns.
 */
unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr)
{
	/* LPCR_TC only applies to HPT guests */
	if (kvm_is_radix(kvm))
		lpcr &= ~LPCR_TC;

	/* On POWER8 and above, userspace can modify AIL */
	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
		lpcr &= ~LPCR_AIL;
	if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
		lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
	/*
	 * On some POWER9s we force AIL off for radix guests to prevent
	 * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
	 * guest, which can result in Q0 translations with LPID=0 PID=PIDR to
	 * be cached, which the host TLB management does not expect.
	 */
	if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
		lpcr &= ~LPCR_AIL;

	/*
	 * On POWER9, allow userspace to enable large decrementer for the
	 * guest, whether or not the host has it enabled.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		lpcr &= ~LPCR_LD;

	return lpcr;
}

static void verify_lpcr(struct kvm *kvm, unsigned long lpcr)
{
	if (lpcr != kvmppc_filter_lpcr_hv(kvm, lpcr)) {
		WARN_ONCE(1, "lpcr 0x%lx differs from filtered 0x%lx\n",
			  lpcr, kvmppc_filter_lpcr_hv(kvm, lpcr));
	}
}

static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
		bool preserve_top32)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	u64 mask;

	spin_lock(&vc->lock);

	/*
	 * Userspace can only modify
	 * DPFD (default prefetch depth), ILE (interrupt little-endian),
	 * TC (translation control), AIL (alternate interrupt location),
	 * LD (large decrementer).
	 * These are subject to restrictions from kvmppc_filter_lpcr_hv().
	 */
	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD;

	/* Broken 32-bit version of LPCR must not clear top bits */
	if (preserve_top32)
		mask &= 0xFFFFFFFF;

	new_lpcr = kvmppc_filter_lpcr_hv(kvm,
			(vc->lpcr & ~mask) | (new_lpcr & mask));

	/*
	 * If ILE (interrupt little-endian) has changed, update the
	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
	 */
	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
		struct kvm_vcpu *vcpu;
		unsigned long i;

		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (vcpu->arch.vcore != vc)
				continue;
			if (new_lpcr & LPCR_ILE)
				vcpu->arch.intr_msr |= MSR_LE;
			else
				vcpu->arch.intr_msr &= ~MSR_LE;
		}
	}

	vc->lpcr = new_lpcr;
	kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LPCR);

	spin_unlock(&vc->lock);
}
2229
2230
static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
2231
union kvmppc_one_reg *val)
2232
{
2233
int r = 0;
2234
long int i;
2235
2236
switch (id) {
2237
case KVM_REG_PPC_DEBUG_INST:
2238
*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
2239
break;
2240
case KVM_REG_PPC_HIOR:
2241
*val = get_reg_val(id, 0);
2242
break;
2243
case KVM_REG_PPC_DABR:
2244
*val = get_reg_val(id, vcpu->arch.dabr);
2245
break;
2246
case KVM_REG_PPC_DABRX:
2247
*val = get_reg_val(id, vcpu->arch.dabrx);
2248
break;
2249
case KVM_REG_PPC_DSCR:
2250
*val = get_reg_val(id, kvmppc_get_dscr_hv(vcpu));
2251
break;
2252
case KVM_REG_PPC_PURR:
2253
*val = get_reg_val(id, kvmppc_get_purr_hv(vcpu));
2254
break;
2255
case KVM_REG_PPC_SPURR:
2256
*val = get_reg_val(id, kvmppc_get_spurr_hv(vcpu));
2257
break;
2258
case KVM_REG_PPC_AMR:
2259
*val = get_reg_val(id, kvmppc_get_amr_hv(vcpu));
2260
break;
2261
case KVM_REG_PPC_UAMOR:
2262
*val = get_reg_val(id, kvmppc_get_uamor_hv(vcpu));
2263
break;
2264
case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
2265
i = id - KVM_REG_PPC_MMCR0;
2266
*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, i));
2267
break;
2268
case KVM_REG_PPC_MMCR2:
2269
*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, 2));
2270
break;
2271
case KVM_REG_PPC_MMCRA:
2272
*val = get_reg_val(id, kvmppc_get_mmcra_hv(vcpu));
2273
break;
2274
case KVM_REG_PPC_MMCRS:
2275
*val = get_reg_val(id, vcpu->arch.mmcrs);
2276
break;
2277
case KVM_REG_PPC_MMCR3:
2278
*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, 3));
2279
break;
2280
case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
2281
i = id - KVM_REG_PPC_PMC1;
2282
*val = get_reg_val(id, kvmppc_get_pmc_hv(vcpu, i));
2283
break;
2284
case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
2285
i = id - KVM_REG_PPC_SPMC1;
2286
*val = get_reg_val(id, vcpu->arch.spmc[i]);
2287
break;
2288
case KVM_REG_PPC_SIAR:
2289
*val = get_reg_val(id, kvmppc_get_siar_hv(vcpu));
2290
break;
2291
case KVM_REG_PPC_SDAR:
2292
*val = get_reg_val(id, kvmppc_get_sdar_hv(vcpu));
2293
break;
2294
case KVM_REG_PPC_SIER:
2295
*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 0));
2296
break;
2297
case KVM_REG_PPC_SIER2:
2298
*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 1));
2299
break;
2300
case KVM_REG_PPC_SIER3:
2301
*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 2));
2302
break;
2303
case KVM_REG_PPC_IAMR:
2304
*val = get_reg_val(id, kvmppc_get_iamr_hv(vcpu));
2305
break;
2306
case KVM_REG_PPC_PSPB:
2307
*val = get_reg_val(id, kvmppc_get_pspb_hv(vcpu));
2308
break;
2309
case KVM_REG_PPC_DPDES:
2310
/*
2311
* On POWER9, where we are emulating msgsndp etc.,
2312
* we return 1 bit for each vcpu, which can come from
2313
* either vcore->dpdes or doorbell_request.
2314
* On POWER8, doorbell_request is 0.
2315
*/
2316
if (cpu_has_feature(CPU_FTR_ARCH_300))
2317
*val = get_reg_val(id, vcpu->arch.doorbell_request);
2318
else
2319
*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
2320
break;
2321
case KVM_REG_PPC_VTB:
2322
*val = get_reg_val(id, kvmppc_get_vtb(vcpu));
2323
break;
2324
case KVM_REG_PPC_DAWR:
2325
*val = get_reg_val(id, kvmppc_get_dawr0_hv(vcpu));
2326
break;
2327
case KVM_REG_PPC_DAWRX:
2328
*val = get_reg_val(id, kvmppc_get_dawrx0_hv(vcpu));
2329
break;
2330
case KVM_REG_PPC_DAWR1:
2331
*val = get_reg_val(id, kvmppc_get_dawr1_hv(vcpu));
2332
break;
2333
case KVM_REG_PPC_DAWRX1:
2334
*val = get_reg_val(id, kvmppc_get_dawrx1_hv(vcpu));
2335
break;
2336
case KVM_REG_PPC_DEXCR:
2337
*val = get_reg_val(id, kvmppc_get_dexcr_hv(vcpu));
2338
break;
2339
case KVM_REG_PPC_HASHKEYR:
2340
*val = get_reg_val(id, kvmppc_get_hashkeyr_hv(vcpu));
2341
break;
2342
case KVM_REG_PPC_HASHPKEYR:
2343
*val = get_reg_val(id, kvmppc_get_hashpkeyr_hv(vcpu));
2344
break;
2345
case KVM_REG_PPC_CIABR:
2346
*val = get_reg_val(id, kvmppc_get_ciabr_hv(vcpu));
2347
break;
2348
case KVM_REG_PPC_CSIGR:
2349
*val = get_reg_val(id, vcpu->arch.csigr);
2350
break;
2351
case KVM_REG_PPC_TACR:
2352
*val = get_reg_val(id, vcpu->arch.tacr);
2353
break;
2354
case KVM_REG_PPC_TCSCR:
2355
*val = get_reg_val(id, vcpu->arch.tcscr);
2356
break;
2357
case KVM_REG_PPC_PID:
2358
*val = get_reg_val(id, kvmppc_get_pid(vcpu));
2359
break;
2360
case KVM_REG_PPC_ACOP:
2361
*val = get_reg_val(id, vcpu->arch.acop);
2362
break;
2363
case KVM_REG_PPC_WORT:
2364
*val = get_reg_val(id, kvmppc_get_wort_hv(vcpu));
2365
break;
2366
case KVM_REG_PPC_TIDR:
2367
*val = get_reg_val(id, vcpu->arch.tid);
2368
break;
2369
case KVM_REG_PPC_PSSCR:
2370
*val = get_reg_val(id, vcpu->arch.psscr);
2371
break;
2372
case KVM_REG_PPC_VPA_ADDR:
2373
spin_lock(&vcpu->arch.vpa_update_lock);
2374
*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
2375
spin_unlock(&vcpu->arch.vpa_update_lock);
2376
break;
2377
case KVM_REG_PPC_VPA_SLB:
2378
spin_lock(&vcpu->arch.vpa_update_lock);
2379
val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
2380
val->vpaval.length = vcpu->arch.slb_shadow.len;
2381
spin_unlock(&vcpu->arch.vpa_update_lock);
2382
break;
2383
case KVM_REG_PPC_VPA_DTL:
2384
spin_lock(&vcpu->arch.vpa_update_lock);
2385
val->vpaval.addr = vcpu->arch.dtl.next_gpa;
2386
val->vpaval.length = vcpu->arch.dtl.len;
2387
spin_unlock(&vcpu->arch.vpa_update_lock);
2388
break;
2389
case KVM_REG_PPC_TB_OFFSET:
2390
*val = get_reg_val(id, kvmppc_get_tb_offset(vcpu));
2391
break;
2392
case KVM_REG_PPC_LPCR:
2393
case KVM_REG_PPC_LPCR_64:
2394
*val = get_reg_val(id, kvmppc_get_lpcr(vcpu));
2395
break;
2396
case KVM_REG_PPC_PPR:
2397
*val = get_reg_val(id, kvmppc_get_ppr_hv(vcpu));
2398
break;
2399
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2400
case KVM_REG_PPC_TFHAR:
2401
*val = get_reg_val(id, vcpu->arch.tfhar);
2402
break;
2403
case KVM_REG_PPC_TFIAR:
2404
*val = get_reg_val(id, vcpu->arch.tfiar);
2405
break;
2406
case KVM_REG_PPC_TEXASR:
2407
*val = get_reg_val(id, vcpu->arch.texasr);
2408
break;
2409
case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
2410
i = id - KVM_REG_PPC_TM_GPR0;
2411
*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
2412
break;
2413
case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
2414
{
2415
int j;
2416
i = id - KVM_REG_PPC_TM_VSR0;
2417
if (i < 32)
2418
for (j = 0; j < TS_FPRWIDTH; j++)
2419
val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
2420
else {
2421
if (cpu_has_feature(CPU_FTR_ALTIVEC))
2422
val->vval = vcpu->arch.vr_tm.vr[i-32];
2423
else
2424
r = -ENXIO;
2425
}
2426
break;
2427
}
2428
case KVM_REG_PPC_TM_CR:
2429
*val = get_reg_val(id, vcpu->arch.cr_tm);
2430
break;
2431
case KVM_REG_PPC_TM_XER:
2432
*val = get_reg_val(id, vcpu->arch.xer_tm);
2433
break;
2434
case KVM_REG_PPC_TM_LR:
2435
*val = get_reg_val(id, vcpu->arch.lr_tm);
2436
break;
2437
case KVM_REG_PPC_TM_CTR:
2438
*val = get_reg_val(id, vcpu->arch.ctr_tm);
2439
break;
2440
case KVM_REG_PPC_TM_FPSCR:
2441
*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
2442
break;
2443
case KVM_REG_PPC_TM_AMR:
2444
*val = get_reg_val(id, vcpu->arch.amr_tm);
2445
break;
2446
case KVM_REG_PPC_TM_PPR:
2447
*val = get_reg_val(id, vcpu->arch.ppr_tm);
2448
break;
2449
case KVM_REG_PPC_TM_VRSAVE:
2450
*val = get_reg_val(id, vcpu->arch.vrsave_tm);
2451
break;
2452
case KVM_REG_PPC_TM_VSCR:
2453
if (cpu_has_feature(CPU_FTR_ALTIVEC))
2454
*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
2455
else
2456
r = -ENXIO;
2457
break;
2458
case KVM_REG_PPC_TM_DSCR:
2459
*val = get_reg_val(id, vcpu->arch.dscr_tm);
2460
break;
2461
case KVM_REG_PPC_TM_TAR:
2462
*val = get_reg_val(id, vcpu->arch.tar_tm);
2463
break;
2464
#endif
2465
case KVM_REG_PPC_ARCH_COMPAT:
2466
*val = get_reg_val(id, kvmppc_get_arch_compat(vcpu));
2467
break;
2468
case KVM_REG_PPC_DEC_EXPIRY:
2469
*val = get_reg_val(id, kvmppc_get_dec_expires(vcpu));
2470
break;
2471
case KVM_REG_PPC_ONLINE:
2472
*val = get_reg_val(id, vcpu->arch.online);
2473
break;
2474
case KVM_REG_PPC_PTCR:
2475
*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
2476
break;
2477
case KVM_REG_PPC_FSCR:
2478
*val = get_reg_val(id, kvmppc_get_fscr_hv(vcpu));
2479
break;
2480
default:
2481
r = -EINVAL;
2482
break;
2483
}
2484
2485
return r;
2486
}
2487
2488
static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
2489
union kvmppc_one_reg *val)
2490
{
2491
int r = 0;
2492
long int i;
2493
unsigned long addr, len;
2494
2495
switch (id) {
2496
case KVM_REG_PPC_HIOR:
2497
/* Only allow this to be set to zero */
2498
if (set_reg_val(id, *val))
2499
r = -EINVAL;
2500
break;
2501
case KVM_REG_PPC_DABR:
2502
vcpu->arch.dabr = set_reg_val(id, *val);
2503
break;
2504
case KVM_REG_PPC_DABRX:
2505
vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
2506
break;
2507
case KVM_REG_PPC_DSCR:
2508
kvmppc_set_dscr_hv(vcpu, set_reg_val(id, *val));
2509
break;
2510
case KVM_REG_PPC_PURR:
2511
kvmppc_set_purr_hv(vcpu, set_reg_val(id, *val));
2512
break;
2513
case KVM_REG_PPC_SPURR:
2514
kvmppc_set_spurr_hv(vcpu, set_reg_val(id, *val));
2515
break;
2516
case KVM_REG_PPC_AMR:
2517
kvmppc_set_amr_hv(vcpu, set_reg_val(id, *val));
2518
break;
2519
case KVM_REG_PPC_UAMOR:
2520
kvmppc_set_uamor_hv(vcpu, set_reg_val(id, *val));
2521
break;
2522
case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
2523
i = id - KVM_REG_PPC_MMCR0;
2524
kvmppc_set_mmcr_hv(vcpu, i, set_reg_val(id, *val));
2525
break;
2526
case KVM_REG_PPC_MMCR2:
2527
kvmppc_set_mmcr_hv(vcpu, 2, set_reg_val(id, *val));
2528
break;
2529
case KVM_REG_PPC_MMCRA:
2530
kvmppc_set_mmcra_hv(vcpu, set_reg_val(id, *val));
2531
break;
2532
case KVM_REG_PPC_MMCRS:
2533
vcpu->arch.mmcrs = set_reg_val(id, *val);
2534
break;
2535
case KVM_REG_PPC_MMCR3:
2536
kvmppc_set_mmcr_hv(vcpu, 3, set_reg_val(id, *val));
2537
break;
2538
case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
2539
i = id - KVM_REG_PPC_PMC1;
2540
kvmppc_set_pmc_hv(vcpu, i, set_reg_val(id, *val));
2541
break;
2542
case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
2543
i = id - KVM_REG_PPC_SPMC1;
2544
vcpu->arch.spmc[i] = set_reg_val(id, *val);
2545
break;
2546
case KVM_REG_PPC_SIAR:
2547
kvmppc_set_siar_hv(vcpu, set_reg_val(id, *val));
2548
break;
2549
case KVM_REG_PPC_SDAR:
2550
kvmppc_set_sdar_hv(vcpu, set_reg_val(id, *val));
2551
break;
2552
case KVM_REG_PPC_SIER:
2553
kvmppc_set_sier_hv(vcpu, 0, set_reg_val(id, *val));
2554
break;
2555
case KVM_REG_PPC_SIER2:
2556
kvmppc_set_sier_hv(vcpu, 1, set_reg_val(id, *val));
2557
break;
2558
case KVM_REG_PPC_SIER3:
2559
kvmppc_set_sier_hv(vcpu, 2, set_reg_val(id, *val));
2560
break;
2561
case KVM_REG_PPC_IAMR:
2562
kvmppc_set_iamr_hv(vcpu, set_reg_val(id, *val));
2563
break;
2564
case KVM_REG_PPC_PSPB:
2565
kvmppc_set_pspb_hv(vcpu, set_reg_val(id, *val));
2566
break;
2567
case KVM_REG_PPC_DPDES:
2568
if (cpu_has_feature(CPU_FTR_ARCH_300))
2569
vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
2570
else
2571
vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
2572
break;
2573
case KVM_REG_PPC_VTB:
2574
kvmppc_set_vtb(vcpu, set_reg_val(id, *val));
2575
break;
2576
case KVM_REG_PPC_DAWR:
2577
kvmppc_set_dawr0_hv(vcpu, set_reg_val(id, *val));
2578
break;
2579
case KVM_REG_PPC_DAWRX:
2580
kvmppc_set_dawrx0_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
2581
break;
2582
case KVM_REG_PPC_DAWR1:
2583
kvmppc_set_dawr1_hv(vcpu, set_reg_val(id, *val));
2584
break;
2585
case KVM_REG_PPC_DAWRX1:
2586
kvmppc_set_dawrx1_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
2587
break;
2588
case KVM_REG_PPC_DEXCR:
2589
kvmppc_set_dexcr_hv(vcpu, set_reg_val(id, *val));
2590
break;
2591
case KVM_REG_PPC_HASHKEYR:
2592
kvmppc_set_hashkeyr_hv(vcpu, set_reg_val(id, *val));
2593
break;
2594
case KVM_REG_PPC_HASHPKEYR:
2595
kvmppc_set_hashpkeyr_hv(vcpu, set_reg_val(id, *val));
2596
break;
2597
case KVM_REG_PPC_CIABR:
2598
kvmppc_set_ciabr_hv(vcpu, set_reg_val(id, *val));
2599
/* Don't allow setting breakpoints in hypervisor code */
2600
if ((kvmppc_get_ciabr_hv(vcpu) & CIABR_PRIV) == CIABR_PRIV_HYPER)
2601
kvmppc_set_ciabr_hv(vcpu, kvmppc_get_ciabr_hv(vcpu) & ~CIABR_PRIV);
2602
break;
2603
case KVM_REG_PPC_CSIGR:
2604
vcpu->arch.csigr = set_reg_val(id, *val);
2605
break;
2606
case KVM_REG_PPC_TACR:
2607
vcpu->arch.tacr = set_reg_val(id, *val);
2608
break;
2609
case KVM_REG_PPC_TCSCR:
2610
vcpu->arch.tcscr = set_reg_val(id, *val);
2611
break;
2612
case KVM_REG_PPC_PID:
2613
kvmppc_set_pid(vcpu, set_reg_val(id, *val));
2614
break;
2615
case KVM_REG_PPC_ACOP:
2616
vcpu->arch.acop = set_reg_val(id, *val);
2617
break;
2618
case KVM_REG_PPC_WORT:
2619
kvmppc_set_wort_hv(vcpu, set_reg_val(id, *val));
2620
break;
2621
case KVM_REG_PPC_TIDR:
2622
vcpu->arch.tid = set_reg_val(id, *val);
2623
break;
2624
case KVM_REG_PPC_PSSCR:
2625
vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
2626
break;
2627
case KVM_REG_PPC_VPA_ADDR:
2628
addr = set_reg_val(id, *val);
2629
r = -EINVAL;
2630
if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
2631
vcpu->arch.dtl.next_gpa))
2632
break;
2633
r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
2634
break;
2635
case KVM_REG_PPC_VPA_SLB:
2636
addr = val->vpaval.addr;
2637
len = val->vpaval.length;
2638
r = -EINVAL;
2639
if (addr && !vcpu->arch.vpa.next_gpa)
2640
break;
2641
r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
2642
break;
2643
case KVM_REG_PPC_VPA_DTL:
2644
addr = val->vpaval.addr;
2645
len = val->vpaval.length;
2646
r = -EINVAL;
2647
if (addr && (len < sizeof(struct dtl_entry) ||
2648
!vcpu->arch.vpa.next_gpa))
2649
break;
2650
len -= len % sizeof(struct dtl_entry);
2651
r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
2652
break;
2653
case KVM_REG_PPC_TB_OFFSET:
2654
{
2655
/* round up to multiple of 2^24 */
2656
u64 tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24);
2657
2658
/*
2659
* Now that we know the timebase offset, update the
2660
* decrementer expiry with a guest timebase value. If
2661
* the userspace does not set DEC_EXPIRY, this ensures
2662
* a migrated vcpu at least starts with an expired
2663
* decrementer, which is better than a large one that
2664
* causes a hang.
2665
*/
2666
kvmppc_set_tb_offset(vcpu, tb_offset);
2667
if (!kvmppc_get_dec_expires(vcpu) && tb_offset)
2668
kvmppc_set_dec_expires(vcpu, get_tb() + tb_offset);
2669
2670
kvmppc_set_tb_offset(vcpu, tb_offset);
2671
break;
2672
}
2673
case KVM_REG_PPC_LPCR:
2674
kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
2675
break;
2676
case KVM_REG_PPC_LPCR_64:
2677
kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
2678
break;
2679
case KVM_REG_PPC_PPR:
2680
kvmppc_set_ppr_hv(vcpu, set_reg_val(id, *val));
2681
break;
2682
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2683
case KVM_REG_PPC_TFHAR:
2684
vcpu->arch.tfhar = set_reg_val(id, *val);
2685
break;
2686
case KVM_REG_PPC_TFIAR:
2687
vcpu->arch.tfiar = set_reg_val(id, *val);
2688
break;
2689
case KVM_REG_PPC_TEXASR:
2690
vcpu->arch.texasr = set_reg_val(id, *val);
2691
break;
2692
case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
2693
i = id - KVM_REG_PPC_TM_GPR0;
2694
vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
2695
break;
2696
case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
2697
{
2698
int j;
2699
i = id - KVM_REG_PPC_TM_VSR0;
2700
if (i < 32)
2701
for (j = 0; j < TS_FPRWIDTH; j++)
2702
vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
2703
else
2704
if (cpu_has_feature(CPU_FTR_ALTIVEC))
2705
vcpu->arch.vr_tm.vr[i-32] = val->vval;
2706
else
2707
r = -ENXIO;
2708
break;
2709
}
2710
case KVM_REG_PPC_TM_CR:
2711
vcpu->arch.cr_tm = set_reg_val(id, *val);
2712
break;
2713
case KVM_REG_PPC_TM_XER:
2714
vcpu->arch.xer_tm = set_reg_val(id, *val);
2715
break;
2716
case KVM_REG_PPC_TM_LR:
2717
vcpu->arch.lr_tm = set_reg_val(id, *val);
2718
break;
2719
case KVM_REG_PPC_TM_CTR:
2720
vcpu->arch.ctr_tm = set_reg_val(id, *val);
2721
break;
2722
case KVM_REG_PPC_TM_FPSCR:
2723
vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
2724
break;
2725
case KVM_REG_PPC_TM_AMR:
2726
vcpu->arch.amr_tm = set_reg_val(id, *val);
2727
break;
2728
case KVM_REG_PPC_TM_PPR:
2729
vcpu->arch.ppr_tm = set_reg_val(id, *val);
2730
break;
2731
case KVM_REG_PPC_TM_VRSAVE:
2732
vcpu->arch.vrsave_tm = set_reg_val(id, *val);
2733
break;
2734
case KVM_REG_PPC_TM_VSCR:
2735
if (cpu_has_feature(CPU_FTR_ALTIVEC))
2736
vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
2737
else
2738
r = -ENXIO;
2739
break;
2740
case KVM_REG_PPC_TM_DSCR:
2741
vcpu->arch.dscr_tm = set_reg_val(id, *val);
2742
break;
2743
case KVM_REG_PPC_TM_TAR:
2744
vcpu->arch.tar_tm = set_reg_val(id, *val);
2745
break;
2746
#endif
2747
case KVM_REG_PPC_ARCH_COMPAT:
2748
r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
2749
break;
2750
case KVM_REG_PPC_DEC_EXPIRY:
2751
kvmppc_set_dec_expires(vcpu, set_reg_val(id, *val));
2752
break;
2753
case KVM_REG_PPC_ONLINE:
2754
i = set_reg_val(id, *val);
2755
if (i && !vcpu->arch.online)
2756
atomic_inc(&vcpu->arch.vcore->online_count);
2757
else if (!i && vcpu->arch.online)
2758
atomic_dec(&vcpu->arch.vcore->online_count);
2759
vcpu->arch.online = i;
2760
break;
2761
case KVM_REG_PPC_PTCR:
2762
vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
2763
break;
2764
case KVM_REG_PPC_FSCR:
2765
kvmppc_set_fscr_hv(vcpu, set_reg_val(id, *val));
2766
break;
2767
default:
2768
r = -EINVAL;
2769
break;
2770
}
2771
2772
return r;
2773
}

/*
 * On POWER9, threads are independent and can be in different partitions.
 * Therefore we consider each thread to be a subcore.
 * There is a restriction that all threads have to be in the same
 * MMU mode (radix or HPT), unfortunately, but since we only support
 * HPT guests on a HPT host so far, that isn't an impediment yet.
 */
static int threads_per_vcore(struct kvm *kvm)
{
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		return 1;
	return threads_per_subcore;
}
2788
2789
static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
2790
{
2791
struct kvmppc_vcore *vcore;
2792
2793
vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
2794
2795
if (vcore == NULL)
2796
return NULL;
2797
2798
spin_lock_init(&vcore->lock);
2799
spin_lock_init(&vcore->stoltb_lock);
2800
rcuwait_init(&vcore->wait);
2801
vcore->preempt_tb = TB_NIL;
2802
vcore->lpcr = kvm->arch.lpcr;
2803
vcore->first_vcpuid = id;
2804
vcore->kvm = kvm;
2805
INIT_LIST_HEAD(&vcore->preempt_list);
2806
2807
return vcore;
2808
}
2809
2810
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
2811
static struct debugfs_timings_element {
2812
const char *name;
2813
size_t offset;
2814
} timings[] = {
2815
#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING
2816
{"vcpu_entry", offsetof(struct kvm_vcpu, arch.vcpu_entry)},
2817
{"guest_entry", offsetof(struct kvm_vcpu, arch.guest_entry)},
2818
{"in_guest", offsetof(struct kvm_vcpu, arch.in_guest)},
2819
{"guest_exit", offsetof(struct kvm_vcpu, arch.guest_exit)},
2820
{"vcpu_exit", offsetof(struct kvm_vcpu, arch.vcpu_exit)},
2821
{"hypercall", offsetof(struct kvm_vcpu, arch.hcall)},
2822
{"page_fault", offsetof(struct kvm_vcpu, arch.pg_fault)},
2823
#else
2824
{"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)},
2825
{"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)},
2826
{"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)},
2827
{"guest", offsetof(struct kvm_vcpu, arch.guest_time)},
2828
{"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
2829
#endif
2830
};
2831
2832
#define N_TIMINGS (ARRAY_SIZE(timings))
2833
2834
struct debugfs_timings_state {
2835
struct kvm_vcpu *vcpu;
2836
unsigned int buflen;
2837
char buf[N_TIMINGS * 100];
2838
};
2839
2840
static int debugfs_timings_open(struct inode *inode, struct file *file)
2841
{
2842
struct kvm_vcpu *vcpu = inode->i_private;
2843
struct debugfs_timings_state *p;
2844
2845
p = kzalloc(sizeof(*p), GFP_KERNEL);
2846
if (!p)
2847
return -ENOMEM;
2848
2849
kvm_get_kvm(vcpu->kvm);
2850
p->vcpu = vcpu;
2851
file->private_data = p;
2852
2853
return nonseekable_open(inode, file);
2854
}
2855
2856
static int debugfs_timings_release(struct inode *inode, struct file *file)
2857
{
2858
struct debugfs_timings_state *p = file->private_data;
2859
2860
kvm_put_kvm(p->vcpu->kvm);
2861
kfree(p);
2862
return 0;
2863
}
2864
2865
static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
2866
size_t len, loff_t *ppos)
2867
{
2868
struct debugfs_timings_state *p = file->private_data;
2869
struct kvm_vcpu *vcpu = p->vcpu;
2870
char *s, *buf_end;
2871
struct kvmhv_tb_accumulator tb;
2872
u64 count;
2873
loff_t pos;
2874
ssize_t n;
2875
int i, loops;
2876
bool ok;
2877
2878
if (!p->buflen) {
2879
s = p->buf;
2880
buf_end = s + sizeof(p->buf);
2881
for (i = 0; i < N_TIMINGS; ++i) {
2882
struct kvmhv_tb_accumulator *acc;
2883
2884
acc = (struct kvmhv_tb_accumulator *)
2885
((unsigned long)vcpu + timings[i].offset);
2886
ok = false;
2887
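/*
 * Lockless snapshot of the accumulator: an even seqcount means no update
 * is in flight, so copy it and accept the copy only if seqcount is still
 * unchanged afterwards; otherwise retry.
 */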
for (loops = 0; loops < 1000; ++loops) {
2888
count = acc->seqcount;
2889
if (!(count & 1)) {
2890
smp_rmb();
2891
tb = *acc;
2892
smp_rmb();
2893
if (count == acc->seqcount) {
2894
ok = true;
2895
break;
2896
}
2897
}
2898
udelay(1);
2899
}
2900
if (!ok)
2901
snprintf(s, buf_end - s, "%s: stuck\n",
2902
timings[i].name);
2903
else
2904
snprintf(s, buf_end - s,
2905
"%s: %llu %llu %llu %llu\n",
2906
timings[i].name, count / 2,
2907
tb_to_ns(tb.tb_total),
2908
tb_to_ns(tb.tb_min),
2909
tb_to_ns(tb.tb_max));
2910
s += strlen(s);
2911
}
2912
p->buflen = s - p->buf;
2913
}
2914
2915
pos = *ppos;
2916
if (pos >= p->buflen)
2917
return 0;
2918
if (len > p->buflen - pos)
2919
len = p->buflen - pos;
2920
n = copy_to_user(buf, p->buf + pos, len);
2921
if (n) {
2922
if (n == len)
2923
return -EFAULT;
2924
len -= n;
2925
}
2926
*ppos = pos + len;
2927
return len;
2928
}
2929
2930
static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
2931
size_t len, loff_t *ppos)
2932
{
2933
return -EACCES;
2934
}
2935
2936
static const struct file_operations debugfs_timings_ops = {
2937
.owner = THIS_MODULE,
2938
.open = debugfs_timings_open,
2939
.release = debugfs_timings_release,
2940
.read = debugfs_timings_read,
2941
.write = debugfs_timings_write,
2942
.llseek = generic_file_llseek,
2943
};
2944
2945
/* Create a debugfs directory for the vcpu */
2946
static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
2947
{
2948
if (cpu_has_feature(CPU_FTR_ARCH_300) == IS_ENABLED(CONFIG_KVM_BOOK3S_HV_P9_TIMING))
2949
debugfs_create_file("timings", 0444, debugfs_dentry, vcpu,
2950
&debugfs_timings_ops);
2951
return 0;
2952
}
2953
2954
#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
2955
static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
2956
{
2957
return 0;
2958
}
2959
#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
2960
2961
static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
2962
{
2963
int err;
2964
int core;
2965
struct kvmppc_vcore *vcore;
2966
struct kvm *kvm;
2967
unsigned int id;
2968
2969
kvm = vcpu->kvm;
2970
id = vcpu->vcpu_id;
2971
2972
vcpu->arch.shared = &vcpu->arch.shregs;
2973
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
2974
/*
2975
* The shared struct is never shared on HV,
2976
* so we can always use host endianness
2977
*/
2978
#ifdef __BIG_ENDIAN__
2979
vcpu->arch.shared_big_endian = true;
2980
#else
2981
vcpu->arch.shared_big_endian = false;
2982
#endif
2983
#endif
2984
2985
if (kvmhv_is_nestedv2()) {
2986
err = kvmhv_nestedv2_vcpu_create(vcpu, &vcpu->arch.nestedv2_io);
2987
if (err < 0)
2988
return err;
2989
}
2990
2991
kvmppc_set_mmcr_hv(vcpu, 0, MMCR0_FC);
2992
if (cpu_has_feature(CPU_FTR_ARCH_31)) {
2993
kvmppc_set_mmcr_hv(vcpu, 0, kvmppc_get_mmcr_hv(vcpu, 0) | MMCR0_PMCCEXT);
2994
kvmppc_set_mmcra_hv(vcpu, MMCRA_BHRB_DISABLE);
2995
}
2996
2997
kvmppc_set_ctrl_hv(vcpu, CTRL_RUNLATCH);
2998
/* default to host PVR, since we can't spoof it */
2999
kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
3000
spin_lock_init(&vcpu->arch.vpa_update_lock);
3001
spin_lock_init(&vcpu->arch.tbacct_lock);
3002
vcpu->arch.busy_preempt = TB_NIL;
3003
__kvmppc_set_msr_hv(vcpu, MSR_ME);
3004
vcpu->arch.intr_msr = MSR_SF | MSR_ME;
3005
3006
/*
3007
* Set the default HFSCR for the guest from the host value.
3008
* This value is only used on POWER9 and later.
3009
* On >= POWER9, we want to virtualize the doorbell facility, so we
3010
* don't set the HFSCR_MSGP bit, and that causes those instructions
3011
* to trap and then we emulate them.
3012
*/
3013
kvmppc_set_hfscr_hv(vcpu, HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
3014
HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP);
3015
3016
/* On POWER10 and later, allow prefixed instructions */
3017
if (cpu_has_feature(CPU_FTR_ARCH_31))
3018
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_PREFIX);
3019
3020
if (cpu_has_feature(CPU_FTR_HVMODE)) {
3021
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & mfspr(SPRN_HFSCR));
3022
3023
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
3024
if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3025
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_TM);
3026
#endif
3027
}
3028
if (cpu_has_feature(CPU_FTR_TM_COMP))
3029
vcpu->arch.hfscr |= HFSCR_TM;
3030
3031
vcpu->arch.hfscr_permitted = kvmppc_get_hfscr_hv(vcpu);
3032
3033
/*
3034
* PM, EBB, TM are demand-faulted so start with it clear.
3035
*/
3036
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & ~(HFSCR_PM | HFSCR_EBB | HFSCR_TM));
3037
3038
kvmppc_mmu_book3s_hv_init(vcpu);
3039
3040
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
3041
3042
init_waitqueue_head(&vcpu->arch.cpu_run);
3043
3044
mutex_lock(&kvm->lock);
3045
vcore = NULL;
3046
err = -EINVAL;
3047
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
3048
if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) {
3049
pr_devel("KVM: VCPU ID too high\n");
3050
core = KVM_MAX_VCORES;
3051
} else {
3052
BUG_ON(kvm->arch.smt_mode != 1);
3053
core = kvmppc_pack_vcpu_id(kvm, id);
3054
}
3055
} else {
3056
core = id / kvm->arch.smt_mode;
3057
}
3058
if (core < KVM_MAX_VCORES) {
3059
vcore = kvm->arch.vcores[core];
3060
if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) {
3061
pr_devel("KVM: collision on id %u", id);
3062
vcore = NULL;
3063
} else if (!vcore) {
3064
/*
3065
* Take mmu_setup_lock for mutual exclusion
3066
* with kvmppc_update_lpcr().
3067
*/
3068
err = -ENOMEM;
3069
vcore = kvmppc_vcore_create(kvm,
3070
id & ~(kvm->arch.smt_mode - 1));
3071
mutex_lock(&kvm->arch.mmu_setup_lock);
3072
kvm->arch.vcores[core] = vcore;
3073
kvm->arch.online_vcores++;
3074
mutex_unlock(&kvm->arch.mmu_setup_lock);
3075
}
3076
}
3077
mutex_unlock(&kvm->lock);
3078
3079
if (!vcore)
3080
return err;
3081
3082
spin_lock(&vcore->lock);
3083
++vcore->num_threads;
3084
spin_unlock(&vcore->lock);
3085
vcpu->arch.vcore = vcore;
3086
vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
3087
vcpu->arch.thread_cpu = -1;
3088
vcpu->arch.prev_cpu = -1;
3089
3090
vcpu->arch.cpu_type = KVM_CPU_3S_64;
3091
kvmppc_sanity_check(vcpu);
3092
3093
return 0;
3094
}
3095
3096
static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
3097
unsigned long flags)
3098
{
3099
int err;
3100
int esmt = 0;
3101
3102
if (flags)
3103
return -EINVAL;
3104
if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
3105
return -EINVAL;
3106
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
3107
/*
3108
* On POWER8 (or POWER7), the threading mode is "strict",
3109
* so we pack smt_mode vcpus per vcore.
3110
*/
3111
if (smt_mode > threads_per_subcore)
3112
return -EINVAL;
3113
} else {
3114
/*
3115
* On POWER9, the threading mode is "loose",
3116
* so each vcpu gets its own vcore.
3117
*/
3118
esmt = smt_mode;
3119
smt_mode = 1;
3120
}
3121
mutex_lock(&kvm->lock);
3122
err = -EBUSY;
3123
if (!kvm->arch.online_vcores) {
3124
kvm->arch.smt_mode = smt_mode;
3125
kvm->arch.emul_smt_mode = esmt;
3126
err = 0;
3127
}
3128
mutex_unlock(&kvm->lock);
3129
3130
return err;
3131
}
3132
3133
static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
3134
{
3135
if (vpa->pinned_addr)
3136
kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
3137
vpa->dirty);
3138
}
3139
3140
static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
3141
{
3142
spin_lock(&vcpu->arch.vpa_update_lock);
3143
unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
3144
unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
3145
unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
3146
spin_unlock(&vcpu->arch.vpa_update_lock);
3147
if (kvmhv_is_nestedv2())
3148
kvmhv_nestedv2_vcpu_free(vcpu, &vcpu->arch.nestedv2_io);
3149
}
3150
3151
static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
3152
{
3153
/* Indicate we want to get back into the guest */
3154
return 1;
3155
}

static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
{
	unsigned long dec_nsec, now;

	now = get_tb();
	if (now > kvmppc_dec_expires_host_tb(vcpu)) {
		/* decrementer has already gone negative */
		kvmppc_core_queue_dec(vcpu);
		kvmppc_core_prepare_to_enter(vcpu);
		return;
	}
	dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
	hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
	vcpu->arch.timer_running = 1;
}
3172
3173
extern int __kvmppc_vcore_entry(void);
3174
3175
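/*
 * Account the time this vcpu has been runnable as busy/stolen time and
 * remove it from the vcore's runnable_threads array.
 */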
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
3176
struct kvm_vcpu *vcpu, u64 tb)
3177
{
3178
u64 now;
3179
3180
if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
3181
return;
3182
spin_lock_irq(&vcpu->arch.tbacct_lock);
3183
now = tb;
3184
vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
3185
vcpu->arch.stolen_logged;
3186
vcpu->arch.busy_preempt = now;
3187
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
3188
spin_unlock_irq(&vcpu->arch.tbacct_lock);
3189
--vc->n_runnable;
3190
WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
3191
}
3192
3193
static int kvmppc_grab_hwthread(int cpu)
3194
{
3195
struct paca_struct *tpaca;
3196
long timeout = 10000;
3197
3198
tpaca = paca_ptrs[cpu];
3199
3200
/* Ensure the thread won't go into the kernel if it wakes */
3201
tpaca->kvm_hstate.kvm_vcpu = NULL;
3202
tpaca->kvm_hstate.kvm_vcore = NULL;
3203
tpaca->kvm_hstate.napping = 0;
3204
smp_wmb();
3205
tpaca->kvm_hstate.hwthread_req = 1;
3206
3207
/*
3208
* If the thread is already executing in the kernel (e.g. handling
3209
* a stray interrupt), wait for it to get back to nap mode.
3210
* The smp_mb() is to ensure that our setting of hwthread_req
3211
* is visible before we look at hwthread_state, so if this
3212
* races with the code at system_reset_pSeries and the thread
3213
* misses our setting of hwthread_req, we are sure to see its
3214
* setting of hwthread_state, and vice versa.
3215
*/
3216
smp_mb();
3217
while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
3218
if (--timeout <= 0) {
3219
pr_err("KVM: couldn't grab cpu %d\n", cpu);
3220
return -EBUSY;
3221
}
3222
udelay(1);
3223
}
3224
return 0;
3225
}
3226
3227
static void kvmppc_release_hwthread(int cpu)
3228
{
3229
struct paca_struct *tpaca;
3230
3231
tpaca = paca_ptrs[cpu];
3232
tpaca->kvm_hstate.hwthread_req = 0;
3233
tpaca->kvm_hstate.kvm_vcpu = NULL;
3234
tpaca->kvm_hstate.kvm_vcore = NULL;
3235
tpaca->kvm_hstate.kvm_split_mode = NULL;
3236
}
3237
3238
static DEFINE_PER_CPU(struct kvm *, cpu_in_guest);
3239
3240
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
3241
{
3242
struct kvm_nested_guest *nested = vcpu->arch.nested;
3243
cpumask_t *need_tlb_flush;
3244
int i;
3245
3246
if (nested)
3247
need_tlb_flush = &nested->need_tlb_flush;
3248
else
3249
need_tlb_flush = &kvm->arch.need_tlb_flush;
3250
3251
cpu = cpu_first_tlb_thread_sibling(cpu);
3252
for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
3253
i += cpu_tlb_thread_sibling_step())
3254
cpumask_set_cpu(i, need_tlb_flush);
3255
3256
/*
3257
* Make sure setting of bit in need_tlb_flush precedes testing of
3258
* cpu_in_guest. The matching barrier on the other side is hwsync
3259
* when switching to guest MMU mode, which happens between
3260
* cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
3261
* being tested.
3262
*/
3263
smp_mb();
3264
3265
for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
3266
i += cpu_tlb_thread_sibling_step()) {
3267
struct kvm *running = *per_cpu_ptr(&cpu_in_guest, i);
3268
3269
if (running == kvm)
3270
smp_call_function_single(i, do_nothing, NULL, 1);
3271
}
3272
}
3273
3274
static void do_migrate_away_vcpu(void *arg)
3275
{
3276
struct kvm_vcpu *vcpu = arg;
3277
struct kvm *kvm = vcpu->kvm;
3278
3279
/*
3280
* If the guest has GTSE, it may execute tlbie, so do a eieio; tlbsync;
3281
* ptesync sequence on the old CPU before migrating to a new one, in
3282
* case we interrupted the guest between a tlbie ; eieio ;
3283
* tlbsync; ptesync sequence.
3284
*
3285
* Otherwise, ptesync is sufficient for ordering tlbiel sequences.
3286
*/
3287
if (kvm->arch.lpcr & LPCR_GTSE)
3288
asm volatile("eieio; tlbsync; ptesync");
3289
else
3290
asm volatile("ptesync");
3291
}
3292
3293
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
3294
{
3295
struct kvm_nested_guest *nested = vcpu->arch.nested;
3296
struct kvm *kvm = vcpu->kvm;
3297
int prev_cpu;
3298
3299
if (!cpu_has_feature(CPU_FTR_HVMODE))
3300
return;
3301
3302
if (nested)
3303
prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
3304
else
3305
prev_cpu = vcpu->arch.prev_cpu;
3306
3307
/*
3308
* With radix, the guest can do TLB invalidations itself,
3309
* and it could choose to use the local form (tlbiel) if
3310
* it is invalidating a translation that has only ever been
3311
* used on one vcpu. However, that doesn't mean it has
3312
* only ever been used on one physical cpu, since vcpus
3313
* can move around between pcpus. To cope with this, when
3314
* a vcpu moves from one pcpu to another, we need to tell
3315
* any vcpus running on the same core as this vcpu previously
3316
* ran to flush the TLB.
3317
*/
3318
if (prev_cpu != pcpu) {
3319
if (prev_cpu >= 0) {
3320
if (cpu_first_tlb_thread_sibling(prev_cpu) !=
3321
cpu_first_tlb_thread_sibling(pcpu))
3322
radix_flush_cpu(kvm, prev_cpu, vcpu);
3323
3324
smp_call_function_single(prev_cpu,
3325
do_migrate_away_vcpu, vcpu, 1);
3326
}
3327
if (nested)
3328
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
3329
else
3330
vcpu->arch.prev_cpu = pcpu;
3331
}
3332
}
3333
3334
static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
3335
{
3336
int cpu;
3337
struct paca_struct *tpaca;
3338
3339
cpu = vc->pcpu;
3340
if (vcpu) {
3341
if (vcpu->arch.timer_running) {
3342
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
3343
vcpu->arch.timer_running = 0;
3344
}
3345
cpu += vcpu->arch.ptid;
3346
vcpu->cpu = vc->pcpu;
3347
vcpu->arch.thread_cpu = cpu;
3348
}
3349
tpaca = paca_ptrs[cpu];
3350
tpaca->kvm_hstate.kvm_vcpu = vcpu;
3351
tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
3352
tpaca->kvm_hstate.fake_suspend = 0;
3353
/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
3354
smp_wmb();
3355
tpaca->kvm_hstate.kvm_vcore = vc;
3356
if (cpu != smp_processor_id())
3357
kvmppc_ipi_thread(cpu);
3358
}
3359
3360
static void kvmppc_wait_for_nap(int n_threads)
3361
{
3362
int cpu = smp_processor_id();
3363
int i, loops;
3364
3365
if (n_threads <= 1)
3366
return;
3367
for (loops = 0; loops < 1000000; ++loops) {
3368
/*
3369
* Check if all threads are finished.
3370
* We set the vcore pointer when starting a thread
3371
* and the thread clears it when finished, so we look
3372
* for any threads that still have a non-NULL vcore ptr.
3373
*/
3374
for (i = 1; i < n_threads; ++i)
3375
if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
3376
break;
3377
if (i == n_threads) {
3378
HMT_medium();
3379
return;
3380
}
3381
HMT_low();
3382
}
3383
HMT_medium();
3384
for (i = 1; i < n_threads; ++i)
3385
if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
3386
pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
3387
}
3388
3389
/*
3390
* Check that we are on thread 0 and that any other threads in
3391
* this core are off-line. Then grab the threads so they can't
3392
* enter the kernel.
3393
*/
3394
static int on_primary_thread(void)
3395
{
3396
int cpu = smp_processor_id();
3397
int thr;
3398
3399
/* Are we on a primary subcore? */
3400
if (cpu_thread_in_subcore(cpu))
3401
return 0;
3402
3403
thr = 0;
3404
while (++thr < threads_per_subcore)
3405
if (cpu_online(cpu + thr))
3406
return 0;
3407
3408
/* Grab all hw threads so they can't go into the kernel */
3409
for (thr = 1; thr < threads_per_subcore; ++thr) {
3410
if (kvmppc_grab_hwthread(cpu + thr)) {
3411
/* Couldn't grab one; let the others go */
3412
do {
3413
kvmppc_release_hwthread(cpu + thr);
3414
} while (--thr > 0);
3415
return 0;
3416
}
3417
}
3418
return 1;
3419
}
3420
3421
/*
3422
* A list of virtual cores for each physical CPU.
3423
* These are vcores that could run but their runner VCPU tasks are
3424
* (or may be) preempted.
3425
*/
3426
struct preempted_vcore_list {
3427
struct list_head list;
3428
spinlock_t lock;
3429
};
3430
3431
static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
3432
3433
static void init_vcore_lists(void)
3434
{
3435
int cpu;
3436
3437
for_each_possible_cpu(cpu) {
3438
struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
3439
spin_lock_init(&lp->lock);
3440
INIT_LIST_HEAD(&lp->list);
3441
}
3442
}
3443
3444
static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
3445
{
3446
struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
3447
3448
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
3449
3450
vc->vcore_state = VCORE_PREEMPT;
3451
vc->pcpu = smp_processor_id();
3452
if (vc->num_threads < threads_per_vcore(vc->kvm)) {
3453
spin_lock(&lp->lock);
3454
list_add_tail(&vc->preempt_list, &lp->list);
3455
spin_unlock(&lp->lock);
3456
}
3457
3458
/* Start accumulating stolen time */
3459
kvmppc_core_start_stolen(vc, mftb());
3460
}
3461
3462
static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
3463
{
3464
struct preempted_vcore_list *lp;
3465
3466
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
3467
3468
kvmppc_core_end_stolen(vc, mftb());
3469
if (!list_empty(&vc->preempt_list)) {
3470
lp = &per_cpu(preempted_vcores, vc->pcpu);
3471
spin_lock(&lp->lock);
3472
list_del_init(&vc->preempt_list);
3473
spin_unlock(&lp->lock);
3474
}
3475
vc->vcore_state = VCORE_INACTIVE;
3476
}
3477
3478
/*
3479
* This stores information about the virtual cores currently
3480
* assigned to a physical core.
3481
*/
3482
struct core_info {
3483
int n_subcores;
3484
int max_subcore_threads;
3485
int total_threads;
3486
int subcore_threads[MAX_SUBCORES];
3487
struct kvmppc_vcore *vc[MAX_SUBCORES];
3488
};
3489
3490
/*
3491
* This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
3492
* respectively in 2-way micro-threading (split-core) mode on POWER8.
3493
*/
3494
static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
3495
3496
static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
3497
{
3498
memset(cip, 0, sizeof(*cip));
3499
cip->n_subcores = 1;
3500
cip->max_subcore_threads = vc->num_threads;
3501
cip->total_threads = vc->num_threads;
3502
cip->subcore_threads[0] = vc->num_threads;
3503
cip->vc[0] = vc;
3504
}
3505
3506
static bool subcore_config_ok(int n_subcores, int n_threads)
3507
{
3508
/*
3509
* POWER9 "SMT4" cores are permanently in what is effectively a 4-way
3510
* split-core mode, with one thread per subcore.
3511
*/
3512
if (cpu_has_feature(CPU_FTR_ARCH_300))
3513
return n_subcores <= 4 && n_threads == 1;
3514
3515
/* On POWER8, can only dynamically split if unsplit to begin with */
3516
if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
3517
return false;
3518
if (n_subcores > MAX_SUBCORES)
3519
return false;
3520
if (n_subcores > 1) {
3521
if (!(dynamic_mt_modes & 2))
3522
n_subcores = 4;
3523
if (n_subcores > 2 && !(dynamic_mt_modes & 4))
3524
return false;
3525
}
3526
3527
return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
3528
}
3529
3530
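/* Reset the per-run entry/exit bookkeeping before this vcore enters the guest */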
static void init_vcore_to_run(struct kvmppc_vcore *vc)
3531
{
3532
vc->entry_exit_map = 0;
3533
vc->in_guest = 0;
3534
vc->napping_threads = 0;
3535
vc->conferring_threads = 0;
3536
vc->tb_offset_applied = 0;
3537
}
3538
3539
static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
3540
{
3541
int n_threads = vc->num_threads;
3542
int sub;
3543
3544
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
3545
return false;
3546
3547
/* In one_vm_per_core mode, require all vcores to be from the same vm */
3548
if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
3549
return false;
3550
3551
if (n_threads < cip->max_subcore_threads)
3552
n_threads = cip->max_subcore_threads;
3553
if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
3554
return false;
3555
cip->max_subcore_threads = n_threads;
3556
3557
sub = cip->n_subcores;
3558
++cip->n_subcores;
3559
cip->total_threads += vc->num_threads;
3560
cip->subcore_threads[sub] = vc->num_threads;
3561
cip->vc[sub] = vc;
3562
init_vcore_to_run(vc);
3563
list_del_init(&vc->preempt_list);
3564
3565
return true;
3566
}
3567
3568
/*
3569
* Work out whether it is possible to piggyback the execution of
3570
* vcore *pvc onto the execution of the other vcores described in *cip.
3571
*/
3572
static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
3573
int target_threads)
3574
{
3575
if (cip->total_threads + pvc->num_threads > target_threads)
3576
return false;
3577
3578
return can_dynamic_split(pvc, cip);
3579
}
3580
3581
static void prepare_threads(struct kvmppc_vcore *vc)
3582
{
3583
int i;
3584
struct kvm_vcpu *vcpu;
3585
3586
for_each_runnable_thread(i, vcpu, vc) {
3587
if (signal_pending(vcpu->arch.run_task))
3588
vcpu->arch.ret = -EINTR;
3589
else if (vcpu->arch.vpa.update_pending ||
3590
vcpu->arch.slb_shadow.update_pending ||
3591
vcpu->arch.dtl.update_pending)
3592
vcpu->arch.ret = RESUME_GUEST;
3593
else
3594
continue;
3595
kvmppc_remove_runnable(vc, vcpu, mftb());
3596
wake_up(&vcpu->arch.cpu_run);
3597
}
3598
}
3599
3600
static void collect_piggybacks(struct core_info *cip, int target_threads)
3601
{
3602
struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
3603
struct kvmppc_vcore *pvc, *vcnext;
3604
3605
spin_lock(&lp->lock);
3606
list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
3607
if (!spin_trylock(&pvc->lock))
3608
continue;
3609
prepare_threads(pvc);
3610
if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
3611
list_del_init(&pvc->preempt_list);
3612
if (pvc->runner == NULL) {
3613
pvc->vcore_state = VCORE_INACTIVE;
3614
kvmppc_core_end_stolen(pvc, mftb());
3615
}
3616
spin_unlock(&pvc->lock);
3617
continue;
3618
}
3619
if (!can_piggyback(pvc, cip, target_threads)) {
3620
spin_unlock(&pvc->lock);
3621
continue;
3622
}
3623
kvmppc_core_end_stolen(pvc, mftb());
3624
pvc->vcore_state = VCORE_PIGGYBACK;
3625
if (cip->total_threads >= target_threads)
3626
break;
3627
}
3628
spin_unlock(&lp->lock);
3629
}
3630
3631
static bool recheck_signals_and_mmu(struct core_info *cip)
3632
{
3633
int sub, i;
3634
struct kvm_vcpu *vcpu;
3635
struct kvmppc_vcore *vc;
3636
3637
for (sub = 0; sub < cip->n_subcores; ++sub) {
3638
vc = cip->vc[sub];
3639
if (!vc->kvm->arch.mmu_ready)
3640
return true;
3641
for_each_runnable_thread(i, vcpu, vc)
3642
if (signal_pending(vcpu->arch.run_task))
3643
return true;
3644
}
3645
return false;
3646
}
3647
3648
static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
3649
{
3650
int still_running = 0, i;
3651
u64 now;
3652
long ret;
3653
struct kvm_vcpu *vcpu;
3654
3655
spin_lock(&vc->lock);
3656
now = get_tb();
3657
for_each_runnable_thread(i, vcpu, vc) {
3658
/*
3659
* It's safe to unlock the vcore in the loop here, because
3660
* for_each_runnable_thread() is safe against removal of
3661
* the vcpu, and the vcore state is VCORE_EXITING here,
3662
* so any vcpus becoming runnable will have their arch.trap
3663
* set to zero and can't actually run in the guest.
3664
*/
3665
spin_unlock(&vc->lock);
3666
/* cancel pending dec exception if dec is positive */
3667
if (now < kvmppc_dec_expires_host_tb(vcpu) &&
3668
kvmppc_core_pending_dec(vcpu))
3669
kvmppc_core_dequeue_dec(vcpu);
3670
3671
trace_kvm_guest_exit(vcpu);
3672
3673
ret = RESUME_GUEST;
3674
if (vcpu->arch.trap)
3675
ret = kvmppc_handle_exit_hv(vcpu,
3676
vcpu->arch.run_task);
3677
3678
vcpu->arch.ret = ret;
3679
vcpu->arch.trap = 0;
3680
3681
spin_lock(&vc->lock);
3682
if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
3683
if (vcpu->arch.pending_exceptions)
3684
kvmppc_core_prepare_to_enter(vcpu);
3685
if (vcpu->arch.ceded)
3686
kvmppc_set_timer(vcpu);
3687
else
3688
++still_running;
3689
} else {
3690
kvmppc_remove_runnable(vc, vcpu, mftb());
3691
wake_up(&vcpu->arch.cpu_run);
3692
}
3693
}
3694
if (!is_master) {
3695
if (still_running > 0) {
3696
kvmppc_vcore_preempt(vc);
3697
} else if (vc->runner) {
3698
vc->vcore_state = VCORE_PREEMPT;
3699
kvmppc_core_start_stolen(vc, mftb());
3700
} else {
3701
vc->vcore_state = VCORE_INACTIVE;
3702
}
3703
if (vc->n_runnable > 0 && vc->runner == NULL) {
3704
/* make sure there's a candidate runner awake */
3705
i = -1;
3706
vcpu = next_runnable_thread(vc, &i);
3707
wake_up(&vcpu->arch.cpu_run);
3708
}
3709
}
3710
spin_unlock(&vc->lock);
3711
}
3712
3713
/*
3714
* Clear core from the list of active host cores as we are about to
3715
* enter the guest. Only do this if it is the primary thread of the
3716
* core (not if a subcore) that is entering the guest.
3717
*/
3718
static inline int kvmppc_clear_host_core(unsigned int cpu)
3719
{
3720
int core;
3721
3722
if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
3723
return 0;
3724
/*
3725
* Memory barrier can be omitted here as we will do a smp_wmb()
3726
* later in kvmppc_start_thread and we need ensure that state is
3727
* visible to other CPUs only after we enter guest.
3728
*/
3729
core = cpu >> threads_shift;
3730
kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
3731
return 0;
3732
}
3733
3734
/*
3735
* Advertise this core as an active host core since we exited the guest
3736
* Only need to do this if it is the primary thread of the core that is
3737
* exiting.
3738
*/
3739
static inline int kvmppc_set_host_core(unsigned int cpu)
3740
{
3741
int core;
3742
3743
if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
3744
return 0;
3745
3746
/*
3747
* Memory barrier can be omitted here because we do a spin_unlock
3748
* immediately after this which provides the memory barrier.
3749
*/
3750
core = cpu >> threads_shift;
3751
kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
3752
return 0;
3753
}
3754
3755
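/*
 * Record in the PACA which host interrupt caused the exit from the guest,
 * so that it is replayed once interrupts are re-enabled; a system reset
 * is replayed immediately.
 */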
static void set_irq_happened(int trap)
3756
{
3757
switch (trap) {
3758
case BOOK3S_INTERRUPT_EXTERNAL:
3759
local_paca->irq_happened |= PACA_IRQ_EE;
3760
break;
3761
case BOOK3S_INTERRUPT_H_DOORBELL:
3762
local_paca->irq_happened |= PACA_IRQ_DBELL;
3763
break;
3764
case BOOK3S_INTERRUPT_HMI:
3765
local_paca->irq_happened |= PACA_IRQ_HMI;
3766
break;
3767
case BOOK3S_INTERRUPT_SYSTEM_RESET:
3768
replay_system_reset();
3769
break;
3770
}
3771
}
3772
3773
/*
3774
* Run a set of guest threads on a physical core.
3775
* Called with vc->lock held.
3776
*/
3777
static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3778
{
3779
struct kvm_vcpu *vcpu;
3780
int i;
3781
int srcu_idx;
3782
struct core_info core_info;
3783
struct kvmppc_vcore *pvc;
3784
struct kvm_split_mode split_info, *sip;
3785
int split, subcore_size, active;
3786
int sub;
3787
bool thr0_done;
3788
unsigned long cmd_bit, stat_bit;
3789
int pcpu, thr;
3790
int target_threads;
3791
int controlled_threads;
3792
int trap;
3793
bool is_power8;
3794
3795
if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
3796
return;
3797
3798
/*
3799
* Remove from the list any threads that have a signal pending
3800
* or need a VPA update done
3801
*/
3802
prepare_threads(vc);
3803
3804
/* if the runner is no longer runnable, let the caller pick a new one */
3805
if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
3806
return;
3807
3808
/*
3809
* Initialize *vc.
3810
*/
3811
init_vcore_to_run(vc);
3812
vc->preempt_tb = TB_NIL;
3813
3814
/*
3815
* Number of threads that we will be controlling: the same as
3816
* the number of threads per subcore, except on POWER9,
3817
* where it's 1 because the threads are (mostly) independent.
3818
*/
3819
controlled_threads = threads_per_vcore(vc->kvm);
3820
3821
/*
3822
* Make sure we are running on primary threads, and that secondary
3823
* threads are offline. Also check if the number of threads in this
3824
* guest is greater than the current system threads per guest.
3825
*/
3826
if ((controlled_threads > 1) &&
3827
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
3828
for_each_runnable_thread(i, vcpu, vc) {
3829
vcpu->arch.ret = -EBUSY;
3830
kvmppc_remove_runnable(vc, vcpu, mftb());
3831
wake_up(&vcpu->arch.cpu_run);
3832
}
3833
goto out;
3834
}
3835
3836
/*
3837
* See if we could run any other vcores on the physical core
3838
* along with this one.
3839
*/
3840
init_core_info(&core_info, vc);
3841
pcpu = smp_processor_id();
3842
target_threads = controlled_threads;
3843
if (target_smt_mode && target_smt_mode < target_threads)
3844
target_threads = target_smt_mode;
3845
if (vc->num_threads < target_threads)
3846
collect_piggybacks(&core_info, target_threads);
3847
3848
/*
3849
* Hard-disable interrupts, and check resched flag and signals.
3850
* If we need to reschedule or deliver a signal, clean up
3851
* and return without going into the guest(s).
3852
* If the mmu_ready flag has been cleared, don't go into the
3853
* guest because that means a HPT resize operation is in progress.
3854
*/
3855
local_irq_disable();
3856
hard_irq_disable();
3857
if (lazy_irq_pending() || need_resched() ||
3858
recheck_signals_and_mmu(&core_info)) {
3859
local_irq_enable();
3860
vc->vcore_state = VCORE_INACTIVE;
3861
/* Unlock all except the primary vcore */
3862
for (sub = 1; sub < core_info.n_subcores; ++sub) {
3863
pvc = core_info.vc[sub];
3864
/* Put back on to the preempted vcores list */
3865
kvmppc_vcore_preempt(pvc);
3866
spin_unlock(&pvc->lock);
3867
}
3868
for (i = 0; i < controlled_threads; ++i)
3869
kvmppc_release_hwthread(pcpu + i);
3870
return;
3871
}
3872
3873
kvmppc_clear_host_core(pcpu);
3874
3875
/* Decide on micro-threading (split-core) mode */
3876
subcore_size = threads_per_subcore;
3877
cmd_bit = stat_bit = 0;
3878
split = core_info.n_subcores;
3879
sip = NULL;
3880
is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);
3881
3882
if (split > 1) {
3883
sip = &split_info;
3884
memset(&split_info, 0, sizeof(split_info));
3885
for (sub = 0; sub < core_info.n_subcores; ++sub)
3886
split_info.vc[sub] = core_info.vc[sub];
3887
3888
if (is_power8) {
3889
if (split == 2 && (dynamic_mt_modes & 2)) {
3890
cmd_bit = HID0_POWER8_1TO2LPAR;
3891
stat_bit = HID0_POWER8_2LPARMODE;
3892
} else {
3893
split = 4;
3894
cmd_bit = HID0_POWER8_1TO4LPAR;
3895
stat_bit = HID0_POWER8_4LPARMODE;
3896
}
3897
subcore_size = MAX_SMT_THREADS / split;
3898
split_info.rpr = mfspr(SPRN_RPR);
3899
split_info.pmmar = mfspr(SPRN_PMMAR);
3900
split_info.ldbar = mfspr(SPRN_LDBAR);
3901
split_info.subcore_size = subcore_size;
3902
} else {
3903
split_info.subcore_size = 1;
3904
}
3905
3906
/* order writes to split_info before kvm_split_mode pointer */
3907
smp_wmb();
3908
}
3909
3910
for (thr = 0; thr < controlled_threads; ++thr) {
3911
struct paca_struct *paca = paca_ptrs[pcpu + thr];
3912
3913
paca->kvm_hstate.napping = 0;
3914
paca->kvm_hstate.kvm_split_mode = sip;
3915
}
3916
3917
/* Initiate micro-threading (split-core) on POWER8 if required */
3918
if (cmd_bit) {
3919
unsigned long hid0 = mfspr(SPRN_HID0);
3920
3921
hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
3922
mb();
3923
mtspr(SPRN_HID0, hid0);
3924
isync();
3925
for (;;) {
3926
hid0 = mfspr(SPRN_HID0);
3927
if (hid0 & stat_bit)
3928
break;
3929
cpu_relax();
3930
}
3931
}
3932
3933
/*
3934
* On POWER8, set RWMR register.
3935
* Since it only affects PURR and SPURR, it doesn't affect
3936
* the host, so we don't save/restore the host value.
3937
*/
3938
if (is_power8) {
3939
unsigned long rwmr_val = RWMR_RPA_P8_8THREAD;
3940
int n_online = atomic_read(&vc->online_count);
3941
3942
/*
3943
* Use the 8-thread value if we're doing split-core
3944
* or if the vcore's online count looks bogus.
3945
*/
3946
if (split == 1 && threads_per_subcore == MAX_SMT_THREADS &&
3947
n_online >= 1 && n_online <= MAX_SMT_THREADS)
3948
rwmr_val = p8_rwmr_values[n_online];
3949
mtspr(SPRN_RWMR, rwmr_val);
3950
}
3951
3952
/* Start all the threads */
3953
active = 0;
3954
for (sub = 0; sub < core_info.n_subcores; ++sub) {
3955
thr = is_power8 ? subcore_thread_map[sub] : sub;
3956
thr0_done = false;
3957
active |= 1 << thr;
3958
pvc = core_info.vc[sub];
3959
pvc->pcpu = pcpu + thr;
3960
for_each_runnable_thread(i, vcpu, pvc) {
3961
/*
3962
* XXX: is kvmppc_start_thread called too late here?
3963
* It updates vcpu->cpu and vcpu->arch.thread_cpu
3964
* which are used by kvmppc_fast_vcpu_kick_hv(), but
3965
* kick is called after new exceptions become available
3966
* and exceptions are checked earlier than here, by
3967
* kvmppc_core_prepare_to_enter.
3968
*/
3969
kvmppc_start_thread(vcpu, pvc);
3970
kvmppc_update_vpa_dispatch(vcpu, pvc);
3971
trace_kvm_guest_enter(vcpu);
3972
if (!vcpu->arch.ptid)
3973
thr0_done = true;
3974
active |= 1 << (thr + vcpu->arch.ptid);
3975
}
3976
/*
3977
* We need to start the first thread of each subcore
3978
* even if it doesn't have a vcpu.
3979
*/
3980
if (!thr0_done)
3981
kvmppc_start_thread(NULL, pvc);
3982
}
3983
3984
/*
3985
* Ensure that split_info.do_nap is set after setting
3986
* the vcore pointer in the PACA of the secondaries.
3987
*/
3988
smp_mb();
3989
3990
/*
3991
* When doing micro-threading, poke the inactive threads as well.
3992
* This gets them to the nap instruction after kvm_do_nap,
3993
* which reduces the time taken to unsplit later.
3994
*/
3995
if (cmd_bit) {
3996
split_info.do_nap = 1; /* ask secondaries to nap when done */
3997
for (thr = 1; thr < threads_per_subcore; ++thr)
3998
if (!(active & (1 << thr)))
3999
kvmppc_ipi_thread(pcpu + thr);
4000
}
4001
4002
vc->vcore_state = VCORE_RUNNING;
4003
preempt_disable();
4004
4005
trace_kvmppc_run_core(vc, 0);
4006
4007
for (sub = 0; sub < core_info.n_subcores; ++sub)
4008
spin_unlock(&core_info.vc[sub]->lock);
4009
4010
guest_timing_enter_irqoff();
4011
4012
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
4013
4014
guest_state_enter_irqoff();
4015
this_cpu_disable_ftrace();
4016
4017
trap = __kvmppc_vcore_entry();
4018
4019
this_cpu_enable_ftrace();
4020
guest_state_exit_irqoff();
4021
4022
srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
4023
4024
set_irq_happened(trap);
4025
4026
spin_lock(&vc->lock);
4027
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
4028
vc->vcore_state = VCORE_EXITING;
4029
4030
/* wait for secondary threads to finish writing their state to memory */
4031
kvmppc_wait_for_nap(controlled_threads);
4032
4033
/* Return to whole-core mode if we split the core earlier */
4034
if (cmd_bit) {
4035
unsigned long hid0 = mfspr(SPRN_HID0);
4036
4037
hid0 &= ~HID0_POWER8_DYNLPARDIS;
4038
stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
4039
mb();
4040
mtspr(SPRN_HID0, hid0);
4041
isync();
4042
for (;;) {
4043
hid0 = mfspr(SPRN_HID0);
4044
if (!(hid0 & stat_bit))
4045
break;
4046
cpu_relax();
4047
}
4048
split_info.do_nap = 0;
4049
}
4050
4051
kvmppc_set_host_core(pcpu);
4052
4053
if (!vtime_accounting_enabled_this_cpu()) {
4054
local_irq_enable();
4055
/*
4056
* Service IRQs here before guest_timing_exit_irqoff() so any
4057
* ticks that occurred while running the guest are accounted to
4058
* the guest. If vtime accounting is enabled, accounting uses
4059
* TB rather than ticks, so it can be done without enabling
4060
* interrupts here, which has the problem that it accounts
4061
* interrupt processing overhead to the host.
4062
*/
4063
local_irq_disable();
4064
}
4065
guest_timing_exit_irqoff();
4066
4067
local_irq_enable();
4068
4069
/* Let secondaries go back to the offline loop */
4070
for (i = 0; i < controlled_threads; ++i) {
4071
kvmppc_release_hwthread(pcpu + i);
4072
if (sip && sip->napped[i])
4073
kvmppc_ipi_thread(pcpu + i);
4074
}
4075
4076
spin_unlock(&vc->lock);
4077
4078
/* make sure updates to secondary vcpu structs are visible now */
4079
smp_mb();
4080
4081
preempt_enable();
4082
4083
for (sub = 0; sub < core_info.n_subcores; ++sub) {
4084
pvc = core_info.vc[sub];
4085
post_guest_process(pvc, pvc == vc);
4086
}
4087
4088
spin_lock(&vc->lock);
4089
4090
out:
4091
vc->vcore_state = VCORE_INACTIVE;
4092
trace_kvmppc_run_core(vc, 1);
4093
}
4094
4095
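/* Hypercalls that are handled by the XICS interrupt controller emulation. */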
static inline bool hcall_is_xics(unsigned long req)
4096
{
4097
return req == H_EOI || req == H_CPPR || req == H_IPI ||
4098
req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
4099
}
4100
4101
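/*
 * Bump the yield count in the vcpu's VPA (if one is registered) and mark
 * the VPA dirty; called before entering and after leaving the guest.
 */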
static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu)
4102
{
4103
struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
4104
if (lp) {
4105
u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
4106
lp->yield_count = cpu_to_be32(yield_count);
4107
vcpu->arch.vpa.dirty = 1;
4108
}
4109
}
4110
4111
/* Helper functions for reading L2's stats from L1's VPA */
4112
#ifdef CONFIG_PPC_PSERIES
4113
static DEFINE_PER_CPU(u64, l1_to_l2_cs);
4114
static DEFINE_PER_CPU(u64, l2_to_l1_cs);
4115
static DEFINE_PER_CPU(u64, l2_runtime_agg);
4116
4117
int kvmhv_get_l2_counters_status(void)
4118
{
4119
return firmware_has_feature(FW_FEATURE_LPAR) &&
4120
get_lppaca()->l2_counters_enable;
4121
}
4122
4123
void kvmhv_set_l2_counters_status(int cpu, bool status)
4124
{
4125
if (!firmware_has_feature(FW_FEATURE_LPAR))
4126
return;
4127
if (status)
4128
lppaca_of(cpu).l2_counters_enable = 1;
4129
else
4130
lppaca_of(cpu).l2_counters_enable = 0;
4131
}
4132
EXPORT_SYMBOL(kvmhv_set_l2_counters_status);
4133
4134
int kvmhv_counters_tracepoint_regfunc(void)
4135
{
4136
int cpu;
4137
4138
for_each_present_cpu(cpu) {
4139
kvmhv_set_l2_counters_status(cpu, true);
4140
}
4141
return 0;
4142
}
4143
4144
void kvmhv_counters_tracepoint_unregfunc(void)
4145
{
4146
int cpu;
4147
4148
for_each_present_cpu(cpu) {
4149
kvmhv_set_l2_counters_status(cpu, false);
4150
}
4151
}
4152
4153
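/*
 * Emit the vcpu-stats tracepoint with the L1<->L2 context-switch times and
 * L2 runtime accumulated since the last call on this CPU, then remember
 * the new totals in the per-CPU counters and in the vcpu.
 */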
static void do_trace_nested_cs_time(struct kvm_vcpu *vcpu)
4154
{
4155
struct lppaca *lp = get_lppaca();
4156
u64 l1_to_l2_ns, l2_to_l1_ns, l2_runtime_ns;
4157
u64 *l1_to_l2_cs_ptr = this_cpu_ptr(&l1_to_l2_cs);
4158
u64 *l2_to_l1_cs_ptr = this_cpu_ptr(&l2_to_l1_cs);
4159
u64 *l2_runtime_agg_ptr = this_cpu_ptr(&l2_runtime_agg);
4160
4161
l1_to_l2_ns = tb_to_ns(be64_to_cpu(lp->l1_to_l2_cs_tb));
4162
l2_to_l1_ns = tb_to_ns(be64_to_cpu(lp->l2_to_l1_cs_tb));
4163
l2_runtime_ns = tb_to_ns(be64_to_cpu(lp->l2_runtime_tb));
4164
trace_kvmppc_vcpu_stats(vcpu, l1_to_l2_ns - *l1_to_l2_cs_ptr,
4165
l2_to_l1_ns - *l2_to_l1_cs_ptr,
4166
l2_runtime_ns - *l2_runtime_agg_ptr);
4167
*l1_to_l2_cs_ptr = l1_to_l2_ns;
4168
*l2_to_l1_cs_ptr = l2_to_l1_ns;
4169
*l2_runtime_agg_ptr = l2_runtime_ns;
4170
vcpu->arch.l1_to_l2_cs = l1_to_l2_ns;
4171
vcpu->arch.l2_to_l1_cs = l2_to_l1_ns;
4172
vcpu->arch.l2_runtime_agg = l2_runtime_ns;
4173
}
4174
4175
u64 kvmhv_get_l1_to_l2_cs_time(void)
4176
{
4177
return tb_to_ns(be64_to_cpu(get_lppaca()->l1_to_l2_cs_tb));
4178
}
4179
EXPORT_SYMBOL(kvmhv_get_l1_to_l2_cs_time);
4180
4181
u64 kvmhv_get_l2_to_l1_cs_time(void)
4182
{
4183
return tb_to_ns(be64_to_cpu(get_lppaca()->l2_to_l1_cs_tb));
4184
}
4185
EXPORT_SYMBOL(kvmhv_get_l2_to_l1_cs_time);
4186
4187
u64 kvmhv_get_l2_runtime_agg(void)
4188
{
4189
return tb_to_ns(be64_to_cpu(get_lppaca()->l2_runtime_tb));
4190
}
4191
EXPORT_SYMBOL(kvmhv_get_l2_runtime_agg);
4192
4193
u64 kvmhv_get_l1_to_l2_cs_time_vcpu(void)
4194
{
4195
struct kvm_vcpu *vcpu;
4196
struct kvm_vcpu_arch *arch;
4197
4198
vcpu = local_paca->kvm_hstate.kvm_vcpu;
4199
if (vcpu) {
4200
arch = &vcpu->arch;
4201
return arch->l1_to_l2_cs;
4202
} else {
4203
return 0;
4204
}
4205
}
4206
EXPORT_SYMBOL(kvmhv_get_l1_to_l2_cs_time_vcpu);
4207
4208
u64 kvmhv_get_l2_to_l1_cs_time_vcpu(void)
4209
{
4210
struct kvm_vcpu *vcpu;
4211
struct kvm_vcpu_arch *arch;
4212
4213
vcpu = local_paca->kvm_hstate.kvm_vcpu;
4214
if (vcpu) {
4215
arch = &vcpu->arch;
4216
return arch->l2_to_l1_cs;
4217
} else {
4218
return 0;
4219
}
4220
}
4221
EXPORT_SYMBOL(kvmhv_get_l2_to_l1_cs_time_vcpu);
4222
4223
u64 kvmhv_get_l2_runtime_agg_vcpu(void)
4224
{
4225
struct kvm_vcpu *vcpu;
4226
struct kvm_vcpu_arch *arch;
4227
4228
vcpu = local_paca->kvm_hstate.kvm_vcpu;
4229
if (vcpu) {
4230
arch = &vcpu->arch;
4231
return arch->l2_runtime_agg;
4232
} else {
4233
return 0;
4234
}
4235
}
4236
EXPORT_SYMBOL(kvmhv_get_l2_runtime_agg_vcpu);
4237
4238
#else
4239
int kvmhv_get_l2_counters_status(void)
4240
{
4241
return 0;
4242
}
4243
4244
static void do_trace_nested_cs_time(struct kvm_vcpu *vcpu)
4245
{
4246
}
4247
#endif
4248
4249
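/*
 * Enter an L2 guest via the nested-v2 (guest state buffer) interface:
 * flush any dirty vcpu state to the L0 hypervisor, issue the guest
 * run-vcpu hcall, then parse the output buffer and rearm the host
 * decrementer.  Returns the trap that caused the exit, 0 if the guest
 * was not entered, or -EINVAL on hcall or buffer errors.
 */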
static int kvmhv_vcpu_entry_nestedv2(struct kvm_vcpu *vcpu, u64 time_limit,
4250
unsigned long lpcr, u64 *tb)
4251
{
4252
struct kvmhv_nestedv2_io *io;
4253
unsigned long msr, i;
4254
int trap;
4255
long rc;
4256
4257
if (vcpu->arch.doorbell_request) {
4258
vcpu->arch.doorbell_request = 0;
4259
kvmppc_set_dpdes(vcpu, 1);
4260
}
4261
4262
io = &vcpu->arch.nestedv2_io;
4263
4264
msr = mfmsr();
4265
kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
4266
if (lazy_irq_pending())
4267
return 0;
4268
4269
rc = kvmhv_nestedv2_flush_vcpu(vcpu, time_limit);
4270
if (rc < 0)
4271
return -EINVAL;
4272
4273
kvmppc_gse_put_u64(io->vcpu_run_input, KVMPPC_GSID_LPCR, lpcr);
4274
4275
accumulate_time(vcpu, &vcpu->arch.in_guest);
4276
rc = plpar_guest_run_vcpu(0, vcpu->kvm->arch.lpid, vcpu->vcpu_id,
4277
&trap, &i);
4278
4279
if (rc != H_SUCCESS) {
4280
pr_err("KVM Guest Run VCPU hcall failed\n");
4281
if (rc == H_INVALID_ELEMENT_ID)
4282
pr_err("KVM: Guest Run VCPU invalid element id at %ld\n", i);
4283
else if (rc == H_INVALID_ELEMENT_SIZE)
4284
pr_err("KVM: Guest Run VCPU invalid element size at %ld\n", i);
4285
else if (rc == H_INVALID_ELEMENT_VALUE)
4286
pr_err("KVM: Guest Run VCPU invalid element value at %ld\n", i);
4287
return -EINVAL;
4288
}
4289
accumulate_time(vcpu, &vcpu->arch.guest_exit);
4290
4291
*tb = mftb();
4292
kvmppc_gsm_reset(io->vcpu_message);
4293
kvmppc_gsm_reset(io->vcore_message);
4294
kvmppc_gsbm_zero(&io->valids);
4295
4296
rc = kvmhv_nestedv2_parse_output(vcpu);
4297
if (rc < 0)
4298
return -EINVAL;
4299
4300
timer_rearm_host_dec(*tb);
4301
4302
/* Record context switch and guest_run_time data */
4303
if (kvmhv_get_l2_counters_status())
4304
do_trace_nested_cs_time(vcpu);
4305
4306
return trap;
4307
}
4308
4309
/* call our hypervisor to load up HV regs and go */
4310
static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb)
4311
{
4312
unsigned long host_psscr;
4313
unsigned long msr;
4314
struct hv_guest_state hvregs;
4315
struct p9_host_os_sprs host_os_sprs;
4316
s64 dec;
4317
int trap;
4318
4319
msr = mfmsr();
4320
4321
save_p9_host_os_sprs(&host_os_sprs);
4322
4323
/*
4324
* We need to save and restore the guest visible part of the
4325
* psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
4326
* doesn't do this for us. Note this is only required on pseries since
4327
* this is done in kvmhv_vcpu_entry_p9() below otherwise.
4328
*/
4329
host_psscr = mfspr(SPRN_PSSCR_PR);
4330
4331
kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
4332
if (lazy_irq_pending())
4333
return 0;
4334
4335
if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
4336
msr = mfmsr(); /* TM restore can update msr */
4337
4338
if (vcpu->arch.psscr != host_psscr)
4339
mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
4340
4341
kvmhv_save_hv_regs(vcpu, &hvregs);
4342
hvregs.lpcr = lpcr;
4343
hvregs.amor = ~0;
4344
vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
4345
hvregs.version = HV_GUEST_STATE_VERSION;
4346
if (vcpu->arch.nested) {
4347
hvregs.lpid = vcpu->arch.nested->shadow_lpid;
4348
hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
4349
} else {
4350
hvregs.lpid = vcpu->kvm->arch.lpid;
4351
hvregs.vcpu_token = vcpu->vcpu_id;
4352
}
4353
hvregs.hdec_expiry = time_limit;
4354
4355
/*
4356
* hvregs has the doorbell status, so zero it here, which
4357
* enables us to receive doorbells when H_ENTER_NESTED is
4358
* in progress for this vCPU
4359
*/
4360
4361
if (vcpu->arch.doorbell_request)
4362
vcpu->arch.doorbell_request = 0;
4363
4364
/*
4365
* When setting DEC, we must always deal with irq_work_raise
4366
* via NMI vs setting DEC. The problem occurs right as we
4367
* switch into guest mode if a NMI hits and sets pending work
4368
* and sets DEC, then that will apply to the guest and not
4369
* bring us back to the host.
4370
*
4371
* irq_work_raise could check a flag (or possibly LPCR[HDICE]
4372
* for example) and set HDEC to 1? That wouldn't solve the
4373
* nested hv case which needs to abort the hcall or zero the
4374
* time limit.
4375
*
4376
* XXX: Another day's problem.
4377
*/
4378
mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb);
4379
4380
mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
4381
mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
4382
switch_pmu_to_guest(vcpu, &host_os_sprs);
4383
accumulate_time(vcpu, &vcpu->arch.in_guest);
4384
trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
4385
__pa(&vcpu->arch.regs));
4386
accumulate_time(vcpu, &vcpu->arch.guest_exit);
4387
kvmhv_restore_hv_return_state(vcpu, &hvregs);
4388
switch_pmu_to_host(vcpu, &host_os_sprs);
4389
vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
4390
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
4391
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
4392
vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
4393
4394
store_vcpu_state(vcpu);
4395
4396
dec = mfspr(SPRN_DEC);
4397
if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
4398
dec = (s32) dec;
4399
*tb = mftb();
4400
vcpu->arch.dec_expires = dec + (*tb + kvmppc_get_tb_offset(vcpu));
4401
4402
timer_rearm_host_dec(*tb);
4403
4404
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
4405
if (vcpu->arch.psscr != host_psscr)
4406
mtspr(SPRN_PSSCR_PR, host_psscr);
4407
4408
return trap;
4409
}
4410
4411
/*
4412
* Guest entry for POWER9 and later CPUs.
4413
*/
4414
static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
4415
unsigned long lpcr, u64 *tb)
4416
{
4417
struct kvm *kvm = vcpu->kvm;
4418
struct kvm_nested_guest *nested = vcpu->arch.nested;
4419
u64 next_timer;
4420
int trap;
4421
4422
next_timer = timer_get_next_tb();
4423
if (*tb >= next_timer)
4424
return BOOK3S_INTERRUPT_HV_DECREMENTER;
4425
if (next_timer < time_limit)
4426
time_limit = next_timer;
4427
else if (*tb >= time_limit) /* nested time limit */
4428
return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
4429
4430
vcpu->arch.ceded = 0;
4431
4432
vcpu_vpa_increment_dispatch(vcpu);
4433
4434
if (kvmhv_on_pseries()) {
4435
if (kvmhv_is_nestedv1())
4436
trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
4437
else
4438
trap = kvmhv_vcpu_entry_nestedv2(vcpu, time_limit, lpcr, tb);
4439
4440
/* H_CEDE has to be handled now, not later */
4441
if (trap == BOOK3S_INTERRUPT_SYSCALL && !nested &&
4442
kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
4443
kvmppc_cede(vcpu);
4444
kvmppc_set_gpr(vcpu, 3, 0);
4445
trap = 0;
4446
}
4447
4448
} else if (nested) {
4449
__this_cpu_write(cpu_in_guest, kvm);
4450
trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
4451
__this_cpu_write(cpu_in_guest, NULL);
4452
4453
} else {
4454
kvmppc_xive_push_vcpu(vcpu);
4455
4456
__this_cpu_write(cpu_in_guest, kvm);
4457
trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
4458
__this_cpu_write(cpu_in_guest, NULL);
4459
4460
if (trap == BOOK3S_INTERRUPT_SYSCALL &&
4461
!(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
4462
unsigned long req = kvmppc_get_gpr(vcpu, 3);
4463
4464
/*
4465
* XIVE rearm and XICS hcalls must be handled
4466
* before xive context is pulled (is this
4467
* true?)
4468
*/
4469
if (req == H_CEDE) {
4470
/* H_CEDE has to be handled now */
4471
kvmppc_cede(vcpu);
4472
if (!kvmppc_xive_rearm_escalation(vcpu)) {
4473
/*
4474
* Pending escalation so abort
4475
* the cede.
4476
*/
4477
vcpu->arch.ceded = 0;
4478
}
4479
kvmppc_set_gpr(vcpu, 3, 0);
4480
trap = 0;
4481
4482
} else if (req == H_ENTER_NESTED) {
4483
/*
4484
* L2 should not run with the L1
4485
* context so rearm and pull it.
4486
*/
4487
if (!kvmppc_xive_rearm_escalation(vcpu)) {
4488
/*
4489
* Pending escalation so abort
4490
* H_ENTER_NESTED.
4491
*/
4492
kvmppc_set_gpr(vcpu, 3, 0);
4493
trap = 0;
4494
}
4495
4496
} else if (hcall_is_xics(req)) {
4497
int ret;
4498
4499
ret = kvmppc_xive_xics_hcall(vcpu, req);
4500
if (ret != H_TOO_HARD) {
4501
kvmppc_set_gpr(vcpu, 3, ret);
4502
trap = 0;
4503
}
4504
}
4505
}
4506
kvmppc_xive_pull_vcpu(vcpu);
4507
4508
if (kvm_is_radix(kvm))
4509
vcpu->arch.slb_max = 0;
4510
}
4511
4512
vcpu_vpa_increment_dispatch(vcpu);
4513
4514
return trap;
4515
}
4516
4517
/*
4518
* Wait for some other vcpu thread to execute us, and
4519
* wake us up when we need to handle something in the host.
4520
*/
4521
static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
4522
struct kvm_vcpu *vcpu, int wait_state)
4523
{
4524
DEFINE_WAIT(wait);
4525
4526
prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
4527
if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
4528
spin_unlock(&vc->lock);
4529
schedule();
4530
spin_lock(&vc->lock);
4531
}
4532
finish_wait(&vcpu->arch.cpu_run, &wait);
4533
}
4534
4535
static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
4536
{
4537
if (!halt_poll_ns_grow)
4538
return;
4539
4540
vc->halt_poll_ns *= halt_poll_ns_grow;
4541
if (vc->halt_poll_ns < halt_poll_ns_grow_start)
4542
vc->halt_poll_ns = halt_poll_ns_grow_start;
4543
}
4544
4545
static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
4546
{
4547
if (halt_poll_ns_shrink == 0)
4548
vc->halt_poll_ns = 0;
4549
else
4550
vc->halt_poll_ns /= halt_poll_ns_shrink;
4551
}
4552
4553
#ifdef CONFIG_KVM_XICS
4554
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
4555
{
4556
if (!xics_on_xive())
4557
return false;
4558
return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
4559
vcpu->arch.xive_saved_state.cppr;
4560
}
4561
#else
4562
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
4563
{
4564
return false;
4565
}
4566
#endif /* CONFIG_KVM_XICS */
4567
4568
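/*
 * Does this vcpu have a reason to wake up: a pending exception, a prod,
 * a pending doorbell, or a pending XIVE interrupt?
 */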
static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
4569
{
4570
if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
4571
kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
4572
return true;
4573
4574
return false;
4575
}
4576
4577
static bool kvmppc_vcpu_check_block(struct kvm_vcpu *vcpu)
4578
{
4579
if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
4580
return true;
4581
return false;
4582
}
4583
4584
/*
4585
* Check to see if any of the runnable vcpus on the vcore have pending
4586
* exceptions or are no longer ceded
4587
*/
4588
static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
4589
{
4590
struct kvm_vcpu *vcpu;
4591
int i;
4592
4593
for_each_runnable_thread(i, vcpu, vc) {
4594
if (kvmppc_vcpu_check_block(vcpu))
4595
return 1;
4596
}
4597
4598
return 0;
4599
}
4600
4601
/*
4602
* All the vcpus in this vcore are idle, so wait for a decrementer
4603
* or external interrupt to one of the vcpus. vc->lock is held.
4604
*/
4605
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
4606
{
4607
ktime_t cur, start_poll, start_wait;
4608
int do_sleep = 1;
4609
u64 block_ns;
4610
4611
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
4612
4613
/* Poll for pending exceptions and ceded state */
4614
cur = start_poll = ktime_get();
4615
if (vc->halt_poll_ns) {
4616
ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
4617
++vc->runner->stat.generic.halt_attempted_poll;
4618
4619
vc->vcore_state = VCORE_POLLING;
4620
spin_unlock(&vc->lock);
4621
4622
do {
4623
if (kvmppc_vcore_check_block(vc)) {
4624
do_sleep = 0;
4625
break;
4626
}
4627
cur = ktime_get();
4628
} while (kvm_vcpu_can_poll(cur, stop));
4629
4630
spin_lock(&vc->lock);
4631
vc->vcore_state = VCORE_INACTIVE;
4632
4633
if (!do_sleep) {
4634
++vc->runner->stat.generic.halt_successful_poll;
4635
goto out;
4636
}
4637
}
4638
4639
prepare_to_rcuwait(&vc->wait);
4640
set_current_state(TASK_INTERRUPTIBLE);
4641
if (kvmppc_vcore_check_block(vc)) {
4642
finish_rcuwait(&vc->wait);
4643
do_sleep = 0;
4644
/* If we polled, count this as a successful poll */
4645
if (vc->halt_poll_ns)
4646
++vc->runner->stat.generic.halt_successful_poll;
4647
goto out;
4648
}
4649
4650
start_wait = ktime_get();
4651
4652
vc->vcore_state = VCORE_SLEEPING;
4653
trace_kvmppc_vcore_blocked(vc->runner, 0);
4654
spin_unlock(&vc->lock);
4655
schedule();
4656
finish_rcuwait(&vc->wait);
4657
spin_lock(&vc->lock);
4658
vc->vcore_state = VCORE_INACTIVE;
4659
trace_kvmppc_vcore_blocked(vc->runner, 1);
4660
++vc->runner->stat.halt_successful_wait;
4661
4662
cur = ktime_get();
4663
4664
out:
4665
block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
4666
4667
/* Attribute wait time */
4668
if (do_sleep) {
4669
vc->runner->stat.generic.halt_wait_ns +=
4670
ktime_to_ns(cur) - ktime_to_ns(start_wait);
4671
KVM_STATS_LOG_HIST_UPDATE(
4672
vc->runner->stat.generic.halt_wait_hist,
4673
ktime_to_ns(cur) - ktime_to_ns(start_wait));
4674
/* Attribute failed poll time */
4675
if (vc->halt_poll_ns) {
4676
vc->runner->stat.generic.halt_poll_fail_ns +=
4677
ktime_to_ns(start_wait) -
4678
ktime_to_ns(start_poll);
4679
KVM_STATS_LOG_HIST_UPDATE(
4680
vc->runner->stat.generic.halt_poll_fail_hist,
4681
ktime_to_ns(start_wait) -
4682
ktime_to_ns(start_poll));
4683
}
4684
} else {
4685
/* Attribute successful poll time */
4686
if (vc->halt_poll_ns) {
4687
vc->runner->stat.generic.halt_poll_success_ns +=
4688
ktime_to_ns(cur) -
4689
ktime_to_ns(start_poll);
4690
KVM_STATS_LOG_HIST_UPDATE(
4691
vc->runner->stat.generic.halt_poll_success_hist,
4692
ktime_to_ns(cur) - ktime_to_ns(start_poll));
4693
}
4694
}
4695
4696
/* Adjust poll time */
4697
if (halt_poll_ns) {
4698
if (block_ns <= vc->halt_poll_ns)
4699
;
4700
/* We slept and blocked for longer than the max halt time */
4701
else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
4702
shrink_halt_poll_ns(vc);
4703
/* We slept and our poll time is too small */
4704
else if (vc->halt_poll_ns < halt_poll_ns &&
4705
block_ns < halt_poll_ns)
4706
grow_halt_poll_ns(vc);
4707
if (vc->halt_poll_ns > halt_poll_ns)
4708
vc->halt_poll_ns = halt_poll_ns;
4709
} else
4710
vc->halt_poll_ns = 0;
4711
4712
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
4713
}
4714
4715
/*
4716
* This never fails for a radix guest, as none of the operations it does
4717
* for a radix guest can fail or have a way to report failure.
4718
*/
4719
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
4720
{
4721
int r = 0;
4722
struct kvm *kvm = vcpu->kvm;
4723
4724
mutex_lock(&kvm->arch.mmu_setup_lock);
4725
if (!kvm->arch.mmu_ready) {
4726
if (!kvm_is_radix(kvm))
4727
r = kvmppc_hv_setup_htab_rma(vcpu);
4728
if (!r) {
4729
if (cpu_has_feature(CPU_FTR_ARCH_300))
4730
kvmppc_setup_partition_table(kvm);
4731
kvm->arch.mmu_ready = 1;
4732
}
4733
}
4734
mutex_unlock(&kvm->arch.mmu_setup_lock);
4735
return r;
4736
}
4737
4738
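/*
 * Run loop used when guests are run a whole virtual core at a time
 * (i.e. on hosts without the ARCH_300 / POWER9 feature): add the vcpu to
 * its virtual core as runnable, then either join a vcore that is already
 * running or act as the runner that drives kvmppc_run_core(), until the
 * vcpu stops being runnable or a signal arrives.
 */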
static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
4739
{
4740
struct kvm_run *run = vcpu->run;
4741
int n_ceded, i, r;
4742
struct kvmppc_vcore *vc;
4743
struct kvm_vcpu *v;
4744
4745
trace_kvmppc_run_vcpu_enter(vcpu);
4746
4747
run->exit_reason = 0;
4748
vcpu->arch.ret = RESUME_GUEST;
4749
vcpu->arch.trap = 0;
4750
kvmppc_update_vpas(vcpu);
4751
4752
/*
4753
* Synchronize with other threads in this virtual core
4754
*/
4755
vc = vcpu->arch.vcore;
4756
spin_lock(&vc->lock);
4757
vcpu->arch.ceded = 0;
4758
vcpu->arch.run_task = current;
4759
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
4760
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4761
vcpu->arch.busy_preempt = TB_NIL;
4762
WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
4763
++vc->n_runnable;
4764
4765
/*
4766
* This happens the first time this is called for a vcpu.
4767
* If the vcore is already running, we may be able to start
4768
* this thread straight away and have it join in.
4769
*/
4770
if (!signal_pending(current)) {
4771
if ((vc->vcore_state == VCORE_PIGGYBACK ||
4772
vc->vcore_state == VCORE_RUNNING) &&
4773
!VCORE_IS_EXITING(vc)) {
4774
kvmppc_update_vpa_dispatch(vcpu, vc);
4775
kvmppc_start_thread(vcpu, vc);
4776
trace_kvm_guest_enter(vcpu);
4777
} else if (vc->vcore_state == VCORE_SLEEPING) {
4778
rcuwait_wake_up(&vc->wait);
4779
}
4780
4781
}
4782
4783
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
4784
!signal_pending(current)) {
4785
/* See if the MMU is ready to go */
4786
if (!vcpu->kvm->arch.mmu_ready) {
4787
spin_unlock(&vc->lock);
4788
r = kvmhv_setup_mmu(vcpu);
4789
spin_lock(&vc->lock);
4790
if (r) {
4791
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4792
run->fail_entry.
4793
hardware_entry_failure_reason = 0;
4794
vcpu->arch.ret = r;
4795
break;
4796
}
4797
}
4798
4799
if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
4800
kvmppc_vcore_end_preempt(vc);
4801
4802
if (vc->vcore_state != VCORE_INACTIVE) {
4803
kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
4804
continue;
4805
}
4806
for_each_runnable_thread(i, v, vc) {
4807
kvmppc_core_prepare_to_enter(v);
4808
if (signal_pending(v->arch.run_task)) {
4809
kvmppc_remove_runnable(vc, v, mftb());
4810
v->stat.signal_exits++;
4811
v->run->exit_reason = KVM_EXIT_INTR;
4812
v->arch.ret = -EINTR;
4813
wake_up(&v->arch.cpu_run);
4814
}
4815
}
4816
if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
4817
break;
4818
n_ceded = 0;
4819
for_each_runnable_thread(i, v, vc) {
4820
if (!kvmppc_vcpu_woken(v))
4821
n_ceded += v->arch.ceded;
4822
else
4823
v->arch.ceded = 0;
4824
}
4825
vc->runner = vcpu;
4826
if (n_ceded == vc->n_runnable) {
4827
kvmppc_vcore_blocked(vc);
4828
} else if (need_resched()) {
4829
kvmppc_vcore_preempt(vc);
4830
/* Let something else run */
4831
cond_resched_lock(&vc->lock);
4832
if (vc->vcore_state == VCORE_PREEMPT)
4833
kvmppc_vcore_end_preempt(vc);
4834
} else {
4835
kvmppc_run_core(vc);
4836
}
4837
vc->runner = NULL;
4838
}
4839
4840
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
4841
(vc->vcore_state == VCORE_RUNNING ||
4842
vc->vcore_state == VCORE_EXITING ||
4843
vc->vcore_state == VCORE_PIGGYBACK))
4844
kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
4845
4846
if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
4847
kvmppc_vcore_end_preempt(vc);
4848
4849
if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
4850
kvmppc_remove_runnable(vc, vcpu, mftb());
4851
vcpu->stat.signal_exits++;
4852
run->exit_reason = KVM_EXIT_INTR;
4853
vcpu->arch.ret = -EINTR;
4854
}
4855
4856
if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
4857
/* Wake up some vcpu to run the core */
4858
i = -1;
4859
v = next_runnable_thread(vc, &i);
4860
wake_up(&v->arch.cpu_run);
4861
}
4862
4863
trace_kvmppc_run_vcpu_exit(vcpu);
4864
spin_unlock(&vc->lock);
4865
return vcpu->arch.ret;
4866
}
4867
4868
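/*
 * Run a single vcpu on its own hardware thread, as is done on POWER9 and
 * later where vcpus do not have to be gathered into a whole virtual core:
 * set up the vcpu and LPCR, enter the guest via kvmhv_p9_guest_entry(),
 * handle the resulting exit, and block if the vcpu ceded with nothing
 * pending.
 */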
int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
4869
unsigned long lpcr)
4870
{
4871
struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
4872
struct kvm_run *run = vcpu->run;
4873
int trap, r, pcpu;
4874
int srcu_idx;
4875
struct kvmppc_vcore *vc;
4876
struct kvm *kvm = vcpu->kvm;
4877
struct kvm_nested_guest *nested = vcpu->arch.nested;
4878
unsigned long flags;
4879
u64 tb;
4880
4881
trace_kvmppc_run_vcpu_enter(vcpu);
4882
4883
run->exit_reason = 0;
4884
vcpu->arch.ret = RESUME_GUEST;
4885
vcpu->arch.trap = 0;
4886
4887
vc = vcpu->arch.vcore;
4888
vcpu->arch.ceded = 0;
4889
vcpu->arch.run_task = current;
4890
vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
4891
4892
/* See if the MMU is ready to go */
4893
if (unlikely(!kvm->arch.mmu_ready)) {
4894
r = kvmhv_setup_mmu(vcpu);
4895
if (r) {
4896
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4897
run->fail_entry.hardware_entry_failure_reason = 0;
4898
vcpu->arch.ret = r;
4899
return r;
4900
}
4901
}
4902
4903
if (need_resched())
4904
cond_resched();
4905
4906
kvmppc_update_vpas(vcpu);
4907
4908
preempt_disable();
4909
pcpu = smp_processor_id();
4910
if (kvm_is_radix(kvm))
4911
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
4912
4913
/* flags save not required, but irq_pmu has no disable/enable API */
4914
powerpc_local_irq_pmu_save(flags);
4915
4916
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4917
4918
if (signal_pending(current))
4919
goto sigpend;
4920
if (need_resched() || !kvm->arch.mmu_ready)
4921
goto out;
4922
4923
vcpu->cpu = pcpu;
4924
vcpu->arch.thread_cpu = pcpu;
4925
vc->pcpu = pcpu;
4926
local_paca->kvm_hstate.kvm_vcpu = vcpu;
4927
local_paca->kvm_hstate.ptid = 0;
4928
local_paca->kvm_hstate.fake_suspend = 0;
4929
4930
/*
4931
* Orders set cpu/thread_cpu vs testing for pending interrupts and
4932
* doorbells below. The other side is when these fields are set vs
4933
* kvmppc_fast_vcpu_kick_hv reading the cpu/thread_cpu fields to
4934
* kick a vCPU to notice the pending interrupt.
4935
*/
4936
smp_mb();
4937
4938
if (!nested) {
4939
kvmppc_core_prepare_to_enter(vcpu);
4940
if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
4941
&vcpu->arch.pending_exceptions) ||
4942
xive_interrupt_pending(vcpu)) {
4943
/*
4944
* For nested HV, don't synthesize the interrupt but always pass MER;
4945
* the L0 will be able to optimise that more
4946
* effectively than manipulating registers directly.
4947
*/
4948
if (!kvmhv_on_pseries() && (__kvmppc_get_msr_hv(vcpu) & MSR_EE))
4949
kvmppc_inject_interrupt_hv(vcpu,
4950
BOOK3S_INTERRUPT_EXTERNAL, 0);
4951
else
4952
lpcr |= LPCR_MER;
4953
} else {
4954
/*
4955
* L1's copy of L2's LPCR (vcpu->arch.vcore->lpcr) can get its MER bit
4956
* unexpectedly set - e.g. during NMI handling when all register
4957
* states are synchronized from L0 to L1. L1 needs to inform L0 about
4958
* MER=1 only when there are pending external interrupts.
4959
* In the above if check, MER bit is set if there are pending
4960
* external interrupts. Hence, explicitly mask off MER bit
4961
* here as otherwise it may generate spurious interrupts in L2 KVM
4962
* causing an endless loop, which results in L2 guest getting hung.
4963
*/
4964
lpcr &= ~LPCR_MER;
4965
}
4966
} else if (vcpu->arch.pending_exceptions ||
4967
xive_interrupt_pending(vcpu)) {
4968
vcpu->arch.ret = RESUME_HOST;
4969
goto out;
4970
}
4971
4972
if (vcpu->arch.timer_running) {
4973
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
4974
vcpu->arch.timer_running = 0;
4975
}
4976
4977
tb = mftb();
4978
4979
kvmppc_update_vpa_dispatch_p9(vcpu, vc, tb + kvmppc_get_tb_offset(vcpu));
4980
4981
trace_kvm_guest_enter(vcpu);
4982
4983
guest_timing_enter_irqoff();
4984
4985
srcu_idx = srcu_read_lock(&kvm->srcu);
4986
4987
guest_state_enter_irqoff();
4988
this_cpu_disable_ftrace();
4989
4990
trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, &tb);
4991
vcpu->arch.trap = trap;
4992
4993
this_cpu_enable_ftrace();
4994
guest_state_exit_irqoff();
4995
4996
srcu_read_unlock(&kvm->srcu, srcu_idx);
4997
4998
set_irq_happened(trap);
4999
5000
vcpu->cpu = -1;
5001
vcpu->arch.thread_cpu = -1;
5002
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
5003
5004
if (!vtime_accounting_enabled_this_cpu()) {
5005
powerpc_local_irq_pmu_restore(flags);
5006
/*
5007
* Service IRQs here before guest_timing_exit_irqoff() so any
5008
* ticks that occurred while running the guest are accounted to
5009
* the guest. If vtime accounting is enabled, accounting uses
5010
* TB rather than ticks, so it can be done without enabling
5011
* interrupts here, which has the problem that it accounts
5012
* interrupt processing overhead to the host.
5013
*/
5014
powerpc_local_irq_pmu_save(flags);
5015
}
5016
guest_timing_exit_irqoff();
5017
5018
powerpc_local_irq_pmu_restore(flags);
5019
5020
preempt_enable();
5021
5022
/*
5023
* cancel pending decrementer exception if DEC is now positive, or if
5024
* entering a nested guest in which case the decrementer is now owned
5025
* by L2 and the L1 decrementer is provided in hdec_expires
5026
*/
5027
if (kvmppc_core_pending_dec(vcpu) &&
5028
((tb < kvmppc_dec_expires_host_tb(vcpu)) ||
5029
(trap == BOOK3S_INTERRUPT_SYSCALL &&
5030
kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
5031
kvmppc_core_dequeue_dec(vcpu);
5032
5033
trace_kvm_guest_exit(vcpu);
5034
r = RESUME_GUEST;
5035
if (trap) {
5036
if (!nested)
5037
r = kvmppc_handle_exit_hv(vcpu, current);
5038
else
5039
r = kvmppc_handle_nested_exit(vcpu);
5040
}
5041
vcpu->arch.ret = r;
5042
5043
if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
5044
kvmppc_set_timer(vcpu);
5045
5046
prepare_to_rcuwait(wait);
5047
for (;;) {
5048
set_current_state(TASK_INTERRUPTIBLE);
5049
if (signal_pending(current)) {
5050
vcpu->stat.signal_exits++;
5051
run->exit_reason = KVM_EXIT_INTR;
5052
vcpu->arch.ret = -EINTR;
5053
break;
5054
}
5055
5056
if (kvmppc_vcpu_check_block(vcpu))
5057
break;
5058
5059
trace_kvmppc_vcore_blocked(vcpu, 0);
5060
schedule();
5061
trace_kvmppc_vcore_blocked(vcpu, 1);
5062
}
5063
finish_rcuwait(wait);
5064
}
5065
vcpu->arch.ceded = 0;
5066
5067
done:
5068
trace_kvmppc_run_vcpu_exit(vcpu);
5069
5070
return vcpu->arch.ret;
5071
5072
sigpend:
5073
vcpu->stat.signal_exits++;
5074
run->exit_reason = KVM_EXIT_INTR;
5075
vcpu->arch.ret = -EINTR;
5076
out:
5077
vcpu->cpu = -1;
5078
vcpu->arch.thread_cpu = -1;
5079
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
5080
powerpc_local_irq_pmu_restore(flags);
5081
preempt_enable();
5082
goto done;
5083
}
5084
5085
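/*
 * Top-level entry point for running an HV guest vcpu: check the entry
 * conditions, enable the FP/VEC/VSX/TM facilities the guest may use, then
 * loop entering the guest and handling hcalls, page faults and
 * passthrough interrupts until the exit has to be delivered to userspace.
 */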
static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
5086
{
5087
struct kvm_run *run = vcpu->run;
5088
int r;
5089
int srcu_idx;
5090
struct kvm *kvm;
5091
unsigned long msr;
5092
5093
start_timing(vcpu, &vcpu->arch.vcpu_entry);
5094
5095
if (!vcpu->arch.sane) {
5096
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5097
return -EINVAL;
5098
}
5099
5100
/* No need to go into the guest when all we'll do is come back out */
5101
if (signal_pending(current)) {
5102
run->exit_reason = KVM_EXIT_INTR;
5103
return -EINTR;
5104
}
5105
5106
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
5107
/*
5108
* Don't allow entry with a suspended transaction, because
5109
* the guest entry/exit code will lose it.
5110
*/
5111
if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
5112
(current->thread.regs->msr & MSR_TM)) {
5113
if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
5114
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
5115
run->fail_entry.hardware_entry_failure_reason = 0;
5116
return -EINVAL;
5117
}
5118
}
5119
#endif
5120
5121
/*
5122
* Force online to 1 for the sake of old userspace which doesn't
5123
* set it.
5124
*/
5125
if (!vcpu->arch.online) {
5126
atomic_inc(&vcpu->arch.vcore->online_count);
5127
vcpu->arch.online = 1;
5128
}
5129
5130
kvmppc_core_prepare_to_enter(vcpu);
5131
5132
kvm = vcpu->kvm;
5133
atomic_inc(&kvm->arch.vcpus_running);
5134
/* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
5135
smp_mb();
5136
5137
msr = 0;
5138
if (IS_ENABLED(CONFIG_PPC_FPU))
5139
msr |= MSR_FP;
5140
if (cpu_has_feature(CPU_FTR_ALTIVEC))
5141
msr |= MSR_VEC;
5142
if (cpu_has_feature(CPU_FTR_VSX))
5143
msr |= MSR_VSX;
5144
if ((cpu_has_feature(CPU_FTR_TM) ||
5145
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
5146
(kvmppc_get_hfscr_hv(vcpu) & HFSCR_TM))
5147
msr |= MSR_TM;
5148
msr = msr_check_and_set(msr);
5149
5150
kvmppc_save_user_regs();
5151
5152
kvmppc_save_current_sprs();
5153
5154
if (!cpu_has_feature(CPU_FTR_ARCH_300))
5155
vcpu->arch.waitp = &vcpu->arch.vcore->wait;
5156
vcpu->arch.pgdir = kvm->mm->pgd;
5157
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
5158
5159
do {
5160
accumulate_time(vcpu, &vcpu->arch.guest_entry);
5161
if (cpu_has_feature(CPU_FTR_ARCH_300))
5162
r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
5163
vcpu->arch.vcore->lpcr);
5164
else
5165
r = kvmppc_run_vcpu(vcpu);
5166
5167
if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
5168
accumulate_time(vcpu, &vcpu->arch.hcall);
5169
5170
if (!kvmhv_is_nestedv2() && WARN_ON_ONCE(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
5171
/*
5172
* These should have been caught and reflected
5173
* into the guest by now. Final sanity check:
5174
* don't allow userspace to execute hcalls in
5175
* the hypervisor.
5176
*/
5177
r = RESUME_GUEST;
5178
continue;
5179
}
5180
trace_kvm_hcall_enter(vcpu);
5181
r = kvmppc_pseries_do_hcall(vcpu);
5182
trace_kvm_hcall_exit(vcpu, r);
5183
kvmppc_core_prepare_to_enter(vcpu);
5184
} else if (r == RESUME_PAGE_FAULT) {
5185
accumulate_time(vcpu, &vcpu->arch.pg_fault);
5186
srcu_idx = srcu_read_lock(&kvm->srcu);
5187
r = kvmppc_book3s_hv_page_fault(vcpu,
5188
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
5189
srcu_read_unlock(&kvm->srcu, srcu_idx);
5190
} else if (r == RESUME_PASSTHROUGH) {
5191
if (WARN_ON(xics_on_xive()))
5192
r = H_SUCCESS;
5193
else
5194
r = kvmppc_xics_rm_complete(vcpu, 0);
5195
}
5196
} while (is_kvmppc_resume_guest(r));
5197
accumulate_time(vcpu, &vcpu->arch.vcpu_exit);
5198
5199
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
5200
atomic_dec(&kvm->arch.vcpus_running);
5201
5202
srr_regs_clobbered();
5203
5204
end_timing(vcpu);
5205
5206
return r;
5207
}
5208
5209
static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
5210
int shift, int sllp)
5211
{
5212
(*sps)->page_shift = shift;
5213
(*sps)->slb_enc = sllp;
5214
(*sps)->enc[0].page_shift = shift;
5215
(*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
5216
/*
5217
* Add 16MB MPSS support (may get filtered out by userspace)
5218
*/
5219
if (shift != 24) {
5220
int penc = kvmppc_pgsize_lp_encoding(shift, 24);
5221
if (penc != -1) {
5222
(*sps)->enc[1].page_shift = 24;
5223
(*sps)->enc[1].pte_enc = penc;
5224
}
5225
}
5226
(*sps)++;
5227
}
5228
5229
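/*
 * Report the hash-MMU characteristics we support to userspace: storage
 * keys, 1T segments, the SLB size and the supported base page sizes
 * (4k, 64k, 16M).
 */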
static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
5230
struct kvm_ppc_smmu_info *info)
5231
{
5232
struct kvm_ppc_one_seg_page_size *sps;
5233
5234
/*
5235
* POWER7, POWER8 and POWER9 all support 32 storage keys for data.
5236
* POWER7 doesn't support keys for instruction accesses,
5237
* POWER8 and POWER9 do.
5238
*/
5239
info->data_keys = 32;
5240
info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
5241
5242
/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
5243
info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
5244
info->slb_size = 32;
5245
5246
/* We only support these sizes for now, and no multi-size segments */
5247
sps = &info->sps[0];
5248
kvmppc_add_seg_page_size(&sps, 12, 0);
5249
kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
5250
kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
5251
5252
/* If running as a nested hypervisor, we don't support HPT guests */
5253
if (kvmhv_on_pseries())
5254
info->flags |= KVM_PPC_NO_HASH;
5255
5256
return 0;
5257
}
5258
5259
/*
5260
* Get (and clear) the dirty memory log for a memory slot.
5261
*/
5262
static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
5263
struct kvm_dirty_log *log)
5264
{
5265
struct kvm_memslots *slots;
5266
struct kvm_memory_slot *memslot;
5267
int r;
5268
unsigned long n, i;
5269
unsigned long *buf, *p;
5270
struct kvm_vcpu *vcpu;
5271
5272
mutex_lock(&kvm->slots_lock);
5273
5274
r = -EINVAL;
5275
if (log->slot >= KVM_USER_MEM_SLOTS)
5276
goto out;
5277
5278
slots = kvm_memslots(kvm);
5279
memslot = id_to_memslot(slots, log->slot);
5280
r = -ENOENT;
5281
if (!memslot || !memslot->dirty_bitmap)
5282
goto out;
5283
5284
/*
5285
* Use second half of bitmap area because both HPT and radix
5286
* accumulate bits in the first half.
5287
*/
5288
n = kvm_dirty_bitmap_bytes(memslot);
5289
buf = memslot->dirty_bitmap + n / sizeof(long);
5290
memset(buf, 0, n);
5291
5292
if (kvm_is_radix(kvm))
5293
r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
5294
else
5295
r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
5296
if (r)
5297
goto out;
5298
5299
/*
5300
* We accumulate dirty bits in the first half of the
5301
* memslot's dirty_bitmap area, for when pages are paged
5302
* out or modified by the host directly. Pick up these
5303
* bits and add them to the map.
5304
*/
5305
p = memslot->dirty_bitmap;
5306
for (i = 0; i < n / sizeof(long); ++i)
5307
buf[i] |= xchg(&p[i], 0);
5308
5309
/* Harvest dirty bits from VPA and DTL updates */
5310
/* Note: we never modify the SLB shadow buffer areas */
5311
kvm_for_each_vcpu(i, vcpu, kvm) {
5312
spin_lock(&vcpu->arch.vpa_update_lock);
5313
kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
5314
kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
5315
spin_unlock(&vcpu->arch.vpa_update_lock);
5316
}
5317
5318
r = -EFAULT;
5319
if (copy_to_user(log->dirty_bitmap, buf, n))
5320
goto out;
5321
5322
r = 0;
5323
out:
5324
mutex_unlock(&kvm->slots_lock);
5325
return r;
5326
}
5327
5328
static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
5329
{
5330
vfree(slot->arch.rmap);
5331
slot->arch.rmap = NULL;
5332
}
5333
5334
static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
5335
const struct kvm_memory_slot *old,
5336
struct kvm_memory_slot *new,
5337
enum kvm_mr_change change)
5338
{
5339
if (change == KVM_MR_CREATE) {
5340
unsigned long size = array_size(new->npages, sizeof(*new->arch.rmap));
5341
5342
if ((size >> PAGE_SHIFT) > totalram_pages())
5343
return -ENOMEM;
5344
5345
new->arch.rmap = vzalloc(size);
5346
if (!new->arch.rmap)
5347
return -ENOMEM;
5348
} else if (change != KVM_MR_DELETE) {
5349
new->arch.rmap = old->arch.rmap;
5350
}
5351
5352
return 0;
5353
}
5354
5355
static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
5356
struct kvm_memory_slot *old,
5357
const struct kvm_memory_slot *new,
5358
enum kvm_mr_change change)
5359
{
5360
/*
5361
* If we are creating or modifying a memslot, it might make
5362
* some address that was previously cached as emulated
5363
* MMIO be no longer emulated MMIO, so invalidate
5364
* all the caches of emulated MMIO translations.
5365
*/
5366
if (change != KVM_MR_DELETE)
5367
atomic64_inc(&kvm->arch.mmio_update);
5368
5369
/*
5370
* For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
5371
* have already called kvm_arch_flush_shadow_memslot() to
5372
* flush shadow mappings. For KVM_MR_CREATE we have no
5373
* previous mappings. So the only case to handle is
5374
* KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
5375
* has been changed.
5376
* For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
5377
* to get rid of any THP PTEs in the partition-scoped page tables
5378
* so we can track dirtiness at the page level; we flush when
5379
* clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
5380
* using THP PTEs.
5381
*/
5382
if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
5383
((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
5384
kvmppc_radix_flush_memslot(kvm, old);
5385
/*
5386
* If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
5387
*/
5388
if (!kvm->arch.secure_guest)
5389
return;
5390
5391
switch (change) {
5392
case KVM_MR_CREATE:
5393
/*
5394
* @TODO kvmppc_uvmem_memslot_create() can fail and
5395
* return error. Fix this.
5396
*/
5397
kvmppc_uvmem_memslot_create(kvm, new);
5398
break;
5399
case KVM_MR_DELETE:
5400
kvmppc_uvmem_memslot_delete(kvm, old);
5401
break;
5402
default:
5403
/* TODO: Handle KVM_MR_MOVE */
5404
break;
5405
}
5406
}
5407
5408
/*
5409
* Update LPCR values in kvm->arch and in vcores.
5410
* Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
5411
* of kvm->arch.lpcr update).
5412
*/
5413
void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
5414
{
5415
long int i;
5416
u32 cores_done = 0;
5417
5418
if ((kvm->arch.lpcr & mask) == lpcr)
5419
return;
5420
5421
kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
5422
5423
for (i = 0; i < KVM_MAX_VCORES; ++i) {
5424
struct kvmppc_vcore *vc = kvm->arch.vcores[i];
5425
if (!vc)
5426
continue;
5427
5428
spin_lock(&vc->lock);
5429
vc->lpcr = (vc->lpcr & ~mask) | lpcr;
5430
verify_lpcr(kvm, vc->lpcr);
5431
spin_unlock(&vc->lock);
5432
if (++cores_done >= kvm->arch.online_vcores)
5433
break;
5434
}
5435
5436
if (kvmhv_is_nestedv2()) {
5437
struct kvm_vcpu *vcpu;
5438
5439
kvm_for_each_vcpu(i, vcpu, kvm) {
5440
kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LPCR);
5441
}
5442
}
5443
}
5444
5445
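/*
 * Write this guest's partition-table entry: for an HPT guest, dword 0
 * holds the VRMA page-size encoding and the HTAB origin and size, while
 * for a radix guest it points to the partition-scoped page table; dword 1
 * is the process-table pointer.
 */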
void kvmppc_setup_partition_table(struct kvm *kvm)
5446
{
5447
unsigned long dw0, dw1;
5448
5449
if (!kvm_is_radix(kvm)) {
5450
/* PS field - page size for VRMA */
5451
dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
5452
((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
5453
/* HTABSIZE and HTABORG fields */
5454
dw0 |= kvm->arch.sdr1;
5455
5456
/* Second dword as set by userspace */
5457
dw1 = kvm->arch.process_table;
5458
} else {
5459
dw0 = PATB_HR | radix__get_tree_size() |
5460
__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
5461
dw1 = PATB_GR | kvm->arch.process_table;
5462
}
5463
kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
5464
}
5465
5466
/*
5467
* Set up HPT (hashed page table) and RMA (real-mode area).
5468
* Must be called with kvm->arch.mmu_setup_lock held.
5469
*/
5470
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
5471
{
5472
int err = 0;
5473
struct kvm *kvm = vcpu->kvm;
5474
unsigned long hva;
5475
struct kvm_memory_slot *memslot;
5476
struct vm_area_struct *vma;
5477
unsigned long lpcr = 0, senc;
5478
unsigned long psize, porder;
5479
int srcu_idx;
5480
5481
/* Allocate hashed page table (if not done already) and reset it */
5482
if (!kvm->arch.hpt.virt) {
5483
int order = KVM_DEFAULT_HPT_ORDER;
5484
struct kvm_hpt_info info;
5485
5486
err = kvmppc_allocate_hpt(&info, order);
5487
/* If we get here, it means userspace didn't specify a
5488
* size explicitly. So, try successively smaller
5489
* sizes if the default failed. */
5490
while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
5491
err = kvmppc_allocate_hpt(&info, order);
5492
5493
if (err < 0) {
5494
pr_err("KVM: Couldn't alloc HPT\n");
5495
goto out;
5496
}
5497
5498
kvmppc_set_hpt(kvm, &info);
5499
}
5500
5501
/* Look up the memslot for guest physical address 0 */
5502
srcu_idx = srcu_read_lock(&kvm->srcu);
5503
memslot = gfn_to_memslot(kvm, 0);
5504
5505
/* We must have some memory at 0 by now */
5506
err = -EINVAL;
5507
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
5508
goto out_srcu;
5509
5510
/* Look up the VMA for the start of this memory slot */
5511
hva = memslot->userspace_addr;
5512
mmap_read_lock(kvm->mm);
5513
vma = vma_lookup(kvm->mm, hva);
5514
if (!vma || (vma->vm_flags & VM_IO))
5515
goto up_out;
5516
5517
psize = vma_kernel_pagesize(vma);
5518
5519
mmap_read_unlock(kvm->mm);
5520
5521
/* We can handle 4k, 64k or 16M pages in the VRMA */
5522
if (psize >= 0x1000000)
5523
psize = 0x1000000;
5524
else if (psize >= 0x10000)
5525
psize = 0x10000;
5526
else
5527
psize = 0x1000;
5528
porder = __ilog2(psize);
5529
5530
senc = slb_pgsize_encoding(psize);
5531
kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
5532
(VRMA_VSID << SLB_VSID_SHIFT_1T);
5533
/* Create HPTEs in the hash page table for the VRMA */
5534
kvmppc_map_vrma(vcpu, memslot, porder);
5535
5536
/* Update VRMASD field in the LPCR */
5537
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
5538
/* the -4 is to account for senc values starting at 0x10 */
5539
lpcr = senc << (LPCR_VRMASD_SH - 4);
5540
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
5541
}
5542
5543
/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
5544
smp_wmb();
5545
err = 0;
5546
out_srcu:
5547
srcu_read_unlock(&kvm->srcu, srcu_idx);
5548
out:
5549
return err;
5550
5551
up_out:
5552
mmap_read_unlock(kvm->mm);
5553
goto out_srcu;
5554
}
5555
5556
/*
5557
* Must be called with kvm->arch.mmu_setup_lock held and
5558
* mmu_ready = 0 and no vcpus running.
5559
*/
5560
int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
5561
{
5562
unsigned long lpcr, lpcr_mask;
5563
5564
if (nesting_enabled(kvm))
5565
kvmhv_release_all_nested(kvm);
5566
kvmppc_rmap_reset(kvm);
5567
kvm->arch.process_table = 0;
5568
/* Mutual exclusion with kvm_unmap_gfn_range etc. */
5569
spin_lock(&kvm->mmu_lock);
5570
kvm->arch.radix = 0;
5571
spin_unlock(&kvm->mmu_lock);
5572
kvmppc_free_radix(kvm);
5573
5574
lpcr = LPCR_VPM1;
5575
lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
5576
if (cpu_has_feature(CPU_FTR_ARCH_31))
5577
lpcr_mask |= LPCR_HAIL;
5578
kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
5579
5580
return 0;
5581
}
5582
5583
/*
5584
* Must be called with kvm->arch.mmu_setup_lock held and
5585
* mmu_ready = 0 and no vcpus running.
5586
*/
5587
int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
5588
{
5589
unsigned long lpcr, lpcr_mask;
5590
int err;
5591
5592
err = kvmppc_init_vm_radix(kvm);
5593
if (err)
5594
return err;
5595
kvmppc_rmap_reset(kvm);
5596
/* Mutual exclusion with kvm_unmap_gfn_range etc. */
5597
spin_lock(&kvm->mmu_lock);
5598
kvm->arch.radix = 1;
5599
spin_unlock(&kvm->mmu_lock);
5600
kvmppc_free_hpt(&kvm->arch.hpt);
5601
5602
lpcr = LPCR_UPRT | LPCR_GTSE | LPCR_HR;
5603
lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
5604
if (cpu_has_feature(CPU_FTR_ARCH_31)) {
5605
lpcr_mask |= LPCR_HAIL;
5606
if (cpu_has_feature(CPU_FTR_HVMODE) &&
5607
(kvm->arch.host_lpcr & LPCR_HAIL))
5608
lpcr |= LPCR_HAIL;
5609
}
5610
kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
5611
5612
return 0;
5613
}
5614
5615
#ifdef CONFIG_KVM_XICS
5616
/*
5617
* Allocate a per-core structure for managing state about which cores are
5618
* running in the host versus the guest and for exchanging data between
5619
* real mode KVM and CPUs running in the host.
5620
* This is only done for the first VM.
5621
* The allocated structure stays even if all VMs have stopped.
5622
* It is only freed when the kvm-hv module is unloaded.
5623
* It's OK for this routine to fail; we just don't support host
5624
* core operations like redirecting H_IPI wakeups.
5625
*/
5626
void kvmppc_alloc_host_rm_ops(void)
5627
{
5628
struct kvmppc_host_rm_ops *ops;
5629
unsigned long l_ops;
5630
int cpu, core;
5631
int size;
5632
5633
if (cpu_has_feature(CPU_FTR_ARCH_300))
5634
return;
5635
5636
/* Not the first time here ? */
5637
if (kvmppc_host_rm_ops_hv != NULL)
5638
return;
5639
5640
ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
5641
if (!ops)
5642
return;
5643
5644
size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
5645
ops->rm_core = kzalloc(size, GFP_KERNEL);
5646
5647
if (!ops->rm_core) {
5648
kfree(ops);
5649
return;
5650
}
5651
5652
cpus_read_lock();
5653
5654
for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
5655
if (!cpu_online(cpu))
5656
continue;
5657
5658
core = cpu >> threads_shift;
5659
ops->rm_core[core].rm_state.in_host = 1;
5660
}
5661
5662
ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
5663
5664
/*
5665
* Make the contents of the kvmppc_host_rm_ops structure visible
5666
* to other CPUs before we assign it to the global variable.
5667
* Do an atomic assignment (no locks used here), but if someone
5668
* beats us to it, just free our copy and return.
5669
*/
5670
smp_wmb();
5671
l_ops = (unsigned long) ops;
5672
5673
if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
5674
cpus_read_unlock();
5675
kfree(ops->rm_core);
5676
kfree(ops);
5677
return;
5678
}
5679
5680
cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
5681
"ppc/kvm_book3s:prepare",
5682
kvmppc_set_host_core,
5683
kvmppc_clear_host_core);
5684
cpus_read_unlock();
5685
}
5686
5687
void kvmppc_free_host_rm_ops(void)
{
	if (kvmppc_host_rm_ops_hv) {
		cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
		kfree(kvmppc_host_rm_ops_hv->rm_core);
		kfree(kvmppc_host_rm_ops_hv);
		kvmppc_host_rm_ops_hv = NULL;
	}
}
#endif

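/*
 * Set up a new HV-mode VM: allocate the logical partition ID (or, for a
 * nestedv2 guest hypervisor, obtain one from the host via
 * plpar_guest_create()), build the initial LPCR value and MMU mode
 * (radix if the host runs radix), work out the TLB geometry, and pick
 * the default SMT mode for this processor generation.
 */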
static int kvmppc_core_init_vm_hv(struct kvm *kvm)
{
	unsigned long lpcr, lpid;
	int ret;

	mutex_init(&kvm->arch.uvmem_lock);
	INIT_LIST_HEAD(&kvm->arch.uvmem_pfns);
	mutex_init(&kvm->arch.mmu_setup_lock);

	/* Allocate the guest's logical partition ID */

	if (!kvmhv_is_nestedv2()) {
		lpid = kvmppc_alloc_lpid();
		if ((long)lpid < 0)
			return -ENOMEM;
		kvm->arch.lpid = lpid;
	}

	kvmppc_alloc_host_rm_ops();

	kvmhv_vm_nested_init(kvm);

	if (kvmhv_is_nestedv2()) {
		long rc;
		unsigned long guest_id;

		rc = plpar_guest_create(0, &guest_id);

		if (rc != H_SUCCESS)
			pr_err("KVM: Create Guest hcall failed, rc=%ld\n", rc);

		switch (rc) {
		case H_PARAMETER:
		case H_FUNCTION:
		case H_STATE:
			return -EINVAL;
		case H_NOT_ENOUGH_RESOURCES:
		case H_ABORTED:
			return -ENOMEM;
		case H_AUTHORITY:
			return -EPERM;
		case H_NOT_AVAILABLE:
			return -EBUSY;
		}
		kvm->arch.lpid = guest_id;
	}

	/*
	 * Since we don't flush the TLB when tearing down a VM,
	 * and this lpid might have previously been used,
	 * make sure we flush on each core before running the new VM.
	 * On POWER9, the tlbie in mmu_partition_table_set_entry()
	 * does this flush for us.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		cpumask_setall(&kvm->arch.need_tlb_flush);

	/* Start out with the default set of hcalls enabled */
	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
	       sizeof(kvm->arch.enabled_hcalls));

	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);

	/* Init LPCR for virtual RMA mode */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		kvm->arch.host_lpid = mfspr(SPRN_LPID);
		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
		lpcr &= LPCR_PECE | LPCR_LPES;
	} else {
		/*
		 * The L2 LPES mode will be set by the L0 according to whether
		 * or not it needs to take external interrupts in HV mode.
		 */
		lpcr = 0;
	}
	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
		LPCR_VPM0 | LPCR_VPM1;
	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
		(VRMA_VSID << SLB_VSID_SHIFT_1T);
	/* On POWER8 turn on online bit to enable PURR/SPURR */
	if (cpu_has_feature(CPU_FTR_ARCH_207S))
		lpcr |= LPCR_ONL;
	/*
	 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed).
	 * Set HVICE bit to enable hypervisor virtualization interrupts.
	 * Set HEIC to prevent OS interrupts from going to the hypervisor
	 * (should be unnecessary, but better safe than sorry in case we
	 * re-enable EE in HV mode with this LPCR still set).
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		lpcr &= ~LPCR_VPM0;
		lpcr |= LPCR_HVICE | LPCR_HEIC;

		/*
		 * If xive is enabled, we route 0x500 interrupts directly
		 * to the guest.
		 */
		if (xics_on_xive())
			lpcr |= LPCR_LPES;
	}

	/*
	 * If the host uses radix, the guest starts out as radix.
	 */
	if (radix_enabled()) {
		kvm->arch.radix = 1;
		kvm->arch.mmu_ready = 1;
		lpcr &= ~LPCR_VPM1;
		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
		if (cpu_has_feature(CPU_FTR_HVMODE) &&
		    cpu_has_feature(CPU_FTR_ARCH_31) &&
		    (kvm->arch.host_lpcr & LPCR_HAIL))
			lpcr |= LPCR_HAIL;
		ret = kvmppc_init_vm_radix(kvm);
		if (ret) {
			if (kvmhv_is_nestedv2())
				plpar_guest_delete(0, kvm->arch.lpid);
			else
				kvmppc_free_lpid(kvm->arch.lpid);
			return ret;
		}
		kvmppc_setup_partition_table(kvm);
	}

	verify_lpcr(kvm, lpcr);
	kvm->arch.lpcr = lpcr;

	/* Initialization for future HPT resizes */
	kvm->arch.resize_hpt = NULL;

	/*
	 * Work out how many sets the TLB has, for the use of
	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
		/*
		 * P10 will flush all the congruence classes with a single tlbiel
		 */
		kvm->arch.tlb_sets = 1;
	} else if (radix_enabled())
		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
	else if (cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
	else
		kvm->arch.tlb_sets = POWER7_TLB_SETS;		/* 128 */

	/*
	 * Track that we now have an HV mode VM active. This blocks secondary
	 * CPU threads from coming online.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm_hv_vm_activated();

	/*
	 * Initialize smt_mode depending on processor.
	 * POWER8 and earlier have to use "strict" threading, where
	 * all vCPUs in a vcore have to run on the same (sub)core,
	 * whereas on POWER9 the threads can each run a different
	 * guest.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.smt_mode = threads_per_subcore;
	else
		kvm->arch.smt_mode = 1;
	kvm->arch.emul_smt_mode = 1;

	return 0;
}

static int kvmppc_arch_create_vm_debugfs_hv(struct kvm *kvm)
{
	kvmppc_mmu_debugfs_init(kvm);
	if (radix_enabled())
		kvmhv_radix_debugfs_init(kvm);
	return 0;
}

static void kvmppc_free_vcores(struct kvm *kvm)
{
	long int i;

	for (i = 0; i < KVM_MAX_VCORES; ++i)
		kfree(kvm->arch.vcores[i]);
	kvm->arch.online_vcores = 0;
}

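/*
 * Tear down an HV-mode VM: free the virtual cores and the guest MMU state
 * (radix page tables or HPT), release any nested-guest and secure-guest
 * state, clear the partition-table entry and give back the LPID.
 */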
static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm_hv_vm_deactivated();

	kvmppc_free_vcores(kvm);

	if (kvm_is_radix(kvm))
		kvmppc_free_radix(kvm);
	else
		kvmppc_free_hpt(&kvm->arch.hpt);

	/* Perform global invalidation and return lpid to the pool */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		if (nesting_enabled(kvm))
			kvmhv_release_all_nested(kvm);
		kvm->arch.process_table = 0;
		if (kvm->arch.secure_guest)
			uv_svm_terminate(kvm->arch.lpid);
		if (!kvmhv_is_nestedv2())
			kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
	}

	if (kvmhv_is_nestedv2()) {
		kvmhv_flush_lpid(kvm->arch.lpid);
		plpar_guest_delete(0, kvm->arch.lpid);
	} else {
		kvmppc_free_lpid(kvm->arch.lpid);
	}

	kvmppc_free_pimap(kvm);
}

/* We don't need to emulate any privileged instructions or dcbz */
static int kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu,
				     unsigned int inst, int *advance)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
					ulong spr_val)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
					ulong *spr_val)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_check_processor_compat_hv(void)
{
	if (cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_ARCH_206))
		return 0;

	/* POWER9 in radix mode is capable of being a nested hypervisor. */
	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
		return 0;

	return -EIO;
}

#ifdef CONFIG_KVM_XICS

void kvmppc_free_pimap(struct kvm *kvm)
{
	kfree(kvm->arch.pimap);
}

static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
{
	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
}

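/*
 * Map a host interrupt onto a guest interrupt source for passthrough:
 * record the (guest GSI, host IRQ) pair in the per-VM pimap so the
 * real-mode code can EOI it, and tell XIVE or XICS about the mapping.
 */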
static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
{
	struct irq_desc *desc;
	struct kvmppc_irq_map *irq_map;
	struct kvmppc_passthru_irqmap *pimap;
	struct irq_chip *chip;
	int i, rc = 0;
	struct irq_data *host_data;

	if (!kvm_irq_bypass)
		return 1;

	desc = irq_to_desc(host_irq);
	if (!desc)
		return -EIO;

	mutex_lock(&kvm->lock);

	pimap = kvm->arch.pimap;
	if (pimap == NULL) {
		/* First call, allocate structure to hold IRQ map */
		pimap = kvmppc_alloc_pimap();
		if (pimap == NULL) {
			mutex_unlock(&kvm->lock);
			return -ENOMEM;
		}
		kvm->arch.pimap = pimap;
	}

	/*
	 * For now, we only support interrupts for which the EOI operation
	 * is an OPAL call followed by a write to XIRR, since that's
	 * what our real-mode EOI code does, or a XIVE interrupt
	 */
	chip = irq_data_get_irq_chip(&desc->irq_data);
	if (!chip || !is_pnv_opal_msi(chip)) {
		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
			host_irq, guest_gsi);
		mutex_unlock(&kvm->lock);
		return -ENOENT;
	}

	/*
	 * See if we already have an entry for this guest IRQ number.
	 * If it's mapped to a hardware IRQ number, that's an error,
	 * otherwise re-use this entry.
	 */
	for (i = 0; i < pimap->n_mapped; i++) {
		if (guest_gsi == pimap->mapped[i].v_hwirq) {
			if (pimap->mapped[i].r_hwirq) {
				mutex_unlock(&kvm->lock);
				return -EINVAL;
			}
			break;
		}
	}

	if (i == KVMPPC_PIRQ_MAPPED) {
		mutex_unlock(&kvm->lock);
		return -EAGAIN;		/* table is full */
	}

	irq_map = &pimap->mapped[i];

	irq_map->v_hwirq = guest_gsi;
	irq_map->desc = desc;

	/*
	 * Order the above two stores before the next to serialize with
	 * the KVM real mode handler.
	 */
	smp_wmb();

	/*
	 * The 'host_irq' number is mapped in the PCI-MSI domain but
	 * the underlying calls, which will EOI the interrupt in real
	 * mode, need an HW IRQ number mapped in the XICS IRQ domain.
	 */
	host_data = irq_domain_get_irq_data(irq_get_default_domain(), host_irq);
	irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);

	if (i == pimap->n_mapped)
		pimap->n_mapped++;

	if (xics_on_xive())
		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
	else
		kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
	if (rc)
		irq_map->r_hwirq = 0;

	mutex_unlock(&kvm->lock);

	return 0;
}

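/*
 * Undo a passthrough mapping set up by kvmppc_set_passthru_irq(): tell
 * XIVE or XICS that the guest GSI is no longer mapped and invalidate the
 * pimap entry (the pimap itself is only freed when the VM is destroyed).
 */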
static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
{
	struct irq_desc *desc;
	struct kvmppc_passthru_irqmap *pimap;
	int i, rc = 0;

	if (!kvm_irq_bypass)
		return 0;

	desc = irq_to_desc(host_irq);
	if (!desc)
		return -EIO;

	mutex_lock(&kvm->lock);
	if (!kvm->arch.pimap)
		goto unlock;

	pimap = kvm->arch.pimap;

	for (i = 0; i < pimap->n_mapped; i++) {
		if (guest_gsi == pimap->mapped[i].v_hwirq)
			break;
	}

	if (i == pimap->n_mapped) {
		mutex_unlock(&kvm->lock);
		return -ENODEV;
	}

	if (xics_on_xive())
		rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
	else
		kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);

	/* invalidate the entry (what to do on error from the above ?) */
	pimap->mapped[i].r_hwirq = 0;

	/*
	 * We don't free this structure even when the count goes to
	 * zero. The structure is freed when we destroy the VM.
	 */
unlock:
	mutex_unlock(&kvm->lock);
	return rc;
}

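/*
 * irqbypass callbacks: wire an irqfd's producer (the host IRQ) to the
 * guest GSI when a producer is registered, and fall back to the normal
 * host interrupt path when it is unregistered.
 */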
static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
					     struct irq_bypass_producer *prod)
{
	int ret = 0;
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	irqfd->producer = prod;

	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
	if (ret)
		pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
			prod->irq, irqfd->gsi, ret);

	return ret;
}

static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
					      struct irq_bypass_producer *prod)
{
	int ret;
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	irqfd->producer = NULL;

	/*
	 * When the producer of the consumer is unregistered, we change back
	 * to the default external interrupt handling mode: KVM real mode
	 * will switch back to the host.
	 */
	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
	if (ret)
		pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
			prod->irq, irqfd->gsi, ret);
}
#endif

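/*
 * VM-scope ioctls specific to HV KVM: HPT allocation, the file descriptor
 * for reading the HPT, and HPT resizing.
 */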
static int kvm_arch_vm_ioctl_hv(struct file *filp,
				unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm __maybe_unused = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	switch (ioctl) {

	case KVM_PPC_ALLOCATE_HTAB: {
		u32 htab_order;

		/* If we're a nested hypervisor, we currently only support radix */
		if (kvmhv_on_pseries()) {
			r = -EOPNOTSUPP;
			break;
		}

		r = -EFAULT;
		if (get_user(htab_order, (u32 __user *)argp))
			break;
		r = kvmppc_alloc_reset_hpt(kvm, htab_order);
		if (r)
			break;
		r = 0;
		break;
	}

	case KVM_PPC_GET_HTAB_FD: {
		struct kvm_get_htab_fd ghf;

		r = -EFAULT;
		if (copy_from_user(&ghf, argp, sizeof(ghf)))
			break;
		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
		break;
	}

	case KVM_PPC_RESIZE_HPT_PREPARE: {
		struct kvm_ppc_resize_hpt rhpt;

		r = -EFAULT;
		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
			break;

		r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
		break;
	}

	case KVM_PPC_RESIZE_HPT_COMMIT: {
		struct kvm_ppc_resize_hpt rhpt;

		r = -EFAULT;
		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
			break;

		r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
		break;
	}

	default:
		r = -ENOTTY;
	}

	return r;
}

/*
 * List of hcall numbers to enable by default.
 * For compatibility with old userspace, we enable by default
 * all hcalls that were implemented before the hcall-enabling
 * facility was added. Note this list should not include H_RTAS.
 */
static unsigned int default_hcall_list[] = {
	H_REMOVE,
	H_ENTER,
	H_READ,
	H_PROTECT,
	H_BULK_REMOVE,
#ifdef CONFIG_SPAPR_TCE_IOMMU
	H_GET_TCE,
	H_PUT_TCE,
#endif
	H_SET_DABR,
	H_SET_XDABR,
	H_CEDE,
	H_PROD,
	H_CONFER,
	H_REGISTER_VPA,
#ifdef CONFIG_KVM_XICS
	H_EOI,
	H_CPPR,
	H_IPI,
	H_IPOLL,
	H_XIRR,
	H_XIRR_X,
#endif
	0
};

static void init_default_hcalls(void)
{
	int i;
	unsigned int hcall;

	for (i = 0; default_hcall_list[i]; ++i) {
		hcall = default_hcall_list[i];
		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
		__set_bit(hcall / 4, default_enabled_hcalls);
	}
}

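/*
 * Validate and apply an MMU configuration request from userspace
 * (struct kvm_ppc_mmuv3_cfg): switch the guest between HPT and radix if
 * needed, install the new process table, and update LPCR[GTSE].
 */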
static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{
	unsigned long lpcr;
	int radix;
	int err;

	/* If not on a POWER9, reject it */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return -ENODEV;

	/* If any unknown flags set, reject it */
	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
		return -EINVAL;

	/* GR (guest radix) bit in process_table field must match */
	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
	if (!!(cfg->process_table & PATB_GR) != radix)
		return -EINVAL;

	/* Process table size field must be reasonable, i.e. <= 24 */
	if ((cfg->process_table & PRTS_MASK) > 24)
		return -EINVAL;

	/* We can change a guest to/from radix now, if the host is radix */
	if (radix && !radix_enabled())
		return -EINVAL;

	/* If we're a nested hypervisor, we currently only support radix */
	if (kvmhv_on_pseries() && !radix)
		return -EINVAL;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	if (radix != kvm_is_radix(kvm)) {
		if (kvm->arch.mmu_ready) {
			kvm->arch.mmu_ready = 0;
			/* order mmu_ready vs. vcpus_running */
			smp_mb();
			if (atomic_read(&kvm->arch.vcpus_running)) {
				kvm->arch.mmu_ready = 1;
				err = -EBUSY;
				goto out_unlock;
			}
		}
		if (radix)
			err = kvmppc_switch_mmu_to_radix(kvm);
		else
			err = kvmppc_switch_mmu_to_hpt(kvm);
		if (err)
			goto out_unlock;
	}

	kvm->arch.process_table = cfg->process_table;
	kvmppc_setup_partition_table(kvm);

	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
	err = 0;

out_unlock:
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return err;
}

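/*
 * Enable (or test for) the nested-HV capability: only possible on a
 * radix POWER9 or later host with the "nested" module parameter set,
 * and not when we are already running as a nestedv2 guest hypervisor.
 */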
static int kvmhv_enable_nested(struct kvm *kvm)
{
	if (!nested)
		return -EPERM;
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return -ENODEV;
	if (!radix_enabled())
		return -ENODEV;
	if (kvmhv_is_nestedv2())
		return -ENODEV;

	/* kvm == NULL means the caller is testing if the capability exists */
	if (kvm)
		kvm->arch.nested_enable = true;
	return 0;
}

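/*
 * Read from / write to a guest effective address on behalf of the host,
 * using the radix quadrant access helpers; for now this is the only way
 * nested guest memory can be reached.
 */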
static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
				 int size)
{
	int rc = -EINVAL;

	if (kvmhv_vcpu_is_radix(vcpu)) {
		rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);

		if (rc > 0)
			rc = -EINVAL;
	}

	/* For now quadrants are the only way to access nested guest memory */
	if (rc && vcpu->arch.nested)
		rc = -EAGAIN;

	return rc;
}

static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
				int size)
{
	int rc = -EINVAL;

	if (kvmhv_vcpu_is_radix(vcpu)) {
		rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);

		if (rc > 0)
			rc = -EINVAL;
	}

	/* For now quadrants are the only way to access nested guest memory */
	if (rc && vcpu->arch.nested)
		rc = -EAGAIN;

	return rc;
}

static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa)
{
	unpin_vpa(kvm, vpa);
	vpa->gpa = 0;
	vpa->pinned_addr = NULL;
	vpa->dirty = false;
	vpa->update_pending = 0;
}

/*
 * Enable a guest to become a secure VM, or test whether
 * that could be enabled.
 * Called when the KVM_CAP_PPC_SECURE_GUEST capability is
 * tested (kvm == NULL) or enabled (kvm != NULL).
 */
static int kvmhv_enable_svm(struct kvm *kvm)
{
	if (!kvmppc_uvmem_available())
		return -EINVAL;
	if (kvm)
		kvm->arch.svm_enabled = 1;
	return 0;
}

/*
 * IOCTL handler to turn off secure mode of guest
 *
 * - Release all device pages
 * - Issue ucall to terminate the guest on the UV side
 * - Unpin the VPA pages.
 * - Reinit the partition scoped page tables
 */
static int kvmhv_svm_off(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int mmu_was_ready;
	int srcu_idx;
	int ret = 0;
	unsigned long i;

	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
		return ret;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	mmu_was_ready = kvm->arch.mmu_ready;
	if (kvm->arch.mmu_ready) {
		kvm->arch.mmu_ready = 0;
		/* order mmu_ready vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.mmu_ready = 1;
			ret = -EBUSY;
			goto out;
		}
	}

	srcu_idx = srcu_read_lock(&kvm->srcu);
	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		struct kvm_memory_slot *memslot;
		struct kvm_memslots *slots = __kvm_memslots(kvm, i);
		int bkt;

		if (!slots)
			continue;

		kvm_for_each_memslot(memslot, bkt, slots) {
			kvmppc_uvmem_drop_pages(memslot, kvm, true);
			uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
		}
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	ret = uv_svm_terminate(kvm->arch.lpid);
	if (ret != U_SUCCESS) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * When a secure guest is reset, all the guest pages are sent
	 * to UV via UV_PAGE_IN before the non-boot vcpus get a
	 * chance to run and unpin their VPA pages. Unpinning of all
	 * VPA pages is done here explicitly so that VPA pages
	 * can be migrated to the secure side.
	 *
	 * This is required for the secure SMP guest to reboot
	 * correctly.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		spin_lock(&vcpu->arch.vpa_update_lock);
		unpin_vpa_reset(kvm, &vcpu->arch.dtl);
		unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow);
		unpin_vpa_reset(kvm, &vcpu->arch.vpa);
		spin_unlock(&vcpu->arch.vpa_update_lock);
	}

	kvmppc_setup_partition_table(kvm);
	kvm->arch.secure_guest = 0;
	kvm->arch.mmu_ready = mmu_was_ready;
out:
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return ret;
}

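/*
 * Enable (or test for) the second DAWR/DAWRX watchpoint pair; only
 * available on CPUs with the DAWR1 feature.
 */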
static int kvmhv_enable_dawr1(struct kvm *kvm)
{
	if (!cpu_has_feature(CPU_FTR_DAWR1))
		return -ENODEV;

	/* kvm == NULL means the caller is testing if the capability exists */
	if (kvm)
		kvm->arch.dawr1_enabled = true;
	return 0;
}

static bool kvmppc_hash_v3_possible(void)
{
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return false;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return false;

	/*
	 * POWER9 chips before version 2.02 can't have some threads in
	 * HPT mode and some in radix mode on the same core.
	 */
	if (radix_enabled()) {
		unsigned int pvr = mfspr(SPRN_PVR);
		if ((pvr >> 16) == PVR_POWER9 &&
		    (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
		     ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
			return false;
	}

	return true;
}

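/* The ops table handed to the generic Book3S code for HV-mode guests. */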
static struct kvmppc_ops kvm_ops_hv = {
	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
	.get_one_reg = kvmppc_get_one_reg_hv,
	.set_one_reg = kvmppc_set_one_reg_hv,
	.vcpu_load = kvmppc_core_vcpu_load_hv,
	.vcpu_put = kvmppc_core_vcpu_put_hv,
	.inject_interrupt = kvmppc_inject_interrupt_hv,
	.set_msr = kvmppc_set_msr_hv,
	.vcpu_run = kvmppc_vcpu_run_hv,
	.vcpu_create = kvmppc_core_vcpu_create_hv,
	.vcpu_free = kvmppc_core_vcpu_free_hv,
	.check_requests = kvmppc_core_check_requests_hv,
	.get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv,
	.flush_memslot = kvmppc_core_flush_memslot_hv,
	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
	.commit_memory_region = kvmppc_core_commit_memory_region_hv,
	.unmap_gfn_range = kvm_unmap_gfn_range_hv,
	.age_gfn = kvm_age_gfn_hv,
	.test_age_gfn = kvm_test_age_gfn_hv,
	.free_memslot = kvmppc_core_free_memslot_hv,
	.init_vm = kvmppc_core_init_vm_hv,
	.destroy_vm = kvmppc_core_destroy_vm_hv,
	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
	.emulate_op = kvmppc_core_emulate_op_hv,
	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
	.arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
	.hcall_implemented = kvmppc_hcall_impl_hv,
	.configure_mmu = kvmhv_configure_mmu,
	.get_rmmu_info = kvmhv_get_rmmu_info,
	.set_smt_mode = kvmhv_set_smt_mode,
	.enable_nested = kvmhv_enable_nested,
	.load_from_eaddr = kvmhv_load_from_eaddr,
	.store_to_eaddr = kvmhv_store_to_eaddr,
	.enable_svm = kvmhv_enable_svm,
	.svm_off = kvmhv_svm_off,
	.enable_dawr1 = kvmhv_enable_dawr1,
	.hash_v3_possible = kvmppc_hash_v3_possible,
	.create_vcpu_debugfs = kvmppc_arch_create_vcpu_debugfs_hv,
	.create_vm_debugfs = kvmppc_arch_create_vm_debugfs_hv,
};

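/*
 * Allocate one sibling_subcore_state per core and point the paca of every
 * thread in that core at it, giving the threads a shared structure to
 * coordinate through when the core is split into subcores.
 */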
static int kvm_init_subcore_bitmap(void)
{
	int i, j;
	int nr_cores = cpu_nr_cores();
	struct sibling_subcore_state *sibling_subcore_state;

	for (i = 0; i < nr_cores; i++) {
		int first_cpu = i * threads_per_core;
		int node = cpu_to_node(first_cpu);

		/* Ignore if it is already allocated. */
		if (paca_ptrs[first_cpu]->sibling_subcore_state)
			continue;

		sibling_subcore_state =
			kzalloc_node(sizeof(struct sibling_subcore_state),
				     GFP_KERNEL, node);
		if (!sibling_subcore_state)
			return -ENOMEM;

		for (j = 0; j < threads_per_core; j++) {
			int cpu = first_cpu + j;

			paca_ptrs[cpu]->sibling_subcore_state =
						sibling_subcore_state;
		}
	}
	return 0;
}

static int kvmppc_radix_possible(void)
{
	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
}

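/*
 * Module init: check that the CPU can host HV guests, set up nested and
 * radix support, the subcore state (pre-POWER9), XICS access, the default
 * hcall list and, where possible, IRQ bypass, then register kvm_ops_hv
 * with the generic PPC KVM code.
 */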
static int kvmppc_book3s_init_hv(void)
{
	int r;

	if (!tlbie_capable) {
		pr_err("KVM-HV: Host does not support TLBIE\n");
		return -ENODEV;
	}

	/*
	 * FIXME!! Do we need to check on all cpus ?
	 */
	r = kvmppc_core_check_processor_compat_hv();
	if (r < 0)
		return -ENODEV;

	r = kvmhv_nested_init();
	if (r)
		return r;

	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
		r = kvm_init_subcore_bitmap();
		if (r)
			goto err;
	}

	/*
	 * We need a way of accessing the XICS interrupt controller,
	 * either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or
	 * indirectly, via OPAL.
	 */
#ifdef CONFIG_SMP
	if (!xics_on_xive() && !kvmhv_on_pseries() &&
	    !local_paca->kvm_hstate.xics_phys) {
		struct device_node *np;

		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
		if (!np) {
			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
			r = -ENODEV;
			goto err;
		}
		/* presence of intc confirmed - node can be dropped again */
		of_node_put(np);
	}
#endif

	init_default_hcalls();

	init_vcore_lists();

	r = kvmppc_mmu_hv_init();
	if (r)
		goto err;

	if (kvmppc_radix_possible()) {
		r = kvmppc_radix_init();
		if (r)
			goto err;
	}

	r = kvmppc_uvmem_init();
	if (r < 0) {
		pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
		return r;
	}

#if defined(CONFIG_KVM_XICS)
	/*
	 * IRQ bypass is supported only for interrupts whose EOI operations are
	 * handled via OPAL calls. Therefore, register IRQ bypass handlers
	 * exclusively for PowerNV KVM when booted with 'xive=off', indicating
	 * the use of the emulated XICS interrupt controller.
	 */
	if (!kvmhv_on_pseries()) {
		pr_info("KVM-HV: Enabling IRQ bypass\n");
		kvm_ops_hv.irq_bypass_add_producer =
				kvmppc_irq_bypass_add_producer_hv;
		kvm_ops_hv.irq_bypass_del_producer =
				kvmppc_irq_bypass_del_producer_hv;
	}
#endif

	kvm_ops_hv.owner = THIS_MODULE;
	kvmppc_hv_ops = &kvm_ops_hv;

	return 0;

err:
	kvmhv_nested_exit();
	kvmppc_radix_exit();

	return r;
}

static void kvmppc_book3s_exit_hv(void)
{
	kvmppc_uvmem_free();
	kvmppc_free_host_rm_ops();
	if (kvmppc_radix_possible())
		kvmppc_radix_exit();
	kvmppc_hv_ops = NULL;
	kvmhv_nested_exit();
}

module_init(kvmppc_book3s_init_hv);
module_exit(kvmppc_book3s_exit_hv);
MODULE_DESCRIPTION("KVM on Book3S (POWER8 and later) in hypervisor mode");
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");