// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Authors:
 *    Paul Mackerras <paulus@au1.ibm.com>
 *    Alexander Graf <agraf@suse.de>
 *    Kevin Wolf <mail@kevin-wolf.de>
 *
 * Description: KVM functions specific to running on Book 3S
 * processors in hypervisor mode (specifically POWER7 and later).
 *
 * This file is derived from arch/powerpc/kvm/book3s.c,
 * by Alexander Graf <agraf@suse.de>.
 */

#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/sched/signal.h>
#include <linux/sched/stat.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <linux/srcu.h>
#include <linux/miscdevice.h>
#include <linux/debugfs.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/of.h>

#include <asm/ftrace.h>
#include <asm/reg.h>
#include <asm/ppc-opcode.h>
#include <asm/asm-prototypes.h>
#include <asm/archrandom.h>
#include <asm/debug.h>
#include <asm/disassemble.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <linux/uaccess.h>
#include <asm/interrupt.h>
#include <asm/io.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
#include <asm/pmc.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
#include <asm/switch_to.h>
#include <asm/smp.h>
#include <asm/dbell.h>
#include <asm/hmi.h>
#include <asm/pnv-pci.h>
#include <asm/mmu.h>
#include <asm/opal.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/hw_breakpoint.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/ultravisor.h>
#include <asm/dtl.h>
#include <asm/plpar_wrappers.h>

#include "book3s.h"
#include "book3s_hv.h"

#define CREATE_TRACE_POINTS
#include "trace_hv.h"

/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */

/* Used to indicate that a guest page fault needs to be handled */
#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
/* Used to indicate that a guest passthrough interrupt needs to be handled */
#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)

/* Used as a "null" value for timebase values */
#define TB_NIL	(~(u64)0)

static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);

static int dynamic_mt_modes = 6;
module_param(dynamic_mt_modes, int, 0644);
MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
static int target_smt_mode;
module_param(target_smt_mode, int, 0644);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");

static bool one_vm_per_core;
module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");

#ifdef CONFIG_KVM_XICS
static const struct kernel_param_ops module_param_ops = {
	.set = param_set_int,
	.get = param_get_int,
};

module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, 0644);
MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");

module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif

/* If set, guests are allowed to create and control nested guests */
static bool nested = true;
module_param(nested, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");

static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);

/*
 * RWMR values for POWER8.  These control the rate at which PURR
 * and SPURR count and should be set according to the number of
 * online threads in the vcore being run.
 */
#define RWMR_RPA_P8_1THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_2THREAD	0x7FFF2908450D8DA9UL
#define RWMR_RPA_P8_3THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_4THREAD	0x199A421245058DA9UL
#define RWMR_RPA_P8_5THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_6THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_7THREAD	0x164520C62609AECAUL
#define RWMR_RPA_P8_8THREAD	0x164520C62609AECAUL

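/*
 * Indexed by the number of threads online in the vcore; entry 0 is
 * not normally used (it just duplicates the single-thread value).
 */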
static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = {
	RWMR_RPA_P8_1THREAD,
	RWMR_RPA_P8_1THREAD,
	RWMR_RPA_P8_2THREAD,
	RWMR_RPA_P8_3THREAD,
	RWMR_RPA_P8_4THREAD,
	RWMR_RPA_P8_5THREAD,
	RWMR_RPA_P8_6THREAD,
	RWMR_RPA_P8_7THREAD,
	RWMR_RPA_P8_8THREAD,
};

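/*
 * Find the next runnable vcpu in vc->runnable_threads after index *ip,
 * updating *ip to its position; returns NULL when there are no more.
 */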
static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
		int *ip)
{
	int i = *ip;
	struct kvm_vcpu *vcpu;

	while (++i < MAX_SMT_THREADS) {
		vcpu = READ_ONCE(vc->runnable_threads[i]);
		if (vcpu) {
			*ip = i;
			return vcpu;
		}
	}
	return NULL;
}

/* Used to traverse the list of runnable threads for a given vcore */
#define for_each_runnable_thread(i, vcpu, vc) \
	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )

static bool kvmppc_ipi_thread(int cpu)
{
	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);

	/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
	if (kvmhv_on_pseries())
		return false;

	/* On POWER9 we can use msgsnd to IPI any cpu */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		msg |= get_hard_smp_processor_id(cpu);
		smp_mb();
		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
		return true;
	}

	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
		preempt_disable();
		if (cpu_first_thread_sibling(cpu) ==
		    cpu_first_thread_sibling(smp_processor_id())) {
			msg |= cpu_thread_in_core(cpu);
			smp_mb();
			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
			preempt_enable();
			return true;
		}
		preempt_enable();
	}

#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
	if (cpu >= 0 && cpu < nr_cpu_ids) {
		if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
			xics_wake_cpu(cpu);
			return true;
		}
		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
		return true;
	}
#endif

	return false;
}

static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
{
	int cpu;
	struct rcuwait *waitp;

	waitp = kvm_arch_vcpu_get_wait(vcpu);
	if (rcuwait_wake_up(waitp))
		++vcpu->stat.generic.halt_wakeup;

	cpu = READ_ONCE(vcpu->arch.thread_cpu);
	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
		return;

	/* CPU points to the first thread of the core */
	cpu = vcpu->cpu;
	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
		smp_send_reschedule(cpu);
}

/*
 * We use the vcpu_load/put functions to measure stolen time.
 * Stolen time is counted as time when either the vcpu is able to
 * run as part of a virtual core, but the task running the vcore
 * is preempted or sleeping, or when the vcpu needs something done
 * in the kernel by the task running the vcpu, but that task is
 * preempted or sleeping.  Those two things have to be counted
 * separately, since one of the vcpu tasks will take on the job
 * of running the core, and the other vcpu tasks in the vcore will
 * sleep waiting for it to do that, but that sleep shouldn't count
 * as stolen time.
 *
 * Hence we accumulate stolen time when the vcpu can run as part of
 * a vcore using vc->stolen_tb, and the stolen time when the vcpu
 * needs its task to do other things in the kernel (for example,
 * service a page fault) in busy_stolen.  We don't accumulate
 * stolen time for a vcore when it is inactive, or for a vcpu
 * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
 * a misnomer; it means that the vcpu task is not executing in
 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
 * the kernel.  We don't have any way of dividing up that time
 * between time that the vcpu is genuinely stopped, time that
 * the task is actively working on behalf of the vcpu, and time
 * that the task is preempted, so we don't count any of it as
 * stolen.
 *
 * Updates to busy_stolen are protected by arch.tbacct_lock;
 * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
 * lock.  The stolen times are measured in units of timebase ticks.
 * (Note that the != TB_NIL checks below are purely defensive;
 * they should never fail.)
 */

static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb)
{
	unsigned long flags;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	vc->preempt_tb = tb;
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
}

static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb)
{
	unsigned long flags;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	if (vc->preempt_tb != TB_NIL) {
		vc->stolen_tb += tb - vc->preempt_tb;
		vc->preempt_tb = TB_NIL;
	}
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
}

static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	unsigned long flags;
	u64 now;

	if (cpu_has_feature(CPU_FTR_ARCH_300))
		return;

	now = mftb();

	/*
	 * We can test vc->runner without taking the vcore lock,
	 * because only this task ever sets vc->runner to this
	 * vcpu, and once it is set to this vcpu, only this task
	 * ever sets it to NULL.
	 */
	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
		kvmppc_core_end_stolen(vc, now);

	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
	    vcpu->arch.busy_preempt != TB_NIL) {
		vcpu->arch.busy_stolen += now - vcpu->arch.busy_preempt;
		vcpu->arch.busy_preempt = TB_NIL;
	}
	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
}

static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	unsigned long flags;
	u64 now;

	if (cpu_has_feature(CPU_FTR_ARCH_300))
		return;

	now = mftb();

	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
		kvmppc_core_start_stolen(vc, now);

	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
		vcpu->arch.busy_preempt = now;
	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
}

static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
{
	vcpu->arch.pvr = pvr;
}

/* Dummy value used in computing PCR value below */
#define PCR_ARCH_31    (PCR_ARCH_300 << 1)

static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
{
	unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	/* We can (emulate) our own architecture version and anything older */
	if (cpu_has_feature(CPU_FTR_ARCH_31))
		host_pcr_bit = PCR_ARCH_31;
	else if (cpu_has_feature(CPU_FTR_ARCH_300))
		host_pcr_bit = PCR_ARCH_300;
	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
		host_pcr_bit = PCR_ARCH_207;
	else if (cpu_has_feature(CPU_FTR_ARCH_206))
		host_pcr_bit = PCR_ARCH_206;
	else
		host_pcr_bit = PCR_ARCH_205;

	/* Determine lowest PCR bit needed to run guest in given PVR level */
	guest_pcr_bit = host_pcr_bit;
	if (arch_compat) {
		switch (arch_compat) {
		case PVR_ARCH_205:
			guest_pcr_bit = PCR_ARCH_205;
			break;
		case PVR_ARCH_206:
		case PVR_ARCH_206p:
			guest_pcr_bit = PCR_ARCH_206;
			break;
		case PVR_ARCH_207:
			guest_pcr_bit = PCR_ARCH_207;
			break;
		case PVR_ARCH_300:
			guest_pcr_bit = PCR_ARCH_300;
			break;
		case PVR_ARCH_31:
			guest_pcr_bit = PCR_ARCH_31;
			break;
		default:
			return -EINVAL;
		}
	}

	/* Check requested PCR bits don't exceed our capabilities */
	if (guest_pcr_bit > host_pcr_bit)
		return -EINVAL;

	spin_lock(&vc->lock);
	vc->arch_compat = arch_compat;
	/*
	 * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
	 * Also set all reserved PCR bits
	 */
	vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
	spin_unlock(&vc->lock);

	return 0;
}

static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
{
	int r;

	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
	       vcpu->arch.regs.nip, vcpu->arch.shregs.msr, vcpu->arch.trap);
	for (r = 0; r < 16; ++r)
		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
		       r, kvmppc_get_gpr(vcpu, r),
		       r+16, kvmppc_get_gpr(vcpu, r+16));
	pr_err("ctr = %.16lx  lr  = %.16lx\n",
	       vcpu->arch.regs.ctr, vcpu->arch.regs.link);
	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
	pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
	       vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
	pr_err("fault dar = %.16lx dsisr = %.8x\n",
	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
	for (r = 0; r < vcpu->arch.slb_max; ++r)
		pr_err("  ESID = %.16llx VSID = %.16llx\n",
		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
	       vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
	       vcpu->arch.last_inst);
}

static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
{
	return kvm_get_vcpu_by_id(kvm, id);
}

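/*
 * Initialize a newly registered VPA: flag shared-processor mode and
 * start the yield count at 1.
 */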
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
{
	vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
	vpa->yield_count = cpu_to_be32(1);
}

static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
		   unsigned long addr, unsigned long len)
{
	/* check address is cacheline aligned */
	if (addr & (L1_CACHE_BYTES - 1))
		return -EINVAL;
	spin_lock(&vcpu->arch.vpa_update_lock);
	if (v->next_gpa != addr || v->len != len) {
		v->next_gpa = addr;
		v->len = addr ? len : 0;
		v->update_pending = 1;
	}
	spin_unlock(&vcpu->arch.vpa_update_lock);
	return 0;
}

/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
struct reg_vpa {
	u32 dummy;
	union {
		__be16 hword;
		__be32 word;
	} length;
};

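/*
 * A VPA is considered registered if an update to a non-zero address is
 * pending, or if it is currently pinned.
 */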
static int vpa_is_registered(struct kvmppc_vpa *vpap)
{
	if (vpap->update_pending)
		return vpap->next_gpa != 0;
	return vpap->pinned_addr != NULL;
}

static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
				       unsigned long flags,
				       unsigned long vcpuid, unsigned long vpa)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long len, nb;
	void *va;
	struct kvm_vcpu *tvcpu;
	int err;
	int subfunc;
	struct kvmppc_vpa *vpap;

	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
	if (!tvcpu)
		return H_PARAMETER;

	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
	if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
	    subfunc == H_VPA_REG_SLB) {
		/* Registering new area - address must be cache-line aligned */
		if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
			return H_PARAMETER;

		/* convert logical addr to kernel addr and read length */
		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
		if (va == NULL)
			return H_PARAMETER;
		if (subfunc == H_VPA_REG_VPA)
			len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
		else
			len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
		kvmppc_unpin_guest_page(kvm, va, vpa, false);

		/* Check length */
		if (len > nb || len < sizeof(struct reg_vpa))
			return H_PARAMETER;
	} else {
		vpa = 0;
		len = 0;
	}

	err = H_PARAMETER;
	vpap = NULL;
	spin_lock(&tvcpu->arch.vpa_update_lock);

	switch (subfunc) {
	case H_VPA_REG_VPA:		/* register VPA */
		/*
		 * The size of our lppaca is 1kB because of the way we align
		 * it for the guest to avoid crossing a 4kB boundary. We only
		 * use 640 bytes of the structure though, so we should accept
		 * clients that set a size of 640.
		 */
		BUILD_BUG_ON(sizeof(struct lppaca) != 640);
		if (len < sizeof(struct lppaca))
			break;
		vpap = &tvcpu->arch.vpa;
		err = 0;
		break;

	case H_VPA_REG_DTL:		/* register DTL */
		if (len < sizeof(struct dtl_entry))
			break;
		len -= len % sizeof(struct dtl_entry);

		/* Check that they have previously registered a VPA */
		err = H_RESOURCE;
		if (!vpa_is_registered(&tvcpu->arch.vpa))
			break;

		vpap = &tvcpu->arch.dtl;
		err = 0;
		break;

	case H_VPA_REG_SLB:		/* register SLB shadow buffer */
		/* Check that they have previously registered a VPA */
		err = H_RESOURCE;
		if (!vpa_is_registered(&tvcpu->arch.vpa))
			break;

		vpap = &tvcpu->arch.slb_shadow;
		err = 0;
		break;

	case H_VPA_DEREG_VPA:		/* deregister VPA */
		/* Check they don't still have a DTL or SLB buf registered */
		err = H_RESOURCE;
		if (vpa_is_registered(&tvcpu->arch.dtl) ||
		    vpa_is_registered(&tvcpu->arch.slb_shadow))
			break;

		vpap = &tvcpu->arch.vpa;
		err = 0;
		break;

	case H_VPA_DEREG_DTL:		/* deregister DTL */
		vpap = &tvcpu->arch.dtl;
		err = 0;
		break;

	case H_VPA_DEREG_SLB:		/* deregister SLB shadow buffer */
		vpap = &tvcpu->arch.slb_shadow;
		err = 0;
		break;
	}

	if (vpap) {
		vpap->next_gpa = vpa;
		vpap->len = len;
		vpap->update_pending = 1;
	}

	spin_unlock(&tvcpu->arch.vpa_update_lock);

	return err;
}

static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
{
	struct kvm *kvm = vcpu->kvm;
	void *va;
	unsigned long nb;
	unsigned long gpa;

	/*
	 * We need to pin the page pointed to by vpap->next_gpa,
	 * but we can't call kvmppc_pin_guest_page under the lock
	 * as it does get_user_pages() and down_read().  So we
	 * have to drop the lock, pin the page, then get the lock
	 * again and check that a new area didn't get registered
	 * in the meantime.
	 */
	for (;;) {
		gpa = vpap->next_gpa;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		va = NULL;
		nb = 0;
		if (gpa)
			va = kvmppc_pin_guest_page(kvm, gpa, &nb);
		spin_lock(&vcpu->arch.vpa_update_lock);
		if (gpa == vpap->next_gpa)
			break;
		/* sigh... unpin that one and try again */
		if (va)
			kvmppc_unpin_guest_page(kvm, va, gpa, false);
	}

	vpap->update_pending = 0;
	if (va && nb < vpap->len) {
		/*
		 * If it's now too short, it must be that userspace
		 * has changed the mappings underlying guest memory,
		 * so unregister the region.
		 */
		kvmppc_unpin_guest_page(kvm, va, gpa, false);
		va = NULL;
	}
	if (vpap->pinned_addr)
		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
					vpap->dirty);
	vpap->gpa = gpa;
	vpap->pinned_addr = va;
	vpap->dirty = false;
	if (va)
		vpap->pinned_end = va + vpap->len;
}

static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->arch.vpa.update_pending ||
	      vcpu->arch.slb_shadow.update_pending ||
	      vcpu->arch.dtl.update_pending))
		return;

	spin_lock(&vcpu->arch.vpa_update_lock);
	if (vcpu->arch.vpa.update_pending) {
		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
		if (vcpu->arch.vpa.pinned_addr)
			init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
	}
	if (vcpu->arch.dtl.update_pending) {
		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
		vcpu->arch.dtl_index = 0;
	}
	if (vcpu->arch.slb_shadow.update_pending)
		kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
	spin_unlock(&vcpu->arch.vpa_update_lock);
}

/*
 * Return the accumulated stolen time for the vcore up until `now'.
 * The caller should hold the vcore lock.
 */
static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
{
	u64 p;
	unsigned long flags;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	spin_lock_irqsave(&vc->stoltb_lock, flags);
	p = vc->stolen_tb;
	if (vc->vcore_state != VCORE_INACTIVE &&
	    vc->preempt_tb != TB_NIL)
		p += now - vc->preempt_tb;
	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
	return p;
}

static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
					unsigned int pcpu, u64 now,
					unsigned long stolen)
{
	struct dtl_entry *dt;
	struct lppaca *vpa;

	dt = vcpu->arch.dtl_ptr;
	vpa = vcpu->arch.vpa.pinned_addr;

	if (!dt || !vpa)
		return;

	dt->dispatch_reason = 7;
	dt->preempt_reason = 0;
	dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid);
	dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
	dt->ready_to_enqueue_time = 0;
	dt->waiting_to_ready_time = 0;
	dt->timebase = cpu_to_be64(now);
	dt->fault_addr = 0;
	dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
	dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);

	++dt;
	if (dt == vcpu->arch.dtl.pinned_end)
		dt = vcpu->arch.dtl.pinned_addr;
	vcpu->arch.dtl_ptr = dt;
	/* order writing *dt vs. writing vpa->dtl_idx */
	smp_wmb();
	vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
	vcpu->arch.dtl.dirty = true;
}

static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
				    struct kvmppc_vcore *vc)
{
	unsigned long stolen;
	unsigned long core_stolen;
	u64 now;
	unsigned long flags;

	now = mftb();

	core_stolen = vcore_stolen_time(vc, now);
	stolen = core_stolen - vcpu->arch.stolen_logged;
	vcpu->arch.stolen_logged = core_stolen;
	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
	stolen += vcpu->arch.busy_stolen;
	vcpu->arch.busy_stolen = 0;
	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);

	__kvmppc_create_dtl_entry(vcpu, vc->pcpu, now + vc->tb_offset, stolen);
}

/* See if there is a doorbell interrupt pending for a vcpu */
static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
{
	int thr;
	struct kvmppc_vcore *vc;

	if (vcpu->arch.doorbell_request)
		return true;
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		return false;
	/*
	 * Ensure that the read of vcore->dpdes comes after the read
	 * of vcpu->doorbell_request.  This barrier matches the
	 * smp_wmb() in kvmppc_guest_entry_inject().
	 */
	smp_rmb();
	vc = vcpu->arch.vcore;
	thr = vcpu->vcpu_id - vc->first_vcpuid;
	return !!(vc->dpdes & (1 << thr));
}

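/*
 * The vcpu may use POWER8 (ISA v2.07) facilities if its compat mode is
 * v2.07 or later, or if no compat mode is set and the host supports v2.07.
 */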
static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
		return true;
	if ((!vcpu->arch.vcore->arch_compat) &&
	    cpu_has_feature(CPU_FTR_ARCH_207S))
		return true;
	return false;
}

static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
			     unsigned long resource, unsigned long value1,
			     unsigned long value2)
{
	switch (resource) {
	case H_SET_MODE_RESOURCE_SET_CIABR:
		if (!kvmppc_power8_compatible(vcpu))
			return H_P2;
		if (value2)
			return H_P4;
		if (mflags)
			return H_UNSUPPORTED_FLAG_START;
		/* Guests can't breakpoint the hypervisor */
		if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
			return H_P3;
		vcpu->arch.ciabr  = value1;
		return H_SUCCESS;
	case H_SET_MODE_RESOURCE_SET_DAWR0:
		if (!kvmppc_power8_compatible(vcpu))
			return H_P2;
		if (!ppc_breakpoint_available())
			return H_P2;
		if (mflags)
			return H_UNSUPPORTED_FLAG_START;
		if (value2 & DABRX_HYP)
			return H_P4;
		vcpu->arch.dawr0  = value1;
		vcpu->arch.dawrx0 = value2;
		return H_SUCCESS;
	case H_SET_MODE_RESOURCE_SET_DAWR1:
		if (!kvmppc_power8_compatible(vcpu))
			return H_P2;
		if (!ppc_breakpoint_available())
			return H_P2;
		if (!cpu_has_feature(CPU_FTR_DAWR1))
			return H_P2;
		if (!vcpu->kvm->arch.dawr1_enabled)
			return H_FUNCTION;
		if (mflags)
			return H_UNSUPPORTED_FLAG_START;
		if (value2 & DABRX_HYP)
			return H_P4;
		vcpu->arch.dawr1  = value1;
		vcpu->arch.dawrx1 = value2;
		return H_SUCCESS;
	case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
		/*
		 * KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
		 * Keep this in synch with kvmppc_filter_guest_lpcr_hv.
		 */
		if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
				kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
			return H_UNSUPPORTED_FLAG_START;
		return H_TOO_HARD;
	default:
		return H_TOO_HARD;
	}
}

/* Copy guest memory in place - must reside within a single memslot */
static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
				  unsigned long len)
{
	struct kvm_memory_slot *to_memslot = NULL;
	struct kvm_memory_slot *from_memslot = NULL;
	unsigned long to_addr, from_addr;
	int r;

	/* Get HPA for from address */
	from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
	if (!from_memslot)
		return -EFAULT;
	if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
			     << PAGE_SHIFT))
		return -EINVAL;
	from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
	if (kvm_is_error_hva(from_addr))
		return -EFAULT;
	from_addr |= (from & (PAGE_SIZE - 1));

	/* Get HPA for to address */
	to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
	if (!to_memslot)
		return -EFAULT;
	if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
			   << PAGE_SHIFT))
		return -EINVAL;
	to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
	if (kvm_is_error_hva(to_addr))
		return -EFAULT;
	to_addr |= (to & (PAGE_SIZE - 1));

	/* Perform copy */
	r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
			     len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, to >> PAGE_SHIFT);
	return 0;
}

static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
			       unsigned long dest, unsigned long src)
{
	u64 pg_sz = SZ_4K;		/* 4K page size */
	u64 pg_mask = SZ_4K - 1;
	int ret;

	/* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
	if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
		      H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
		return H_PARAMETER;

	/* dest (and src if copy_page flag set) must be page aligned */
	if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
		return H_PARAMETER;

	/* zero and/or copy the page as determined by the flags */
	if (flags & H_COPY_PAGE) {
		ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
		if (ret < 0)
			return H_PARAMETER;
	} else if (flags & H_ZERO_PAGE) {
		ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
		if (ret < 0)
			return H_PARAMETER;
	}

	/* We can ignore the remaining flags */

	return H_SUCCESS;
}

static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
{
	struct kvmppc_vcore *vcore = target->arch.vcore;

	/*
	 * We expect to have been called by the real mode handler
	 * (kvmppc_rm_h_confer()) which would have directly returned
	 * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
	 * have useful work to do and should not confer) so we don't
	 * recheck that here.
	 *
	 * In the case of the P9 single vcpu per vcore case, the real
	 * mode handler is not called but no other threads are in the
	 * source vcore.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
		spin_lock(&vcore->lock);
		if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
		    vcore->vcore_state != VCORE_INACTIVE &&
		    vcore->runner)
			target = vcore->runner;
		spin_unlock(&vcore->lock);
	}

	return kvm_vcpu_yield_to(target);
}

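/*
 * Read the current yield count from the vcpu's pinned VPA; returns 0 if
 * no VPA is registered.
 */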
static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
{
	int yield_count = 0;
	struct lppaca *lppaca;

	spin_lock(&vcpu->arch.vpa_update_lock);
	lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
	if (lppaca)
		yield_count = be32_to_cpu(lppaca->yield_count);
	spin_unlock(&vcpu->arch.vpa_update_lock);
	return yield_count;
}

/*
 * H_RPT_INVALIDATE hcall handler for nested guests.
 *
 * Handles only nested process-scoped invalidation requests in L0.
 */
static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
{
	unsigned long type = kvmppc_get_gpr(vcpu, 6);
	unsigned long pid, pg_sizes, start, end;

	/*
	 * The partition-scoped invalidations aren't handled here in L0.
	 */
	if (type & H_RPTI_TYPE_NESTED)
		return RESUME_HOST;

	pid = kvmppc_get_gpr(vcpu, 4);
	pg_sizes = kvmppc_get_gpr(vcpu, 7);
	start = kvmppc_get_gpr(vcpu, 8);
	end = kvmppc_get_gpr(vcpu, 9);

	do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
				type, pg_sizes, start, end);

	kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
	return RESUME_GUEST;
}

static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
				    unsigned long id, unsigned long target,
				    unsigned long type, unsigned long pg_sizes,
				    unsigned long start, unsigned long end)
{
	if (!kvm_is_radix(vcpu->kvm))
		return H_UNSUPPORTED;

	if (end < start)
		return H_P5;

	/*
	 * Partition-scoped invalidation for nested guests.
	 */
	if (type & H_RPTI_TYPE_NESTED) {
		if (!nesting_enabled(vcpu->kvm))
			return H_FUNCTION;

		/* Support only cores as target */
		if (target != H_RPTI_TARGET_CMMU)
			return H_P2;

		return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
					       start, end);
	}

	/*
	 * Process-scoped invalidation for L1 guests.
	 */
	do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
				type, pg_sizes, start, end);
	return H_SUCCESS;
}

int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long req = kvmppc_get_gpr(vcpu, 3);
	unsigned long target, ret = H_SUCCESS;
	int yield_count;
	struct kvm_vcpu *tvcpu;
	int idx, rc;

	if (req <= MAX_HCALL_OPCODE &&
	    !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
		return RESUME_HOST;

	switch (req) {
	case H_REMOVE:
		ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_ENTER:
		ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6),
					kvmppc_get_gpr(vcpu, 7));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_READ:
		ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_CLEAR_MOD:
		ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_CLEAR_REF:
		ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_PROTECT:
		ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_BULK_REMOVE:
		ret = kvmppc_h_bulk_remove(vcpu);
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;

	case H_CEDE:
		break;
	case H_PROD:
		target = kvmppc_get_gpr(vcpu, 4);
		tvcpu = kvmppc_find_vcpu(kvm, target);
		if (!tvcpu) {
			ret = H_PARAMETER;
			break;
		}
		tvcpu->arch.prodded = 1;
		smp_mb();
		if (tvcpu->arch.ceded)
			kvmppc_fast_vcpu_kick_hv(tvcpu);
		break;
	case H_CONFER:
		target = kvmppc_get_gpr(vcpu, 4);
		if (target == -1)
			break;
		tvcpu = kvmppc_find_vcpu(kvm, target);
		if (!tvcpu) {
			ret = H_PARAMETER;
			break;
		}
		yield_count = kvmppc_get_gpr(vcpu, 5);
		if (kvmppc_get_yield_count(tvcpu) != yield_count)
			break;
		kvm_arch_vcpu_yield_to(tvcpu);
		break;
	case H_REGISTER_VPA:
		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
		break;
	case H_RTAS:
		if (list_empty(&kvm->arch.rtas_tokens))
			return RESUME_HOST;

		idx = srcu_read_lock(&kvm->srcu);
		rc = kvmppc_rtas_hcall(vcpu);
		srcu_read_unlock(&kvm->srcu, idx);

		if (rc == -ENOENT)
			return RESUME_HOST;
		else if (rc == 0)
			break;

		/* Send the error out to userspace via KVM_RUN */
		return rc;
	case H_LOGICAL_CI_LOAD:
		ret = kvmppc_h_logical_ci_load(vcpu);
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_LOGICAL_CI_STORE:
		ret = kvmppc_h_logical_ci_store(vcpu);
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_SET_MODE:
		ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6),
					kvmppc_get_gpr(vcpu, 7));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_XIRR:
	case H_CPPR:
	case H_EOI:
	case H_IPI:
	case H_IPOLL:
	case H_XIRR_X:
		if (kvmppc_xics_enabled(vcpu)) {
			if (xics_on_xive()) {
				ret = H_NOT_AVAILABLE;
				return RESUME_GUEST;
			}
			ret = kvmppc_xics_hcall(vcpu, req);
			break;
		}
		return RESUME_HOST;
	case H_SET_DABR:
		ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
		break;
	case H_SET_XDABR:
		ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
						kvmppc_get_gpr(vcpu, 5));
		break;
#ifdef CONFIG_SPAPR_TCE_IOMMU
	case H_GET_TCE:
		ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
						kvmppc_get_gpr(vcpu, 5));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_PUT_TCE:
		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
						kvmppc_get_gpr(vcpu, 5),
						kvmppc_get_gpr(vcpu, 6));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_PUT_TCE_INDIRECT:
		ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
						kvmppc_get_gpr(vcpu, 5),
						kvmppc_get_gpr(vcpu, 6),
						kvmppc_get_gpr(vcpu, 7));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
	case H_STUFF_TCE:
		ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
						kvmppc_get_gpr(vcpu, 5),
						kvmppc_get_gpr(vcpu, 6),
						kvmppc_get_gpr(vcpu, 7));
		if (ret == H_TOO_HARD)
			return RESUME_HOST;
		break;
#endif
	case H_RANDOM:
		if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4]))
			ret = H_HARDWARE;
		break;
	case H_RPT_INVALIDATE:
		ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
					      kvmppc_get_gpr(vcpu, 5),
					      kvmppc_get_gpr(vcpu, 6),
					      kvmppc_get_gpr(vcpu, 7),
					      kvmppc_get_gpr(vcpu, 8),
					      kvmppc_get_gpr(vcpu, 9));
		break;

	case H_SET_PARTITION_TABLE:
		ret = H_FUNCTION;
		if (nesting_enabled(kvm))
			ret = kvmhv_set_partition_table(vcpu);
		break;
	case H_ENTER_NESTED:
		ret = H_FUNCTION;
		if (!nesting_enabled(kvm))
			break;
		ret = kvmhv_enter_nested_guest(vcpu);
		if (ret == H_INTERRUPT) {
			kvmppc_set_gpr(vcpu, 3, 0);
			vcpu->arch.hcall_needed = 0;
			return -EINTR;
		} else if (ret == H_TOO_HARD) {
			kvmppc_set_gpr(vcpu, 3, 0);
			vcpu->arch.hcall_needed = 0;
			return RESUME_HOST;
		}
		break;
	case H_TLB_INVALIDATE:
		ret = H_FUNCTION;
		if (nesting_enabled(kvm))
			ret = kvmhv_do_nested_tlbie(vcpu);
		break;
	case H_COPY_TOFROM_GUEST:
		ret = H_FUNCTION;
		if (nesting_enabled(kvm))
			ret = kvmhv_copy_tofrom_guest_nested(vcpu);
		break;
	case H_PAGE_INIT:
		ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
					 kvmppc_get_gpr(vcpu, 5),
					 kvmppc_get_gpr(vcpu, 6));
		break;
	case H_SVM_PAGE_IN:
		ret = H_UNSUPPORTED;
		if (kvmppc_get_srr1(vcpu) & MSR_S)
			ret = kvmppc_h_svm_page_in(kvm,
						   kvmppc_get_gpr(vcpu, 4),
						   kvmppc_get_gpr(vcpu, 5),
						   kvmppc_get_gpr(vcpu, 6));
		break;
	case H_SVM_PAGE_OUT:
		ret = H_UNSUPPORTED;
		if (kvmppc_get_srr1(vcpu) & MSR_S)
			ret = kvmppc_h_svm_page_out(kvm,
						    kvmppc_get_gpr(vcpu, 4),
						    kvmppc_get_gpr(vcpu, 5),
						    kvmppc_get_gpr(vcpu, 6));
		break;
	case H_SVM_INIT_START:
		ret = H_UNSUPPORTED;
		if (kvmppc_get_srr1(vcpu) & MSR_S)
			ret = kvmppc_h_svm_init_start(kvm);
		break;
	case H_SVM_INIT_DONE:
		ret = H_UNSUPPORTED;
		if (kvmppc_get_srr1(vcpu) & MSR_S)
			ret = kvmppc_h_svm_init_done(kvm);
		break;
	case H_SVM_INIT_ABORT:
		/*
		 * Even if that call is made by the Ultravisor, the SRR1 value
		 * is the guest context one, with the secure bit clear as it has
		 * not yet been secured. So we can't check it here.
		 * Instead the kvm->arch.secure_guest flag is checked inside
		 * kvmppc_h_svm_init_abort().
		 */
		ret = kvmppc_h_svm_init_abort(kvm);
		break;

	default:
		return RESUME_HOST;
	}
	WARN_ON_ONCE(ret == H_TOO_HARD);
	kvmppc_set_gpr(vcpu, 3, ret);
	vcpu->arch.hcall_needed = 0;
	return RESUME_GUEST;
}

/*
 * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
 * handlers in book3s_hv_rmhandlers.S.
 *
 * This has to be done early, not in kvmppc_pseries_do_hcall(), so
 * that the cede logic in kvmppc_run_single_vcpu() works properly.
 */
static void kvmppc_cede(struct kvm_vcpu *vcpu)
{
	vcpu->arch.shregs.msr |= MSR_EE;
	vcpu->arch.ceded = 1;
	smp_mb();
	if (vcpu->arch.prodded) {
		vcpu->arch.prodded = 0;
		smp_mb();
		vcpu->arch.ceded = 0;
	}
}

static int kvmppc_hcall_impl_hv(unsigned long cmd)
{
	switch (cmd) {
	case H_CEDE:
	case H_PROD:
	case H_CONFER:
	case H_REGISTER_VPA:
	case H_SET_MODE:
	case H_LOGICAL_CI_LOAD:
	case H_LOGICAL_CI_STORE:
#ifdef CONFIG_KVM_XICS
	case H_XIRR:
	case H_CPPR:
	case H_EOI:
	case H_IPI:
	case H_IPOLL:
	case H_XIRR_X:
#endif
	case H_PAGE_INIT:
	case H_RPT_INVALIDATE:
		return 1;
	}

	/* See if it's in the real-mode table */
	return kvmppc_hcall_impl_hv_realmode(cmd);
}

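/*
 * Called for an emulation-assist interrupt when guest debug is enabled:
 * a software breakpoint instruction becomes a KVM_EXIT_DEBUG exit to
 * userspace, anything else gets a program check queued to the guest.
 */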
static int kvmppc_emulate_debug_inst(struct kvm_vcpu *vcpu)
{
	u32 last_inst;

	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
					EMULATE_DONE) {
		/*
		 * Fetch failed, so return to guest and
		 * try executing it again.
		 */
		return RESUME_GUEST;
	}

	if (last_inst == KVMPPC_INST_SW_BREAKPOINT) {
		vcpu->run->exit_reason = KVM_EXIT_DEBUG;
		vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
		return RESUME_HOST;
	} else {
		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
		return RESUME_GUEST;
	}
}

static void do_nothing(void *x)
{
}

static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
{
	int thr, cpu, pcpu, nthreads;
	struct kvm_vcpu *v;
	unsigned long dpdes;

	nthreads = vcpu->kvm->arch.emul_smt_mode;
	dpdes = 0;
	cpu = vcpu->vcpu_id & ~(nthreads - 1);
	for (thr = 0; thr < nthreads; ++thr, ++cpu) {
		v = kvmppc_find_vcpu(vcpu->kvm, cpu);
		if (!v)
			continue;
		/*
		 * If the vcpu is currently running on a physical cpu thread,
		 * interrupt it in order to pull it out of the guest briefly,
		 * which will update its vcore->dpdes value.
		 */
		pcpu = READ_ONCE(v->cpu);
		if (pcpu >= 0)
			smp_call_function_single(pcpu, do_nothing, NULL, 1);
		if (kvmppc_doorbell_pending(v))
			dpdes |= 1 << thr;
	}
	return dpdes;
}

/*
 * On POWER9, emulate doorbell-related instructions in order to
 * give the guest the illusion of running on a multi-threaded core.
 * The instructions emulated are msgsndp, msgclrp, mfspr TIR,
 * and mfspr DPDES.
 */
static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
{
	u32 inst, rb, thr;
	unsigned long arg;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_vcpu *tvcpu;

	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
		return RESUME_GUEST;
	if (get_op(inst) != 31)
		return EMULATE_FAIL;
	rb = get_rb(inst);
	thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
	switch (get_xop(inst)) {
	case OP_31_XOP_MSGSNDP:
		arg = kvmppc_get_gpr(vcpu, rb);
		if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
			break;
		arg &= 0x7f;
		if (arg >= kvm->arch.emul_smt_mode)
			break;
		tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
		if (!tvcpu)
			break;
		if (!tvcpu->arch.doorbell_request) {
			tvcpu->arch.doorbell_request = 1;
			kvmppc_fast_vcpu_kick_hv(tvcpu);
		}
		break;
	case OP_31_XOP_MSGCLRP:
		arg = kvmppc_get_gpr(vcpu, rb);
		if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
			break;
		vcpu->arch.vcore->dpdes = 0;
		vcpu->arch.doorbell_request = 0;
		break;
	case OP_31_XOP_MFSPR:
		switch (get_sprn(inst)) {
		case SPRN_TIR:
			arg = thr;
			break;
		case SPRN_DPDES:
			arg = kvmppc_read_dpdes(vcpu);
			break;
		default:
			return EMULATE_FAIL;
		}
		kvmppc_set_gpr(vcpu, get_rt(inst), arg);
		break;
	default:
		return EMULATE_FAIL;
	}
	kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
	return RESUME_GUEST;
}

/*
 * If the lppaca had pmcregs_in_use clear when we exited the guest, then
 * HFSCR_PM is cleared for next entry. If the guest then tries to access
 * the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM
 * back in the guest HFSCR will cause the next entry to load the PMU SPRs and
 * allow the guest access to continue.
 */
static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->arch.hfscr_permitted & HFSCR_PM))
		return EMULATE_FAIL;

	vcpu->arch.hfscr |= HFSCR_PM;

	return RESUME_GUEST;
}

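/* As kvmppc_pmu_unavailable() above, but for the EBB facility (HFSCR_EBB). */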
static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->arch.hfscr_permitted & HFSCR_EBB))
		return EMULATE_FAIL;

	vcpu->arch.hfscr |= HFSCR_EBB;

	return RESUME_GUEST;
}

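/* As kvmppc_pmu_unavailable() above, but for transactional memory (HFSCR_TM). */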
static int kvmppc_tm_unavailable(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->arch.hfscr_permitted & HFSCR_TM))
		return EMULATE_FAIL;

	vcpu->arch.hfscr |= HFSCR_TM;

	return RESUME_GUEST;
}

static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
				 struct task_struct *tsk)
{
	struct kvm_run *run = vcpu->run;
	int r = RESUME_HOST;

	vcpu->stat.sum_exits++;

	/*
	 * This can happen if an interrupt occurs in the last stages
	 * of guest entry or the first stages of guest exit (i.e. after
	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
	 * That can happen due to a bug, or due to a machine check
	 * occurring at just the wrong time.
	 */
	if (vcpu->arch.shregs.msr & MSR_HV) {
		printk(KERN_EMERG "KVM trap in HV mode!\n");
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			vcpu->arch.shregs.msr);
		kvmppc_dump_regs(vcpu);
		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		run->hw.hardware_exit_reason = vcpu->arch.trap;
		return RESUME_HOST;
	}
	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->ready_for_interrupt_injection = 1;
	switch (vcpu->arch.trap) {
	/* We're good on these - the host merely wanted to get our attention */
	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
		WARN_ON_ONCE(1); /* Should never happen */
		vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
		fallthrough;
	case BOOK3S_INTERRUPT_HV_DECREMENTER:
		vcpu->stat.dec_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_EXTERNAL:
	case BOOK3S_INTERRUPT_H_DOORBELL:
	case BOOK3S_INTERRUPT_H_VIRT:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_GUEST;
		break;
	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
	case BOOK3S_INTERRUPT_HMI:
	case BOOK3S_INTERRUPT_PERFMON:
	case BOOK3S_INTERRUPT_SYSTEM_RESET:
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_MACHINE_CHECK: {
		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);
		/*
		 * Print the MCE event to host console. Ratelimit so the guest
		 * can't flood the host log.
		 */
		if (__ratelimit(&rs))
			machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);

		/*
		 * If the guest can do FWNMI, exit to userspace so it can
		 * deliver a FWNMI to the guest.
		 * Otherwise we synthesize a machine check for the guest
		 * so that it knows that the machine check occurred.
		 */
		if (!vcpu->kvm->arch.fwnmi_enabled) {
			ulong flags = vcpu->arch.shregs.msr & 0x083c0000;
			kvmppc_core_queue_machine_check(vcpu, flags);
			r = RESUME_GUEST;
			break;
		}

		/* Exit to userspace with KVM_EXIT_NMI as exit reason */
		run->exit_reason = KVM_EXIT_NMI;
		run->hw.hardware_exit_reason = vcpu->arch.trap;
		/* Clear out the old NMI status from run->flags */
		run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
		/* Now set the NMI status */
		if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
			run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
		else
			run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;

		r = RESUME_HOST;
		break;
	}
	case BOOK3S_INTERRUPT_PROGRAM:
	{
		ulong flags;
		/*
		 * Normally program interrupts are delivered directly
		 * to the guest by the hardware, but we can get here
		 * as a result of a hypervisor emulation interrupt
		 * (e40) getting turned into a 700 by BML RTAS.
		 */
		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
		kvmppc_core_queue_program(vcpu, flags);
		r = RESUME_GUEST;
		break;
	}
	case BOOK3S_INTERRUPT_SYSCALL:
	{
		int i;

		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
			/*
			 * Guest userspace executed sc 1. This can only be
			 * reached by the P9 path because the old path
			 * handles this case in realmode hcall handlers.
			 */
			if (!kvmhv_vcpu_is_radix(vcpu)) {
				/*
				 * A guest could be running PR KVM, so this
				 * may be a PR KVM hcall. It must be reflected
				 * to the guest kernel as a sc interrupt.
				 */
				kvmppc_core_queue_syscall(vcpu);
			} else {
				/*
				 * Radix guests can not run PR KVM or nested HV
				 * hash guests which might run PR KVM, so this
				 * is always a privilege fault. Send a program
				 * check to guest kernel.
				 */
				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
			}
			r = RESUME_GUEST;
			break;
		}

		/*
		 * hcall - gather args and set exit_reason. This will next be
		 * handled by kvmppc_pseries_do_hcall which may be able to deal
		 * with it and resume guest, or may punt to userspace.
		 */
		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
		for (i = 0; i < 9; ++i)
			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
		run->exit_reason = KVM_EXIT_PAPR_HCALL;
		vcpu->arch.hcall_needed = 1;
		r = RESUME_HOST;
		break;
	}
	/*
	 * We get these next two if the guest accesses a page which it thinks
	 * it has mapped but which is not actually present, either because
	 * it is for an emulated I/O device or because the corresponding
	 * host page has been paged out.
	 *
	 * Any other HDSI/HISI interrupts have been handled already for P7/8
	 * guests. For POWER9 hash guests not using rmhandlers, basic hash
	 * fault handling is done here.
	 */
	case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
		unsigned long vsid;
		long err;

		if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
		    unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) {
			r = RESUME_GUEST; /* Just retry if it's the canary */
			break;
		}

		if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
			/*
			 * Radix doesn't require anything, and pre-ISAv3.0 hash
			 * already attempted to handle this in rmhandlers. The
			 * hash fault handling below is v3 only (it uses ASDR
			 * via fault_gpa).
			 */
			r = RESUME_PAGE_FAULT;
			break;
		}

		if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
			kvmppc_core_queue_data_storage(vcpu,
				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
			r = RESUME_GUEST;
			break;
		}

		if (!(vcpu->arch.shregs.msr & MSR_DR))
			vsid = vcpu->kvm->arch.vrma_slb_v;
		else
			vsid = vcpu->arch.fault_gpa;

		err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
				vsid, vcpu->arch.fault_dsisr, true);
		if (err == 0) {
			r = RESUME_GUEST;
		} else if (err == -1 || err == -2) {
			r = RESUME_PAGE_FAULT;
		} else {
			kvmppc_core_queue_data_storage(vcpu,
				vcpu->arch.fault_dar, err);
			r = RESUME_GUEST;
		}
		break;
	}
	case BOOK3S_INTERRUPT_H_INST_STORAGE: {
		unsigned long vsid;
		long err;

		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
		vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
			DSISR_SRR1_MATCH_64S;
		if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
			/*
			 * Radix doesn't require anything, and pre-ISAv3.0 hash
			 * already attempted to handle this in rmhandlers. The
			 * hash fault handling below is v3 only (it uses ASDR
			 * via fault_gpa).
			 */
			if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
				vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
			r = RESUME_PAGE_FAULT;
			break;
		}

		if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
			kvmppc_core_queue_inst_storage(vcpu,
				vcpu->arch.fault_dsisr);
			r = RESUME_GUEST;
			break;
		}

		if (!(vcpu->arch.shregs.msr & MSR_IR))
			vsid = vcpu->kvm->arch.vrma_slb_v;
		else
			vsid = vcpu->arch.fault_gpa;

		err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
				vsid, vcpu->arch.fault_dsisr, false);
		if (err == 0) {
			r = RESUME_GUEST;
		} else if (err == -1) {
			r = RESUME_PAGE_FAULT;
		} else {
			kvmppc_core_queue_inst_storage(vcpu, err);
			r = RESUME_GUEST;
		}
		break;
	}

	/*
	 * This occurs if the guest executes an illegal instruction.
	 * If the guest debug is disabled, generate a program interrupt
	 * to the guest. If guest debug is enabled, we need to check
	 * whether the instruction is a software breakpoint instruction.
	 * Accordingly return to Guest or Host.
	 */
	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
		if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
			vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
				swab32(vcpu->arch.emul_inst) :
				vcpu->arch.emul_inst;
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
			r = kvmppc_emulate_debug_inst(vcpu);
		} else {
			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
			r = RESUME_GUEST;
		}
		break;

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
		/*
		 * This occurs for various TM-related instructions that
		 * we need to emulate on POWER9 DD2.2.  We have already
		 * handled the cases where the guest was in real-suspend
		 * mode and was transitioning to transactional state.
		 */
		r = kvmhv_p9_tm_emulation(vcpu);
		if (r != -1)
			break;
		fallthrough; /* go to facility unavailable handler */
#endif

	/*
	 * This occurs if the guest (kernel or userspace), does something that
	 * is prohibited by HFSCR.
	 * On POWER9, this could be a doorbell instruction that we need
	 * to emulate.
	 * Otherwise, we just generate a program interrupt to the guest.
	 */
	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
		u64 cause = vcpu->arch.hfscr >> 56;
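		/* The interrupt cause lives in the top byte of HFSCR, hence the shift */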

		r = EMULATE_FAIL;
		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			if (cause == FSCR_MSGP_LG)
				r = kvmppc_emulate_doorbell_instr(vcpu);
			if (cause == FSCR_PM_LG)
				r = kvmppc_pmu_unavailable(vcpu);
			if (cause == FSCR_EBB_LG)
				r = kvmppc_ebb_unavailable(vcpu);
			if (cause == FSCR_TM_LG)
				r = kvmppc_tm_unavailable(vcpu);
		}
		if (r == EMULATE_FAIL) {
			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
			r = RESUME_GUEST;
		}
		break;
	}

	case BOOK3S_INTERRUPT_HV_RM_HARD:
		r = RESUME_PASSTHROUGH;
		break;
	default:
		kvmppc_dump_regs(vcpu);
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			vcpu->arch.shregs.msr);
		run->hw.hardware_exit_reason = vcpu->arch.trap;
		r = RESUME_HOST;
		break;
	}

	return r;
}

static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
{
	struct kvm_nested_guest *nested = vcpu->arch.nested;
	int r;
	int srcu_idx;

	vcpu->stat.sum_exits++;

	/*
	 * This can happen if an interrupt occurs in the last stages
	 * of guest entry or the first stages of guest exit (i.e. after
	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
	 * That can happen due to a bug, or due to a machine check
	 * occurring at just the wrong time.
	 */
	if (vcpu->arch.shregs.msr & MSR_HV) {
		pr_emerg("KVM trap in HV mode while nested!\n");
		pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			 vcpu->arch.trap, kvmppc_get_pc(vcpu),
			 vcpu->arch.shregs.msr);
		kvmppc_dump_regs(vcpu);
		return RESUME_HOST;
	}
	switch (vcpu->arch.trap) {
	/* We're good on these - the host merely wanted to get our attention */
	case BOOK3S_INTERRUPT_HV_DECREMENTER:
		vcpu->stat.dec_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_EXTERNAL:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_HOST;
		break;
	case BOOK3S_INTERRUPT_H_DOORBELL:
	case BOOK3S_INTERRUPT_H_VIRT:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_GUEST;
		break;
	/* These need to go to the nested HV */
	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
		vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
		vcpu->stat.dec_exits++;
		r = RESUME_HOST;
		break;
	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
	case BOOK3S_INTERRUPT_HMI:
	case BOOK3S_INTERRUPT_PERFMON:
	case BOOK3S_INTERRUPT_SYSTEM_RESET:
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_MACHINE_CHECK:
	{
		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);
		/* Pass the machine check to the L1 guest */
		r = RESUME_HOST;
		/* Print the MCE event to host console. */
		if (__ratelimit(&rs))
			machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
		break;
	}
	/*
	 * We get these next two if the guest accesses a page which it thinks
	 * it has mapped but which is not actually present, either because
	 * it is for an emulated I/O device or because the corresponding
	 * host page has been paged out.
	 */
	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = kvmhv_nested_page_fault(vcpu);
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
		break;
	case BOOK3S_INTERRUPT_H_INST_STORAGE:
		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
					 DSISR_SRR1_MATCH_64S;
		if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = kvmhv_nested_page_fault(vcpu);
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
		break;

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
		/*
		 * This occurs for various TM-related instructions that
		 * we need to emulate on POWER9 DD2.2.  We have already
		 * handled the cases where the guest was in real-suspend
		 * mode and was transitioning to transactional state.
		 */
		r = kvmhv_p9_tm_emulation(vcpu);
		if (r != -1)
			break;
		fallthrough; /* go to facility unavailable handler */
#endif

	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
		u64 cause = vcpu->arch.hfscr >> 56;

		/*
		 * Only pass HFU interrupts to the L1 if the facility is
		 * permitted but disabled by the L1's HFSCR, otherwise
		 * the interrupt does not make sense to the L1 so turn
		 * it into a HEAI.
		 */
		if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) ||
					(nested->hfscr & (1UL << cause))) {
			vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST;

			/*
			 * If the fetch failed, return to guest and
			 * try executing it again.
			 */
			r = kvmppc_get_last_inst(vcpu, INST_GENERIC,
						 &vcpu->arch.emul_inst);
			if (r != EMULATE_DONE)
				r = RESUME_GUEST;
			else
				r = RESUME_HOST;
		} else {
			r = RESUME_HOST;
		}

		break;
	}

	case BOOK3S_INTERRUPT_HV_RM_HARD:
		vcpu->arch.trap = 0;
		r = RESUME_GUEST;
		if (!xics_on_xive())
			kvmppc_xics_rm_complete(vcpu, 0);
		break;
	case BOOK3S_INTERRUPT_SYSCALL:
	{
		unsigned long req = kvmppc_get_gpr(vcpu, 3);

		/*
		 * The H_RPT_INVALIDATE hcalls issued by nested
		 * guests for process-scoped invalidations when
		 * GTSE=0 are handled here in L0.
		 */
		if (req == H_RPT_INVALIDATE) {
			r = kvmppc_nested_h_rpt_invalidate(vcpu);
			break;
		}

		r = RESUME_HOST;
		break;
	}
	default:
		r = RESUME_HOST;
		break;
	}

	return r;
}

static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int i;

	memset(sregs, 0, sizeof(struct kvm_sregs));
	sregs->pvr = vcpu->arch.pvr;
	for (i = 0; i < vcpu->arch.slb_max; i++) {
		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
	}

	return 0;
}

static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int i, j;

	/* Only accept the same PVR as the host's, since we can't spoof it */
	if (sregs->pvr != vcpu->arch.pvr)
		return -EINVAL;

	j = 0;
	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
			++j;
		}
	}
	vcpu->arch.slb_max = j;

	return 0;
}

/*
 * Enforce limits on guest LPCR values based on hardware availability,
 * guest configuration, and possibly hypervisor support and security
 * concerns.
 */
unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr)
{
	/* LPCR_TC only applies to HPT guests */
	if (kvm_is_radix(kvm))
		lpcr &= ~LPCR_TC;

	/* On POWER8 and above, userspace can modify AIL */
	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
		lpcr &= ~LPCR_AIL;
	if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
		lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
	/*
	 * On some POWER9s we force AIL off for radix guests to prevent
	 * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
	 * guest, which can result in Q0 translations with LPID=0 PID=PIDR to
	 * be cached, which the host TLB management does not expect.
	 */
	if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
		lpcr &= ~LPCR_AIL;

	/*
	 * On POWER9, allow userspace to enable large decrementer for the
	 * guest, whether or not the host has it enabled.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		lpcr &= ~LPCR_LD;

	return lpcr;
}

static void verify_lpcr(struct kvm *kvm, unsigned long lpcr)
{
	if (lpcr != kvmppc_filter_lpcr_hv(kvm, lpcr)) {
		WARN_ONCE(1, "lpcr 0x%lx differs from filtered 0x%lx\n",
			  lpcr, kvmppc_filter_lpcr_hv(kvm, lpcr));
	}
}

static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
		bool preserve_top32)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	u64 mask;

	spin_lock(&vc->lock);

	/*
	 * Userspace can only modify
	 * DPFD (default prefetch depth), ILE (interrupt little-endian),
	 * TC (translation control), AIL (alternate interrupt location),
	 * LD (large decrementer).
	 * These are subject to restrictions from kvmppc_filter_lpcr_hv().
	 */
	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD;

	/* Broken 32-bit version of LPCR must not clear top bits */
	if (preserve_top32)
		mask &= 0xFFFFFFFF;

	new_lpcr = kvmppc_filter_lpcr_hv(kvm,
			(vc->lpcr & ~mask) | (new_lpcr & mask));

	/*
	 * If ILE (interrupt little-endian) has changed, update the
	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
	 */
	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
		struct kvm_vcpu *vcpu;
		int i;

		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (vcpu->arch.vcore != vc)
				continue;
			if (new_lpcr & LPCR_ILE)
				vcpu->arch.intr_msr |= MSR_LE;
			else
				vcpu->arch.intr_msr &= ~MSR_LE;
		}
	}

	vc->lpcr = new_lpcr;

	spin_unlock(&vc->lock);
}

static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
				 union kvmppc_one_reg *val)
{
	int r = 0;
	long int i;

	switch (id) {
	case KVM_REG_PPC_DEBUG_INST:
		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
		break;
	case KVM_REG_PPC_HIOR:
		*val = get_reg_val(id, 0);
		break;
	case KVM_REG_PPC_DABR:
		*val = get_reg_val(id, vcpu->arch.dabr);
		break;
	case KVM_REG_PPC_DABRX:
		*val = get_reg_val(id, vcpu->arch.dabrx);
		break;
	case KVM_REG_PPC_DSCR:
		*val = get_reg_val(id, vcpu->arch.dscr);
		break;
	case KVM_REG_PPC_PURR:
		*val = get_reg_val(id, vcpu->arch.purr);
		break;
	case KVM_REG_PPC_SPURR:
		*val = get_reg_val(id, vcpu->arch.spurr);
		break;
	case KVM_REG_PPC_AMR:
		*val = get_reg_val(id, vcpu->arch.amr);
		break;
	case KVM_REG_PPC_UAMOR:
		*val = get_reg_val(id, vcpu->arch.uamor);
		break;
	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
		i = id - KVM_REG_PPC_MMCR0;
		*val = get_reg_val(id, vcpu->arch.mmcr[i]);
		break;
	case KVM_REG_PPC_MMCR2:
		*val = get_reg_val(id, vcpu->arch.mmcr[2]);
		break;
	case KVM_REG_PPC_MMCRA:
		*val = get_reg_val(id, vcpu->arch.mmcra);
		break;
	case KVM_REG_PPC_MMCRS:
		*val = get_reg_val(id, vcpu->arch.mmcrs);
		break;
	case KVM_REG_PPC_MMCR3:
		*val = get_reg_val(id, vcpu->arch.mmcr[3]);
		break;
	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
		i = id - KVM_REG_PPC_PMC1;
		*val = get_reg_val(id, vcpu->arch.pmc[i]);
		break;
	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
		i = id - KVM_REG_PPC_SPMC1;
		*val = get_reg_val(id, vcpu->arch.spmc[i]);
		break;
	case KVM_REG_PPC_SIAR:
		*val = get_reg_val(id, vcpu->arch.siar);
		break;
	case KVM_REG_PPC_SDAR:
		*val = get_reg_val(id, vcpu->arch.sdar);
		break;
	case KVM_REG_PPC_SIER:
		*val = get_reg_val(id, vcpu->arch.sier[0]);
		break;
	case KVM_REG_PPC_SIER2:
		*val = get_reg_val(id, vcpu->arch.sier[1]);
		break;
	case KVM_REG_PPC_SIER3:
		*val = get_reg_val(id, vcpu->arch.sier[2]);
		break;
	case KVM_REG_PPC_IAMR:
		*val = get_reg_val(id, vcpu->arch.iamr);
		break;
	case KVM_REG_PPC_PSPB:
		*val = get_reg_val(id, vcpu->arch.pspb);
		break;
	case KVM_REG_PPC_DPDES:
		/*
		 * On POWER9, where we are emulating msgsndp etc.,
		 * we return 1 bit for each vcpu, which can come from
		 * either vcore->dpdes or doorbell_request.
		 * On POWER8, doorbell_request is 0.
		 */
		if (cpu_has_feature(CPU_FTR_ARCH_300))
			*val = get_reg_val(id, vcpu->arch.doorbell_request);
		else
			*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
		break;
	case KVM_REG_PPC_VTB:
		*val = get_reg_val(id, vcpu->arch.vcore->vtb);
		break;
	case KVM_REG_PPC_DAWR:
		*val = get_reg_val(id, vcpu->arch.dawr0);
		break;
	case KVM_REG_PPC_DAWRX:
		*val = get_reg_val(id, vcpu->arch.dawrx0);
		break;
	case KVM_REG_PPC_DAWR1:
		*val = get_reg_val(id, vcpu->arch.dawr1);
		break;
	case KVM_REG_PPC_DAWRX1:
		*val = get_reg_val(id, vcpu->arch.dawrx1);
		break;
	case KVM_REG_PPC_CIABR:
		*val = get_reg_val(id, vcpu->arch.ciabr);
		break;
	case KVM_REG_PPC_CSIGR:
		*val = get_reg_val(id, vcpu->arch.csigr);
		break;
	case KVM_REG_PPC_TACR:
		*val = get_reg_val(id, vcpu->arch.tacr);
		break;
	case KVM_REG_PPC_TCSCR:
		*val = get_reg_val(id, vcpu->arch.tcscr);
		break;
	case KVM_REG_PPC_PID:
		*val = get_reg_val(id, vcpu->arch.pid);
		break;
	case KVM_REG_PPC_ACOP:
		*val = get_reg_val(id, vcpu->arch.acop);
		break;
	case KVM_REG_PPC_WORT:
		*val = get_reg_val(id, vcpu->arch.wort);
		break;
	case KVM_REG_PPC_TIDR:
		*val = get_reg_val(id, vcpu->arch.tid);
		break;
	case KVM_REG_PPC_PSSCR:
		*val = get_reg_val(id, vcpu->arch.psscr);
		break;
	case KVM_REG_PPC_VPA_ADDR:
		spin_lock(&vcpu->arch.vpa_update_lock);
		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_VPA_SLB:
		spin_lock(&vcpu->arch.vpa_update_lock);
		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
		val->vpaval.length = vcpu->arch.slb_shadow.len;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_VPA_DTL:
		spin_lock(&vcpu->arch.vpa_update_lock);
		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
		val->vpaval.length = vcpu->arch.dtl.len;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_TB_OFFSET:
		*val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
		break;
	case KVM_REG_PPC_LPCR:
	case KVM_REG_PPC_LPCR_64:
		*val = get_reg_val(id, vcpu->arch.vcore->lpcr);
		break;
	case KVM_REG_PPC_PPR:
		*val = get_reg_val(id, vcpu->arch.ppr);
		break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case KVM_REG_PPC_TFHAR:
		*val = get_reg_val(id, vcpu->arch.tfhar);
		break;
	case KVM_REG_PPC_TFIAR:
		*val = get_reg_val(id, vcpu->arch.tfiar);
		break;
	case KVM_REG_PPC_TEXASR:
		*val = get_reg_val(id, vcpu->arch.texasr);
		break;
	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
		i = id - KVM_REG_PPC_TM_GPR0;
		*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
		break;
	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
	{
		int j;
		i = id - KVM_REG_PPC_TM_VSR0;
		if (i < 32)
			for (j = 0; j < TS_FPRWIDTH; j++)
				val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
		else {
			if (cpu_has_feature(CPU_FTR_ALTIVEC))
				val->vval = vcpu->arch.vr_tm.vr[i-32];
			else
				r = -ENXIO;
		}
		break;
	}
	case KVM_REG_PPC_TM_CR:
		*val = get_reg_val(id, vcpu->arch.cr_tm);
		break;
	case KVM_REG_PPC_TM_XER:
		*val = get_reg_val(id, vcpu->arch.xer_tm);
		break;
	case KVM_REG_PPC_TM_LR:
		*val = get_reg_val(id, vcpu->arch.lr_tm);
		break;
	case KVM_REG_PPC_TM_CTR:
		*val = get_reg_val(id, vcpu->arch.ctr_tm);
		break;
	case KVM_REG_PPC_TM_FPSCR:
		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
		break;
	case KVM_REG_PPC_TM_AMR:
		*val = get_reg_val(id, vcpu->arch.amr_tm);
		break;
	case KVM_REG_PPC_TM_PPR:
		*val = get_reg_val(id, vcpu->arch.ppr_tm);
		break;
	case KVM_REG_PPC_TM_VRSAVE:
		*val = get_reg_val(id, vcpu->arch.vrsave_tm);
		break;
	case KVM_REG_PPC_TM_VSCR:
		if (cpu_has_feature(CPU_FTR_ALTIVEC))
			*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
		else
			r = -ENXIO;
		break;
	case KVM_REG_PPC_TM_DSCR:
		*val = get_reg_val(id, vcpu->arch.dscr_tm);
		break;
	case KVM_REG_PPC_TM_TAR:
		*val = get_reg_val(id, vcpu->arch.tar_tm);
		break;
#endif
	case KVM_REG_PPC_ARCH_COMPAT:
		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
		break;
	case KVM_REG_PPC_DEC_EXPIRY:
		*val = get_reg_val(id, vcpu->arch.dec_expires);
		break;
	case KVM_REG_PPC_ONLINE:
		*val = get_reg_val(id, vcpu->arch.online);
		break;
	case KVM_REG_PPC_PTCR:
		*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}

static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
				 union kvmppc_one_reg *val)
{
	int r = 0;
	long int i;
	unsigned long addr, len;

	switch (id) {
	case KVM_REG_PPC_HIOR:
		/* Only allow this to be set to zero */
		if (set_reg_val(id, *val))
			r = -EINVAL;
		break;
	case KVM_REG_PPC_DABR:
		vcpu->arch.dabr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DABRX:
		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
		break;
	case KVM_REG_PPC_DSCR:
		vcpu->arch.dscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PURR:
		vcpu->arch.purr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SPURR:
		vcpu->arch.spurr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_AMR:
		vcpu->arch.amr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_UAMOR:
		vcpu->arch.uamor = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
		i = id - KVM_REG_PPC_MMCR0;
		vcpu->arch.mmcr[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_MMCR2:
		vcpu->arch.mmcr[2] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_MMCRA:
		vcpu->arch.mmcra = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_MMCRS:
		vcpu->arch.mmcrs = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_MMCR3:
		vcpu->arch.mmcr[3] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
		i = id - KVM_REG_PPC_PMC1;
		vcpu->arch.pmc[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
		i = id - KVM_REG_PPC_SPMC1;
		vcpu->arch.spmc[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SIAR:
		vcpu->arch.siar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SDAR:
		vcpu->arch.sdar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SIER:
		vcpu->arch.sier[0] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SIER2:
		vcpu->arch.sier[1] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SIER3:
		vcpu->arch.sier[2] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_IAMR:
		vcpu->arch.iamr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PSPB:
		vcpu->arch.pspb = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DPDES:
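		/* See the DPDES comment in kvmppc_get_one_reg_hv() above */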
		if (cpu_has_feature(CPU_FTR_ARCH_300))
			vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
		else
			vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_VTB:
		vcpu->arch.vcore->vtb = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DAWR:
		vcpu->arch.dawr0 = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DAWRX:
		vcpu->arch.dawrx0 = set_reg_val(id, *val) & ~DAWRX_HYP;
		break;
	case KVM_REG_PPC_DAWR1:
		vcpu->arch.dawr1 = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DAWRX1:
		vcpu->arch.dawrx1 = set_reg_val(id, *val) & ~DAWRX_HYP;
		break;
	case KVM_REG_PPC_CIABR:
		vcpu->arch.ciabr = set_reg_val(id, *val);
		/* Don't allow setting breakpoints in hypervisor code */
		if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
			vcpu->arch.ciabr &= ~CIABR_PRIV;	/* disable */
		break;
	case KVM_REG_PPC_CSIGR:
		vcpu->arch.csigr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TACR:
		vcpu->arch.tacr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TCSCR:
		vcpu->arch.tcscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PID:
		vcpu->arch.pid = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_ACOP:
		vcpu->arch.acop = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_WORT:
		vcpu->arch.wort = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TIDR:
		vcpu->arch.tid = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PSSCR:
		vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
		break;
	case KVM_REG_PPC_VPA_ADDR:
		addr = set_reg_val(id, *val);
		r = -EINVAL;
		if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
			      vcpu->arch.dtl.next_gpa))
			break;
		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
		break;
	case KVM_REG_PPC_VPA_SLB:
		addr = val->vpaval.addr;
		len = val->vpaval.length;
		r = -EINVAL;
		if (addr && !vcpu->arch.vpa.next_gpa)
			break;
		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
		break;
	case KVM_REG_PPC_VPA_DTL:
		addr = val->vpaval.addr;
		len = val->vpaval.length;
		r = -EINVAL;
		if (addr && (len < sizeof(struct dtl_entry) ||
			     !vcpu->arch.vpa.next_gpa))
			break;
		len -= len % sizeof(struct dtl_entry);
		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
		break;
	case KVM_REG_PPC_TB_OFFSET:
		/* round up to multiple of 2^24 */
		vcpu->arch.vcore->tb_offset =
			ALIGN(set_reg_val(id, *val), 1UL << 24);
		break;
	case KVM_REG_PPC_LPCR:
		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
		break;
	case KVM_REG_PPC_LPCR_64:
		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
		break;
	case KVM_REG_PPC_PPR:
		vcpu->arch.ppr = set_reg_val(id, *val);
		break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case KVM_REG_PPC_TFHAR:
		vcpu->arch.tfhar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TFIAR:
		vcpu->arch.tfiar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TEXASR:
		vcpu->arch.texasr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
		i = id - KVM_REG_PPC_TM_GPR0;
		vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
	{
		int j;
		i = id - KVM_REG_PPC_TM_VSR0;
		if (i < 32)
			for (j = 0; j < TS_FPRWIDTH; j++)
				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
		else
			if (cpu_has_feature(CPU_FTR_ALTIVEC))
				vcpu->arch.vr_tm.vr[i-32] = val->vval;
			else
				r = -ENXIO;
		break;
	}
	case KVM_REG_PPC_TM_CR:
		vcpu->arch.cr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_XER:
		vcpu->arch.xer_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_LR:
		vcpu->arch.lr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_CTR:
		vcpu->arch.ctr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_FPSCR:
		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_AMR:
		vcpu->arch.amr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_PPR:
		vcpu->arch.ppr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VRSAVE:
		vcpu->arch.vrsave_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VSCR:
		if (cpu_has_feature(CPU_FTR_ALTIVEC))
			vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
		else
			r = -ENXIO;
		break;
	case KVM_REG_PPC_TM_DSCR:
		vcpu->arch.dscr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_TAR:
		vcpu->arch.tar_tm = set_reg_val(id, *val);
		break;
#endif
	case KVM_REG_PPC_ARCH_COMPAT:
		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_DEC_EXPIRY:
		vcpu->arch.dec_expires = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_ONLINE:
		i = set_reg_val(id, *val);
		if (i && !vcpu->arch.online)
			atomic_inc(&vcpu->arch.vcore->online_count);
		else if (!i && vcpu->arch.online)
			atomic_dec(&vcpu->arch.vcore->online_count);
		vcpu->arch.online = i;
		break;
	case KVM_REG_PPC_PTCR:
		vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}

/*
 * On POWER9, threads are independent and can be in different partitions.
 * Therefore we consider each thread to be a subcore.
 * There is a restriction that all threads have to be in the same
 * MMU mode (radix or HPT), unfortunately, but since we only support
 * HPT guests on a HPT host so far, that isn't an impediment yet.
 */
static int threads_per_vcore(struct kvm *kvm)
{
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		return 1;
	return threads_per_subcore;
}

static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
{
	struct kvmppc_vcore *vcore;

	vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);

	if (vcore == NULL)
		return NULL;

	spin_lock_init(&vcore->lock);
	spin_lock_init(&vcore->stoltb_lock);
	rcuwait_init(&vcore->wait);
	vcore->preempt_tb = TB_NIL;
	vcore->lpcr = kvm->arch.lpcr;
	vcore->first_vcpuid = id;
	vcore->kvm = kvm;
	INIT_LIST_HEAD(&vcore->preempt_list);

	return vcore;
}

#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
static struct debugfs_timings_element {
	const char *name;
	size_t offset;
} timings[] = {
	{"rm_entry",	offsetof(struct kvm_vcpu, arch.rm_entry)},
	{"rm_intr",	offsetof(struct kvm_vcpu, arch.rm_intr)},
	{"rm_exit",	offsetof(struct kvm_vcpu, arch.rm_exit)},
	{"guest",	offsetof(struct kvm_vcpu, arch.guest_time)},
	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
};

#define N_TIMINGS	(ARRAY_SIZE(timings))

struct debugfs_timings_state {
	struct kvm_vcpu	*vcpu;
	unsigned int	buflen;
	char		buf[N_TIMINGS * 100];
};

static int debugfs_timings_open(struct inode *inode, struct file *file)
{
	struct kvm_vcpu *vcpu = inode->i_private;
	struct debugfs_timings_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(vcpu->kvm);
	p->vcpu = vcpu;
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_timings_release(struct inode *inode, struct file *file)
{
	struct debugfs_timings_state *p = file->private_data;

	kvm_put_kvm(p->vcpu->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
				    size_t len, loff_t *ppos)
{
	struct debugfs_timings_state *p = file->private_data;
	struct kvm_vcpu *vcpu = p->vcpu;
	char *s, *buf_end;
	struct kvmhv_tb_accumulator tb;
	u64 count;
	loff_t pos;
	ssize_t n;
	int i, loops;
	bool ok;

	if (!p->buflen) {
		s = p->buf;
		buf_end = s + sizeof(p->buf);
		for (i = 0; i < N_TIMINGS; ++i) {
			struct kvmhv_tb_accumulator *acc;

			acc = (struct kvmhv_tb_accumulator *)
				((unsigned long)vcpu + timings[i].offset);
			ok = false;
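			/*
			 * The vcpu updates the accumulator under a seqcount:
			 * an odd value means an update is in progress.  Retry
			 * until we read a stable, even count so the snapshot
			 * in 'tb' is consistent.
			 */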
			for (loops = 0; loops < 1000; ++loops) {
				count = acc->seqcount;
				if (!(count & 1)) {
					smp_rmb();
					tb = *acc;
					smp_rmb();
					if (count == acc->seqcount) {
						ok = true;
						break;
					}
				}
				udelay(1);
			}
			if (!ok)
				snprintf(s, buf_end - s, "%s: stuck\n",
					timings[i].name);
			else
				snprintf(s, buf_end - s,
					"%s: %llu %llu %llu %llu\n",
					timings[i].name, count / 2,
					tb_to_ns(tb.tb_total),
					tb_to_ns(tb.tb_min),
					tb_to_ns(tb.tb_max));
			s += strlen(s);
		}
		p->buflen = s - p->buf;
	}

	pos = *ppos;
	if (pos >= p->buflen)
		return 0;
	if (len > p->buflen - pos)
		len = p->buflen - pos;
	n = copy_to_user(buf, p->buf + pos, len);
	if (n) {
		if (n == len)
			return -EFAULT;
		len -= n;
	}
	*ppos = pos + len;
	return len;
}

static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
				     size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_timings_ops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_timings_open,
	.release = debugfs_timings_release,
	.read	 = debugfs_timings_read,
	.write	 = debugfs_timings_write,
	.llseek	 = generic_file_llseek,
};

/* Create a debugfs directory for the vcpu */
static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
{
	char buf[16];
	struct kvm *kvm = vcpu->kvm;

	snprintf(buf, sizeof(buf), "vcpu%u", id);
	vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
	debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, vcpu,
			    &debugfs_timings_ops);
}

#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
{
}
#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */

static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
{
	int err;
	int core;
	struct kvmppc_vcore *vcore;
	struct kvm *kvm;
	unsigned int id;

	kvm = vcpu->kvm;
	id = vcpu->vcpu_id;

	vcpu->arch.shared = &vcpu->arch.shregs;
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
	/*
	 * The shared struct is never shared on HV,
	 * so we can always use host endianness
	 */
#ifdef __BIG_ENDIAN__
	vcpu->arch.shared_big_endian = true;
#else
	vcpu->arch.shared_big_endian = false;
#endif
#endif
	vcpu->arch.mmcr[0] = MMCR0_FC;
	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
		vcpu->arch.mmcr[0] |= MMCR0_PMCCEXT;
		vcpu->arch.mmcra = MMCRA_BHRB_DISABLE;
	}

	vcpu->arch.ctrl = CTRL_RUNLATCH;
	/* default to host PVR, since we can't spoof it */
	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
	spin_lock_init(&vcpu->arch.vpa_update_lock);
	spin_lock_init(&vcpu->arch.tbacct_lock);
	vcpu->arch.busy_preempt = TB_NIL;
	vcpu->arch.shregs.msr = MSR_ME;
	vcpu->arch.intr_msr = MSR_SF | MSR_ME;

	/*
	 * Set the default HFSCR for the guest from the host value.
	 * This value is only used on POWER9.
	 * On POWER9, we want to virtualize the doorbell facility, so we
	 * don't set the HFSCR_MSGP bit, and that causes those instructions
	 * to trap and then we emulate them.
	 */
	vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
		HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP | HFSCR_PREFIX;
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
		if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
			vcpu->arch.hfscr |= HFSCR_TM;
#endif
	}
	if (cpu_has_feature(CPU_FTR_TM_COMP))
		vcpu->arch.hfscr |= HFSCR_TM;

	vcpu->arch.hfscr_permitted = vcpu->arch.hfscr;
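	/* hfscr_permitted is the ceiling for any later changes to the HFSCR */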

	/*
	 * PM, EBB and TM are demand-faulted, so start with them clear.
	 */
	vcpu->arch.hfscr &= ~(HFSCR_PM | HFSCR_EBB | HFSCR_TM);

	kvmppc_mmu_book3s_hv_init(vcpu);

	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;

	init_waitqueue_head(&vcpu->arch.cpu_run);

	mutex_lock(&kvm->lock);
	vcore = NULL;
	err = -EINVAL;
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) {
			pr_devel("KVM: VCPU ID too high\n");
			core = KVM_MAX_VCORES;
		} else {
			BUG_ON(kvm->arch.smt_mode != 1);
			core = kvmppc_pack_vcpu_id(kvm, id);
		}
	} else {
		core = id / kvm->arch.smt_mode;
	}
	if (core < KVM_MAX_VCORES) {
		vcore = kvm->arch.vcores[core];
		if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) {
			pr_devel("KVM: collision on id %u", id);
			vcore = NULL;
		} else if (!vcore) {
			/*
			 * Take mmu_setup_lock for mutual exclusion
			 * with kvmppc_update_lpcr().
			 */
			err = -ENOMEM;
			vcore = kvmppc_vcore_create(kvm,
					id & ~(kvm->arch.smt_mode - 1));
			mutex_lock(&kvm->arch.mmu_setup_lock);
			kvm->arch.vcores[core] = vcore;
			kvm->arch.online_vcores++;
			mutex_unlock(&kvm->arch.mmu_setup_lock);
		}
	}
	mutex_unlock(&kvm->lock);

	if (!vcore)
		return err;

	spin_lock(&vcore->lock);
	++vcore->num_threads;
	spin_unlock(&vcore->lock);
	vcpu->arch.vcore = vcore;
	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
	vcpu->arch.thread_cpu = -1;
	vcpu->arch.prev_cpu = -1;

	vcpu->arch.cpu_type = KVM_CPU_3S_64;
	kvmppc_sanity_check(vcpu);

	debugfs_vcpu_init(vcpu, id);

	return 0;
}

static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
			      unsigned long flags)
{
	int err;
	int esmt = 0;

	if (flags)
		return -EINVAL;
	if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
		return -EINVAL;
	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
		/*
		 * On POWER8 (or POWER7), the threading mode is "strict",
		 * so we pack smt_mode vcpus per vcore.
		 */
		if (smt_mode > threads_per_subcore)
			return -EINVAL;
	} else {
		/*
		 * On POWER9, the threading mode is "loose",
		 * so each vcpu gets its own vcore.
		 */
		esmt = smt_mode;
		smt_mode = 1;
	}
	mutex_lock(&kvm->lock);
	err = -EBUSY;
	if (!kvm->arch.online_vcores) {
		kvm->arch.smt_mode = smt_mode;
		kvm->arch.emul_smt_mode = esmt;
		err = 0;
	}
	mutex_unlock(&kvm->lock);

	return err;
}

static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
{
	if (vpa->pinned_addr)
		kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
					vpa->dirty);
}

static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
{
	spin_lock(&vcpu->arch.vpa_update_lock);
	unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
	spin_unlock(&vcpu->arch.vpa_update_lock);
}

static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
{
	/* Indicate we want to get back into the guest */
	return 1;
}

static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
{
	unsigned long dec_nsec, now;

	now = get_tb();
	if (now > kvmppc_dec_expires_host_tb(vcpu)) {
		/* decrementer has already gone negative */
		kvmppc_core_queue_dec(vcpu);
		kvmppc_core_prepare_to_enter(vcpu);
		return;
	}
	dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
	hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
	vcpu->arch.timer_running = 1;
}

extern int __kvmppc_vcore_entry(void);

static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
				   struct kvm_vcpu *vcpu, u64 tb)
{
	u64 now;

	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
		return;
	spin_lock_irq(&vcpu->arch.tbacct_lock);
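	/*
	 * Fold the stolen time accumulated while this vcpu was runnable into
	 * its busy_stolen count, and mark it as busy in the host from now on.
	 */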
	now = tb;
	vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
		vcpu->arch.stolen_logged;
	vcpu->arch.busy_preempt = now;
	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
	spin_unlock_irq(&vcpu->arch.tbacct_lock);
	--vc->n_runnable;
	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
}

static int kvmppc_grab_hwthread(int cpu)
{
	struct paca_struct *tpaca;
	long timeout = 10000;

	tpaca = paca_ptrs[cpu];

	/* Ensure the thread won't go into the kernel if it wakes */
	tpaca->kvm_hstate.kvm_vcpu = NULL;
	tpaca->kvm_hstate.kvm_vcore = NULL;
	tpaca->kvm_hstate.napping = 0;
	smp_wmb();
	tpaca->kvm_hstate.hwthread_req = 1;

	/*
	 * If the thread is already executing in the kernel (e.g. handling
	 * a stray interrupt), wait for it to get back to nap mode.
	 * The smp_mb() is to ensure that our setting of hwthread_req
	 * is visible before we look at hwthread_state, so if this
	 * races with the code at system_reset_pSeries and the thread
	 * misses our setting of hwthread_req, we are sure to see its
	 * setting of hwthread_state, and vice versa.
	 */
	smp_mb();
	while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
		if (--timeout <= 0) {
			pr_err("KVM: couldn't grab cpu %d\n", cpu);
			return -EBUSY;
		}
		udelay(1);
	}
	return 0;
}

static void kvmppc_release_hwthread(int cpu)
{
	struct paca_struct *tpaca;

	tpaca = paca_ptrs[cpu];
	tpaca->kvm_hstate.hwthread_req = 0;
	tpaca->kvm_hstate.kvm_vcpu = NULL;
	tpaca->kvm_hstate.kvm_vcore = NULL;
	tpaca->kvm_hstate.kvm_split_mode = NULL;
}

static DEFINE_PER_CPU(struct kvm *, cpu_in_guest);

static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
{
	struct kvm_nested_guest *nested = vcpu->arch.nested;
	cpumask_t *need_tlb_flush;
	int i;

	if (nested)
		need_tlb_flush = &nested->need_tlb_flush;
	else
		need_tlb_flush = &kvm->arch.need_tlb_flush;

	cpu = cpu_first_tlb_thread_sibling(cpu);
	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
					i += cpu_tlb_thread_sibling_step())
		cpumask_set_cpu(i, need_tlb_flush);

	/*
	 * Make sure setting of bit in need_tlb_flush precedes testing of
	 * cpu_in_guest. The matching barrier on the other side is hwsync
	 * when switching to guest MMU mode, which happens between
	 * cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
	 * being tested.
	 */
	smp_mb();

	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
					i += cpu_tlb_thread_sibling_step()) {
		struct kvm *running = *per_cpu_ptr(&cpu_in_guest, i);

		if (running == kvm)
			smp_call_function_single(i, do_nothing, NULL, 1);
	}
}

static void do_migrate_away_vcpu(void *arg)
{
	struct kvm_vcpu *vcpu = arg;
	struct kvm *kvm = vcpu->kvm;

	/*
	 * If the guest has GTSE, it may execute tlbie, so do a eieio; tlbsync;
	 * ptesync sequence on the old CPU before migrating to a new one, in
	 * case we interrupted the guest between a tlbie ; eieio ;
	 * tlbsync; ptesync sequence.
	 *
	 * Otherwise, ptesync is sufficient for ordering tlbiel sequences.
	 */
	if (kvm->arch.lpcr & LPCR_GTSE)
		asm volatile("eieio; tlbsync; ptesync");
	else
		asm volatile("ptesync");
}

static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
{
	struct kvm_nested_guest *nested = vcpu->arch.nested;
	struct kvm *kvm = vcpu->kvm;
	int prev_cpu;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return;

	if (nested)
		prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
	else
		prev_cpu = vcpu->arch.prev_cpu;

	/*
	 * With radix, the guest can do TLB invalidations itself,
	 * and it could choose to use the local form (tlbiel) if
	 * it is invalidating a translation that has only ever been
	 * used on one vcpu.  However, that doesn't mean it has
	 * only ever been used on one physical cpu, since vcpus
	 * can move around between pcpus.  To cope with this, when
	 * a vcpu moves from one pcpu to another, we need to tell
	 * any vcpus running on the same core as this vcpu previously
	 * ran to flush the TLB.
	 */
	if (prev_cpu != pcpu) {
		if (prev_cpu >= 0) {
			if (cpu_first_tlb_thread_sibling(prev_cpu) !=
			    cpu_first_tlb_thread_sibling(pcpu))
				radix_flush_cpu(kvm, prev_cpu, vcpu);

			smp_call_function_single(prev_cpu,
					do_migrate_away_vcpu, vcpu, 1);
		}
		if (nested)
			nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
		else
			vcpu->arch.prev_cpu = pcpu;
	}
}

static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
{
	int cpu;
	struct paca_struct *tpaca;

	cpu = vc->pcpu;
	if (vcpu) {
		if (vcpu->arch.timer_running) {
			hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
			vcpu->arch.timer_running = 0;
		}
		cpu += vcpu->arch.ptid;
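		/* Each vcpu runs on the hardware thread selected by its ptid */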
		vcpu->cpu = vc->pcpu;
		vcpu->arch.thread_cpu = cpu;
	}
	tpaca = paca_ptrs[cpu];
	tpaca->kvm_hstate.kvm_vcpu = vcpu;
	tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
	tpaca->kvm_hstate.fake_suspend = 0;
	/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
	smp_wmb();
	tpaca->kvm_hstate.kvm_vcore = vc;
	if (cpu != smp_processor_id())
		kvmppc_ipi_thread(cpu);
}

static void kvmppc_wait_for_nap(int n_threads)
{
	int cpu = smp_processor_id();
	int i, loops;

	if (n_threads <= 1)
		return;
	for (loops = 0; loops < 1000000; ++loops) {
		/*
		 * Check if all threads are finished.
		 * We set the vcore pointer when starting a thread
		 * and the thread clears it when finished, so we look
		 * for any threads that still have a non-NULL vcore ptr.
		 */
		for (i = 1; i < n_threads; ++i)
			if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
				break;
		if (i == n_threads) {
			HMT_medium();
			return;
		}
		HMT_low();
	}
	HMT_medium();
	for (i = 1; i < n_threads; ++i)
		if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
}

/*
 * Check that we are on thread 0 and that any other threads in
 * this core are off-line.  Then grab the threads so they can't
 * enter the kernel.
 */
static int on_primary_thread(void)
{
	int cpu = smp_processor_id();
	int thr;

	/* Are we on a primary subcore? */
	if (cpu_thread_in_subcore(cpu))
		return 0;

	thr = 0;
	while (++thr < threads_per_subcore)
		if (cpu_online(cpu + thr))
			return 0;

	/* Grab all hw threads so they can't go into the kernel */
	for (thr = 1; thr < threads_per_subcore; ++thr) {
		if (kvmppc_grab_hwthread(cpu + thr)) {
			/* Couldn't grab one; let the others go */
			do {
				kvmppc_release_hwthread(cpu + thr);
			} while (--thr > 0);
			return 0;
		}
	}
	return 1;
}

/*
 * A list of virtual cores for each physical CPU.
 * These are vcores that could run but their runner VCPU tasks are
 * (or may be) preempted.
 */
struct preempted_vcore_list {
	struct list_head	list;
	spinlock_t		lock;
};

static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);

static void init_vcore_lists(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
		spin_lock_init(&lp->lock);
		INIT_LIST_HEAD(&lp->list);
	}
}

static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
{
	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	vc->vcore_state = VCORE_PREEMPT;
	vc->pcpu = smp_processor_id();
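	/*
	 * A vcore that already fills the whole core cannot be piggybacked
	 * onto another one, so only smaller vcores go on the preempt list.
	 */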
	if (vc->num_threads < threads_per_vcore(vc->kvm)) {
		spin_lock(&lp->lock);
		list_add_tail(&vc->preempt_list, &lp->list);
		spin_unlock(&lp->lock);
	}

	/* Start accumulating stolen time */
	kvmppc_core_start_stolen(vc, mftb());
}

static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
{
	struct preempted_vcore_list *lp;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	kvmppc_core_end_stolen(vc, mftb());
	if (!list_empty(&vc->preempt_list)) {
		lp = &per_cpu(preempted_vcores, vc->pcpu);
		spin_lock(&lp->lock);
		list_del_init(&vc->preempt_list);
		spin_unlock(&lp->lock);
	}
	vc->vcore_state = VCORE_INACTIVE;
}

/*
 * This stores information about the virtual cores currently
 * assigned to a physical core.
 */
struct core_info {
	int		n_subcores;
	int		max_subcore_threads;
	int		total_threads;
	int		subcore_threads[MAX_SUBCORES];
	struct kvmppc_vcore *vc[MAX_SUBCORES];
};

/*
 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
 * respectively in 2-way micro-threading (split-core) mode on POWER8.
 */
static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };

static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
{
	memset(cip, 0, sizeof(*cip));
	cip->n_subcores = 1;
	cip->max_subcore_threads = vc->num_threads;
	cip->total_threads = vc->num_threads;
	cip->subcore_threads[0] = vc->num_threads;
	cip->vc[0] = vc;
}

static bool subcore_config_ok(int n_subcores, int n_threads)
{
	/*
	 * POWER9 "SMT4" cores are permanently in what is effectively a 4-way
	 * split-core mode, with one thread per subcore.
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		return n_subcores <= 4 && n_threads == 1;

	/* On POWER8, can only dynamically split if unsplit to begin with */
	if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
		return false;
	if (n_subcores > MAX_SUBCORES)
		return false;
	if (n_subcores > 1) {
		if (!(dynamic_mt_modes & 2))
			n_subcores = 4;
		if (n_subcores > 2 && !(dynamic_mt_modes & 4))
			return false;
	}

	return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
}

static void init_vcore_to_run(struct kvmppc_vcore *vc)
{
	vc->entry_exit_map = 0;
	vc->in_guest = 0;
	vc->napping_threads = 0;
	vc->conferring_threads = 0;
	vc->tb_offset_applied = 0;
}

static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
{
	int n_threads = vc->num_threads;
	int sub;

	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
		return false;

	/* In one_vm_per_core mode, require all vcores to be from the same vm */
	if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
		return false;

	if (n_threads < cip->max_subcore_threads)
		n_threads = cip->max_subcore_threads;
	if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
		return false;
	cip->max_subcore_threads = n_threads;

	sub = cip->n_subcores;
	++cip->n_subcores;
	cip->total_threads += vc->num_threads;
	cip->subcore_threads[sub] = vc->num_threads;
	cip->vc[sub] = vc;
	init_vcore_to_run(vc);
	list_del_init(&vc->preempt_list);

	return true;
}

/*
 * Work out whether it is possible to piggyback the execution of
 * vcore *pvc onto the execution of the other vcores described in *cip.
 */
static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
			  int target_threads)
{
	if (cip->total_threads + pvc->num_threads > target_threads)
		return false;

	return can_dynamic_split(pvc, cip);
}

static void prepare_threads(struct kvmppc_vcore *vc)
{
	int i;
	struct kvm_vcpu *vcpu;

	for_each_runnable_thread(i, vcpu, vc) {
		if (signal_pending(vcpu->arch.run_task))
			vcpu->arch.ret = -EINTR;
		else if (vcpu->arch.vpa.update_pending ||
			 vcpu->arch.slb_shadow.update_pending ||
			 vcpu->arch.dtl.update_pending)
			vcpu->arch.ret = RESUME_GUEST;
		else
			continue;
		kvmppc_remove_runnable(vc, vcpu, mftb());
		wake_up(&vcpu->arch.cpu_run);
	}
}

static void collect_piggybacks(struct core_info *cip, int target_threads)
{
	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
	struct kvmppc_vcore *pvc, *vcnext;

	spin_lock(&lp->lock);
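	/*
	 * Walk the preempted vcores on this CPU and pull in any that can
	 * share the physical core with the vcore we are about to run.
	 */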
	list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
		if (!spin_trylock(&pvc->lock))
			continue;
		prepare_threads(pvc);
		if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
			list_del_init(&pvc->preempt_list);
			if (pvc->runner == NULL) {
				pvc->vcore_state = VCORE_INACTIVE;
				kvmppc_core_end_stolen(pvc, mftb());
			}
			spin_unlock(&pvc->lock);
			continue;
		}
		if (!can_piggyback(pvc, cip, target_threads)) {
			spin_unlock(&pvc->lock);
			continue;
		}
		kvmppc_core_end_stolen(pvc, mftb());
		pvc->vcore_state = VCORE_PIGGYBACK;
		if (cip->total_threads >= target_threads)
			break;
	}
	spin_unlock(&lp->lock);
}

static bool recheck_signals_and_mmu(struct core_info *cip)
{
	int sub, i;
	struct kvm_vcpu *vcpu;
	struct kvmppc_vcore *vc;

	for (sub = 0; sub < cip->n_subcores; ++sub) {
		vc = cip->vc[sub];
		if (!vc->kvm->arch.mmu_ready)
			return true;
		for_each_runnable_thread(i, vcpu, vc)
			if (signal_pending(vcpu->arch.run_task))
				return true;
	}
	return false;
}

static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
{
	int still_running = 0, i;
	u64 now;
	long ret;
	struct kvm_vcpu *vcpu;

	spin_lock(&vc->lock);
	now = get_tb();
	for_each_runnable_thread(i, vcpu, vc) {
		/*
		 * It's safe to unlock the vcore in the loop here, because
		 * for_each_runnable_thread() is safe against removal of
		 * the vcpu, and the vcore state is VCORE_EXITING here,
		 * so any vcpus becoming runnable will have their arch.trap
		 * set to zero and can't actually run in the guest.
		 */
		spin_unlock(&vc->lock);
		/* cancel pending dec exception if dec is positive */
		if (now < kvmppc_dec_expires_host_tb(vcpu) &&
		    kvmppc_core_pending_dec(vcpu))
			kvmppc_core_dequeue_dec(vcpu);

		trace_kvm_guest_exit(vcpu);

		ret = RESUME_GUEST;
		if (vcpu->arch.trap)
			ret = kvmppc_handle_exit_hv(vcpu,
						    vcpu->arch.run_task);

		vcpu->arch.ret = ret;
		vcpu->arch.trap = 0;

		spin_lock(&vc->lock);
		if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
			if (vcpu->arch.pending_exceptions)
				kvmppc_core_prepare_to_enter(vcpu);
			if (vcpu->arch.ceded)
				kvmppc_set_timer(vcpu);
			else
				++still_running;
		} else {
			kvmppc_remove_runnable(vc, vcpu, mftb());
			wake_up(&vcpu->arch.cpu_run);
		}
	}
	if (!is_master) {
		if (still_running > 0) {
			kvmppc_vcore_preempt(vc);
		} else if (vc->runner) {
			vc->vcore_state = VCORE_PREEMPT;
			kvmppc_core_start_stolen(vc, mftb());
		} else {
			vc->vcore_state = VCORE_INACTIVE;
		}
		if (vc->n_runnable > 0 && vc->runner == NULL) {
			/* make sure there's a candidate runner awake */
			i = -1;
			vcpu = next_runnable_thread(vc, &i);
			wake_up(&vcpu->arch.cpu_run);
		}
	}
	spin_unlock(&vc->lock);
}

/*
 * Clear core from the list of active host cores as we are about to
 * enter the guest. Only do this if it is the primary thread of the
 * core (not if a subcore) that is entering the guest.
 */
static inline int kvmppc_clear_host_core(unsigned int cpu)
{
	int core;

	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
		return 0;
	/*
	 * Memory barrier can be omitted here as we will do a smp_wmb()
	 * later in kvmppc_start_thread and we need ensure that state is
	 * visible to other CPUs only after we enter guest.
	 */
	core = cpu >> threads_shift;
	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
	return 0;
}

/*
 * Advertise this core as an active host core since we exited the guest
 * Only need to do this if it is the primary thread of the core that is
 * exiting.
 */
static inline int kvmppc_set_host_core(unsigned int cpu)
{
	int core;

	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
		return 0;

	/*
	 * Memory barrier can be omitted here because we do a spin_unlock
	 * immediately after this which provides the memory barrier.
	 */
	core = cpu >> threads_shift;
	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
	return 0;
}

static void set_irq_happened(int trap)
{
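	/*
	 * The exit was caused by an interrupt that the host still needs to
	 * handle.  Record it as pending in the PACA so the host replays it
	 * once interrupts are re-enabled; a system reset is replayed now.
	 */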
	switch (trap) {
	case BOOK3S_INTERRUPT_EXTERNAL:
		local_paca->irq_happened |= PACA_IRQ_EE;
		break;
	case BOOK3S_INTERRUPT_H_DOORBELL:
		local_paca->irq_happened |= PACA_IRQ_DBELL;
		break;
	case BOOK3S_INTERRUPT_HMI:
		local_paca->irq_happened |= PACA_IRQ_HMI;
		break;
	case BOOK3S_INTERRUPT_SYSTEM_RESET:
		replay_system_reset();
		break;
	}
}

/*
 * Run a set of guest threads on a physical core.
 * Called with vc->lock held.
 */
static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
{
	struct kvm_vcpu *vcpu;
	int i;
	int srcu_idx;
	struct core_info core_info;
	struct kvmppc_vcore *pvc;
	struct kvm_split_mode split_info, *sip;
	int split, subcore_size, active;
	int sub;
	bool thr0_done;
	unsigned long cmd_bit, stat_bit;
	int pcpu, thr;
	int target_threads;
	int controlled_threads;
	int trap;
	bool is_power8;

	if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
		return;

	/*
	 * Remove from the list any threads that have a signal pending
	 * or need a VPA update done
	 */
	prepare_threads(vc);

	/* if the runner is no longer runnable, let the caller pick a new one */
	if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
		return;

	/*
	 * Initialize *vc.
	 */
	init_vcore_to_run(vc);
	vc->preempt_tb = TB_NIL;

	/*
	 * Number of threads that we will be controlling: the same as
	 * the number of threads per subcore, except on POWER9,
	 * where it's 1 because the threads are (mostly) independent.
	 */
	controlled_threads = threads_per_vcore(vc->kvm);

	/*
	 * Make sure we are running on primary threads, and that secondary
	 * threads are offline.  Also check if the number of threads in this
	 * guest is greater than the current system threads per guest.
	 */
	if ((controlled_threads > 1) &&
	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
		for_each_runnable_thread(i, vcpu, vc) {
			vcpu->arch.ret = -EBUSY;
			kvmppc_remove_runnable(vc, vcpu, mftb());
			wake_up(&vcpu->arch.cpu_run);
		}
		goto out;
	}

	/*
	 * See if we could run any other vcores on the physical core
	 * along with this one.
	 */
	init_core_info(&core_info, vc);
	pcpu = smp_processor_id();
	target_threads = controlled_threads;
	if (target_smt_mode && target_smt_mode < target_threads)
		target_threads = target_smt_mode;
	if (vc->num_threads < target_threads)
		collect_piggybacks(&core_info, target_threads);

	/*
	 * Hard-disable interrupts, and check resched flag and signals.
	 * If we need to reschedule or deliver a signal, clean up
	 * and return without going into the guest(s).
	 * If the mmu_ready flag has been cleared, don't go into the
	 * guest because that means a HPT resize operation is in progress.
	 */
	local_irq_disable();
	hard_irq_disable();
	if (lazy_irq_pending() || need_resched() ||
	    recheck_signals_and_mmu(&core_info)) {
		local_irq_enable();
		vc->vcore_state = VCORE_INACTIVE;
		/* Unlock all except the primary vcore */
		for (sub = 1; sub < core_info.n_subcores; ++sub) {
			pvc = core_info.vc[sub];
			/* Put back on to the preempted vcores list */
			kvmppc_vcore_preempt(pvc);
			spin_unlock(&pvc->lock);
		}
		for (i = 0; i < controlled_threads; ++i)
			kvmppc_release_hwthread(pcpu + i);
		return;
	}

	kvmppc_clear_host_core(pcpu);

	/* Decide on micro-threading (split-core) mode */
	subcore_size = threads_per_subcore;
	cmd_bit = stat_bit = 0;
	split = core_info.n_subcores;
	sip = NULL;
	is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);

	if (split > 1) {
		sip = &split_info;
		memset(&split_info, 0, sizeof(split_info));
		for (sub = 0; sub < core_info.n_subcores; ++sub)
			split_info.vc[sub] = core_info.vc[sub];

		if (is_power8) {
			if (split == 2 && (dynamic_mt_modes & 2)) {
				cmd_bit = HID0_POWER8_1TO2LPAR;
				stat_bit = HID0_POWER8_2LPARMODE;
			} else {
				split = 4;
				cmd_bit = HID0_POWER8_1TO4LPAR;
				stat_bit = HID0_POWER8_4LPARMODE;
			}
			subcore_size = MAX_SMT_THREADS / split;
			split_info.rpr = mfspr(SPRN_RPR);
			split_info.pmmar = mfspr(SPRN_PMMAR);
			split_info.ldbar = mfspr(SPRN_LDBAR);
			split_info.subcore_size = subcore_size;
		} else {
			split_info.subcore_size = 1;
		}

		/* order writes to split_info before kvm_split_mode pointer */
		smp_wmb();
	}

	for (thr = 0; thr < controlled_threads; ++thr) {
		struct paca_struct *paca = paca_ptrs[pcpu + thr];

		paca->kvm_hstate.napping = 0;
		paca->kvm_hstate.kvm_split_mode = sip;
	}

	/* Initiate micro-threading (split-core) on POWER8 if required */
	if (cmd_bit) {
		unsigned long hid0 = mfspr(SPRN_HID0);

		hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
		mb();
		mtspr(SPRN_HID0, hid0);
		isync();
		for (;;) {
			hid0 = mfspr(SPRN_HID0);
			if (hid0 & stat_bit)
				break;
			cpu_relax();
		}
	}

	/*
	 * On POWER8, set RWMR register.
	 * Since it only affects PURR and SPURR, it doesn't affect
	 * the host, so we don't save/restore the host value.
	 */
	if (is_power8) {
		unsigned long rwmr_val = RWMR_RPA_P8_8THREAD;
		int n_online = atomic_read(&vc->online_count);

		/*
		 * Use the 8-thread value if we're doing split-core
		 * or if the vcore's online count looks bogus.
		 */
		if (split == 1 && threads_per_subcore == MAX_SMT_THREADS &&
		    n_online >= 1 && n_online <= MAX_SMT_THREADS)
			rwmr_val = p8_rwmr_values[n_online];
		mtspr(SPRN_RWMR, rwmr_val);
	}

	/* Start all the threads */
	active = 0;
	for (sub = 0; sub < core_info.n_subcores; ++sub) {
		thr = is_power8 ? subcore_thread_map[sub] : sub;
		thr0_done = false;
		active |= 1 << thr;
		pvc = core_info.vc[sub];
		pvc->pcpu = pcpu + thr;
		for_each_runnable_thread(i, vcpu, pvc) {
			kvmppc_start_thread(vcpu, pvc);
			kvmppc_create_dtl_entry(vcpu, pvc);
			trace_kvm_guest_enter(vcpu);
			if (!vcpu->arch.ptid)
				thr0_done = true;
			active |= 1 << (thr + vcpu->arch.ptid);
		}
		/*
		 * We need to start the first thread of each subcore
		 * even if it doesn't have a vcpu.
		 */
		if (!thr0_done)
			kvmppc_start_thread(NULL, pvc);
	}

	/*
	 * Ensure that split_info.do_nap is set after setting
	 * the vcore pointer in the PACA of the secondaries.
	 */
	smp_mb();

	/*
	 * When doing micro-threading, poke the inactive threads as well.
	 * This gets them to the nap instruction after kvm_do_nap,
	 * which reduces the time taken to unsplit later.
	 */
	if (cmd_bit) {
		split_info.do_nap = 1;	/* ask secondaries to nap when done */
		for (thr = 1; thr < threads_per_subcore; ++thr)
			if (!(active & (1 << thr)))
				kvmppc_ipi_thread(pcpu + thr);
	}

	vc->vcore_state = VCORE_RUNNING;
	preempt_disable();

	trace_kvmppc_run_core(vc, 0);

	for (sub = 0; sub < core_info.n_subcores; ++sub)
		spin_unlock(&core_info.vc[sub]->lock);

	guest_enter_irqoff();

	srcu_idx = srcu_read_lock(&vc->kvm->srcu);

	this_cpu_disable_ftrace();

	/*
	 * Interrupts will be enabled once we get into the guest,
	 * so tell lockdep that we're about to enable interrupts.
	 */
	trace_hardirqs_on();

	trap = __kvmppc_vcore_entry();

	trace_hardirqs_off();

	this_cpu_enable_ftrace();

	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);

	set_irq_happened(trap);

	spin_lock(&vc->lock);
	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
	vc->vcore_state = VCORE_EXITING;

	/* wait for secondary threads to finish writing their state to memory */
	kvmppc_wait_for_nap(controlled_threads);

	/* Return to whole-core mode if we split the core earlier */
	if (cmd_bit) {
		unsigned long hid0 = mfspr(SPRN_HID0);
		unsigned long loops = 0;

		hid0 &= ~HID0_POWER8_DYNLPARDIS;
		stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
		mb();
		mtspr(SPRN_HID0, hid0);
		isync();
		for (;;) {
			hid0 = mfspr(SPRN_HID0);
			if (!(hid0 & stat_bit))
				break;
			cpu_relax();
			++loops;
		}
		split_info.do_nap = 0;
	}

	kvmppc_set_host_core(pcpu);

	context_tracking_guest_exit();
	if (!vtime_accounting_enabled_this_cpu()) {
		local_irq_enable();
		/*
		 * Service IRQs here before vtime_account_guest_exit() so any
		 * ticks that occurred while running the guest are accounted to
		 * the guest. If vtime accounting is enabled, accounting uses
		 * TB rather than ticks, so it can be done without enabling
		 * interrupts here, which has the problem that it accounts
		 * interrupt processing overhead to the host.
		 */
		local_irq_disable();
	}
	vtime_account_guest_exit();

	local_irq_enable();

	/* Let secondaries go back to the offline loop */
	for (i = 0; i < controlled_threads; ++i) {
		kvmppc_release_hwthread(pcpu + i);
		if (sip && sip->napped[i])
			kvmppc_ipi_thread(pcpu + i);
	}

	spin_unlock(&vc->lock);

	/* make sure updates to secondary vcpu structs are visible now */
	smp_mb();

	preempt_enable();

	for (sub = 0; sub < core_info.n_subcores; ++sub) {
		pvc = core_info.vc[sub];
		post_guest_process(pvc, pvc == vc);
	}

	spin_lock(&vc->lock);

 out:
	vc->vcore_state = VCORE_INACTIVE;
	trace_kvmppc_run_core(vc, 1);
}

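/*
 * Hypercalls belonging to the XICS interrupt controller; on the P9 entry
 * path these must be handled before the vcpu's XIVE context is pulled.
 */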
static inline bool hcall_is_xics(unsigned long req)
{
	return req == H_EOI || req == H_CPPR || req == H_IPI ||
		req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
}

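/*
 * Bump the yield (dispatch) count in the guest's VPA and mark the VPA
 * dirty.  Called on both guest entry and guest exit.
 */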
static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu)
{
	struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
	if (lp) {
		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
		lp->yield_count = cpu_to_be32(yield_count);
		vcpu->arch.vpa.dirty = 1;
	}
}

/* call our hypervisor to load up HV regs and go */
static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	unsigned long host_psscr;
	unsigned long msr;
	struct hv_guest_state hvregs;
	struct p9_host_os_sprs host_os_sprs;
	s64 dec;
	int trap;

	msr = mfmsr();

	save_p9_host_os_sprs(&host_os_sprs);

	/*
	 * We need to save and restore the guest visible part of the
	 * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
	 * doesn't do this for us. Note only required if pseries since
	 * this is done in kvmhv_vcpu_entry_p9() below otherwise.
	 */
	host_psscr = mfspr(SPRN_PSSCR_PR);

	kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
	if (lazy_irq_pending())
		return 0;

	if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
		msr = mfmsr(); /* TM restore can update msr */

	if (vcpu->arch.psscr != host_psscr)
		mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);

	kvmhv_save_hv_regs(vcpu, &hvregs);
	hvregs.lpcr = lpcr;
	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
	hvregs.version = HV_GUEST_STATE_VERSION;
	if (vcpu->arch.nested) {
		hvregs.lpid = vcpu->arch.nested->shadow_lpid;
		hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
	} else {
		hvregs.lpid = vcpu->kvm->arch.lpid;
		hvregs.vcpu_token = vcpu->vcpu_id;
	}
	hvregs.hdec_expiry = time_limit;

	/*
	 * When setting DEC, we must always deal with irq_work_raise
	 * via NMI vs setting DEC. The problem occurs right as we
	 * switch into guest mode if an NMI hits and sets pending work
	 * and sets DEC, then that will apply to the guest and not
	 * bring us back to the host.
	 *
	 * irq_work_raise could check a flag (or possibly LPCR[HDICE]
	 * for example) and set HDEC to 1? That wouldn't solve the
	 * nested hv case which needs to abort the hcall or zero the
	 * time limit.
	 *
	 * XXX: Another day's problem.
	 */
	mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb);

	mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
	mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
	switch_pmu_to_guest(vcpu, &host_os_sprs);
	trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
				  __pa(&vcpu->arch.regs));
	kvmhv_restore_hv_return_state(vcpu, &hvregs);
	switch_pmu_to_host(vcpu, &host_os_sprs);
	vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
	vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
	vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
	vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);

	store_vcpu_state(vcpu);

	dec = mfspr(SPRN_DEC);
	if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
		dec = (s32) dec;
	*tb = mftb();
	vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset);

	timer_rearm_host_dec(*tb);

	restore_p9_host_os_sprs(vcpu, &host_os_sprs);
	if (vcpu->arch.psscr != host_psscr)
		mtspr(SPRN_PSSCR_PR, host_psscr);

	return trap;
}

/*
 * Guest entry for POWER9 and later CPUs.
 */
static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
			 unsigned long lpcr, u64 *tb)
{
	u64 next_timer;
	int trap;

	next_timer = timer_get_next_tb();
	if (*tb >= next_timer)
		return BOOK3S_INTERRUPT_HV_DECREMENTER;
	if (next_timer < time_limit)
		time_limit = next_timer;
	else if (*tb >= time_limit) /* nested time limit */
		return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;

	vcpu->arch.ceded = 0;

	kvmppc_subcore_enter_guest();

	vcpu_vpa_increment_dispatch(vcpu);

	if (kvmhv_on_pseries()) {
		trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);

		/* H_CEDE has to be handled now, not later */
		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
			kvmppc_cede(vcpu);
			kvmppc_set_gpr(vcpu, 3, 0);
			trap = 0;
		}

	} else {
		struct kvm *kvm = vcpu->kvm;

		kvmppc_xive_push_vcpu(vcpu);

		__this_cpu_write(cpu_in_guest, kvm);
		trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
		__this_cpu_write(cpu_in_guest, NULL);

		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
		    !(vcpu->arch.shregs.msr & MSR_PR)) {
			unsigned long req = kvmppc_get_gpr(vcpu, 3);

			/* H_CEDE has to be handled now, not later */
			if (req == H_CEDE) {
				kvmppc_cede(vcpu);
				kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */
				kvmppc_set_gpr(vcpu, 3, 0);
				trap = 0;

			/* XICS hcalls must be handled before xive is pulled */
			} else if (hcall_is_xics(req)) {
				int ret;

				ret = kvmppc_xive_xics_hcall(vcpu, req);
				if (ret != H_TOO_HARD) {
					kvmppc_set_gpr(vcpu, 3, ret);
					trap = 0;
				}
			}
		}
		kvmppc_xive_pull_vcpu(vcpu);

		if (kvm_is_radix(kvm))
			vcpu->arch.slb_max = 0;
	}

	vcpu_vpa_increment_dispatch(vcpu);

	kvmppc_subcore_exit_guest();

	return trap;
}

/*
 * Wait for some other vcpu thread to execute us, and
 * wake us up when we need to handle something in the host.
 */
static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
				 struct kvm_vcpu *vcpu, int wait_state)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
		spin_unlock(&vc->lock);
		schedule();
		spin_lock(&vc->lock);
	}
	finish_wait(&vcpu->arch.cpu_run, &wait);
}

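/*
 * Grow or shrink the vcore's halt-polling interval according to the
 * halt_poll_ns_grow / halt_poll_ns_shrink tunables.
 */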
static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
{
	if (!halt_poll_ns_grow)
		return;

	vc->halt_poll_ns *= halt_poll_ns_grow;
	if (vc->halt_poll_ns < halt_poll_ns_grow_start)
		vc->halt_poll_ns = halt_poll_ns_grow_start;
}

static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
{
	if (halt_poll_ns_shrink == 0)
		vc->halt_poll_ns = 0;
	else
		vc->halt_poll_ns /= halt_poll_ns_shrink;
}

#ifdef CONFIG_KVM_XICS
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
{
	if (!xics_on_xive())
		return false;
	return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
		vcpu->arch.xive_saved_state.cppr;
}
#else
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
{
	return false;
}
#endif /* CONFIG_KVM_XICS */

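/*
 * A vcpu counts as woken if it has any reason to run: a pending
 * exception, a prod, a pending doorbell or a pending XIVE interrupt.
 */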
static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
	    kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
		return true;

	return false;
}

static bool kvmppc_vcpu_check_block(struct kvm_vcpu *vcpu)
{
	if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
		return true;
	return false;
}

/*
 * Check to see if any of the runnable vcpus on the vcore have pending
 * exceptions or are no longer ceded
 */
static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
{
	struct kvm_vcpu *vcpu;
	int i;

	for_each_runnable_thread(i, vcpu, vc) {
		if (kvmppc_vcpu_check_block(vcpu))
			return 1;
	}

	return 0;
}

/*
 * All the vcpus in this vcore are idle, so wait for a decrementer
 * or external interrupt to one of the vcpus.  vc->lock is held.
 */
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
	ktime_t cur, start_poll, start_wait;
	int do_sleep = 1;
	u64 block_ns;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	/* Poll for pending exceptions and ceded state */
	cur = start_poll = ktime_get();
	if (vc->halt_poll_ns) {
		ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
		++vc->runner->stat.generic.halt_attempted_poll;

		vc->vcore_state = VCORE_POLLING;
		spin_unlock(&vc->lock);

		do {
			if (kvmppc_vcore_check_block(vc)) {
				do_sleep = 0;
				break;
			}
			cur = ktime_get();
		} while (kvm_vcpu_can_poll(cur, stop));

		spin_lock(&vc->lock);
		vc->vcore_state = VCORE_INACTIVE;

		if (!do_sleep) {
			++vc->runner->stat.generic.halt_successful_poll;
			goto out;
		}
	}

	prepare_to_rcuwait(&vc->wait);
	set_current_state(TASK_INTERRUPTIBLE);
	if (kvmppc_vcore_check_block(vc)) {
		finish_rcuwait(&vc->wait);
		do_sleep = 0;
		/* If we polled, count this as a successful poll */
		if (vc->halt_poll_ns)
			++vc->runner->stat.generic.halt_successful_poll;
		goto out;
	}

	start_wait = ktime_get();

	vc->vcore_state = VCORE_SLEEPING;
	trace_kvmppc_vcore_blocked(vc, 0);
	spin_unlock(&vc->lock);
	schedule();
	finish_rcuwait(&vc->wait);
	spin_lock(&vc->lock);
	vc->vcore_state = VCORE_INACTIVE;
	trace_kvmppc_vcore_blocked(vc, 1);
	++vc->runner->stat.halt_successful_wait;

	cur = ktime_get();

out:
	block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);

	/* Attribute wait time */
	if (do_sleep) {
		vc->runner->stat.generic.halt_wait_ns +=
			ktime_to_ns(cur) - ktime_to_ns(start_wait);
		KVM_STATS_LOG_HIST_UPDATE(
				vc->runner->stat.generic.halt_wait_hist,
				ktime_to_ns(cur) - ktime_to_ns(start_wait));
		/* Attribute failed poll time */
		if (vc->halt_poll_ns) {
			vc->runner->stat.generic.halt_poll_fail_ns +=
				ktime_to_ns(start_wait) -
				ktime_to_ns(start_poll);
			KVM_STATS_LOG_HIST_UPDATE(
				vc->runner->stat.generic.halt_poll_fail_hist,
				ktime_to_ns(start_wait) -
				ktime_to_ns(start_poll));
		}
	} else {
		/* Attribute successful poll time */
		if (vc->halt_poll_ns) {
			vc->runner->stat.generic.halt_poll_success_ns +=
				ktime_to_ns(cur) -
				ktime_to_ns(start_poll);
			KVM_STATS_LOG_HIST_UPDATE(
				vc->runner->stat.generic.halt_poll_success_hist,
				ktime_to_ns(cur) - ktime_to_ns(start_poll));
		}
	}

	/* Adjust poll time */
	if (halt_poll_ns) {
		if (block_ns <= vc->halt_poll_ns)
			;
		/* We slept and blocked for longer than the max halt time */
		else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
			shrink_halt_poll_ns(vc);
		/* We slept and our poll time is too small */
		else if (vc->halt_poll_ns < halt_poll_ns &&
				block_ns < halt_poll_ns)
			grow_halt_poll_ns(vc);
		if (vc->halt_poll_ns > halt_poll_ns)
			vc->halt_poll_ns = halt_poll_ns;
	} else
		vc->halt_poll_ns = 0;

	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}

/*
 * This never fails for a radix guest, as none of the operations it does
 * for a radix guest can fail or have a way to report failure.
 */
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
{
	int r = 0;
	struct kvm *kvm = vcpu->kvm;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	if (!kvm->arch.mmu_ready) {
		if (!kvm_is_radix(kvm))
			r = kvmppc_hv_setup_htab_rma(vcpu);
		if (!r) {
			if (cpu_has_feature(CPU_FTR_ARCH_300))
				kvmppc_setup_partition_table(kvm);
			kvm->arch.mmu_ready = 1;
		}
	}
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return r;
}

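/*
 * Run a vcpu as part of a virtual core (POWER8 and earlier path).  The
 * vcpu thread that becomes vc->runner drives kvmppc_run_core() on behalf
 * of all runnable vcpus in the vcore; the others sleep in
 * kvmppc_wait_for_exec() until they are run or need to return to the host.
 */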
static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	int n_ceded, i, r;
	struct kvmppc_vcore *vc;
	struct kvm_vcpu *v;

	trace_kvmppc_run_vcpu_enter(vcpu);

	run->exit_reason = 0;
	vcpu->arch.ret = RESUME_GUEST;
	vcpu->arch.trap = 0;
	kvmppc_update_vpas(vcpu);

	/*
	 * Synchronize with other threads in this virtual core
	 */
	vc = vcpu->arch.vcore;
	spin_lock(&vc->lock);
	vcpu->arch.ceded = 0;
	vcpu->arch.run_task = current;
	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
	vcpu->arch.busy_preempt = TB_NIL;
	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
	++vc->n_runnable;

	/*
	 * This happens the first time this is called for a vcpu.
	 * If the vcore is already running, we may be able to start
	 * this thread straight away and have it join in.
	 */
	if (!signal_pending(current)) {
		if ((vc->vcore_state == VCORE_PIGGYBACK ||
		     vc->vcore_state == VCORE_RUNNING) &&
			   !VCORE_IS_EXITING(vc)) {
			kvmppc_create_dtl_entry(vcpu, vc);
			kvmppc_start_thread(vcpu, vc);
			trace_kvm_guest_enter(vcpu);
		} else if (vc->vcore_state == VCORE_SLEEPING) {
		        rcuwait_wake_up(&vc->wait);
		}

	}

	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
	       !signal_pending(current)) {
		/* See if the MMU is ready to go */
		if (!vcpu->kvm->arch.mmu_ready) {
			spin_unlock(&vc->lock);
			r = kvmhv_setup_mmu(vcpu);
			spin_lock(&vc->lock);
			if (r) {
				run->exit_reason = KVM_EXIT_FAIL_ENTRY;
				run->fail_entry.
					hardware_entry_failure_reason = 0;
				vcpu->arch.ret = r;
				break;
			}
		}

		if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
			kvmppc_vcore_end_preempt(vc);

		if (vc->vcore_state != VCORE_INACTIVE) {
			kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
			continue;
		}
		for_each_runnable_thread(i, v, vc) {
			kvmppc_core_prepare_to_enter(v);
			if (signal_pending(v->arch.run_task)) {
				kvmppc_remove_runnable(vc, v, mftb());
				v->stat.signal_exits++;
				v->run->exit_reason = KVM_EXIT_INTR;
				v->arch.ret = -EINTR;
				wake_up(&v->arch.cpu_run);
			}
		}
		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
			break;
		n_ceded = 0;
		for_each_runnable_thread(i, v, vc) {
			if (!kvmppc_vcpu_woken(v))
				n_ceded += v->arch.ceded;
			else
				v->arch.ceded = 0;
		}
		vc->runner = vcpu;
		if (n_ceded == vc->n_runnable) {
			kvmppc_vcore_blocked(vc);
		} else if (need_resched()) {
			kvmppc_vcore_preempt(vc);
			/* Let something else run */
			cond_resched_lock(&vc->lock);
			if (vc->vcore_state == VCORE_PREEMPT)
				kvmppc_vcore_end_preempt(vc);
		} else {
			kvmppc_run_core(vc);
		}
		vc->runner = NULL;
	}

	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
	       (vc->vcore_state == VCORE_RUNNING ||
		vc->vcore_state == VCORE_EXITING ||
		vc->vcore_state == VCORE_PIGGYBACK))
		kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);

	if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
		kvmppc_vcore_end_preempt(vc);

	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
		kvmppc_remove_runnable(vc, vcpu, mftb());
		vcpu->stat.signal_exits++;
		run->exit_reason = KVM_EXIT_INTR;
		vcpu->arch.ret = -EINTR;
	}

	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
		/* Wake up some vcpu to run the core */
		i = -1;
		v = next_runnable_thread(vc, &i);
		wake_up(&v->arch.cpu_run);
	}

	trace_kvmppc_run_vcpu_exit(vcpu);
	spin_unlock(&vc->lock);
	return vcpu->arch.ret;
}

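/*
 * POWER9 and later path: run a single vcpu directly on this thread,
 * without the vcore/piggybacking machinery used on older processors.
 */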
int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
			  unsigned long lpcr)
{
	struct kvm_run *run = vcpu->run;
	int trap, r, pcpu;
	int srcu_idx;
	struct kvmppc_vcore *vc;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_nested_guest *nested = vcpu->arch.nested;
	unsigned long flags;
	u64 tb;

	trace_kvmppc_run_vcpu_enter(vcpu);

	run->exit_reason = 0;
	vcpu->arch.ret = RESUME_GUEST;
	vcpu->arch.trap = 0;

	vc = vcpu->arch.vcore;
	vcpu->arch.ceded = 0;
	vcpu->arch.run_task = current;
	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;

	/* See if the MMU is ready to go */
	if (unlikely(!kvm->arch.mmu_ready)) {
		r = kvmhv_setup_mmu(vcpu);
		if (r) {
			run->exit_reason = KVM_EXIT_FAIL_ENTRY;
			run->fail_entry.hardware_entry_failure_reason = 0;
			vcpu->arch.ret = r;
			return r;
		}
	}

	if (need_resched())
		cond_resched();

	kvmppc_update_vpas(vcpu);

	preempt_disable();
	pcpu = smp_processor_id();
	if (kvm_is_radix(kvm))
		kvmppc_prepare_radix_vcpu(vcpu, pcpu);

	/* flags save not required, but irq_pmu has no disable/enable API */
	powerpc_local_irq_pmu_save(flags);

	if (signal_pending(current))
		goto sigpend;
	if (need_resched() || !kvm->arch.mmu_ready)
		goto out;

	if (!nested) {
		kvmppc_core_prepare_to_enter(vcpu);
		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
			     &vcpu->arch.pending_exceptions))
			lpcr |= LPCR_MER;
	} else if (vcpu->arch.pending_exceptions ||
		   vcpu->arch.doorbell_request ||
		   xive_interrupt_pending(vcpu)) {
		vcpu->arch.ret = RESUME_HOST;
		goto out;
	}

	if (vcpu->arch.timer_running) {
		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
		vcpu->arch.timer_running = 0;
	}

	tb = mftb();

	vcpu->cpu = pcpu;
	vcpu->arch.thread_cpu = pcpu;
	vc->pcpu = pcpu;
	local_paca->kvm_hstate.kvm_vcpu = vcpu;
	local_paca->kvm_hstate.ptid = 0;
	local_paca->kvm_hstate.fake_suspend = 0;

	__kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0);

	trace_kvm_guest_enter(vcpu);

	guest_enter_irqoff();

	srcu_idx = srcu_read_lock(&kvm->srcu);

	this_cpu_disable_ftrace();

	/* Tell lockdep that we're about to enable interrupts */
	trace_hardirqs_on();

	trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, &tb);
	vcpu->arch.trap = trap;

	trace_hardirqs_off();

	this_cpu_enable_ftrace();

	srcu_read_unlock(&kvm->srcu, srcu_idx);

	set_irq_happened(trap);

	context_tracking_guest_exit();
	if (!vtime_accounting_enabled_this_cpu()) {
		local_irq_enable();
		/*
		 * Service IRQs here before vtime_account_guest_exit() so any
		 * ticks that occurred while running the guest are accounted to
		 * the guest. If vtime accounting is enabled, accounting uses
		 * TB rather than ticks, so it can be done without enabling
		 * interrupts here, which has the problem that it accounts
		 * interrupt processing overhead to the host.
		 */
		local_irq_disable();
	}
	vtime_account_guest_exit();

	vcpu->cpu = -1;
	vcpu->arch.thread_cpu = -1;

	powerpc_local_irq_pmu_restore(flags);

	preempt_enable();

	/*
	 * cancel pending decrementer exception if DEC is now positive, or if
	 * entering a nested guest in which case the decrementer is now owned
	 * by L2 and the L1 decrementer is provided in hdec_expires
	 */
	if (kvmppc_core_pending_dec(vcpu) &&
			((tb < kvmppc_dec_expires_host_tb(vcpu)) ||
			 (trap == BOOK3S_INTERRUPT_SYSCALL &&
			  kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
		kvmppc_core_dequeue_dec(vcpu);

	trace_kvm_guest_exit(vcpu);
	r = RESUME_GUEST;
	if (trap) {
		if (!nested)
			r = kvmppc_handle_exit_hv(vcpu, current);
		else
			r = kvmppc_handle_nested_exit(vcpu);
	}
	vcpu->arch.ret = r;

	if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
		kvmppc_set_timer(vcpu);

		prepare_to_rcuwait(&vcpu->wait);
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (signal_pending(current)) {
				vcpu->stat.signal_exits++;
				run->exit_reason = KVM_EXIT_INTR;
				vcpu->arch.ret = -EINTR;
				break;
			}

			if (kvmppc_vcpu_check_block(vcpu))
				break;

			trace_kvmppc_vcore_blocked(vc, 0);
			schedule();
			trace_kvmppc_vcore_blocked(vc, 1);
		}
		finish_rcuwait(&vcpu->wait);
	}
	vcpu->arch.ceded = 0;

 done:
	trace_kvmppc_run_vcpu_exit(vcpu);

	return vcpu->arch.ret;

 sigpend:
	vcpu->stat.signal_exits++;
	run->exit_reason = KVM_EXIT_INTR;
	vcpu->arch.ret = -EINTR;
 out:
	powerpc_local_irq_pmu_restore(flags);
	preempt_enable();
	goto done;
}

static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	int r;
	int srcu_idx;
	struct kvm *kvm;
	unsigned long msr;

	if (!vcpu->arch.sane) {
		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		return -EINVAL;
	}

	/* No need to go into the guest when all we'll do is come back out */
	if (signal_pending(current)) {
		run->exit_reason = KVM_EXIT_INTR;
		return -EINTR;
	}

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	/*
	 * Don't allow entry with a suspended transaction, because
	 * the guest entry/exit code will lose it.
	 */
	if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
	    (current->thread.regs->msr & MSR_TM)) {
		if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
			run->exit_reason = KVM_EXIT_FAIL_ENTRY;
			run->fail_entry.hardware_entry_failure_reason = 0;
			return -EINVAL;
		}
	}
#endif

4665 4666 4667 4668 4669 4670 4671 4672 4673
	/*
	 * Force online to 1 for the sake of old userspace which doesn't
	 * set it.
	 */
	if (!vcpu->arch.online) {
		atomic_inc(&vcpu->arch.vcore->online_count);
		vcpu->arch.online = 1;
	}

	kvmppc_core_prepare_to_enter(vcpu);

	kvm = vcpu->kvm;
	atomic_inc(&kvm->arch.vcpus_running);
	/* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
	smp_mb();

	msr = 0;
	if (IS_ENABLED(CONFIG_PPC_FPU))
		msr |= MSR_FP;
	if (cpu_has_feature(CPU_FTR_ALTIVEC))
		msr |= MSR_VEC;
	if (cpu_has_feature(CPU_FTR_VSX))
		msr |= MSR_VSX;
	if ((cpu_has_feature(CPU_FTR_TM) ||
	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
			(vcpu->arch.hfscr & HFSCR_TM))
		msr |= MSR_TM;
	msr = msr_check_and_set(msr);

	kvmppc_save_user_regs();

	kvmppc_save_current_sprs();

	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		vcpu->arch.waitp = &vcpu->arch.vcore->wait;
	vcpu->arch.pgdir = kvm->mm->pgd;
	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;

	do {
		if (cpu_has_feature(CPU_FTR_ARCH_300))
			r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
						  vcpu->arch.vcore->lpcr);
		else
			r = kvmppc_run_vcpu(vcpu);

		if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
			if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
				/*
				 * These should have been caught and reflected
				 * into the guest by now. Final sanity check:
				 * don't allow userspace to execute hcalls in
				 * the hypervisor.
				 */
				r = RESUME_GUEST;
				continue;
			}
			trace_kvm_hcall_enter(vcpu);
			r = kvmppc_pseries_do_hcall(vcpu);
			trace_kvm_hcall_exit(vcpu, r);
			kvmppc_core_prepare_to_enter(vcpu);
		} else if (r == RESUME_PAGE_FAULT) {
			srcu_idx = srcu_read_lock(&kvm->srcu);
			r = kvmppc_book3s_hv_page_fault(vcpu,
				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
			srcu_read_unlock(&kvm->srcu, srcu_idx);
		} else if (r == RESUME_PASSTHROUGH) {
			if (WARN_ON(xics_on_xive()))
				r = H_SUCCESS;
			else
				r = kvmppc_xics_rm_complete(vcpu, 0);
		}
	} while (is_kvmppc_resume_guest(r));

	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
	atomic_dec(&kvm->arch.vcpus_running);

	srr_regs_clobbered();

	return r;
}

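/*
 * Fill in one segment page-size entry for KVM_PPC_GET_SMMU_INFO and
 * advance the caller's cursor.
 */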
static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
				     int shift, int sllp)
{
	(*sps)->page_shift = shift;
	(*sps)->slb_enc = sllp;
	(*sps)->enc[0].page_shift = shift;
	(*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
	/*
	 * Add 16MB MPSS support (may get filtered out by userspace)
	 */
	if (shift != 24) {
		int penc = kvmppc_pgsize_lp_encoding(shift, 24);
		if (penc != -1) {
			(*sps)->enc[1].page_shift = 24;
			(*sps)->enc[1].pte_enc = penc;
		}
	}
	(*sps)++;
}

static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
					 struct kvm_ppc_smmu_info *info)
{
	struct kvm_ppc_one_seg_page_size *sps;

	/*
	 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
	 * POWER7 doesn't support keys for instruction accesses,
	 * POWER8 and POWER9 do.
	 */
	info->data_keys = 32;
	info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;

	/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
	info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
	info->slb_size = 32;

	/* We only support these sizes for now, and no multi-size segments */
	sps = &info->sps[0];
	kvmppc_add_seg_page_size(&sps, 12, 0);
	kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
	kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);

	/* If running as a nested hypervisor, we don't support HPT guests */
	if (kvmhv_on_pseries())
		info->flags |= KVM_PPC_NO_HASH;

	return 0;
}

/*
 * Get (and clear) the dirty memory log for a memory slot.
 */
static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
					 struct kvm_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i, r;
	unsigned long n;
	unsigned long *buf, *p;
	struct kvm_vcpu *vcpu;

	mutex_lock(&kvm->slots_lock);

	r = -EINVAL;
	if (log->slot >= KVM_USER_MEM_SLOTS)
		goto out;

	slots = kvm_memslots(kvm);
	memslot = id_to_memslot(slots, log->slot);
	r = -ENOENT;
	if (!memslot || !memslot->dirty_bitmap)
		goto out;

	/*
	 * Use second half of bitmap area because both HPT and radix
	 * accumulate bits in the first half.
	 */
	n = kvm_dirty_bitmap_bytes(memslot);
	buf = memslot->dirty_bitmap + n / sizeof(long);
	memset(buf, 0, n);

	if (kvm_is_radix(kvm))
		r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
	else
		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
	if (r)
		goto out;

	/*
	 * We accumulate dirty bits in the first half of the
	 * memslot's dirty_bitmap area, for when pages are paged
	 * out or modified by the host directly.  Pick up these
	 * bits and add them to the map.
	 */
	p = memslot->dirty_bitmap;
	for (i = 0; i < n / sizeof(long); ++i)
		buf[i] |= xchg(&p[i], 0);

	/* Harvest dirty bits from VPA and DTL updates */
	/* Note: we never modify the SLB shadow buffer areas */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		spin_lock(&vcpu->arch.vpa_update_lock);
		kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
		kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
		spin_unlock(&vcpu->arch.vpa_update_lock);
	}

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, buf, n))
		goto out;

	r = 0;
out:
	mutex_unlock(&kvm->slots_lock);
	return r;
}

static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
{
	vfree(slot->arch.rmap);
	slot->arch.rmap = NULL;
}

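/*
 * Allocate the per-page rmap array for a newly created memslot; it is
 * freed again in kvmppc_core_free_memslot_hv() above.
 */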
static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
					struct kvm_memory_slot *slot,
					const struct kvm_userspace_memory_region *mem,
					enum kvm_mr_change change)
{
	unsigned long npages = mem->memory_size >> PAGE_SHIFT;

	if (change == KVM_MR_CREATE) {
		slot->arch.rmap = vzalloc(array_size(npages,
					  sizeof(*slot->arch.rmap)));
		if (!slot->arch.rmap)
			return -ENOMEM;
	}

	return 0;
}

static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
				const struct kvm_userspace_memory_region *mem,
				const struct kvm_memory_slot *old,
				const struct kvm_memory_slot *new,
				enum kvm_mr_change change)
{
	unsigned long npages = mem->memory_size >> PAGE_SHIFT;

	/*
	 * If we are making a new memslot, it might make
	 * some address that was previously cached as emulated
	 * MMIO be no longer emulated MMIO, so invalidate
	 * all the caches of emulated MMIO translations.
	 */
	if (npages)
		atomic64_inc(&kvm->arch.mmio_update);

	/*
	 * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
	 * have already called kvm_arch_flush_shadow_memslot() to
	 * flush shadow mappings.  For KVM_MR_CREATE we have no
	 * previous mappings.  So the only case to handle is
	 * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
	 * has been changed.
	 * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
	 * to get rid of any THP PTEs in the partition-scoped page tables
	 * so we can track dirtiness at the page level; we flush when
	 * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
	 * using THP PTEs.
	 */
	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
		kvmppc_radix_flush_memslot(kvm, old);
	/*
	 * If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
	 */
	if (!kvm->arch.secure_guest)
		return;

	switch (change) {
	case KVM_MR_CREATE:
		/*
		 * @TODO kvmppc_uvmem_memslot_create() can fail and
		 * return error. Fix this.
		 */
		kvmppc_uvmem_memslot_create(kvm, new);
		break;
	case KVM_MR_DELETE:
		kvmppc_uvmem_memslot_delete(kvm, old);
		break;
	default:
		/* TODO: Handle KVM_MR_MOVE */
		break;
	}
}

/*
 * Update LPCR values in kvm->arch and in vcores.
 * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
 * of kvm->arch.lpcr update).
 */
void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
{
	long int i;
	u32 cores_done = 0;

	if ((kvm->arch.lpcr & mask) == lpcr)
		return;

	kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;

	for (i = 0; i < KVM_MAX_VCORES; ++i) {
		struct kvmppc_vcore *vc = kvm->arch.vcores[i];
		if (!vc)
			continue;

		spin_lock(&vc->lock);
		vc->lpcr = (vc->lpcr & ~mask) | lpcr;
		verify_lpcr(kvm, vc->lpcr);
		spin_unlock(&vc->lock);
		if (++cores_done >= kvm->arch.online_vcores)
			break;
	}
}

void kvmppc_setup_partition_table(struct kvm *kvm)
{
	unsigned long dw0, dw1;

	if (!kvm_is_radix(kvm)) {
		/* PS field - page size for VRMA */
		dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
			((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
		/* HTABSIZE and HTABORG fields */
		dw0 |= kvm->arch.sdr1;

		/* Second dword as set by userspace */
		dw1 = kvm->arch.process_table;
	} else {
		dw0 = PATB_HR | radix__get_tree_size() |
			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
		dw1 = PATB_GR | kvm->arch.process_table;
	}
	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
}

/*
 * Set up HPT (hashed page table) and RMA (real-mode area).
 * Must be called with kvm->arch.mmu_setup_lock held.
 */
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
	int err = 0;
	struct kvm *kvm = vcpu->kvm;
	unsigned long hva;
	struct kvm_memory_slot *memslot;
	struct vm_area_struct *vma;
	unsigned long lpcr = 0, senc;
	unsigned long psize, porder;
	int srcu_idx;

	/* Allocate hashed page table (if not done already) and reset it */
	if (!kvm->arch.hpt.virt) {
		int order = KVM_DEFAULT_HPT_ORDER;
		struct kvm_hpt_info info;

		err = kvmppc_allocate_hpt(&info, order);
		/* If we get here, it means userspace didn't specify a
		 * size explicitly.  So, try successively smaller
		 * sizes if the default failed. */
		while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
			err  = kvmppc_allocate_hpt(&info, order);

		if (err < 0) {
			pr_err("KVM: Couldn't alloc HPT\n");
			goto out;
		}

		kvmppc_set_hpt(kvm, &info);
	}

	/* Look up the memslot for guest physical address 0 */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, 0);

	/* We must have some memory at 0 by now */
	err = -EINVAL;
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		goto out_srcu;

	/* Look up the VMA for the start of this memory slot */
	hva = memslot->userspace_addr;
	mmap_read_lock(kvm->mm);
	vma = vma_lookup(kvm->mm, hva);
	if (!vma || (vma->vm_flags & VM_IO))
		goto up_out;

	psize = vma_kernel_pagesize(vma);

	mmap_read_unlock(kvm->mm);

	/* We can handle 4k, 64k or 16M pages in the VRMA */
	if (psize >= 0x1000000)
		psize = 0x1000000;
	else if (psize >= 0x10000)
		psize = 0x10000;
	else
		psize = 0x1000;
	porder = __ilog2(psize);

	senc = slb_pgsize_encoding(psize);
	kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
		(VRMA_VSID << SLB_VSID_SHIFT_1T);
	/* Create HPTEs in the hash page table for the VRMA */
	kvmppc_map_vrma(vcpu, memslot, porder);

	/* Update VRMASD field in the LPCR */
	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
		/* the -4 is to account for senc values starting at 0x10 */
		lpcr = senc << (LPCR_VRMASD_SH - 4);
		kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
	}

	/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
	smp_wmb();
	err = 0;
 out_srcu:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
 out:
	return err;

 up_out:
	mmap_read_unlock(kvm->mm);
	goto out_srcu;
}

/*
 * Must be called with kvm->arch.mmu_setup_lock held and
 * mmu_ready = 0 and no vcpus running.
 */
int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
{
	unsigned long lpcr, lpcr_mask;

	if (nesting_enabled(kvm))
		kvmhv_release_all_nested(kvm);
	kvmppc_rmap_reset(kvm);
	kvm->arch.process_table = 0;
	/* Mutual exclusion with kvm_unmap_gfn_range etc. */
	spin_lock(&kvm->mmu_lock);
	kvm->arch.radix = 0;
	spin_unlock(&kvm->mmu_lock);
	kvmppc_free_radix(kvm);

	lpcr = LPCR_VPM1;
	lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
	if (cpu_has_feature(CPU_FTR_ARCH_31))
		lpcr_mask |= LPCR_HAIL;
	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);

	return 0;
}

/*
 * Must be called with kvm->arch.mmu_setup_lock held and
 * mmu_ready = 0 and no vcpus running.
 */
int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
{
	unsigned long lpcr, lpcr_mask;
	int err;

	err = kvmppc_init_vm_radix(kvm);
	if (err)
		return err;
	kvmppc_rmap_reset(kvm);
	/* Mutual exclusion with kvm_unmap_gfn_range etc. */
	spin_lock(&kvm->mmu_lock);
	kvm->arch.radix = 1;
	spin_unlock(&kvm->mmu_lock);
	kvmppc_free_hpt(&kvm->arch.hpt);

	lpcr = LPCR_UPRT | LPCR_GTSE | LPCR_HR;
	lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
		lpcr_mask |= LPCR_HAIL;
		if (cpu_has_feature(CPU_FTR_HVMODE) &&
				(kvm->arch.host_lpcr & LPCR_HAIL))
			lpcr |= LPCR_HAIL;
	}
	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);

	return 0;
}

#ifdef CONFIG_KVM_XICS
/*
 * Allocate a per-core structure for managing state about which cores are
 * running in the host versus the guest and for exchanging data between
 * real mode KVM and CPU running in the host.
 * This is only done for the first VM.
 * The allocated structure stays even if all VMs have stopped.
 * It is only freed when the kvm-hv module is unloaded.
 * It's OK for this routine to fail, we just don't support host
 * core operations like redirecting H_IPI wakeups.
 */
void kvmppc_alloc_host_rm_ops(void)
{
	struct kvmppc_host_rm_ops *ops;
	unsigned long l_ops;
	int cpu, core;
	int size;

	if (cpu_has_feature(CPU_FTR_ARCH_300))
		return;

	/* Not the first time here ? */
	if (kvmppc_host_rm_ops_hv != NULL)
		return;

	ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
	if (!ops)
		return;

	size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
	ops->rm_core = kzalloc(size, GFP_KERNEL);

	if (!ops->rm_core) {
		kfree(ops);
		return;
	}

	cpus_read_lock();

	for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
		if (!cpu_online(cpu))
			continue;

		core = cpu >> threads_shift;
		ops->rm_core[core].rm_state.in_host = 1;
	}

	ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;

	/*
	 * Make the contents of the kvmppc_host_rm_ops structure visible
	 * to other CPUs before we assign it to the global variable.
	 * Do an atomic assignment (no locks used here), but if someone
	 * beats us to it, just free our copy and return.
	 */
	smp_wmb();
	l_ops = (unsigned long) ops;

	if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
		cpus_read_unlock();
		kfree(ops->rm_core);
		kfree(ops);
		return;
	}

	cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
					     "ppc/kvm_book3s:prepare",
					     kvmppc_set_host_core,
					     kvmppc_clear_host_core);
	cpus_read_unlock();
}

void kvmppc_free_host_rm_ops(void)
{
	if (kvmppc_host_rm_ops_hv) {
		cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
		kfree(kvmppc_host_rm_ops_hv->rm_core);
		kfree(kvmppc_host_rm_ops_hv);
		kvmppc_host_rm_ops_hv = NULL;
	}
}
#endif

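/*
 * Initialize a new HV guest: allocate an LPID, set up the initial LPCR,
 * and, if the host is running in radix mode, create the radix page
 * tables and partition-table entry up front.
 */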
static int kvmppc_core_init_vm_hv(struct kvm *kvm)
{
	unsigned long lpcr, lpid;
	char buf[32];
	int ret;

	mutex_init(&kvm->arch.uvmem_lock);
	INIT_LIST_HEAD(&kvm->arch.uvmem_pfns);
	mutex_init(&kvm->arch.mmu_setup_lock);

	/* Allocate the guest's logical partition ID */

	lpid = kvmppc_alloc_lpid();
	if ((long)lpid < 0)
		return -ENOMEM;
	kvm->arch.lpid = lpid;

	kvmppc_alloc_host_rm_ops();

	kvmhv_vm_nested_init(kvm);

	/*
	 * Since we don't flush the TLB when tearing down a VM,
	 * and this lpid might have previously been used,
	 * make sure we flush on each core before running the new VM.
	 * On POWER9, the tlbie in mmu_partition_table_set_entry()
	 * does this flush for us.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		cpumask_setall(&kvm->arch.need_tlb_flush);

	/* Start out with the default set of hcalls enabled */
	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
	       sizeof(kvm->arch.enabled_hcalls));

	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);

	/* Init LPCR for virtual RMA mode */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		kvm->arch.host_lpid = mfspr(SPRN_LPID);
		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
		lpcr &= LPCR_PECE | LPCR_LPES;
	} else {
		lpcr = 0;
	}
	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
		LPCR_VPM0 | LPCR_VPM1;
	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
		(VRMA_VSID << SLB_VSID_SHIFT_1T);
	/* On POWER8 turn on online bit to enable PURR/SPURR */
	if (cpu_has_feature(CPU_FTR_ARCH_207S))
		lpcr |= LPCR_ONL;
	/*
	 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
	 * Set HVICE bit to enable hypervisor virtualization interrupts.
	 * Set HEIC to prevent OS interrupts from going to the hypervisor (should
	 * be unnecessary but better safe than sorry in case we re-enable
	 * EE in HV mode with this LPCR still set)
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		lpcr &= ~LPCR_VPM0;
		lpcr |= LPCR_HVICE | LPCR_HEIC;

		/*
		 * If xive is enabled, we route 0x500 interrupts directly
		 * to the guest.
		 */
		if (xics_on_xive())
			lpcr |= LPCR_LPES;
	}

	/*
	 * If the host uses radix, the guest starts out as radix.
	 */
	if (radix_enabled()) {
		kvm->arch.radix = 1;
		kvm->arch.mmu_ready = 1;
		lpcr &= ~LPCR_VPM1;
		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
		if (cpu_has_feature(CPU_FTR_HVMODE) &&
		    cpu_has_feature(CPU_FTR_ARCH_31) &&
		    (kvm->arch.host_lpcr & LPCR_HAIL))
			lpcr |= LPCR_HAIL;
		ret = kvmppc_init_vm_radix(kvm);
		if (ret) {
			kvmppc_free_lpid(kvm->arch.lpid);
			return ret;
		}
		kvmppc_setup_partition_table(kvm);
	}

	verify_lpcr(kvm, lpcr);
	kvm->arch.lpcr = lpcr;

	/* Initialization for future HPT resizes */
	kvm->arch.resize_hpt = NULL;

	/*
	 * Work out how many sets the TLB has, for the use of
	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
		/*
		 * P10 will flush all the congruence class with a single tlbiel
		 */
		kvm->arch.tlb_sets = 1;
	} else if (radix_enabled())
		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
	else if (cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
	else
		kvm->arch.tlb_sets = POWER7_TLB_SETS;		/* 128 */

	/*
	 * Track that we now have a HV mode VM active. This blocks secondary
	 * CPU threads from coming online.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm_hv_vm_activated();

	/*
	 * Initialize smt_mode depending on processor.
	 * POWER8 and earlier have to use "strict" threading, where
	 * all vCPUs in a vcore have to run on the same (sub)core,
	 * whereas on POWER9 the threads can each run a different
	 * guest.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm->arch.smt_mode = threads_per_subcore;
	else
		kvm->arch.smt_mode = 1;
	kvm->arch.emul_smt_mode = 1;

	/*
	 * Create a debugfs directory for the VM
	 */
	snprintf(buf, sizeof(buf), "vm%d", current->pid);
	kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
	kvmppc_mmu_debugfs_init(kvm);
	if (radix_enabled())
		kvmhv_radix_debugfs_init(kvm);

	return 0;
}

static void kvmppc_free_vcores(struct kvm *kvm)
{
	long int i;

	for (i = 0; i < KVM_MAX_VCORES; ++i)
		kfree(kvm->arch.vcores[i]);
	kvm->arch.online_vcores = 0;
}

static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
	debugfs_remove_recursive(kvm->arch.debugfs_dir);

	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		kvm_hv_vm_deactivated();

	kvmppc_free_vcores(kvm);


	if (kvm_is_radix(kvm))
		kvmppc_free_radix(kvm);
	else
		kvmppc_free_hpt(&kvm->arch.hpt);

	/* Perform global invalidation and return lpid to the pool */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		if (nesting_enabled(kvm))
			kvmhv_release_all_nested(kvm);
		kvm->arch.process_table = 0;
		if (kvm->arch.secure_guest)
			uv_svm_terminate(kvm->arch.lpid);
		kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
	}

	kvmppc_free_lpid(kvm->arch.lpid);

	kvmppc_free_pimap(kvm);
}

/* We don't need to emulate any privileged instructions or dcbz */
static int kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu,
				     unsigned int inst, int *advance)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
					ulong spr_val)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
					ulong *spr_val)
{
	return EMULATE_FAIL;
}

static int kvmppc_core_check_processor_compat_hv(void)
{
	if (cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_ARCH_206))
		return 0;

	/* POWER9 in radix mode is capable of being a nested hypervisor. */
	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
		return 0;

	return -EIO;
}

#ifdef CONFIG_KVM_XICS

void kvmppc_free_pimap(struct kvm *kvm)
{
	kfree(kvm->arch.pimap);
}

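/*
 * Allocate the per-VM passthrough IRQ map, used to translate guest
 * interrupt numbers to host hardware IRQs for passed-through devices.
 */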
static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
{
	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
}

static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
{
	struct irq_desc *desc;
	struct kvmppc_irq_map *irq_map;
	struct kvmppc_passthru_irqmap *pimap;
	struct irq_chip *chip;
	int i, rc = 0;
	struct irq_data *host_data;

	if (!kvm_irq_bypass)
		return 1;

	desc = irq_to_desc(host_irq);
	if (!desc)
		return -EIO;

	mutex_lock(&kvm->lock);

	pimap = kvm->arch.pimap;
	if (pimap == NULL) {
		/* First call, allocate structure to hold IRQ map */
		pimap = kvmppc_alloc_pimap();
		if (pimap == NULL) {
			mutex_unlock(&kvm->lock);
			return -ENOMEM;
		}
		kvm->arch.pimap = pimap;
	}

	/*
	 * For now, we only support interrupts for which the EOI operation
	 * is an OPAL call followed by a write to XIRR (since that's what
	 * our real-mode EOI code does), or a XIVE interrupt.
	 */
	chip = irq_data_get_irq_chip(&desc->irq_data);
	if (!chip || !is_pnv_opal_msi(chip)) {
		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
			host_irq, guest_gsi);
		mutex_unlock(&kvm->lock);
		return -ENOENT;
	}

	/*
	 * See if we already have an entry for this guest IRQ number.
	 * If it's mapped to a hardware IRQ number, that's an error,
	 * otherwise re-use this entry.
	 */
	for (i = 0; i < pimap->n_mapped; i++) {
		if (guest_gsi == pimap->mapped[i].v_hwirq) {
			if (pimap->mapped[i].r_hwirq) {
				mutex_unlock(&kvm->lock);
				return -EINVAL;
			}
			break;
		}
	}

	if (i == KVMPPC_PIRQ_MAPPED) {
		mutex_unlock(&kvm->lock);
		return -EAGAIN;		/* table is full */
	}

	irq_map = &pimap->mapped[i];

	irq_map->v_hwirq = guest_gsi;
	irq_map->desc = desc;

	/*
	 * Order the above two stores before the next to serialize with
	 * the KVM real mode handler.
	 */
	smp_wmb();

	/*
	 * The 'host_irq' number is mapped in the PCI-MSI domain but
	 * the underlying calls, which will EOI the interrupt in real
	 * mode, need an HW IRQ number mapped in the XICS IRQ domain.
	 */
	host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq);
	irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);
	if (i == pimap->n_mapped)
		pimap->n_mapped++;

	if (xics_on_xive())
		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
	else
		kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
	if (rc)
		irq_map->r_hwirq = 0;

	mutex_unlock(&kvm->lock);

	return 0;
}

static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
{
	struct irq_desc *desc;
	struct kvmppc_passthru_irqmap *pimap;
	int i, rc = 0;

	if (!kvm_irq_bypass)
		return 0;

	desc = irq_to_desc(host_irq);
	if (!desc)
		return -EIO;

	mutex_lock(&kvm->lock);
	if (!kvm->arch.pimap)
		goto unlock;

	pimap = kvm->arch.pimap;

	for (i = 0; i < pimap->n_mapped; i++) {
		if (guest_gsi == pimap->mapped[i].v_hwirq)
			break;
	}

	if (i == pimap->n_mapped) {
		mutex_unlock(&kvm->lock);
		return -ENODEV;
	}

	if (xics_on_xive())
		rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
	else
		kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);

	/* invalidate the entry (what to do on error from the above?) */
	pimap->mapped[i].r_hwirq = 0;

	/*
	 * We don't free this structure even when the count goes to
	 * zero. The structure is freed when we destroy the VM.
	 */
 unlock:
	mutex_unlock(&kvm->lock);
	return rc;
}

static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
					     struct irq_bypass_producer *prod)
{
	int ret = 0;
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	irqfd->producer = prod;

	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
	if (ret)
		pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
			prod->irq, irqfd->gsi, ret);

	return ret;
}

static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
					      struct irq_bypass_producer *prod)
{
	int ret;
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	irqfd->producer = NULL;

	/*
	 * When the producer of a consumer is unregistered, we change back to
	 * default external interrupt handling mode - KVM real mode
	 * will switch back to host.
	 */
	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
	if (ret)
		pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
			prod->irq, irqfd->gsi, ret);
}
#endif

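/*
 * HV-specific VM ioctls: allocate or reset the guest HPT
 * (KVM_PPC_ALLOCATE_HTAB), expose it through a file descriptor
 * (KVM_PPC_GET_HTAB_FD), and prepare or commit an HPT resize.
 *
 * Illustrative userspace call (a sketch, not part of this file) asking
 * for a 2^26-byte HPT on a VM file descriptor:
 *
 *	__u32 order = 26;
 *	ioctl(vm_fd, KVM_PPC_ALLOCATE_HTAB, &order);
 */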
static long kvm_arch_vm_ioctl_hv(struct file *filp,
				 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm __maybe_unused = filp->private_data;
	void __user *argp = (void __user *)arg;
	long r;

	switch (ioctl) {

	case KVM_PPC_ALLOCATE_HTAB: {
		u32 htab_order;

		/* If we're a nested hypervisor, we currently only support radix */
		if (kvmhv_on_pseries()) {
			r = -EOPNOTSUPP;
			break;
		}

		r = -EFAULT;
		if (get_user(htab_order, (u32 __user *)argp))
			break;
		r = kvmppc_alloc_reset_hpt(kvm, htab_order);
		if (r)
			break;
		r = 0;
		break;
	}

	case KVM_PPC_GET_HTAB_FD: {
		struct kvm_get_htab_fd ghf;

		r = -EFAULT;
		if (copy_from_user(&ghf, argp, sizeof(ghf)))
			break;
		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
		break;
	}

	case KVM_PPC_RESIZE_HPT_PREPARE: {
		struct kvm_ppc_resize_hpt rhpt;

		r = -EFAULT;
		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
			break;

		r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
		break;
	}

	case KVM_PPC_RESIZE_HPT_COMMIT: {
		struct kvm_ppc_resize_hpt rhpt;

		r = -EFAULT;
		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
			break;

		r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
		break;
	}

	default:
		r = -ENOTTY;
	}

	return r;
}

/*
 * List of hcall numbers to enable by default.
 * For compatibility with old userspace, we enable by default
 * all hcalls that were implemented before the hcall-enabling
 * facility was added.  Note this list should not include H_RTAS.
 */
static unsigned int default_hcall_list[] = {
	H_REMOVE,
	H_ENTER,
	H_READ,
	H_PROTECT,
	H_BULK_REMOVE,
#ifdef CONFIG_SPAPR_TCE_IOMMU
	H_GET_TCE,
	H_PUT_TCE,
#endif
	H_SET_DABR,
	H_SET_XDABR,
	H_CEDE,
	H_PROD,
	H_CONFER,
	H_REGISTER_VPA,
#ifdef CONFIG_KVM_XICS
	H_EOI,
	H_CPPR,
	H_IPI,
	H_IPOLL,
	H_XIRR,
	H_XIRR_X,
#endif
	0
};
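
/*
 * At run time userspace can enable additional hcalls (or disable these)
 * one at a time via the KVM_ENABLE_CAP vm ioctl with the
 * KVM_CAP_PPC_ENABLE_HCALL capability.  A sketch of such a call, not
 * part of this file:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_PPC_ENABLE_HCALL,
 *		.args = { H_LOGICAL_CI_LOAD, 1 },
 *	};
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */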

static void init_default_hcalls(void)
{
	int i;
	unsigned int hcall;

	for (i = 0; default_hcall_list[i]; ++i) {
		hcall = default_hcall_list[i];
		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
		__set_bit(hcall / 4, default_enabled_hcalls);
	}
}

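/*
 * Configure the guest MMU mode (hash or radix) and process table,
 * normally reached via the KVM_PPC_CONFIGURE_V3_MMU vm ioctl.  Only
 * available on POWER9 and later; a radix guest needs a radix host, a
 * nested hypervisor can only run radix guests, and the switch is
 * refused with -EBUSY while any vcpu is running.
 */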
static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{
	unsigned long lpcr;
	int radix;
	int err;

	/* If not on a POWER9, reject it */
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return -ENODEV;

	/* If any unknown flags set, reject it */
	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
		return -EINVAL;

	/* GR (guest radix) bit in process_table field must match */
	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
	if (!!(cfg->process_table & PATB_GR) != radix)
		return -EINVAL;

	/* Process table size field must be reasonable, i.e. <= 24 */
	if ((cfg->process_table & PRTS_MASK) > 24)
		return -EINVAL;

	/* We can change a guest to/from radix now, if the host is radix */
	if (radix && !radix_enabled())
		return -EINVAL;

	/* If we're a nested hypervisor, we currently only support radix */
	if (kvmhv_on_pseries() && !radix)
		return -EINVAL;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	if (radix != kvm_is_radix(kvm)) {
		if (kvm->arch.mmu_ready) {
			kvm->arch.mmu_ready = 0;
			/* order mmu_ready vs. vcpus_running */
			smp_mb();
			if (atomic_read(&kvm->arch.vcpus_running)) {
				kvm->arch.mmu_ready = 1;
				err = -EBUSY;
				goto out_unlock;
			}
		}
		if (radix)
			err = kvmppc_switch_mmu_to_radix(kvm);
		else
			err = kvmppc_switch_mmu_to_hpt(kvm);
		if (err)
			goto out_unlock;
	}

	kvm->arch.process_table = cfg->process_table;
	kvmppc_setup_partition_table(kvm);

	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
	err = 0;

 out_unlock:
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return err;
}

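/*
 * Enable the nested-hypervisor capability for a VM, or just report
 * whether it is available when kvm == NULL.  Requires the "nested"
 * module parameter, a POWER9 or later CPU and a radix host.
 */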
static int kvmhv_enable_nested(struct kvm *kvm)
{
	if (!nested)
		return -EPERM;
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return -ENODEV;
	if (!radix_enabled())
		return -ENODEV;

	/* kvm == NULL means the caller is testing if the capability exists */
	if (kvm)
		kvm->arch.nested_enable = true;
	return 0;
}

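/*
 * Copy bytes from a guest effective address into *ptr.  Only radix
 * guests are handled, using the quadrant access helpers; a failure for
 * a nested guest is reported as -EAGAIN because quadrants are currently
 * the only way to reach nested guest memory.  kvmhv_store_to_eaddr()
 * below is the mirror image for writes.
 */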
static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
				 int size)
{
	int rc = -EINVAL;

	if (kvmhv_vcpu_is_radix(vcpu)) {
		rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);

		if (rc > 0)
			rc = -EINVAL;
	}

	/* For now quadrants are the only way to access nested guest memory */
	if (rc && vcpu->arch.nested)
		rc = -EAGAIN;

	return rc;
}

static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
				int size)
{
	int rc = -EINVAL;

	if (kvmhv_vcpu_is_radix(vcpu)) {
		rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);

		if (rc > 0)
			rc = -EINVAL;
	}

	/* For now quadrants are the only way to access nested guest memory */
	if (rc && vcpu->arch.nested)
		rc = -EAGAIN;

	return rc;
}

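/*
 * Unpin a VPA-style area (VPA, DTL or SLB shadow buffer) and clear the
 * vcpu's record of it, so that nothing is left pinned when a secure
 * guest is reset.
 */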
static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa)
{
	unpin_vpa(kvm, vpa);
	vpa->gpa = 0;
	vpa->pinned_addr = NULL;
	vpa->dirty = false;
	vpa->update_pending = 0;
}

/*
 * Enable a guest to become a secure VM, or test whether
 * that could be enabled.
 * Called when the KVM_CAP_PPC_SECURE_GUEST capability is
 * tested (kvm == NULL) or enabled (kvm != NULL).
 */
static int kvmhv_enable_svm(struct kvm *kvm)
{
	if (!kvmppc_uvmem_available())
		return -EINVAL;
	if (kvm)
		kvm->arch.svm_enabled = 1;
	return 0;
}

/*
 *  IOCTL handler to turn off secure mode of guest
 *
 * - Release all device pages
 * - Issue ucall to terminate the guest on the UV side
 * - Unpin the VPA pages.
 * - Reinit the partition scoped page tables
 */
static int kvmhv_svm_off(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int mmu_was_ready;
	int srcu_idx;
	int ret = 0;
	int i;

	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
		return ret;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	mmu_was_ready = kvm->arch.mmu_ready;
	if (kvm->arch.mmu_ready) {
		kvm->arch.mmu_ready = 0;
		/* order mmu_ready vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.mmu_ready = 1;
			ret = -EBUSY;
			goto out;
		}
	}

	srcu_idx = srcu_read_lock(&kvm->srcu);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct kvm_memory_slot *memslot;
		struct kvm_memslots *slots = __kvm_memslots(kvm, i);

		if (!slots)
			continue;

		kvm_for_each_memslot(memslot, slots) {
			kvmppc_uvmem_drop_pages(memslot, kvm, true);
			uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
		}
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	ret = uv_svm_terminate(kvm->arch.lpid);
	if (ret != U_SUCCESS) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * When secure guest is reset, all the guest pages are sent
	 * to UV via UV_PAGE_IN before the non-boot vcpus get a
	 * chance to run and unpin their VPA pages. Unpinning of all
	 * VPA pages is done here explicitly so that VPA pages
	 * can be migrated to the secure side.
	 *
	 * This is required for the secure SMP guest to reboot
	 * correctly.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		spin_lock(&vcpu->arch.vpa_update_lock);
		unpin_vpa_reset(kvm, &vcpu->arch.dtl);
		unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow);
		unpin_vpa_reset(kvm, &vcpu->arch.vpa);
		spin_unlock(&vcpu->arch.vpa_update_lock);
	}

	kvmppc_setup_partition_table(kvm);
	kvm->arch.secure_guest = 0;
	kvm->arch.mmu_ready = mmu_was_ready;
out:
	mutex_unlock(&kvm->arch.mmu_setup_lock);
	return ret;
}

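/*
 * Enable use of the second data address watchpoint (DAWR1/DAWRX1) for a
 * VM, or just test for the capability when kvm == NULL.
 */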
static int kvmhv_enable_dawr1(struct kvm *kvm)
{
	if (!cpu_has_feature(CPU_FTR_DAWR1))
		return -ENODEV;

	/* kvm == NULL means the caller is testing if the capability exists */
	if (kvm)
		kvm->arch.dawr1_enabled = true;
	return 0;
}

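/*
 * Return true if this host can run HPT (hash MMU) guests on an ISA v3.0
 * (POWER9) or later CPU.  This requires bare-metal hypervisor mode, and
 * is not possible on early POWER9 revisions when the host runs in radix
 * mode, because those chips cannot mix HPT and radix threads on a core.
 */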
static bool kvmppc_hash_v3_possible(void)
{
	if (!cpu_has_feature(CPU_FTR_ARCH_300))
		return false;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return false;

	/*
	 * POWER9 chips before version 2.02 can't have some threads in
	 * HPT mode and some in radix mode on the same core.
	 */
	if (radix_enabled()) {
		unsigned int pvr = mfspr(SPRN_PVR);
		if ((pvr >> 16) == PVR_POWER9 &&
		    (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
		     ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
			return false;
	}

	return true;
}

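/*
 * Table of HV-specific operations, hooked into the generic Book3S KVM
 * code through kvmppc_hv_ops when this module loads.
 */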
static struct kvmppc_ops kvm_ops_hv = {
	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
	.get_one_reg = kvmppc_get_one_reg_hv,
	.set_one_reg = kvmppc_set_one_reg_hv,
	.vcpu_load   = kvmppc_core_vcpu_load_hv,
	.vcpu_put    = kvmppc_core_vcpu_put_hv,
	.inject_interrupt = kvmppc_inject_interrupt_hv,
	.set_msr     = kvmppc_set_msr_hv,
	.vcpu_run    = kvmppc_vcpu_run_hv,
	.vcpu_create = kvmppc_core_vcpu_create_hv,
	.vcpu_free   = kvmppc_core_vcpu_free_hv,
	.check_requests = kvmppc_core_check_requests_hv,
	.get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv,
	.flush_memslot  = kvmppc_core_flush_memslot_hv,
	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
	.commit_memory_region  = kvmppc_core_commit_memory_region_hv,
	.unmap_gfn_range = kvm_unmap_gfn_range_hv,
	.age_gfn = kvm_age_gfn_hv,
	.test_age_gfn = kvm_test_age_gfn_hv,
	.set_spte_gfn = kvm_set_spte_gfn_hv,
	.free_memslot = kvmppc_core_free_memslot_hv,
	.init_vm =  kvmppc_core_init_vm_hv,
	.destroy_vm = kvmppc_core_destroy_vm_hv,
	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
	.emulate_op = kvmppc_core_emulate_op_hv,
	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
	.hcall_implemented = kvmppc_hcall_impl_hv,
#ifdef CONFIG_KVM_XICS
	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
#endif
	.configure_mmu = kvmhv_configure_mmu,
	.get_rmmu_info = kvmhv_get_rmmu_info,
	.set_smt_mode = kvmhv_set_smt_mode,
	.enable_nested = kvmhv_enable_nested,
	.load_from_eaddr = kvmhv_load_from_eaddr,
	.store_to_eaddr = kvmhv_store_to_eaddr,
	.enable_svm = kvmhv_enable_svm,
	.svm_off = kvmhv_svm_off,
	.enable_dawr1 = kvmhv_enable_dawr1,
	.hash_v3_possible = kvmppc_hash_v3_possible,
};

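/*
 * Allocate one sibling_subcore_state per core and point the paca of
 * every thread on that core at it; the threads use it to coordinate
 * with each other when the core is split into subcores.
 */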
static int kvm_init_subcore_bitmap(void)
{
	int i, j;
	int nr_cores = cpu_nr_cores();
	struct sibling_subcore_state *sibling_subcore_state;

	for (i = 0; i < nr_cores; i++) {
		int first_cpu = i * threads_per_core;
		int node = cpu_to_node(first_cpu);

		/* Ignore if it is already allocated. */
		if (paca_ptrs[first_cpu]->sibling_subcore_state)
			continue;

		sibling_subcore_state =
			kzalloc_node(sizeof(struct sibling_subcore_state),
							GFP_KERNEL, node);
		if (!sibling_subcore_state)
			return -ENOMEM;


		for (j = 0; j < threads_per_core; j++) {
			int cpu = first_cpu + j;

			paca_ptrs[cpu]->sibling_subcore_state =
						sibling_subcore_state;
		}
	}
	return 0;
}

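/*
 * Radix guests are possible only on a radix host running on POWER9 or
 * later.
 */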
static int kvmppc_radix_possible(void)
{
	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
}

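/*
 * Module init: check that this processor can run HV guests, set up
 * nested-hypervisor and subcore state, make sure we have a way to reach
 * the XICS interrupt controller on bare-metal hosts, register the HV
 * ops table, and initialize the MMU, radix and secure-guest (uvmem)
 * support.
 */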
static int kvmppc_book3s_init_hv(void)
{
	int r;

	if (!tlbie_capable) {
		pr_err("KVM-HV: Host does not support TLBIE\n");
		return -ENODEV;
	}

	/*
	 * FIXME!! Do we need to check on all cpus ?
	 */
	r = kvmppc_core_check_processor_compat_hv();
	if (r < 0)
		return -ENODEV;

	r = kvmhv_nested_init();
	if (r)
		return r;

	r = kvm_init_subcore_bitmap();
	if (r)
		return r;

	/*
	 * We need a way of accessing the XICS interrupt controller,
	 * either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or
	 * indirectly, via OPAL.
	 */
#ifdef CONFIG_SMP
	if (!xics_on_xive() && !kvmhv_on_pseries() &&
	    !local_paca->kvm_hstate.xics_phys) {
		struct device_node *np;

		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
		if (!np) {
			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
			return -ENODEV;
		}
		/* presence of intc confirmed - node can be dropped again */
		of_node_put(np);
	}
#endif

	kvm_ops_hv.owner = THIS_MODULE;
	kvmppc_hv_ops = &kvm_ops_hv;

	init_default_hcalls();

	init_vcore_lists();

	r = kvmppc_mmu_hv_init();
	if (r)
		return r;

	if (kvmppc_radix_possible())
		r = kvmppc_radix_init();
	r = kvmppc_uvmem_init();
	if (r < 0)
		pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);

	return r;
}

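/*
 * Module exit: undo the initialization done in kvmppc_book3s_init_hv().
 */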
static void kvmppc_book3s_exit_hv(void)
{
	kvmppc_uvmem_free();
	kvmppc_free_host_rm_ops();
	if (kvmppc_radix_possible())
		kvmppc_radix_exit();
	kvmppc_hv_ops = NULL;
	kvmhv_nested_exit();
}

module_init(kvmppc_book3s_init_hv);
module_exit(kvmppc_book3s_exit_hv);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");