Commit 243e2511 authored by Benjamin Herrenschmidt's avatar Benjamin Herrenschmidt Committed by Michael Ellerman

powerpc/xive: Native exploitation of the XIVE interrupt controller

The XIVE interrupt controller is the new interrupt controller
found in POWER9. It supports advanced virtualization capabilities
among other things.

Currently we use a set of firmware calls that simulate the old
"XICS" interrupt controller but this is fairly inefficient.

This adds the framework for using XIVE along with a native
backend which OPAL for configuration. Later, a backend allowing
the use in a KVM or PowerVM guest will also be provided.

This disables some fast path for interrupts in KVM when XIVE is
enabled as these rely on the firmware emulation code which is no
longer available when the XIVE is used natively by Linux.

A latter patch will make KVM also directly exploit the XIVE, thus
recovering the lost performance (and more).
Signed-off-by: default avatarBenjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Fixup pr_xxx("XIVE:"...), don't split pr_xxx() strings,
 tweak Kconfig so XIVE_NATIVE selects XIVE and depends on POWERNV,
 fix build errors when SMP=n, fold in fixes from Ben:
   Don't call cpu_online() on an invalid CPU number
   Fix irq target selection returning out of bounds cpu#
   Extra sanity checks on cpu numbers
 ]
Signed-off-by: default avatarMichael Ellerman <mpe@ellerman.id.au>
parent a978e139
/*
* Copyright 2016,2017 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ASM_POWERPC_XIVE_REGS_H
#define _ASM_POWERPC_XIVE_REGS_H
/*
* Thread Management (aka "TM") registers
*/
/* TM register offsets */
#define TM_QW0_USER 0x000 /* All rings */
#define TM_QW1_OS 0x010 /* Ring 0..2 */
#define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */
#define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */
/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */
#define TM_NSR 0x0 /* + + - + */
#define TM_CPPR 0x1 /* - + - + */
#define TM_IPB 0x2 /* - + + + */
#define TM_LSMFB 0x3 /* - + + + */
#define TM_ACK_CNT 0x4 /* - + - - */
#define TM_INC 0x5 /* - + - + */
#define TM_AGE 0x6 /* - + - + */
#define TM_PIPR 0x7 /* - + - + */
#define TM_WORD0 0x0
#define TM_WORD1 0x4
/*
* QW word 2 contains the valid bit at the top and other fields
* depending on the QW.
*/
#define TM_WORD2 0x8
#define TM_QW0W2_VU PPC_BIT32(0)
#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ?
#define TM_QW1W2_VO PPC_BIT32(0)
#define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31)
#define TM_QW2W2_VP PPC_BIT32(0)
#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31)
#define TM_QW3W2_VT PPC_BIT32(0)
#define TM_QW3W2_LP PPC_BIT32(6)
#define TM_QW3W2_LE PPC_BIT32(7)
#define TM_QW3W2_T PPC_BIT32(31)
/*
* In addition to normal loads to "peek" and writes (only when invalid)
* using 4 and 8 bytes accesses, the above registers support these
* "special" byte operations:
*
* - Byte load from QW0[NSR] - User level NSR (EBB)
* - Byte store to QW0[NSR] - User level NSR (EBB)
* - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
* - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
* otherwise VT||0000000
* - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
*
* Then we have all these "special" CI ops at these offset that trigger
* all sorts of side effects:
*/
#define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/
#define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */
#define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user context */
#define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user context */
#define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */
#define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS context to reg */
#define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate Pool context to reg*/
#define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */
#define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd line */
#define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */
#define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even line */
#define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */
/* XXX more... */
/* NSR fields for the various QW ack types */
#define TM_QW0_NSR_EB PPC_BIT8(0)
#define TM_QW1_NSR_EO PPC_BIT8(0)
#define TM_QW3_NSR_HE PPC_BITMASK8(0,1)
#define TM_QW3_NSR_HE_NONE 0
#define TM_QW3_NSR_HE_POOL 1
#define TM_QW3_NSR_HE_PHYS 2
#define TM_QW3_NSR_HE_LSI 3
#define TM_QW3_NSR_I PPC_BIT8(2)
#define TM_QW3_NSR_GRP_LVL PPC_BIT8(3,7)
/* Utilities to manipulate these (originaly from OPAL) */
#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1)
#define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m))
#define SETFIELD(m, v, val) \
(((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
#endif /* _ASM_POWERPC_XIVE_REGS_H */
/*
* Copyright 2016,2017 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ASM_POWERPC_XIVE_H
#define _ASM_POWERPC_XIVE_H
#define XIVE_INVALID_VP 0xffffffff
#ifdef CONFIG_PPC_XIVE
/*
* Thread Interrupt Management Area (TIMA)
*
* This is a global MMIO region divided in 4 pages of varying access
* permissions, providing access to per-cpu interrupt management
* functions. It always identifies the CPU doing the access based
* on the PowerBus initiator ID, thus we always access via the
* same offset regardless of where the code is executing
*/
extern void __iomem *xive_tima;
/*
* Offset in the TM area of our current execution level (provided by
* the backend)
*/
extern u32 xive_tima_offset;
/*
* Per-irq data (irq_get_handler_data for normal IRQs), IPIs
* have it stored in the xive_cpu structure. We also cache
* for normal interrupts the current target CPU.
*
* This structure is setup by the backend for each interrupt.
*/
struct xive_irq_data {
u64 flags;
u64 eoi_page;
void __iomem *eoi_mmio;
u64 trig_page;
void __iomem *trig_mmio;
u32 esb_shift;
int src_chip;
/* Setup/used by frontend */
int target;
bool saved_p;
};
#define XIVE_IRQ_FLAG_STORE_EOI 0x01
#define XIVE_IRQ_FLAG_LSI 0x02
#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04
#define XIVE_IRQ_FLAG_MASK_FW 0x08
#define XIVE_IRQ_FLAG_EOI_FW 0x10
#define XIVE_INVALID_CHIP_ID -1
/* A queue tracking structure in a CPU */
struct xive_q {
__be32 *qpage;
u32 msk;
u32 idx;
u32 toggle;
u64 eoi_phys;
u32 esc_irq;
atomic_t count;
atomic_t pending_count;
};
/*
* "magic" Event State Buffer (ESB) MMIO offsets.
*
* Each interrupt source has a 2-bit state machine called ESB
* which can be controlled by MMIO. It's made of 2 bits, P and
* Q. P indicates that an interrupt is pending (has been sent
* to a queue and is waiting for an EOI). Q indicates that the
* interrupt has been triggered while pending.
*
* This acts as a coalescing mechanism in order to guarantee
* that a given interrupt only occurs at most once in a queue.
*
* When doing an EOI, the Q bit will indicate if the interrupt
* needs to be re-triggered.
*
* The following offsets into the ESB MMIO allow to read or
* manipulate the PQ bits. They must be used with an 8-bytes
* load instruction. They all return the previous state of the
* interrupt (atomically).
*
* Additionally, some ESB pages support doing an EOI via a
* store at 0 and some ESBs support doing a trigger via a
* separate trigger page.
*/
#define XIVE_ESB_GET 0x800
#define XIVE_ESB_SET_PQ_00 0xc00
#define XIVE_ESB_SET_PQ_01 0xd00
#define XIVE_ESB_SET_PQ_10 0xe00
#define XIVE_ESB_SET_PQ_11 0xf00
#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01
#define XIVE_ESB_VAL_P 0x2
#define XIVE_ESB_VAL_Q 0x1
/* Global enable flags for the XIVE support */
extern bool __xive_enabled;
static inline bool xive_enabled(void) { return __xive_enabled; }
extern bool xive_native_init(void);
extern void xive_smp_probe(void);
extern int xive_smp_prepare_cpu(unsigned int cpu);
extern void xive_smp_setup_cpu(void);
extern void xive_smp_disable_cpu(void);
extern void xive_kexec_teardown_cpu(int secondary);
extern void xive_shutdown(void);
extern void xive_flush_interrupt(void);
/* xmon hook */
extern void xmon_xive_do_dump(int cpu);
/* APIs used by KVM */
extern u32 xive_native_default_eq_shift(void);
extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
extern void xive_native_free_vp_block(u32 vp_base);
extern int xive_native_populate_irq_data(u32 hw_irq,
struct xive_irq_data *data);
extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
extern u32 xive_native_alloc_irq(void);
extern void xive_native_free_irq(u32 irq);
extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
__be32 *qpage, u32 order, bool can_escalate);
extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
extern bool __xive_irq_trigger(struct xive_irq_data *xd);
extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
extern bool is_xive_irq(struct irq_chip *chip);
#else
static inline bool xive_enabled(void) { return false; }
static inline bool xive_native_init(void) { return false; }
static inline void xive_smp_probe(void) { }
extern inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
static inline void xive_smp_setup_cpu(void) { }
static inline void xive_smp_disable_cpu(void) { }
static inline void xive_kexec_teardown_cpu(int secondary) { }
static inline void xive_shutdown(void) { }
static inline void xive_flush_interrupt(void) { }
static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
static inline void xive_native_free_vp_block(u32 vp_base) { }
#endif
#endif /* _ASM_POWERPC_XIVE_H */
......@@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
extern int cpus_are_in_xmon(void);
#endif
extern void xmon_printf(const char *format, ...);
#endif /* __KERNEL __ */
#endif /* __ASM_POWERPC_XMON_H */
......@@ -23,6 +23,7 @@
#include <asm/kvm_book3s.h>
#include <asm/archrandom.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/dbell.h>
#include <asm/cputhreads.h>
#include <asm/io.h>
......@@ -224,6 +225,10 @@ void kvmhv_rm_send_ipi(int cpu)
return;
}
/* We should never reach this */
if (WARN_ON_ONCE(xive_enabled()))
return;
/* Else poke the target with an IPI */
xics_phys = paca[cpu].kvm_hstate.xics_phys;
if (xics_phys)
......@@ -386,6 +391,9 @@ long kvmppc_read_intr(void)
long rc;
bool again;
if (xive_enabled())
return 1;
do {
again = false;
rc = kvmppc_read_one_intr(&again);
......
......@@ -4,6 +4,7 @@ config PPC_POWERNV
select PPC_NATIVE
select PPC_XICS
select PPC_ICP_NATIVE
select PPC_XIVE_NATIVE
select PPC_P7_NAP
select PCI
select PCI_MSI
......
......@@ -32,6 +32,7 @@
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/kexec.h>
#include <asm/smp.h>
......@@ -76,7 +77,9 @@ static void __init pnv_init(void)
static void __init pnv_init_IRQ(void)
{
xics_init();
/* Try using a XIVE if available, otherwise use a XICS */
if (!xive_native_init())
xics_init();
WARN_ON(!ppc_md.get_irq);
}
......@@ -218,10 +221,12 @@ static void pnv_kexec_wait_secondaries_down(void)
static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
{
xics_kexec_teardown_cpu(secondary);
if (xive_enabled())
xive_kexec_teardown_cpu(secondary);
else
xics_kexec_teardown_cpu(secondary);
/* On OPAL, we return all CPUs to firmware */
if (!firmware_has_feature(FW_FEATURE_OPAL))
return;
......@@ -237,6 +242,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
/* Primary waits for the secondaries to have reached OPAL */
pnv_kexec_wait_secondaries_down();
/* Switch XIVE back to emulation mode */
if (xive_enabled())
xive_shutdown();
/*
* We might be running as little-endian - now that interrupts
* are disabled, reset the HILE bit to big-endian so we don't
......
......@@ -29,6 +29,7 @@
#include <asm/vdso_datapage.h>
#include <asm/cputhreads.h>
#include <asm/xics.h>
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/runlatch.h>
#include <asm/code-patching.h>
......@@ -47,7 +48,9 @@
static void pnv_smp_setup_cpu(int cpu)
{
if (cpu != boot_cpuid)
if (xive_enabled())
xive_smp_setup_cpu();
else if (cpu != boot_cpuid)
xics_setup_cpu();
#ifdef CONFIG_PPC_DOORBELL
......@@ -132,7 +135,10 @@ static int pnv_smp_cpu_disable(void)
vdso_data->processorCount--;
if (cpu == boot_cpuid)
boot_cpuid = cpumask_any(cpu_online_mask);
xics_migrate_irqs_away();
if (xive_enabled())
xive_smp_disable_cpu();
else
xics_migrate_irqs_away();
return 0;
}
......@@ -213,9 +219,12 @@ static void pnv_smp_cpu_kill_self(void)
if (((srr1 & wmask) == SRR1_WAKEEE) ||
((srr1 & wmask) == SRR1_WAKEHVI) ||
(local_paca->irq_happened & PACA_IRQ_EE)) {
if (cpu_has_feature(CPU_FTR_ARCH_300))
icp_opal_flush_interrupt();
else
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
if (xive_enabled())
xive_flush_interrupt();
else
icp_opal_flush_interrupt();
} else
icp_native_flush_interrupt();
} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
......@@ -252,10 +261,26 @@ static int pnv_cpu_bootable(unsigned int nr)
return smp_generic_cpu_bootable(nr);
}
static int pnv_smp_prepare_cpu(int cpu)
{
if (xive_enabled())
return xive_smp_prepare_cpu(cpu);
return 0;
}
static void __init pnv_smp_probe(void)
{
if (xive_enabled())
xive_smp_probe();
else
xics_smp_probe();
}
static struct smp_ops_t pnv_smp_ops = {
.message_pass = smp_muxed_ipi_message_pass,
.cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */
.probe = xics_smp_probe,
.cause_ipi = NULL, /* Filled at runtime by xi{cs,ve}_smp_probe() */
.probe = pnv_smp_probe,
.prepare_cpu = pnv_smp_prepare_cpu,
.kick_cpu = pnv_smp_kick_cpu,
.setup_cpu = pnv_smp_setup_cpu,
.cpu_bootable = pnv_cpu_bootable,
......
......@@ -28,6 +28,7 @@ config PPC_MSI_BITMAP
default y if PPC_POWERNV
source "arch/powerpc/sysdev/xics/Kconfig"
source "arch/powerpc/sysdev/xive/Kconfig"
config PPC_SCOM
bool
......
......@@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
obj-$(CONFIG_PPC_XICS) += xics/
obj-$(CONFIG_PPC_XIVE) += xive/
obj-$(CONFIG_GE_FPGA) += ge/
config PPC_XIVE
bool
default n
select PPC_SMP_MUXED_IPI
select HARDIRQS_SW_RESEND
config PPC_XIVE_NATIVE
bool
default n
select PPC_XIVE
depends on PPC_POWERNV
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
obj-y += common.o
obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o
This diff is collapsed.
This diff is collapsed.
/*
* Copyright 2016,2017 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef __XIVE_INTERNAL_H
#define __XIVE_INTERNAL_H
/* Each CPU carry one of these with various per-CPU state */
struct xive_cpu {
#ifdef CONFIG_SMP
/* HW irq number and data of IPI */
u32 hw_ipi;
struct xive_irq_data ipi_data;
#endif /* CONFIG_SMP */
int chip_id;
/* Queue datas. Only one is populated */
#define XIVE_MAX_QUEUES 8
struct xive_q queue[XIVE_MAX_QUEUES];
/*
* Pending mask. Each bit corresponds to a priority that
* potentially has pending interrupts.
*/
u8 pending_prio;
/* Cache of HW CPPR */
u8 cppr;
};
/* Backend ops */
struct xive_ops {
int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
bool (*match)(struct device_node *np);
void (*shutdown)(void);
void (*update_pending)(struct xive_cpu *xc);
void (*eoi)(u32 hw_irq);
void (*sync_source)(u32 hw_irq);
#ifdef CONFIG_SMP
int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc);
void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc);
#endif
const char *name;
};
bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
u8 max_prio);
extern bool xive_cmdline_disabled;
#endif /* __XIVE_INTERNAL_H */
......@@ -30,6 +30,7 @@
#include <linux/ctype.h>
#include <asm/ptrace.h>
#include <asm/smp.h>
#include <asm/string.h>
#include <asm/prom.h>
#include <asm/machdep.h>
......@@ -48,7 +49,7 @@
#include <asm/reg.h>
#include <asm/debug.h>
#include <asm/hw_breakpoint.h>
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/firmware.h>
......@@ -232,7 +233,13 @@ Commands:\n\
"\
dr dump stream of raw bytes\n\
dt dump the tracing buffers (uses printk)\n\
e print exception information\n\
"
#ifdef CONFIG_PPC_POWERNV
" dx# dump xive on CPU #\n\
dxi# dump xive irq state #\n\
dxa dump xive on all CPUs\n"
#endif
" e print exception information\n\
f flush cache\n\
la lookup symbol+offset of specified address\n\
ls lookup address of specified symbol\n\
......@@ -2338,6 +2345,81 @@ static void dump_pacas(void)
}
#endif
#ifdef CONFIG_PPC_POWERNV
static void dump_one_xive(int cpu)
{
unsigned int hwid = get_hard_smp_processor_id(cpu);
opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
opal_xive_dump(XIVE_DUMP_VP, hwid);
opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
if (setjmp(bus_error_jmp) != 0) {
catch_memory_errors = 0;
printf("*** Error dumping xive on cpu %d\n", cpu);
return;
}
catch_memory_errors = 1;
sync();
xmon_xive_do_dump(cpu);
sync();
__delay(200);
catch_memory_errors = 0;
}
static void dump_all_xives(void)
{
int cpu;
if (num_possible_cpus() == 0) {
printf("No possible cpus, use 'dx #' to dump individual cpus\n");
return;
}
for_each_possible_cpu(cpu)
dump_one_xive(cpu);
}
static void dump_one_xive_irq(u32 num)
{
s64 rc;
__be64 vp;
u8 prio;
__be32 lirq;
rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq);
xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n",
num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc);
}
static void dump_xives(void)
{
unsigned long num;
int c;
c = inchar();
if (c == 'a') {
dump_all_xives();
return;
} else if (c == 'i') {
if (scanhex(&num))
dump_one_xive_irq(num);
return;
}
termch = c; /* Put c back, it wasn't 'a' */
if (scanhex(&num))
dump_one_xive(num);
else
dump_one_xive(xmon_owner);
}
#endif /* CONFIG_PPC_POWERNV */
static void dump_by_size(unsigned long addr, long count, int size)
{
unsigned char temp[16];
......@@ -2386,6 +2468,14 @@ dump(void)
return;
}
#endif
#ifdef CONFIG_PPC_POWERNV
if (c == 'x') {
xmon_start_pagination();
dump_xives();
xmon_end_pagination();
return;
}
#endif
if (c == '\n')
termch = c;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment