Commit 94675cce authored by Mahesh Salgaonkar's avatar Mahesh Salgaonkar Committed by Michael Ellerman

powerpc/pseries: Defer the logging of rtas error to irq work queue.

rtas_log_buf is a buffer to hold RTAS event data that are communicated
to kernel by hypervisor. This buffer is then used to pass RTAS event
data to user through proc fs. This buffer is allocated from
vmalloc (non-linear mapping) area.

On Machine check interrupt, register r3 points to RTAS extended event
log passed by hypervisor that contains the MCE event. The pseries
machine check handler then logs this error into rtas_log_buf. The
rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a
page fault (vector 0x300) while accessing it. Since machine check
interrupt handler runs in NMI context we can not afford to take any
page fault. Page faults are not honored in NMI context and causes
kernel panic. Apart from that, as Nick pointed out,
pSeries_log_error() also takes a spin_lock while logging error which
is not safe in NMI context. It may endup in deadlock if we get another
MCE before releasing the lock. Fix this by deferring the logging of
rtas error to irq work queue.

Current implementation uses two different buffers to hold rtas error
log depending on whether extended log is provided or not. This makes
bit difficult to identify which buffer has valid data that needs to
logged later in irq work. Simplify this using single buffer, one per
paca, and copy rtas log to it irrespective of whether extended log is
provided or not. Allocate this buffer below RMA region so that it can
be accessed in real mode mce handler.

Fixes: b96672dd ("powerpc: Machine check interrupt is a non-maskable interrupt")
Cc: stable@vger.kernel.org # v4.14+
Reviewed-by: default avatarNicholas Piggin <npiggin@gmail.com>
Signed-off-by: default avatarMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: default avatarMichael Ellerman <mpe@ellerman.id.au>
parent 74e96bf4
...@@ -247,6 +247,9 @@ struct paca_struct { ...@@ -247,6 +247,9 @@ struct paca_struct {
void *rfi_flush_fallback_area; void *rfi_flush_fallback_area;
u64 l1d_flush_size; u64 l1d_flush_size;
#endif #endif
#ifdef CONFIG_PPC_PSERIES
u8 *mce_data_buf; /* buffer to hold per cpu rtas errlog */
#endif /* CONFIG_PPC_PSERIES */
} ____cacheline_aligned; } ____cacheline_aligned;
extern void copy_mm_to_paca(struct mm_struct *mm); extern void copy_mm_to_paca(struct mm_struct *mm);
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <linux/of.h> #include <linux/of.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/reboot.h> #include <linux/reboot.h>
#include <linux/irq_work.h>
#include <asm/machdep.h> #include <asm/machdep.h>
#include <asm/rtas.h> #include <asm/rtas.h>
...@@ -32,11 +33,13 @@ ...@@ -32,11 +33,13 @@
static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(ras_log_buf_lock); static DEFINE_SPINLOCK(ras_log_buf_lock);
static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_PER_CPU(__u64, mce_data_buf);
static int ras_check_exception_token; static int ras_check_exception_token;
static void mce_process_errlog_event(struct irq_work *work);
static struct irq_work mce_errlog_process_work = {
.func = mce_process_errlog_event,
};
#define EPOW_SENSOR_TOKEN 9 #define EPOW_SENSOR_TOKEN 9
#define EPOW_SENSOR_INDEX 0 #define EPOW_SENSOR_INDEX 0
...@@ -330,16 +333,20 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) ...@@ -330,16 +333,20 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
static inline struct rtas_error_log *fwnmi_get_errlog(void)
{
return (struct rtas_error_log *)local_paca->mce_data_buf;
}
/* /*
* Get the error information for errors coming through the * Get the error information for errors coming through the
* FWNMI vectors. The pt_regs' r3 will be updated to reflect * FWNMI vectors. The pt_regs' r3 will be updated to reflect
* the actual r3 if possible, and a ptr to the error log entry * the actual r3 if possible, and a ptr to the error log entry
* will be returned if found. * will be returned if found.
* *
* If the RTAS error is not of the extended type, then we put it in a per * Use one buffer mce_data_buf per cpu to store RTAS error.
* cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
* *
* The global_mce_data_buf does not have any locks or protection around it, * The mce_data_buf does not have any locks or protection around it,
* if a second machine check comes in, or a system reset is done * if a second machine check comes in, or a system reset is done
* before we have logged the error, then we will get corruption in the * before we have logged the error, then we will get corruption in the
* error log. This is preferable over holding off on calling * error log. This is preferable over holding off on calling
...@@ -349,7 +356,7 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) ...@@ -349,7 +356,7 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
{ {
unsigned long *savep; unsigned long *savep;
struct rtas_error_log *h, *errhdr = NULL; struct rtas_error_log *h;
/* Mask top two bits */ /* Mask top two bits */
regs->gpr[3] &= ~(0x3UL << 62); regs->gpr[3] &= ~(0x3UL << 62);
...@@ -362,22 +369,20 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) ...@@ -362,22 +369,20 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
savep = __va(regs->gpr[3]); savep = __va(regs->gpr[3]);
regs->gpr[3] = savep[0]; /* restore original r3 */ regs->gpr[3] = savep[0]; /* restore original r3 */
/* If it isn't an extended log we can use the per cpu 64bit buffer */
h = (struct rtas_error_log *)&savep[1]; h = (struct rtas_error_log *)&savep[1];
/* Use the per cpu buffer from paca to store rtas error log */
memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
if (!rtas_error_extended(h)) { if (!rtas_error_extended(h)) {
memcpy(this_cpu_ptr(&mce_data_buf), h, sizeof(__u64)); memcpy(local_paca->mce_data_buf, h, sizeof(__u64));
errhdr = (struct rtas_error_log *)this_cpu_ptr(&mce_data_buf);
} else { } else {
int len, error_log_length; int len, error_log_length;
error_log_length = 8 + rtas_error_extended_log_length(h); error_log_length = 8 + rtas_error_extended_log_length(h);
len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); memcpy(local_paca->mce_data_buf, h, len);
memcpy(global_mce_data_buf, h, len);
errhdr = (struct rtas_error_log *)global_mce_data_buf;
} }
return errhdr; return (struct rtas_error_log *)local_paca->mce_data_buf;
} }
/* Call this when done with the data returned by FWNMI_get_errinfo. /* Call this when done with the data returned by FWNMI_get_errinfo.
...@@ -422,6 +427,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) ...@@ -422,6 +427,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */ return 0; /* need to perform reset */
} }
/*
* Process MCE rtas errlog event.
*/
static void mce_process_errlog_event(struct irq_work *work)
{
struct rtas_error_log *err;
err = fwnmi_get_errlog();
log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
}
/* /*
* See if we can recover from a machine check exception. * See if we can recover from a machine check exception.
* This is only called on power4 (or above) and only via * This is only called on power4 (or above) and only via
...@@ -466,7 +482,8 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) ...@@ -466,7 +482,8 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
recovered = 1; recovered = 1;
} }
log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); /* Queue irq work to log this rtas event later. */
irq_work_queue(&mce_errlog_process_work);
return recovered; return recovered;
} }
......
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include <linux/root_dev.h> #include <linux/root_dev.h>
#include <linux/of.h> #include <linux/of.h>
#include <linux/of_pci.h> #include <linux/of_pci.h>
#include <linux/memblock.h>
#include <asm/mmu.h> #include <asm/mmu.h>
#include <asm/processor.h> #include <asm/processor.h>
...@@ -102,6 +103,9 @@ static void pSeries_show_cpuinfo(struct seq_file *m) ...@@ -102,6 +103,9 @@ static void pSeries_show_cpuinfo(struct seq_file *m)
static void __init fwnmi_init(void) static void __init fwnmi_init(void)
{ {
unsigned long system_reset_addr, machine_check_addr; unsigned long system_reset_addr, machine_check_addr;
u8 *mce_data_buf;
unsigned int i;
int nr_cpus = num_possible_cpus();
int ibm_nmi_register = rtas_token("ibm,nmi-register"); int ibm_nmi_register = rtas_token("ibm,nmi-register");
if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE) if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
...@@ -115,6 +119,18 @@ static void __init fwnmi_init(void) ...@@ -115,6 +119,18 @@ static void __init fwnmi_init(void)
if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr, if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
machine_check_addr)) machine_check_addr))
fwnmi_active = 1; fwnmi_active = 1;
/*
* Allocate a chunk for per cpu buffer to hold rtas errorlog.
* It will be used in real mode mce handler, hence it needs to be
* below RMA.
*/
mce_data_buf = __va(memblock_alloc_base(RTAS_ERROR_LOG_MAX * nr_cpus,
RTAS_ERROR_LOG_MAX, ppc64_rma_size));
for_each_possible_cpu(i) {
paca_ptrs[i]->mce_data_buf = mce_data_buf +
(RTAS_ERROR_LOG_MAX * i);
}
} }
static void pseries_8259_cascade(struct irq_desc *desc) static void pseries_8259_cascade(struct irq_desc *desc)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment