Commit e92ae54f, authored by Paul Mackerras and committed by Linus Torvalds

[PATCH] ppc64: EEH fixes for POWER5 machines (2/2)

From: Linas Vepstas <linas@austin.ibm.com>

This patch fixes the usage of the slot-error-detail log buffer for the
Power5 architecture.  The size of the error buffer is variable, and the
correct size to use should have been obtained from firmware.  Failure to
use the correct buffer sizes will result in hard-to-debug system lockups
deep in firmware.  This patch is based on an earlier patch from Ben
Herrenschmidt, which essentially did the same thing.

This patch also tweaks some of the subroutine documentation.
Signed-off-by: Linas Vepstas <linas@linas.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 9353154e
...@@ -45,12 +45,18 @@ ...@@ -45,12 +45,18 @@
static int ibm_set_eeh_option; static int ibm_set_eeh_option;
static int ibm_set_slot_reset; static int ibm_set_slot_reset;
static int ibm_read_slot_reset_state; static int ibm_read_slot_reset_state;
static int ibm_slot_error_detail;
static int eeh_subsystem_enabled; static int eeh_subsystem_enabled;
#define EEH_MAX_OPTS 4096 #define EEH_MAX_OPTS 4096
static char *eeh_opts; static char *eeh_opts;
static int eeh_opts_last; static int eeh_opts_last;
/* Buffer for reporting slot-error-detail rtas calls */
static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
static spinlock_t slot_errbuf_lock = SPIN_LOCK_UNLOCKED;
static int eeh_error_buf_size;
/* System monitoring statistics */ /* System monitoring statistics */
static DEFINE_PER_CPU(unsigned long, total_mmio_ffs); static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
static DEFINE_PER_CPU(unsigned long, false_positives); static DEFINE_PER_CPU(unsigned long, false_positives);
...@@ -368,9 +374,6 @@ unsigned long eeh_check_failure(void *token, unsigned long val) ...@@ -368,9 +374,6 @@ unsigned long eeh_check_failure(void *token, unsigned long val)
struct device_node *dn; struct device_node *dn;
int ret; int ret;
int rets[2]; int rets[2];
static spinlock_t lock = SPIN_LOCK_UNLOCKED;
/* dont want this on the stack */
static unsigned char slot_err_buf[RTAS_ERROR_LOG_MAX];
unsigned long flags; unsigned long flags;
__get_cpu_var(total_mmio_ffs)++; __get_cpu_var(total_mmio_ffs)++;
...@@ -414,23 +417,24 @@ unsigned long eeh_check_failure(void *token, unsigned long val) ...@@ -414,23 +417,24 @@ unsigned long eeh_check_failure(void *token, unsigned long val)
BUID_LO(dn->phb->buid)); BUID_LO(dn->phb->buid));
if (ret == 0 && rets[1] == 1 && rets[0] >= 2) { if (ret == 0 && rets[1] == 1 && rets[0] >= 2) {
int slot_err_ret; int log_event;
spin_lock_irqsave(&slot_errbuf_lock, flags);
memset(slot_errbuf, 0, eeh_error_buf_size);
spin_lock_irqsave(&lock, flags); log_event = rtas_call(ibm_slot_error_detail,
memset(slot_err_buf, 0, RTAS_ERROR_LOG_MAX);
slot_err_ret = rtas_call(rtas_token("ibm,slot-error-detail"),
8, 1, NULL, dn->eeh_config_addr, 8, 1, NULL, dn->eeh_config_addr,
BUID_HI(dn->phb->buid), BUID_HI(dn->phb->buid),
BUID_LO(dn->phb->buid), NULL, 0, BUID_LO(dn->phb->buid), NULL, 0,
__pa(slot_err_buf), virt_to_phys(slot_errbuf),
RTAS_ERROR_LOG_MAX, eeh_error_buf_size,
2 /* Permanent Error */); 2 /* Permanent Error */);
if (slot_err_ret == 0) if (log_event == 0)
log_error(slot_err_buf, ERR_TYPE_RTAS_LOG, log_error(slot_errbuf, ERR_TYPE_RTAS_LOG,
1 /* Fatal */); 1 /* Fatal */);
spin_unlock_irqrestore(&lock, flags); spin_unlock_irqrestore(&slot_errbuf_lock, flags);
/* /*
* XXX We should create a separate sysctl for this. * XXX We should create a separate sysctl for this.
...@@ -517,8 +521,7 @@ static void *early_enable_eeh(struct device_node *dn, void *data) ...@@ -517,8 +521,7 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
} }
if (!enable || info->force_off) { if (!enable || info->force_off) {
dn->eeh_mode = EEH_MODE_NOCHECK; dn->eeh_mode |= EEH_MODE_NOCHECK;
return NULL;
} }
/* This device may already have an EEH parent. */ /* This device may already have an EEH parent. */
...@@ -562,14 +565,13 @@ static void *early_enable_eeh(struct device_node *dn, void *data) ...@@ -562,14 +565,13 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
* As a side effect we can determine here if eeh is supported at all. * As a side effect we can determine here if eeh is supported at all.
* Note that we leave EEH on so failed config cycles won't cause a machine * Note that we leave EEH on so failed config cycles won't cause a machine
* check. If a user turns off EEH for a particular adapter they are really * check. If a user turns off EEH for a particular adapter they are really
* telling Linux to ignore errors. * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't
* grant access to a slot if EEH isn't enabled, and so we always enable
* EEH for all slots/all devices.
* *
* We should probably distinguish between "ignore errors" and "turn EEH off" * The eeh-force-off option disables EEH checking globally, for all slots.
* but for now disabling EEH for adapters is mostly to work around drivers that * Even if force-off is set, the EEH hardware is still enabled, so that
* directly access mmio space (without using the macros). * newer systems can boot.
*
* The eeh-force-off option does literally what it says, so if Linux must
* avoid enabling EEH this must be done.
*/ */
void __init eeh_init(void) void __init eeh_init(void)
{ {
...@@ -588,10 +590,21 @@ void __init eeh_init(void) ...@@ -588,10 +590,21 @@ void __init eeh_init(void)
ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE)
return; return;
eeh_error_buf_size = rtas_token("rtas-error-log-max");
if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
eeh_error_buf_size = 1024;
}
if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated "
"buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
}
info.force_off = 0; info.force_off = 0;
if (eeh_force_off) { if (eeh_force_off) {
printk(KERN_WARNING "EEH: WARNING: PCI Enhanced I/O Error " printk(KERN_WARNING "EEH: WARNING: PCI Enhanced I/O Error "
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment