Commit 5c1344e9 authored by Linas Vepstas, committed by Paul Mackerras

[PATCH] ppc64: escape hatch for spinning interrupt deadlocks

08-eeh-spin-counter.patch

Once an EEH event is triggered, all further I/O to a device is blocked (until
reset).  Bad device drivers may end up spinning in their interrupt handlers,
trying to read an interrupt status register that will never change state.
This patch moves that spin counter to a per-device structure, and adds
some diagnostic prints to help locate the bad driver.
Signed-off-by: Linas Vepstas <linas@linas.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
parent fd761fd8
...@@ -78,14 +78,12 @@ DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL); ...@@ -78,14 +78,12 @@ DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL);
static struct notifier_block *eeh_notifier_chain; static struct notifier_block *eeh_notifier_chain;
/* /* If a device driver keeps reading an MMIO register in an interrupt
* If a device driver keeps reading an MMIO register in an interrupt
* handler after a slot isolation event has occurred, we assume it * handler after a slot isolation event has occurred, we assume it
* is broken and panic. This sets the threshold for how many read * is broken and panic. This sets the threshold for how many read
* attempts we allow before panicking. * attempts we allow before panicking.
*/ */
#define EEH_MAX_FAILS 1000 #define EEH_MAX_FAILS 100000
static atomic_t eeh_fail_count;
/* RTAS tokens */ /* RTAS tokens */
static int ibm_set_eeh_option; static int ibm_set_eeh_option;
...@@ -521,7 +519,6 @@ static void eeh_event_handler(void *dummy) ...@@ -521,7 +519,6 @@ static void eeh_event_handler(void *dummy)
"%s\n", event->reset_state, "%s\n", event->reset_state,
pci_name(event->dev)); pci_name(event->dev));
atomic_set(&eeh_fail_count, 0);
notifier_call_chain (&eeh_notifier_chain, notifier_call_chain (&eeh_notifier_chain,
EEH_NOTIFY_FREEZE, event); EEH_NOTIFY_FREEZE, event);
...@@ -657,12 +654,18 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) ...@@ -657,12 +654,18 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
spin_lock_irqsave(&confirm_error_lock, flags); spin_lock_irqsave(&confirm_error_lock, flags);
rc = 1; rc = 1;
if (pdn->eeh_mode & EEH_MODE_ISOLATED) { if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
atomic_inc(&eeh_fail_count); pdn->eeh_check_count ++;
if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) { if (pdn->eeh_check_count >= EEH_MAX_FAILS) {
printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n",
pdn->eeh_check_count);
dump_stack();
/* re-read the slot reset state */ /* re-read the slot reset state */
if (read_slot_reset_state(pdn, rets) != 0) if (read_slot_reset_state(pdn, rets) != 0)
rets[0] = -1; /* reset state unknown */ rets[0] = -1; /* reset state unknown */
eeh_panic(dev, rets[0]);
/* If we are here, then we hit an infinite loop. Stop. */
panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev));
} }
goto dn_unlock; goto dn_unlock;
} }
...@@ -808,6 +811,8 @@ static void *early_enable_eeh(struct device_node *dn, void *data) ...@@ -808,6 +811,8 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
struct pci_dn *pdn = PCI_DN(dn); struct pci_dn *pdn = PCI_DN(dn);
pdn->eeh_mode = 0; pdn->eeh_mode = 0;
pdn->eeh_check_count = 0;
pdn->eeh_freeze_count = 0;
if (status && strcmp(status, "ok") != 0) if (status && strcmp(status, "ok") != 0)
return NULL; /* ignore devices with bad status */ return NULL; /* ignore devices with bad status */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment