Commit 26abebb8 authored by Alex Williamson's avatar Alex Williamson Committed by David Mosberger

[PATCH] ia64: CPE & CMC polling for 2.5

Here's another feature I'd like to add to MCA support; the ability
to detect a flood of CMCs and switch to polling mode for retrieving
CMC logs.  Once no more CMC logs are found, return to and interrupt
driven handler.  If the flood threshold is never reached, the CMC
handler simply behaves as it does today.

   It's useful to get the CMC logs to know that something isn't
quite right, but if you end up with some bad memory it's too easy for
them to interfere with useful work.  I've tested this on an HP rx2600,
with a known bad DIMM.  This DIMM acts like it has a completely dead
DRAM on it.  With the current CMC handler, once I hit that range of
memory addresses, the system essentially dies, constantly handling
CMC errors.  With this patch, the system hits the threshold quickly,
but remains functional with no performance degredation once in polling
mode.  This patch applies against linux-2.4.20-ia64-021210 and 
includes:

 - Switching CMCs to polling mode at predeterimined threshold
 - If polling for CPEs, poll on all processors
 - Fix timestamp on log output
parent c461783b
...@@ -42,6 +42,10 @@ ...@@ -42,6 +42,10 @@
#include <linux/smp_lock.h> #include <linux/smp_lock.h>
#include <linux/bootmem.h> #include <linux/bootmem.h>
#include <linux/acpi.h> #include <linux/acpi.h>
#include <linux/timer.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <asm/machvec.h> #include <asm/machvec.h>
#include <asm/page.h> #include <asm/page.h>
...@@ -105,6 +109,19 @@ static struct irqaction mca_cpe_irqaction = { ...@@ -105,6 +109,19 @@ static struct irqaction mca_cpe_irqaction = {
.name = "cpe_hndlr" .name = "cpe_hndlr"
}; };
#define MAX_CPE_POLL_INTERVAL (15*60*HZ) /* 15 minutes */
#define MIN_CPE_POLL_INTERVAL (2*60*HZ) /* 2 minutes */
#define CMC_POLL_INTERVAL (1*60*HZ) /* 1 minute */
#define CMC_HISTORY_LENGTH 5
static struct timer_list cpe_poll_timer;
static struct timer_list cmc_poll_timer;
/*
* Start with this in the wrong state so we won't play w/ timers
* before the system is ready.
*/
static int cmc_polling_enabled = 1;
/* /*
* ia64_mca_log_sal_error_record * ia64_mca_log_sal_error_record
* *
...@@ -152,7 +169,7 @@ mca_handler_platform (void) ...@@ -152,7 +169,7 @@ mca_handler_platform (void)
void void
ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
{ {
IA64_MCA_DEBUG("ia64_mca_cpe_int_handler: received interrupt. vector = %#x\n", cpe_irq); IA64_MCA_DEBUG("ia64_mca_cpe_int_handler: received interrupt. CPU:%d vector = %#x\n", smp_processor_id(), cpe_irq);
/* Get the CMC error record and log it */ /* Get the CMC error record and log it */
ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE, 0); ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE, 0);
...@@ -295,6 +312,60 @@ ia64_mca_cmc_vector_setup (void) ...@@ -295,6 +312,60 @@ ia64_mca_cmc_vector_setup (void)
smp_processor_id(), ia64_get_cmcv()); smp_processor_id(), ia64_get_cmcv());
} }
/*
* ia64_mca_cmc_vector_disable
*
* Mask the corrected machine check vector register in the processor.
* This function is invoked on a per-processor basis.
*
* Inputs
* dummy(unused)
*
* Outputs
* None
*/
void
ia64_mca_cmc_vector_disable (void *dummy)
{
cmcv_reg_t cmcv;
cmcv = (cmcv_reg_t)ia64_get_cmcv();
cmcv.cmcv_mask = 1; /* Mask/disable interrupt */
ia64_set_cmcv(cmcv.cmcv_regval);
IA64_MCA_DEBUG("ia64_mca_cmc_vector_disable: CPU %d corrected "
"machine check vector %#x disabled.\n",
smp_processor_id(), cmcv.cmcv_vector);
}
/*
* ia64_mca_cmc_vector_enable
*
* Unmask the corrected machine check vector register in the processor.
* This function is invoked on a per-processor basis.
*
* Inputs
* dummy(unused)
*
* Outputs
* None
*/
void
ia64_mca_cmc_vector_enable (void *dummy)
{
cmcv_reg_t cmcv;
cmcv = (cmcv_reg_t)ia64_get_cmcv();
cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */
ia64_set_cmcv(cmcv.cmcv_regval);
IA64_MCA_DEBUG("ia64_mca_cmc_vector_enable: CPU %d corrected "
"machine check vector %#x enabled.\n",
smp_processor_id(), cmcv.cmcv_vector);
}
#if defined(MCA_TEST) #if defined(MCA_TEST)
...@@ -494,9 +565,7 @@ ia64_mca_init(void) ...@@ -494,9 +565,7 @@ ia64_mca_init(void)
setup_irq(irq, &mca_cpe_irqaction); setup_irq(irq, &mca_cpe_irqaction);
} }
ia64_mca_register_cpev(cpev); ia64_mca_register_cpev(cpev);
} else }
printk(KERN_ERR
"ia64_mca_init: Failed to get routed CPEI vector from ACPI.\n");
} }
/* Initialize the areas set aside by the OS to buffer the /* Initialize the areas set aside by the OS to buffer the
...@@ -751,11 +820,68 @@ ia64_mca_ucmc_handler(void) ...@@ -751,11 +820,68 @@ ia64_mca_ucmc_handler(void)
void void
ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs) ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs)
{ {
static unsigned long cmc_history[CMC_HISTORY_LENGTH];
static int index;
static spinlock_t cmc_history_lock = SPIN_LOCK_UNLOCKED;
IA64_MCA_DEBUG("ia64_mca_cmc_int_handler: received interrupt vector = %#x on CPU %d\n", IA64_MCA_DEBUG("ia64_mca_cmc_int_handler: received interrupt vector = %#x on CPU %d\n",
cmc_irq, smp_processor_id()); cmc_irq, smp_processor_id());
/* Get the CMC error record and log it */ /* Get the CMC error record and log it */
ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC, 0); ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC, 0);
spin_lock(&cmc_history_lock);
if (!cmc_polling_enabled) {
int i, count = 1; /* we know 1 happened now */
unsigned long now = jiffies;
for (i = 0; i < CMC_HISTORY_LENGTH; i++) {
if (now - cmc_history[i] <= HZ)
count++;
}
IA64_MCA_DEBUG(KERN_INFO "CMC threshold %d/%d\n", count, CMC_HISTORY_LENGTH);
if (count >= CMC_HISTORY_LENGTH) {
/*
* CMC threshold exceeded, clear the history
* so we have a fresh start when we return
*/
for (index = 0 ; index < CMC_HISTORY_LENGTH; index++)
cmc_history[index] = 0;
index = 0;
/* Switch to polling mode */
cmc_polling_enabled = 1;
/*
* Unlock & enable interrupts before
* smp_call_function or risk deadlock
*/
spin_unlock(&cmc_history_lock);
ia64_mca_cmc_vector_disable(NULL);
local_irq_enable();
smp_call_function(ia64_mca_cmc_vector_disable, NULL, 1, 1);
/*
* Corrected errors will still be corrected, but
* make sure there's a log somewhere that indicates
* something is generating more than we can handle.
*/
printk(KERN_WARNING "ia64_mca_cmc_int_handler: WARNING: Switching to polling CMC handler, error records may be lost\n");
mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
/* lock already released, get out now */
return;
} else {
cmc_history[index++] = now;
if (index == CMC_HISTORY_LENGTH)
index = 0;
}
}
spin_unlock(&cmc_history_lock);
} }
/* /*
...@@ -768,6 +894,7 @@ typedef struct ia64_state_log_s ...@@ -768,6 +894,7 @@ typedef struct ia64_state_log_s
{ {
spinlock_t isl_lock; spinlock_t isl_lock;
int isl_index; int isl_index;
unsigned long isl_count;
ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
} ia64_state_log_t; } ia64_state_log_t;
...@@ -784,11 +911,145 @@ static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES]; ...@@ -784,11 +911,145 @@ static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES];
#define IA64_LOG_NEXT_INDEX(it) ia64_state_log[it].isl_index #define IA64_LOG_NEXT_INDEX(it) ia64_state_log[it].isl_index
#define IA64_LOG_CURR_INDEX(it) 1 - ia64_state_log[it].isl_index #define IA64_LOG_CURR_INDEX(it) 1 - ia64_state_log[it].isl_index
#define IA64_LOG_INDEX_INC(it) \ #define IA64_LOG_INDEX_INC(it) \
ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index {ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index; \
ia64_state_log[it].isl_count++;}
#define IA64_LOG_INDEX_DEC(it) \ #define IA64_LOG_INDEX_DEC(it) \
ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index
#define IA64_LOG_NEXT_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)])) #define IA64_LOG_NEXT_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)]))
#define IA64_LOG_CURR_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)])) #define IA64_LOG_CURR_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)]))
#define IA64_LOG_COUNT(it) ia64_state_log[it].isl_count
/*
* ia64_mca_cmc_int_caller
*
* Call CMC interrupt handler, only purpose is to have a
* smp_call_function callable entry.
*
* Inputs : dummy(unused)
* Outputs : None
* */
static void
ia64_mca_cmc_int_caller(void *dummy)
{
ia64_mca_cmc_int_handler(0, NULL, NULL);
}
/*
* ia64_mca_cmc_poll
*
* Poll for Corrected Machine Checks (CMCs)
*
* Inputs : dummy(unused)
* Outputs : None
*
*/
static void
ia64_mca_cmc_poll (unsigned long dummy)
{
int start_count;
start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC);
/* Call the interrupt handler */
smp_call_function(ia64_mca_cmc_int_caller, NULL, 1, 1);
local_irq_disable();
ia64_mca_cmc_int_caller(NULL);
local_irq_enable();
/*
* If no log recored, switch out of polling mode.
*/
if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) {
printk(KERN_WARNING "ia64_mca_cmc_poll: Returning to interrupt driven CMC handler\n");
cmc_polling_enabled = 0;
smp_call_function(ia64_mca_cmc_vector_enable, NULL, 1, 1);
ia64_mca_cmc_vector_enable(NULL);
} else {
mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
}
}
/*
* ia64_mca_cpe_int_caller
*
* Call CPE interrupt handler, only purpose is to have a
* smp_call_function callable entry.
*
* Inputs : dummy(unused)
* Outputs : None
* */
static void
ia64_mca_cpe_int_caller(void *dummy)
{
ia64_mca_cpe_int_handler(0, NULL, NULL);
}
/*
* ia64_mca_cpe_poll
*
* Poll for Corrected Platform Errors (CPEs), dynamically adjust
* polling interval based on occurance of an event.
*
* Inputs : dummy(unused)
* Outputs : None
*
*/
static void
ia64_mca_cpe_poll (unsigned long dummy)
{
int start_count;
static int poll_time = MAX_CPE_POLL_INTERVAL;
start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE);
/* Call the interrupt handler */
smp_call_function(ia64_mca_cpe_int_caller, NULL, 1, 1);
local_irq_disable();
ia64_mca_cpe_int_caller(NULL);
local_irq_enable();
/*
* If a log was recorded, increase our polling frequency,
* otherwise, backoff.
*/
if (start_count != IA64_LOG_COUNT(SAL_INFO_TYPE_CPE)) {
poll_time = max(MIN_CPE_POLL_INTERVAL, poll_time/2);
} else {
poll_time = min(MAX_CPE_POLL_INTERVAL, poll_time * 2);
}
mod_timer(&cpe_poll_timer, jiffies + poll_time);
}
/*
* ia64_mca_late_init
*
* Opportunity to setup things that require initialization later
* than ia64_mca_init. Setup a timer to poll for CPEs if the
* platform doesn't support an interrupt driven mechanism.
*
* Inputs : None
* Outputs : Status
*/
static int __init
ia64_mca_late_init(void)
{
init_timer(&cmc_poll_timer);
cmc_poll_timer.function = ia64_mca_cmc_poll;
/* Reset to the correct state */
cmc_polling_enabled = 0;
init_timer(&cpe_poll_timer);
cpe_poll_timer.function = ia64_mca_cpe_poll;
/* If platform doesn't support CPEI, get the timer going. */
if (acpi_request_vector(ACPI_INTERRUPT_CPEI) < 0)
ia64_mca_cpe_poll(0UL);
return 0;
}
device_initcall(ia64_mca_late_init);
/* /*
* C portion of the OS INIT handler * C portion of the OS INIT handler
...@@ -949,7 +1210,6 @@ ia64_log_get(int sal_info_type, prfunc_t prfunc) ...@@ -949,7 +1210,6 @@ ia64_log_get(int sal_info_type, prfunc_t prfunc)
return total_len; return total_len;
} else { } else {
IA64_LOG_UNLOCK(sal_info_type); IA64_LOG_UNLOCK(sal_info_type);
prfunc("ia64_log_get: No SAL error record available for type %d\n", sal_info_type);
return 0; return 0;
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment