Commit 047379fb authored by Andi Kleen's avatar Andi Kleen Committed by Linus Torvalds

[PATCH] New machine check handler for x86-64

This adds a new completely rewritten machine check handler for x86-64.
The old one never worked on 2.6.

The new handler has many improvements. It closely follows the Intel and AMD
recommendations on MCE handlers now (the old one had many violations). It handles
unrecoverable errors in user space better now - when possible it will only kill
the affected process instead of panicking.

This one is CPU independent now - it should work on any CPU that supports the standard
x86 MCA architecture.

This new handler only logs fatal errors that lead to kernel panic to the console.
Non fatal errors are logged race free into a new (non ring) buffer now
and supplied to the user using a new character device.  The old one could
deadlock on console and printk locks. This also separates machine check errors
from real kernel errors better. The new buffer has been also designed to
be easily accessible from external debugging tools: it has a signature
and could be even recovered after reboot. It is not organized as a ring buffer -
this means the first errors are kept unless explicitly cleared.

The new error formats can be parsed using ftp://ftp.suse.com/pub/people/ak/x86-64/mcelog.c
The new character device for it can be created with mknod /dev/mcelog c 10 227

There is a new sysfs interface to configure the machine check handler.
It has a "tolerant" parameter that defines the aggressiveness of the machine check:

0: always panic
1: panic if deadlock possible (e.g. MCE happened in the kernel)
2: try to avoid panic

Default is 2

Despite having more features, the new handler is shorter.
parent 386eaf87
...@@ -5,19 +5,12 @@ only the AMD64 specific ones are listed here. ...@@ -5,19 +5,12 @@ only the AMD64 specific ones are listed here.
Machine check Machine check
(see the Opteron BIOS&Kernel manual for more details on the banks etc.)
mce=off disable machine check mce=off disable machine check
mce=nok8 disable k8 specific features
mce=disable<NUMBER> disable bank NUMBER
mce=enable<NUMBER> enable bank number
mce=device Enable more machine check options in Northbridge.
Can be useful for device driver debugging.
mce=NUMBER mcheck timer interval number seconds.
Can be also comma separated in a single mce=
nomce (for compatibility with i386): same as mce=off nomce (for compatibility with i386): same as mce=off
Everything else is in sysfs now.
APICs APICs
apic Use IO-APIC. Default apic Use IO-APIC. Default
......
...@@ -7,7 +7,8 @@ EXTRA_AFLAGS := -traditional ...@@ -7,7 +7,8 @@ EXTRA_AFLAGS := -traditional
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_x86_64.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_x86_64.o \
x8664_ksyms.o i387.o syscall.o vsyscall.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \
setup64.o bluesmoke.o bootflag.o e820.o reboot.o warmreboot.o setup64.o bootflag.o e820.o reboot.o warmreboot.o
obj-y += mce.o
obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_ACPI) += acpi/
......
/*
* Machine check handler.
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
*/
#include <linux/config.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/kdebug.h>
#include <linux/pci.h>
#include <linux/timer.h>
/* Set by "mce=off"/"nomce" on the command line; __initdata because it is
   only consulted during boot-time setup. */
static int mce_disabled __initdata;
/* Bitmap of CPUs that already ran mcheck_init(), so per-CPU setup runs once. */
static unsigned long mce_cpus;
/*
 * Machine Check Handler For PII/PIII/K7
 */
/* Number of MCA banks reported in the low byte of MCG_CAP. */
static int banks;
/* Per-bank bitmasks: ignored_banks suppresses reporting for a bank,
   disabled_banks leaves the bank's MCi_CTL at 0 (mce=disable<N>). */
static unsigned long ignored_banks, disabled_banks;
/*
 * Generic (PPro-style) machine check handler, used on Intel and as the
 * fallback on AMD.  Dumps MCG/MCi state to the console, clears the banks,
 * and panics if the error is not recoverable.
 * regs == NULL when called from the poll timer instead of the exception.
 */
static void generic_machine_check(struct pt_regs * regs, long error_code)
{
	int recover=1;		/* pessimistic: assume we cannot continue */
	u32 alow, ahigh, high, low;
	u32 mcgstl, mcgsth;
	int i;

	preempt_disable();
	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
	if(mcgstl&(1<<0))	/* Recoverable ? (MCG_STATUS.RIPV set) */
		recover=0;

	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl);

	/* Bit 1 (EIPV): the saved RIP is related to the error, so print it. */
	if (regs && (mcgstl & 2))
		printk(KERN_EMERG "RIP <%02lx>:%016lx RSP %016lx\n",
		       regs->cs, regs->rip, regs->rsp);

	for(i=0;i<banks;i++)
	{
		if ((1UL<<i) & ignored_banks)
			continue;

		rdmsr(MSR_IA32_MC0_STATUS+i*4,low, high);
		if(high&(1<<31))	/* bit 63 of MCi_STATUS: valid error */
		{
			if(high&(1<<29))	/* bit 61: uncorrected */
				recover|=1;
			if(high&(1<<25))	/* bit 57: processor context corrupt */
				recover|=2;
			printk(KERN_EMERG "Bank %d: %08x%08x", i, high, low);
			high&=~(1<<31);
			if(high&(1<<27))	/* bit 59: MCi_MISC valid */
			{
				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
				printk("[%08x%08x]", alow, ahigh);
			}
			if(high&(1<<26))	/* bit 58: MCi_ADDR valid */
			{
				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
				printk(" at %08x%08x",
				       ahigh, alow);
			}
			printk("\n");
			/* Clear it */
			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
			/* Serialize */
			wmb();
		}
	}

	if(recover&2)
		panic("CPU context corrupt");
	if(recover&1)
		panic("Unable to continue");
	printk(KERN_EMERG "Attempting to continue.\n");
	/* Clear MCIP so a future machine check does not cause a shutdown. */
	mcgstl&=~(1<<2);
	wrmsr(MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
	preempt_enable();
}
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
printk("unexpected machine check %lx\n", error_code);
}
/*
 * Call the installed machine check handler for this CPU setup.
 */

/* Vendor-specific handler; replaced during *_mcheck_init(). */
static void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;

/* Entry point from the machine check exception stub. */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	/* Let die-chain users (debuggers etc.) observe the event first. */
	notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	machine_check_vector(regs, error_code);
}
/*
 * K8 machine check.
 */

/* Locate the northbridge PCI function belonging to the current CPU.
   On K8 the northbridge of CPU n sits on bus 0, device 24+n. */
static struct pci_dev *find_k8_nb(void)
{
	unsigned slot = 24U + smp_processor_id();
	struct pci_dev *dev = NULL;

	for (;;) {
		dev = pci_find_device(PCI_VENDOR_ID_AMD, 0x1103, dev);
		if (dev == NULL)
			return NULL;
		if (dev->bus->number == 0 && PCI_SLOT(dev->devfn) == slot)
			return dev;
	}
}
/* When we have kallsyms we can afford kmcedecode too. */

/* Transaction type, indexed by errcode bits 3:2 (2-bit field). */
static char *transaction[] = {
	"instruction", "data", "generic", "reserved"
};
/* Cache level, indexed by errcode bits 1:0 (2-bit field). */
static char *cachelevel[] = {
	"level 0", "level 1", "level 2", "level generic"
};
/* Memory transaction type, indexed by errcode bits 7:4.
   Fix: the index is a full 4-bit value (0..15) but the table only had 15
   entries, so index 15 read past the end of the array in check_k8_nb().
   Pad it to 16 entries. */
static char *memtrans[] = {
	"generic error", "generic read", "generic write", "data read",
	"data write", "instruction fetch", "prefetch", "snoop",
	"?", "?", "?", "?", "?", "?", "?", "?"
};
/* Participating processor, errcode bits 10:9. */
static char *partproc[] = {
	"local node origin", "local node response",
	"local node observed", "generic"
};
/* Timeout flag, errcode bit 8. */
static char *timeout[] = {
	"request didn't time out",
	"request timed out"
};
/* Memory or I/O, errcode bits 3:2. */
static char *memoryio[] = {
	"memory access", "res.", "i/o access", "generic"
};
/* Extended error code, NB status bits 19:16 (16 entries for 4-bit field). */
static char *extendederr[] = {
	"ecc error",
	"crc error",
	"sync error",
	"mst abort",
	"tgt abort",
	"gart error",
	"rmw error",
	"wdog error",
	"chipkill ecc error",
	"<9>","<10>","<11>","<12>",
	"<13>","<14>","<15>"
};
/* Names for individual bits of the NB status high word; unnamed
   positions (ecc syndrome 22:15, HT link 6:4) stay NULL and are
   skipped by the printer loop. */
static char *highbits[32] = {
	[31] = "previous error lost",
	[30] = "error overflow",
	[29] = "error uncorrected",
	[28] = "error enable",
	[27] = "misc error valid",
	[26] = "error address valid",
	[25] = "processor context corrupt",
	[24] = "res24",
	[23] = "res23",
	/* 22-15 ecc syndrome bits */
	[14] = "corrected ecc error",
	[13] = "uncorrected ecc error",
	[12] = "res12",
	[11] = "res11",
	[10] = "res10",
	[9] = "res9",
	[8] = "dram scrub error",
	[7] = "res7",
	/* 6-4 ht link number of error */
	[3] = "res3",
	[2] = "res2",
	[1] = "err cpu0",
	[0] = "err cpu1",
};
/*
 * Decode and print the K8 northbridge MCA status registers (PCI config
 * space offsets 0x48/0x4c), then acknowledge the error by clearing the
 * valid bit.  'header' adds a banner line for silently detected errors.
 */
static void check_k8_nb(int header)
{
	struct pci_dev *nb;
	u32 statuslow, statushigh;
	unsigned short errcode;
	int i;

	nb = find_k8_nb();
	if (nb == NULL)
		return;

	pci_read_config_dword(nb, 0x48, &statuslow);
	pci_read_config_dword(nb, 0x4c, &statushigh);
	if (!(statushigh & (1<<31)))	/* valid bit: no error latched */
		return;
	if (header)
		printk(KERN_ERR "CPU %d: Silent Northbridge MCE\n", smp_processor_id());
	printk(KERN_ERR "Northbridge status %08x%08x\n",
	       statushigh,statuslow);

	/* Extended error code lives in bits 19:16 of the low word. */
	printk(KERN_ERR " Error %s\n", extendederr[(statuslow >> 16) & 0xf]);
	errcode = statuslow & 0xffff;
	switch ((statuslow >> 16) & 0xF) {
	case 5:		/* GART TLB error */
		printk(KERN_ERR " GART TLB error %s %s\n",
		       transaction[(errcode >> 2) & 3],
		       cachelevel[errcode & 3]);
		break;
	case 8:		/* chipkill ECC: print the syndrome, then the bus info */
		printk(KERN_ERR " ECC error syndrome %x\n",
		       (((statuslow >> 24) & 0xff) << 8) | ((statushigh >> 15) & 0x7f));
		/*FALL THROUGH*/
	default:
		printk(KERN_ERR " bus error %s, %s\n %s\n %s, %s\n",
		       partproc[(errcode >> 9) & 0x3],
		       timeout[(errcode >> 8) & 1],
		       memtrans[(errcode >> 4) & 0xf],
		       memoryio[(errcode >> 2) & 0x3],
		       cachelevel[(errcode & 0x3)]);
		/* should only print when it was a HyperTransport related error. */
		printk(KERN_ERR " link number %x\n", (statushigh >> 4) & 3);
		break;
	}

	/* Bits 26 (address valid) and 28 (enable) are handled separately. */
	for (i = 0; i < 32; i++) {
		if (i == 26 || i == 28)
			continue;
		if (highbits[i] && (statushigh & (1<<i)))
			printk(KERN_ERR " %s\n", highbits[i]);
	}

	if (statushigh & (1<<26)) {	/* error address valid */
		u32 addrhigh, addrlow;
		pci_read_config_dword(nb, 0x54, &addrhigh);
		pci_read_config_dword(nb, 0x50, &addrlow);
		printk(KERN_ERR " NB error address %08x%08x\n", addrhigh,addrlow);
	}

	/* Acknowledge: clear the valid bit so the error is not re-reported. */
	statushigh &= ~(1<<31);
	pci_write_config_dword(nb, 0x4c, statushigh);
}
/*
 * K8 machine check exception / poll handler.  Handles the northbridge
 * bank (bank 4) specially, then defers to the generic handler for the
 * remaining banks.  regs == NULL means called from the poll timer.
 */
static void k8_machine_check(struct pt_regs * regs, long error_code)
{
	u64 status, nbstatus;

	preempt_disable();
	rdmsrl(MSR_IA32_MCG_STATUS, status);
	if ((status & (1<<2)) == 0) {	/* MCIP clear: no MCE in progress */
		/* Silent poll of the northbridge only. */
		if (!regs)
			check_k8_nb(1);
		/* Fix: this early return previously skipped preempt_enable(),
		   leaking the preempt count on every poll. */
		preempt_enable();
		return;
	}
	printk(KERN_EMERG "CPU %d: Machine Check Exception: %016Lx\n", smp_processor_id(), status);
	if (status & 1)		/* RIPV clear would mean unrecoverable; bit 0 set = restartable */
		printk(KERN_EMERG "MCG_STATUS: unrecoverable\n");

	/* Bank 4 is the northbridge bank on K8. */
	rdmsrl(MSR_IA32_MC0_STATUS+4*4, nbstatus);
	if ((nbstatus & (1UL<<63)) == 0)	/* VAL: no NB error latched */
		goto others;

	printk(KERN_EMERG "Northbridge Machine Check %s %016lx %lx\n",
	       regs ? "exception" : "timer",
	       (unsigned long)nbstatus, error_code);
	if (nbstatus & (1UL<<62))	/* OVER */
		printk(KERN_EMERG "Lost at least one NB error condition\n");
	if (nbstatus & (1UL<<61))	/* UC */
		printk(KERN_EMERG "Uncorrectable condition\n");
	/* Fix: this was "1UL<57" - a less-than comparison, which made the
	   test effectively "nbstatus & 1".  Bit 57 is PCC. */
	if (nbstatus & (1UL<<57))
		printk(KERN_EMERG "Unrecoverable condition\n");

	check_k8_nb(0);

	if (nbstatus & (1UL<<58)) {	/* ADDRV */
		u64 adr;
		rdmsrl(MSR_IA32_MC0_ADDR+4*4, adr);
		printk(KERN_EMERG "Address: %016lx\n", (unsigned long)adr);
	}
	wrmsrl(MSR_IA32_MC0_STATUS+4*4, 0);
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 others:
	generic_machine_check(regs, error_code);
	preempt_enable();
}
/* Periodic timer that polls for silent machine check errors. */
static struct timer_list mcheck_timer;
/* Poll interval in jiffies; settable via mce=<seconds>. */
int mcheck_interval = 30*HZ;

#ifndef CONFIG_SMP
/* UP: poll directly from timer context and re-arm. */
static void mcheck_timer_handler(unsigned long data)
{
	k8_machine_check(NULL,0);
	mcheck_timer.expires = jiffies + mcheck_interval;
	add_timer(&mcheck_timer);
}
#else
/* SMP needs a process context trampoline because smp_call_function cannot be
   called from interrupt context. */

/* Runs on every other CPU via smp_call_function(). */
static void mcheck_timer_other(void *data)
{
	k8_machine_check(NULL, 0);
}

/* Process-context worker: poll all CPUs, then re-arm the timer. */
static void mcheck_timer_dist(void *data)
{
	smp_call_function(mcheck_timer_other,0,0,0);
	k8_machine_check(NULL, 0);
	mcheck_timer.expires = jiffies + mcheck_interval;
	add_timer(&mcheck_timer);
}

/* Timer callback: punt to keventd to get process context. */
static void mcheck_timer_handler(unsigned long data)
{
	static DECLARE_WORK(mcheck_work, mcheck_timer_dist, NULL);
	schedule_work(&mcheck_work);
}
#endif
/* Set by mce=nok8: force the generic handler even on K8. */
static int nok8 __initdata;

/*
 * Enable the K8 machine check handler on this CPU: program the bank
 * control/status MSRs, install k8_machine_check as the vector, and start
 * the poll timer on the boot CPU.
 */
static void __init k8_mcheck_init(struct cpuinfo_x86 *c)
{
	u64 cap;
	int i;

	if (!test_bit(X86_FEATURE_MCE, &c->x86_capability) ||
	    !test_bit(X86_FEATURE_MCA, &c->x86_capability))
		return;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap&0xff;	/* low byte of MCG_CAP = number of banks */
	machine_check_vector = k8_machine_check;
	for (i = 0; i < banks; i++) {
		/* Enable every error type in the bank unless disabled. */
		u64 val = ((1UL<<i) & disabled_banks) ? 0 : ~0UL;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, val);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i,0);
	}
	if (cap & (1<<8))	/* MCG_CTL register present */
		wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
	set_in_cr4(X86_CR4_MCE);

	/* Only the boot CPU arms the poll timer; it polls the others. */
	if (mcheck_interval && (smp_processor_id() == 0)) {
		init_timer(&mcheck_timer);
		mcheck_timer.function = (void (*)(unsigned long))mcheck_timer_handler;
		mcheck_timer.expires = jiffies + mcheck_interval;
		add_timer(&mcheck_timer);
	}

	printk(KERN_INFO "Machine Check Reporting enabled for CPU#%d\n", smp_processor_id());
}
/*
 * Set up machine check reporting for Intel processors
 * (and any other CPU implementing standard PPro-style MCA).
 */
static void __init generic_mcheck_init(struct cpuinfo_x86 *c)
{
	u32 l, h;
	int i;
	static int done;	/* print the banner only once across CPUs */

	/*
	 * Check for MCE support
	 */
	if( !test_bit(X86_FEATURE_MCE, &c->x86_capability) )
		return;

	/*
	 * Check for PPro style MCA
	 */
	if( !test_bit(X86_FEATURE_MCA, &c->x86_capability) )
		return;

	/* Ok machine check is available */
	machine_check_vector = generic_machine_check;
	wmb();	/* publish the vector before enabling CR4.MCE */

	if(done==0)
		printk(KERN_INFO "Intel machine check architecture supported.\n");
	rdmsr(MSR_IA32_MCG_CAP, l, h);
	if(l&(1<<8))	/* MCG_CTL register present */
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
	banks = l&0xff;	/* low byte = number of banks */
	for(i=0;i<banks;i++)
	{
		/* Enable every error type in the bank unless disabled. */
		u32 val = ((1UL<<i) & disabled_banks) ? 0 : ~0;
		wrmsr(MSR_IA32_MC0_CTL+4*i, val, val);
		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
	}
	set_in_cr4(X86_CR4_MCE);
	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", smp_processor_id());
	done=1;
}
/*
 * This has to be run for each processor
 */
void __init mcheck_init(struct cpuinfo_x86 *c)
{
	/* One-shot per CPU.  NOTE(review): the bit is consumed even when
	   mce_disabled turns out to be set - confirm that is intentional. */
	if (test_and_set_bit(smp_processor_id(), &mce_cpus))
		return;
	if(mce_disabled==1)
		return;
	switch(c->x86_vendor) {
	case X86_VENDOR_AMD:
		/* K8 gets its own handler unless mce=nok8 was given. */
		if (c->x86 == 15 && !nok8) {
			k8_mcheck_init(c);
			break;
		}
		/* FALL THROUGH */
	default:
	case X86_VENDOR_INTEL:
		generic_mcheck_init(c);
		break;
	}
}
/* Parse "nomce": disable machine checks entirely (i386 compatibility). */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 0;
}
/* mce=off disable machine check
   mce=nok8 disable k8 specific features
   mce=disable<NUMBER> disable bank NUMBER
   mce=enable<NUMBER> enable bank number
   mce=NUMBER mcheck timer interval number seconds.
   Can be also comma separated in a single mce= */
static int __init mcheck_enable(char *str)
{
	char *p;
	while ((p = strsep(&str,",")) != NULL) {
		if (isdigit(*p))
			mcheck_interval = simple_strtol(p,NULL,0) * HZ;
		else if (!strcmp(p,"off"))
			mce_disabled = 1;
		else if (!strncmp(p,"enable",6))
			disabled_banks &= ~(1UL << simple_strtol(p+6,NULL,0));
		else if (!strncmp(p,"disable",7))
			/* Fix: this was "|= ~(1<<N)", which disabled every
			   bank EXCEPT the requested one.  Also use 1UL so
			   bank numbers >= 31 don't overflow the shift. */
			disabled_banks |= (1UL << simple_strtol(p+7,NULL,0));
		else if (!strcmp(p,"nok8"))
			nok8 = 1;
	}
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
/*
* Machine check handler.
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>
/* /dev/mcelog is char major 10 (misc), minor 227. */
#define MISC_MCELOG_MINOR 227

/* Set by "mce=off"/"nomce"; only read during boot-time setup. */
static int mce_disabled __initdata;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic */
static int tolerant = 2;
/* Number of MCA banks from the low byte of MCG_CAP. */
static int banks;
/* Banks whose MCi_CTL is left at 0 (sysfs "disabled_banks" tunable). */
static unsigned long disabled_banks;
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

/* Global log buffer; the signature makes it findable by external
   debugging tools (and potentially recoverable after a reboot). */
struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

/*
 * Append one record to mcelog.  Lockless: competing writers claim a slot
 * by advancing mcelog.next with cmpxchg; the entry's finished flag then
 * publishes it to readers.  Callable from machine check context.
 */
static void mce_log(struct mce *mce)
{
	unsigned next, entry;
	mce->finished = 0;
	smp_wmb();
	for (;;) {
		entry = mcelog.next;
		read_barrier_depends();
		/* When the buffer fills up discard new entries. Assume
		   that the earlier errors are the more interesting. */
		if (entry >= MCE_LOG_LEN) {
			set_bit(MCE_OVERFLOW, &mcelog.flags);
			return;
		}
		/* Old left over entry. Skip. */
		if (mcelog.entry[entry].finished)
			continue;
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry if another writer raced us. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	smp_wmb();
	mcelog.entry[entry].finished = 1;	/* publish to readers */
	smp_wmb();
}
/*
 * Dump one MCE record to the console.  Only used on the panic path, so
 * taking the printk locks is acceptable here.
 */
static void print_mce(struct mce *m)
{
	printk("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk("RIP %02x:<%016Lx> ", m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk("TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		/* Fix: the MISC field printed m->addr instead of m->misc. */
		printk("MISC %Lx ", m->misc);
	printk("\n");
}
/*
 * Panic, but first print every logged MCE newer than 'start' (a TSC
 * value), plus 'backup' (the record currently being handled) if it did
 * not make it into the log.
 */
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].tsc < start)
			continue;	/* older error, not part of this event */
		print_mce(&mcelog.entry[i]);
		if (mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;	/* already printed from the log */
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
/* Nonzero when machine checks are usable on this CPU: not disabled on
   the command line and both MCE and MCA are advertised in cpuid. */
static int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	if (!test_bit(X86_FEATURE_MCE, &c->x86_capability))
		return 0;
	return test_bit(X86_FEATURE_MCA, &c->x86_capability) != 0;
}
/*
* The actual machine check handler
*/
void do_machine_check(struct pt_regs * regs, long error_code)
{
struct mce m;
int nowayout = 0;
int kill_it = 0;
u64 mcestart;
int i;
if (regs)
notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
if (!banks)
return;
memset(&m, 0, sizeof(struct mce));
m.cpu = hard_smp_processor_id();
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (!regs && (m.mcgstatus & MCG_STATUS_MCIP))
return;
if (!(m.mcgstatus & MCG_STATUS_RIPV))
kill_it = 1;
if (regs && (m.mcgstatus & MCG_STATUS_EIPV)) {
m.rip = regs->rip;
m.cs = regs->cs;
}
rdtscll(mcestart);
mb();
for (i = 0; i < banks; i++) {
if (test_bit(i, &disabled_banks))
continue;
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
nowayout |= (tolerant < 1);
nowayout |= !!(m.status & (MCI_STATUS_OVER|MCI_STATUS_PCC));
kill_it |= !!(m.status & MCI_STATUS_UC);
m.bank = i;
if (m.status & MCI_STATUS_MISCV)
rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
if (m.status & MCI_STATUS_ADDRV)
rdmsrl(MSR_IA32_MC0_MISC + i*4, m.addr);
rdtscll(m.tsc);
wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
mce_log(&m);
}
wrmsrl(MSR_IA32_MCG_STATUS, 0);
/* Never do anything final in the polling timer */
if (!regs)
return;
if (nowayout)
mce_panic("Machine check", &m, mcestart);
if (kill_it) {
int user_space = (m.rip && (m.cs & 3));
/* When the machine was in user space and the CPU didn't get
confused it's normally not necessary to panic, unless you are
paranoid (tolerant == 0) */
if (!user_space && (panic_on_oops || tolerant < 2))
mce_panic("Uncorrected machine check in kernel", &m, mcestart);
/* do_exit takes an awful lot of locks and has as slight risk
of deadlocking. If you don't want that don't set tolerant >= 2 */
do_exit(SIGBUS);
}
}
static void mce_clear_all(void)
{
int i;
for (i = 0; i < banks; i++)
wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 3600; /* one hour */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

/* Runs on each CPU via on_each_cpu(): poll its banks if MCE is usable. */
static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

/* keventd worker: poll every CPU, then re-arm ourselves. */
static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);
}

/* Arm the first poll at boot (unless polling is disabled). */
static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 * Runs on the CPU itself; also usable as an on_each_cpu() callback,
 * hence the unused dummy argument.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		/* Enable all global machine check features. */
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
	banks = cap & 0xff;	/* low byte of MCG_CAP = number of banks */

	mce_clear_all();
	for (i = 0; i < banks; i++) {
		/* Enable every error type in the bank unless disabled via sysfs. */
		u64 val = test_bit(i, &disabled_banks) ? 0 : ~0UL;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, val);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	set_in_cr4(X86_CR4_MCE);
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __init mcheck_init(struct cpuinfo_x86 *c)
{
	/* One-shot per CPU; __initdata since all CPUs boot during init. */
	static unsigned long mce_cpus __initdata = 0;

	if (test_and_set_bit(smp_processor_id(), &mce_cpus) || !mce_available(c))
		return;

	mce_init(NULL);
}
/*
 * Character device to read and clear the MCE log.
 */

/* Per-CPU IPI helper: record the current CPU's TSC into the array
   handed over through the data pointer (indexed by CPU id). */
static void collect_tscs(void *data)
{
	unsigned long *tscs = data;

	rdtscll(tscs[smp_processor_id()]);
}
/*
 * Read the whole MCE log and clear it.  Only full-buffer reads are
 * supported.  Returns bytes copied, -EINVAL for partial reads, -EFAULT
 * on copy failure.  Readers are serialized by mce_read_sem; races with
 * concurrent mce_log() writers are resolved via the finished flags and
 * an RCU-style grace period plus per-CPU TSC snapshots.
 */
static ssize_t mce_read(struct file *filp, char *ubuf, size_t usize, loff_t *off)
{
	unsigned long cpu_tsc[NR_CPUS];
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char *buf = ubuf;
	int i, err;

	down(&mce_read_sem);
	next = mcelog.next;
	read_barrier_depends();

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		if (!mcelog.entry[i].finished)
			continue;	/* writer still in flight; caught below */
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;
	smp_wmb();

	/* Wait for any mce_log() that was already running to finish. */
	synchronize_kernel();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		/* Only take entries written before the TSC snapshot. */
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	return err ? -EFAULT : buf - ubuf;
}
/*
 * ioctl interface for mcelog(8): query record/log sizes and atomically
 * fetch-and-clear the flags word.  Root only.
 */
static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), (int *)arg);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, (int *)arg);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		/* Read-and-zero atomically; retry if set_bit raced us. */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, (int *)arg);
	}
	default:
		return -ENOTTY;
	}
}
#if 0 /* for testing */
/* Debug-only error injection: write a struct mce into the log. */
static ssize_t mce_write(struct file *f, const char __user *buf, size_t sz, loff_t *off)
{
	struct mce m;
	if (sz != sizeof(struct mce))
		return -EINVAL;
	copy_from_user(&m, buf, sizeof(struct mce));
	m.finished = 0;
	mce_log(&m);
	return sizeof(struct mce);
}
#endif

/* File operations for /dev/mcelog (read and ioctl only in production). */
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
	//.write = mce_write
};

/* /dev/mcelog: misc device, char major 10 minor MISC_MCELOG_MINOR (227). */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */

/* "nomce": disable machine checks entirely (i386 compatibility). */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 0;
}

/* mce=off disable machine check */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else
		/* All other old mce= options were replaced by sysfs tunables. */
		printk("mce= argument %s ignored. Please use /sys", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
static int mce_resume(struct sys_device *dev)
{
	mce_clear_all();
	on_each_cpu(mce_init, NULL, 1, 1);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

/* "machinecheck" sysdev class hosting the tunables below. */
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

/* Single system device the attribute files hang off. */
static struct sys_device device_mce = {
	.id = 0,
	.cls = &mce_sysclass,
};
/* Why are there no generic functions for this? */

/* Generate show_<name>/set_<name> sysfs accessors for an integer
   variable; 'start' is a statement executed after a successful store
   (e.g. mce_restart() to apply the new configuration). */
#define ACCESSOR(name, start) \
static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
	return sprintf(buf, "%lu\n", (unsigned long)name); \
} \
static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
	char *end; \
	unsigned long new = simple_strtoul(buf, &end, 0); \
	if (end == buf) return -EINVAL; \
	name = new; \
	start; \
	return end-buf; \
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(disabled_banks,mce_restart())
ACCESSOR(tolerant,)
ACCESSOR(check_interval,mce_restart())
/*
 * Register the sysfs objects and the /dev/mcelog misc device at boot.
 * Returns 0 on success or a negative errno.
 */
static __init int mce_init_device(void)
{
	int err;
	int misc_err;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);
	if (!err)
		err = sys_device_register(&device_mce);
	if (!err) {
		/* could create per CPU objects, but is not worth it. */
		sysdev_create_file(&device_mce, &attr_disabled_banks);
		sysdev_create_file(&device_mce, &attr_tolerant);
		sysdev_create_file(&device_mce, &attr_check_interval);
	}

	/* Fix: the misc_register() result was silently ignored; without
	   the device the log cannot be read, so propagate the failure. */
	misc_err = misc_register(&mce_log_device);
	if (!err)
		err = misc_err;
	return err;
}
device_initcall(mce_init_device);
#ifndef _ASM_MCE_H
#define _ASM_MCE_H 1

#include <asm/ioctls.h>
#include <asm/types.h>

/*
 * Machine Check support for x86
 */

/* MCG_CAP bits */
#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */

/* MCG_STATUS bits */
#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1UL<<1) /* eip points to correct instruction */
#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */

/* MCi_STATUS bits */
#define MCI_STATUS_VAL (1UL<<63) /* valid error */
#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */
#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */
#define MCI_STATUS_EN (1UL<<60) /* error enabled */
#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */
#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */
#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */

/* Fields are zero when not available */
/* One logged machine check event; layout is part of the /dev/mcelog ABI. */
struct mce {
	__u64 status;	/* bank MCi_STATUS register */
	__u64 misc;	/* MCi_MISC, when MCI_STATUS_MISCV */
	__u64 addr;	/* MCi_ADDR, when MCI_STATUS_ADDRV */
	__u64 mcgstatus;
	__u64 rip;	/* instruction pointer, when EIPV */
	__u64 tsc;	/* cpu time stamp counter */
	__u64 res1;	/* for future extension */
	__u64 res2;	/* dito. */
	__u8  cs;	/* code segment */
	__u8  bank;	/* machine check bank */
	__u8  cpu;	/* cpu that raised the error */
	__u8  finished;	/* entry is valid */
	__u32 pad;
};

/*
 * This structure contains all data related to the MCE log.
 * Also carries a signature to make it easier to find from external debugging tools.
 * Each entry is only valid when its finished flag is set.
 */

#define MCE_LOG_LEN 32

struct mce_log {
	char signature[12]; /* "MACHINECHECK" */
	unsigned len;	/* = MCE_LOG_LEN */
	unsigned next;	/* first free slot; advanced with cmpxchg */
	unsigned flags;	/* MCE_OVERFLOW etc. */
	unsigned pad0;
	struct mce entry[MCE_LOG_LEN];
};

#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */

#define MCE_LOG_SIGNATURE "MACHINECHECK"

/* ioctls understood by /dev/mcelog */
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
#define MCE_GET_LOG_LEN _IOR('M', 2, int)
#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)

#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment