Commit 86bbbeba authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS updates from Ingo Molnar:
 "The main changes in this cycle were:

   - AMD MCE support/decoding improvements (Yazen Ghannam)

   - general MCE header cleanups and reorganization (Borislav Petkov)"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  Revert "x86/mce/AMD: Collect error info even if valid bits are not set"
  x86/MCE: Cleanup and complete struct mce fields definitions
  x86/mce/AMD: Carve out SMCA get_block_address() code
  x86/mce/AMD: Get address from already initialized block
  x86/mce/AMD, EDAC/mce_amd: Enumerate Reserved SMCA bank type
  x86/mce/AMD: Pass the bank number to smca_get_bank_type()
  x86/mce/AMD: Collect error info even if valid bits are not set
  x86/mce: Issue the 'mcelog --ascii' message only on !AMD
  x86/mce: Convert 'struct mca_config' bools to a bitfield
  x86/mce: Put private structures and definitions into the internal header
parents 486adcea e2efacb6
......@@ -138,58 +138,6 @@ struct mce_log_buffer {
struct mce entry[MCE_LOG_LEN];
};
struct mca_config {
bool dont_log_ce;
bool cmci_disabled;
bool lmce_disabled;
bool ignore_ce;
bool disabled;
bool ser;
bool recovery;
bool bios_cmci_threshold;
u8 banks;
s8 bootlog;
int tolerant;
int monarch_timeout;
int panic_timeout;
u32 rip_msr;
};
struct mce_vendor_flags {
/*
* Indicates that overflow conditions are not fatal, when set.
*/
__u64 overflow_recov : 1,
/*
* (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
* Recovery. It indicates support for data poisoning in HW and deferred
* error interrupts.
*/
succor : 1,
/*
* (AMD) SMCA: This bit indicates support for Scalable MCA which expands
* the register space for each MCA bank and also increases number of
* banks. Also, to accommodate the new banks and registers, the MCA
* register space is moved to a new MSR range.
*/
smca : 1,
__reserved_0 : 61;
};
struct mca_msr_regs {
u32 (*ctl) (int bank);
u32 (*status) (int bank);
u32 (*addr) (int bank);
u32 (*misc) (int bank);
};
extern struct mce_vendor_flags mce_flags;
extern struct mca_msr_regs msr_ops;
enum mce_notifier_prios {
MCE_PRIO_FIRST = INT_MAX,
MCE_PRIO_SRAO = INT_MAX - 1,
......@@ -346,6 +294,7 @@ enum smca_bank_types {
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
SMCA_RESERVED, /* Reserved */
SMCA_EX, /* Execution Unit */
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
......
......@@ -5,32 +5,36 @@
#include <linux/types.h>
#include <linux/ioctl.h>
/* Fields are zero when not available */
/*
* Fields are zero when not available. Also, this struct is shared with
* userspace mcelog and thus must keep existing fields at current offsets.
* Only add new fields to the end of the structure
*/
struct mce {
__u64 status;
__u64 misc;
__u64 addr;
__u64 mcgstatus;
__u64 ip;
__u64 tsc; /* cpu time stamp counter */
__u64 time; /* wall time_t when error was detected */
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
__u8 severity;
__u64 status; /* Bank's MCi_STATUS MSR */
__u64 misc; /* Bank's MCi_MISC MSR */
__u64 addr; /* Bank's MCi_ADDR MSR */
__u64 mcgstatus; /* Machine Check Global Status MSR */
__u64 ip; /* Instruction Pointer when the error happened */
__u64 tsc; /* CPU time stamp counter */
__u64 time; /* Wall time_t when error was detected */
__u8 cpuvendor; /* Kernel's X86_VENDOR enum */
__u8 inject_flags; /* Software inject flags */
__u8 severity; /* Error severity */
__u8 pad;
__u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* code segment */
__u8 bank; /* machine check bank */
__u8 cpu; /* cpu number; obsolete; use extcpu now */
__u8 finished; /* entry is valid */
__u32 extcpu; /* linux cpu number that detected the error */
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
__u64 ppin; /* Protected Processor Inventory Number */
__u32 microcode;/* Microcode revision */
__u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* Code segment */
__u8 bank; /* Machine check bank reporting the error */
__u8 cpu; /* CPU number; obsoleted by extcpu */
__u8 finished; /* Entry is valid */
__u32 extcpu; /* Linux CPU number that detected the error */
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial APIC ID */
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
__u64 ppin; /* Protected Processor Inventory Number */
__u32 microcode; /* Microcode revision */
};
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
......
......@@ -113,8 +113,6 @@ static inline void mce_register_injector_chain(struct notifier_block *nb) { }
static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
#endif
extern struct mca_config mca_cfg;
#ifndef CONFIG_X86_64
/*
* On 32-bit systems it would be difficult to safely unmap a poison page
......@@ -130,4 +128,61 @@ static inline void mce_unmap_kpfn(unsigned long pfn) {}
#define mce_unmap_kpfn mce_unmap_kpfn
#endif
struct mca_config {
bool dont_log_ce;
bool cmci_disabled;
bool ignore_ce;
__u64 lmce_disabled : 1,
disabled : 1,
ser : 1,
recovery : 1,
bios_cmci_threshold : 1,
__reserved : 59;
u8 banks;
s8 bootlog;
int tolerant;
int monarch_timeout;
int panic_timeout;
u32 rip_msr;
};
extern struct mca_config mca_cfg;
struct mce_vendor_flags {
/*
* Indicates that overflow conditions are not fatal, when set.
*/
__u64 overflow_recov : 1,
/*
* (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
* Recovery. It indicates support for data poisoning in HW and deferred
* error interrupts.
*/
succor : 1,
/*
* (AMD) SMCA: This bit indicates support for Scalable MCA which expands
* the register space for each MCA bank and also increases number of
* banks. Also, to accommodate the new banks and registers, the MCA
* register space is moved to a new MSR range.
*/
smca : 1,
__reserved_0 : 61;
};
extern struct mce_vendor_flags mce_flags;
struct mca_msr_regs {
u32 (*ctl) (int bank);
u32 (*status) (int bank);
u32 (*addr) (int bank);
u32 (*misc) (int bank);
};
extern struct mca_msr_regs msr_ops;
#endif /* __X86_MCE_INTERNAL_H__ */
......@@ -273,7 +273,9 @@ static void __print_mce(struct mce *m)
static void print_mce(struct mce *m)
{
__print_mce(m);
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
if (m->cpuvendor != X86_VENDOR_AMD)
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
......@@ -1516,7 +1518,7 @@ static int __mcheck_cpu_cap_init(void)
mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
if (cap & MCG_SER_P)
mca_cfg.ser = true;
mca_cfg.ser = 1;
return 0;
}
......@@ -1824,12 +1826,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
mca_cfg.disabled = true;
mca_cfg.disabled = 1;
return;
}
if (mce_gen_pool_init()) {
mca_cfg.disabled = true;
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");
return;
}
......@@ -1907,11 +1909,11 @@ static int __init mcheck_enable(char *str)
if (*str == '=')
str++;
if (!strcmp(str, "off"))
cfg->disabled = true;
cfg->disabled = 1;
else if (!strcmp(str, "no_cmci"))
cfg->cmci_disabled = true;
else if (!strcmp(str, "no_lmce"))
cfg->lmce_disabled = true;
cfg->lmce_disabled = 1;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
......@@ -1919,9 +1921,9 @@ static int __init mcheck_enable(char *str)
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
cfg->bootlog = (str[0] == 'b');
else if (!strcmp(str, "bios_cmci_threshold"))
cfg->bios_cmci_threshold = true;
cfg->bios_cmci_threshold = 1;
else if (!strcmp(str, "recovery"))
cfg->recovery = true;
cfg->recovery = 1;
else if (isdigit(str[0])) {
if (get_option(&str, &cfg->tolerant) == 2)
get_option(&str, &(cfg->monarch_timeout));
......@@ -2403,7 +2405,7 @@ device_initcall_sync(mcheck_init_device);
*/
static int __init mcheck_disable(char *str)
{
mca_cfg.disabled = true;
mca_cfg.disabled = 1;
return 1;
}
__setup("nomce", mcheck_disable);
......
......@@ -82,6 +82,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
[SMCA_DE] = { "decode_unit", "Decode Unit" },
[SMCA_RESERVED] = { "reserved", "Reserved" },
[SMCA_EX] = { "execution_unit", "Execution Unit" },
[SMCA_FP] = { "floating_point", "Floating Point Unit" },
[SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
......@@ -110,14 +111,14 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);
static enum smca_bank_types smca_get_bank_type(struct mce *m)
static enum smca_bank_types smca_get_bank_type(unsigned int bank)
{
struct smca_bank *b;
if (m->bank >= N_SMCA_BANK_TYPES)
if (bank >= MAX_NR_BANKS)
return N_SMCA_BANK_TYPES;
b = &smca_banks[m->bank];
b = &smca_banks[bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;
......@@ -127,6 +128,9 @@ static enum smca_bank_types smca_get_bank_type(struct mce *m)
static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype, xec_bitmap } */
/* Reserved type */
{ SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 },
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
{ SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
......@@ -427,35 +431,58 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
static u32 smca_get_block_address(unsigned int cpu, unsigned int bank,
unsigned int block)
{
u32 low, high;
u32 addr = 0;
if (smca_get_bank_type(bank) == SMCA_RESERVED)
return addr;
if (!block)
return MSR_AMD64_SMCA_MCx_MISC(bank);
/*
* For SMCA enabled processors, BLKPTR field of the first MISC register
* (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
*/
if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return addr;
if (!(low & MCI_CONFIG_MCAX))
return addr;
if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
return addr;
}
static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
{
u32 addr = 0, offset = 0;
if (mce_flags.smca) {
if (!block) {
addr = MSR_AMD64_SMCA_MCx_MISC(bank);
} else {
/*
* For SMCA enabled processors, BLKPTR field of the
* first MISC register (MCx_MISC0) indicates presence of
* additional MISC register set (MISC1-4).
*/
u32 low, high;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
return addr;
if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return addr;
/* Get address from already initialized block. */
if (per_cpu(threshold_banks, cpu)) {
struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank];
if (!(low & MCI_CONFIG_MCAX))
return addr;
if (bankp && bankp->blocks) {
struct threshold_block *blockp = &bankp->blocks[block];
if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
if (blockp)
return blockp->address;
}
return addr;
}
if (mce_flags.smca)
return smca_get_block_address(cpu, bank, block);
/* Fall back to method we used for older processors: */
switch (block) {
case 0:
......@@ -760,7 +787,7 @@ bool amd_mce_is_memory_error(struct mce *m)
u8 xec = (m->status >> 16) & 0x1f;
if (mce_flags.smca)
return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0;
return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
return m->bank == 4 && xec == 0x8;
}
......@@ -1063,7 +1090,7 @@ static struct kobj_type threshold_ktype = {
static const char *get_name(unsigned int bank, struct threshold_block *b)
{
unsigned int bank_type;
enum smca_bank_types bank_type;
if (!mce_flags.smca) {
if (b && bank == 4)
......@@ -1072,11 +1099,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}
if (!smca_banks[bank].hwid)
bank_type = smca_get_bank_type(bank);
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;
bank_type = smca_banks[bank].hwid->bank_type;
if (b && bank_type == SMCA_UMC) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
return smca_umc_block_names[b->block];
......
......@@ -854,21 +854,24 @@ static void decode_mc6_mce(struct mce *m)
static void decode_smca_error(struct mce *m)
{
struct smca_hwid *hwid;
unsigned int bank_type;
enum smca_bank_types bank_type;
const char *ip_name;
u8 xec = XEC(m->status, xec_mask);
if (m->bank >= ARRAY_SIZE(smca_banks))
return;
if (x86_family(m->cpuid) >= 0x17 && m->bank == 4)
pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");
hwid = smca_banks[m->bank].hwid;
if (!hwid)
return;
bank_type = hwid->bank_type;
if (bank_type == SMCA_RESERVED) {
pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
return;
}
ip_name = smca_get_long_name(bank_type);
pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment