Commit 090bc5a2 authored by Linus Torvalds

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "Boris is on vacation so I'm sending the RAS bits this time. The main
  changes were:

   - Various RAS/CEC improvements and fixes by Borislav Petkov:
       - error insertion fixes
       - offlining latency fix
       - memory leak fix
       - additional sanity checks
       - cleanups
       - debug output improvements

   - More SMCA enhancements by Yazen Ghannam:
       - make banks truly per-CPU which they are in the hardware
       - don't over-cache certain registers
       - make the number of MCA banks per-CPU variable

     The long-term goal with these changes is to support future
     heterogeneous SMCA extensions.

   - Misc fixes and improvements"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Do not check return value of debugfs_create functions
  x86/MCE: Determine MCA banks' init state properly
  x86/MCE: Make the number of MCA banks a per-CPU variable
  x86/MCE/AMD: Don't cache block addresses on SMCA systems
  x86/MCE: Make mce_banks a per-CPU array
  x86/MCE: Make struct mce_banks[] static
  RAS/CEC: Add copyright
  RAS/CEC: Add CONFIG_RAS_CEC_DEBUG and move CEC debug features there
  RAS/CEC: Dump the different array element sections
  RAS/CEC: Rename count_threshold to action_threshold
  RAS/CEC: Sanity-check array on every insertion
  RAS/CEC: Fix potential memory leak
  RAS/CEC: Do not set decay value on error
  RAS/CEC: Check count_threshold unconditionally
  RAS/CEC: Fix pfn insertion
parents e1928328 6e4f929e
......@@ -99,11 +99,6 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PCIE] = { "pcie", "PCI Express Unit" },
};
static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
{
[0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
};
static const char *smca_get_name(enum smca_bank_types t)
{
if (t >= N_SMCA_BANK_TYPES)
......@@ -197,6 +192,9 @@ static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
/* Map of banks that have more than MCA_MISC0 available. */
static DEFINE_PER_CPU(u32, smca_misc_banks_map);
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
......@@ -206,6 +204,28 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
/*
 * Record in @cpu's smca_misc_banks_map whether @bank has more than
 * MCA_MISC0 available.
 *
 * For SMCA enabled processors, BLKPTR field of the first MISC register
 * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
 *
 * The map bit for @bank is left untouched when either MSR read fails,
 * when MCAX is not set in the bank's CONFIG register, or when BLKPTR is
 * zero.
 */
static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
{
	u32 lo, hi;

	/* CONFIG must be readable and must advertise MCAX. */
	if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &lo, &hi) ||
	    !(lo & MCI_CONFIG_MCAX))
		return;

	if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &lo, &hi))
		return;

	if (!(lo & MASK_BLKPTR_LO))
		return;

	per_cpu(smca_misc_banks_map, cpu) |= BIT(bank);
}
static void smca_configure(unsigned int bank, unsigned int cpu)
{
unsigned int i, hwid_mcatype;
......@@ -243,6 +263,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
wrmsr(smca_config, low, high);
}
smca_set_misc_banks_map(bank, cpu);
/* Return early if this bank was already initialized. */
if (smca_banks[bank].hwid)
return;
......@@ -453,50 +475,29 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
static u32 smca_get_block_address(unsigned int bank, unsigned int block)
static u32 smca_get_block_address(unsigned int bank, unsigned int block,
unsigned int cpu)
{
u32 low, high;
u32 addr = 0;
if (smca_get_bank_type(bank) == SMCA_RESERVED)
return addr;
if (!block)
return MSR_AMD64_SMCA_MCx_MISC(bank);
/* Check our cache first: */
if (smca_bank_addrs[bank][block] != -1)
return smca_bank_addrs[bank][block];
/*
* For SMCA enabled processors, BLKPTR field of the first MISC register
* (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
*/
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
goto out;
if (!(low & MCI_CONFIG_MCAX))
goto out;
if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank)))
return 0;
out:
smca_bank_addrs[bank][block] = addr;
return addr;
return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
}
static u32 get_block_address(u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
unsigned int bank, unsigned int block,
unsigned int cpu)
{
u32 addr = 0, offset = 0;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
return addr;
if (mce_flags.smca)
return smca_get_block_address(bank, block);
return smca_get_block_address(bank, block, cpu);
/* Fall back to method we used for older processors: */
switch (block) {
......@@ -624,18 +625,19 @@ void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block, cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (mce_flags.smca)
smca_configure(bank, cpu);
disable_err_thresholding(c, bank);
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block);
address = get_block_address(address, low, high, bank, block, cpu);
if (!address)
break;
......@@ -973,7 +975,7 @@ static void amd_deferred_error_interrupt(void)
{
unsigned int bank;
for (bank = 0; bank < mca_cfg.banks; ++bank)
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
log_error_deferred(bank);
}
......@@ -1014,7 +1016,7 @@ static void amd_threshold_interrupt(void)
struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
unsigned int bank, cpu = smp_processor_id();
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
......@@ -1201,7 +1203,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
u32 low, high;
int err;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
return 0;
if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
......@@ -1252,7 +1254,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
if (err)
goto out_free;
recurse:
address = get_block_address(address, low, high, bank, ++block);
address = get_block_address(address, low, high, bank, ++block, cpu);
if (!address)
return 0;
......@@ -1435,7 +1437,7 @@ int mce_threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
......@@ -1456,14 +1458,14 @@ int mce_threshold_create_device(unsigned int cpu)
if (bp)
return 0;
bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),
bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *),
GFP_KERNEL);
if (!bp)
return -ENOMEM;
per_cpu(threshold_banks, cpu) = bp;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
err = threshold_create_bank(cpu, bank);
......
......@@ -65,7 +65,23 @@ static DEFINE_MUTEX(mce_sysfs_mutex);
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
/*
 * Control state of a single MCA bank. Instances live in the per-CPU
 * mce_banks_array, so every CPU has its own copy for each bank.
 */
struct mce_bank {
u64 ctl; /* subevents to enable */
bool init; /* initialise bank? */
};
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
#define ATTR_LEN 16
/*
 * One object for each MCE bank, shared by all CPUs.
 *
 * It only carries the device attribute and its name; the bank number is
 * what the attribute's show/store handlers use to index the per-CPU bank
 * array of the CPU whose device is being accessed.
 */
struct mce_bank_dev {
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
u8 bank; /* bank number */
};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
......@@ -675,6 +691,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
*/
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
bool error_seen = false;
struct mce m;
int i;
......@@ -686,7 +703,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (flags & MCP_TIMESTAMP)
m.tsc = rdtsc();
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
......@@ -788,7 +805,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
char *tmp;
int i;
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
m->status = mce_rdmsrl(msr_ops.status(i));
if (!(m->status & MCI_STATUS_VAL))
continue;
......@@ -1068,7 +1085,7 @@ static void mce_clear_state(unsigned long *toclear)
{
int i;
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (test_bit(i, toclear))
mce_wrmsrl(msr_ops.status(i), 0);
}
......@@ -1122,10 +1139,11 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
unsigned long *toclear, unsigned long *valid_banks,
int no_way_out, int *worst)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
int severity, i;
for (i = 0; i < cfg->banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
__clear_bit(i, toclear);
if (!test_bit(i, valid_banks))
continue;
......@@ -1463,27 +1481,29 @@ int mce_notify_irq(void)
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
static int __mcheck_cpu_mce_banks_init(void)
static void __mcheck_cpu_mce_banks_init(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
u8 n_banks = this_cpu_read(mce_num_banks);
int i;
mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL);
if (!mce_banks)
return -ENOMEM;
for (i = 0; i < MAX_NR_BANKS; i++) {
for (i = 0; i < n_banks; i++) {
struct mce_bank *b = &mce_banks[i];
/*
* Init them all, __mcheck_cpu_apply_quirks() is going to apply
* the required vendor quirks before
* __mcheck_cpu_init_clear_banks() does the final bank setup.
*/
b->ctl = -1ULL;
b->init = 1;
}
return 0;
}
/*
* Initialize Machine Checks for a CPU.
*/
static int __mcheck_cpu_cap_init(void)
static void __mcheck_cpu_cap_init(void)
{
u64 cap;
u8 b;
......@@ -1491,16 +1511,16 @@ static int __mcheck_cpu_cap_init(void)
rdmsrl(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
if (WARN_ON_ONCE(b > MAX_NR_BANKS))
if (b > MAX_NR_BANKS) {
pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
smp_processor_id(), MAX_NR_BANKS, b);
b = MAX_NR_BANKS;
}
mca_cfg.banks = max(mca_cfg.banks, b);
this_cpu_write(mce_num_banks, b);
if (!mce_banks) {
int err = __mcheck_cpu_mce_banks_init();
if (err)
return err;
}
__mcheck_cpu_mce_banks_init();
/* Use accurate RIP reporting if available. */
if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
......@@ -1508,8 +1528,6 @@ static int __mcheck_cpu_cap_init(void)
if (cap & MCG_SER_P)
mca_cfg.ser = 1;
return 0;
}
static void __mcheck_cpu_init_generic(void)
......@@ -1536,9 +1554,10 @@ static void __mcheck_cpu_init_generic(void)
static void __mcheck_cpu_init_clear_banks(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
int i;
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
if (!b->init)
......@@ -1548,6 +1567,33 @@ static void __mcheck_cpu_init_clear_banks(void)
}
}
/*
* Do a final check to see if there are any unused/RAZ banks.
*
* This must be done after the banks have been initialized and any quirks have
* been applied.
*
* Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
* Otherwise, a user who disables a bank will not be able to re-enable it
* without a system reboot.
*/
static void __mcheck_cpu_check_banks(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
u64 msrval;
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
if (!b->init)
continue;
rdmsrl(msr_ops.ctl(i), msrval);
b->init = !!msrval;
}
}
/*
* During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
* EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
......@@ -1579,6 +1625,7 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
......@@ -1588,7 +1635,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
if (c->x86 == 15 && cfg->banks > 4) {
if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
......@@ -1607,7 +1654,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
if (c->x86 == 6 && cfg->banks > 0)
if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
mce_banks[0].ctl = 0;
/*
......@@ -1629,7 +1676,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* valid event later, merely don't write CTL0.
*/
if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
mce_banks[0].init = 0;
/*
......@@ -1815,7 +1862,9 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
if (!mce_available(c))
return;
if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
__mcheck_cpu_cap_init();
if (__mcheck_cpu_apply_quirks(c) < 0) {
mca_cfg.disabled = 1;
return;
}
......@@ -1832,6 +1881,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks();
__mcheck_cpu_check_banks();
__mcheck_cpu_setup_timer();
}
......@@ -1863,7 +1913,7 @@ static void __mce_disable_bank(void *arg)
void mce_disable_bank(int bank)
{
if (bank >= mca_cfg.banks) {
if (bank >= this_cpu_read(mce_num_banks)) {
pr_warn(FW_BUG
"Ignoring request to disable invalid MCA bank %d.\n",
bank);
......@@ -1949,9 +1999,10 @@ int __init mcheck_init(void)
*/
static void mce_disable_error_reporting(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
int i;
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
......@@ -2051,26 +2102,47 @@ static struct bus_type mce_subsys = {
DEFINE_PER_CPU(struct device *, mce_device);
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
return container_of(attr, struct mce_bank_dev, attr);
}
static ssize_t show_bank(struct device *s, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
u8 bank = attr_to_bank(attr)->bank;
struct mce_bank *b;
if (bank >= per_cpu(mce_num_banks, s->id))
return -EINVAL;
b = &per_cpu(mce_banks_array, s->id)[bank];
if (!b->init)
return -ENODEV;
return sprintf(buf, "%llx\n", b->ctl);
}
static ssize_t set_bank(struct device *s, struct device_attribute *attr,
const char *buf, size_t size)
{
u8 bank = attr_to_bank(attr)->bank;
struct mce_bank *b;
u64 new;
if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
attr_to_bank(attr)->ctl = new;
if (bank >= per_cpu(mce_num_banks, s->id))
return -EINVAL;
b = &per_cpu(mce_banks_array, s->id)[bank];
if (!b->init)
return -ENODEV;
b->ctl = new;
mce_restart();
return size;
......@@ -2185,7 +2257,7 @@ static void mce_device_release(struct device *dev)
kfree(dev);
}
/* Per cpu device init. All of the cpus still share the same ctrl bank: */
/* Per CPU device init. All of the CPUs still share the same bank device: */
static int mce_device_create(unsigned int cpu)
{
struct device *dev;
......@@ -2217,8 +2289,8 @@ static int mce_device_create(unsigned int cpu)
if (err)
goto error;
}
for (j = 0; j < mca_cfg.banks; j++) {
err = device_create_file(dev, &mce_banks[j].attr);
for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
err = device_create_file(dev, &mce_bank_devs[j].attr);
if (err)
goto error2;
}
......@@ -2228,7 +2300,7 @@ static int mce_device_create(unsigned int cpu)
return 0;
error2:
while (--j >= 0)
device_remove_file(dev, &mce_banks[j].attr);
device_remove_file(dev, &mce_bank_devs[j].attr);
error:
while (--i >= 0)
device_remove_file(dev, mce_device_attrs[i]);
......@@ -2249,8 +2321,8 @@ static void mce_device_remove(unsigned int cpu)
for (i = 0; mce_device_attrs[i]; i++)
device_remove_file(dev, mce_device_attrs[i]);
for (i = 0; i < mca_cfg.banks; i++)
device_remove_file(dev, &mce_banks[i].attr);
for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
device_remove_file(dev, &mce_bank_devs[i].attr);
device_unregister(dev);
cpumask_clear_cpu(cpu, mce_device_initialized);
......@@ -2271,6 +2343,7 @@ static void mce_disable_cpu(void)
static void mce_reenable_cpu(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
int i;
if (!mce_available(raw_cpu_ptr(&cpu_info)))
......@@ -2278,7 +2351,7 @@ static void mce_reenable_cpu(void)
if (!cpuhp_tasks_frozen)
cmci_reenable();
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
......@@ -2328,10 +2401,12 @@ static __init void mce_init_banks(void)
{
int i;
for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
for (i = 0; i < MAX_NR_BANKS; i++) {
struct mce_bank_dev *b = &mce_bank_devs[i];
struct device_attribute *a = &b->attr;
b->bank = i;
sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
snprintf(b->attrname, ATTR_LEN, "bank%d", i);
......@@ -2441,22 +2516,16 @@ static int fake_panic_set(void *data, u64 val)
DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
"%llu\n");
static int __init mcheck_debugfs_init(void)
static void __init mcheck_debugfs_init(void)
{
struct dentry *dmce, *ffake_panic;
struct dentry *dmce;
dmce = mce_get_debugfs_dir();
if (!dmce)
return -ENOMEM;
ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce,
NULL, &fake_panic_fops);
if (!ffake_panic)
return -ENOMEM;
return 0;
debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
&fake_panic_fops);
}
#else
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
static void __init mcheck_debugfs_init(void) { }
#endif
DEFINE_STATIC_KEY_FALSE(mcsafe_key);
......@@ -2464,8 +2533,6 @@ EXPORT_SYMBOL_GPL(mcsafe_key);
static int __init mcheck_late_init(void)
{
pr_info("Using %d MCE banks\n", mca_cfg.banks);
if (mca_cfg.recovery)
static_branch_inc(&mcsafe_key);
......
......@@ -645,7 +645,6 @@ static const struct file_operations readme_fops = {
static struct dfs_node {
char *name;
struct dentry *d;
const struct file_operations *fops;
umode_t perm;
} dfs_fls[] = {
......@@ -659,49 +658,23 @@ static struct dfs_node {
{ .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
};
static int __init debugfs_init(void)
static void __init debugfs_init(void)
{
unsigned int i;
dfs_inj = debugfs_create_dir("mce-inject", NULL);
if (!dfs_inj)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
dfs_fls[i].perm,
dfs_inj,
&i_mce,
dfs_fls[i].fops);
if (!dfs_fls[i].d)
goto err_dfs_add;
}
return 0;
err_dfs_add:
while (i-- > 0)
debugfs_remove(dfs_fls[i].d);
debugfs_remove(dfs_inj);
dfs_inj = NULL;
return -ENODEV;
for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj,
&i_mce, dfs_fls[i].fops);
}
static int __init inject_init(void)
{
int err;
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
err = debugfs_init();
if (err) {
free_cpumask_var(mce_inject_cpumask);
return err;
}
debugfs_init();
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
mce_register_injector_chain(&inject_nb);
......
......@@ -22,17 +22,8 @@ enum severity_level {
extern struct blocking_notifier_head x86_mce_decoder_chain;
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
u64 ctl; /* subevents to enable */
unsigned char init; /* initialise bank? */
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
};
struct mce_evt_llist {
struct llist_node llnode;
struct mce mce;
......@@ -47,7 +38,6 @@ struct llist_node *mce_gen_pool_prepare_records(void);
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
......@@ -128,7 +118,6 @@ struct mca_config {
bios_cmci_threshold : 1,
__reserved : 59;
u8 banks;
s8 bootlog;
int tolerant;
int monarch_timeout;
......@@ -137,6 +126,7 @@ struct mca_config {
};
extern struct mca_config mca_cfg;
DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
struct mce_vendor_flags {
/*
......
......@@ -400,21 +400,13 @@ static const struct file_operations severities_coverage_fops = {
static int __init severities_debugfs_init(void)
{
struct dentry *dmce, *fsev;
struct dentry *dmce;
dmce = mce_get_debugfs_dir();
if (!dmce)
goto err_out;
fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
debugfs_create_file("severities-coverage", 0444, dmce, NULL,
&severities_coverage_fops);
if (!fsev)
goto err_out;
return 0;
err_out:
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
......@@ -11,3 +11,13 @@ config RAS_CEC
Bear in mind that this is absolutely useless if your platform doesn't
have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
config RAS_CEC_DEBUG
bool "CEC debugging machinery"
default n
depends on RAS_CEC
help
Add extra files to (debugfs)/ras/cec to test the correctable error
collector feature. "pfn" is a writable file that allows the user to
simulate an error in a particular page frame. "array" is a read-only
file that dumps the current state of all pages logged so far.
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2017-2019 Borislav Petkov, SUSE Labs.
*/
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
......@@ -37,9 +40,9 @@
* thus emulate an LRU-like behavior when deleting elements to free up space
* in the page.
*
* When an element reaches its max count of count_threshold, we try to poison
* it by assuming that errors triggered count_threshold times in a single page
* are excessive and that page shouldn't be used anymore. count_threshold is
* When an element reaches its max count of action_threshold, we try to poison
* it by assuming that errors triggered action_threshold times in a single page
* are excessive and that page shouldn't be used anymore. action_threshold is
* initialized to COUNT_MASK which is the maximum.
*
* That error event entry causes cec_add_elem() to return !0 value and thus
......@@ -122,7 +125,7 @@ static DEFINE_MUTEX(ce_mutex);
static u64 dfs_pfn;
/* Amount of errors after which we offline */
static unsigned int count_threshold = COUNT_MASK;
static u64 action_threshold = COUNT_MASK;
/* Each element "decays" each decay_interval which is 24hrs by default. */
#define CEC_DECAY_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */
......@@ -276,11 +279,39 @@ static u64 __maybe_unused del_lru_elem(void)
return pfn;
}
/*
 * Verify that the PFNs stored in @ca are in non-decreasing order.
 *
 * Every out-of-order neighbour pair triggers a WARN. If at least one was
 * found, the whole array is dumped with pr_info() afterwards.
 *
 * Returns true when the array is NOT sane, false otherwise.
 */
static bool sanity_check(struct ce_array *ca)
{
	bool broken = false;
	u64 last = 0;
	int idx;

	for (idx = 0; idx < ca->n; idx++) {
		u64 cur = PFN(ca->array[idx]);

		if (WARN(last > cur, "prev: 0x%016llx <-> this: 0x%016llx\n", last, cur))
			broken = true;

		last = cur;
	}

	if (broken) {
		pr_info("Sanity check dump:\n{ n: %d\n", ca->n);

		for (idx = 0; idx < ca->n; idx++) {
			u64 cur = PFN(ca->array[idx]);

			pr_info(" %03d: [%016llx|%03llx]\n", idx, cur, FULL_COUNT(ca->array[idx]));
		}

		pr_info("}\n");
	}

	return broken;
}
int cec_add_elem(u64 pfn)
{
struct ce_array *ca = &ce_arr;
unsigned int to;
unsigned int to = 0;
int count, ret = 0;
/*
......@@ -294,6 +325,7 @@ int cec_add_elem(u64 pfn)
ca->ces_entered++;
/* Array full, free the LRU slot. */
if (ca->n == MAX_ELEMS)
WARN_ON(!del_lru_elem_unlocked(ca));
......@@ -306,24 +338,17 @@ int cec_add_elem(u64 pfn)
(void *)&ca->array[to],
(ca->n - to) * sizeof(u64));
ca->array[to] = (pfn << PAGE_SHIFT) |
(DECAY_MASK << COUNT_BITS) | 1;
ca->array[to] = pfn << PAGE_SHIFT;
ca->n++;
ret = 0;
goto decay;
}
count = COUNT(ca->array[to]);
if (count < count_threshold) {
ca->array[to] |= (DECAY_MASK << COUNT_BITS);
/* Add/refresh element generation and increment count */
ca->array[to] |= DECAY_MASK << COUNT_BITS;
ca->array[to]++;
ret = 0;
} else {
/* Check action threshold and soft-offline, if reached. */
count = COUNT(ca->array[to]);
if (count >= action_threshold) {
u64 pfn = ca->array[to] >> PAGE_SHIFT;
if (!pfn_valid(pfn)) {
......@@ -338,20 +363,21 @@ int cec_add_elem(u64 pfn)
del_elem(ca, to);
/*
* Return a >0 value to denote that we've reached the offlining
* threshold.
* Return a >0 value to callers, to denote that we've reached
* the offlining threshold.
*/
ret = 1;
goto unlock;
}
decay:
ca->decay_count++;
if (ca->decay_count >= CLEAN_ELEMS)
do_spring_cleaning(ca);
WARN_ON_ONCE(sanity_check(ca));
unlock:
mutex_unlock(&ce_mutex);
......@@ -369,45 +395,48 @@ static int pfn_set(void *data, u64 val)
{
*(u64 *)data = val;
return cec_add_elem(val);
cec_add_elem(val);
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
static int decay_interval_set(void *data, u64 val)
{
*(u64 *)data = val;
if (val < CEC_DECAY_MIN_INTERVAL)
return -EINVAL;
if (val > CEC_DECAY_MAX_INTERVAL)
return -EINVAL;
*(u64 *)data = val;
decay_interval = val;
cec_mod_work(decay_interval);
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
static int count_threshold_set(void *data, u64 val)
static int action_threshold_set(void *data, u64 val)
{
*(u64 *)data = val;
if (val > COUNT_MASK)
val = COUNT_MASK;
count_threshold = val;
action_threshold = val;
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");
DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n");
static const char * const bins[] = { "00", "01", "10", "11" };
static int array_dump(struct seq_file *m, void *v)
{
struct ce_array *ca = &ce_arr;
u64 prev = 0;
int i;
mutex_lock(&ce_mutex);
......@@ -416,11 +445,8 @@ static int array_dump(struct seq_file *m, void *v)
for (i = 0; i < ca->n; i++) {
u64 this = PFN(ca->array[i]);
seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
WARN_ON(prev > this);
prev = this;
seq_printf(m, " %3d: [%016llx|%s|%03llx]\n",
i, this, bins[DECAY(ca->array[i])], COUNT(ca->array[i]));
}
seq_printf(m, "}\n");
......@@ -433,7 +459,7 @@ static int array_dump(struct seq_file *m, void *v)
seq_printf(m, "Decay interval: %lld seconds\n", decay_interval);
seq_printf(m, "Decays: %lld\n", ca->decays_done);
seq_printf(m, "Action threshold: %d\n", count_threshold);
seq_printf(m, "Action threshold: %lld\n", action_threshold);
mutex_unlock(&ce_mutex);
......@@ -463,18 +489,6 @@ static int __init create_debugfs_nodes(void)
return -1;
}
pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
if (!pfn) {
pr_warn("Error creating pfn debugfs node!\n");
goto err;
}
array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
if (!array) {
pr_warn("Error creating array debugfs node!\n");
goto err;
}
decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
&decay_interval, &decay_interval_ops);
if (!decay) {
......@@ -482,13 +496,27 @@ static int __init create_debugfs_nodes(void)
goto err;
}
count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
&count_threshold, &count_threshold_ops);
count = debugfs_create_file("action_threshold", S_IRUSR | S_IWUSR, d,
&action_threshold, &action_threshold_ops);
if (!count) {
pr_warn("Error creating count_threshold debugfs node!\n");
pr_warn("Error creating action_threshold debugfs node!\n");
goto err;
}
if (!IS_ENABLED(CONFIG_RAS_CEC_DEBUG))
return 0;
pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
if (!pfn) {
pr_warn("Error creating pfn debugfs node!\n");
goto err;
}
array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
if (!array) {
pr_warn("Error creating array debugfs node!\n");
goto err;
}
return 0;
......@@ -509,8 +537,10 @@ void __init cec_init(void)
return;
}
if (create_debugfs_nodes())
if (create_debugfs_nodes()) {
free_page((unsigned long)ce_arr.array);
return;
}
INIT_DELAYED_WORK(&cec_work, cec_work_fn);
schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment