Commit 7adcadb9 authored by Linus Torvalds

Merge tag 'edac_updates_for_6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras

Pull EDAC updates from Borislav Petkov:

 - Make ghes_edac a simple module like the rest of the EDAC drivers and
   drop the forced built-in only configuration by disentangling it from
   GHES (Jia He)

 - The usual small cleanups and improvements all over EDAC land

* tag 'edac_updates_for_6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  EDAC/i10nm: fix refcount leak in pci_get_dev_wrapper()
  EDAC/i5400: Fix typo in comment: vaious -> various
  EDAC/mc_sysfs: Increase legacy channel support to 12
  MAINTAINERS: Make Mauro EDAC reviewer
  MAINTAINERS: Make Manivannan Sadhasivam the maintainer of qcom_edac
  EDAC/igen6: Return the correct error type when not the MC owner
  apei/ghes: Use xchg_release() for updating new cache slot instead of cmpxchg()
  EDAC: Check for GHES preference in the chipset-specific EDAC drivers
  EDAC/ghes: Make ghes_edac a proper module
  EDAC/ghes: Prepare to make ghes_edac a proper module
  EDAC/ghes: Add a notifier for reporting memory errors
  efi/cper: Export several helpers for ghes_edac to use
  EDAC/i5000: Mark as BROKEN
parents 40deb5e4 3919430f
......@@ -7386,9 +7386,9 @@ F: drivers/edac/thunderx_edac*
EDAC-CORE
M: Borislav Petkov <bp@alien8.de>
M: Mauro Carvalho Chehab <mchehab@kernel.org>
M: Tony Luck <tony.luck@intel.com>
R: James Morse <james.morse@arm.com>
R: Mauro Carvalho Chehab <mchehab@kernel.org>
R: Robert Richter <rric@kernel.org>
L: linux-edac@vger.kernel.org
S: Supported
......@@ -7505,8 +7505,7 @@ S: Maintained
F: drivers/edac/pnd2_edac.[ch]
EDAC-QCOM
M: Channagoud Kadabi <ckadabi@codeaurora.org>
M: Venkata Narendra Kumar Gutta <vnkgutta@codeaurora.org>
M: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
L: linux-arm-msm@vger.kernel.org
L: linux-edac@vger.kernel.org
S: Maintained
......
......@@ -94,6 +94,8 @@
#define FIX_APEI_GHES_SDEI_CRITICAL __end_of_fixed_addresses
#endif
static ATOMIC_NOTIFIER_HEAD(ghes_report_chain);
static inline bool is_hest_type_generic_v2(struct ghes *ghes)
{
return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
......@@ -107,6 +109,13 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes)
bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);
/*
* "ghes.edac_force_enable" forcibly enables ghes_edac and skips the platform
* check.
*/
static bool ghes_edac_force_enable;
module_param_named(edac_force_enable, ghes_edac_force_enable, bool, 0);
/*
* All error sources notified with HED (Hardware Error Device) share a
* single notifier callback, so they need to be linked and checked one
......@@ -118,6 +127,13 @@ module_param_named(disable, ghes_disable, bool, 0);
static LIST_HEAD(ghes_hed);
static DEFINE_MUTEX(ghes_list_mutex);
/*
* A list of GHES devices which are given to the corresponding EDAC driver
* ghes_edac for further use.
*/
static LIST_HEAD(ghes_devs);
static DEFINE_MUTEX(ghes_devs_mutex);
/*
* Because the memory area used to transfer hardware error information
* from BIOS to Linux can be determined only in NMI, IRQ or timer
......@@ -645,7 +661,7 @@ static bool ghes_do_proc(struct ghes *ghes,
if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
ghes_edac_report_mem_error(sev, mem_err);
atomic_notifier_call_chain(&ghes_report_chain, sev, mem_err);
arch_apei_report_mem_error(sev, mem_err);
queued = ghes_handle_memory_failure(gdata, sev);
......@@ -1382,7 +1398,11 @@ static int ghes_probe(struct platform_device *ghes_dev)
platform_set_drvdata(ghes_dev, ghes);
ghes_edac_register(ghes, &ghes_dev->dev);
ghes->dev = &ghes_dev->dev;
mutex_lock(&ghes_devs_mutex);
list_add_tail(&ghes->elist, &ghes_devs);
mutex_unlock(&ghes_devs_mutex);
/* Handle any pending errors right away */
spin_lock_irqsave(&ghes_notify_lock_irq, flags);
......@@ -1446,7 +1466,9 @@ static int ghes_remove(struct platform_device *ghes_dev)
ghes_fini(ghes);
ghes_edac_unregister(ghes);
mutex_lock(&ghes_devs_mutex);
list_del(&ghes->elist);
mutex_unlock(&ghes_devs_mutex);
kfree(ghes);
......@@ -1501,3 +1523,41 @@ void __init acpi_ghes_init(void)
else
pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n");
}
/*
 * Known x86 systems that prefer GHES error reporting.
 *
 * ghes_get_devices() passes this table to acpi_match_platform_list(); when
 * no entry matches on x86 and "ghes.edac_force_enable" is not set, the GHES
 * device list is not handed out.
 *
 * NOTE(review): the initializer is positional; the columns are presumably
 * OEM ID, OEM table ID, OEM revision, ACPI table signature and revision
 * predicate -- confirm against struct acpi_platform_list.
 */
static struct acpi_platform_list plat_list[] = {
{"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions},
{ } /* End */
};
/**
 * ghes_get_devices - Get the list of GHES devices for the ghes_edac driver
 *
 * On x86 the list is only handed out when the platform matches plat_list,
 * or when the user forced it with "ghes.edac_force_enable" (a one-time
 * warning is printed in that case).
 *
 * On all other architectures the list is handed out only when at least one
 * GHES device has been added to it. Callers such as the chipset-specific
 * EDAC drivers treat a non-NULL return as "GHES is preferred" and bail out
 * with -EBUSY, so returning an empty list here would block them even though
 * there is nothing for ghes_edac to drive.
 *
 * Return: pointer to the ghes_devs list head, or NULL when GHES based error
 * reporting should not be used.
 */
struct list_head *ghes_get_devices(void)
{
	if (IS_ENABLED(CONFIG_X86)) {
		if (acpi_match_platform_list(plat_list) < 0) {
			if (!ghes_edac_force_enable)
				return NULL;

			pr_warn_once("Force-loading ghes_edac on an unsupported platform. You're on your own!\n");
		}
	} else if (list_empty(&ghes_devs)) {
		/* No GHES devices: do not claim EDAC ownership. */
		return NULL;
	}

	return &ghes_devs;
}
EXPORT_SYMBOL_GPL(ghes_get_devices);
/**
 * ghes_register_report_chain - Subscribe to GHES memory error reports
 * @nb: notifier block to add to the ghes_report_chain atomic notifier chain
 *
 * ghes_do_proc() calls this chain for CPER_SEC_PLATFORM_MEM sections,
 * passing the severity as the notifier value and the
 * struct cper_sec_mem_err payload as the data pointer.
 */
void ghes_register_report_chain(struct notifier_block *nb)
{
atomic_notifier_chain_register(&ghes_report_chain, nb);
}
EXPORT_SYMBOL_GPL(ghes_register_report_chain);
/**
 * ghes_unregister_report_chain - Unsubscribe from GHES memory error reports
 * @nb: notifier block to remove from the ghes_report_chain atomic notifier
 *      chain
 */
void ghes_unregister_report_chain(struct notifier_block *nb)
{
atomic_notifier_chain_unregister(&ghes_report_chain, nb);
}
EXPORT_SYMBOL_GPL(ghes_unregister_report_chain);
......@@ -53,8 +53,8 @@ config EDAC_DECODE_MCE
has been initialized.
config EDAC_GHES
bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
depends on ACPI_APEI_GHES && (EDAC=y)
tristate "Output ACPI APEI/GHES BIOS detected errors via EDAC"
depends on ACPI_APEI_GHES
select UEFI_CPER
help
Not all machines support hardware-driven error report. Some of those
......@@ -211,6 +211,7 @@ config EDAC_R82600
config EDAC_I5000
tristate "Intel Greencreek/Blackford chipset"
depends on X86 && PCI
depends on BROKEN
help
Support for error detection and correction the Intel
Greekcreek/Blackford chipsets.
......
......@@ -4329,6 +4329,9 @@ static int __init amd64_edac_init(void)
int err = -ENODEV;
int i;
if (ghes_get_devices())
return -EBUSY;
owner = edac_get_owner();
if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
return -EBUSY;
......
......@@ -599,6 +599,9 @@ static int __init armada_xp_edac_init(void)
{
int res;
if (ghes_get_devices())
return -EBUSY;
/* only polling is supported */
edac_op_state = EDAC_OPSTATE_POLL;
......
......@@ -298,6 +298,14 @@ DEVICE_CHANNEL(ch6_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 6);
DEVICE_CHANNEL(ch7_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 7);
DEVICE_CHANNEL(ch8_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 8);
DEVICE_CHANNEL(ch9_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 9);
DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 10);
DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 11);
/* Total possible dynamic DIMM Label attribute file table */
static struct attribute *dynamic_csrow_dimm_attr[] = {
......@@ -309,6 +317,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = {
&dev_attr_legacy_ch5_dimm_label.attr.attr,
&dev_attr_legacy_ch6_dimm_label.attr.attr,
&dev_attr_legacy_ch7_dimm_label.attr.attr,
&dev_attr_legacy_ch8_dimm_label.attr.attr,
&dev_attr_legacy_ch9_dimm_label.attr.attr,
&dev_attr_legacy_ch10_dimm_label.attr.attr,
&dev_attr_legacy_ch11_dimm_label.attr.attr,
NULL
};
......@@ -329,6 +341,14 @@ DEVICE_CHANNEL(ch6_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 6);
DEVICE_CHANNEL(ch7_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 7);
DEVICE_CHANNEL(ch8_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 8);
DEVICE_CHANNEL(ch9_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 9);
DEVICE_CHANNEL(ch10_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 10);
DEVICE_CHANNEL(ch11_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 11);
/* Total possible dynamic ce_count attribute file table */
static struct attribute *dynamic_csrow_ce_count_attr[] = {
......@@ -340,6 +360,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = {
&dev_attr_legacy_ch5_ce_count.attr.attr,
&dev_attr_legacy_ch6_ce_count.attr.attr,
&dev_attr_legacy_ch7_ce_count.attr.attr,
&dev_attr_legacy_ch8_ce_count.attr.attr,
&dev_attr_legacy_ch9_ce_count.attr.attr,
&dev_attr_legacy_ch10_ce_count.attr.attr,
&dev_attr_legacy_ch11_ce_count.attr.attr,
NULL
};
......
......@@ -11,6 +11,7 @@
#ifndef __EDAC_MODULE_H__
#define __EDAC_MODULE_H__
#include <acpi/ghes.h>
#include "edac_mc.h"
#include "edac_pci.h"
#include "edac_device.h"
......
......@@ -14,6 +14,7 @@
#include <linux/dmi.h>
#include "edac_module.h"
#include <ras/ras_event.h>
#include <linux/notifier.h>
#define OTHER_DETAIL_LEN 400
......@@ -53,12 +54,10 @@ static DEFINE_MUTEX(ghes_reg_mutex);
*/
static DEFINE_SPINLOCK(ghes_lock);
/* "ghes_edac.force_load=1" skips the platform check */
static bool __read_mostly force_load;
module_param(force_load, bool, 0);
static bool system_scanned;
static struct list_head *ghes_devs;
/* Memory Device - Type 17 of SMBIOS spec */
struct memdev_dmi_entry {
u8 type;
......@@ -267,11 +266,14 @@ static int print_mem_error_other_detail(const struct cper_sec_mem_err *mem, char
return n;
}
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
static int ghes_edac_report_mem_error(struct notifier_block *nb,
unsigned long val, void *data)
{
struct cper_sec_mem_err *mem_err = (struct cper_sec_mem_err *)data;
struct cper_mem_err_compact cmem;
struct edac_raw_error_desc *e;
struct mem_ctl_info *mci;
unsigned long sev = val;
struct ghes_pvt *pvt;
unsigned long flags;
char *p;
......@@ -282,7 +284,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
* know.
*/
if (WARN_ON_ONCE(in_nmi()))
return;
return NOTIFY_OK;
spin_lock_irqsave(&ghes_lock, flags);
......@@ -374,36 +376,24 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
unlock:
spin_unlock_irqrestore(&ghes_lock, flags);
return NOTIFY_OK;
}
/*
* Known systems that are safe to enable this module.
*/
static struct acpi_platform_list plat_list[] = {
{"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions},
{ } /* End */
static struct notifier_block ghes_edac_mem_err_nb = {
.notifier_call = ghes_edac_report_mem_error,
.priority = 0,
};
int ghes_edac_register(struct ghes *ghes, struct device *dev)
static int ghes_edac_register(struct device *dev)
{
bool fake = false;
struct mem_ctl_info *mci;
struct ghes_pvt *pvt;
struct edac_mc_layer layers[1];
unsigned long flags;
int idx = -1;
int rc = 0;
if (IS_ENABLED(CONFIG_X86)) {
/* Check if safe to enable on this system */
idx = acpi_match_platform_list(plat_list);
if (!force_load && idx < 0)
return -ENODEV;
} else {
force_load = true;
idx = 0;
}
/* finish another registration/unregistration instance first */
mutex_lock(&ghes_reg_mutex);
......@@ -447,15 +437,10 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
pr_info("work on such system. Use this driver with caution\n");
} else if (idx < 0) {
pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
pr_info("If you find incorrect reports, please contact your hardware vendor\n");
pr_info("to correct its BIOS.\n");
pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms);
}
pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms);
if (!fake) {
struct dimm_info *src, *dst;
int i = 0;
......@@ -503,6 +488,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
ghes_pvt = pvt;
spin_unlock_irqrestore(&ghes_lock, flags);
ghes_register_report_chain(&ghes_edac_mem_err_nb);
/* only set on success */
refcount_set(&ghes_refcount, 1);
......@@ -517,14 +504,11 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
return rc;
}
void ghes_edac_unregister(struct ghes *ghes)
static void ghes_edac_unregister(struct ghes *ghes)
{
struct mem_ctl_info *mci;
unsigned long flags;
if (!force_load)
return;
mutex_lock(&ghes_reg_mutex);
system_scanned = false;
......@@ -548,6 +532,42 @@ void ghes_edac_unregister(struct ghes *ghes)
if (mci)
edac_mc_free(mci);
ghes_unregister_report_chain(&ghes_edac_mem_err_nb);
unlock:
mutex_unlock(&ghes_reg_mutex);
}
/*
 * Module entry point.
 *
 * Fetch the list of GHES devices from the GHES core and register each of
 * them with EDAC. Returns -ENODEV when the GHES core does not hand out a
 * list or when that list is empty, 0 otherwise.
 */
static int __init ghes_edac_init(void)
{
	struct ghes *g, *g_tmp;

	ghes_devs = ghes_get_devices();
	if (!ghes_devs)
		return -ENODEV;

	if (list_empty(ghes_devs)) {
		/* Terminate the message so it is not merged with the next printk. */
		pr_info("GHES probing device list is empty\n");
		return -ENODEV;
	}

	/*
	 * NOTE(review): the return value of ghes_edac_register() is ignored,
	 * so the module stays loaded even if registration fails -- confirm
	 * this is intended.
	 */
	list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
		ghes_edac_register(g->dev);
	}

	return 0;
}
module_init(ghes_edac_init);
/*
 * Module exit point: call ghes_edac_unregister() for every GHES device on
 * the list obtained at init time.
 */
static void __exit ghes_edac_exit(void)
{
	struct ghes *cur, *next;

	list_for_each_entry_safe(cur, next, ghes_devs, elist)
		ghes_edac_unregister(cur);
}
module_exit(ghes_edac_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Output ACPI APEI/GHES BIOS detected errors via EDAC");
......@@ -304,11 +304,10 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
if (unlikely(pci_enable_device(pdev) < 0)) {
edac_dbg(2, "Failed to enable device %02x:%02x.%x\n",
bus, dev, fun);
pci_dev_put(pdev);
return NULL;
}
pci_dev_get(pdev);
return pdev;
}
......@@ -756,6 +755,9 @@ static int __init i10nm_init(void)
edac_dbg(2, "\n");
if (ghes_get_devices())
return -EBUSY;
owner = edac_get_owner();
if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
return -EBUSY;
......
......@@ -279,7 +279,8 @@ static inline int from_nf_ferr(unsigned int mask)
#define FERR_NF_RECOVERABLE to_nf_mask(ERROR_NF_RECOVERABLE)
#define FERR_NF_UNCORRECTABLE to_nf_mask(ERROR_NF_UNCORRECTABLE)
/* Defines to extract the vaious fields from the
/*
* Defines to extract the various fields from the
* MTRx - Memory Technology Registers
*/
#define MTR_DIMMS_PRESENT(mtr) ((mtr) & (1 << 10))
......
......@@ -1271,9 +1271,12 @@ static int __init igen6_init(void)
edac_dbg(2, "\n");
if (ghes_get_devices())
return -EBUSY;
owner = edac_get_owner();
if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
return -ENODEV;
return -EBUSY;
edac_op_state = EDAC_OPSTATE_NMI;
......
......@@ -38,6 +38,9 @@ static int __init fsl_ddr_mc_init(void)
{
int res;
if (ghes_get_devices())
return -EBUSY;
/* make sure error reporting method is sane */
switch (edac_op_state) {
case EDAC_OPSTATE_POLL:
......
......@@ -1528,6 +1528,9 @@ static int __init pnd2_init(void)
edac_dbg(2, "\n");
if (ghes_get_devices())
return -EBUSY;
owner = edac_get_owner();
if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
return -EBUSY;
......
......@@ -3634,6 +3634,9 @@ static int __init sbridge_init(void)
edac_dbg(2, "\n");
if (ghes_get_devices())
return -EBUSY;
owner = edac_get_owner();
if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
return -EBUSY;
......
......@@ -653,6 +653,9 @@ static int __init skx_init(void)
edac_dbg(2, "\n");
if (ghes_get_devices())
return -EBUSY;
owner = edac_get_owner();
if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
return -EBUSY;
......
......@@ -2114,6 +2114,9 @@ static int __init thunderx_edac_init(void)
{
int rc = 0;
if (ghes_get_devices())
return -EBUSY;
rc = pci_register_driver(&thunderx_lmc_driver);
if (rc)
return rc;
......
......@@ -2004,6 +2004,9 @@ static int __init xgene_edac_init(void)
{
int rc;
if (ghes_get_devices())
return -EBUSY;
/* Make sure error reporting method is sane */
switch (edac_op_state) {
case EDAC_OPSTATE_POLL:
......
......@@ -290,6 +290,7 @@ int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
return n;
}
EXPORT_SYMBOL_GPL(cper_mem_err_location);
int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
{
......@@ -310,6 +311,7 @@ int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
return n;
}
EXPORT_SYMBOL_GPL(cper_dimm_err_location);
void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
struct cper_mem_err_compact *cmem)
......@@ -331,6 +333,7 @@ void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
cmem->mem_array_handle = mem->mem_array_handle;
cmem->mem_dev_handle = mem->mem_dev_handle;
}
EXPORT_SYMBOL_GPL(cper_mem_err_pack);
const char *cper_mem_err_unpack(struct trace_seq *p,
struct cper_mem_err_compact *cmem)
......
......@@ -27,6 +27,8 @@ struct ghes {
struct timer_list timer;
unsigned int irq;
};
struct device *dev;
struct list_head elist;
};
struct ghes_estatus_node {
......@@ -69,35 +71,14 @@ int ghes_register_vendor_record_notifier(struct notifier_block *nb);
* @nb: pointer to the notifier_block structure of the vendor record handler.
*/
void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);
#endif
int ghes_estatus_pool_init(unsigned int num_ghes);
/* From drivers/edac/ghes_edac.c */
#ifdef CONFIG_EDAC_GHES
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err);
int ghes_edac_register(struct ghes *ghes, struct device *dev);
void ghes_edac_unregister(struct ghes *ghes);
struct list_head *ghes_get_devices(void);
#else
static inline void ghes_edac_report_mem_error(int sev,
struct cper_sec_mem_err *mem_err)
{
}
static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
return -ENODEV;
}
static inline void ghes_edac_unregister(struct ghes *ghes)
{
}
static inline struct list_head *ghes_get_devices(void) { return NULL; }
#endif
int ghes_estatus_pool_init(unsigned int num_ghes);
static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
{
return gdata->revision >> 8;
......@@ -145,4 +126,7 @@ int ghes_notify_sea(void);
static inline int ghes_notify_sea(void) { return -ENOENT; }
#endif
struct notifier_block;
extern void ghes_register_report_chain(struct notifier_block *nb);
extern void ghes_unregister_report_chain(struct notifier_block *nb);
#endif /* GHES_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment