Commit e9918d7f authored by Thor Thayer, committed by Borislav Petkov

EDAC, altera: Handle SDRAM Uncorrectable Errors on Stratix10

On Stratix10, uncorrectable errors are routed to the SError exception
instead of the IRQ exceptions. In Stratix10, uncorrectable SErrors
must be treated as fatal and will cause a panic. Older Altera/Intel
parts printed out a message for UE so do that here using the notifier
framework.

Record the UE in sticky registers that retain the state through a reset.
Check these registers on probe and print out the error on startup.

Signed-off-by: Thor Thayer <thor.thayer@linux.intel.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: mark.rutland@arm.com
Cc: mchehab@kernel.org
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1526079610-5527-1-git-send-email-thor.thayer@linux.intel.com
[ Remove unused var in s10_edac_dberr_handler(), reorder args. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
parent f8eb0ede
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <linux/irqchip/chained_irq.h> #include <linux/irqchip/chained_irq.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/mfd/syscon.h> #include <linux/mfd/syscon.h>
#include <linux/notifier.h>
#include <linux/of_address.h> #include <linux/of_address.h>
#include <linux/of_irq.h> #include <linux/of_irq.h>
#include <linux/of_platform.h> #include <linux/of_platform.h>
...@@ -725,6 +726,13 @@ static int altr_s10_sdram_probe(struct platform_device *pdev) ...@@ -725,6 +726,13 @@ static int altr_s10_sdram_probe(struct platform_device *pdev)
goto err2; goto err2;
} }
if (regmap_write(regmap, S10_SYSMGR_ECC_INTMASK_CLR_OFST,
S10_DDR0_IRQ_MASK)) {
edac_printk(KERN_ERR, EDAC_MC,
"Error clearing SDRAM ECC count\n");
return -ENODEV;
}
if (regmap_update_bits(drvdata->mc_vbase, priv->ecc_irq_en_offset, if (regmap_update_bits(drvdata->mc_vbase, priv->ecc_irq_en_offset,
priv->ecc_irq_en_mask, priv->ecc_irq_en_mask)) { priv->ecc_irq_en_mask, priv->ecc_irq_en_mask)) {
edac_mc_printk(mci, KERN_ERR, edac_mc_printk(mci, KERN_ERR,
...@@ -2228,23 +2236,50 @@ module_platform_driver(altr_edac_a10_driver); ...@@ -2228,23 +2236,50 @@ module_platform_driver(altr_edac_a10_driver);
/************** Stratix 10 EDAC Device Controller Functions> ************/ /************** Stratix 10 EDAC Device Controller Functions> ************/
/* Map a pointer p to member m back to its enclosing struct altr_stratix10_edac */
#define to_s10edac(p, m) container_of(p, struct altr_stratix10_edac, m)
/*
 * Panic notifier callback for Stratix10 double bit (uncorrectable) errors.
 *
 * Double bit errors are delivered through SError, which is fatal, so this
 * is invoked as part of the panic. It copies the double bit error status
 * into the sticky UE value register and, when the DDR0 bit is set, also
 * stores the error address in the sticky UE address register and prints it.
 *
 * @this:  notifier block embedded in struct altr_stratix10_edac
 * @event: not used
 * @ptr:   not used
 *
 * Always returns NOTIFY_DONE.
 */
static int s10_edac_dberr_handler(struct notifier_block *this,
				  unsigned long event, void *ptr)
{
	struct altr_stratix10_edac *s10 = to_s10edac(this, panic_notifier);
	int ue_status, ue_addr;

	/* Save the UE status so it can be reported after the reboot */
	s10_protected_reg_read(s10, S10_SYSMGR_ECC_INTSTAT_DERR_OFST,
			       &ue_status);
	s10_protected_reg_write(s10, S10_SYSMGR_UE_VAL_OFST, ue_status);

	/* Nothing more to record unless the DDR0 bit is set */
	if (!(ue_status & S10_DDR0_IRQ_MASK))
		return NOTIFY_DONE;

	/* Save the UE address as well, then report it in the panic output */
	s10_protected_reg_read(s10, S10_DERRADDR_OFST, &ue_addr);
	s10_protected_reg_write(s10, S10_SYSMGR_UE_ADDR_OFST, ue_addr);
	edac_printk(KERN_ERR, EDAC_MC,
		    "EDAC: [Uncorrectable errors @ 0x%08X]\n\n",
		    ue_addr);

	return NOTIFY_DONE;
}
static void altr_edac_s10_irq_handler(struct irq_desc *desc) static void altr_edac_s10_irq_handler(struct irq_desc *desc)
{ {
int dberr, bit, sm_offset, irq_status;
struct altr_stratix10_edac *edac = irq_desc_get_handler_data(desc); struct altr_stratix10_edac *edac = irq_desc_get_handler_data(desc);
struct irq_chip *chip = irq_desc_get_chip(desc); struct irq_chip *chip = irq_desc_get_chip(desc);
int irq = irq_desc_get_irq(desc); int irq = irq_desc_get_irq(desc);
int bit, sm_offset, irq_status;
dberr = (irq == edac->db_irq) ? 1 : 0; sm_offset = S10_SYSMGR_ECC_INTSTAT_SERR_OFST;
sm_offset = dberr ? S10_SYSMGR_ECC_INTSTAT_DERR_OFST :
S10_SYSMGR_ECC_INTSTAT_SERR_OFST;
chained_irq_enter(chip, desc); chained_irq_enter(chip, desc);
s10_protected_reg_read(NULL, sm_offset, &irq_status); s10_protected_reg_read(NULL, sm_offset, &irq_status);
for_each_set_bit(bit, (unsigned long *)&irq_status, 32) { for_each_set_bit(bit, (unsigned long *)&irq_status, 32) {
irq = irq_linear_revmap(edac->domain, dberr * 32 + bit); irq = irq_linear_revmap(edac->domain, bit);
if (irq) if (irq)
generic_handle_irq(irq); generic_handle_irq(irq);
} }
...@@ -2289,6 +2324,7 @@ static int altr_edac_s10_probe(struct platform_device *pdev) ...@@ -2289,6 +2324,7 @@ static int altr_edac_s10_probe(struct platform_device *pdev)
{ {
struct altr_stratix10_edac *edac; struct altr_stratix10_edac *edac;
struct device_node *child; struct device_node *child;
int dberror, err_addr;
edac = devm_kzalloc(&pdev->dev, sizeof(*edac), GFP_KERNEL); edac = devm_kzalloc(&pdev->dev, sizeof(*edac), GFP_KERNEL);
if (!edac) if (!edac)
...@@ -2318,11 +2354,22 @@ static int altr_edac_s10_probe(struct platform_device *pdev) ...@@ -2318,11 +2354,22 @@ static int altr_edac_s10_probe(struct platform_device *pdev)
altr_edac_s10_irq_handler, altr_edac_s10_irq_handler,
edac); edac);
edac->db_irq = platform_get_irq(pdev, 1); edac->panic_notifier.notifier_call = s10_edac_dberr_handler;
if (edac->db_irq >= 0) atomic_notifier_chain_register(&panic_notifier_list,
irq_set_chained_handler_and_data(edac->db_irq, &edac->panic_notifier);
altr_edac_s10_irq_handler,
edac); /* Printout a message if uncorrectable error previously. */
s10_protected_reg_read(edac, S10_SYSMGR_UE_VAL_OFST, &dberror);
if (dberror) {
s10_protected_reg_read(edac, S10_SYSMGR_UE_ADDR_OFST,
&err_addr);
edac_printk(KERN_ERR, EDAC_DEVICE,
"Previous Boot UE detected[0x%X] @ 0x%X\n",
dberror, err_addr);
/* Reset the sticky registers */
s10_protected_reg_write(edac, S10_SYSMGR_UE_VAL_OFST, 0);
s10_protected_reg_write(edac, S10_SYSMGR_UE_ADDR_OFST, 0);
}
for_each_child_of_node(pdev->dev.of_node, child) { for_each_child_of_node(pdev->dev.of_node, child) {
if (!of_device_is_available(child)) if (!of_device_is_available(child))
......
...@@ -180,6 +180,10 @@ ...@@ -180,6 +180,10 @@
/* SDRAM Single Bit Error Count Compare Set Register */ /* SDRAM Single Bit Error Count Compare Set Register */
#define S10_SERRCNTREG_OFST 0xF801113C #define S10_SERRCNTREG_OFST 0xF801113C
/* Sticky registers for Uncorrected Errors */
#define S10_SYSMGR_UE_VAL_OFST 0xFFD12220
#define S10_SYSMGR_UE_ADDR_OFST 0xFFD12224
struct altr_sdram_prv_data { struct altr_sdram_prv_data {
int ecc_ctrl_offset; int ecc_ctrl_offset;
int ecc_ctl_en_mask; int ecc_ctl_en_mask;
...@@ -322,6 +326,8 @@ struct altr_sdram_mc_data { ...@@ -322,6 +326,8 @@ struct altr_sdram_mc_data {
#define S10_SYSMGR_ECC_INTSTAT_SERR_OFST 0xFFD1209C #define S10_SYSMGR_ECC_INTSTAT_SERR_OFST 0xFFD1209C
#define S10_SYSMGR_ECC_INTSTAT_DERR_OFST 0xFFD120A0 #define S10_SYSMGR_ECC_INTSTAT_DERR_OFST 0xFFD120A0
#define S10_DDR0_IRQ_MASK BIT(16)
struct altr_edac_device_dev; struct altr_edac_device_dev;
struct edac_device_prv_data { struct edac_device_prv_data {
...@@ -434,10 +440,10 @@ struct altr_arria10_edac { ...@@ -434,10 +440,10 @@ struct altr_arria10_edac {
struct altr_stratix10_edac { struct altr_stratix10_edac {
struct device *dev; struct device *dev;
int sb_irq; int sb_irq;
int db_irq;
struct irq_domain *domain; struct irq_domain *domain;
struct irq_chip irq_chip; struct irq_chip irq_chip;
struct list_head s10_ecc_devices; struct list_head s10_ecc_devices;
struct notifier_block panic_notifier;
}; };
#endif /* #ifndef _ALTERA_EDAC_H */ #endif /* #ifndef _ALTERA_EDAC_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment