Commit ccb5ecdc authored by Xiaofei Tan's avatar Xiaofei Tan Committed by Rafael J. Wysocki

ACPI: APEI: fix synchronous external aborts in user-mode

Before commit 8fcc4ae6 ("arm64: acpi: Make apei_claim_sea()
synchronise with APEI's irq work"), do_sea() would unconditionally
signal the affected task from the arch code. Since that change,
the GHES driver sends the signals.

This exposes a problem as errors the GHES driver doesn't understand
or doesn't handle effectively are silently ignored. It will cause
the errors get taken again, and circulate endlessly. User-space task
get stuck in this loop.

Existing firmware on Kunpeng9xx systems reports cache errors with the
'ARM Processor Error' CPER records.

Do memory failure handling for ARM Processor Error Section just like
for Memory Error Section.

Fixes: 8fcc4ae6 ("arm64: acpi: Make apei_claim_sea() synchronise with APEI's irq work")
Signed-off-by: default avatarXiaofei Tan <tanxiaofei@huawei.com>
Reviewed-by: default avatarJames Morse <james.morse@arm.com>
[ rjw: Subject edit ]
Signed-off-by: default avatarRafael J. Wysocki <rafael.j.wysocki@intel.com>
parent b7a732a7
......@@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
}
static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
int sev)
static bool ghes_do_memory_failure(u64 physical_addr, int flags)
{
unsigned long pfn;
int flags = -1;
int sec_sev = ghes_severity(gdata->error_severity);
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
return false;
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return false;
pfn = mem_err->physical_addr >> PAGE_SHIFT;
pfn = PHYS_PFN(physical_addr);
if (!pfn_valid(pfn)) {
pr_warn_ratelimited(FW_WARN GHES_PFX
"Invalid address in generic error data: %#llx\n",
mem_err->physical_addr);
physical_addr);
return false;
}
memory_failure_queue(pfn, flags);
return true;
}
static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
int sev)
{
int flags = -1;
int sec_sev = ghes_severity(gdata->error_severity);
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return false;
/* iff following two events can be handled properly by now */
if (sec_sev == GHES_SEV_CORRECTED &&
(gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
......@@ -470,14 +477,56 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
flags = 0;
if (flags != -1) {
memory_failure_queue(pfn, flags);
return true;
}
if (flags != -1)
return ghes_do_memory_failure(mem_err->physical_addr, flags);
return false;
}
static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
{
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
bool queued = false;
int sec_sev, i;
char *p;
log_arm_hw_error(err);
sec_sev = ghes_severity(gdata->error_severity);
if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
return false;
p = (char *)(err + 1);
for (i = 0; i < err->err_info_num; i++) {
struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
const char *error_type = "unknown error";
/*
* The field (err_info->error_info & BIT(26)) is fixed to set to
* 1 in some old firmware of HiSilicon Kunpeng920. We assume that
* firmware won't mix corrected errors in an uncorrected section,
* and don't filter out 'corrected' error here.
*/
if (is_cache && has_pa) {
queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0);
p += err_info->length;
continue;
}
if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
error_type = cper_proc_error_type_strs[err_info->type];
pr_warn_ratelimited(FW_WARN GHES_PFX
"Unhandled processor error type: %s\n",
error_type);
p += err_info->length;
}
return queued;
}
/*
* PCIe AER errors need to be sent to the AER driver for reporting and
* recovery. The GHES severities map to the following AER severities and
......@@ -605,9 +654,7 @@ static bool ghes_do_proc(struct ghes *ghes,
ghes_handle_aer(gdata);
}
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
log_arm_hw_error(err);
queued = ghes_handle_arm_hw_error(gdata, sev);
} else {
void *err = acpi_hest_get_payload(gdata);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment