Commit 46e2231c authored by Candice Li, committed by Alex Deucher

drm/amdgpu: Log deferred error separately

Separate deferred error from UE and CE and log it
individually.
Signed-off-by: Candice Li <candice.li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 9c97bf88
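
In short: MCUMC_STATUS values with the Deferred bit set are split out of the CE path into a new de_count counter, and the umc_v12_0_is_* predicates become mutually exclusive (deferred is tested first; the UE and CE checks now return false for deferred status values). A minimal sketch of the resulting three-way classification; the err_kind enum and classify() helper are illustrative only, not part of the patch:

/* Sketch of the classification order this patch establishes. The
 * umc_v12_0_is_* predicates are the real functions changed below;
 * enum err_kind and classify() are hypothetical. */
enum err_kind { ERR_NONE, ERR_DEFERRED, ERR_UE, ERR_CE };

static enum err_kind classify(struct amdgpu_device *adev, uint64_t status)
{
	if (umc_v12_0_is_deferred_error(adev, status))
		return ERR_DEFERRED;	/* poisoned data, handled later */
	if (umc_v12_0_is_uncorrectable_error(adev, status))
		return ERR_UE;		/* fatal, uncorrected */
	if (umc_v12_0_is_correctable_error(adev, status))
		return ERR_CE;		/* corrected by hardware */
	return ERR_NONE;
}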
@@ -256,9 +256,14 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
 		if (type == AMDGPU_MCA_ERROR_TYPE_UE)
 			amdgpu_ras_error_statistic_ue_count(err_data,
 				&mcm_info, &err_addr, (uint64_t)count);
-		else
-			amdgpu_ras_error_statistic_ce_count(err_data,
-				&mcm_info, &err_addr, (uint64_t)count);
+		else {
+			if (!!(MCA_REG__STATUS__DEFERRED(entry->regs[MCA_REG_IDX_STATUS])))
+				amdgpu_ras_error_statistic_de_count(err_data,
+					&mcm_info, &err_addr, (uint64_t)count);
+			else
+				amdgpu_ras_error_statistic_ce_count(err_data,
+					&mcm_info, &err_addr, (uint64_t)count);
+		}
 	}
 
 out_mca_release:
...
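
The deferred check above reads the raw MCA status word through MCA_REG__STATUS__DEFERRED(). A standalone sketch of that test, assuming the AMD MCA layout in which bit 44 of MCA_STATUS is the Deferred flag (the bit position is the assumption here; mca_status_is_deferred() is a hypothetical name):

#include <stdbool.h>
#include <stdint.h>

/* Assumption: Deferred is bit 44 of the 64-bit MCA status register. */
#define MCA_STATUS_DEFERRED_BIT	44

static bool mca_status_is_deferred(uint64_t status)
{
	return (status >> MCA_STATUS_DEFERRED_BIT) & 0x1ULL;
}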
@@ -1036,7 +1036,8 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
 					  struct ras_manager *ras_mgr,
 					  struct ras_err_data *err_data,
 					  const char *blk_name,
-					  bool is_ue)
+					  bool is_ue,
+					  bool is_de)
 {
 	struct amdgpu_smuio_mcm_config_info *mcm_info;
 	struct ras_err_node *err_node;
@@ -1065,25 +1066,50 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
 		}
 	} else {
-		for_each_ras_error(err_node, err_data) {
-			err_info = &err_node->err_info;
-			mcm_info = &err_info->mcm_info;
-			if (err_info->ce_count) {
+		if (is_de) {
+			for_each_ras_error(err_node, err_data) {
+				err_info = &err_node->err_info;
+				mcm_info = &err_info->mcm_info;
+				if (err_info->de_count) {
+					dev_info(adev->dev, "socket: %d, die: %d, "
+						"%lld new deferred hardware errors detected in %s block\n",
+						mcm_info->socket_id,
+						mcm_info->die_id,
+						err_info->de_count,
+						blk_name);
+				}
+			}
+
+			for_each_ras_error(err_node, &ras_mgr->err_data) {
+				err_info = &err_node->err_info;
+				mcm_info = &err_info->mcm_info;
 				dev_info(adev->dev, "socket: %d, die: %d, "
-					 "%lld new correctable hardware errors detected in %s block\n",
-					 mcm_info->socket_id,
-					 mcm_info->die_id,
-					 err_info->ce_count,
-					 blk_name);
+					"%lld deferred hardware errors detected in total in %s block\n",
+					mcm_info->socket_id, mcm_info->die_id,
+					err_info->de_count, blk_name);
+			}
+		} else {
+			for_each_ras_error(err_node, err_data) {
+				err_info = &err_node->err_info;
+				mcm_info = &err_info->mcm_info;
+				if (err_info->ce_count) {
+					dev_info(adev->dev, "socket: %d, die: %d, "
+						"%lld new correctable hardware errors detected in %s block\n",
+						mcm_info->socket_id,
+						mcm_info->die_id,
+						err_info->ce_count,
+						blk_name);
+				}
 			}
-		}
 
-		for_each_ras_error(err_node, &ras_mgr->err_data) {
-			err_info = &err_node->err_info;
-			mcm_info = &err_info->mcm_info;
-			dev_info(adev->dev, "socket: %d, die: %d, "
-				 "%lld correctable hardware errors detected in total in %s block\n",
-				 mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name);
+			for_each_ras_error(err_node, &ras_mgr->err_data) {
+				err_info = &err_node->err_info;
+				mcm_info = &err_info->mcm_info;
+				dev_info(adev->dev, "socket: %d, die: %d, "
+					"%lld correctable hardware errors detected in total in %s block\n",
+					mcm_info->socket_id, mcm_info->die_id,
+					err_info->ce_count, blk_name);
+			}
 		}
 	}
 }
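
With is_de set, the branch above produces per-die dmesg lines of the following shape; the counts and the "umc" block name are illustrative values, rendered from the format strings in the hunk:

socket: 0, die: 0, 2 new deferred hardware errors detected in umc block
socket: 0, die: 0, 5 deferred hardware errors detected in total in umc block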
@@ -1102,7 +1128,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
 	if (err_data->ce_count) {
 		if (err_data_has_source_info(err_data)) {
-			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false);
+			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+							  blk_name, false, false);
 		} else if (!adev->aid_mask &&
 			   adev->smuio.funcs &&
 			   adev->smuio.funcs->get_socket_id &&
@@ -1124,7 +1151,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
 	if (err_data->ue_count) {
 		if (err_data_has_source_info(err_data)) {
-			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true);
+			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+							  blk_name, true, false);
 		} else if (!adev->aid_mask &&
 			   adev->smuio.funcs &&
 			   adev->smuio.funcs->get_socket_id &&
@@ -1144,6 +1172,28 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
 		}
 	}
 
+	if (err_data->de_count) {
+		if (err_data_has_source_info(err_data)) {
+			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+							  blk_name, false, true);
+		} else if (!adev->aid_mask &&
+			   adev->smuio.funcs &&
+			   adev->smuio.funcs->get_socket_id &&
+			   adev->smuio.funcs->get_die_id) {
+			dev_info(adev->dev, "socket: %d, die: %d "
+				 "%ld deferred hardware errors "
+				 "detected in %s block\n",
+				 adev->smuio.funcs->get_socket_id(adev),
+				 adev->smuio.funcs->get_die_id(adev),
+				 ras_mgr->err_data.de_count,
+				 blk_name);
+		} else {
+			dev_info(adev->dev, "%ld deferred hardware errors "
+				 "detected in %s block\n",
+				 ras_mgr->err_data.de_count,
+				 blk_name);
+		}
+	}
 }
 
 static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
@@ -1154,7 +1204,8 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
 	if (err_data_has_source_info(err_data)) {
 		for_each_ras_error(err_node, err_data) {
 			err_info = &err_node->err_info;
+			amdgpu_ras_error_statistic_de_count(&obj->err_data,
+					&err_info->mcm_info, NULL, err_info->de_count);
 			amdgpu_ras_error_statistic_ce_count(&obj->err_data,
 					&err_info->mcm_info, NULL, err_info->ce_count);
 			amdgpu_ras_error_statistic_ue_count(&obj->err_data,
@@ -1164,6 +1215,7 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
 		/* for legacy asic path which doesn't has error source info */
 		obj->err_data.ue_count += err_data->ue_count;
 		obj->err_data.ce_count += err_data->ce_count;
+		obj->err_data.de_count += err_data->de_count;
 	}
 }
@@ -1312,6 +1364,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
 	info->ue_count = obj->err_data.ue_count;
 	info->ce_count = obj->err_data.ce_count;
+	info->de_count = obj->err_data.de_count;
 
 	amdgpu_ras_error_generate_report(adev, info, &err_data);
@@ -2029,6 +2082,7 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
 		 */
 		obj->err_data.ue_count += err_data.ue_count;
 		obj->err_data.ce_count += err_data.ce_count;
+		obj->err_data.de_count += err_data.de_count;
 	}
 
 	amdgpu_ras_error_data_fini(&err_data);
@@ -3872,6 +3926,28 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
 	return 0;
 }
 
+int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
+		struct amdgpu_smuio_mcm_config_info *mcm_info,
+		struct ras_err_addr *err_addr, u64 count)
+{
+	struct ras_err_info *err_info;
+
+	if (!err_data || !mcm_info)
+		return -EINVAL;
+
+	if (!count)
+		return 0;
+
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+	if (!err_info)
+		return -EINVAL;
+
+	err_info->de_count += count;
+	err_data->de_count += count;
+
+	return 0;
+}
+
 #define mmMP0_SMN_C2PMSG_92	0x1609C
 #define mmMP0_SMN_C2PMSG_126	0x160BE
 
 static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
...
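
amdgpu_ras_error_statistic_de_count() above mirrors its CE/UE siblings: it bumps both the per-die err_info counter and the aggregate err_data counter, returns -EINVAL for a missing err_data or mcm_info, and treats count == 0 as a successful no-op. A hedged usage sketch; err_data and adev are assumed to come from the surrounding query path, as in the umc_v12_0_query_error_count() hunk further down:

/* Sketch: attribute one deferred error to a given socket/die pair. */
struct amdgpu_smuio_mcm_config_info mcm_info = {
	.socket_id = 0,	/* illustrative IDs */
	.die_id = 0,
};

if (amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, 1))
	dev_warn(adev->dev, "failed to record deferred error count\n");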
@@ -476,6 +476,7 @@ struct ras_err_info {
 	struct amdgpu_smuio_mcm_config_info mcm_info;
 	u64 ce_count;
 	u64 ue_count;
+	u64 de_count;
 	struct ras_err_addr err_addr;
 };
@@ -487,6 +488,7 @@ struct ras_err_node {
 struct ras_err_data {
 	unsigned long ue_count;
 	unsigned long ce_count;
+	unsigned long de_count;
 	unsigned long err_addr_cnt;
 	struct eeprom_table_record *err_addr;
 	u32 err_list_count;
@@ -564,6 +566,7 @@ struct ras_query_if {
 	struct ras_common_if head;
 	unsigned long ue_count;
 	unsigned long ce_count;
+	unsigned long de_count;
 };
 
 struct ras_inject_if {
@@ -835,6 +838,9 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 		struct amdgpu_smuio_mcm_config_info *mcm_info,
 		struct ras_err_addr *err_addr, u64 count);
+int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
+		struct amdgpu_smuio_mcm_config_info *mcm_info,
+		struct ras_err_addr *err_addr, u64 count);
 
 void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances);
 
 int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
 			const struct aca_info *aca_info, void *data);
...
@@ -208,6 +208,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 		if (ret == AMDGPU_RAS_SUCCESS && obj) {
 			obj->err_data.ue_count += err_data.ue_count;
 			obj->err_data.ce_count += err_data.ce_count;
+			obj->err_data.de_count += err_data.de_count;
 		}
 
 	amdgpu_ras_error_data_fini(&err_data);
...
@@ -89,12 +89,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
 		umc_v12_0_reset_error_count_per_channel, NULL);
 }
 
+bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
+{
+	return (amdgpu_ras_is_poison_mode_supported(adev) &&
+		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
+		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1));
+}
+
 bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
 {
-	if (amdgpu_ras_is_poison_mode_supported(adev) &&
-	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
-	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1))
-		return true;
+	if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
+		return false;
 
 	return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
 		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
@@ -104,9 +109,7 @@ bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_um
 bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
 {
-	if (amdgpu_ras_is_poison_mode_supported(adev) &&
-	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
-	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1))
+	if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
 		return false;
 
 	return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
@@ -119,29 +122,10 @@ bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_
 		!(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)))));
 }
 
-static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev,
-						   uint64_t umc_reg_offset,
-						   unsigned long *error_count)
-{
-	uint64_t mc_umc_status;
-	uint64_t mc_umc_status_addr;
-
-	mc_umc_status_addr =
-		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
-
-	/* Rely on MCUMC_STATUS for correctable error counter
-	 * MCUMC_STATUS is a 64 bit register
-	 */
-	mc_umc_status =
-		RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
-
-	if (umc_v12_0_is_correctable_error(adev, mc_umc_status))
-		*error_count += 1;
-}
-
-static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev,
-						      uint64_t umc_reg_offset,
-						      unsigned long *error_count)
+static void umc_v12_0_query_error_count_per_type(struct amdgpu_device *adev,
+						uint64_t umc_reg_offset,
+						unsigned long *error_count,
+						check_error_type_func error_type_func)
 {
 	uint64_t mc_umc_status;
 	uint64_t mc_umc_status_addr;
@@ -149,11 +133,11 @@ static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev
 	mc_umc_status_addr =
 		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
 
-	/* Check the MCUMC_STATUS. */
+	/* Check MCUMC_STATUS */
 	mc_umc_status =
 		RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
 
-	if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status))
+	if (error_type_func(adev, mc_umc_status))
 		*error_count += 1;
 }
@@ -162,7 +146,7 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
 					uint32_t ch_inst, void *data)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)data;
-	unsigned long ue_count = 0, ce_count = 0;
+	unsigned long ue_count = 0, ce_count = 0, de_count = 0;
 
 	/* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3],
 	 * which can be used as die ID directly */
@@ -174,11 +158,16 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
 	uint64_t umc_reg_offset =
 		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
 
-	umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count);
-	umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count);
+	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
+					    &ce_count, umc_v12_0_is_correctable_error);
+	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
+					    &ue_count, umc_v12_0_is_uncorrectable_error);
+	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
+					    &de_count, umc_v12_0_is_deferred_error);
 
 	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
 	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
+	amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, de_count);
 
 	return 0;
 }
@@ -392,7 +381,8 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
 		if (!mc_umc_status)
 			continue;
 
-		if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) {
+		if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
+		    umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
 			uint64_t mca_addr, err_addr, mca_ipid;
 			uint32_t InstanceIdLo;
 			struct amdgpu_smuio_mcm_config_info *mcm_info;
...
@@ -121,9 +121,12 @@
 	 (((_ipid_lo) >> 12) & 0xF))
 #define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
 
+bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 
+typedef bool (*check_error_type_func)(struct amdgpu_device *adev, uint64_t mc_umc_status);
+
 extern const uint32_t
 	umc_v12_0_channel_idx_tbl[]
 			[UMC_V12_0_UMC_INSTANCE_NUM]
...
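
The check_error_type_func typedef above is what lets the two near-identical per-channel counters collapse into a single umc_v12_0_query_error_count_per_type(): the MCUMC_STATUS register is read once and the caller-supplied predicate decides whether the value feeds this counter. A condensed sketch of the pattern; read_status() is a hypothetical stub for the SOC15_REG_OFFSET/RREG64_PCIE_EXT access in the real function:

typedef bool (*check_error_type_func)(struct amdgpu_device *adev,
				      uint64_t mc_umc_status);

/* Hypothetical stub standing in for the MCUMC_STATUS register read. */
static uint64_t read_status(struct amdgpu_device *adev, uint64_t umc_reg_offset);

static void query_count(struct amdgpu_device *adev, uint64_t umc_reg_offset,
			unsigned long *error_count, check_error_type_func check)
{
	uint64_t status = read_status(adev, umc_reg_offset);

	if (check(adev, status))
		*error_count += 1;
}

Each call site then passes umc_v12_0_is_correctable_error, umc_v12_0_is_uncorrectable_error, or umc_v12_0_is_deferred_error, as the umc_v12_0_query_error_count() hunk above does.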
@@ -2557,9 +2557,9 @@ static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct
 		return 0;
 	}
 
-	if (type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0))
-		*count = 1;
-	else if (type == AMDGPU_MCA_ERROR_TYPE_CE && umc_v12_0_is_correctable_error(adev, status0))
+	if ((type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0)) ||
+	    (type == AMDGPU_MCA_ERROR_TYPE_CE && (umc_v12_0_is_correctable_error(adev, status0) ||
+	    umc_v12_0_is_deferred_error(adev, status0))))
 		*count = 1;
 
 	return 0;
...