Commit ec3e0a91, authored by Yang Wang, committed by Alex Deucher

drm/amdgpu: refine ras error kernel log print

Refine the RAS error kernel log messages to avoid ambiguity for users: newly detected errors and accumulated totals are now reported separately.
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 53d4d779
...@@ -635,8 +635,11 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, ...@@ -635,8 +635,11 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
static inline void put_obj(struct ras_manager *obj) static inline void put_obj(struct ras_manager *obj)
{ {
if (obj && (--obj->use == 0)) if (obj && (--obj->use == 0)) {
list_del(&obj->node); list_del(&obj->node);
amdgpu_ras_error_data_fini(&obj->err_data);
}
if (obj && (obj->use < 0)) if (obj && (obj->use < 0))
DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
} }
...@@ -666,6 +669,9 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, ...@@ -666,6 +669,9 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
if (alive_obj(obj)) if (alive_obj(obj))
return NULL; return NULL;
if (amdgpu_ras_error_data_init(&obj->err_data))
return NULL;
obj->head = *head; obj->head = *head;
obj->adev = adev; obj->adev = adev;
list_add(&obj->node, &con->head); list_add(&obj->node, &con->head);
...@@ -1023,42 +1029,66 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d ...@@ -1023,42 +1029,66 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
} }
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
struct ras_query_if *query_if, struct ras_manager *ras_mgr,
struct ras_err_data *err_data, struct ras_err_data *err_data,
const char *blk_name,
bool is_ue) bool is_ue)
{ {
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
const char *blk_name = get_ras_block_str(&query_if->head);
struct amdgpu_smuio_mcm_config_info *mcm_info; struct amdgpu_smuio_mcm_config_info *mcm_info;
struct ras_err_node *err_node; struct ras_err_node *err_node;
struct ras_err_info *err_info; struct ras_err_info *err_info;
if (is_ue) if (is_ue) {
dev_info(adev->dev, "%ld uncorrectable hardware errors detected in %s block\n",
ras_mgr->err_data.ue_count, blk_name);
else
dev_info(adev->dev, "%ld correctable hardware errors detected in %s block\n",
ras_mgr->err_data.ce_count, blk_name);
for_each_ras_error(err_node, err_data) { for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info; err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info; mcm_info = &err_info->mcm_info;
if (is_ue && err_info->ue_count) { if (err_info->ue_count) {
dev_info(adev->dev, "socket: %d, die: %d " dev_info(adev->dev, "socket: %d, die: %d, "
"%lld uncorrectable hardware errors detected in %s block\n", "%lld new uncorrectable hardware errors detected in %s block\n",
mcm_info->socket_id, mcm_info->socket_id,
mcm_info->die_id, mcm_info->die_id,
err_info->ue_count, err_info->ue_count,
blk_name); blk_name);
} else if (!is_ue && err_info->ce_count) { }
dev_info(adev->dev, "socket: %d, die: %d " }
"%lld correctable hardware errors detected in %s block\n",
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld uncorrectable hardware errors detected in total in %s block\n",
mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
}
} else {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->ce_count) {
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld new correctable hardware errors detected in %s block, "
"no user action is needed\n",
mcm_info->socket_id, mcm_info->socket_id,
mcm_info->die_id, mcm_info->die_id,
err_info->ce_count, err_info->ce_count,
blk_name); blk_name);
} }
} }
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld correctable hardware errors detected in total in %s block, "
"no user action is needed\n",
mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name);
}
}
}
static inline bool err_data_has_source_info(struct ras_err_data *data)
{
return !list_empty(&data->err_node_list);
} }
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
...@@ -1069,9 +1099,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, ...@@ -1069,9 +1099,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
const char *blk_name = get_ras_block_str(&query_if->head); const char *blk_name = get_ras_block_str(&query_if->head);
if (err_data->ce_count) { if (err_data->ce_count) {
if (!list_empty(&err_data->err_node_list)) { if (err_data_has_source_info(err_data)) {
amdgpu_ras_error_print_error_data(adev, query_if, amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false);
err_data, false);
} else if (!adev->aid_mask && } else if (!adev->aid_mask &&
adev->smuio.funcs && adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id && adev->smuio.funcs->get_socket_id &&
...@@ -1094,9 +1123,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, ...@@ -1094,9 +1123,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
} }
if (err_data->ue_count) { if (err_data->ue_count) {
if (!list_empty(&err_data->err_node_list)) { if (err_data_has_source_info(err_data)) {
amdgpu_ras_error_print_error_data(adev, query_if, amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true);
err_data, true);
} else if (!adev->aid_mask && } else if (!adev->aid_mask &&
adev->smuio.funcs && adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id && adev->smuio.funcs->get_socket_id &&
...@@ -1118,6 +1146,25 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, ...@@ -1118,6 +1146,25 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
} }
/*
 * Merge the counts from one query result (@err_data) into the statistics
 * kept on the RAS manager object (@obj->err_data).
 *
 * @obj:      RAS manager whose err_data receives the update.
 * @err_data: freshly queried error data; it is only read here.
 */
static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
{
	struct ras_err_node *err_node;
	struct ras_err_info *src;

	if (!err_data_has_source_info(err_data)) {
		/* Legacy ASIC path: no per-source records, so add the plain totals. */
		obj->err_data.ue_count += err_data->ue_count;
		obj->err_data.ce_count += err_data->ce_count;
		return;
	}

	/*
	 * One record per error source: hand each record's CE and UE counts,
	 * together with its mcm_info, to the statistic helpers operating on
	 * the manager's err_data.
	 */
	for_each_ras_error(err_node, err_data) {
		src = &err_node->err_info;
		amdgpu_ras_error_statistic_ce_count(&obj->err_data, &src->mcm_info, src->ce_count);
		amdgpu_ras_error_statistic_ue_count(&obj->err_data, &src->mcm_info, src->ue_count);
	}
}
/* query/inject/cure begin */ /* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev, int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info) struct ras_query_if *info)
...@@ -1156,8 +1203,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, ...@@ -1156,8 +1203,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
} }
} }
obj->err_data.ue_count += err_data.ue_count; amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
obj->err_data.ce_count += err_data.ce_count;
info->ue_count = obj->err_data.ue_count; info->ue_count = obj->err_data.ue_count;
info->ce_count = obj->err_data.ce_count; info->ce_count = obj->err_data.ce_count;
......
...@@ -515,10 +515,7 @@ struct ras_manager { ...@@ -515,10 +515,7 @@ struct ras_manager {
/* IH data */ /* IH data */
struct ras_ih_data ih_data; struct ras_ih_data ih_data;
struct { struct ras_err_data err_data;
unsigned long ue_count;
unsigned long ce_count;
} err_data;
}; };
struct ras_badpage { struct ras_badpage {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment