Commit 8fb20d95 authored by Yang Wang, committed by Alex Deucher

drm/amdgpu: add amdgpu MCA bank dispatch function support

- Refine mca driver code.
- Centralize mca bank dispatch code logic.
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 8e9f1575
...@@ -267,7 +267,8 @@ static int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_ ...@@ -267,7 +267,8 @@ static int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_
return mca_funcs->mca_get_mca_entry(adev, type, idx, entry); return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
} }
static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set) static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set,
struct ras_query_context *qctx)
{ {
struct mca_bank_entry entry; struct mca_bank_entry entry;
uint32_t count = 0, i; uint32_t count = 0, i;
...@@ -287,6 +288,8 @@ static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mc ...@@ -287,6 +288,8 @@ static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mc
return ret; return ret;
amdgpu_mca_bank_set_add_entry(mca_set, &entry); amdgpu_mca_bank_set_add_entry(mca_set, &entry);
amdgpu_mca_smu_mca_bank_dump(adev, i, &entry, qctx);
} }
return 0; return 0;
...@@ -306,36 +309,36 @@ static int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum ...@@ -306,36 +309,36 @@ static int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum
return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count); return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count);
} }
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct ras_err_data *err_data, struct ras_query_context *qctx) struct mca_bank_set *mca_set, struct ras_err_data *err_data)
{ {
struct ras_err_addr err_addr;
struct amdgpu_smuio_mcm_config_info mcm_info; struct amdgpu_smuio_mcm_config_info mcm_info;
struct ras_err_addr err_addr = {0};
struct mca_bank_set mca_set;
struct mca_bank_node *node; struct mca_bank_node *node;
struct mca_bank_entry *entry; struct mca_bank_entry *entry;
uint32_t count; uint32_t count;
int ret, i = 0; int ret;
amdgpu_mca_bank_set_init(&mca_set); if (!mca_set)
return -EINVAL;
ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set); if (!mca_set->nr_entries)
if (ret) return 0;
goto out_mca_release;
list_for_each_entry(node, &mca_set.list, node) { list_for_each_entry(node, &mca_set->list, node) {
entry = &node->entry; entry = &node->entry;
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);
count = 0; count = 0;
ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count); ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
if (ret) if (ret)
goto out_mca_release; return ret;
if (!count) if (!count)
continue; continue;
memset(&mcm_info, 0, sizeof(mcm_info));
memset(&err_addr, 0, sizeof(err_addr));
mcm_info.socket_id = entry->info.socket_id; mcm_info.socket_id = entry->info.socket_id;
mcm_info.die_id = entry->info.aid; mcm_info.die_id = entry->info.aid;
...@@ -345,19 +348,36 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo ...@@ -345,19 +348,36 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR]; err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
} }
if (type == AMDGPU_MCA_ERROR_TYPE_UE) if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
amdgpu_ras_error_statistic_ue_count(err_data, amdgpu_ras_error_statistic_ue_count(err_data,
&mcm_info, &err_addr, (uint64_t)count); &mcm_info, &err_addr, (uint64_t)count);
else { } else {
if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS])) if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]))
amdgpu_ras_error_statistic_de_count(err_data, amdgpu_ras_error_statistic_de_count(err_data,
&mcm_info, &err_addr, (uint64_t)count); &mcm_info, &err_addr, (uint64_t)count);
else else
amdgpu_ras_error_statistic_ce_count(err_data, amdgpu_ras_error_statistic_ce_count(err_data,
&mcm_info, &err_addr, (uint64_t)count); &mcm_info, &err_addr, (uint64_t)count);
} }
} }
return 0;
}
/*
 * amdgpu_mca_smu_log_ras_error() - collect the MCA banks of @type and fold
 * the decoded error counts for @blk into @err_data.
 *
 * Builds a temporary bank set, fills it via amdgpu_mca_smu_get_mca_set()
 * (which also dumps each bank under @qctx), dispatches it into the RAS
 * statistics, and releases the set on every path.
 *
 * NOTE(review): the closing "return ret;" lies just past the visible diff
 * hunk and is reconstructed — confirm against upstream amdgpu_mca.c.
 *
 * Return: 0 on success, negative errno on failure.
 */
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
				 struct ras_err_data *err_data, struct ras_query_context *qctx)
{
	struct mca_bank_set mca_set;
	int ret;

	amdgpu_mca_bank_set_init(&mca_set);

	ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, qctx);
	if (ret)
		goto out_mca_release;

	ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_set, err_data);

out_mca_release:
	/* Free the list nodes the set allocated, on success and error alike. */
	amdgpu_mca_bank_set_release(&mca_set);

	return ret;
}
...@@ -402,36 +422,29 @@ static void mca_dump_entry(struct seq_file *m, struct mca_bank_entry *entry) ...@@ -402,36 +422,29 @@ static void mca_dump_entry(struct seq_file *m, struct mca_bank_entry *entry)
static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type) static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)m->private; struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
struct mca_bank_entry *entry; struct mca_bank_node *node;
uint32_t count = 0; struct mca_bank_set mca_set;
int i, ret; struct ras_query_context qctx;
int ret;
ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count); amdgpu_mca_bank_set_init(&mca_set);
qctx.event_id = 0ULL;
ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx);
if (ret) if (ret)
return ret; goto err_free_mca_set;
seq_printf(m, "amdgpu smu %s valid mca count: %d\n", seq_printf(m, "amdgpu smu %s valid mca count: %d\n",
type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", count); type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", mca_set.nr_entries);
if (!count)
return 0;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return -ENOMEM;
for (i = 0; i < count; i++) {
memset(entry, 0, sizeof(*entry));
ret = amdgpu_mca_smu_get_mca_entry(adev, type, i, entry); if (!mca_set.nr_entries)
if (ret) goto err_free_mca_set;
goto err_free_entry;
mca_dump_entry(m, entry); list_for_each_entry(node, &mca_set.list, node)
} mca_dump_entry(m, &node->entry);
err_free_entry: err_free_mca_set:
kfree(entry); amdgpu_mca_bank_set_release(&mca_set);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.