Commit 07c1db70, authored by Yang Wang, committed by Alex Deucher

drm/amdgpu: refine smu v13.0.6 mca dump driver

Refine the SMU MCA driver to support querying RAS errors via the PMFW path:
- correct the GFX SMU bank hwid (from the MP5 bank to the SMU bank)
- retire unused callback functions in amdgpu_mca_smu_funcs{}
- add a new mca_bank_set{} structure to collect MCA banks
- move enum mca_reg_idx into the amdgpu_mca.h header
- add MCA status register field decode macros
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 0b169571
...@@ -143,6 +143,46 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev) ...@@ -143,6 +143,46 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
return 0; return 0;
} }
/**
 * amdgpu_mca_bank_set_init - reset an MCA bank set to the empty state
 * @mca_set: bank set to initialize; a NULL pointer is silently ignored
 *
 * Zeroes the whole structure and then initializes its list head, leaving
 * an empty list with a zero entry count.
 */
void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set)
{
	if (mca_set != NULL) {
		memset(mca_set, 0, sizeof(struct mca_bank_set));
		INIT_LIST_HEAD(&mca_set->list);
	}
}
/**
 * amdgpu_mca_bank_set_add_entry - append a copy of an MCA bank entry to a set
 * @mca_set: bank set to extend; must have been set up with
 *           amdgpu_mca_bank_set_init()
 * @entry: bank entry to copy into the set
 *
 * A new list node is allocated, @entry is copied into it by value, and the
 * node is linked at the tail of @mca_set->list. The caller keeps ownership
 * of @entry; the allocated node stays on the list until it is removed and
 * freed (amdgpu_mca_bank_set_release() does this for every node).
 *
 * Return: 0 on success, -EINVAL if @mca_set or @entry is NULL, -ENOMEM if
 * the node cannot be allocated.
 */
int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry)
{
	struct mca_bank_node *node;

	/* both pointers are dereferenced below, so reject either being NULL */
	if (!mca_set || !entry)
		return -EINVAL;

	node = kvzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	memcpy(&node->entry, entry, sizeof(*entry));

	INIT_LIST_HEAD(&node->node);
	list_add_tail(&node->node, &mca_set->list);

	mca_set->nr_entries++;

	return 0;
}
/**
 * amdgpu_mca_bank_set_release - free every node held by an MCA bank set
 * @mca_set: bank set to empty; a NULL pointer is silently ignored
 *
 * Unlinks and frees each node on @mca_set->list. The set itself is not
 * freed. Afterwards the list is empty and the entry counter is zero, so
 * the set describes exactly what it contains.
 *
 * @mca_set must have an initialized list head (see
 * amdgpu_mca_bank_set_init()), since the list is walked unconditionally.
 */
void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set)
{
	struct mca_bank_node *node, *tmp;

	if (!mca_set)
		return;

	/* the _safe iterator is required because nodes are freed while walking */
	list_for_each_entry_safe(node, tmp, &mca_set->list, node) {
		list_del(&node->node);
		kvfree(node);
	}

	/* keep the counter in step with the now-empty list */
	mca_set->nr_entries = 0;
}
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs) void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs)
{ {
struct amdgpu_mca *mca = &adev->mca; struct amdgpu_mca *mca = &adev->mca;
...@@ -160,6 +200,58 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) ...@@ -160,6 +200,58 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
/*
 * Emit one debug line per register of an MCA bank entry.
 * @idx is only used to label the output lines.
 */
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry)
{
	int reg = 0;

	while (reg < ARRAY_SIZE(entry->regs)) {
		dev_dbg(adev->dev, "mca entry[%02d].regs[%02d]=0x%016llx\n", idx, reg, entry->regs[reg]);
		reg++;
	}
}
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
{
struct amdgpu_smuio_mcm_config_info mcm_info;
struct mca_bank_set mca_set;
struct mca_bank_node *node;
struct mca_bank_entry *entry;
uint32_t count;
int ret, i = 0;
amdgpu_mca_bank_set_init(&mca_set);
ret = amdgpu_mca_smu_get_mca_set(adev, blk, type, &mca_set);
if (ret)
goto out_mca_release;
list_for_each_entry(node, &mca_set.list, node) {
entry = &node->entry;
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
count = 0;
ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
if (ret)
goto out_mca_release;
if (!count)
continue;
mcm_info.socket_id = entry->info.socket_id;
mcm_info.die_id = entry->info.aid;
if (type == AMDGPU_MCA_ERROR_TYPE_UE)
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count);
else
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count);
}
out_mca_release:
amdgpu_mca_bank_set_release(&mca_set);
return ret;
}
int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count) int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
{ {
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
...@@ -173,17 +265,77 @@ int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_m ...@@ -173,17 +265,77 @@ int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_m
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *count) enum amdgpu_mca_error_type type, uint32_t *total)
{ {
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!count) struct mca_bank_set mca_set;
struct mca_bank_node *node;
struct mca_bank_entry *entry;
uint32_t count;
int ret;
if (!total)
return -EINVAL; return -EINVAL;
if (mca_funcs && mca_funcs->mca_get_error_count) if (!mca_funcs)
return mca_funcs->mca_get_error_count(adev, blk, type, count); return -EOPNOTSUPP;
if (!mca_funcs->mca_get_ras_mca_set || !mca_funcs->mca_get_valid_mca_count)
return -EOPNOTSUPP; return -EOPNOTSUPP;
amdgpu_mca_bank_set_init(&mca_set);
ret = mca_funcs->mca_get_ras_mca_set(adev, blk, type, &mca_set);
if (ret)
goto err_mca_set_release;
*total = 0;
list_for_each_entry(node, &mca_set.list, node) {
entry = &node->entry;
count = 0;
ret = mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, &count);
if (ret)
goto err_mca_set_release;
*total += count;
}
err_mca_set_release:
amdgpu_mca_bank_set_release(&mca_set);
return ret;
}
int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count)
{
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!count || !entry)
return -EINVAL;
if (!mca_funcs || !mca_funcs->mca_parse_mca_error_count)
return -EOPNOTSUPP;
return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count);
}
int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set)
{
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!mca_set)
return -EINVAL;
if (!mca_funcs || !mca_funcs->mca_get_ras_mca_set)
return -EOPNOTSUPP;
WARN_ON(!list_empty(&mca_set->list));
return mca_funcs->mca_get_ras_mca_set(adev, blk, type, mca_set);
} }
int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
......
...@@ -25,6 +25,27 @@ ...@@ -25,6 +25,27 @@
#define MCA_MAX_REGS_COUNT (16) #define MCA_MAX_REGS_COUNT (16)
/*
 * Extract bits [h:l] of the 64-bit register value x and shift them down so
 * that the field's least-significant bit lands at bit 0. The shift operand
 * is parenthesized so that an expression passed as l expands correctly.
 */
#define MCA_REG_FIELD(x, h, l)			(((x) & GENMASK_ULL(h, l)) >> (l))

/* Field decoders for a raw MCA status register value. */
#define MCA_REG__STATUS__VAL(x)			MCA_REG_FIELD(x, 63, 63)
#define MCA_REG__STATUS__OVERFLOW(x)		MCA_REG_FIELD(x, 62, 62)
#define MCA_REG__STATUS__UC(x)			MCA_REG_FIELD(x, 61, 61)
#define MCA_REG__STATUS__EN(x)			MCA_REG_FIELD(x, 60, 60)
#define MCA_REG__STATUS__MISCV(x)		MCA_REG_FIELD(x, 59, 59)
#define MCA_REG__STATUS__ADDRV(x)		MCA_REG_FIELD(x, 58, 58)
#define MCA_REG__STATUS__PCC(x)			MCA_REG_FIELD(x, 57, 57)
#define MCA_REG__STATUS__ERRCOREIDVAL(x)	MCA_REG_FIELD(x, 56, 56)
#define MCA_REG__STATUS__TCC(x)			MCA_REG_FIELD(x, 55, 55)
#define MCA_REG__STATUS__SYNDV(x)		MCA_REG_FIELD(x, 53, 53)
#define MCA_REG__STATUS__CECC(x)		MCA_REG_FIELD(x, 46, 46)
#define MCA_REG__STATUS__UECC(x)		MCA_REG_FIELD(x, 45, 45)
#define MCA_REG__STATUS__DEFERRED(x)		MCA_REG_FIELD(x, 44, 44)
#define MCA_REG__STATUS__POISON(x)		MCA_REG_FIELD(x, 43, 43)
#define MCA_REG__STATUS__SCRUB(x)		MCA_REG_FIELD(x, 40, 40)
#define MCA_REG__STATUS__ERRCOREID(x)		MCA_REG_FIELD(x, 37, 32)
#define MCA_REG__STATUS__ADDRLSB(x)		MCA_REG_FIELD(x, 29, 24)
#define MCA_REG__STATUS__ERRORCODEEXT(x)	MCA_REG_FIELD(x, 21, 16)
#define MCA_REG__STATUS__ERRORCODE(x)		MCA_REG_FIELD(x, 15, 0)
enum amdgpu_mca_ip { enum amdgpu_mca_ip {
AMDGPU_MCA_IP_UNKNOW = -1, AMDGPU_MCA_IP_UNKNOW = -1,
AMDGPU_MCA_IP_PSP = 0, AMDGPU_MCA_IP_PSP = 0,
...@@ -57,6 +78,17 @@ struct amdgpu_mca { ...@@ -57,6 +78,17 @@ struct amdgpu_mca {
const struct amdgpu_mca_smu_funcs *mca_funcs; const struct amdgpu_mca_smu_funcs *mca_funcs;
}; };
/*
 * Index of each named MCA bank register; used to index
 * mca_bank_entry::regs[] (e.g. entry->regs[MCA_REG_IDX_STATUS]).
 * Indices 7..15 have no name here.
 */
enum mca_reg_idx {
MCA_REG_IDX_CONTROL = 0,
MCA_REG_IDX_STATUS = 1,
MCA_REG_IDX_ADDR = 2,
MCA_REG_IDX_MISC0 = 3,
MCA_REG_IDX_CONFIG = 4,
MCA_REG_IDX_IPID = 5,
MCA_REG_IDX_SYND = 6,
/* same value as MCA_MAX_REGS_COUNT, the size of mca_bank_entry::regs[] */
MCA_REG_IDX_COUNT = 16,
};
struct mca_bank_info { struct mca_bank_info {
int socket_id; int socket_id;
int aid; int aid;
...@@ -72,18 +104,28 @@ struct mca_bank_entry { ...@@ -72,18 +104,28 @@ struct mca_bank_entry {
uint64_t regs[MCA_MAX_REGS_COUNT]; uint64_t regs[MCA_MAX_REGS_COUNT];
}; };
/*
 * One element of an MCA bank set: a by-value copy of a bank entry plus the
 * linkage that chains it onto mca_bank_set::list.
 */
struct mca_bank_node {
struct mca_bank_entry entry;
struct list_head node;
};
/*
 * A collection of MCA bank entries.
 * nr_entries is incremented by amdgpu_mca_bank_set_add_entry() for every
 * node it appends; list is the head of the chain of struct mca_bank_node
 * elements (linked through mca_bank_node::node).
 */
struct mca_bank_set {
int nr_entries;
struct list_head list;
};
struct amdgpu_mca_smu_funcs { struct amdgpu_mca_smu_funcs {
int max_ue_count; int max_ue_count;
int max_ce_count; int max_ce_count;
int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable); int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable);
int (*mca_get_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, int (*mca_get_ras_mca_set)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
enum amdgpu_mca_error_type type, uint32_t *count); struct mca_bank_set *mca_set);
int (*mca_parse_mca_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct mca_bank_entry *entry, uint32_t *count);
int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
uint32_t *count); uint32_t *count);
int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
int idx, struct mca_bank_entry *entry); int idx, struct mca_bank_entry *entry);
int (*mca_get_ras_mca_idx_array)(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size);
}; };
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
...@@ -107,11 +149,22 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev); ...@@ -107,11 +149,22 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs); void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs);
int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable); int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable);
int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count); int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count);
int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *total);
int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *count); enum amdgpu_mca_error_type type, uint32_t *count);
int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count);
int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set);
int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
int idx, struct mca_bank_entry *entry); int idx, struct mca_bank_entry *entry);
void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root);
void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);
int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry);
void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set);
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data);
#endif #endif
...@@ -100,17 +100,6 @@ MODULE_FIRMWARE("amdgpu/smu_13_0_6.bin"); ...@@ -100,17 +100,6 @@ MODULE_FIRMWARE("amdgpu/smu_13_0_6.bin");
#define MCA_BANK_IPID(_ip, _hwid, _type) \ #define MCA_BANK_IPID(_ip, _hwid, _type) \
[AMDGPU_MCA_IP_##_ip] = { .hwid = _hwid, .mcatype = _type, } [AMDGPU_MCA_IP_##_ip] = { .hwid = _hwid, .mcatype = _type, }
enum mca_reg_idx {
MCA_REG_IDX_CONTROL = 0,
MCA_REG_IDX_STATUS = 1,
MCA_REG_IDX_ADDR = 2,
MCA_REG_IDX_MISC0 = 3,
MCA_REG_IDX_CONFIG = 4,
MCA_REG_IDX_IPID = 5,
MCA_REG_IDX_SYND = 6,
MCA_REG_IDX_COUNT = 16,
};
struct mca_bank_ipid { struct mca_bank_ipid {
enum amdgpu_mca_ip ip; enum amdgpu_mca_ip ip;
uint16_t hwid; uint16_t hwid;
...@@ -123,7 +112,9 @@ struct mca_ras_info { ...@@ -123,7 +112,9 @@ struct mca_ras_info {
int *err_code_array; int *err_code_array;
int err_code_count; int err_code_count;
int (*get_err_count)(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, int (*get_err_count)(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, int idx, uint32_t *count); enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count);
bool (*bank_is_valid)(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, struct mca_bank_entry *entry);
}; };
#define P2S_TABLE_ID_A 0x50325341 #define P2S_TABLE_ID_A 0x50325341
...@@ -2449,48 +2440,34 @@ static int mca_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_t ...@@ -2449,48 +2440,34 @@ static int mca_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_t
return 0; return 0;
} }
static int mca_decode_mca_ipid(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int idx, int *ip) static int mca_decode_ipid_to_hwip(uint64_t val)
{ {
const struct mca_bank_ipid *ipid; const struct mca_bank_ipid *ipid;
uint64_t val;
uint16_t hwid, mcatype; uint16_t hwid, mcatype;
int i, ret; int i;
ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_IPID, &val);
if (ret)
return ret;
hwid = REG_GET_FIELD(val, MCMP1_IPIDT0, HardwareID); hwid = REG_GET_FIELD(val, MCMP1_IPIDT0, HardwareID);
mcatype = REG_GET_FIELD(val, MCMP1_IPIDT0, McaType); mcatype = REG_GET_FIELD(val, MCMP1_IPIDT0, McaType);
if (hwid) {
for (i = 0; i < ARRAY_SIZE(smu_v13_0_6_mca_ipid_table); i++) { for (i = 0; i < ARRAY_SIZE(smu_v13_0_6_mca_ipid_table); i++) {
ipid = &smu_v13_0_6_mca_ipid_table[i]; ipid = &smu_v13_0_6_mca_ipid_table[i];
if (!ipid->hwid) if (!ipid->hwid)
continue; continue;
if (ipid->hwid == hwid && ipid->mcatype == mcatype) { if (ipid->hwid == hwid && ipid->mcatype == mcatype)
*ip = i; return i;
return 0;
}
}
} }
*ip = AMDGPU_MCA_IP_UNKNOW; return AMDGPU_MCA_IP_UNKNOW;
return 0;
} }
static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, int idx, uint32_t *count) enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count)
{ {
uint64_t status0; uint64_t status0;
int ret;
ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0); status0 = entry->regs[MCA_REG_IDX_STATUS];
if (ret)
return ret;
if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) { if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) {
*count = 0; *count = 0;
...@@ -2521,70 +2498,41 @@ static bool mca_smu_check_error_code(struct amdgpu_device *adev, const struct mc ...@@ -2521,70 +2498,41 @@ static bool mca_smu_check_error_code(struct amdgpu_device *adev, const struct mc
return false; return false;
} }
static int mca_mp5_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, static int mca_gfx_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, int idx, uint32_t *count) enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count)
{ {
uint64_t status0 = 0, misc0 = 0; uint64_t status0, misc0;
uint32_t errcode;
int ret;
if (mca_ras->ip != AMDGPU_MCA_IP_MP5)
return -EINVAL;
ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0);
if (ret)
return ret;
status0 = entry->regs[MCA_REG_IDX_STATUS];
if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) { if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) {
*count = 0; *count = 0;
return 0; return 0;
} }
errcode = REG_GET_FIELD(status0, MCMP1_STATUST0, ErrorCode);
if (!mca_smu_check_error_code(adev, mca_ras, errcode))
return 0;
if (type == AMDGPU_MCA_ERROR_TYPE_UE && if (type == AMDGPU_MCA_ERROR_TYPE_UE &&
REG_GET_FIELD(status0, MCMP1_STATUST0, UC) == 1 && REG_GET_FIELD(status0, MCMP1_STATUST0, UC) == 1 &&
REG_GET_FIELD(status0, MCMP1_STATUST0, PCC) == 1) { REG_GET_FIELD(status0, MCMP1_STATUST0, PCC) == 1) {
if (count)
*count = 1; *count = 1;
return 0; return 0;
} } else {
misc0 = entry->regs[MCA_REG_IDX_MISC0];
ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_MISC0, &misc0);
if (ret)
return ret;
if (count)
*count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt); *count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt);
}
return 0; return 0;
} }
static int mca_smu_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, static int mca_smu_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, int idx, uint32_t *count) enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count)
{ {
uint64_t status0 = 0, misc0 = 0; uint64_t status0, misc0;
uint32_t errcode;
int ret;
if (mca_ras->ip != AMDGPU_MCA_IP_SMU)
return -EINVAL;
ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0);
if (ret)
return ret;
status0 = entry->regs[MCA_REG_IDX_STATUS];
if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) { if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) {
*count = 0; *count = 0;
return 0; return 0;
} }
errcode = REG_GET_FIELD(status0, MCMP1_STATUST0, ErrorCode);
if (!mca_smu_check_error_code(adev, mca_ras, errcode))
return 0;
if (type == AMDGPU_MCA_ERROR_TYPE_UE && if (type == AMDGPU_MCA_ERROR_TYPE_UE &&
REG_GET_FIELD(status0, MCMP1_STATUST0, UC) == 1 && REG_GET_FIELD(status0, MCMP1_STATUST0, UC) == 1 &&
REG_GET_FIELD(status0, MCMP1_STATUST0, PCC) == 1) { REG_GET_FIELD(status0, MCMP1_STATUST0, PCC) == 1) {
...@@ -2593,16 +2541,43 @@ static int mca_smu_mca_get_err_count(const struct mca_ras_info *mca_ras, struct ...@@ -2593,16 +2541,43 @@ static int mca_smu_mca_get_err_count(const struct mca_ras_info *mca_ras, struct
return 0; return 0;
} }
ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_MISC0, &misc0); misc0 = entry->regs[MCA_REG_IDX_MISC0];
if (ret)
return ret;
if (count)
*count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt); *count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt);
return 0; return 0;
} }
/*
 * Bank filter used for the GFX RAS block: accept an MCA bank only when the
 * InstanceIdLo field of its IPID register matches one of the known values
 * listed below. @mca_ras, @adev and @type are not consulted here.
 *
 * Returns true when the bank is accepted, false otherwise.
 */
static bool mca_gfx_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
				      enum amdgpu_mca_error_type type, struct mca_bank_entry *entry)
{
	uint32_t instlo;

	instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, InstanceIdLo);
	switch (instlo) {
	case 0x36430400: /* SMNAID XCD 0 */
	case 0x38430400: /* SMNAID XCD 1 */
	case 0x40430400: /* SMNXCD XCD 0, NOTE: FIXME: fix this error later */
		return true;
	default:
		return false;
	}
}
static bool mca_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, struct mca_bank_entry *entry)
{
uint32_t errcode, instlo;
instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, InstanceIdLo);
if (instlo != 0x03b30400)
return false;
errcode = REG_GET_FIELD(entry->regs[MCA_REG_IDX_STATUS], MCMP1_STATUST0, ErrorCode);
return mca_smu_check_error_code(adev, mca_ras, errcode);
}
static int sdma_err_codes[] = { CODE_SDMA0, CODE_SDMA1, CODE_SDMA2, CODE_SDMA3 }; static int sdma_err_codes[] = { CODE_SDMA0, CODE_SDMA1, CODE_SDMA2, CODE_SDMA3 };
static int mmhub_err_codes[] = { static int mmhub_err_codes[] = {
CODE_DAGB0, CODE_DAGB0 + 1, CODE_DAGB0 + 2, CODE_DAGB0 + 3, CODE_DAGB0 + 4, /* DAGB0-4 */ CODE_DAGB0, CODE_DAGB0 + 1, CODE_DAGB0 + 2, CODE_DAGB0 + 3, CODE_DAGB0 + 4, /* DAGB0-4 */
...@@ -2617,20 +2592,23 @@ static const struct mca_ras_info mca_ras_table[] = { ...@@ -2617,20 +2592,23 @@ static const struct mca_ras_info mca_ras_table[] = {
.get_err_count = mca_umc_mca_get_err_count, .get_err_count = mca_umc_mca_get_err_count,
}, { }, {
.blkid = AMDGPU_RAS_BLOCK__GFX, .blkid = AMDGPU_RAS_BLOCK__GFX,
.ip = AMDGPU_MCA_IP_MP5, .ip = AMDGPU_MCA_IP_SMU,
.get_err_count = mca_mp5_mca_get_err_count, .get_err_count = mca_gfx_mca_get_err_count,
.bank_is_valid = mca_gfx_smu_bank_is_valid,
}, { }, {
.blkid = AMDGPU_RAS_BLOCK__SDMA, .blkid = AMDGPU_RAS_BLOCK__SDMA,
.ip = AMDGPU_MCA_IP_SMU, .ip = AMDGPU_MCA_IP_SMU,
.err_code_array = sdma_err_codes, .err_code_array = sdma_err_codes,
.err_code_count = ARRAY_SIZE(sdma_err_codes), .err_code_count = ARRAY_SIZE(sdma_err_codes),
.get_err_count = mca_smu_mca_get_err_count, .get_err_count = mca_smu_mca_get_err_count,
.bank_is_valid = mca_smu_bank_is_valid,
}, { }, {
.blkid = AMDGPU_RAS_BLOCK__MMHUB, .blkid = AMDGPU_RAS_BLOCK__MMHUB,
.ip = AMDGPU_MCA_IP_SMU, .ip = AMDGPU_MCA_IP_SMU,
.err_code_array = mmhub_err_codes, .err_code_array = mmhub_err_codes,
.err_code_count = ARRAY_SIZE(mmhub_err_codes), .err_code_count = ARRAY_SIZE(mmhub_err_codes),
.get_err_count = mca_smu_mca_get_err_count, .get_err_count = mca_smu_mca_get_err_count,
.bank_is_valid = mca_smu_bank_is_valid,
}, },
}; };
...@@ -2665,130 +2643,84 @@ static int mca_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_e ...@@ -2665,130 +2643,84 @@ static int mca_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_e
} }
static bool mca_bank_is_valid(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras, static bool mca_bank_is_valid(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras,
enum amdgpu_mca_error_type type, int idx) enum amdgpu_mca_error_type type, struct mca_bank_entry *entry)
{ {
int ret, ip = AMDGPU_MCA_IP_UNKNOW; if (mca_decode_ipid_to_hwip(entry->regs[MCA_REG_IDX_IPID]) != mca_ras->ip)
ret = mca_decode_mca_ipid(adev, type, idx, &ip);
if (ret)
return false;
if (ip == AMDGPU_MCA_IP_UNKNOW)
return false; return false;
return ip == mca_ras->ip; if (mca_ras->bank_is_valid)
} return mca_ras->bank_is_valid(mca_ras, adev, type, entry);
static int mca_get_valid_mca_idx(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras,
enum amdgpu_mca_error_type type,
uint32_t mca_cnt, int *idx_array, int idx_array_size)
{
int i, idx_cnt = 0;
for (i = 0; i < mca_cnt; i++) {
if (!mca_bank_is_valid(adev, mca_ras, type, i))
continue;
if (idx_array) {
if (idx_cnt < idx_array_size)
idx_array[idx_cnt] = i;
else
return -EINVAL;
}
idx_cnt++; return true;
}
return idx_cnt;
} }
static int __mca_smu_get_error_count(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras, enum amdgpu_mca_error_type type, uint32_t *count) static int __mca_smu_get_ras_mca_set(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras,
enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set)
{ {
uint32_t result, mca_cnt, total = 0; struct mca_bank_entry entry;
int idx_array[16]; uint32_t mca_cnt;
int i, ret, idx_cnt = 0; int i, ret;
ret = mca_get_valid_mca_count(adev, type, &mca_cnt); ret = mca_get_valid_mca_count(adev, type, &mca_cnt);
if (ret) if (ret)
return ret; return ret;
/* if valid mca bank count is 0, the driver can return 0 directly */ /* if valid mca bank count is 0, the driver can return 0 directly */
if (!mca_cnt) { if (!mca_cnt)
*count = 0;
return 0; return 0;
}
if (!mca_ras->get_err_count) for (i = 0; i < mca_cnt; i++) {
return -EINVAL; memset(&entry, 0, sizeof(entry));
ret = mca_get_mca_entry(adev, type, i, &entry);
if (ret)
return ret;
idx_cnt = mca_get_valid_mca_idx(adev, mca_ras, type, mca_cnt, idx_array, ARRAY_SIZE(idx_array)); if (mca_ras && !mca_bank_is_valid(adev, mca_ras, type, &entry))
if (idx_cnt < 0) continue;
return -EINVAL;
for (i = 0; i < idx_cnt; i++) { ret = amdgpu_mca_bank_set_add_entry(mca_set, &entry);
result = 0;
ret = mca_ras->get_err_count(mca_ras, adev, type, idx_array[i], &result);
if (ret) if (ret)
return ret; return ret;
total += result;
} }
*count = total;
return 0; return 0;
} }
static int mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, static int mca_smu_get_ras_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *count) enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set)
{ {
const struct mca_ras_info *mca_ras; const struct mca_ras_info *mca_ras = NULL;
if (!count) if (!mca_set)
return -EINVAL; return -EINVAL;
if (blk != AMDGPU_RAS_BLOCK_COUNT) {
mca_ras = mca_get_mca_ras_info(adev, blk); mca_ras = mca_get_mca_ras_info(adev, blk);
if (!mca_ras) if (!mca_ras)
return -EOPNOTSUPP; return -EOPNOTSUPP;
return __mca_smu_get_error_count(adev, mca_ras, type, count);
}
static int __mca_smu_get_ras_mca_idx_array(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras,
enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size)
{
uint32_t mca_cnt = 0;
int ret, idx_cnt = 0;
ret = mca_get_valid_mca_count(adev, type, &mca_cnt);
if (ret)
return ret;
/* if valid mca bank count is 0, the driver can return 0 directly */
if (!mca_cnt) {
*idx_array_size = 0;
return 0;
} }
idx_cnt = mca_get_valid_mca_idx(adev, mca_ras, type, mca_cnt, idx_array, *idx_array_size); return __mca_smu_get_ras_mca_set(adev, mca_ras, type, mca_set);
if (idx_cnt < 0)
return -EINVAL;
*idx_array_size = idx_cnt;
return 0;
} }
static int mca_smu_get_ras_mca_idx_array(struct amdgpu_device *adev, enum amdgpu_ras_block blk, static int mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size) struct mca_bank_entry *entry, uint32_t *count)
{ {
const struct mca_ras_info *mca_ras; const struct mca_ras_info *mca_ras;
if (!entry || !count)
return -EINVAL;
mca_ras = mca_get_mca_ras_info(adev, blk); mca_ras = mca_get_mca_ras_info(adev, blk);
if (!mca_ras) if (!mca_ras)
return -EOPNOTSUPP; return -EOPNOTSUPP;
return __mca_smu_get_ras_mca_idx_array(adev, mca_ras, type, idx_array, idx_array_size); if (!mca_bank_is_valid(adev, mca_ras, type, entry)) {
*count = 0;
return 0;
}
return mca_ras->get_err_count(mca_ras, adev, type, entry, count);
} }
static int mca_smu_get_mca_entry(struct amdgpu_device *adev, static int mca_smu_get_mca_entry(struct amdgpu_device *adev,
...@@ -2807,10 +2739,10 @@ static const struct amdgpu_mca_smu_funcs smu_v13_0_6_mca_smu_funcs = { ...@@ -2807,10 +2739,10 @@ static const struct amdgpu_mca_smu_funcs smu_v13_0_6_mca_smu_funcs = {
.max_ue_count = 12, .max_ue_count = 12,
.max_ce_count = 12, .max_ce_count = 12,
.mca_set_debug_mode = mca_smu_set_debug_mode, .mca_set_debug_mode = mca_smu_set_debug_mode,
.mca_get_error_count = mca_smu_get_error_count, .mca_get_ras_mca_set = mca_smu_get_ras_mca_set,
.mca_parse_mca_error_count = mca_smu_parse_mca_error_count,
.mca_get_mca_entry = mca_smu_get_mca_entry, .mca_get_mca_entry = mca_smu_get_mca_entry,
.mca_get_valid_mca_count = mca_smu_get_valid_mca_count, .mca_get_valid_mca_count = mca_smu_get_valid_mca_count,
.mca_get_ras_mca_idx_array = mca_smu_get_ras_mca_idx_array,
}; };
static int smu_v13_0_6_select_xgmi_plpd_policy(struct smu_context *smu, static int smu_v13_0_6_select_xgmi_plpd_policy(struct smu_context *smu,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment