Commit 9dc57c2a authored by Yang Wang's avatar Yang Wang Committed by Alex Deucher

drm/amdgpu: add ras event id support

add amdgpu ras event id support to better distinguish different
error information sources in dmesg logs.

the following log will be identify by event id:
{event_id} interrupt to inform RAS event
{event_id} ACA logs
{event_id} errors statistic since from current injection/error query
{event_id} errors statistic since from gpu load
Signed-off-by: default avatarYang Wang <kevinyang.wang@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent d5586e2f
......@@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
return -EOPNOTSUPP;
}
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry)
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
struct ras_query_context *qctx)
{
dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n");
dev_info(adev->dev, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_STATUS]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_ADDR]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_MISC0]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_IPID]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_SYND]);
u64 event_id = qctx->event_id;
RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_STATUS]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_ADDR]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_MISC0]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_IPID]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_SYND]);
}
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct ras_err_data *err_data, struct ras_query_context *qctx)
{
struct amdgpu_smuio_mcm_config_info mcm_info;
struct ras_err_addr err_addr = {0};
......@@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
list_for_each_entry(node, &mca_set.list, node) {
entry = &node->entry;
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);
count = 0;
ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
......
......@@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root
void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);
int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry);
void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set);
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data);
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct ras_err_data *err_data, struct ras_query_context *qctx);
#endif
This diff is collapsed.
......@@ -64,6 +64,14 @@ struct amdgpu_iv_entry;
/* The high three bits indicates socketid */
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
#define RAS_EVENT_LOG(_adev, _id, _fmt, ...) \
do { \
if (amdgpu_ras_event_id_is_valid((_adev), (_id))) \
dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__); \
else \
dev_info((_adev)->dev, _fmt, ##__VA_ARGS__); \
} while (0)
enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
AMDGPU_RAS_BLOCK__SDMA,
......@@ -419,6 +427,21 @@ struct umc_ecc_info {
int record_ce_addr_supported;
};
enum ras_event_type {
RAS_EVENT_TYPE_INVALID = -1,
RAS_EVENT_TYPE_ISR = 0,
RAS_EVENT_TYPE_COUNT,
};
struct ras_event_manager {
atomic64_t seqnos[RAS_EVENT_TYPE_COUNT];
};
struct ras_query_context {
enum ras_event_type type;
u64 event_id;
};
struct amdgpu_ras {
/* ras infrastructure */
/* for ras itself. */
......@@ -479,6 +502,11 @@ struct amdgpu_ras {
atomic_t page_retirement_req_cnt;
/* Fatal error detected flag */
atomic_t fed;
/* RAS event manager */
struct ras_event_manager __event_mgr;
struct ras_event_manager *event_mgr;
};
struct ras_fs_data {
......@@ -879,4 +907,6 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
#endif
......@@ -44,6 +44,7 @@ struct amdgpu_hive_info {
struct amdgpu_reset_domain *reset_domain;
atomic_t ras_recovery;
struct ras_event_manager event_mgr;
};
struct amdgpu_pcs_ras_field {
......
......@@ -404,10 +404,16 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_query_context qctx;
memset(&qctx, 0, sizeof(qctx));
qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
amdgpu_mca_smu_log_ras_error(adev,
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status);
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, &qctx);
amdgpu_mca_smu_log_ras_error(adev,
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status);
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, &qctx);
}
static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment