Commit dc37a919, authored by Hawking Zhang and committed by Alex Deucher

drm/amdgpu: Add query_ras_error_count for sdma v4_4_2

Add query_ras_error_count callback for sdma
v4_4_2. It will be used to query and log sdma
uncorrectable error count and memory block.
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent d90d90a1
...@@ -62,6 +62,34 @@ struct amdgpu_sdma_instance { ...@@ -62,6 +62,34 @@ struct amdgpu_sdma_instance {
uint32_t aid_id; uint32_t aid_id;
}; };
/*
 * Identifiers of the SDMA memory blocks that RAS error reporting can name.
 *
 * Every id is assigned explicitly, starting at 1 (0 is not used by any
 * enumerator).  AMDGPU_SDMA_MEMORY_BLOCK_LAST carries no explicit value and
 * is therefore one more than the enumerator that precedes it.
 */
enum amdgpu_sdma_ras_memory_id {
	/* MBANK data buffers 0..15 */
	AMDGPU_SDMA_MBANK_DATA_BUF0	= 1,
	AMDGPU_SDMA_MBANK_DATA_BUF1	= 2,
	AMDGPU_SDMA_MBANK_DATA_BUF2	= 3,
	AMDGPU_SDMA_MBANK_DATA_BUF3	= 4,
	AMDGPU_SDMA_MBANK_DATA_BUF4	= 5,
	AMDGPU_SDMA_MBANK_DATA_BUF5	= 6,
	AMDGPU_SDMA_MBANK_DATA_BUF6	= 7,
	AMDGPU_SDMA_MBANK_DATA_BUF7	= 8,
	AMDGPU_SDMA_MBANK_DATA_BUF8	= 9,
	AMDGPU_SDMA_MBANK_DATA_BUF9	= 10,
	AMDGPU_SDMA_MBANK_DATA_BUF10	= 11,
	AMDGPU_SDMA_MBANK_DATA_BUF11	= 12,
	AMDGPU_SDMA_MBANK_DATA_BUF12	= 13,
	AMDGPU_SDMA_MBANK_DATA_BUF13	= 14,
	AMDGPU_SDMA_MBANK_DATA_BUF14	= 15,
	AMDGPU_SDMA_MBANK_DATA_BUF15	= 16,

	/* Microcode and command buffers */
	AMDGPU_SDMA_UCODE_BUF		= 17,
	AMDGPU_SDMA_RB_CMD_BUF		= 18,
	AMDGPU_SDMA_IB_CMD_BUF		= 19,

	/* UTCL1 and data FIFOs */
	AMDGPU_SDMA_UTCL1_RD_FIFO	= 20,
	AMDGPU_SDMA_UTCL1_RDBST_FIFO	= 21,
	AMDGPU_SDMA_UTCL1_WR_FIFO	= 22,
	AMDGPU_SDMA_DATA_LUT_FIFO	= 23,
	AMDGPU_SDMA_SPLIT_DAT_BUF	= 24,

	/* Implicit value: 25, i.e. one past AMDGPU_SDMA_SPLIT_DAT_BUF */
	AMDGPU_SDMA_MEMORY_BLOCK_LAST,
};
/* SDMA RAS container: holds the RAS block object for SDMA. */
struct amdgpu_sdma_ras {
	struct amdgpu_ras_block_object ras_block;
};
......
...@@ -2071,3 +2071,67 @@ struct amdgpu_xcp_ip_funcs sdma_v4_4_2_xcp_funcs = { ...@@ -2071,3 +2071,67 @@ struct amdgpu_xcp_ip_funcs sdma_v4_4_2_xcp_funcs = {
.suspend = &sdma_v4_4_2_xcp_suspend, .suspend = &sdma_v4_4_2_xcp_suspend,
.resume = &sdma_v4_4_2_xcp_resume .resume = &sdma_v4_4_2_xcp_resume
}; };
/*
 * Uncorrectable-error (UE) status register entries handed to
 * amdgpu_ras_inst_query_ras_error_count() for SDMA.
 *
 * The single entry pairs regSDMA_UE_ERR_STATUS_LO with
 * regSDMA_UE_ERR_STATUS_HI and is labelled "SDMA".
 *
 * NOTE(review): the identifier says "v4_2_2" while every other symbol in
 * this code uses "v4_4_2"; this looks like a typo.  Renaming it requires
 * updating every user of the table at the same time.
 * NOTE(review): the literal 1 is presumably a register instance count;
 * confirm against struct amdgpu_ras_err_status_reg_entry.
 */
static const struct amdgpu_ras_err_status_reg_entry sdma_v4_2_2_ue_reg_list[] = {
	{
		AMDGPU_RAS_REG_ENTRY(SDMA0, 0,
				     regSDMA_UE_ERR_STATUS_LO,
				     regSDMA_UE_ERR_STATUS_HI),
		1,
		(AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID),
		"SDMA"
	},
};
/*
 * Maps each SDMA RAS memory id to the name string used for it.
 * Each name is the enumerator's identifier without the "AMDGPU_" prefix.
 */
static const struct amdgpu_ras_memory_id_entry sdma_v4_4_2_ras_memory_list[] = {
	/* MBANK data buffers 0..15 */
	{ AMDGPU_SDMA_MBANK_DATA_BUF0,  "SDMA_MBANK_DATA_BUF0" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF1,  "SDMA_MBANK_DATA_BUF1" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF2,  "SDMA_MBANK_DATA_BUF2" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF3,  "SDMA_MBANK_DATA_BUF3" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF4,  "SDMA_MBANK_DATA_BUF4" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF5,  "SDMA_MBANK_DATA_BUF5" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF6,  "SDMA_MBANK_DATA_BUF6" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF7,  "SDMA_MBANK_DATA_BUF7" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF8,  "SDMA_MBANK_DATA_BUF8" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF9,  "SDMA_MBANK_DATA_BUF9" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF10, "SDMA_MBANK_DATA_BUF10" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF11, "SDMA_MBANK_DATA_BUF11" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF12, "SDMA_MBANK_DATA_BUF12" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF13, "SDMA_MBANK_DATA_BUF13" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF14, "SDMA_MBANK_DATA_BUF14" },
	{ AMDGPU_SDMA_MBANK_DATA_BUF15, "SDMA_MBANK_DATA_BUF15" },

	/* Microcode and command buffers */
	{ AMDGPU_SDMA_UCODE_BUF,        "SDMA_UCODE_BUF" },
	{ AMDGPU_SDMA_RB_CMD_BUF,       "SDMA_RB_CMD_BUF" },
	{ AMDGPU_SDMA_IB_CMD_BUF,       "SDMA_IB_CMD_BUF" },

	/* UTCL1 and data FIFOs */
	{ AMDGPU_SDMA_UTCL1_RD_FIFO,    "SDMA_UTCL1_RD_FIFO" },
	{ AMDGPU_SDMA_UTCL1_RDBST_FIFO, "SDMA_UTCL1_RDBST_FIFO" },
	{ AMDGPU_SDMA_UTCL1_WR_FIFO,    "SDMA_UTCL1_WR_FIFO" },
	{ AMDGPU_SDMA_DATA_LUT_FIFO,    "SDMA_DATA_LUT_FIFO" },
	{ AMDGPU_SDMA_SPLIT_DAT_BUF,    "SDMA_SPLIT_DAT_BUF" },
};
/*
 * Query the uncorrectable error count of one SDMA instance.
 *
 * @adev:           amdgpu device the query is issued for
 * @sdma_inst:      SDMA instance index forwarded to the RAS helper
 * @ras_err_status: opaque pointer, interpreted as struct ras_err_data *;
 *                  the address of its ue_count member is passed to the
 *                  helper as the output location
 *
 * Forwards the UE status register list and the SDMA memory-id table to
 * amdgpu_ras_inst_query_ras_error_count() with the
 * AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE error type.
 */
static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev,
						   uint32_t sdma_inst,
						   void *ras_err_status)
{
	/* void * converts implicitly in C; no cast required */
	struct ras_err_data *ras_data = ras_err_status;

	/* sdma v4_4_2 doesn't support query ce counts */
	amdgpu_ras_inst_query_ras_error_count(adev,
			sdma_v4_2_2_ue_reg_list,
			ARRAY_SIZE(sdma_v4_2_2_ue_reg_list),
			sdma_v4_4_2_ras_memory_list,
			ARRAY_SIZE(sdma_v4_4_2_ras_memory_list),
			sdma_inst,
			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
			&ras_data->ue_count);
}
/*
 * Query RAS error counts for every SDMA instance of the device.
 *
 * @adev:           amdgpu device to query
 * @ras_err_status: opaque pointer passed through unchanged to the
 *                  per-instance query
 *
 * Emits a warning and does nothing else when
 * amdgpu_ras_is_supported() reports no support for AMDGPU_RAS_BLOCK__SDMA.
 * Otherwise runs the per-instance query for each instance selected by a
 * mask covering bits 0 .. adev->sdma.num_instances - 1.
 */
static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev,
					      void *ras_err_status)
{
	uint32_t inst_mask;
	int inst = 0;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
		dev_warn(adev->dev, "SDMA RAS is not supported\n");
		return;
	}

	/*
	 * NOTE(review): assumes adev->sdma.num_instances >= 1; the upper
	 * bound handed to GENMASK() would be negative otherwise - confirm.
	 */
	inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);

	for_each_inst(inst, inst_mask)
		sdma_v4_4_2_inst_query_ras_error_count(adev, inst, ras_err_status);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment