Commit 322a7e00 authored by Hawking Zhang's avatar Hawking Zhang Committed by Alex Deucher

drm/amdgpu: Add common helper to query ras error (v2)

Add common helper to query ras error status and
log error information, including memory block id
and erorr count. The helpers are applicable to IP
blocks that follow the new ras error logging design.
For IP blocks that don't support the new design,
please still implement ip specific helper to query
ras error.

v2: optimize struct amdgpu_ras_err_status_reg_entry
and the implementaion in helper (Lijo/Tao)
Signed-off-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Reviewed-by: default avatarLijo Lazar <lijo.lazar@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent cbf9e46a
...@@ -3103,3 +3103,122 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, ...@@ -3103,3 +3103,122 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
return 0; return 0;
} }
void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name)
{
if (!err_type_name)
return;
switch (err_type) {
case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
sprintf(err_type_name, "correctable");
break;
case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
sprintf(err_type_name, "uncorrectable");
break;
default:
sprintf(err_type_name, "unknown");
break;
}
}
bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
const struct amdgpu_ras_err_status_reg_entry *reg_entry,
uint32_t instance,
uint32_t *memory_id)
{
uint32_t err_status_lo_data, err_status_lo_offset;
if (!reg_entry)
return false;
err_status_lo_offset =
AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
reg_entry->seg_lo, reg_entry->reg_lo);
err_status_lo_data = RREG32(err_status_lo_offset);
if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) &&
!REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG))
return false;
*memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID);
return true;
}
bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
const struct amdgpu_ras_err_status_reg_entry *reg_entry,
uint32_t instance,
unsigned long *err_cnt)
{
uint32_t err_status_hi_data, err_status_hi_offset;
if (!reg_entry)
return false;
err_status_hi_offset =
AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
reg_entry->seg_hi, reg_entry->reg_hi);
err_status_hi_data = RREG32(err_status_hi_offset);
if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
!REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
return false;
/* read err count */
*err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);
return true;
}
void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
const struct amdgpu_ras_err_status_reg_entry *reg_list,
uint32_t reg_list_size,
const struct amdgpu_ras_memory_id_entry *mem_list,
uint32_t mem_list_size,
uint32_t instance,
uint32_t err_type,
unsigned long *err_count)
{
uint32_t memory_id;
unsigned long err_cnt;
char err_type_name[16];
uint32_t i, j;
for (i = 0; i < reg_list_size; i++) {
/* query err_cnt from err_status_hi */
if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
instance, &err_cnt) ||
!err_cnt)
continue;
/* query memory_id from err_status_lo */
if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
instance, &memory_id))
continue;
*err_count += err_cnt;
/* log the errors */
amdgpu_ras_get_error_type_name(err_type, err_type_name);
if (!mem_list) {
/* memory_list is not supported */
dev_info(adev->dev,
"%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n",
err_cnt, err_type_name,
reg_list[i].block_name,
instance, memory_id);
} else {
for (j = 0; j < mem_list_size; j++) {
if (memory_id == mem_list[j].memory_id) {
dev_info(adev->dev,
"%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n",
err_cnt, err_type_name,
reg_list[i].block_name,
instance, mem_list[j].name);
break;
}
}
}
}
}
...@@ -314,6 +314,43 @@ enum amdgpu_ras_ret { ...@@ -314,6 +314,43 @@ enum amdgpu_ras_ret {
AMDGPU_RAS_PT, AMDGPU_RAS_PT,
}; };
/* ras error status reisger fields */
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L
#define ERR_STATUS_LO__MEMORY_ID__SHIFT 0x18
#define ERR_STATUS_LO__MEMORY_ID_MASK 0xFF000000L
#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG__SHIFT 0x2
#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG_MASK 0x00000004L
#define ERR_STATUS__ERR_CNT__SHIFT 0x17
#define ERR_STATUS__ERR_CNT_MASK 0x03800000L
#define AMDGPU_RAS_REG_ENTRY(ip, inst, reg_lo, reg_hi) \
ip##_HWIP, inst, reg_lo##_BASE_IDX, reg_lo, reg_hi##_BASE_IDX, reg_hi
#define AMDGPU_RAS_REG_ENTRY_OFFSET(hwip, ip_inst, segment, reg) \
(adev->reg_offset[hwip][ip_inst][segment] + (reg))
#define AMDGPU_RAS_ERR_INFO_VALID (1 << 0)
#define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1)
#define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2)
struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip;
uint32_t ip_inst;
uint32_t seg_lo;
uint32_t reg_lo;
uint32_t seg_hi;
uint32_t reg_hi;
uint32_t reg_inst;
uint32_t flags;
const char *block_name;
};
struct amdgpu_ras_memory_id_entry {
uint32_t memory_id;
const char *name;
};
struct ras_common_if { struct ras_common_if {
enum amdgpu_ras_block block; enum amdgpu_ras_block block;
enum amdgpu_ras_error_type type; enum amdgpu_ras_error_type type;
...@@ -696,4 +733,21 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co ...@@ -696,4 +733,21 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj); struct amdgpu_ras_block_object *ras_block_obj);
void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev);
void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name);
bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
const struct amdgpu_ras_err_status_reg_entry *reg_entry,
uint32_t instance,
uint32_t *memory_id);
bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
const struct amdgpu_ras_err_status_reg_entry *reg_entry,
uint32_t instance,
unsigned long *err_cnt);
void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
const struct amdgpu_ras_err_status_reg_entry *reg_list,
uint32_t reg_list_size,
const struct amdgpu_ras_memory_id_entry *mem_list,
uint32_t mem_list_size,
uint32_t instance,
uint32_t err_type,
unsigned long *err_count);
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment