Commit 640ae42e authored by John Clements, committed by Alex Deucher

drm/amdgpu: Updated RAS infrastructure

Update the RAS infrastructure to support RAS queries for MCA sub-blocks: MP0, MP1, MPIO and IOHC are now addressed as sub-blocks of a single MCA block rather than as individual top-level RAS blocks.
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6effad8a
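
In outline: the patch stops treating MP0, MP1 and MPIO as top-level RAS blocks and instead addresses them (plus IOHC) as sub-blocks of a single MCA block, parking their ras_manager objects in extra slots past the end of the per-block array. A minimal, self-contained sketch of that indexing rule, using simplified stand-in types rather than the driver's real structures (illustrative only, not part of the patch):

    #include <stdio.h>

    /* Simplified stand-ins for the enums this patch touches. */
    enum ras_block { BLOCK_UMC, BLOCK_GFX, BLOCK_MCA, BLOCK_LAST };
    enum ras_mca_block { MCA_MP0, MCA_MP1, MCA_MPIO, MCA_IOHC, MCA_LAST };

    /* Mirrors the lookup added to amdgpu_ras_create_obj()/amdgpu_ras_find_obj():
     * ordinary blocks index objs[] directly, while MCA sub-blocks live in the
     * MCA_LAST extra slots appended after BLOCK_LAST. Returns -1 when out of
     * range. */
    static int obj_index(enum ras_block block, unsigned int sub_block_index)
    {
        if (block == BLOCK_MCA)
            return sub_block_index < MCA_LAST ?
                (int)(BLOCK_LAST + sub_block_index) : -1;
        return block < BLOCK_LAST ? (int)block : -1;
    }

    int main(void)
    {
        printf("gfx      -> objs[%d]\n", obj_index(BLOCK_GFX, 0));        /* 1 */
        printf("mca/mpio -> objs[%d]\n", obj_index(BLOCK_MCA, MCA_MPIO)); /* 3 + 2 = 5 */
        return 0;
    }
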
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -31,7 +31,7 @@ void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
         uint64_t mc_status_addr,
         unsigned long *error_count)
 {
-    uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
+    uint64_t mc_status = RREG64_PCIE(mc_status_addr);

     if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
         REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
@@ -42,7 +42,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
         uint64_t mc_status_addr,
         unsigned long *error_count)
 {
-    uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
+    uint64_t mc_status = RREG64_PCIE(mc_status_addr);

     if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
         (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
@@ -56,7 +56,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
 void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
         uint64_t mc_status_addr)
 {
-    WREG64_PCIE(mc_status_addr * 4, 0x0ULL);
+    WREG64_PCIE(mc_status_addr, 0x0ULL);
 }

 void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
@@ -87,8 +87,8 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
         if (!mca_dev->ras_if)
             return -ENOMEM;
         mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
+        mca_dev->ras_if->sub_block_index = mca_dev->ras_funcs->ras_sub_block;
         mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-        mca_dev->ras_if->sub_block_index = 0;
     }
     ih_info.head = fs_info.head = *mca_dev->ras_if;
     r = amdgpu_ras_late_init(adev, mca_dev->ras_if,
......
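
For reference, the correctable/uncorrectable split in the two query helpers above keys off the Val bit plus the CECC field (correctable) or the Deferred/UECC fields (uncorrectable) of the MCUMC status register. A standalone sketch of that classification with invented bit positions; the real layout comes from the MCA_UMC_UMC0_MCUMC_STATUST0 register headers, and the uncorrectable condition is abbreviated here since the hunk above is truncated:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical bit positions, for illustration only. */
    #define MC_STATUS_VAL      (1ULL << 63)
    #define MC_STATUS_CECC     (1ULL << 46)
    #define MC_STATUS_UECC     (1ULL << 45)
    #define MC_STATUS_DEFERRED (1ULL << 44)

    /* Count a logged error as correctable or uncorrectable, in the spirit of
     * the two amdgpu_mca_query_*_error_count() helpers. */
    static void classify(uint64_t mc_status, unsigned long *ce, unsigned long *ue)
    {
        if (!(mc_status & MC_STATUS_VAL))
            return; /* nothing valid logged in this status register */
        if (mc_status & MC_STATUS_CECC)
            (*ce)++;
        if (mc_status & (MC_STATUS_DEFERRED | MC_STATUS_UECC))
            (*ue)++;
    }

    int main(void)
    {
        unsigned long ce = 0, ue = 0;
        classify(MC_STATUS_VAL | MC_STATUS_CECC, &ce, &ue);
        classify(MC_STATUS_VAL | MC_STATUS_DEFERRED, &ce, &ue);
        printf("ce=%lu ue=%lu\n", ce, ue); /* prints "ce=1 ue=1" */
        return 0;
    }
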
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -29,6 +29,7 @@ struct amdgpu_mca_ras_funcs {
     void (*query_ras_error_address)(struct amdgpu_device *adev,
             void *ras_error_status);
     uint32_t ras_block;
+    uint32_t ras_sub_block;
     const char* sysfs_name;
 };
......
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -61,9 +61,30 @@ const char *ras_block_string[] = {
     "mp0",
     "mp1",
     "fuse",
-    "mpio",
+    "mca",
 };

+const char *ras_mca_block_string[] = {
+    "mca_mp0",
+    "mca_mp1",
+    "mca_mpio",
+    "mca_iohc",
+};
+
+const char *get_ras_block_str(struct ras_common_if *ras_block)
+{
+    if (!ras_block)
+        return "NULL";
+
+    if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
+        return "OUT OF RANGE";
+
+    if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
+        return ras_mca_block_string[ras_block->sub_block_index];
+
+    return ras_block_string[ras_block->block];
+}
+
 #define ras_err_str(i) (ras_error_string[ffs(i)])

 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
@@ -188,7 +209,7 @@ static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
     for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
         *block_id = i;
-        if (strcmp(name, ras_block_str(i)) == 0)
+        if (strcmp(name, ras_block_string[i]) == 0)
             return 0;
     }
     return -EINVAL;
@@ -510,7 +531,6 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
     if (amdgpu_ras_query_error_status(obj->adev, &info))
         return -EINVAL;

     if (obj->adev->asic_type == CHIP_ALDEBARAN) {
         if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
             DRM_WARN("Failed to reset error counter and error status");
@@ -530,7 +550,7 @@ static inline void put_obj(struct ras_manager *obj)
     if (obj && (--obj->use == 0))
         list_del(&obj->node);
     if (obj && (obj->use < 0))
-        DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block));
+        DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
 }

 /* make one obj and return it. */
@@ -546,7 +566,14 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
     if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
         return NULL;

-    obj = &con->objs[head->block];
+    if (head->block == AMDGPU_RAS_BLOCK__MCA) {
+        if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
+            return NULL;
+
+        obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
+    } else
+        obj = &con->objs[head->block];
+
     /* already exist. return obj? */
     if (alive_obj(obj))
         return NULL;
@@ -574,21 +601,23 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
         if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
             return NULL;

-        obj = &con->objs[head->block];
+        if (head->block == AMDGPU_RAS_BLOCK__MCA) {
+            if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
+                return NULL;
+
+            obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
+        } else
+            obj = &con->objs[head->block];

-        if (alive_obj(obj)) {
-            WARN_ON(head->block != obj->head.block);
+        if (alive_obj(obj))
             return obj;
-        }
     } else {
-        for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
+        for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
             obj = &con->objs[i];

-            if (alive_obj(obj)) {
-                WARN_ON(i != obj->head.block);
+            if (alive_obj(obj))
                 return obj;
-            }
         }
     }

     return NULL;
 }
@@ -627,8 +656,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
      */
     if (!amdgpu_ras_is_feature_allowed(adev, head))
         return 0;
-    if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
-        return 0;

     if (enable) {
         if (!obj) {
@@ -679,18 +706,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
     /* Do not enable if it is not allowed. */
     WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
-    /* Are we already in the state we are going to set? */
-    if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {
-        ret = 0;
-        goto out;
-    }

     if (!amdgpu_ras_intr_triggered()) {
         ret = psp_ras_enable_features(&adev->psp, info, enable);
         if (ret) {
             dev_err(adev->dev, "ras %s %s failed %d\n",
                 enable ? "enable":"disable",
-                ras_block_str(head->block),
+                get_ras_block_str(head),
                 ret);
             goto out;
         }
@@ -732,7 +754,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
             if (!ret)
                 dev_info(adev->dev,
                     "RAS INFO: %s setup object\n",
-                    ras_block_str(head->block));
+                    get_ras_block_str(head));
         }
     } else {
         /* setup the object then issue a ras TA disable cmd.*/
@@ -782,17 +804,39 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
         bool bypass)
 {
     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-    int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
     int i;
-    const enum amdgpu_ras_error_type default_ras_type =
-        AMDGPU_RAS_ERROR__NONE;
+    const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;

-    for (i = 0; i < ras_block_count; i++) {
+    for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
         struct ras_common_if head = {
             .block = i,
             .type = default_ras_type,
             .sub_block_index = 0,
         };
+
+        if (i == AMDGPU_RAS_BLOCK__MCA)
+            continue;
+
         if (bypass) {
             /*
              * bypass psp. vbios enable ras for us.
              * so just create the obj
              */
             if (__amdgpu_ras_feature_enable(adev, &head, 1))
                 break;
         } else {
             if (amdgpu_ras_feature_enable(adev, &head, 1))
                 break;
         }
     }
+
+    for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
+        struct ras_common_if head = {
+            .block = AMDGPU_RAS_BLOCK__MCA,
+            .type = default_ras_type,
+            .sub_block_index = i,
+        };
+
+        if (bypass) {
+            /*
+             * bypass psp. vbios enable ras for us.
@@ -810,6 +854,32 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 }
 /* feature ctl end */

+void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
+        struct ras_common_if *ras_block,
+        struct ras_err_data *err_data)
+{
+    switch (ras_block->sub_block_index) {
+    case AMDGPU_RAS_MCA_BLOCK__MP0:
+        if (adev->mca.mp0.ras_funcs &&
+            adev->mca.mp0.ras_funcs->query_ras_error_count)
+            adev->mca.mp0.ras_funcs->query_ras_error_count(adev, err_data);
+        break;
+    case AMDGPU_RAS_MCA_BLOCK__MP1:
+        if (adev->mca.mp1.ras_funcs &&
+            adev->mca.mp1.ras_funcs->query_ras_error_count)
+            adev->mca.mp1.ras_funcs->query_ras_error_count(adev, err_data);
+        break;
+    case AMDGPU_RAS_MCA_BLOCK__MPIO:
+        if (adev->mca.mpio.ras_funcs &&
+            adev->mca.mpio.ras_funcs->query_ras_error_count)
+            adev->mca.mpio.ras_funcs->query_ras_error_count(adev, err_data);
+        break;
+    default:
+        break;
+    }
+}
+
 /* query/inject/cure begin */
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
         struct ras_query_if *info)
@@ -873,6 +943,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
             adev->hdp.ras_funcs->query_ras_error_count)
             adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
         break;
+    case AMDGPU_RAS_BLOCK__MCA:
+        amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
+        break;
     default:
         break;
     }
@@ -894,13 +967,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                 adev->smuio.funcs->get_socket_id(adev),
                 adev->smuio.funcs->get_die_id(adev),
                 obj->err_data.ce_count,
-                ras_block_str(info->head.block));
+                get_ras_block_str(&info->head));
         } else {
             dev_info(adev->dev, "%ld correctable hardware errors "
                 "detected in %s block, no user "
                 "action is needed.\n",
                 obj->err_data.ce_count,
-                ras_block_str(info->head.block));
+                get_ras_block_str(&info->head));
         }
     }
     if (err_data.ue_count) {
@@ -913,12 +986,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                 adev->smuio.funcs->get_socket_id(adev),
                 adev->smuio.funcs->get_die_id(adev),
                 obj->err_data.ue_count,
-                ras_block_str(info->head.block));
+                get_ras_block_str(&info->head));
         } else {
             dev_info(adev->dev, "%ld uncorrectable hardware errors "
                 "detected in %s block\n",
                 obj->err_data.ue_count,
-                ras_block_str(info->head.block));
+                get_ras_block_str(&info->head));
         }
     }
@@ -1028,9 +1101,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
     case AMDGPU_RAS_BLOCK__SDMA:
     case AMDGPU_RAS_BLOCK__MMHUB:
     case AMDGPU_RAS_BLOCK__PCIE_BIF:
-    case AMDGPU_RAS_BLOCK__MP0:
-    case AMDGPU_RAS_BLOCK__MP1:
-    case AMDGPU_RAS_BLOCK__MPIO:
+    case AMDGPU_RAS_BLOCK__MCA:
         ret = psp_ras_trigger_error(&adev->psp, &block_info);
         break;
     case AMDGPU_RAS_BLOCK__XGMI_WAFL:
@@ -1038,13 +1109,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
         break;
     default:
         dev_info(adev->dev, "%s error injection is not supported yet\n",
-            ras_block_str(info->head.block));
+            get_ras_block_str(&info->head));
         ret = -EINVAL;
     }

     if (ret)
         dev_err(adev->dev, "ras inject %s failed %d\n",
-            ras_block_str(info->head.block), ret);
+            get_ras_block_str(&info->head), ret);

     return ret;
 }
@@ -1387,7 +1458,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
         if (amdgpu_ras_is_supported(adev, obj->head.block) &&
             (obj->attr_inuse == 1)) {
             sprintf(fs_info.debugfs_name, "%s_err_inject",
-                ras_block_str(obj->head.block));
+                get_ras_block_str(&obj->head));
             fs_info.head = obj->head;
             amdgpu_ras_debugfs_create(adev, &fs_info, dir);
         }
@@ -2185,7 +2256,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
         return 0;

     con = kmalloc(sizeof(struct amdgpu_ras) +
-            sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
+            sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
+            sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
             GFP_KERNEL|__GFP_ZERO);
     if (!con)
         return -ENOMEM;
......
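
Two bookkeeping consequences of the sub-block scheme appear in this file: con->objs[] is now allocated with AMDGPU_RAS_MCA_BLOCK_COUNT extra ras_manager slots, and MCA entries get their names from ras_mca_block_string[] through get_ras_block_str(). A compact sketch of the naming side, using trimmed local copies of the tables added above (not the kernel code itself):

    #include <stdio.h>

    enum ras_block { BLOCK_UMC, BLOCK_MP0, BLOCK_MCA, BLOCK_LAST };
    enum ras_mca_block { MCA_MP0, MCA_MP1, MCA_MPIO, MCA_IOHC, MCA_LAST };

    struct ras_common_if {
        enum ras_block block;
        unsigned int sub_block_index;
    };

    /* Trimmed copies of the string tables from amdgpu_ras.c. */
    static const char *block_str[] = { "umc", "mp0", "mca" };
    static const char *mca_str[] = { "mca_mp0", "mca_mp1", "mca_mpio", "mca_iohc" };

    /* Same dispatch as get_ras_block_str(), with an added bounds check on the
     * sub-block index (the kernel relies on its callers to validate it). */
    static const char *block_name(const struct ras_common_if *head)
    {
        if (!head)
            return "NULL";
        if (head->block >= BLOCK_LAST)
            return "OUT OF RANGE";
        if (head->block == BLOCK_MCA && head->sub_block_index < MCA_LAST)
            return mca_str[head->sub_block_index];
        return block_str[head->block];
    }

    int main(void)
    {
        struct ras_common_if mpio = { BLOCK_MCA, MCA_MPIO };
        printf("%s\n", block_name(&mpio)); /* prints "mca_mpio" */
        return 0;
    }
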
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -49,15 +49,22 @@ enum amdgpu_ras_block {
     AMDGPU_RAS_BLOCK__MP0,
     AMDGPU_RAS_BLOCK__MP1,
     AMDGPU_RAS_BLOCK__FUSE,
-    AMDGPU_RAS_BLOCK__MPIO,
+    AMDGPU_RAS_BLOCK__MCA,

     AMDGPU_RAS_BLOCK__LAST
 };

 extern const char *ras_block_string[];

+enum amdgpu_ras_mca_block {
+    AMDGPU_RAS_MCA_BLOCK__MP0 = 0,
+    AMDGPU_RAS_MCA_BLOCK__MP1,
+    AMDGPU_RAS_MCA_BLOCK__MPIO,
+    AMDGPU_RAS_MCA_BLOCK__IOHC,
+    AMDGPU_RAS_MCA_BLOCK__LAST
+};
+
 #define ras_block_str(i) (ras_block_string[i])

 #define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST
+#define AMDGPU_RAS_MCA_BLOCK_COUNT AMDGPU_RAS_MCA_BLOCK__LAST
 #define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)

 enum amdgpu_ras_gfx_subblock {
@@ -544,8 +551,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
         return TA_RAS_BLOCK__MP1;
     case AMDGPU_RAS_BLOCK__FUSE:
         return TA_RAS_BLOCK__FUSE;
-    case AMDGPU_RAS_BLOCK__MPIO:
-        return TA_RAS_BLOCK__MPIO;
+    case AMDGPU_RAS_BLOCK__MCA:
+        return TA_RAS_BLOCK__MCA;
     default:
         WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block);
         return TA_RAS_BLOCK__UMC;
@@ -640,4 +647,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev);
 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);

+const char *get_ras_block_str(struct ras_common_if *ras_block);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
@@ -52,7 +52,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = {
     .ras_fini = mca_v3_0_mp0_ras_fini,
     .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
     .query_ras_error_address = NULL,
-    .ras_block = AMDGPU_RAS_BLOCK__MP0,
+    .ras_block = AMDGPU_RAS_BLOCK__MCA,
+    .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP0,
     .sysfs_name = "mp0_err_count",
 };
@@ -79,7 +80,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = {
     .ras_fini = mca_v3_0_mp1_ras_fini,
     .query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
     .query_ras_error_address = NULL,
-    .ras_block = AMDGPU_RAS_BLOCK__MP1,
+    .ras_block = AMDGPU_RAS_BLOCK__MCA,
+    .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP1,
     .sysfs_name = "mp1_err_count",
 };
@@ -106,7 +108,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = {
     .ras_fini = mca_v3_0_mpio_ras_fini,
     .query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
     .query_ras_error_address = NULL,
-    .ras_block = AMDGPU_RAS_BLOCK__MPIO,
+    .ras_block = AMDGPU_RAS_BLOCK__MCA,
+    .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MPIO,
     .sysfs_name = "mpio_err_count",
 };
......
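
Each mca_v3_0 descriptor above now hangs off the shared MCA block and is distinguished only by its ras_sub_block and sysfs name. Following that pattern, a hypothetical descriptor for the IOHC sub-block, which this patch enumerates but does not wire up, might look like the sketch below; the stand-in types and the IOHC table itself are invented here for illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* Minimal stand-ins so the table shape compiles outside the kernel;
     * enum values are illustrative. */
    enum { AMDGPU_RAS_BLOCK__MCA = 14 };
    enum { AMDGPU_RAS_MCA_BLOCK__IOHC = 3 };
    struct amdgpu_device;
    struct amdgpu_mca_ras_funcs {
        int (*ras_late_init)(struct amdgpu_device *adev);
        void (*ras_fini)(struct amdgpu_device *adev);
        void (*query_ras_error_count)(struct amdgpu_device *adev, void *status);
        void (*query_ras_error_address)(struct amdgpu_device *adev, void *status);
        uint32_t ras_block;
        uint32_t ras_sub_block;
        const char *sysfs_name;
    };

    /* Hypothetical IOHC descriptor in the style of the mp0/mp1/mpio tables;
     * callbacks are left NULL because the patch defines none for IOHC. */
    static const struct amdgpu_mca_ras_funcs mca_v3_0_iohc_ras_funcs = {
        .ras_block = AMDGPU_RAS_BLOCK__MCA,
        .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__IOHC,
        .sysfs_name = "iohc_err_count",
    };

    int main(void)
    {
        printf("%s\n", mca_v3_0_iohc_ras_funcs.sysfs_name);
        return 0;
    }
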
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -387,13 +387,13 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev)
                 "errors detected in %s block, "
                 "no user action is needed.\n",
                 obj->err_data.ce_count,
-                ras_block_str(adev->nbio.ras_if->block));
+                get_ras_block_str(adev->nbio.ras_if));

         if (err_data.ue_count)
             dev_info(adev->dev, "%ld uncorrectable hardware "
                 "errors detected in %s block\n",
                 obj->err_data.ue_count,
-                ras_block_str(adev->nbio.ras_if->block));
+                get_ras_block_str(adev->nbio.ras_if));
     }

     dev_info(adev->dev, "RAS controller interrupt triggered "
......
diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
@@ -73,10 +73,19 @@ enum ta_ras_block {
     TA_RAS_BLOCK__MP0,
     TA_RAS_BLOCK__MP1,
     TA_RAS_BLOCK__FUSE,
-    TA_RAS_BLOCK__MPIO,
+    TA_RAS_BLOCK__MCA,
     TA_NUM_BLOCK_MAX
 };

+enum ta_ras_mca_block
+{
+    TA_RAS_MCA_BLOCK__MP0 = 0,
+    TA_RAS_MCA_BLOCK__MP1 = 1,
+    TA_RAS_MCA_BLOCK__MPIO = 2,
+    TA_RAS_MCA_BLOCK__IOHC = 3,
+    TA_MCA_NUM_BLOCK_MAX
+};
+
 enum ta_ras_error_type {
     TA_RAS_ERROR__NONE = 0,
     TA_RAS_ERROR__PARITY = 1,
......
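
The new ta_ras_mca_block values mirror enum amdgpu_ras_mca_block one for one, which is what allows the driver to pass sub_block_index through to the RAS TA unchanged (see the MCA case in amdgpu_ras_error_inject above). A tiny compile-time check in that spirit, using local copies of both enums; the kernel itself relies on the convention rather than asserting it:

    #include <assert.h>

    /* Local copies of the two enums added by this patch. */
    enum amdgpu_ras_mca_block {
        AMDGPU_RAS_MCA_BLOCK__MP0 = 0,
        AMDGPU_RAS_MCA_BLOCK__MP1,
        AMDGPU_RAS_MCA_BLOCK__MPIO,
        AMDGPU_RAS_MCA_BLOCK__IOHC,
        AMDGPU_RAS_MCA_BLOCK__LAST
    };

    enum ta_ras_mca_block {
        TA_RAS_MCA_BLOCK__MP0 = 0,
        TA_RAS_MCA_BLOCK__MP1 = 1,
        TA_RAS_MCA_BLOCK__MPIO = 2,
        TA_RAS_MCA_BLOCK__IOHC = 3,
        TA_MCA_NUM_BLOCK_MAX
    };

    /* Compile-time check that driver-side and TA-side numbering stay in step. */
    static_assert(AMDGPU_RAS_MCA_BLOCK__MPIO == (int)TA_RAS_MCA_BLOCK__MPIO,
                  "MCA sub-block numbering must match the RAS TA");

    int main(void) { return 0; }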