Commit 04c4fcd2, authored by Yang Wang and committed by Alex Deucher

drm/amdgpu: add amdgpu ras aca query interface

v1:
add ACA error query interface

v2:
Add a new helper function to determine whether to use ACA or MCA.
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 0c54e457
...@@ -577,6 +577,9 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle, ...@@ -577,6 +577,9 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
{ {
struct amdgpu_aca *aca = &adev->aca; struct amdgpu_aca *aca = &adev->aca;
if (!amdgpu_aca_is_enabled(adev))
return 0;
return add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data); return add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data);
} }
...@@ -613,6 +616,11 @@ static void aca_manager_fini(struct aca_handle_manager *mgr) ...@@ -613,6 +616,11 @@ static void aca_manager_fini(struct aca_handle_manager *mgr)
remove_aca(handle); remove_aca(handle);
} }
/**
 * amdgpu_aca_is_enabled - report whether ACA is enabled for a device
 * @adev: amdgpu device to query
 *
 * Return: the current value of the is_enabled flag stored in adev->aca.
 */
bool amdgpu_aca_is_enabled(struct amdgpu_device *adev)
{
	bool enabled = adev->aca.is_enabled;

	return enabled;
}
int amdgpu_aca_init(struct amdgpu_device *adev) int amdgpu_aca_init(struct amdgpu_device *adev)
{ {
struct amdgpu_aca *aca = &adev->aca; struct amdgpu_aca *aca = &adev->aca;
......
...@@ -172,6 +172,7 @@ struct aca_smu_funcs { ...@@ -172,6 +172,7 @@ struct aca_smu_funcs {
struct amdgpu_aca { struct amdgpu_aca {
struct aca_handle_manager mgr; struct aca_handle_manager mgr;
const struct aca_smu_funcs *smu_funcs; const struct aca_smu_funcs *smu_funcs;
bool is_enabled;
}; };
struct aca_info { struct aca_info {
...@@ -183,6 +184,7 @@ struct aca_info { ...@@ -183,6 +184,7 @@ struct aca_info {
int amdgpu_aca_init(struct amdgpu_device *adev); int amdgpu_aca_init(struct amdgpu_device *adev);
void amdgpu_aca_fini(struct amdgpu_device *adev); void amdgpu_aca_fini(struct amdgpu_device *adev);
void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs); void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs);
bool amdgpu_aca_is_enabled(struct amdgpu_device *adev);
int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info); int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info);
int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size); int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size);
......
...@@ -1167,6 +1167,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s ...@@ -1167,6 +1167,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
} }
} }
static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
{
struct ras_common_if head;
memset(&head, 0, sizeof(head));
head.block = blk;
return amdgpu_ras_find_obj(adev, &head);
}
/**
 * amdgpu_ras_bind_aca - register the ACA handle of a RAS block
 * @adev: amdgpu device
 * @blk: RAS block whose manager object owns the ACA handle
 * @aca_info: ACA description forwarded to amdgpu_aca_add_handle()
 * @data: opaque pointer forwarded to amdgpu_aca_add_handle()
 *
 * The handle is registered under the name returned by ras_block_str(@blk).
 *
 * Return: -EINVAL when no ras_manager is found for @blk, otherwise the
 * result of amdgpu_aca_add_handle().
 */
int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
			const struct aca_info *aca_info, void *data)
{
	struct ras_manager *mgr = get_ras_manager(adev, blk);

	if (mgr)
		return amdgpu_aca_add_handle(adev, &mgr->aca_handle, ras_block_str(blk),
					     aca_info, data);

	return -EINVAL;
}
/**
 * amdgpu_ras_unbind_aca - remove the ACA handle of a RAS block
 * @adev: amdgpu device
 * @blk: RAS block whose manager object owns the ACA handle
 *
 * Return: -EINVAL when no ras_manager is found for @blk, 0 once
 * amdgpu_aca_remove_handle() has been called on the block's handle.
 */
int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
{
	struct ras_manager *mgr = get_ras_manager(adev, blk);

	if (mgr) {
		amdgpu_aca_remove_handle(&mgr->aca_handle);
		return 0;
	}

	return -EINVAL;
}
/*
 * Collect ACA error data of one error type for a RAS block.
 *
 * The block's ras_manager is looked up and its ACA handle is passed to
 * amdgpu_aca_get_error_data() together with @type and @err_data.
 *
 * Return: -EINVAL when no ras_manager is found for @blk, otherwise the
 * result of amdgpu_aca_get_error_data().
 */
static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
					 enum aca_error_type type, struct ras_err_data *err_data)
{
	struct ras_manager *mgr = get_ras_manager(adev, blk);

	if (mgr)
		return amdgpu_aca_get_error_data(adev, &mgr->aca_handle, type, err_data);

	return -EINVAL;
}
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info, struct ras_query_if *info,
struct ras_err_data *err_data, struct ras_err_data *err_data,
...@@ -1174,6 +1221,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, ...@@ -1174,6 +1221,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
{ {
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT; enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL; struct amdgpu_ras_block_object *block_obj = NULL;
int ret;
if (blk == AMDGPU_RAS_BLOCK_COUNT) if (blk == AMDGPU_RAS_BLOCK_COUNT)
return -EINVAL; return -EINVAL;
...@@ -1202,11 +1250,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, ...@@ -1202,11 +1250,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
block_obj->hw_ops->query_ras_error_status(adev); block_obj->hw_ops->query_ras_error_status(adev);
} }
} }
} else {
if (amdgpu_aca_is_enabled(adev)) {
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
if (ret)
return ret;
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
if (ret)
return ret;
} else { } else {
/* FIXME: add code to check return value later */ /* FIXME: add code to check return value later */
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data); amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data); amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
} }
}
return 0; return 0;
} }
...@@ -1254,6 +1312,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, ...@@ -1254,6 +1312,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
struct amdgpu_hive_info *hive; struct amdgpu_hive_info *hive;
int hive_ras_recovery = 0; int hive_ras_recovery = 0;
...@@ -1264,7 +1323,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, ...@@ -1264,7 +1323,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
} }
if (!amdgpu_ras_is_supported(adev, block) || if (!amdgpu_ras_is_supported(adev, block) ||
!amdgpu_ras_get_mca_debug_mode(adev)) !amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP; return -EOPNOTSUPP;
hive = amdgpu_get_xgmi_hive(adev); hive = amdgpu_get_xgmi_hive(adev);
...@@ -1276,7 +1335,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, ...@@ -1276,7 +1335,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
/* skip ras error reset in gpu reset */ /* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
hive_ras_recovery) && hive_ras_recovery) &&
mca_funcs && mca_funcs->mca_set_debug_mode) ((smu_funcs && smu_funcs->set_debug_mode) ||
(mca_funcs && mca_funcs->mca_set_debug_mode)))
return -EOPNOTSUPP; return -EOPNOTSUPP;
if (block_obj->hw_ops->reset_ras_error_count) if (block_obj->hw_ops->reset_ras_error_count)
...@@ -1772,6 +1832,9 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) ...@@ -1772,6 +1832,9 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
} }
} }
if (amdgpu_aca_is_enabled(adev))
amdgpu_aca_smu_debugfs_init(adev, dir);
else
amdgpu_mca_smu_debugfs_init(adev, dir); amdgpu_mca_smu_debugfs_init(adev, dir);
} }
...@@ -2753,6 +2816,9 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) ...@@ -2753,6 +2816,9 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask; adev->ras_hw_enabled & amdgpu_ras_mask;
/* aca is disabled by default */
adev->aca.is_enabled = false;
} }
static void amdgpu_ras_counte_dw(struct work_struct *work) static void amdgpu_ras_counte_dw(struct work_struct *work)
...@@ -3142,6 +3208,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) ...@@ -3142,6 +3208,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev)) if (amdgpu_sriov_vf(adev))
return 0; return 0;
if (amdgpu_aca_is_enabled(adev))
amdgpu_ras_set_aca_debug_mode(adev, false);
else
amdgpu_ras_set_mca_debug_mode(adev, false); amdgpu_ras_set_mca_debug_mode(adev, false);
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
...@@ -3425,7 +3494,7 @@ int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) ...@@ -3425,7 +3494,7 @@ int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
if (con) { if (con) {
ret = amdgpu_mca_smu_set_debug_mode(adev, enable); ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
if (!ret) if (!ret)
con->is_mca_debug_mode = enable; con->is_aca_debug_mode = enable;
} }
return ret; return ret;
...@@ -3437,24 +3506,29 @@ int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable) ...@@ -3437,24 +3506,29 @@ int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable)
int ret = 0; int ret = 0;
if (con) { if (con) {
if (amdgpu_aca_is_enabled(adev))
ret = amdgpu_aca_smu_set_debug_mode(adev, enable); ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
else
ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
if (!ret) if (!ret)
con->is_mca_debug_mode = enable; con->is_aca_debug_mode = enable;
} }
return ret; return ret;
} }
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!con) if (!con)
return false; return false;
if (mca_funcs && mca_funcs->mca_set_debug_mode) if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) ||
return con->is_mca_debug_mode; (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode))
return con->is_aca_debug_mode;
else else
return true; return true;
} }
...@@ -3464,15 +3538,16 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, ...@@ -3464,15 +3538,16 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
if (!con) { if (!con) {
*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY; *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
return false; return false;
} }
if (mca_funcs && mca_funcs->mca_set_debug_mode) if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode))
*error_query_mode = *error_query_mode =
(con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
else else
*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
......
...@@ -455,7 +455,7 @@ struct amdgpu_ras { ...@@ -455,7 +455,7 @@ struct amdgpu_ras {
/* Indicates smu whether need update bad channel info */ /* Indicates smu whether need update bad channel info */
bool update_channel_flag; bool update_channel_flag;
/* Record status of smu mca debug mode */ /* Record status of smu mca debug mode */
bool is_mca_debug_mode; bool is_aca_debug_mode;
/* Record special requirements of gpu reset caller */ /* Record special requirements of gpu reset caller */
uint32_t gpu_reset_flags; uint32_t gpu_reset_flags;
...@@ -543,6 +543,8 @@ struct ras_manager { ...@@ -543,6 +543,8 @@ struct ras_manager {
struct ras_ih_data ih_data; struct ras_ih_data ih_data;
struct ras_err_data err_data; struct ras_err_data err_data;
struct aca_handle aca_handle;
}; };
struct ras_badpage { struct ras_badpage {
...@@ -794,9 +796,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); ...@@ -794,9 +796,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *mode); unsigned int *mode);
...@@ -834,4 +836,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, ...@@ -834,4 +836,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info, struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count); struct ras_err_addr *err_addr, u64 count);
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances); void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances);
int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
const struct aca_info *aca_info, void *data);
int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk);
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment