Commit 314c38cd authored by YiPeng Chai's avatar YiPeng Chai Committed by Alex Deucher

drm/amdgpu: retire bad pages for umc v12_0

Retire bad pages for umc v12_0.
Signed-off-by: default avatarYiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent e74313be
...@@ -28,6 +28,8 @@ ...@@ -28,6 +28,8 @@
#include "umc/umc_12_0_0_sh_mask.h" #include "umc/umc_12_0_0_sh_mask.h"
#include "mp/mp_13_0_6_sh_mask.h" #include "mp/mp_13_0_6_sh_mask.h"
#define MAX_ECC_NUM_PER_RETIREMENT 32
static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev, static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
uint32_t node_inst, uint32_t node_inst,
uint32_t umc_inst, uint32_t umc_inst,
...@@ -374,6 +376,7 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev, ...@@ -374,6 +376,7 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
return 0; return 0;
} }
#ifdef TO_BE_REMOVED
static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev, static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status) void *ras_error_status)
{ {
...@@ -442,6 +445,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade ...@@ -442,6 +445,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
} }
} }
} }
#endif
static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev, static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, void *ras_error_status) enum amdgpu_mca_error_type type, void *ras_error_status)
...@@ -633,6 +637,58 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ...@@ -633,6 +637,58 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
return 0; return 0;
} }
static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
struct ras_ecc_err *ecc_err, void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
uint32_t i = 0;
int ret = 0;
if (!err_data || !ecc_err)
return -EINVAL;
for (i = 0; i < ecc_err->err_pages.count; i++) {
ret = amdgpu_umc_fill_error_record(err_data,
ecc_err->addr,
ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
MCA_IPID_2_UMC_CH(ecc_err->ipid),
MCA_IPID_2_UMC_INST(ecc_err->ipid));
if (ret)
break;
}
err_data->de_count++;
return ret;
}
static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
void *ras_error_status)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
struct radix_tree_root *ecc_tree;
int new_detected, ret, i;
ecc_tree = &con->umc_ecc_log.de_page_tree;
mutex_lock(&con->umc_ecc_log.lock);
new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
for (i = 0; i < new_detected; i++) {
if (!entries[i])
continue;
ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
if (ret) {
dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
break;
}
radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
}
mutex_unlock(&con->umc_ecc_log.lock);
}
struct amdgpu_umc_ras umc_v12_0_ras = { struct amdgpu_umc_ras umc_v12_0_ras = {
.ras_block = { .ras_block = {
.hw_ops = &umc_v12_0_ras_hw_ops, .hw_ops = &umc_v12_0_ras_hw_ops,
...@@ -640,8 +696,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = { ...@@ -640,8 +696,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
}, },
.err_cnt_init = umc_v12_0_err_cnt_init, .err_cnt_init = umc_v12_0_err_cnt_init,
.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count, .ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
.check_ecc_err_status = umc_v12_0_check_ecc_err_status, .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
.update_ecc_status = umc_v12_0_update_ecc_status, .update_ecc_status = umc_v12_0_update_ecc_status,
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment