Commit 66399248 authored by John Clements's avatar John Clements Committed by Alex Deucher

drm/amdgpu: added xgmi ras error reset sequence

added mechanism to clear xgmi ras status inbetween error queries
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarJohn Clements <john.clements@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 3aa0115d
...@@ -604,6 +604,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) ...@@ -604,6 +604,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
adev->gmc.xgmi.num_physical_nodes == 0) adev->gmc.xgmi.num_physical_nodes == 0)
return 0; return 0;
amdgpu_xgmi_reset_ras_error_count(adev);
if (!adev->gmc.xgmi.ras_if) { if (!adev->gmc.xgmi.ras_if) {
adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
if (!adev->gmc.xgmi.ras_if) if (!adev->gmc.xgmi.ras_if)
...@@ -668,6 +670,32 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, ...@@ -668,6 +670,32 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
return addr + dram_base_addr; return addr + dram_base_addr;
} }
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
WREG32_PCIE(pcs_status_reg, 0);
}
void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
uint32_t i;
switch (adev->asic_type) {
case CHIP_ARCTURUS:
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
pcs_clear_status(adev,
xgmi_pcs_err_status_reg_arct[i]);
break;
case CHIP_VEGA20:
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
pcs_clear_status(adev,
xgmi_pcs_err_status_reg_vg20[i]);
break;
default:
break;
}
}
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
uint32_t value, uint32_t value,
uint32_t *ue_count, uint32_t *ue_count,
...@@ -758,6 +786,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, ...@@ -758,6 +786,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
break; break;
} }
amdgpu_xgmi_reset_ras_error_count(adev);
err_data->ue_count += ue_cnt; err_data->ue_count += ue_cnt;
err_data->ce_count += ce_cnt; err_data->ce_count += ce_cnt;
......
...@@ -56,6 +56,7 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, ...@@ -56,6 +56,7 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
uint64_t addr); uint64_t addr);
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status); void *ras_error_status);
void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev);
static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev, static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
struct amdgpu_device *bo_adev) struct amdgpu_device *bo_adev)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment