Commit 6475ae2b authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)

Add helper functions to query and reset RAS UTCL2 poison status.

v2: implement it on amdgpu side and kfd only calls it.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 9d8a8d78
...@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo ...@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
else if (reset) else if (reset)
amdgpu_amdkfd_gpu_reset(adev); amdgpu_amdkfd_gpu_reset(adev);
} }
/**
 * amdgpu_amdkfd_ras_query_utcl2_poison_status - query UTCL2 RAS poison status
 * @adev: amdgpu device to query
 *
 * Forwards the request to the query_utcl2_poison_status callback of the
 * device's GFX RAS object when one is available.
 *
 * Return: the value returned by the callback, or false when the device has
 * no GFX RAS object or that object provides no callback.
 */
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
{
	/*
	 * gfx.ras is a pointer; make sure it is set before looking at the
	 * callback, otherwise a device without a GFX RAS object would be
	 * dereferenced through NULL here.
	 */
	if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
		return adev->gfx.ras->query_utcl2_poison_status(adev);

	return false;
}
...@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, ...@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p); void amdgpu_amdkfd_block_mmu_notifications(void *p);
int amdgpu_amdkfd_criu_resume(void *p); int amdgpu_amdkfd_criu_resume(void *p);
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
#if IS_ENABLED(CONFIG_HSA_AMD) #if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void); void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
......
...@@ -202,6 +202,7 @@ struct amdgpu_cu_info { ...@@ -202,6 +202,7 @@ struct amdgpu_cu_info {
struct amdgpu_gfx_ras { struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object ras_block; struct amdgpu_ras_block_object ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev); void (*enable_watchdog_timer)(struct amdgpu_device *adev);
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
}; };
struct amdgpu_gfx_funcs { struct amdgpu_gfx_funcs {
......
...@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev) ...@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex); mutex_unlock(&adev->grbm_idx_mutex);
} }
/*
 * Read the VM L2 protection fault status register of GFX hub 0, then write
 * bit 0 of the matching fault control register (which, per the inline
 * comment, resets the page fault status), and finally report the FED field
 * of the status value that was read before the reset.
 *
 * Returns true when the FED field is non-zero, false otherwise.
 */
static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
{
	struct amdgpu_vmhub *gfxhub = &adev->vmhub[AMDGPU_GFXHUB_0];
	/* Capture the status first; the write below resets it. */
	u32 fault_status = RREG32(gfxhub->vm_l2_pro_fault_status);

	/* reset page fault status */
	WREG32_P(gfxhub->vm_l2_pro_fault_cntl, 1, ~1);

	return REG_GET_FIELD(fault_status, VM_L2_PROTECTION_FAULT_STATUS, FED);
}
struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = { struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = {
.ras_error_inject = &gfx_v9_4_2_ras_error_inject, .ras_error_inject = &gfx_v9_4_2_ras_error_inject,
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count, .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
...@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = { ...@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
.hw_ops = &gfx_v9_4_2_ras_ops, .hw_ops = &gfx_v9_4_2_ras_ops,
}, },
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer, .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
.query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
}; };
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment