Commit 2c7cd280, authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: gpu recovers from fatal error in poison mode

When a fatal error occurs in RAS poison mode, a mode1 reset
is used to recover the GPU.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 50a7c876
...@@ -2065,6 +2065,14 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) ...@@ -2065,6 +2065,14 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
reset_context.method = AMD_RESET_METHOD_MODE2; reset_context.method = AMD_RESET_METHOD_MODE2;
} }
/* Fatal error occurs in poison mode, mode1 reset is used to
* recover gpu.
*/
if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
}
} }
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
...@@ -2955,9 +2963,12 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) ...@@ -2955,9 +2963,12 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
return; return;
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
dev_info(adev->dev, "uncorrectable hardware error" dev_info(adev->dev, "uncorrectable hardware error"
"(ERREVENT_ATHUB_INTERRUPT) detected!\n"); "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
} }
} }
......
...@@ -340,6 +340,7 @@ enum amdgpu_ras_ret { ...@@ -340,6 +340,7 @@ enum amdgpu_ras_ret {
#define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2) #define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2)
#define AMDGPU_RAS_GPU_RESET_MODE2_RESET (0x1 << 0) #define AMDGPU_RAS_GPU_RESET_MODE2_RESET (0x1 << 0)
#define AMDGPU_RAS_GPU_RESET_MODE1_RESET (0x1 << 1)
struct amdgpu_ras_err_status_reg_entry { struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip; uint32_t hwip;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment