Commit d5ea093e authored by Andrey Grodzovsky, committed by Alex Deucher

drm/amdgpu: Add system auto reboot to RAS.

In case of a RAS error, allow the user to configure automatic
system reboot through ras_ctrl.
This is also part of the temporary workaround for the RAS
hang problem.

v4: Use latest kernel API for disk sync.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 7c6e68c7
...@@ -65,6 +65,8 @@ ...@@ -65,6 +65,8 @@
#include "amdgpu_ras.h" #include "amdgpu_ras.h"
#include "amdgpu_pmu.h" #include "amdgpu_pmu.h"
#include <linux/suspend.h>
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
...@@ -3769,6 +3771,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -3769,6 +3771,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0; int i, r = 0;
bool in_ras_intr = amdgpu_ras_intr_triggered(); bool in_ras_intr = amdgpu_ras_intr_triggered();
/*
* Flush RAM to disk so that after the reboot
* the user can read the log and see why the system rebooted.
*/
if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
DRM_WARN("Emergency reboot.");
ksys_sync_helper();
emergency_restart();
}
need_full_reset = job_signaled = false; need_full_reset = job_signaled = false;
INIT_LIST_HEAD(&device_list); INIT_LIST_HEAD(&device_list);
......
...@@ -156,6 +156,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, ...@@ -156,6 +156,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
op = 1; op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2; op = 2;
else if (sscanf(str, "reboot %32s", block_name) == 1)
op = 3;
else if (str[0] && str[1] && str[2] && str[3]) else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */ /* ascii string, but commands are not matched. */
return -EINVAL; return -EINVAL;
...@@ -289,6 +291,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * ...@@ -289,6 +291,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
/* data.inject.address is offset instead of absolute gpu address */ /* data.inject.address is offset instead of absolute gpu address */
ret = amdgpu_ras_error_inject(adev, &data.inject); ret = amdgpu_ras_error_inject(adev, &data.inject);
break; break;
case 3:
amdgpu_ras_get_context(adev)->reboot = true;
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
break; break;
...@@ -1746,6 +1751,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) ...@@ -1746,6 +1751,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{ {
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n"); DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
amdgpu_ras_reset_gpu(adev, false);
} }
} }
...@@ -334,7 +334,7 @@ struct amdgpu_ras { ...@@ -334,7 +334,7 @@ struct amdgpu_ras {
struct mutex recovery_lock; struct mutex recovery_lock;
uint32_t flags; uint32_t flags;
bool reboot;
struct amdgpu_ras_eeprom_control eeprom_control; struct amdgpu_ras_eeprom_control eeprom_control;
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment