Commit 61380faa authored by John Clements, committed by Alex Deucher

drm/amdgpu: disable ras query and inject during gpu reset

added a flag to the ras context to indicate whether ras query functionality is ready
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 66399248
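
For context: the patch gates the RAS query and injection entry points on a per-device readiness flag rather than the global amdgpu_ras_intr_triggered() check. The flag is cleared for every device at the start of amdgpu_device_gpu_recover() and set again once amdgpu_ras_late_init() completes. Below is a minimal user-space sketch of that pattern; all names in it (device_ctx, query_errors, gpu_recover, and so on) are hypothetical stand-ins, not the kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-device RAS context that gains error_query_ready. */
struct device_ctx {
	bool error_query_ready;
};

static void set_error_query_ready(struct device_ctx *dev, bool ready)
{
	if (dev)
		dev->error_query_ready = ready;
}

static bool get_error_query_ready(struct device_ctx *dev)
{
	return dev ? dev->error_query_ready : false;
}

/* Query/inject entry point: refuse to touch RAS state during a reset. */
static int query_errors(struct device_ctx *dev)
{
	if (!get_error_query_ready(dev)) {
		fprintf(stderr, "query currently inaccessible\n");
		return -1;
	}
	/* ... read or inject error state here ... */
	return 0;
}

static void gpu_recover(struct device_ctx *dev)
{
	/* Block queries before the reset begins ... */
	set_error_query_ready(dev, false);
	/* ... perform the reset ... */
	/* ... then re-arm queries once late init has re-initialized RAS. */
	set_error_query_ready(dev, true);
}

int main(void)
{
	struct device_ctx dev = { .error_query_ready = true };

	gpu_recover(&dev);
	return query_errors(&dev) ? 1 : 0;
}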
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -4168,6 +4168,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	need_full_reset = job_signaled = false;
 	INIT_LIST_HEAD(&device_list);
 
+	amdgpu_ras_set_error_query_ready(adev, false);
+
 	dev_info(adev->dev, "GPU %s begin!\n",
 		(in_ras_intr && !use_baco) ? "jobs stop":"reset");
@@ -4224,6 +4226,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (tmp_adev != adev) {
+			amdgpu_ras_set_error_query_ready(tmp_adev, false);
 			amdgpu_device_lock_adev(tmp_adev, false);
 			if (!amdgpu_sriov_vf(tmp_adev))
 				amdgpu_amdkfd_pre_reset(tmp_adev);
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

@@ -80,6 +80,20 @@ atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
 
+void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
+{
+	if (adev)
+		amdgpu_ras_get_context(adev)->error_query_ready = ready;
+}
+
+bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
+{
+	if (adev)
+		return amdgpu_ras_get_context(adev)->error_query_ready;
+
+	return false;
+}
+
 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
 					size_t size, loff_t *pos)
 {
@@ -281,7 +295,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 	struct ras_debug_if data;
 	int ret = 0;
 
-	if (amdgpu_ras_intr_triggered()) {
+	if (!amdgpu_ras_get_error_query_ready(adev)) {
 		DRM_WARN("RAS WARN: error injection currently inaccessible\n");
 		return size;
 	}
@@ -399,7 +413,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 		.head = obj->head,
 	};
 
-	if (amdgpu_ras_intr_triggered())
+	if (!amdgpu_ras_get_error_query_ready(obj->adev))
 		return snprintf(buf, PAGE_SIZE,
 				"Query currently inaccessible\n");
@@ -1886,8 +1900,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	}
 
 	/* in resume phase, no need to create ras fs node */
-	if (adev->in_suspend || adev->in_gpu_reset)
+	if (adev->in_suspend || adev->in_gpu_reset) {
+		amdgpu_ras_set_error_query_ready(adev, true);
 		return 0;
+	}
 
 	if (ih_info->cb) {
 		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
@@ -1899,6 +1915,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	if (r)
 		goto sysfs;
 
+	amdgpu_ras_set_error_query_ready(adev, true);
+
 	return 0;
 cleanup:
 	amdgpu_ras_sysfs_remove(adev, ras_block);
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

@@ -334,6 +334,8 @@ struct amdgpu_ras {
 	uint32_t flags;
 	bool reboot;
 	struct amdgpu_ras_eeprom_control eeprom_control;
+
+	bool error_query_ready;
 };
 
 struct ras_fs_data {
@@ -629,4 +631,6 @@ static inline void amdgpu_ras_intr_cleared(void)
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
 
+void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);
+
 #endif