Commit dfe9c3cd authored by Lijo Lazar, committed by Alex Deucher

drm/amdgpu: Do a basic health check before reset

Check whether the device is still present on the bus before trying to
recover. In some hang situations the device itself may have been lost
from the bus.
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f1b8479d
......@@ -5532,6 +5532,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
}
/**
 * amdgpu_device_health_check - check that devices still respond on the bus
 *
 * @device_list_handle: list of amdgpu devices, linked through reset_list
 *
 * Reads the PCI_COMMAND dword from the config space of every device on the
 * list. A device is reported as lost when the read itself fails or when the
 * value read back is one that PCI_POSSIBLE_ERROR() flags. Every device on
 * the list is checked and every failure is logged; the walk does not stop at
 * the first lost device.
 *
 * Returns: 0 if all devices responded, -ENODEV if at least one did not.
 */
static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;
	u32 status;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Treat an explicit config-read failure the same way as an
		 * error-pattern response: in both cases the device cannot be
		 * reached and a reset attempt is pointless.
		 */
		if (pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status) ||
		    PCI_POSSIBLE_ERROR(status)) {
			dev_err(tmp_adev->dev, "device lost from bus!\n");
			ret = -ENODEV;
		}
	}

	return ret;
}
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
......@@ -5603,6 +5620,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
device_list_handle = &device_list;
}
if (!amdgpu_sriov_vf(adev)) {
r = amdgpu_device_health_check(device_list_handle);
if (r)
goto end_reset;
}
/* We need to lock reset domain only once both for XGMI and single device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
......@@ -5768,6 +5791,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
end_reset:
if (hive) {
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment