drm/amdgpu: Do a basic health check before reset

Check if the device is present in the bus before trying to recover. It could be that device itself is lost from the bus in some hang situations. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Asad Kamal <asad.kamal@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drm/amdgpu: Do a basic health check before reset
Check if the device is present in the bus before trying to recover. It could be that device itself is lost from the bus in some hang situations. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Asad Kamal <asad.kamal@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
dfe9c3cd · Lijo Lazar · Alex Deucher · f1b8479d · dfe9c3cd
Commit dfe9c3cd authored Mar 13, 2024 by Lijo Lazar Committed by Alex Deucher Mar 20, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 24 additions and 0 deletions

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +24 -0

No files found.
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5532,6 +5532,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)

 }

+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+	struct amdgpu_device *tmp_adev;
+	int ret = 0;
+	u32 status;
+
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+		if (PCI_POSSIBLE_ERROR(status)) {
+			dev_err(tmp_adev->dev, "device lost from bus!");
+			ret = -ENODEV;
+		}
+	}
+
+	return ret;
+}
+
 /**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
@@ -5603,6 +5620,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		device_list_handle = &device_list;
 	}

+	if (!amdgpu_sriov_vf(adev)) {
+		r = amdgpu_device_health_check(device_list_handle);
+		if (r)
+			goto end_reset;
+	}
+
 	/* We need to lock reset domain only once both for XGMI and single device */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 				    reset_list);
@@ -5768,6 +5791,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 					    reset_list);
 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

+end_reset:
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
 		amdgpu_put_xgmi_hive(hive);