drm/amdgpu: Adjust removal control flow for smu v13_0_2

Adjust removal control flow for smu v13_0_2: During amdgpu uninstallation, when removing the first device, the kernel needs to first send a mode1reset message to all gpu devices. Otherwise, smu initialization will fail the next time amdgpu is installed. V2: 1. Update commit comments. 2. Remove the global variable amdgpu_device_remove_cnt and add a variable to the structure amdgpu_hive_info. 3. Use hive to detect the first removed device instead of a global variable. V3: 1. Update commit comments. 2. Split a patch into multiple patches. 3. The current patch does: a. Add a work mode of AMDGPU_RESET_FOR_DEVICE_REMOVE into the existing gpu recover path, which make all devices in hive list only have HW reset but no resume (except the base IP). b. Call AMDGPU_RESET_FOR_DEVICE_REMOVE and AMDGPU_NEED_FULL_RESET mode of amdgpu_device_gpu_recover in amdgpu_pci_remove when removing the first device in hive list. c. When removing the first device, the IP blocks keyword function call sequence is as follows: .suspend->mode1reset->.resume(basic ip)->.hw_fini->.early_fini->.sw_fini. ^ | |-<----------<---------<----| The first three sequences are because of a call to amdgpu_device_gpu_recover. The three sequences will be executed in a loop until all devices in the hive list are iterated. The sequences starting from .hw_fini only apply to the first device. Since .suspend has been called before, except the resumed phase1 basic ip blocks, all other ip blocks .hw_fini of current device will do nothing. d. When removing other devices, the calling sequences is the same as legacy: .hw_fini -> .early_fini -> .sw_fini. Since .suspend has been called when removing the first device, except the resumed phase1 basic ip blocks, all of other ip blocks .hw_fini of current device will do nothing. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drm/amdgpu: Adjust removal control flow for smu v13_0_2
Adjust removal control flow for smu v13_0_2: During amdgpu uninstallation, when removing the first device, the kernel needs to first send a mode1reset message to all gpu devices. Otherwise, smu initialization will fail the next time amdgpu is installed. V2: 1. Update commit comments. 2. Remove the global variable amdgpu_device_remove_cnt and add a variable to the structure amdgpu_hive_info. 3. Use hive to detect the first removed device instead of a global variable. V3: 1. Update commit comments. 2. Split a patch into multiple patches. 3. The current patch does: a. Add a work mode of AMDGPU_RESET_FOR_DEVICE_REMOVE into the existing gpu recover path, which make all devices in hive list only have HW reset but no resume (except the base IP). b. Call AMDGPU_RESET_FOR_DEVICE_REMOVE and AMDGPU_NEED_FULL_RESET mode of amdgpu_device_gpu_recover in amdgpu_pci_remove when removing the first device in hive list. c. When removing the first device, the IP blocks keyword function call sequence is as follows: .suspend->mode1reset->.resume(basic ip)->.hw_fini->.early_fini->.sw_fini. ^ | |-<----------<---------<----| The first three sequences are because of a call to amdgpu_device_gpu_recover. The three sequences will be executed in a loop until all devices in the hive list are iterated. The sequences starting from .hw_fini only apply to the first device. Since .suspend has been called before, except the resumed phase1 basic ip blocks, all other ip blocks .hw_fini of current device will do nothing. d. When removing other devices, the calling sequences is the same as legacy: .hw_fini -> .early_fini -> .sw_fini. Since .suspend has been called when removing the first device, except the resumed phase1 basic ip blocks, all of other ip blocks .hw_fini of current device will do nothing. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
f5c7e779 · YiPeng Chai · Alex Deucher · e4cf73fd · f5c7e779 · f5c7e779
Commit f5c7e779 authored Sep 07, 2022 by YiPeng Chai Committed by Alex Deucher Sep 19, 2022
4 changed files
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4749,6 +4749,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset, skip_hw_reset, vram_lost = false;
 	int r = 0;
+	bool gpu_reset_for_dev_remove = 0;
 	/* Try reset handler method first */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -4768,6 +4769,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
+	gpu_reset_for_dev_remove =
+		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 	/*
 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
 	 * to allow proper links negotiation in FW (within 1 sec)
@@ -4812,6 +4817,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 		amdgpu_ras_intr_cleared();
 	}
+	/* Since the mode1 reset affects base ip blocks, the
+	 * phase1 ip blocks need to be resumed. Otherwise there
+	 * will be a BIOS signature error and the psp bootloader
+	 * can't load kdb on the next amdgpu install.
+	 */
+	if (gpu_reset_for_dev_remove) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+			amdgpu_device_ip_resume_phase1(tmp_adev);
+		goto end;
+	}
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		if (need_full_reset) {
 			/* post card */
@@ -5134,6 +5151,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
 	int tmp_vram_lost_counter;
+	bool gpu_reset_for_dev_remove = false;
+	gpu_reset_for_dev_remove =
+			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
@@ -5253,6 +5275,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+		if (gpu_reset_for_dev_remove) {
+			/* Workaroud for ASICs need to disable SMC first */
+			amdgpu_device_smu_fini_early(tmp_adev);
+		}
 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
 		/*TODO Should we stop ?*/
 		if (r) {
@@ -5286,6 +5312,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			adev->asic_reset_res = 0;
 			goto retry;
 		}
+		if (!r && gpu_reset_for_dev_remove)
+			goto recover_end;
 	}
 skip_hw_reset:
@@ -5359,6 +5388,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_device_unset_mp1_state(tmp_adev);
 	}
+recover_end:
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 					    reset_list);
 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2186,6 +2186,36 @@ amdgpu_pci_remove(struct pci_dev *pdev)
 		pm_runtime_forbid(dev->dev);
 	}
+	if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) {
+		bool need_to_reset_gpu = false;
+		if (adev->gmc.xgmi.num_physical_nodes > 1) {
+			struct amdgpu_hive_info *hive;
+			hive = amdgpu_get_xgmi_hive(adev);
+			if (hive->device_remove_count == 0)
+				need_to_reset_gpu = true;
+			hive->device_remove_count++;
+			amdgpu_put_xgmi_hive(hive);
+		} else {
+			need_to_reset_gpu = true;
+		}
+		/* Workaround for ASICs need to reset SMU.
+		 * Called only when the first device is removed.
+		 */
+		if (need_to_reset_gpu) {
+			struct amdgpu_reset_context reset_context;
+			memset(&reset_context, 0, sizeof(reset_context));
+			reset_context.method = AMD_RESET_METHOD_NONE;
+			reset_context.reset_req_dev = adev;
+			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+			set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
+			amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+		}
+	}
 	amdgpu_driver_unload_kms(dev);
 	drm_dev_unplug(dev);

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -31,6 +31,7 @@ enum AMDGPU_RESET_FLAGS {
 	AMDGPU_NEED_FULL_RESET = 0,
 	AMDGPU_SKIP_HW_RESET = 1,
 	AMDGPU_SKIP_MODE2_RESET = 2,
+	AMDGPU_RESET_FOR_DEVICE_REMOVE = 3,
 };
 struct amdgpu_reset_context {

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -43,6 +43,7 @@ struct amdgpu_hive_info {
 	} pstate;
 	struct amdgpu_reset_domain *reset_domain;
+	uint32_t device_remove_count;
 };
 struct amdgpu_pcs_ras_field {