Commit 087a3e13 authored by Christian König, committed by Alex Deucher

drm/amdgpu: revert "Adjust removal control flow for smu v13_0_2"

Calling amdgpu_device_ip_resume_phase1() during shutdown leaves the
HW in an active state and is an unbalanced use of the IP callbacks.

Using the IP callbacks like this can lead to memory leaks, double
frees and imbalanced reference counters.

Leaving the HW in an active state can lead to DMA accesses to memory now
freed by the driver.

Both are a complete no-go for driver unload, so completely revert
the workaround for now.

This reverts commit f5c7e779.
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 8a1f7fdd
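The hazard the commit message describes can be illustrated with a tiny
userspace model (purely illustrative; none of these names exist in the
driver). "Resume" arms the hardware and allocates DMA-visible memory;
teardown assumes a matching suspend already quiesced the block:

#include <stdlib.h>

/* Toy model of a paired bring-up/teardown contract (hypothetical names). */
struct ip_block {
	int active;    /* HW considered running */
	void *dma_buf; /* memory the HW may access while active */
};

static void ip_resume(struct ip_block *ip)
{
	if (!ip->active) {
		ip->dma_buf = malloc(4096); /* (re-)arm HW state */
		ip->active = 1;
	}
}

static void ip_fini(struct ip_block *ip)
{
	/* Unload frees memory the HW could still target if left active. */
	free(ip->dma_buf);
	ip->dma_buf = NULL;
}

int main(void)
{
	struct ip_block ip = { 0 };

	ip_resume(&ip); /* normal bring-up */

	ip.active = 0;  /* a mode1 reset knocks the block down... */
	ip_resume(&ip); /* ...and an unbalanced resume during shutdown
			 * re-arms it, leaking the first dma_buf */

	ip_fini(&ip);   /* frees only the second buffer; the block was
			 * also still "active" while memory went away */
	return 0;
}

This is the pattern the revert removes: resuming IP blocks on the
shutdown path with no matching suspend, right before the driver frees
the memory those blocks may still be using.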
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -5267,7 +5267,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset, skip_hw_reset, vram_lost = false;
 	int r = 0;
-	bool gpu_reset_for_dev_remove = 0;
 
 	/* Try reset handler method first */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -5287,10 +5286,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
-	gpu_reset_for_dev_remove =
-		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
-			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
-
 	/*
 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
 	 * to allow proper links negotiation in FW (within 1 sec)
@@ -5333,18 +5328,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 		amdgpu_ras_intr_cleared();
 	}
 
-	/* Since the mode1 reset affects base ip blocks, the
-	 * phase1 ip blocks need to be resumed. Otherwise there
-	 * will be a BIOS signature error and the psp bootloader
-	 * can't load kdb on the next amdgpu install.
-	 */
-	if (gpu_reset_for_dev_remove) {
-		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
-			amdgpu_device_ip_resume_phase1(tmp_adev);
-
-		goto end;
-	}
-
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		if (need_full_reset) {
 			/* post card */
@@ -5581,11 +5564,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-	bool gpu_reset_for_dev_remove = false;
-
-	gpu_reset_for_dev_remove =
-		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
-			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
@@ -5623,7 +5601,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			list_add_tail(&tmp_adev->reset_list, &device_list);
-			if (gpu_reset_for_dev_remove && adev->shutdown)
+			if (adev->shutdown)
 				tmp_adev->shutdown = true;
 		}
 		if (!list_is_first(&adev->reset_list, &device_list))
@@ -5708,10 +5686,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-		if (gpu_reset_for_dev_remove) {
-			/* Workaroud for ASICs need to disable SMC first */
-			amdgpu_device_smu_fini_early(tmp_adev);
-		}
 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
 		/*TODO Should we stop ?*/
 		if (r) {
@@ -5743,9 +5717,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
 		if (r && r == -EAGAIN)
 			goto retry;
-
-		if (!r && gpu_reset_for_dev_remove)
-			goto recover_end;
 	}
 
 skip_hw_reset:
@@ -5801,7 +5772,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_ras_set_error_query_ready(tmp_adev, true);
 	}
 
-recover_end:
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 				    reset_list);
 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

@@ -2337,38 +2337,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
 		pm_runtime_forbid(dev->dev);
 	}
 
-	if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
-	    !amdgpu_sriov_vf(adev)) {
-		bool need_to_reset_gpu = false;
-
-		if (adev->gmc.xgmi.num_physical_nodes > 1) {
-			struct amdgpu_hive_info *hive;
-
-			hive = amdgpu_get_xgmi_hive(adev);
-			if (hive->device_remove_count == 0)
-				need_to_reset_gpu = true;
-			hive->device_remove_count++;
-			amdgpu_put_xgmi_hive(hive);
-		} else {
-			need_to_reset_gpu = true;
-		}
-
-		/* Workaround for ASICs need to reset SMU.
-		 * Called only when the first device is removed.
-		 */
-		if (need_to_reset_gpu) {
-			struct amdgpu_reset_context reset_context;
-
-			adev->shutdown = true;
-			memset(&reset_context, 0, sizeof(reset_context));
-			reset_context.method = AMD_RESET_METHOD_NONE;
-			reset_context.reset_req_dev = adev;
-			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-			set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
-			amdgpu_device_gpu_recover(adev, NULL, &reset_context);
-		}
-	}
-
 	amdgpu_driver_unload_kms(dev);
 
 	/*
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

@@ -32,7 +32,6 @@ enum AMDGPU_RESET_FLAGS {
 
 	AMDGPU_NEED_FULL_RESET = 0,
 	AMDGPU_SKIP_HW_RESET = 1,
-	AMDGPU_RESET_FOR_DEVICE_REMOVE = 2,
 };
 
 struct amdgpu_reset_context {
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

@@ -43,7 +43,6 @@ struct amdgpu_hive_info {
 	} pstate;
 
 	struct amdgpu_reset_domain *reset_domain;
-	uint32_t device_remove_count;
 	atomic_t ras_recovery;
 };
 
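For readers skimming the large removed hunk in amdgpu_pci_remove() above:
the reverted workaround only triggered the reset for the first device
removed from an XGMI hive (and always for a single GPU). A condensed
standalone model of that gate (hypothetical names; the real code went
through amdgpu_get_xgmi_hive() and device_remove_count, as the diff shows):

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the reverted "reset once per hive" gate. */
struct hive {
	unsigned int device_remove_count;
};

static bool need_reset_on_remove(struct hive *hive, int num_nodes)
{
	bool need_reset = false;

	if (num_nodes > 1) {
		/* Only the first device removed from the hive resets. */
		if (hive->device_remove_count == 0)
			need_reset = true;
		hive->device_remove_count++;
	} else {
		/* Single-GPU case: always reset on remove. */
		need_reset = true;
	}
	return need_reset;
}

int main(void)
{
	struct hive h = { 0 };

	bool a = need_reset_on_remove(&h, 4); /* first removal: reset */
	bool b = need_reset_on_remove(&h, 4); /* later removal: skip  */
	bool c = need_reset_on_remove(&h, 1); /* single GPU: reset    */

	printf("%d %d %d\n", a, b, c); /* prints: 1 0 1 */
	return 0;
}

With the revert applied, none of this runs on removal anymore; the
remaining amdgpu_driver_unload_kms() call in amdgpu_pci_remove() handles
teardown through the regular unload path.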