Commit 04442bf7 authored by Lijo Lazar, committed by Alex Deucher

drm/amdgpu: Add reset control handling to reset workflow

This prefers reset control based handling if it's implemented
for a particular ASIC. If not, it takes the legacy path. It uses
the legacy method of preparing environment (job, scheduler tasks)
and restoring environment.

v2: remove unused variable (Alex)
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e071dce3
...@@ -270,6 +270,7 @@ struct amdgpu_bo_va_mapping; ...@@ -270,6 +270,7 @@ struct amdgpu_bo_va_mapping;
struct amdgpu_atif; struct amdgpu_atif;
struct kfd_vm_fault_info; struct kfd_vm_fault_info;
struct amdgpu_hive_info; struct amdgpu_hive_info;
struct amdgpu_reset_context;
struct amdgpu_reset_control; struct amdgpu_reset_control;
enum amdgpu_cp_irq { enum amdgpu_cp_irq {
...@@ -1075,6 +1076,7 @@ struct amdgpu_device { ...@@ -1075,6 +1076,7 @@ struct amdgpu_device {
bool in_pci_err_recovery; bool in_pci_err_recovery;
struct pci_saved_state *pci_state; struct pci_saved_state *pci_state;
struct amdgpu_reset_control *reset_cntl; struct amdgpu_reset_control *reset_cntl;
}; };
...@@ -1127,13 +1129,10 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type); ...@@ -1127,13 +1129,10 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev); bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
struct amdgpu_job *job, struct amdgpu_reset_context *reset_context);
bool *need_full_reset_arg);
int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct list_head *device_list_handle, struct amdgpu_reset_context *reset_context);
bool *need_full_reset_arg,
bool skip_hw_reset);
int emu_soc_asic_init(struct amdgpu_device *adev); int emu_soc_asic_init(struct amdgpu_device *adev);
......
...@@ -65,6 +65,7 @@ ...@@ -65,6 +65,7 @@
#include "amdgpu_ras.h" #include "amdgpu_ras.h"
#include "amdgpu_pmu.h" #include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h" #include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include <linux/suspend.h> #include <linux/suspend.h>
#include <drm/task_barrier.h> #include <drm/task_barrier.h>
...@@ -3421,6 +3422,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, ...@@ -3421,6 +3422,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
goto fence_driver_init; goto fence_driver_init;
} }
amdgpu_reset_init(adev);
/* detect if we are with an SRIOV vbios */ /* detect if we are with an SRIOV vbios */
amdgpu_device_detect_sriov_bios(adev); amdgpu_device_detect_sriov_bios(adev);
...@@ -3671,6 +3674,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) ...@@ -3671,6 +3674,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
release_firmware(adev->firmware.gpu_info_fw); release_firmware(adev->firmware.gpu_info_fw);
adev->firmware.gpu_info_fw = NULL; adev->firmware.gpu_info_fw = NULL;
adev->accel_working = false; adev->accel_working = false;
amdgpu_reset_fini(adev);
/* free i2c buses */ /* free i2c buses */
if (!amdgpu_device_has_dc_support(adev)) if (!amdgpu_device_has_dc_support(adev))
amdgpu_i2c_fini(adev); amdgpu_i2c_fini(adev);
...@@ -4239,11 +4245,15 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) ...@@ -4239,11 +4245,15 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
} }
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
struct amdgpu_job *job, struct amdgpu_reset_context *reset_context)
bool *need_full_reset_arg)
{ {
int i, r = 0; int i, r = 0;
bool need_full_reset = *need_full_reset_arg; struct amdgpu_job *job = NULL;
bool need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
if (reset_context->reset_req_dev == adev)
job = reset_context->job;
/* no need to dump if device is not in good state during probe period */ /* no need to dump if device is not in good state during probe period */
if (!adev->gmc.xgmi.pending_reset) if (!adev->gmc.xgmi.pending_reset)
...@@ -4268,6 +4278,10 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, ...@@ -4268,6 +4278,10 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if(job) if(job)
drm_sched_increase_karma(&job->base); drm_sched_increase_karma(&job->base);
r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
if (r != -ENOSYS)
return r;
/* Don't suspend on bare metal if we are not going to HW reset the ASIC */ /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
if (!amdgpu_sriov_vf(adev)) { if (!amdgpu_sriov_vf(adev)) {
...@@ -4286,22 +4300,36 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, ...@@ -4286,22 +4300,36 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (need_full_reset) if (need_full_reset)
r = amdgpu_device_ip_suspend(adev); r = amdgpu_device_ip_suspend(adev);
if (need_full_reset)
*need_full_reset_arg = need_full_reset; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
else
clear_bit(AMDGPU_NEED_FULL_RESET,
&reset_context->flags);
} }
return r; return r;
} }
int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct list_head *device_list_handle, struct amdgpu_reset_context *reset_context)
bool *need_full_reset_arg,
bool skip_hw_reset)
{ {
struct amdgpu_device *tmp_adev = NULL; struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset = *need_full_reset_arg, vram_lost = false; bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0; int r = 0;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
if (r != -ENOSYS)
return r;
/* Reset handler not implemented, use the default method */
need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
/* /*
* ASIC reset has to be done on all XGMI hive nodes ASAP * ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec) * to allow proper links negotiation in FW (within 1 sec)
...@@ -4385,7 +4413,8 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, ...@@ -4385,7 +4413,8 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
*/ */
amdgpu_register_gpu_instance(tmp_adev); amdgpu_register_gpu_instance(tmp_adev);
if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) if (!reset_context->hive &&
tmp_adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(tmp_adev); amdgpu_xgmi_add_device(tmp_adev);
r = amdgpu_device_ip_late_init(tmp_adev); r = amdgpu_device_ip_late_init(tmp_adev);
...@@ -4413,8 +4442,10 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, ...@@ -4413,8 +4442,10 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
} }
/* Update PSP FW topology after reset */ /* Update PSP FW topology after reset */
if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) if (reset_context->hive &&
r = amdgpu_xgmi_update_topology(hive, tmp_adev); tmp_adev->gmc.xgmi.num_physical_nodes > 1)
r = amdgpu_xgmi_update_topology(
reset_context->hive, tmp_adev);
} }
} }
...@@ -4438,7 +4469,10 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, ...@@ -4438,7 +4469,10 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
} }
end: end:
*need_full_reset_arg = need_full_reset; if (need_full_reset)
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
else
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
return r; return r;
} }
...@@ -4575,10 +4609,9 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) ...@@ -4575,10 +4609,9 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
return 0; return 0;
} }
void amdgpu_device_recheck_guilty_jobs(struct amdgpu_device *adev, void amdgpu_device_recheck_guilty_jobs(
struct amdgpu_hive_info *hive, struct amdgpu_device *adev, struct list_head *device_list_handle,
struct list_head *device_list_handle, struct amdgpu_reset_context *reset_context)
bool *need_full_reset)
{ {
int i, r = 0; int i, r = 0;
...@@ -4614,8 +4647,10 @@ void amdgpu_device_recheck_guilty_jobs(struct amdgpu_device *adev, ...@@ -4614,8 +4647,10 @@ void amdgpu_device_recheck_guilty_jobs(struct amdgpu_device *adev,
if (r) if (r)
adev->asic_reset_res = r; adev->asic_reset_res = r;
} else { } else {
r = amdgpu_do_asic_reset(hive, device_list_handle, clear_bit(AMDGPU_SKIP_HW_RESET,
need_full_reset, false); &reset_context->flags);
r = amdgpu_do_asic_reset(device_list_handle,
reset_context);
if (r && r == -EAGAIN) if (r && r == -EAGAIN)
goto retry; goto retry;
} }
...@@ -4657,7 +4692,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4657,7 +4692,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_job *job) struct amdgpu_job *job)
{ {
struct list_head device_list, *device_list_handle = NULL; struct list_head device_list, *device_list_handle = NULL;
bool need_full_reset = false;
bool job_signaled = false; bool job_signaled = false;
struct amdgpu_hive_info *hive = NULL; struct amdgpu_hive_info *hive = NULL;
struct amdgpu_device *tmp_adev = NULL; struct amdgpu_device *tmp_adev = NULL;
...@@ -4665,6 +4699,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4665,6 +4699,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
bool need_emergency_restart = false; bool need_emergency_restart = false;
bool audio_suspended = false; bool audio_suspended = false;
int tmp_vram_lost_counter; int tmp_vram_lost_counter;
struct amdgpu_reset_context reset_context;
memset(&reset_context, 0, sizeof(reset_context));
/* /*
* Special case: RAS triggered and full reset isn't supported * Special case: RAS triggered and full reset isn't supported
...@@ -4705,6 +4742,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4705,6 +4742,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
mutex_lock(&hive->hive_lock); mutex_lock(&hive->hive_lock);
} }
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
reset_context.job = job;
reset_context.hive = hive;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
/* /*
* lock the device before we try to operate the linked list * lock the device before we try to operate the linked list
* if didn't get the device lock, don't touch the linked list since * if didn't get the device lock, don't touch the linked list since
...@@ -4805,9 +4848,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4805,9 +4848,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
retry: /* Rest of adevs pre asic reset from XGMI hive. */ retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
r = amdgpu_device_pre_asic_reset(tmp_adev, r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
(tmp_adev == adev) ? job : NULL,
&need_full_reset);
/*TODO Should we stop ?*/ /*TODO Should we stop ?*/
if (r) { if (r) {
dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
...@@ -4824,7 +4865,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4824,7 +4865,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (r) if (r)
adev->asic_reset_res = r; adev->asic_reset_res = r;
} else { } else {
r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
if (r && r == -EAGAIN) if (r && r == -EAGAIN)
goto retry; goto retry;
} }
...@@ -4843,8 +4884,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4843,8 +4884,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*/ */
if (amdgpu_gpu_recovery == 2 && if (amdgpu_gpu_recovery == 2 &&
!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
amdgpu_device_recheck_guilty_jobs(tmp_adev, hive, amdgpu_device_recheck_guilty_jobs(
device_list_handle, &need_full_reset); tmp_adev, device_list_handle, &reset_context);
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i]; struct amdgpu_ring *ring = tmp_adev->rings[i];
...@@ -5189,12 +5230,14 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) ...@@ -5189,12 +5230,14 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
struct drm_device *dev = pci_get_drvdata(pdev); struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_device *adev = drm_to_adev(dev);
int r, i; int r, i;
bool need_full_reset = true; struct amdgpu_reset_context reset_context;
u32 memsize; u32 memsize;
struct list_head device_list; struct list_head device_list;
DRM_INFO("PCI error: slot reset callback!!\n"); DRM_INFO("PCI error: slot reset callback!!\n");
memset(&reset_context, 0, sizeof(reset_context));
INIT_LIST_HEAD(&device_list); INIT_LIST_HEAD(&device_list);
list_add_tail(&adev->reset_list, &device_list); list_add_tail(&adev->reset_list, &device_list);
...@@ -5217,13 +5260,18 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) ...@@ -5217,13 +5260,18 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
goto out; goto out;
} }
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
adev->in_pci_err_recovery = true; adev->in_pci_err_recovery = true;
r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); r = amdgpu_device_pre_asic_reset(adev, &reset_context);
adev->in_pci_err_recovery = false; adev->in_pci_err_recovery = false;
if (r) if (r)
goto out; goto out;
r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); r = amdgpu_do_asic_reset(&device_list, &reset_context);
out: out:
if (!r) { if (!r) {
......
...@@ -47,6 +47,7 @@ ...@@ -47,6 +47,7 @@
#include "amdgpu_ras.h" #include "amdgpu_ras.h"
#include "amdgpu_xgmi.h" #include "amdgpu_xgmi.h"
#include "amdgpu_reset.h"
/* /*
* KMS wrapper. * KMS wrapper.
...@@ -1349,7 +1350,9 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) ...@@ -1349,7 +1350,9 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
struct list_head device_list; struct list_head device_list;
struct amdgpu_device *adev; struct amdgpu_device *adev;
int i, r; int i, r;
bool need_full_reset = true; struct amdgpu_reset_context reset_context;
memset(&reset_context, 0, sizeof(reset_context));
mutex_lock(&mgpu_info.mutex); mutex_lock(&mgpu_info.mutex);
if (mgpu_info.pending_reset == true) { if (mgpu_info.pending_reset == true) {
...@@ -1359,9 +1362,14 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) ...@@ -1359,9 +1362,14 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
mgpu_info.pending_reset = true; mgpu_info.pending_reset = true;
mutex_unlock(&mgpu_info.mutex); mutex_unlock(&mgpu_info.mutex);
/* Use a common context, just need to make sure full reset is done */
reset_context.method = AMD_RESET_METHOD_NONE;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
for (i = 0; i < mgpu_info.num_dgpu; i++) { for (i = 0; i < mgpu_info.num_dgpu; i++) {
adev = mgpu_info.gpu_ins[i].adev; adev = mgpu_info.gpu_ins[i].adev;
r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); reset_context.reset_req_dev = adev;
r = amdgpu_device_pre_asic_reset(adev, &reset_context);
if (r) { if (r) {
dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
r, adev_to_drm(adev)->unique); r, adev_to_drm(adev)->unique);
...@@ -1388,7 +1396,10 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) ...@@ -1388,7 +1396,10 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
list_for_each_entry(adev, &device_list, reset_list) list_for_each_entry(adev, &device_list, reset_list)
amdgpu_unregister_gpu_instance(adev); amdgpu_unregister_gpu_instance(adev);
r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); /* Use a common context, just need to make sure full reset is done */
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
r = amdgpu_do_asic_reset(&device_list, &reset_context);
if (r) { if (r) {
DRM_ERROR("reinit gpus failure"); DRM_ERROR("reinit gpus failure");
return; return;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment