Commit c2a77fde authored by Felix Kuehling, committed by Alex Deucher

drm/amdkfd: Avoid hanging hardware in stop_cpsch

Don't use the HWS if it's known to be hanging. During a reset, also
don't try to destroy the HIQ, because that may hang on SRIOV if the
KIQ is unresponsive.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Tested-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: shaoyunl <shaoyun.liu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 09c34e8d
...@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm) ...@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
static int stop_nocpsch(struct device_queue_manager *dqm) static int stop_nocpsch(struct device_queue_manager *dqm)
{ {
if (dqm->dev->device_info->asic_family == CHIP_HAWAII) if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
pm_uninit(&dqm->packets); pm_uninit(&dqm->packets, false);
dqm->sched_running = false; dqm->sched_running = false;
return 0; return 0;
...@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm) ...@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
return 0; return 0;
fail_allocate_vidmem: fail_allocate_vidmem:
fail_set_sched_resources: fail_set_sched_resources:
pm_uninit(&dqm->packets); pm_uninit(&dqm->packets, false);
fail_packet_manager_init: fail_packet_manager_init:
return retval; return retval;
} }
static int stop_cpsch(struct device_queue_manager *dqm) static int stop_cpsch(struct device_queue_manager *dqm)
{ {
bool hanging;
dqm_lock(dqm); dqm_lock(dqm);
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); if (!dqm->is_hws_hang)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
hanging = dqm->is_hws_hang || dqm->is_resetting;
dqm->sched_running = false; dqm->sched_running = false;
dqm_unlock(dqm); dqm_unlock(dqm);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
pm_uninit(&dqm->packets); pm_uninit(&dqm->packets, hanging);
return 0; return 0;
} }
......
...@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev, ...@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
} }
/* Uninitialize a kernel queue and free all its memory usages. */ /* Uninitialize a kernel queue and free all its memory usages. */
static void kq_uninitialize(struct kernel_queue *kq) static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
{ {
if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
kq->mqd_mgr->destroy_mqd(kq->mqd_mgr, kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
kq->queue->mqd, kq->queue->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
...@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, ...@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
return NULL; return NULL;
} }
void kernel_queue_uninit(struct kernel_queue *kq) void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
{ {
kq_uninitialize(kq); kq_uninitialize(kq, hanging);
kfree(kq); kfree(kq);
} }
......
...@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) ...@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
return 0; return 0;
} }
void pm_uninit(struct packet_manager *pm) void pm_uninit(struct packet_manager *pm, bool hanging)
{ {
mutex_destroy(&pm->lock); mutex_destroy(&pm->lock);
kernel_queue_uninit(pm->priv_queue); kernel_queue_uninit(pm->priv_queue, hanging);
} }
int pm_send_set_resources(struct packet_manager *pm, int pm_send_set_resources(struct packet_manager *pm,
......
...@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); ...@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
void device_queue_manager_uninit(struct device_queue_manager *dqm); void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type); enum kfd_queue_type type);
void kernel_queue_uninit(struct kernel_queue *kq); void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
/* Process Queue Manager */ /* Process Queue Manager */
...@@ -972,7 +972,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs; ...@@ -972,7 +972,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
extern const struct packet_manager_funcs kfd_v9_pm_funcs; extern const struct packet_manager_funcs kfd_v9_pm_funcs;
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
void pm_uninit(struct packet_manager *pm); void pm_uninit(struct packet_manager *pm, bool hanging);
int pm_send_set_resources(struct packet_manager *pm, int pm_send_set_resources(struct packet_manager *pm,
struct scheduling_resources *res); struct scheduling_resources *res);
int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
......
...@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) ...@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
/* destroy kernel queue (DIQ) */ /* destroy kernel queue (DIQ) */
dqm = pqn->kq->dev->dqm; dqm = pqn->kq->dev->dqm;
dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
kernel_queue_uninit(pqn->kq); kernel_queue_uninit(pqn->kq, false);
} }
if (pqn->q) { if (pqn->q) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment