Commit 2c99a547 authored by Philip Yang's avatar Philip Yang Committed by Alex Deucher

drm/amdkfd: don't use dqm lock during device reset/suspend/resume

If device reset/suspend/resume failed for some reason, dqm lock is
hold forever and this causes deadlock. Below is a kernel backtrace when
application open kfd after suspend/resume failed.

Instead of holding dqm lock in pre_reset and releasing dqm lock in
post_reset, add dqm->sched_running flag which is modified in
dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
because write/read are all inside dqm lock.

For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
sched_running flag before sending the updated runlist.

v2: For no-HWS case, when device is stopped, don't call
load/destroy_mqd for eviction, restore and create queue, and avoid
debugfs dump hdqs.

Backtrace of dqm lock deadlock:

[Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
than 120 seconds.
[Thu Oct 17 16:43:37 2019]       Not tainted
5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
[Thu Oct 17 16:43:37 2019] "echo 0 >
/proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
0x80000000
[Thu Oct 17 16:43:37 2019] Call Trace:
[Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
[Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
[Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
[Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
[Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
[Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]
kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
[Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
[amdgpu]
[Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
[Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
[Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
[Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
[Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
[Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
[Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
[Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
[Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
[Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
[Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
Suggested-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: default avatarPhilip Yang <Philip.Yang@amd.com>
Reviewed-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 80df905d
...@@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) ...@@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
return 0; return 0;
kgd2kfd_suspend(kfd); kgd2kfd_suspend(kfd);
/* hold dqm->lock to prevent further execution*/
dqm_lock(kfd->dqm);
kfd_signal_reset_event(kfd); kfd_signal_reset_event(kfd);
return 0; return 0;
} }
...@@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) ...@@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
if (!kfd->init_complete) if (!kfd->init_complete)
return 0; return 0;
dqm_unlock(kfd->dqm);
ret = kfd_resume(kfd); ret = kfd_resume(kfd);
if (ret) if (ret)
return ret; return ret;
......
...@@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, ...@@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties); &q->gart_mqd_addr, &q->properties);
if (q->properties.is_active) { if (q->properties.is_active) {
if (!dqm->sched_running) {
WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
goto add_queue_to_list;
}
if (WARN(q->process->mm != current->mm, if (WARN(q->process->mm != current->mm,
"should only run in user thread")) "should only run in user thread"))
...@@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, ...@@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
goto out_free_mqd; goto out_free_mqd;
} }
add_queue_to_list:
list_add(&q->list, &qpd->queues_list); list_add(&q->list, &qpd->queues_list);
qpd->queue_count++; qpd->queue_count++;
if (q->properties.is_active) if (q->properties.is_active)
...@@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, ...@@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
deallocate_doorbell(qpd, q); deallocate_doorbell(qpd, q);
if (!dqm->sched_running) {
WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
return 0;
}
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
KFD_UNMAP_LATENCY_MS, KFD_UNMAP_LATENCY_MS,
...@@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) ...@@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE || (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
q->properties.type == KFD_QUEUE_TYPE_SDMA || q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
if (!dqm->sched_running) {
WARN_ONCE(1, "Update non-HWS queue while stopped\n");
goto out_unlock;
}
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
...@@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, ...@@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)]; q->properties.type)];
q->properties.is_active = false; q->properties.is_active = false;
dqm->queue_count--;
if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
continue;
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
...@@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, ...@@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
* maintain a consistent eviction state * maintain a consistent eviction state
*/ */
ret = retval; ret = retval;
dqm->queue_count--;
} }
out: out:
...@@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, ...@@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)]; q->properties.type)];
q->properties.is_active = true; q->properties.is_active = true;
dqm->queue_count++;
if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
continue;
retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
q->queue, &q->properties, mm); q->queue, &q->properties, mm);
if (retval && !ret) if (retval && !ret)
...@@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, ...@@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
* maintain a consistent eviction state * maintain a consistent eviction state
*/ */
ret = retval; ret = retval;
dqm->queue_count++;
} }
qpd->evicted = 0; qpd->evicted = 0;
out: out:
...@@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm) ...@@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
if (dqm->dev->device_info->asic_family == CHIP_HAWAII) if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
return pm_init(&dqm->packets, dqm); return pm_init(&dqm->packets, dqm);
dqm->sched_running = true;
return 0; return 0;
} }
...@@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm) ...@@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
{ {
if (dqm->dev->device_info->asic_family == CHIP_HAWAII) if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
pm_uninit(&dqm->packets); pm_uninit(&dqm->packets);
dqm->sched_running = false;
return 0; return 0;
} }
...@@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm) ...@@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
dqm_lock(dqm); dqm_lock(dqm);
/* clear hang status when driver try to start the hw scheduler */ /* clear hang status when driver try to start the hw scheduler */
dqm->is_hws_hang = false; dqm->is_hws_hang = false;
dqm->sched_running = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm); dqm_unlock(dqm);
...@@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) ...@@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
{ {
dqm_lock(dqm); dqm_lock(dqm);
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
dqm->sched_running = false;
dqm_unlock(dqm); dqm_unlock(dqm);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
...@@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) ...@@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
{ {
int retval; int retval;
if (!dqm->sched_running)
return 0;
if (dqm->queue_count <= 0 || dqm->processes_count <= 0) if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
return 0; return 0;
if (dqm->active_runlist) if (dqm->active_runlist)
return 0; return 0;
...@@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, ...@@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
{ {
int retval = 0; int retval = 0;
if (!dqm->sched_running)
return 0;
if (dqm->is_hws_hang) if (dqm->is_hws_hang)
return -EIO; return -EIO;
if (!dqm->active_runlist) if (!dqm->active_runlist)
...@@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) ...@@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
int pipe, queue; int pipe, queue;
int r = 0; int r = 0;
if (!dqm->sched_running) {
seq_printf(m, " Device is stopped\n");
return 0;
}
r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd, r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
&dump, &n_regs); &dump, &n_regs);
......
...@@ -201,6 +201,7 @@ struct device_queue_manager { ...@@ -201,6 +201,7 @@ struct device_queue_manager {
bool is_hws_hang; bool is_hws_hang;
struct work_struct hw_exception_work; struct work_struct hw_exception_work;
struct kfd_mem_obj hiq_sdma_mqd; struct kfd_mem_obj hiq_sdma_mqd;
bool sched_running;
}; };
void device_queue_manager_init_cik( void device_queue_manager_init_cik(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment