Commit dfb15c4a authored by Zhigang Luo, committed by Alex Deucher

amd/amdkfd: sync all devices to wait for all processes to be evicted

If more than one device is being reset in parallel, the first device
calls kfd_suspend_all_processes() to evict all processes on all
devices, and this call takes time to finish. The other devices will
start reset and recovery without waiting. If a process has not been
evicted before recovery runs, it will be restored, which then causes
a page fault.

Fix this by holding kfd_processes_mutex across the suspend and resume
of all processes, so that the other devices block until eviction has
completed.
Signed-off-by: Zhigang Luo <Zhigang.Luo@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 24c30a7b
...@@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) ...@@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
{ {
struct kfd_node *node; struct kfd_node *node;
int i; int i;
int count;
if (!kfd->init_complete) if (!kfd->init_complete)
return; return;
...@@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) ...@@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
/* for runtime suspend, skip locking kfd */ /* for runtime suspend, skip locking kfd */
if (!run_pm) { if (!run_pm) {
mutex_lock(&kfd_processes_mutex); mutex_lock(&kfd_processes_mutex);
count = ++kfd_locked;
mutex_unlock(&kfd_processes_mutex);
/* For first KFD device suspend all the KFD processes */ /* For first KFD device suspend all the KFD processes */
if (count == 1) if (++kfd_locked == 1)
kfd_suspend_all_processes(); kfd_suspend_all_processes();
mutex_unlock(&kfd_processes_mutex);
} }
for (i = 0; i < kfd->num_nodes; i++) { for (i = 0; i < kfd->num_nodes; i++) {
...@@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) ...@@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
{ {
int ret, count, i; int ret, i;
if (!kfd->init_complete) if (!kfd->init_complete)
return 0; return 0;
...@@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) ...@@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
/* for runtime resume, skip unlocking kfd */ /* for runtime resume, skip unlocking kfd */
if (!run_pm) { if (!run_pm) {
mutex_lock(&kfd_processes_mutex); mutex_lock(&kfd_processes_mutex);
count = --kfd_locked; if (--kfd_locked == 0)
mutex_unlock(&kfd_processes_mutex);
WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
if (count == 0)
ret = kfd_resume_all_processes(); ret = kfd_resume_all_processes();
WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
mutex_unlock(&kfd_processes_mutex);
} }
return ret; return ret;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment