Commit a44fe9ee authored by Felix Kuehling, committed by Alex Deucher

drm/amdkfd: Fix retry fault drain race conditions

The check for whether to drain retry faults must be under the mmap write
lock to serialize with munmap notifier callbacks.

We were also missing checks on child ranges. To fix that, simplify the
logic by using a flag rather than checking on each prange. That also
allows draining less frequently when many ranges are unmapped at once.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Tested-by: Philip Yang <Philip.Yang@amd.com>
Tested-by: Alex Sierra <Alex.Sierra@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 3aac6aa6
...@@ -766,6 +766,7 @@ struct svm_range_list { ...@@ -766,6 +766,7 @@ struct svm_range_list {
struct list_head deferred_range_list; struct list_head deferred_range_list;
spinlock_t deferred_list_lock; spinlock_t deferred_list_lock;
atomic_t evicted_ranges; atomic_t evicted_ranges;
bool drain_pagefaults;
struct delayed_work restore_work; struct delayed_work restore_work;
DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE); DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
struct task_struct *faulting_task; struct task_struct *faulting_task;
......
...@@ -2002,20 +2002,28 @@ static void svm_range_deferred_list_work(struct work_struct *work) ...@@ -2002,20 +2002,28 @@ static void svm_range_deferred_list_work(struct work_struct *work)
pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
prange->start, prange->last, prange->work_item.op); prange->start, prange->last, prange->work_item.op);
/* Make sure no stale retry fault coming after range is freed */
if (prange->work_item.op == SVM_OP_UNMAP_RANGE)
svm_range_drain_retry_fault(prange->svms);
mm = prange->work_item.mm; mm = prange->work_item.mm;
retry:
mmap_write_lock(mm); mmap_write_lock(mm);
mutex_lock(&svms->lock); mutex_lock(&svms->lock);
/* Remove from deferred_list must be inside mmap write lock, /* Checking for the need to drain retry faults must be in
* mmap write lock to serialize with munmap notifiers.
*
* Remove from deferred_list must be inside mmap write lock,
* otherwise, svm_range_list_lock_and_flush_work may hold mmap * otherwise, svm_range_list_lock_and_flush_work may hold mmap
* write lock, and continue because deferred_list is empty, then * write lock, and continue because deferred_list is empty, then
* deferred_list handle is blocked by mmap write lock. * deferred_list handle is blocked by mmap write lock.
*/ */
spin_lock(&svms->deferred_list_lock); spin_lock(&svms->deferred_list_lock);
if (unlikely(svms->drain_pagefaults)) {
svms->drain_pagefaults = false;
spin_unlock(&svms->deferred_list_lock);
mutex_unlock(&svms->lock);
mmap_write_unlock(mm);
svm_range_drain_retry_fault(svms);
goto retry;
}
list_del_init(&prange->deferred_list); list_del_init(&prange->deferred_list);
spin_unlock(&svms->deferred_list_lock); spin_unlock(&svms->deferred_list_lock);
...@@ -2048,6 +2056,12 @@ svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, ...@@ -2048,6 +2056,12 @@ svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
struct mm_struct *mm, enum svm_work_list_ops op) struct mm_struct *mm, enum svm_work_list_ops op)
{ {
spin_lock(&svms->deferred_list_lock); spin_lock(&svms->deferred_list_lock);
/* Make sure pending page faults are drained in the deferred worker
* before the range is freed to avoid straggler interrupts on
* unmapped memory causing "phantom faults".
*/
if (op == SVM_OP_UNMAP_RANGE)
svms->drain_pagefaults = true;
/* if prange is on the deferred list */ /* if prange is on the deferred list */
if (!list_empty(&prange->deferred_list)) { if (!list_empty(&prange->deferred_list)) {
pr_debug("update exist prange 0x%p work op %d\n", prange, op); pr_debug("update exist prange 0x%p work op %d\n", prange, op);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment