Commit 9095e554 authored by Philip Yang's avatar Philip Yang Committed by Alex Deucher

drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

On systems with khugepaged enabled and use cases with THP buffers,
hmm_range_fault may take > 15 seconds to return -EBUSY; the arbitrary
timeout value is not accurate, causing memory allocation failure.

Remove the arbitrary timeout value and return EAGAIN to the application if
hmm_range_fault returns EBUSY; userspace libdrm and Thunk will then call
the ioctl again.

Change the EAGAIN report to a debug message, as this is not an error.
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 10f624ef
...@@ -1088,7 +1088,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, ...@@ -1088,7 +1088,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range); ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
if (ret) { if (ret) {
pr_err("%s: Failed to get user pages: %d\n", __func__, ret); if (ret == -EAGAIN)
pr_debug("Failed to get user pages, try again\n");
else
pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
goto unregister_out; goto unregister_out;
} }
......
...@@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, ...@@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
pr_debug("hmm range: start = 0x%lx, end = 0x%lx", pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
hmm_range->start, hmm_range->end); hmm_range->start, hmm_range->end);
/* Assuming 64MB takes maximum 1 second to fault page address */ timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
timeout = jiffies + msecs_to_jiffies(timeout);
retry: retry:
hmm_range->notifier_seq = mmu_interval_read_begin(notifier); hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
r = hmm_range_fault(hmm_range); r = hmm_range_fault(hmm_range);
if (unlikely(r)) { if (unlikely(r)) {
schedule();
/*
* FIXME: This timeout should encompass the retry from
* mmu_interval_read_retry() as well.
*/
if (r == -EBUSY && !time_after(jiffies, timeout)) if (r == -EBUSY && !time_after(jiffies, timeout))
goto retry; goto retry;
goto out_free_pfns; goto out_free_pfns;
...@@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, ...@@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
out_free_range: out_free_range:
kfree(hmm_range); kfree(hmm_range);
if (r == -EBUSY)
r = -EAGAIN;
return r; return r;
} }
......
...@@ -1690,11 +1690,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm, ...@@ -1690,11 +1690,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
readonly, owner, NULL, readonly, owner, NULL,
&hmm_range); &hmm_range);
WRITE_ONCE(p->svms.faulting_task, NULL); WRITE_ONCE(p->svms.faulting_task, NULL);
if (r) { if (r)
pr_debug("failed %d to get svm range pages\n", r); pr_debug("failed %d to get svm range pages\n", r);
if (r == -EBUSY)
r = -EAGAIN;
}
} else { } else {
r = -EFAULT; r = -EFAULT;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment