Commit aa32f116 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull hmm updates from Jason Gunthorpe:
 "This is another round of bug fixing and cleanup. This time the focus
  is on the driver pattern to use mmu notifiers to monitor a VA range.
  This code is lifted out of many drivers and hmm_mirror directly into
  the mmu_notifier core and written using the best ideas from all the
  driver implementations.

  This removes many bugs from the drivers and has a very pleasing
  diffstat. More drivers can still be converted, but that is for another
  cycle.

   - A shared branch with RDMA reworking the RDMA ODP implementation

   - New mmu_interval_notifier API. This is focused on the use case of
     monitoring a VA and simplifies the process for drivers

   - A common seq-count locking scheme built into the
     mmu_interval_notifier API usable by drivers that call
     get_user_pages() or hmm_range_fault() with the VA range

   - Conversion of mlx5 ODP, hfi1, radeon, nouveau, AMD GPU, and Xen
     GntDev drivers to the new API. This deletes a lot of wonky driver
     code.

   - Two improvements for hmm_range_fault(), from testing done by Ralph"

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  mm/hmm: remove hmm_range_dma_map and hmm_range_dma_unmap
  mm/hmm: make full use of walk_page_range()
  xen/gntdev: use mmu_interval_notifier_insert
  mm/hmm: remove hmm_mirror and related
  drm/amdgpu: Use mmu_interval_notifier instead of hmm_mirror
  drm/amdgpu: Use mmu_interval_insert instead of hmm_mirror
  drm/amdgpu: Call find_vma under mmap_sem
  nouveau: use mmu_interval_notifier instead of hmm_mirror
  nouveau: use mmu_notifier directly for invalidate_range_start
  drm/radeon: use mmu_interval_notifier_insert
  RDMA/hfi1: Use mmu_interval_notifier_insert for user_exp_rcv
  RDMA/odp: Use mmu_interval_notifier_insert()
  mm/hmm: define the pre-processor related parts of hmm.h even if disabled
  mm/hmm: allow hmm_range to be used with a mmu_interval_notifier or hmm_mirror
  mm/mmu_notifier: add an interval tree notifier
  mm/mmu_notifier: define the header pre-processor parts even if disabled
  mm/hmm: allow snapshot of the special zero page
parents d5bb349d 93f4e735
......@@ -147,49 +147,16 @@ Address space mirroring implementation and API
Address space mirroring's main objective is to allow duplication of a range of
CPU page table into a device page table; HMM helps keep both synchronized. A
device driver that wants to mirror a process address space must start with the
registration of an hmm_mirror struct::
int hmm_mirror_register(struct hmm_mirror *mirror,
struct mm_struct *mm);
The mirror struct has a set of callbacks that are used
to propagate CPU page tables::
struct hmm_mirror_ops {
/* release() - release hmm_mirror
*
* @mirror: pointer to struct hmm_mirror
*
* This is called when the mm_struct is being released. The callback
* must ensure that all access to any pages obtained from this mirror
* is halted before the callback returns. All future access should
* fault.
*/
void (*release)(struct hmm_mirror *mirror);
/* sync_cpu_device_pagetables() - synchronize page tables
*
* @mirror: pointer to struct hmm_mirror
* @update: update information (see struct mmu_notifier_range)
* Return: -EAGAIN if update.blockable false and callback need to
* block, 0 otherwise.
*
* This callback ultimately originates from mmu_notifiers when the CPU
* page table is updated. The device driver must update its page table
* in response to this callback. The update argument tells what action
* to perform.
*
* The device driver must not return from this callback until the device
* page tables are completely updated (TLBs flushed, etc); this is a
* synchronous call.
*/
int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
const struct hmm_update *update);
};
The device driver must perform the update action to the range (mark range
read only, or fully unmap, etc.). The device must complete the update before
the driver callback returns.
registration of a mmu_interval_notifier::
mni->ops = &driver_ops;
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
unsigned long start, unsigned long length,
struct mm_struct *mm);
During the driver_ops->invalidate() callback the device driver must perform
the update action to the range (mark range read only, or fully unmap,
etc.). The device must complete the update before the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
use::
......@@ -216,70 +183,46 @@ The usage pattern is::
struct hmm_range range;
...
range.notifier = &mni;
range.start = ...;
range.end = ...;
range.pfns = ...;
range.flags = ...;
range.values = ...;
range.pfn_shift = ...;
hmm_range_register(&range, mirror);
/*
* Just wait for range to be valid, safe to ignore return value as we
* will use the return value of hmm_range_fault() below under the
* mmap_sem to ascertain the validity of the range.
*/
hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
if (!mmget_not_zero(mni->notifier.mm))
return -EFAULT;
again:
range.notifier_seq = mmu_interval_read_begin(&mni);
down_read(&mm->mmap_sem);
ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
if (ret) {
up_read(&mm->mmap_sem);
if (ret == -EBUSY) {
/*
* No need to check hmm_range_wait_until_valid() return value
* on retry we will get proper error with hmm_range_fault()
*/
hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
goto again;
}
hmm_range_unregister(&range);
if (ret == -EBUSY)
goto again;
return ret;
}
up_read(&mm->mmap_sem);
take_lock(driver->update);
if (!hmm_range_valid(&range)) {
if (mmu_interval_read_retry(&ni, range.notifier_seq) {
release_lock(driver->update);
up_read(&mm->mmap_sem);
goto again;
}
// Use pfns array content to update device page table
/* Use pfns array content to update device page table,
* under the update lock */
hmm_range_unregister(&range);
release_lock(driver->update);
up_read(&mm->mmap_sem);
return 0;
}
The driver->update lock is the same lock that the driver takes inside its
sync_cpu_device_pagetables() callback. That lock must be held before calling
hmm_range_valid() to avoid any race with a concurrent CPU page table update.
HMM implements all this on top of the mmu_notifier API because we wanted a
simpler API and also to be able to perform optimizations latter on like doing
concurrent device updates in multi-devices scenario.
HMM also serves as an impedance mismatch between how CPU page table updates
are done (by CPU write to the page table and TLB flushes) and how devices
update their own page table. Device updates are a multi-step process. First,
appropriate commands are written to a buffer, then this buffer is scheduled for
execution on the device. It is only once the device has executed commands in
the buffer that the update is done. Creating and scheduling the update command
buffer can happen concurrently for multiple devices. Waiting for each device to
report commands as executed is serialized (there is no point in doing this
concurrently).
invalidate() callback. That lock must be held before calling
mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
update.
Leverage default_flags and pfn_flags_mask
=========================================
......
......@@ -967,6 +967,8 @@ struct amdgpu_device {
struct mutex lock_reset;
struct amdgpu_doorbell_index doorbell_index;
struct mutex notifier_lock;
int asic_reset_res;
struct work_struct xgmi_reset_work;
......
......@@ -505,8 +505,7 @@ static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
*
* Returns 0 for success, negative errno for errors.
*/
static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
uint64_t user_addr)
static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
{
struct amdkfd_process_info *process_info = mem->process_info;
struct amdgpu_bo *bo = mem->bo;
......@@ -1199,7 +1198,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);
if (user_addr) {
ret = init_user_pages(*mem, current->mm, user_addr);
ret = init_user_pages(*mem, user_addr);
if (ret)
goto allocate_init_user_pages_failed;
}
......@@ -1744,6 +1743,10 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
return ret;
}
/*
* FIXME: Cannot ignore the return code, must hold
* notifier_lock
*/
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
/* Mark the BO as valid unless it was invalidated
......
......@@ -538,8 +538,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
e->tv.num_shared = 2;
amdgpu_bo_list_get_list(p->bo_list, &p->validated);
if (p->bo_list->first_userptr != p->bo_list->num_entries)
p->mn = amdgpu_mn_get(p->adev, AMDGPU_MN_TYPE_GFX);
INIT_LIST_HEAD(&duplicates);
amdgpu_vm_get_pd_bo(&fpriv->vm, &p->validated, &p->vm_pd);
......@@ -1219,11 +1217,11 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
if (r)
goto error_unlock;
/* No memory allocation is allowed while holding the mn lock.
* p->mn is hold until amdgpu_cs_submit is finished and fence is added
* to BOs.
/* No memory allocation is allowed while holding the notifier lock.
* The lock is held until amdgpu_cs_submit is finished and fence is
* added to BOs.
*/
amdgpu_mn_lock(p->mn);
mutex_lock(&p->adev->notifier_lock);
/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
* -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
......@@ -1266,13 +1264,13 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
amdgpu_mn_unlock(p->mn);
mutex_unlock(&p->adev->notifier_lock);
return 0;
error_abort:
drm_sched_job_cleanup(&job->base);
amdgpu_mn_unlock(p->mn);
mutex_unlock(&p->adev->notifier_lock);
error_unlock:
amdgpu_job_free(job);
......
......@@ -2794,6 +2794,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
mutex_init(&adev->lock_reset);
mutex_init(&adev->notifier_lock);
mutex_init(&adev->virt.dpm_mutex);
mutex_init(&adev->psp.mutex);
......
This diff is collapsed.
......@@ -30,63 +30,10 @@
#include <linux/workqueue.h>
#include <linux/interval_tree.h>
enum amdgpu_mn_type {
AMDGPU_MN_TYPE_GFX,
AMDGPU_MN_TYPE_HSA,
};
/**
* struct amdgpu_mn
*
* @adev: amdgpu device pointer
* @mm: process address space
* @type: type of MMU notifier
* @work: destruction work item
* @node: hash table node to find structure by adev and mn
* @lock: rw semaphore protecting the notifier nodes
* @objects: interval tree containing amdgpu_mn_nodes
* @mirror: HMM mirror function support
*
* Data for each amdgpu device and process address space.
*/
struct amdgpu_mn {
/* constant after initialisation */
struct amdgpu_device *adev;
struct mm_struct *mm;
enum amdgpu_mn_type type;
/* only used on destruction */
struct work_struct work;
/* protected by adev->mn_lock */
struct hlist_node node;
/* objects protected by lock */
struct rw_semaphore lock;
struct rb_root_cached objects;
#ifdef CONFIG_HMM_MIRROR
/* HMM mirror */
struct hmm_mirror mirror;
#endif
};
#if defined(CONFIG_HMM_MIRROR)
void amdgpu_mn_lock(struct amdgpu_mn *mn);
void amdgpu_mn_unlock(struct amdgpu_mn *mn);
struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
enum amdgpu_mn_type type);
int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr);
void amdgpu_mn_unregister(struct amdgpu_bo *bo);
void amdgpu_hmm_init_range(struct hmm_range *range);
#else
static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {}
static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {}
static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
enum amdgpu_mn_type type)
{
return NULL;
}
static inline int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
{
DRM_WARN_ONCE("HMM_MIRROR kernel config option is not enabled, "
......
......@@ -30,6 +30,9 @@
#include <drm/amdgpu_drm.h>
#include "amdgpu.h"
#ifdef CONFIG_MMU_NOTIFIER
#include <linux/mmu_notifier.h>
#endif
#define AMDGPU_BO_INVALID_OFFSET LONG_MAX
#define AMDGPU_BO_MAX_PLACEMENTS 3
......@@ -101,10 +104,12 @@ struct amdgpu_bo {
struct ttm_bo_kmap_obj dma_buf_vmap;
struct amdgpu_mn *mn;
union {
struct list_head mn_list;
struct list_head shadow_list;
};
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_interval_notifier notifier;
#endif
struct list_head shadow_list;
struct kgd_mem *kfd_bo;
};
......
......@@ -35,6 +35,7 @@
#include <linux/hmm.h>
#include <linux/pagemap.h>
#include <linux/sched/task.h>
#include <linux/sched/mm.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/swap.h>
......@@ -769,6 +770,20 @@ struct amdgpu_ttm_tt {
#endif
};
#ifdef CONFIG_DRM_AMDGPU_USERPTR
/* flags used by HMM internal, not related to CPU/GPU PTE flags */
static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
(1 << 0), /* HMM_PFN_VALID */
(1 << 1), /* HMM_PFN_WRITE */
0 /* HMM_PFN_DEVICE_PRIVATE */
};
static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
0, /* HMM_PFN_NONE */
0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */
};
/**
* amdgpu_ttm_tt_get_user_pages - get device accessible pages that back user
* memory and start HMM tracking CPU page table update
......@@ -776,85 +791,89 @@ struct amdgpu_ttm_tt {
* Calling function must call amdgpu_ttm_tt_userptr_range_done() once and only
* once afterwards to stop HMM tracking
*/
#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
#define MAX_RETRY_HMM_RANGE_FAULT 16
int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
{
struct hmm_mirror *mirror = bo->mn ? &bo->mn->mirror : NULL;
struct ttm_tt *ttm = bo->tbo.ttm;
struct amdgpu_ttm_tt *gtt = (void *)ttm;
struct mm_struct *mm = gtt->usertask->mm;
unsigned long start = gtt->userptr;
struct vm_area_struct *vma;
struct hmm_range *range;
unsigned long timeout;
struct mm_struct *mm;
unsigned long i;
uint64_t *pfns;
int r = 0;
if (!mm) /* Happens during process shutdown */
return -ESRCH;
if (unlikely(!mirror)) {
DRM_DEBUG_DRIVER("Failed to get hmm_mirror\n");
r = -EFAULT;
goto out;
mm = bo->notifier.mm;
if (unlikely(!mm)) {
DRM_DEBUG_DRIVER("BO is not registered?\n");
return -EFAULT;
}
vma = find_vma(mm, start);
if (unlikely(!vma || start < vma->vm_start)) {
r = -EFAULT;
goto out;
}
if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
vma->vm_file)) {
r = -EPERM;
goto out;
}
/* Another get_user_pages is running at the same time?? */
if (WARN_ON(gtt->range))
return -EFAULT;
if (!mmget_not_zero(mm)) /* Happens during process shutdown */
return -ESRCH;
range = kzalloc(sizeof(*range), GFP_KERNEL);
if (unlikely(!range)) {
r = -ENOMEM;
goto out;
}
range->notifier = &bo->notifier;
range->flags = hmm_range_flags;
range->values = hmm_range_values;
range->pfn_shift = PAGE_SHIFT;
range->start = bo->notifier.interval_tree.start;
range->end = bo->notifier.interval_tree.last + 1;
range->default_flags = hmm_range_flags[HMM_PFN_VALID];
if (!amdgpu_ttm_tt_is_readonly(ttm))
range->default_flags |= range->flags[HMM_PFN_WRITE];
pfns = kvmalloc_array(ttm->num_pages, sizeof(*pfns), GFP_KERNEL);
if (unlikely(!pfns)) {
range->pfns = kvmalloc_array(ttm->num_pages, sizeof(*range->pfns),
GFP_KERNEL);
if (unlikely(!range->pfns)) {
r = -ENOMEM;
goto out_free_ranges;
}
amdgpu_hmm_init_range(range);
range->default_flags = range->flags[HMM_PFN_VALID];
range->default_flags |= amdgpu_ttm_tt_is_readonly(ttm) ?
0 : range->flags[HMM_PFN_WRITE];
range->pfn_flags_mask = 0;
range->pfns = pfns;
range->start = start;
range->end = start + ttm->num_pages * PAGE_SIZE;
hmm_range_register(range, mirror);
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
if (unlikely(!vma || start < vma->vm_start)) {
r = -EFAULT;
goto out_unlock;
}
if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
vma->vm_file)) {
r = -EPERM;
goto out_unlock;
}
up_read(&mm->mmap_sem);
timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
/*
* Just wait for range to be valid, safe to ignore return value as we
* will use the return value of hmm_range_fault() below under the
* mmap_sem to ascertain the validity of the range.
*/
hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);
retry:
range->notifier_seq = mmu_interval_read_begin(&bo->notifier);
down_read(&mm->mmap_sem);
r = hmm_range_fault(range, 0);
up_read(&mm->mmap_sem);
if (unlikely(r < 0))
if (unlikely(r <= 0)) {
/*
* FIXME: This timeout should encompass the retry from
* mmu_interval_read_retry() as well.
*/
if ((r == 0 || r == -EBUSY) && !time_after(jiffies, timeout))
goto retry;
goto out_free_pfns;
}
for (i = 0; i < ttm->num_pages; i++) {
pages[i] = hmm_device_entry_to_page(range, pfns[i]);
/* FIXME: The pages cannot be touched outside the notifier_lock */
pages[i] = hmm_device_entry_to_page(range, range->pfns[i]);
if (unlikely(!pages[i])) {
pr_err("Page fault failed for pfn[%lu] = 0x%llx\n",
i, pfns[i]);
i, range->pfns[i]);
r = -ENOMEM;
goto out_free_pfns;
......@@ -862,15 +881,18 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
}
gtt->range = range;
mmput(mm);
return 0;
out_unlock:
up_read(&mm->mmap_sem);
out_free_pfns:
hmm_range_unregister(range);
kvfree(pfns);
kvfree(range->pfns);
out_free_ranges:
kfree(range);
out:
mmput(mm);
return r;
}
......@@ -895,15 +917,18 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
"No user pages to check\n");
if (gtt->range) {
r = hmm_range_valid(gtt->range);
hmm_range_unregister(gtt->range);
/*
* FIXME: Must always hold notifier_lock for this, and must
* not ignore the return code.
*/
r = mmu_interval_read_retry(gtt->range->notifier,
gtt->range->notifier_seq);
kvfree(gtt->range->pfns);
kfree(gtt->range);
gtt->range = NULL;
}
return r;
return !r;
}
#endif
......@@ -984,10 +1009,18 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
sg_free_table(ttm->sg);
#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
if (gtt->range &&
ttm->pages[0] == hmm_device_entry_to_page(gtt->range,
gtt->range->pfns[0]))
WARN_ONCE(1, "Missing get_user_page_done\n");
if (gtt->range) {
unsigned long i;
for (i = 0; i < ttm->num_pages; i++) {
if (ttm->pages[i] !=
hmm_device_entry_to_page(gtt->range,
gtt->range->pfns[i]))
break;
}
WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
}
#endif
}
......
This diff is collapsed.
......@@ -68,6 +68,10 @@
#include <linux/hashtable.h>
#include <linux/dma-fence.h>
#ifdef CONFIG_MMU_NOTIFIER
#include <linux/mmu_notifier.h>
#endif
#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>
#include <drm/ttm/ttm_placement.h>
......@@ -509,8 +513,9 @@ struct radeon_bo {
struct ttm_bo_kmap_obj dma_buf_vmap;
pid_t pid;
struct radeon_mn *mn;
struct list_head mn_list;
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_interval_notifier notifier;
#endif
};
#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, tbo.base)
......
......@@ -36,131 +36,51 @@
#include "radeon.h"
struct radeon_mn {
struct mmu_notifier mn;
/* objects protected by lock */
struct mutex lock;
struct rb_root_cached objects;
};
struct radeon_mn_node {
struct interval_tree_node it;
struct list_head bos;
};
/**
* radeon_mn_invalidate_range_start - callback to notify about mm change
* radeon_mn_invalidate - callback to notify about mm change
*
* @mn: our notifier
* @mn: the mm this callback is about
* @start: start of updated range
* @end: end of updated range
* @range: the VMA under invalidation
*
* We block for all BOs between start and end to be idle and
* unmap them by move them into system domain again.
*/
static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *range)
static bool radeon_mn_invalidate(struct mmu_interval_notifier *mn,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct radeon_bo *bo = container_of(mn, struct radeon_bo, notifier);
struct ttm_operation_ctx ctx = { false, false };
struct interval_tree_node *it;
unsigned long end;
int ret = 0;
/* notification is exclusive, but interval is inclusive */
end = range->end - 1;
/* TODO we should be able to split locking for interval tree and
* the tear down.
*/
if (mmu_notifier_range_blockable(range))
mutex_lock(&rmn->lock);
else if (!mutex_trylock(&rmn->lock))
return -EAGAIN;
it = interval_tree_iter_first(&rmn->objects, range->start, end);
while (it) {
struct radeon_mn_node *node;
struct radeon_bo *bo;
long r;
if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
goto out_unlock;
}
node = container_of(it, struct radeon_mn_node, it);
it = interval_tree_iter_next(it, range->start, end);
long r;
list_for_each_entry(bo, &node->bos, mn_list) {
if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound)
return true;
if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound)
continue;
if (!mmu_notifier_range_blockable(range))
return false;
r = radeon_bo_reserve(bo, true);
if (r) {
DRM_ERROR("(%ld) failed to reserve user bo\n", r);
continue;
}
r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
true, false, MAX_SCHEDULE_TIMEOUT);
if (r <= 0)
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_CPU);
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (r)
DRM_ERROR("(%ld) failed to validate user bo\n", r);
radeon_bo_unreserve(bo);
}
r = radeon_bo_reserve(bo, true);
if (r) {
DRM_ERROR("(%ld) failed to reserve user bo\n", r);
return true;
}
out_unlock:
mutex_unlock(&rmn->lock);
return ret;
}
static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
struct mmu_notifier_range range = {
.mm = mm,
.start = 0,
.end = ULONG_MAX,
.flags = 0,
.event = MMU_NOTIFY_UNMAP,
};
radeon_mn_invalidate_range_start(mn, &range);
}
static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm)
{
struct radeon_mn *rmn;
rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
if (!rmn)
return ERR_PTR(-ENOMEM);
r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, true, false,
MAX_SCHEDULE_TIMEOUT);
if (r <= 0)
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
mutex_init(&rmn->lock);
rmn->objects = RB_ROOT_CACHED;
return &rmn->mn;
}
radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_CPU);
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (r)
DRM_ERROR("(%ld) failed to validate user bo\n", r);
static void radeon_mn_free_notifier(struct mmu_notifier *mn)
{
kfree(container_of(mn, struct radeon_mn, mn));
radeon_bo_unreserve(bo);
return true;
}
static const struct mmu_notifier_ops radeon_mn_ops = {
.release = radeon_mn_release,
.invalidate_range_start = radeon_mn_invalidate_range_start,
.alloc_notifier = radeon_mn_alloc_notifier,
.free_notifier = radeon_mn_free_notifier,
static const struct mmu_interval_notifier_ops radeon_mn_ops = {
.invalidate = radeon_mn_invalidate,
};
/**
......@@ -174,51 +94,20 @@ static const struct mmu_notifier_ops radeon_mn_ops = {
*/
int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
{
unsigned long end = addr + radeon_bo_size(bo) - 1;
struct mmu_notifier *mn;
struct radeon_mn *rmn;
struct radeon_mn_node *node = NULL;
struct list_head bos;
struct interval_tree_node *it;
mn = mmu_notifier_get(&radeon_mn_ops, current->mm);
if (IS_ERR(mn))
return PTR_ERR(mn);
rmn = container_of(mn, struct radeon_mn, mn);
INIT_LIST_HEAD(&bos);
mutex_lock(&rmn->lock);
while ((it = interval_tree_iter_first(&rmn->objects, addr, end))) {
kfree(node);
node = container_of(it, struct radeon_mn_node, it);
interval_tree_remove(&node->it, &rmn->objects);
addr = min(it->start, addr);
end = max(it->last, end);
list_splice(&node->bos, &bos);
}
if (!node) {
node = kmalloc(sizeof(struct radeon_mn_node), GFP_KERNEL);
if (!node) {
mutex_unlock(&rmn->lock);
return -ENOMEM;
}
}
bo->mn = rmn;
node->it.start = addr;
node->it.last = end;
INIT_LIST_HEAD(&node->bos);
list_splice(&bos, &node->bos);
list_add(&bo->mn_list, &node->bos);
interval_tree_insert(&node->it, &rmn->objects);
mutex_unlock(&rmn->lock);
int ret;
ret = mmu_interval_notifier_insert(&bo->notifier, current->mm, addr,
radeon_bo_size(bo), &radeon_mn_ops);
if (ret)
return ret;
/*
* FIXME: radeon appears to allow get_user_pages to run during
* invalidate_range_start/end, which is not a safe way to read the
* PTEs. It should use the mmu_interval_read_begin() scheme around the
* get_user_pages to ensure that the PTEs are read properly
*/
mmu_interval_read_begin(&bo->notifier);
return 0;
}
......@@ -231,27 +120,8 @@ int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
*/
void radeon_mn_unregister(struct radeon_bo *bo)
{
struct radeon_mn *rmn = bo->mn;
struct list_head *head;
if (!rmn)
if (!bo->notifier.mm)
return;
mutex_lock(&rmn->lock);
/* save the next list entry for later */
head = bo->mn_list.next;
list_del(&bo->mn_list);
if (list_empty(head)) {
struct radeon_mn_node *node;
node = container_of(head, struct radeon_mn_node, bos);
interval_tree_remove(&node->it, &rmn->objects);
kfree(node);
}
mutex_unlock(&rmn->lock);
mmu_notifier_put(&rmn->mn);
bo->mn = NULL;
mmu_interval_notifier_remove(&bo->notifier);
bo->notifier.mm = NULL;
}
......@@ -2634,7 +2634,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, get_vf_guid);
SET_DEVICE_OP(dev_ops, get_vf_stats);
SET_DEVICE_OP(dev_ops, init_port);
SET_DEVICE_OP(dev_ops, invalidate_range);
SET_DEVICE_OP(dev_ops, iw_accept);
SET_DEVICE_OP(dev_ops, iw_add_ref);
SET_DEVICE_OP(dev_ops, iw_connect);
......
This diff is collapsed.
......@@ -1138,7 +1138,7 @@ static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
/* adjust flag if this fd is not able to cache */
if (!fd->handler)
if (!fd->use_mn)
cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
cinfo.num_active = hfi1_count_active_units();
......
......@@ -1444,7 +1444,7 @@ struct hfi1_filedata {
/* for cpu affinity; -1 if none */
int rec_cpu_num;
u32 tid_n_pinned;
struct mmu_rb_handler *handler;
bool use_mn;
struct tid_rb_node **entry_to_rb;
spinlock_t tid_lock; /* protect tid_[limit,used] counters */
u32 tid_limit;
......
......@@ -59,11 +59,11 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
struct tid_user_buf *tbuf,
u32 rcventry, struct tid_group *grp,
u16 pageidx, unsigned int npages);
static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
struct tid_rb_node *tnode);
static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
struct tid_group *grp,
unsigned int start, u16 count,
......@@ -73,10 +73,8 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
static struct mmu_rb_ops tid_rb_ops = {
.insert = tid_rb_insert,
.remove = tid_rb_remove,
.invalidate = tid_rb_invalidate
static const struct mmu_interval_notifier_ops tid_mn_ops = {
.invalidate = tid_rb_invalidate,
};
/*
......@@ -87,7 +85,6 @@ static struct mmu_rb_ops tid_rb_ops = {
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
struct hfi1_ctxtdata *uctxt)
{
struct hfi1_devdata *dd = uctxt->dd;
int ret = 0;
spin_lock_init(&fd->tid_lock);
......@@ -109,20 +106,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
fd->entry_to_rb = NULL;
return -ENOMEM;
}
/*
* Register MMU notifier callbacks. If the registration
* fails, continue without TID caching for this context.
*/
ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
dd->pport->hfi1_wq,
&fd->handler);
if (ret) {
dd_dev_info(dd,
"Failed MMU notifier registration %d\n",
ret);
ret = 0;
}
fd->use_mn = true;
}
/*
......@@ -139,7 +123,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
* init.
*/
spin_lock(&fd->tid_lock);
if (uctxt->subctxt_cnt && fd->handler) {
if (uctxt->subctxt_cnt && fd->use_mn) {
u16 remainder;
fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
......@@ -158,18 +142,10 @@ void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
/*
* The notifier would have been removed when the process'es mm
* was freed.
*/
if (fd->handler) {
hfi1_mmu_rb_unregister(fd->handler);
} else {
if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
}
if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
kfree(fd->invalid_tids);
fd->invalid_tids = NULL;
......@@ -201,7 +177,7 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd,
if (mapped) {
pci_unmap_single(dd->pcidev, node->dma_addr,
node->mmu.len, PCI_DMA_FROMDEVICE);
node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
pages = &node->pages[idx];
} else {
pages = &tidbuf->pages[idx];
......@@ -777,8 +753,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
return -EFAULT;
}
node->mmu.addr = tbuf->vaddr + (pageidx * PAGE_SIZE);
node->mmu.len = npages * PAGE_SIZE;
node->fdata = fd;
node->phys = page_to_phys(pages[0]);
node->npages = npages;
node->rcventry = rcventry;
......@@ -787,23 +762,35 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
node->freed = false;
memcpy(node->pages, pages, sizeof(struct page *) * npages);
if (!fd->handler)
ret = tid_rb_insert(fd, &node->mmu);
else
ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
if (ret) {
hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
node->rcventry, node->mmu.addr, node->phys, ret);
pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
PCI_DMA_FROMDEVICE);
kfree(node);
return -EFAULT;
if (fd->use_mn) {
ret = mmu_interval_notifier_insert(
&node->notifier, fd->mm,
tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
&tid_mn_ops);
if (ret)
goto out_unmap;
/*
* FIXME: This is in the wrong order, the notifier should be
* established before the pages are pinned by pin_rcv_pages.
*/
mmu_interval_read_begin(&node->notifier);
}
fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
node->mmu.addr, node->phys, phys);
node->notifier.interval_tree.start, node->phys,
phys);
return 0;
out_unmap:
hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
node->rcventry, node->notifier.interval_tree.start,
node->phys, ret);
pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
PCI_DMA_FROMDEVICE);
kfree(node);
return -EFAULT;
}
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
......@@ -833,10 +820,9 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
if (grp)
*grp = node->grp;
if (!fd->handler)
cacheless_tid_rb_remove(fd, node);
else
hfi1_mmu_rb_remove(fd->handler, &node->mmu);
if (fd->use_mn)
mmu_interval_notifier_remove(&node->notifier);
cacheless_tid_rb_remove(fd, node);
return 0;
}
......@@ -847,7 +833,8 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
struct hfi1_devdata *dd = uctxt->dd;
trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
node->npages, node->mmu.addr, node->phys,
node->npages,
node->notifier.interval_tree.start, node->phys,
node->dma_addr);
/*
......@@ -894,30 +881,29 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
if (!node || node->rcventry != rcventry)
continue;
if (fd->use_mn)
mmu_interval_notifier_remove(
&node->notifier);
cacheless_tid_rb_remove(fd, node);
}
}
}
}
/*
* Always return 0 from this function. A non-zero return indicates that the
* remove operation will be called and that memory should be unpinned.
* However, the driver cannot unpin out from under PSM. Instead, retain the
* memory (by returning 0) and inform PSM that the memory is going away. PSM
* will call back later when it has removed the memory from its list.
*/
static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
struct hfi1_filedata *fdata = arg;
struct hfi1_ctxtdata *uctxt = fdata->uctxt;
struct tid_rb_node *node =
container_of(mnode, struct tid_rb_node, mmu);
container_of(mni, struct tid_rb_node, notifier);
struct hfi1_filedata *fdata = node->fdata;
struct hfi1_ctxtdata *uctxt = fdata->uctxt;
if (node->freed)
return 0;
return true;
trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
node->notifier.interval_tree.start,
node->rcventry, node->npages, node->dma_addr);
node->freed = true;
......@@ -946,18 +932,7 @@ static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
fdata->invalid_tid_idx++;
}
spin_unlock(&fdata->invalid_lock);
return 0;
}
static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
{
struct hfi1_filedata *fdata = arg;
struct tid_rb_node *tnode =
container_of(node, struct tid_rb_node, mmu);
u32 base = fdata->uctxt->expected_base;
fdata->entry_to_rb[tnode->rcventry - base] = tnode;
return 0;
return true;
}
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
......@@ -968,12 +943,3 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
fdata->entry_to_rb[tnode->rcventry - base] = NULL;
clear_tid_node(fdata, tnode);
}
static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
{
struct hfi1_filedata *fdata = arg;
struct tid_rb_node *tnode =
container_of(node, struct tid_rb_node, mmu);
cacheless_tid_rb_remove(fdata, tnode);
}
......@@ -65,7 +65,8 @@ struct tid_user_buf {
};
struct tid_rb_node {
struct mmu_rb_node mmu;
struct mmu_interval_notifier notifier;
struct hfi1_filedata *fdata;
unsigned long phys;
struct tid_group *grp;
u32 rcventry;
......
......@@ -1258,8 +1258,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
unsigned long end);
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
size_t nentries, struct mlx5_ib_mr *mr, int flags);
......@@ -1289,11 +1287,10 @@ mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
{
return -EOPNOTSUPP;
}
static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
unsigned long start,
unsigned long end){};
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
/* Needed for rep profile */
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
const struct mlx5_ib_profile *profile,
......
......@@ -749,7 +749,8 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
if (access_flags & IB_ACCESS_ON_DEMAND) {
struct ib_umem_odp *odp;
odp = ib_umem_odp_get(udata, start, length, access_flags);
odp = ib_umem_odp_get(udata, start, length, access_flags,
&mlx5_mn_ops);
if (IS_ERR(odp)) {
mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
PTR_ERR(odp));
......
......@@ -241,18 +241,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
xa_unlock(&imr->implicit_children);
}
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
unsigned long end)
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
struct ib_umem_odp *umem_odp =
container_of(mni, struct ib_umem_odp, notifier);
struct mlx5_ib_mr *mr;
const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
sizeof(struct mlx5_mtt)) - 1;
u64 idx = 0, blk_start_idx = 0;
u64 invalidations = 0;
unsigned long start;
unsigned long end;
int in_block = 0;
u64 addr;
if (!mmu_notifier_range_blockable(range))
return false;
mutex_lock(&umem_odp->umem_mutex);
mmu_interval_set_seq(mni, cur_seq);
/*
* If npages is zero then umem_odp->private may not be setup yet. This
* does not complete until after the first page is mapped for DMA.
......@@ -261,8 +270,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
goto out;
mr = umem_odp->private;
start = max_t(u64, ib_umem_start(umem_odp), start);
end = min_t(u64, ib_umem_end(umem_odp), end);
start = max_t(u64, ib_umem_start(umem_odp), range->start);
end = min_t(u64, ib_umem_end(umem_odp), range->end);
/*
* Iteration one - zap the HW's MTTs. The notifiers_count ensures that
......@@ -319,8 +328,13 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
destroy_unused_implicit_child_mr(mr);
out:
mutex_unlock(&umem_odp->umem_mutex);
return true;
}
const struct mmu_interval_notifier_ops mlx5_mn_ops = {
.invalidate = mlx5_ib_invalidate_range,
};
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
struct ib_odp_caps *caps = &dev->odp_caps;
......@@ -419,7 +433,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
idx * MLX5_IMR_MTT_SIZE,
MLX5_IMR_MTT_SIZE);
MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
if (IS_ERR(odp))
return ERR_CAST(odp);
......@@ -606,8 +620,9 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
u64 user_va, size_t bcnt, u32 *bytes_mapped,
u32 flags)
{
int current_seq, page_shift, ret, np;
int page_shift, ret, np;
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
unsigned long current_seq;
u64 access_mask;
u64 start_idx, page_mask;
......@@ -619,12 +634,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
current_seq = READ_ONCE(odp->notifiers_seq);
/*
* Ensure the sequence number is valid for some time before we call
* gup.
*/
smp_rmb();
current_seq = mmu_interval_read_begin(&odp->notifier);
np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask,
current_seq);
......@@ -632,7 +642,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
return np;
mutex_lock(&odp->umem_mutex);
if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
if (!mmu_interval_read_retry(&odp->notifier, current_seq)) {
/*
* No need to check whether the MTTs really belong to
* this MR, since ib_umem_odp_map_dma_pages already
......@@ -662,19 +672,6 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
return np << (page_shift - PAGE_SHIFT);
out:
if (ret == -EAGAIN) {
unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
if (!wait_for_completion_timeout(&odp->notifier_completion,
timeout)) {
mlx5_ib_warn(
mr->dev,
"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
current_seq, odp->notifiers_seq,
odp->notifiers_count);
}
}
return ret;
}
......@@ -1622,7 +1619,6 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
.advise_mr = mlx5_ib_advise_mr,
.invalidate_range = mlx5_ib_invalidate_range,
};
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
......
......@@ -21,15 +21,8 @@ struct gntdev_dmabuf_priv;
struct gntdev_priv {
/* Maps with visible offsets in the file descriptor. */
struct list_head maps;
/*
* Maps that are not visible; will be freed on munmap.
* Only populated if populate_freeable_maps == 1
*/
struct list_head freeable_maps;
/* lock protects maps and freeable_maps. */
struct mutex lock;
struct mm_struct *mm;
struct mmu_notifier mn;
#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
/* Device for which DMA memory is allocated. */
......@@ -49,6 +42,7 @@ struct gntdev_unmap_notify {
};
struct gntdev_grant_map {
struct mmu_interval_notifier notifier;
struct list_head next;
struct vm_area_struct *vma;
int index;
......
......@@ -63,7 +63,6 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
static atomic_t pages_mapped = ATOMIC_INIT(0);
static int use_ptemod;
#define populate_freeable_maps use_ptemod
static int unmap_grant_pages(struct gntdev_grant_map *map,
int offset, int pages);
......@@ -249,12 +248,6 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
evtchn_put(map->notify.event);
}
if (populate_freeable_maps && priv) {
mutex_lock(&priv->lock);
list_del(&map->next);
mutex_unlock(&priv->lock);
}
if (map->pages && !use_ptemod)
unmap_grant_pages(map, 0, map->count);
gntdev_free_map(map);
......@@ -444,16 +437,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
pr_debug("gntdev_vma_close %p\n", vma);
if (use_ptemod) {
/* It is possible that an mmu notifier could be running
* concurrently, so take priv->lock to ensure that the vma won't
* vanishing during the unmap_grant_pages call, since we will
* spin here until that completes. Such a concurrent call will
* not do any unmapping, since that has been done prior to
* closing the vma, but it may still iterate the unmap_ops list.
*/
mutex_lock(&priv->lock);
WARN_ON(map->vma != vma);
mmu_interval_notifier_remove(&map->notifier);
map->vma = NULL;
mutex_unlock(&priv->lock);
}
vma->vm_private_data = NULL;
gntdev_put_map(priv, map);
......@@ -475,109 +461,44 @@ static const struct vm_operations_struct gntdev_vmops = {
/* ------------------------------------------------------------------ */
static bool in_range(struct gntdev_grant_map *map,
unsigned long start, unsigned long end)
{
if (!map->vma)
return false;
if (map->vma->vm_start >= end)
return false;
if (map->vma->vm_end <= start)
return false;
return true;
}
static int unmap_if_in_range(struct gntdev_grant_map *map,
unsigned long start, unsigned long end,
bool blockable)
static bool gntdev_invalidate(struct mmu_interval_notifier *mn,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
struct gntdev_grant_map *map =
container_of(mn, struct gntdev_grant_map, notifier);
unsigned long mstart, mend;
int err;
if (!in_range(map, start, end))
return 0;
if (!mmu_notifier_range_blockable(range))
return false;
if (!blockable)
return -EAGAIN;
/*
* If the VMA is split or otherwise changed the notifier is not
* updated, but we don't want to process VA's outside the modified
* VMA. FIXME: It would be much more understandable to just prevent
* modifying the VMA in the first place.
*/
if (map->vma->vm_start >= range->end ||
map->vma->vm_end <= range->start)
return true;
mstart = max(start, map->vma->vm_start);
mend = min(end, map->vma->vm_end);
mstart = max(range->start, map->vma->vm_start);
mend = min(range->end, map->vma->vm_end);
pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
map->index, map->count,
map->vma->vm_start, map->vma->vm_end,
start, end, mstart, mend);
range->start, range->end, mstart, mend);
err = unmap_grant_pages(map,
(mstart - map->vma->vm_start) >> PAGE_SHIFT,
(mend - mstart) >> PAGE_SHIFT);
WARN_ON(err);
return 0;
}
static int mn_invl_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *range)
{
struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
struct gntdev_grant_map *map;
int ret = 0;
if (mmu_notifier_range_blockable(range))
mutex_lock(&priv->lock);
else if (!mutex_trylock(&priv->lock))
return -EAGAIN;
list_for_each_entry(map, &priv->maps, next) {
ret = unmap_if_in_range(map, range->start, range->end,
mmu_notifier_range_blockable(range));
if (ret)
goto out_unlock;
}
list_for_each_entry(map, &priv->freeable_maps, next) {
ret = unmap_if_in_range(map, range->start, range->end,
mmu_notifier_range_blockable(range));
if (ret)
goto out_unlock;
}
out_unlock:
mutex_unlock(&priv->lock);
return ret;
}
static void mn_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
struct gntdev_grant_map *map;
int err;
mutex_lock(&priv->lock);
list_for_each_entry(map, &priv->maps, next) {
if (!map->vma)
continue;
pr_debug("map %d+%d (%lx %lx)\n",
map->index, map->count,
map->vma->vm_start, map->vma->vm_end);
err = unmap_grant_pages(map, /* offset */ 0, map->count);
WARN_ON(err);
}
list_for_each_entry(map, &priv->freeable_maps, next) {
if (!map->vma)
continue;
pr_debug("map %d+%d (%lx %lx)\n",
map->index, map->count,
map->vma->vm_start, map->vma->vm_end);
err = unmap_grant_pages(map, /* offset */ 0, map->count);
WARN_ON(err);
}
mutex_unlock(&priv->lock);
return true;
}
static const struct mmu_notifier_ops gntdev_mmu_ops = {
.release = mn_release,
.invalidate_range_start = mn_invl_range_start,
static const struct mmu_interval_notifier_ops gntdev_mmu_ops = {
.invalidate = gntdev_invalidate,
};
/* ------------------------------------------------------------------ */
......@@ -592,7 +513,6 @@ static int gntdev_open(struct inode *inode, struct file *flip)
return -ENOMEM;
INIT_LIST_HEAD(&priv->maps);
INIT_LIST_HEAD(&priv->freeable_maps);
mutex_init(&priv->lock);
#ifdef CONFIG_XEN_GNTDEV_DMABUF
......@@ -604,17 +524,6 @@ static int gntdev_open(struct inode *inode, struct file *flip)
}
#endif
if (use_ptemod) {
priv->mm = get_task_mm(current);
if (!priv->mm) {
kfree(priv);
return -ENOMEM;
}
priv->mn.ops = &gntdev_mmu_ops;
ret = mmu_notifier_register(&priv->mn, priv->mm);
mmput(priv->mm);
}
if (ret) {
kfree(priv);
return ret;
......@@ -644,16 +553,12 @@ static int gntdev_release(struct inode *inode, struct file *flip)
list_del(&map->next);
gntdev_put_map(NULL /* already removed */, map);
}
WARN_ON(!list_empty(&priv->freeable_maps));
mutex_unlock(&priv->lock);
#ifdef CONFIG_XEN_GNTDEV_DMABUF
gntdev_dmabuf_fini(priv->dmabuf_priv);
#endif
if (use_ptemod)
mmu_notifier_unregister(&priv->mn, priv->mm);
kfree(priv);
return 0;
}
......@@ -714,8 +619,6 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
if (map) {
list_del(&map->next);
if (populate_freeable_maps)
list_add_tail(&map->next, &priv->freeable_maps);
err = 0;
}
mutex_unlock(&priv->lock);
......@@ -1087,11 +990,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
goto unlock_out;
if (use_ptemod && map->vma)
goto unlock_out;
if (use_ptemod && priv->mm != vma->vm_mm) {
pr_warn("Huh? Other mm?\n");
goto unlock_out;
}
refcount_inc(&map->users);
vma->vm_ops = &gntdev_vmops;
......@@ -1102,10 +1000,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
vma->vm_flags |= VM_DONTCOPY;
vma->vm_private_data = map;
if (use_ptemod)
map->vma = vma;
if (map->flags) {
if ((vma->vm_flags & VM_WRITE) &&
(map->flags & GNTMAP_readonly))
......@@ -1116,8 +1010,28 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
map->flags |= GNTMAP_readonly;
}
if (use_ptemod) {
map->vma = vma;
err = mmu_interval_notifier_insert_locked(
&map->notifier, vma->vm_mm, vma->vm_start,
vma->vm_end - vma->vm_start, &gntdev_mmu_ops);
if (err)
goto out_unlock_put;
}
mutex_unlock(&priv->lock);
/*
* gntdev takes the address of the PTE in find_grant_ptes() and passes
* it to the hypervisor in gntdev_map_grant_pages(). The purpose of
* the notifier is to prevent the hypervisor pointer to the PTE from
* going stale.
*
* Since this vma's mappings can't be touched without the mmap_sem,
* and we are holding it now, there is no need for the notifier_range
* locking pattern.
*/
mmu_interval_read_begin(&map->notifier);
if (use_ptemod) {
map->pages_vm_start = vma->vm_start;
err = apply_to_page_range(vma->vm_mm, vma->vm_start,
......@@ -1166,8 +1080,11 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
mutex_unlock(&priv->lock);
out_put_map:
if (use_ptemod) {
map->vma = NULL;
unmap_grant_pages(map, 0, map->count);
if (map->vma) {
mmu_interval_notifier_remove(&map->notifier);
map->vma = NULL;
}
}
gntdev_put_map(priv, map);
return err;
......
......@@ -62,37 +62,12 @@
#include <linux/kconfig.h>
#include <asm/pgtable.h>
#ifdef CONFIG_HMM_MIRROR
#include <linux/device.h>
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>
#include <linux/mmu_notifier.h>
/*
* struct hmm - HMM per mm struct
*
* @mm: mm struct this HMM struct is bound to
* @lock: lock protecting ranges list
* @ranges: list of range being snapshotted
* @mirrors: list of mirrors for this mm
* @mmu_notifier: mmu notifier to track updates to CPU page table
* @mirrors_sem: read/write semaphore protecting the mirrors list
* @wq: wait queue for user waiting on a range invalidation
* @notifiers: count of active mmu notifiers
*/
struct hmm {
struct mmu_notifier mmu_notifier;
spinlock_t ranges_lock;
struct list_head ranges;
struct list_head mirrors;
struct rw_semaphore mirrors_sem;
wait_queue_head_t wq;
long notifiers;
};
/*
* hmm_pfn_flag_e - HMM flag enums
*
......@@ -145,6 +120,8 @@ enum hmm_pfn_value_e {
/*
* struct hmm_range - track invalidation lock on virtual address range
*
* @notifier: a mmu_interval_notifier that includes the start/end
* @notifier_seq: result of mmu_interval_read_begin()
* @hmm: the core HMM structure this range is active against
* @vma: the vm area struct for the range
* @list: all range lock are on a list
......@@ -159,8 +136,8 @@ enum hmm_pfn_value_e {
* @valid: pfns array did not change since it has been fill by an HMM function
*/
struct hmm_range {
struct hmm *hmm;
struct list_head list;
struct mmu_interval_notifier *notifier;
unsigned long notifier_seq;
unsigned long start;
unsigned long end;
uint64_t *pfns;
......@@ -169,32 +146,8 @@ struct hmm_range {
uint64_t default_flags;
uint64_t pfn_flags_mask;
uint8_t pfn_shift;
bool valid;
};
/*
* hmm_range_wait_until_valid() - wait for range to be valid
* @range: range affected by invalidation to wait on
* @timeout: time out for wait in ms (ie abort wait after that period of time)
* Return: true if the range is valid, false otherwise.
*/
static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
unsigned long timeout)
{
return wait_event_timeout(range->hmm->wq, range->valid,
msecs_to_jiffies(timeout)) != 0;
}
/*
* hmm_range_valid() - test if a range is valid or not
* @range: range
* Return: true if the range is valid, false otherwise.
*/
static inline bool hmm_range_valid(struct hmm_range *range)
{
return range->valid;
}
/*
* hmm_device_entry_to_page() - return struct page pointed to by a device entry
* @range: range use to decode device entry value
......@@ -264,120 +217,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
range->flags[HMM_PFN_VALID];
}
/*
* Mirroring: how to synchronize device page table with CPU page table.
*
* A device driver that is participating in HMM mirroring must always
* synchronize with CPU page table updates. For this, device drivers can either
* directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
* drivers can decide to register one mirror per device per process, or just
* one mirror per process for a group of devices. The pattern is:
*
* int device_bind_address_space(..., struct mm_struct *mm, ...)
* {
* struct device_address_space *das;
*
* // Device driver specific initialization, and allocation of das
* // which contains an hmm_mirror struct as one of its fields.
* ...
*
* ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops);
* if (ret) {
* // Cleanup on error
* return ret;
* }
*
* // Other device driver specific initialization
* ...
* }
*
* Once an hmm_mirror is registered for an address space, the device driver
* will get callbacks through sync_cpu_device_pagetables() operation (see
* hmm_mirror_ops struct).
*
* Device driver must not free the struct containing the hmm_mirror struct
* before calling hmm_mirror_unregister(). The expected usage is to do that when
* the device driver is unbinding from an address space.
*
*
* void device_unbind_address_space(struct device_address_space *das)
* {
* // Device driver specific cleanup
* ...
*
* hmm_mirror_unregister(&das->mirror);
*
* // Other device driver specific cleanup, and now das can be freed
* ...
* }
*/
struct hmm_mirror;
/*
* struct hmm_mirror_ops - HMM mirror device operations callback
*
* @update: callback to update range on a device
*/
struct hmm_mirror_ops {
/* release() - release hmm_mirror
*
* @mirror: pointer to struct hmm_mirror
*
* This is called when the mm_struct is being released. The callback
* must ensure that all access to any pages obtained from this mirror
* is halted before the callback returns. All future access should
* fault.
*/
void (*release)(struct hmm_mirror *mirror);
/* sync_cpu_device_pagetables() - synchronize page tables
*
* @mirror: pointer to struct hmm_mirror
* @update: update information (see struct mmu_notifier_range)
* Return: -EAGAIN if mmu_notifier_range_blockable(update) is false
* and callback needs to block, 0 otherwise.
*
* This callback ultimately originates from mmu_notifiers when the CPU
* page table is updated. The device driver must update its page table
* in response to this callback. The update argument tells what action
* to perform.
*
* The device driver must not return from this callback until the device
* page tables are completely updated (TLBs flushed, etc); this is a
* synchronous call.
*/
int (*sync_cpu_device_pagetables)(
struct hmm_mirror *mirror,
const struct mmu_notifier_range *update);
};
/*
* struct hmm_mirror - mirror struct for a device driver
*
* @hmm: pointer to struct hmm (which is unique per mm_struct)
* @ops: device driver callback for HMM mirror operations
* @list: for list of mirrors of a given mm
*
* Each address space (mm_struct) being mirrored by a device must register one
* instance of an hmm_mirror struct with HMM. HMM will track the list of all
* mirrors for each mm_struct.
*/
struct hmm_mirror {
struct hmm *hmm;
const struct hmm_mirror_ops *ops;
struct list_head list;
};
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror);
/*
* Please see Documentation/vm/hmm.rst for how to use the range API.
*/
int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror);
void hmm_range_unregister(struct hmm_range *range);
/*
* Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
*/
......@@ -386,16 +225,17 @@ void hmm_range_unregister(struct hmm_range *range);
/* Don't fault in missing PTEs, just snapshot the current state. */
#define HMM_FAULT_SNAPSHOT (1 << 1)
#ifdef CONFIG_HMM_MIRROR
/*
* Please see Documentation/vm/hmm.rst for how to use the range API.
*/
long hmm_range_fault(struct hmm_range *range, unsigned int flags);
long hmm_range_dma_map(struct hmm_range *range,
struct device *device,
dma_addr_t *daddrs,
unsigned int flags);
long hmm_range_dma_unmap(struct hmm_range *range,
struct device *device,
dma_addr_t *daddrs,
bool dirty);
#else
static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
return -EOPNOTSUPP;
}
#endif
/*
* HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
......@@ -406,6 +246,4 @@ long hmm_range_dma_unmap(struct hmm_range *range,
*/
#define HMM_RANGE_DEFAULT_TIMEOUT 1000
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
#endif /* LINUX_HMM_H */
......@@ -6,9 +6,12 @@
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>
struct mmu_notifier_mm;
struct mmu_notifier;
struct mmu_notifier_ops;
struct mmu_notifier_range;
struct mmu_interval_notifier;
/**
* enum mmu_notifier_event - reason for the mmu notifier callback
......@@ -31,6 +34,9 @@ struct mmu_notifier_ops;
* access flags). User should soft dirty the page in the end callback to make
* sure that anyone relying on soft dirtyness catch pages that might be written
* through non CPU mappings.
*
* @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
* that the mm refcount is zero and the range is no longer accessible.
*/
enum mmu_notifier_event {
MMU_NOTIFY_UNMAP = 0,
......@@ -38,38 +44,11 @@ enum mmu_notifier_event {
MMU_NOTIFY_PROTECTION_VMA,
MMU_NOTIFY_PROTECTION_PAGE,
MMU_NOTIFY_SOFT_DIRTY,
};
#ifdef CONFIG_MMU_NOTIFIER
#ifdef CONFIG_LOCKDEP
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif
/*
* The mmu notifier_mm structure is allocated and installed in
* mm->mmu_notifier_mm inside the mm_take_all_locks() protected
* critical section and it's released only when mm_count reaches zero
* in mmdrop().
*/
struct mmu_notifier_mm {
/* all mmu notifiers registerd in this mm are queued in this list */
struct hlist_head list;
/* to serialize the list modifications and hlist_unhashed */
spinlock_t lock;
MMU_NOTIFY_RELEASE,
};
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
struct mmu_notifier_range {
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
unsigned flags;
enum mmu_notifier_event event;
};
struct mmu_notifier_ops {
/*
* Called either by mmu_notifier_unregister or when the mm is
......@@ -249,6 +228,41 @@ struct mmu_notifier {
unsigned int users;
};
/**
* struct mmu_interval_notifier_ops
* @invalidate: Upon return the caller must stop using any SPTEs within this
* range. This function can sleep. Return false only if sleeping
* was required but mmu_notifier_range_blockable(range) is false.
*/
struct mmu_interval_notifier_ops {
bool (*invalidate)(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq);
};
struct mmu_interval_notifier {
struct interval_tree_node interval_tree;
const struct mmu_interval_notifier_ops *ops;
struct mm_struct *mm;
struct hlist_node deferred_item;
unsigned long invalidate_seq;
};
#ifdef CONFIG_MMU_NOTIFIER
#ifdef CONFIG_LOCKDEP
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif
struct mmu_notifier_range {
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
unsigned flags;
enum mmu_notifier_event event;
};
static inline int mm_has_notifiers(struct mm_struct *mm)
{
return unlikely(mm->mmu_notifier_mm);
......@@ -275,6 +289,81 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
struct mm_struct *mm);
unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
struct mm_struct *mm, unsigned long start,
unsigned long length,
const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
struct mmu_interval_notifier *mni, struct mm_struct *mm,
unsigned long start, unsigned long length,
const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
/**
* mmu_interval_set_seq - Save the invalidation sequence
* @mni - The mni passed to invalidate
* @cur_seq - The cur_seq passed to the invalidate() callback
*
* This must be called unconditionally from the invalidate callback of a
* struct mmu_interval_notifier_ops under the same lock that is used to call
* mmu_interval_read_retry(). It updates the sequence number for later use by
* mmu_interval_read_retry(). The provided cur_seq will always be odd.
*
* If the caller does not call mmu_interval_read_begin() or
* mmu_interval_read_retry() then this call is not required.
*/
static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni,
unsigned long cur_seq)
{
WRITE_ONCE(mni->invalidate_seq, cur_seq);
}
/**
* mmu_interval_read_retry - End a read side critical section against a VA range
* mni: The range
* seq: The return of the paired mmu_interval_read_begin()
*
* This MUST be called under a user provided lock that is also held
* unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
*
* Each call should be paired with a single mmu_interval_read_begin() and
* should be used to conclude the read side.
*
* Returns true if an invalidation collided with this critical section, and
* the caller should retry.
*/
static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni,
unsigned long seq)
{
return mni->invalidate_seq != seq;
}
/**
* mmu_interval_check_retry - Test if a collision has occurred
* mni: The range
* seq: The return of the matching mmu_interval_read_begin()
*
* This can be used in the critical section between mmu_interval_read_begin()
* and mmu_interval_read_retry(). A return of true indicates an invalidation
* has collided with this critical region and a future
* mmu_interval_read_retry() will return true.
*
* False is not reliable and only suggests a collision may not have
* occured. It can be called many times and does not have to hold the user
* provided lock.
*
* This call can be used as part of loops and other expensive operations to
* expedite a retry.
*/
static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *mni,
unsigned long seq)
{
/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
return READ_ONCE(mni->invalidate_seq) != seq;
}
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
......
......@@ -35,11 +35,11 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
#include <linux/interval_tree.h>
struct ib_umem_odp {
struct ib_umem umem;
struct ib_ucontext_per_mm *per_mm;
struct mmu_interval_notifier notifier;
struct pid *tgid;
/*
* An array of the pages included in the on-demand paging umem.
......@@ -62,13 +62,8 @@ struct ib_umem_odp {
struct mutex umem_mutex;
void *private; /* for the HW driver to use. */
int notifiers_seq;
int notifiers_count;
int npages;
/* Tree tracking */
struct interval_tree_node interval_tree;
/*
* An implicit odp umem cannot be DMA mapped, has 0 length, and serves
* only as an anchor for the driver to hold onto the per_mm. FIXME:
......@@ -77,7 +72,6 @@ struct ib_umem_odp {
*/
bool is_implicit_odp;
struct completion notifier_completion;
unsigned int page_shift;
};
......@@ -89,13 +83,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
/* Returns the first page of an ODP umem. */
static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
{
return umem_odp->interval_tree.start;
return umem_odp->notifier.interval_tree.start;
}
/* Returns the address of the page after the last one of an ODP umem. */
static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
{
return umem_odp->interval_tree.last + 1;
return umem_odp->notifier.interval_tree.last + 1;
}
static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
......@@ -119,21 +113,15 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_ucontext_per_mm {
struct mmu_notifier mn;
struct pid *tgid;
struct rb_root_cached umem_tree;
/* Protects umem_tree */
struct rw_semaphore umem_rwsem;
};
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
size_t size, int access);
struct ib_umem_odp *
ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
int access, const struct mmu_interval_notifier_ops *ops);
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
int access);
struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem,
unsigned long addr, size_t size);
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr,
size_t size,
const struct mmu_interval_notifier_ops *ops);
void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
......@@ -143,39 +131,11 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
u64 bound);
typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end,
void *cookie);
/*
* Call the callback on each ib_umem in the range. Returns the logical or of
* the return values of the functions called.
*/
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
u64 start, u64 end,
umem_call_back cb,
bool blockable, void *cookie);
static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
unsigned long mmu_seq)
{
/*
* This code is strongly based on the KVM code from
* mmu_notifier_retry. Should be called with
* the relevant locks taken (umem_odp->umem_mutex
* and the ucontext umem_mutex semaphore locked for read).
*/
if (unlikely(umem_odp->notifiers_count))
return 1;
if (umem_odp->notifiers_seq != mmu_seq)
return 1;
return 0;
}
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata,
unsigned long addr,
size_t size, int access)
static inline struct ib_umem_odp *
ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
int access, const struct mmu_interval_notifier_ops *ops)
{
return ERR_PTR(-EINVAL);
}
......
......@@ -2451,8 +2451,6 @@ struct ib_device_ops {
u64 iova);
int (*unmap_fmr)(struct list_head *fmr_list);
int (*dealloc_fmr)(struct ib_fmr *fmr);
void (*invalidate_range)(struct ib_umem_odp *umem_odp,
unsigned long start, unsigned long end);
int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,
......
......@@ -40,7 +40,6 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
......
......@@ -284,6 +284,7 @@ config VIRT_TO_BUS
config MMU_NOTIFIER
bool
select SRCU
select INTERVAL_TREE
config KSM
bool "Enable KSM for page merging"
......@@ -674,7 +675,6 @@ config DEV_PAGEMAP_OPS
config HMM_MIRROR
bool
depends on MMU
depends on MMU_NOTIFIER
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment