Commit 5256edcb authored by Jason Gunthorpe

RDMA/mlx5: Rework implicit ODP destroy

Use SRCU in a sensible way by removing all MRs in the implicit tree from
the two xarrays (the update operation), then a synchronize, followed by a
normal single threaded teardown.

This is only a little unusual from the normal pattern as there can still
be some work pending in the unbound wq that may also require a workqueue
flush. This is tracked with a single atomic, consolidating the redundant
existing atomics and wait queue.

For understandability, the entire ODP implicit create/destroy flow now
largely exists in a single pair of functions within odp.c, with a few
support functions for tearing down an unused child.

Link: https://lore.kernel.org/r/20191009160934.3143-13-jgg@ziepe.ca
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent b70d785d
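
For orientation, the teardown ordering this patch establishes can be condensed into the sketch below. It mirrors the new mlx5_ib_free_implicit_mr() in odp.c from the diff that follows; the standalone function name is illustrative only, and the final freeing of the imr itself and its umem is omitted:

/*
 * Condensed sketch of the implicit ODP destroy flow added by this patch
 * (illustrative only -- the authoritative code is mlx5_ib_free_implicit_mr()
 * in the odp.c hunks below).
 */
static void implicit_odp_destroy_sketch(struct mlx5_ib_mr *imr)
{
        struct mlx5_ib_dev *dev = imr->dev;
        struct mlx5_ib_mr *mtt, *tmp;
        LIST_HEAD(destroy_list);
        unsigned long idx;

        /* 1. Unpublish the imr itself, then fence the SRCU page fault path */
        xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
        synchronize_srcu(&dev->odp_srcu);

        /* 2. The "update": unpublish every child from both xarrays */
        xa_lock(&imr->implicit_children);
        xa_for_each(&imr->implicit_children, idx, mtt) {
                __xa_erase(&imr->implicit_children, idx);
                __xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key));
                list_add(&mtt->odp_destroy.elm, &destroy_list);
        }
        xa_unlock(&imr->implicit_children);

        /* 3. Synchronize, then flush any deferred work still on the wq */
        synchronize_srcu(&dev->odp_srcu);
        if (atomic_read(&imr->num_deferred_work))
                flush_workqueue(system_unbound_wq);

        /* 4. Normal single threaded teardown of the collected children */
        list_for_each_entry_safe(mtt, tmp, &destroy_list, odp_destroy.elm)
                free_implicit_child_mr(mtt, false);
}
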
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6146,8 +6146,6 @@ static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
 	WARN_ON(!xa_empty(&dev->odp_mkeys));
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-		srcu_barrier(&dev->odp_srcu);
 	cleanup_srcu_struct(&dev->odp_srcu);
 
 	WARN_ON(!xa_empty(&dev->sig_mrs));
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -618,10 +618,13 @@ struct mlx5_ib_mr {
 	u64			pi_iova;
 
 	/* For ODP and implicit */
-	atomic_t		num_leaf_free;
-	wait_queue_head_t	q_leaf_free;
-	atomic_t		num_pending_prefetch;
+	atomic_t		num_deferred_work;
 	struct xarray		implicit_children;
+	union {
+		struct rcu_head rcu;
+		struct list_head elm;
+		struct work_struct work;
+	} odp_destroy;
 
 	struct mlx5_async_work	cb_work;
 };
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1317,7 +1317,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (is_odp_mr(mr)) {
 		to_ib_umem_odp(mr->umem)->private = mr;
-		atomic_set(&mr->num_pending_prefetch, 0);
+		atomic_set(&mr->num_deferred_work, 0);
 		err = xa_err(xa_store(&dev->odp_mkeys,
 				      mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
 				      GFP_KERNEL));
@@ -1573,17 +1573,15 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		synchronize_srcu(&dev->odp_srcu);
 		/* dequeue pending prefetch requests for the mr */
-		if (atomic_read(&mr->num_pending_prefetch))
+		if (atomic_read(&mr->num_deferred_work)) {
 			flush_workqueue(system_unbound_wq);
-		WARN_ON(atomic_read(&mr->num_pending_prefetch));
+			WARN_ON(atomic_read(&mr->num_deferred_work));
+		}
 
 		/* Destroy all page mappings */
-		if (!umem_odp->is_implicit_odp)
-			mlx5_ib_invalidate_range(umem_odp,
-						 ib_umem_start(umem_odp),
-						 ib_umem_end(umem_odp));
-		else
-			mlx5_ib_free_implicit_mr(mr);
+		mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem_odp),
+					 ib_umem_end(umem_odp));
 		/*
 		 * We kill the umem before the MR for ODP,
 		 * so that there will not be any invalidations in
@@ -1620,6 +1618,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 		dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
 	}
 
+	if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
+		mlx5_ib_free_implicit_mr(mmr);
+		return 0;
+	}
+
 	dereg_mr(to_mdev(ibmr->device), mmr);
 
 	return 0;
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -144,31 +144,79 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	}
 }
 
-static void mr_leaf_free_action(struct work_struct *work)
+/*
+ * This must be called after the mr has been removed from implicit_children
+ * and odp_mkeys and the SRCU synchronized. NOTE: The MR does not necessarily
+ * have to be empty here, parallel page faults could have raced with the free
+ * process and added pages to it.
+ */
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 {
-	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
-	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+	struct mlx5_ib_mr *imr = mr->parent;
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	int srcu_key;
 
-	mr->parent = NULL;
-	synchronize_srcu(&mr->dev->odp_srcu);
+	/* implicit_child_mr's are not allowed to have deferred work */
+	WARN_ON(atomic_read(&mr->num_deferred_work));
 
-	if (xa_load(&mr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key))) {
+	if (need_imr_xlt) {
 		srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
 		mutex_lock(&odp_imr->umem_mutex);
-		mlx5_ib_update_xlt(imr, idx, 1, 0,
+		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
 		mutex_unlock(&odp_imr->umem_mutex);
 		srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
 	}
-	ib_umem_odp_release(odp);
+
+	mr->parent = NULL;
 	mlx5_mr_cache_free(mr->dev, mr);
+	ib_umem_odp_release(odp);
+	atomic_dec(&imr->num_deferred_work);
+}
+
+static void free_implicit_child_mr_work(struct work_struct *work)
+{
+	struct mlx5_ib_mr *mr =
+		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+
+	free_implicit_child_mr(mr, true);
+}
+
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
+{
+	struct mlx5_ib_mr *mr =
+		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
+
+	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
+	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+	queue_work(system_unbound_wq, &mr->odp_destroy.work);
+}
+
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
+{
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+	struct mlx5_ib_mr *imr = mr->parent;
 
-	if (atomic_dec_and_test(&imr->num_leaf_free))
-		wake_up(&imr->q_leaf_free);
+	xa_lock(&imr->implicit_children);
+	/*
+	 * This can race with mlx5_ib_free_implicit_mr(), the first one to
+	 * reach the xa lock wins the race and destroys the MR.
+	 */
+	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
+	    mr)
+		goto out_unlock;
+
+	__xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+	atomic_inc(&imr->num_deferred_work);
+	call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
+		  free_implicit_child_mr_rcu);
+
+out_unlock:
+	xa_unlock(&imr->implicit_children);
 }
 
 void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
@@ -240,15 +288,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem_odp->npages && mr->parent &&
-		     !umem_odp->dying)) {
-		xa_erase(&mr->parent->implicit_children,
-			 ib_umem_start(umem_odp) >> MLX5_IMR_MTT_SHIFT);
-		xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
-		umem_odp->dying = 1;
-		atomic_inc(&mr->parent->num_leaf_free);
-		schedule_work(&umem_odp->work);
-	}
+	if (unlikely(!umem_odp->npages && mr->parent))
+		destroy_unused_implicit_child_mr(mr);
+
 	mutex_unlock(&umem_odp->umem_mutex);
 }
 
@@ -375,7 +416,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
 	mr->parent = imr;
 	odp->private = mr;
-	INIT_WORK(&odp->work, mr_leaf_free_action);
 
 	err = mlx5_ib_update_xlt(mr, 0,
 				 MLX5_IMR_MTT_ENTRIES,
@@ -391,7 +431,11 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	 * Once the store to either xarray completes any error unwind has to
 	 * use synchronize_srcu(). Avoid this with xa_reserve()
 	 */
-	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, GFP_KERNEL);
+	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+			 GFP_KERNEL);
+	if (likely(!ret))
+		xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+			 &mr->mmkey, GFP_ATOMIC);
 	if (unlikely(ret)) {
 		if (xa_is_err(ret)) {
 			ret = ERR_PTR(xa_err(ret));
@@ -404,9 +448,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 		goto out_release;
 	}
 
-	xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-		 &mr->mmkey, GFP_ATOMIC);
-
 	mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
 	return mr;
 
@@ -445,9 +486,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	imr->ibmr.lkey = imr->mmkey.key;
 	imr->ibmr.rkey = imr->mmkey.key;
 	imr->umem = &umem_odp->umem;
-	init_waitqueue_head(&imr->q_leaf_free);
-	atomic_set(&imr->num_leaf_free, 0);
-	atomic_set(&imr->num_pending_prefetch, 0);
+	atomic_set(&imr->num_deferred_work, 0);
 	xa_init(&imr->implicit_children);
 
 	err = mlx5_ib_update_xlt(imr, 0,
@@ -477,35 +516,48 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	struct mlx5_ib_dev *dev = imr->dev;
+	struct list_head destroy_list;
 	struct mlx5_ib_mr *mtt;
+	struct mlx5_ib_mr *tmp;
 	unsigned long idx;
 
-	mutex_lock(&odp_imr->umem_mutex);
-	xa_for_each (&imr->implicit_children, idx, mtt) {
-		struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem);
+	INIT_LIST_HEAD(&destroy_list);
 
-		xa_erase(&imr->implicit_children, idx);
+	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
+	/*
+	 * This stops the SRCU protected page fault path from touching either
+	 * the imr or any children. The page fault path can only reach the
+	 * children xarray via the imr.
+	 */
+	synchronize_srcu(&dev->odp_srcu);
 
-		mutex_lock(&umem_odp->umem_mutex);
-		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
-					    ib_umem_end(umem_odp));
+	xa_lock(&imr->implicit_children);
+	xa_for_each (&imr->implicit_children, idx, mtt) {
+		__xa_erase(&imr->implicit_children, idx);
+		__xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key));
+		list_add(&mtt->odp_destroy.elm, &destroy_list);
+	}
+	xa_unlock(&imr->implicit_children);
 
-		if (umem_odp->dying) {
-			mutex_unlock(&umem_odp->umem_mutex);
-			continue;
-		}
+	/* Fence access to the child pointers via the pagefault thread */
+	synchronize_srcu(&dev->odp_srcu);
 
-		umem_odp->dying = 1;
-		atomic_inc(&imr->num_leaf_free);
-		schedule_work(&umem_odp->work);
-		mutex_unlock(&umem_odp->umem_mutex);
+	/*
+	 * num_deferred_work can only be incremented inside the odp_srcu, or
+	 * under xa_lock while the child is in the xarray. Thus at this point
+	 * it is only decreasing, and all work holding it is now on the wq.
+	 */
+	if (atomic_read(&imr->num_deferred_work)) {
+		flush_workqueue(system_unbound_wq);
+		WARN_ON(atomic_read(&imr->num_deferred_work));
 	}
-	mutex_unlock(&odp_imr->umem_mutex);
 
-	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
-	WARN_ON(!xa_empty(&imr->implicit_children));
-	/* Remove any left over reserved elements */
-	xa_destroy(&imr->implicit_children);
+	list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+		free_implicit_child_mr(mtt, false);
+
+	mlx5_mr_cache_free(dev, imr);
+	ib_umem_odp_release(odp_imr);
 }
 
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
@@ -1579,7 +1631,7 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
 	u32 i;
 
 	for (i = 0; i < work->num_sge; ++i)
-		atomic_dec(&work->frags[i].mr->num_pending_prefetch);
+		atomic_dec(&work->frags[i].mr->num_deferred_work);
 	kvfree(work);
 }
 
@@ -1658,7 +1710,7 @@ static bool init_prefetch_work(struct ib_pd *pd,
 		}
 
 		/* Keep the MR pointer will valid outside the SRCU */
-		atomic_inc(&work->frags[i].mr->num_pending_prefetch);
+		atomic_inc(&work->frags[i].mr->num_deferred_work);
 	}
 	work->num_sge = num_sge;
 	return true;
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -78,9 +78,7 @@ struct ib_umem_odp {
 	bool is_implicit_odp;
 
 	struct completion	notifier_completion;
-	int			dying;
 	unsigned int		page_shift;
-	struct work_struct	work;
 };