Commit fb985e27 authored by Jason Gunthorpe

RDMA/mlx5: Use SRCU properly in ODP prefetch

When working with SRCU protected xarrays the xarray itself should be the
SRCU 'update' point. Instead prefetch is using live as the SRCU update
point and this prevents switching the locking design to use the xarray
instead.

To solve this the prefetch must only read from the xarray once, and hold
on to the actual MR pointer for the duration of the async
operation. Incrementing num_pending_prefetch delays destruction of the MR,
so it is suitable.

Prefetch calls directly to the pagefault_mr using the MR pointer and only
does a single xarray lookup.

All the testing if a MR is prefetchable or not is now done only in the
prefetch code and removed from the pagefault critical path.

Link: https://lore.kernel.org/r/20191009160934.3143-2-jgg@ziepe.ca
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent d6d5df1d
...@@ -606,16 +606,13 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) ...@@ -606,16 +606,13 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
} }
#define MLX5_PF_FLAGS_PREFETCH BIT(0)
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1) #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
u64 io_virt, size_t bcnt, u32 *bytes_mapped, u32 *bytes_mapped, u32 flags)
u32 flags)
{ {
int npages = 0, current_seq, page_shift, ret, np; int npages = 0, current_seq, page_shift, ret, np;
struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
u64 access_mask; u64 access_mask;
u64 start_idx, page_mask; u64 start_idx, page_mask;
struct ib_umem_odp *odp; struct ib_umem_odp *odp;
...@@ -639,14 +636,6 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, ...@@ -639,14 +636,6 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT; access_mask = ODP_READ_ALLOWED_BIT;
if (prefetch && !downgrade && !odp->umem.writable) {
/* prefetch with write-access must
* be supported by the MR
*/
ret = -EINVAL;
goto out;
}
if (odp->umem.writable && !downgrade) if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT; access_mask |= ODP_WRITE_ALLOWED_BIT;
...@@ -681,7 +670,8 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, ...@@ -681,7 +670,8 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
if (ret < 0) { if (ret < 0) {
if (ret != -EAGAIN) if (ret != -EAGAIN)
mlx5_ib_err(dev, "Failed to update mkey page tables\n"); mlx5_ib_err(mr->dev,
"Failed to update mkey page tables\n");
goto out; goto out;
} }
...@@ -700,7 +690,9 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, ...@@ -700,7 +690,9 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
io_virt += size; io_virt += size;
next = odp_next(odp); next = odp_next(odp);
if (unlikely(!next || ib_umem_start(next) != io_virt)) { if (unlikely(!next || ib_umem_start(next) != io_virt)) {
mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", mlx5_ib_dbg(
mr->dev,
"next implicit leaf removed at 0x%llx. got %p\n",
io_virt, next); io_virt, next);
return -EAGAIN; return -EAGAIN;
} }
...@@ -718,7 +710,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, ...@@ -718,7 +710,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
if (!wait_for_completion_timeout(&odp->notifier_completion, if (!wait_for_completion_timeout(&odp->notifier_completion,
timeout)) { timeout)) {
mlx5_ib_warn( mlx5_ib_warn(
dev, mr->dev,
"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n", "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
current_seq, odp->notifiers_seq, current_seq, odp->notifiers_seq,
odp->notifiers_count); odp->notifiers_count);
...@@ -775,10 +767,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, ...@@ -775,10 +767,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
struct ib_pd *pd, u32 key, struct ib_pd *pd, u32 key,
u64 io_virt, size_t bcnt, u64 io_virt, size_t bcnt,
u32 *bytes_committed, u32 *bytes_committed,
u32 *bytes_mapped, u32 flags) u32 *bytes_mapped)
{ {
int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
struct pf_frame *head = NULL, *frame; struct pf_frame *head = NULL, *frame;
struct mlx5_core_mkey *mmkey; struct mlx5_core_mkey *mmkey;
struct mlx5_ib_mr *mr; struct mlx5_ib_mr *mr;
...@@ -800,12 +791,6 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, ...@@ -800,12 +791,6 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
goto srcu_unlock; goto srcu_unlock;
} }
if (prefetch && mmkey->type != MLX5_MKEY_MR) {
mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
ret = -EINVAL;
goto srcu_unlock;
}
switch (mmkey->type) { switch (mmkey->type) {
case MLX5_MKEY_MR: case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
...@@ -815,17 +800,6 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, ...@@ -815,17 +800,6 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
goto srcu_unlock; goto srcu_unlock;
} }
if (prefetch) {
if (!is_odp_mr(mr) ||
mr->ibmr.pd != pd) {
mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n",
is_odp_mr(mr) ? "MR is not ODP" :
"PD is not of the MR");
ret = -EINVAL;
goto srcu_unlock;
}
}
if (!is_odp_mr(mr)) { if (!is_odp_mr(mr)) {
mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
key); key);
...@@ -835,7 +809,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, ...@@ -835,7 +809,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
goto srcu_unlock; goto srcu_unlock;
} }
ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags); ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
if (ret < 0) if (ret < 0)
goto srcu_unlock; goto srcu_unlock;
...@@ -1009,7 +983,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, ...@@ -1009,7 +983,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
ret = pagefault_single_data_segment(dev, NULL, key, ret = pagefault_single_data_segment(dev, NULL, key,
io_virt, bcnt, io_virt, bcnt,
&pfault->bytes_committed, &pfault->bytes_committed,
bytes_mapped, 0); bytes_mapped);
if (ret < 0) if (ret < 0)
break; break;
npages += ret; npages += ret;
...@@ -1292,8 +1266,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, ...@@ -1292,8 +1266,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
} }
ret = pagefault_single_data_segment(dev, NULL, rkey, address, length, ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
&pfault->bytes_committed, NULL, &pfault->bytes_committed, NULL);
0);
if (ret == -EAGAIN) { if (ret == -EAGAIN) {
/* We're racing with an invalidation, don't prefetch */ /* We're racing with an invalidation, don't prefetch */
prefetch_activated = 0; prefetch_activated = 0;
...@@ -1320,8 +1293,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, ...@@ -1320,8 +1293,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
ret = pagefault_single_data_segment(dev, NULL, rkey, address, ret = pagefault_single_data_segment(dev, NULL, rkey, address,
prefetch_len, prefetch_len,
&bytes_committed, NULL, &bytes_committed, NULL);
0);
if (ret < 0 && ret != -EAGAIN) { if (ret < 0 && ret != -EAGAIN) {
mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
ret, pfault->token, address, prefetch_len); ret, pfault->token, address, prefetch_len);
...@@ -1624,114 +1596,138 @@ int mlx5_ib_odp_init(void) ...@@ -1624,114 +1596,138 @@ int mlx5_ib_odp_init(void)
struct prefetch_mr_work { struct prefetch_mr_work {
struct work_struct work; struct work_struct work;
struct ib_pd *pd;
u32 pf_flags; u32 pf_flags;
u32 num_sge; u32 num_sge;
struct ib_sge sg_list[0]; struct {
u64 io_virt;
struct mlx5_ib_mr *mr;
size_t length;
} frags[];
}; };
static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev, static void destroy_prefetch_work(struct prefetch_mr_work *work)
struct ib_sge *sg_list, u32 num_sge,
u32 from)
{ {
u32 i; u32 i;
int srcu_key;
srcu_key = srcu_read_lock(&dev->mr_srcu);
for (i = from; i < num_sge; ++i) {
struct mlx5_core_mkey *mmkey;
struct mlx5_ib_mr *mr;
mmkey = xa_load(&dev->mdev->priv.mkey_table,
mlx5_base_mkey(sg_list[i].lkey));
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
atomic_dec(&mr->num_pending_prefetch);
}
srcu_read_unlock(&dev->mr_srcu, srcu_key); for (i = 0; i < work->num_sge; ++i)
atomic_dec(&work->frags[i].mr->num_pending_prefetch);
kvfree(work);
} }
static bool num_pending_prefetch_inc(struct ib_pd *pd, static struct mlx5_ib_mr *
struct ib_sge *sg_list, u32 num_sge) get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
u32 lkey)
{ {
struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_dev *dev = to_mdev(pd->device);
bool ret = true;
u32 i;
for (i = 0; i < num_sge; ++i) {
struct mlx5_core_mkey *mmkey; struct mlx5_core_mkey *mmkey;
struct ib_umem_odp *odp;
struct mlx5_ib_mr *mr; struct mlx5_ib_mr *mr;
mmkey = xa_load(&dev->mdev->priv.mkey_table, lockdep_assert_held(&dev->mr_srcu);
mlx5_base_mkey(sg_list[i].lkey));
if (!mmkey || mmkey->key != sg_list[i].lkey) {
ret = false;
break;
}
if (mmkey->type != MLX5_MKEY_MR) { mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(lkey));
ret = false; if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
break; return NULL;
}
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
if (!smp_load_acquire(&mr->live)) { if (!smp_load_acquire(&mr->live))
ret = false; return NULL;
break;
}
if (mr->ibmr.pd != pd) { if (mr->ibmr.pd != pd || !is_odp_mr(mr))
ret = false; return NULL;
break;
}
atomic_inc(&mr->num_pending_prefetch); /*
} * Implicit child MRs are internal and userspace should not refer to
* them.
*/
if (mr->parent)
return NULL;
if (!ret) odp = to_ib_umem_odp(mr->umem);
num_pending_prefetch_dec(dev, sg_list, i, 0);
return ret; /* prefetch with write-access must be supported by the MR */
if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
!odp->umem.writable)
return NULL;
return mr;
}
static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
{
struct prefetch_mr_work *work =
container_of(w, struct prefetch_mr_work, work);
u32 bytes_mapped = 0;
u32 i;
for (i = 0; i < work->num_sge; ++i)
pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
work->frags[i].length, &bytes_mapped,
work->pf_flags);
destroy_prefetch_work(work);
} }
static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags, static bool init_prefetch_work(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
u32 pf_flags, struct prefetch_mr_work *work,
struct ib_sge *sg_list, u32 num_sge) struct ib_sge *sg_list, u32 num_sge)
{ {
u32 i; u32 i;
int ret = 0;
struct mlx5_ib_dev *dev = to_mdev(pd->device);
for (i = 0; i < num_sge; ++i) { INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
struct ib_sge *sg = &sg_list[i]; work->pf_flags = pf_flags;
int bytes_committed = 0;
ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr, for (i = 0; i < num_sge; ++i) {
sg->length, work->frags[i].io_virt = sg_list[i].addr;
&bytes_committed, NULL, work->frags[i].length = sg_list[i].length;
pf_flags); work->frags[i].mr =
if (ret < 0) get_prefetchable_mr(pd, advice, sg_list[i].lkey);
break; if (!work->frags[i].mr) {
work->num_sge = i - 1;
if (i)
destroy_prefetch_work(work);
return false;
} }
return ret < 0 ? ret : 0; /* Keep the MR pointer valid outside the SRCU */
atomic_inc(&work->frags[i].mr->num_pending_prefetch);
}
work->num_sge = num_sge;
return true;
} }
static void mlx5_ib_prefetch_mr_work(struct work_struct *work) static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
u32 pf_flags, struct ib_sge *sg_list,
u32 num_sge)
{ {
struct prefetch_mr_work *w = struct mlx5_ib_dev *dev = to_mdev(pd->device);
container_of(work, struct prefetch_mr_work, work); u32 bytes_mapped = 0;
int srcu_key;
int ret = 0;
u32 i;
if (ib_device_try_get(w->pd->device)) { srcu_key = srcu_read_lock(&dev->mr_srcu);
mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list, for (i = 0; i < num_sge; ++i) {
w->num_sge); struct mlx5_ib_mr *mr;
ib_device_put(w->pd->device);
mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
if (!mr) {
ret = -ENOENT;
goto out;
} }
ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
&bytes_mapped, pf_flags);
if (ret < 0)
goto out;
}
ret = 0;
num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list, out:
w->num_sge, 0); srcu_read_unlock(&dev->mr_srcu, srcu_key);
kvfree(w); return ret;
} }
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
...@@ -1739,43 +1735,27 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, ...@@ -1739,43 +1735,27 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
u32 flags, struct ib_sge *sg_list, u32 num_sge) u32 flags, struct ib_sge *sg_list, u32 num_sge)
{ {
struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_dev *dev = to_mdev(pd->device);
u32 pf_flags = MLX5_PF_FLAGS_PREFETCH; u32 pf_flags = 0;
struct prefetch_mr_work *work; struct prefetch_mr_work *work;
bool valid_req;
int srcu_key; int srcu_key;
if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list, return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
num_sge); num_sge);
work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL); work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
if (!work) if (!work)
return -ENOMEM; return -ENOMEM;
memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
/* It is guaranteed that the pd when work is executed is the pd when
* work was queued since pd can't be destroyed while it holds MRs and
* destroying a MR leads to flushing the workquque
*/
work->pd = pd;
work->pf_flags = pf_flags;
work->num_sge = num_sge;
INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
srcu_key = srcu_read_lock(&dev->mr_srcu); srcu_key = srcu_read_lock(&dev->mr_srcu);
if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge); srcu_read_unlock(&dev->mr_srcu, srcu_key);
if (valid_req) return -EINVAL;
}
queue_work(system_unbound_wq, &work->work); queue_work(system_unbound_wq, &work->work);
else
kvfree(work);
srcu_read_unlock(&dev->mr_srcu, srcu_key); srcu_read_unlock(&dev->mr_srcu, srcu_key);
return 0;
return valid_req ? 0 : -EINVAL;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment