Commit 1b7dbc26, authored by Artemy Kovalyov, committed by Doug Ledford

IB/mlx5: Extract page fault code

To make the page fault handling code more flexible, split the
pagefault_single_data_segment() function: keep the MR resolution in
pagefault_single_data_segment() and move the actual page-table updates
into the new pagefault_mr() helper.
Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
parent 0008b84e
...@@ -511,81 +511,38 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) ...@@ -511,81 +511,38 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
} }
/* static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
* Handle a single data segment in a page-fault WQE or RDMA region. u64 io_virt, size_t bcnt, u32 *bytes_mapped)
*
* Returns number of OS pages retrieved on success. The caller may continue to
* the next data segment.
* Can return the following error codes:
* -EAGAIN to designate a temporary error. The caller will abort handling the
* page fault and resolve it.
* -EFAULT when there's an error mapping the requested pages. The caller will
* abort the page fault handling.
*/
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
u32 key, u64 io_virt, size_t bcnt,
u32 *bytes_committed,
u32 *bytes_mapped)
{ {
int srcu_key;
unsigned int current_seq = 0;
u64 start_idx, page_mask;
int npages = 0, ret = 0;
struct mlx5_ib_mr *mr;
u64 access_mask = ODP_READ_ALLOWED_BIT; u64 access_mask = ODP_READ_ALLOWED_BIT;
int npages = 0, page_shift, np;
u64 start_idx, page_mask;
struct ib_umem_odp *odp; struct ib_umem_odp *odp;
int implicit = 0; int current_seq;
size_t size; size_t size;
int page_shift; int ret;
srcu_key = srcu_read_lock(&dev->mr_srcu);
mr = mlx5_ib_odp_find_mr_lkey(dev, key);
/*
* If we didn't find the MR, it means the MR was closed while we were
* handling the ODP event. In this case we return -EFAULT so that the
* QP will be closed.
*/
if (!mr || !mr->ibmr.pd) {
mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
key);
ret = -EFAULT;
goto srcu_unlock;
}
if (!mr->umem->odp_data) {
mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
key);
if (bytes_mapped)
*bytes_mapped +=
(bcnt - *bytes_committed);
goto srcu_unlock;
}
/*
* Avoid branches - this code will perform correctly
* in all iterations (in iteration 2 and above,
* bytes_committed == 0).
*/
io_virt += *bytes_committed;
bcnt -= *bytes_committed;
if (!mr->umem->odp_data->page_list) { if (!mr->umem->odp_data->page_list) {
odp = implicit_mr_get_data(mr, io_virt, bcnt); odp = implicit_mr_get_data(mr, io_virt, bcnt);
if (IS_ERR(odp)) { if (IS_ERR(odp))
ret = PTR_ERR(odp); return PTR_ERR(odp);
goto srcu_unlock;
}
mr = odp->private; mr = odp->private;
implicit = 1;
} else { } else {
odp = mr->umem->odp_data; odp = mr->umem->odp_data;
} }
next_mr:
size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
page_shift = mr->umem->page_shift; page_shift = mr->umem->page_shift;
page_mask = ~(BIT(page_shift) - 1); page_mask = ~(BIT(page_shift) - 1);
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
if (mr->umem->writable)
access_mask |= ODP_WRITE_ALLOWED_BIT;
next_mr:
current_seq = READ_ONCE(odp->notifiers_seq); current_seq = READ_ONCE(odp->notifiers_seq);
/* /*
* Ensure the sequence number is valid for some time before we call * Ensure the sequence number is valid for some time before we call
...@@ -593,20 +550,13 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, ...@@ -593,20 +550,13 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
*/ */
smp_rmb(); smp_rmb();
size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
if (mr->umem->writable)
access_mask |= ODP_WRITE_ALLOWED_BIT;
ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
access_mask, current_seq); access_mask, current_seq);
if (ret < 0) if (ret < 0)
goto srcu_unlock; goto out;
if (ret > 0) { np = ret;
int np = ret;
mutex_lock(&odp->umem_mutex); mutex_lock(&odp->umem_mutex);
if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
...@@ -616,28 +566,27 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, ...@@ -616,28 +566,27 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
* checks this. * checks this.
*/ */
ret = mlx5_ib_update_xlt(mr, start_idx, np, ret = mlx5_ib_update_xlt(mr, start_idx, np,
page_shift, page_shift, MLX5_IB_UPD_XLT_ATOMIC);
MLX5_IB_UPD_XLT_ATOMIC);
} else { } else {
ret = -EAGAIN; ret = -EAGAIN;
} }
mutex_unlock(&odp->umem_mutex); mutex_unlock(&odp->umem_mutex);
if (ret < 0) { if (ret < 0) {
if (ret != -EAGAIN) if (ret != -EAGAIN)
mlx5_ib_err(dev, "Failed to update mkey page tables\n"); mlx5_ib_err(dev, "Failed to update mkey page tables\n");
goto srcu_unlock; goto out;
} }
if (bytes_mapped) { if (bytes_mapped) {
u32 new_mappings = (np << page_shift) - u32 new_mappings = (np << page_shift) -
(io_virt - round_down(io_virt, (io_virt - round_down(io_virt, 1 << page_shift));
1 << page_shift));
*bytes_mapped += min_t(u32, new_mappings, size); *bytes_mapped += min_t(u32, new_mappings, size);
} }
npages += np << (page_shift - PAGE_SHIFT); npages += np << (page_shift - PAGE_SHIFT);
}
bcnt -= size; bcnt -= size;
if (unlikely(bcnt)) { if (unlikely(bcnt)) {
struct ib_umem_odp *next; struct ib_umem_odp *next;
...@@ -646,17 +595,18 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, ...@@ -646,17 +595,18 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
if (unlikely(!next || next->umem->address != io_virt)) { if (unlikely(!next || next->umem->address != io_virt)) {
mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
io_virt, next); io_virt, next);
ret = -EAGAIN; return -EAGAIN;
goto srcu_unlock_no_wait;
} }
odp = next; odp = next;
mr = odp->private; mr = odp->private;
goto next_mr; goto next_mr;
} }
srcu_unlock: return npages;
out:
if (ret == -EAGAIN) { if (ret == -EAGAIN) {
if (implicit || !odp->dying) { if (mr->parent || !odp->dying) {
unsigned long timeout = unsigned long timeout =
msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
...@@ -672,7 +622,62 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, ...@@ -672,7 +622,62 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
} }
} }
srcu_unlock_no_wait: return ret;
}
/*
* Handle a single data segment in a page-fault WQE or RDMA region.
*
* Returns number of OS pages retrieved on success. The caller may continue to
* the next data segment.
* Can return the following error codes:
* -EAGAIN to designate a temporary error. The caller will abort handling the
* page fault and resolve it.
* -EFAULT when there's an error mapping the requested pages. The caller will
* abort the page fault handling.
*/
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
u32 key, u64 io_virt, size_t bcnt,
u32 *bytes_committed,
u32 *bytes_mapped)
{
int npages = 0, srcu_key, ret;
struct mlx5_ib_mr *mr;
size_t size;
srcu_key = srcu_read_lock(&dev->mr_srcu);
mr = mlx5_ib_odp_find_mr_lkey(dev, key);
/*
* If we didn't find the MR, it means the MR was closed while we were
* handling the ODP event. In this case we return -EFAULT so that the
* QP will be closed.
*/
if (!mr || !mr->ibmr.pd) {
mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
key);
ret = -EFAULT;
goto srcu_unlock;
}
if (!mr->umem->odp_data) {
mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
key);
if (bytes_mapped)
*bytes_mapped +=
(bcnt - *bytes_committed);
goto srcu_unlock;
}
/*
* Avoid branches - this code will perform correctly
* in all iterations (in iteration 2 and above,
* bytes_committed == 0).
*/
io_virt += *bytes_committed;
bcnt -= *bytes_committed;
npages = pagefault_mr(dev, mr, io_virt, size, bytes_mapped);
srcu_unlock:
srcu_read_unlock(&dev->mr_srcu, srcu_key); srcu_read_unlock(&dev->mr_srcu, srcu_key);
*bytes_committed = 0; *bytes_committed = 0;
return ret ? ret : npages; return ret ? ret : npages;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment