Commit 8a8c600d authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma fixes from Jason Gunthorpe:
 "The usual collection of driver bug fixes, and a few regressions from
  the merge window. Nothing particularly worrisome.

   - Various missed memory frees and error unwind bugs

   - Fix regressions in a few iwarp drivers from 5.4 patches

   - A few regressions added in past kernels

   - Squash a number of races in mlx5 ODP code"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  RDMA/mlx5: Add missing synchronize_srcu() for MW cases
  RDMA/mlx5: Put live in the correct place for ODP MRs
  RDMA/mlx5: Order num_pending_prefetch properly with synchronize_srcu
  RDMA/odp: Lift umem_mutex out of ib_umem_odp_unmap_dma_pages()
  RDMA/mlx5: Fix a race with mlx5_ib_update_xlt on an implicit MR
  RDMA/mlx5: Do not allow rereg of a ODP MR
  IB/core: Fix wrong iterating on ports
  RDMA/nldev: Reshuffle the code to avoid need to rebind QP in error path
  RDMA/cxgb4: Do not dma memory off of the stack
  RDMA/cm: Fix memory leak in cm_add/remove_one
  RDMA/core: Fix an error handling path in 'res_get_common_doit()'
  RDMA/i40iw: Associate ibdev to netdev before IB device registration
  RDMA/iwcm: Fix a lock inversion issue
  RDMA/iw_cxgb4: fix SRQ access from dump_qp()
  RDMA/hfi1: Prevent memory leak in sdma_init
  RDMA/core: Fix use after free and refcnt leak on ndev in_device in iwarp_query_port
  RDMA/siw: Fix serialization issue in write_space()
  RDMA/vmw_pvrdma: Free SRQ only once
parents e60329c9 04177915
@@ -4399,6 +4399,7 @@ static void cm_add_one(struct ib_device *ib_device)
error1:
port_modify.set_port_cap_mask = 0;
port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
kfree(port);
while (--i) {
if (!rdma_cap_ib_cm(ib_device, i))
continue;
@@ -4407,6 +4408,7 @@ static void cm_add_one(struct ib_device *ib_device)
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
ib_unregister_mad_agent(port->mad_agent);
cm_remove_port_fs(port);
kfree(port);
}
free:
kfree(cm_dev);
@@ -4460,6 +4462,7 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
spin_unlock_irq(&cm.state_lock);
ib_unregister_mad_agent(cur_mad_agent);
cm_remove_port_fs(port);
kfree(port);
}
kfree(cm_dev);
......
@@ -2396,9 +2396,10 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
conn_id->cm_id.iw = NULL;
cma_exch(conn_id, RDMA_CM_DESTROYING);
mutex_unlock(&conn_id->handler_mutex);
mutex_unlock(&listen_id->handler_mutex);
cma_deref_id(conn_id);
rdma_destroy_id(&conn_id->id);
goto out;
return ret;
}
mutex_unlock(&conn_id->handler_mutex);
......
@@ -1987,8 +1987,6 @@ static int iw_query_port(struct ib_device *device,
if (!netdev)
return -ENODEV;
dev_put(netdev);
port_attr->max_mtu = IB_MTU_4096;
port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
@@ -1996,19 +1994,22 @@ static int iw_query_port(struct ib_device *device,
port_attr->state = IB_PORT_DOWN;
port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
} else {
inetdev = in_dev_get(netdev);
rcu_read_lock();
inetdev = __in_dev_get_rcu(netdev);
if (inetdev && inetdev->ifa_list) {
port_attr->state = IB_PORT_ACTIVE;
port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
in_dev_put(inetdev);
} else {
port_attr->state = IB_PORT_INIT;
port_attr->phys_state =
IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
}
rcu_read_unlock();
}
dev_put(netdev);
err = device->ops.query_port(device, port_num, port_attr);
if (err)
return err;
......
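The iw_query_port() hunks above replace the refcounted in_dev_get()/in_dev_put() pair with an RCU-protected lookup, and hold the netdev reference until after its last use instead of dropping it before reading netdev->mtu. Below is a minimal sketch of the same pattern; netdev_has_ipv4_addr() is a made-up helper name, not kernel API.

#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

/*
 * Check whether a netdev has an IPv4 address configured without
 * taking an in_device reference. __in_dev_get_rcu() is only valid
 * inside the RCU read-side critical section, so the answer is copied
 * out before unlocking.
 */
static bool netdev_has_ipv4_addr(struct net_device *ndev)
{
	struct in_device *in_dev;
	bool configured;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(ndev);
	configured = in_dev && in_dev->ifa_list;
	rcu_read_unlock();

	return configured;
}
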
@@ -1230,7 +1230,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg) {
ret = -ENOMEM;
goto err;
goto err_get;
}
nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
@@ -1787,10 +1787,6 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
if (ret)
goto err_unbind;
if (fill_nldev_handle(msg, device) ||
nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
@@ -1799,13 +1795,15 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_fill;
}
ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
if (ret)
goto err_fill;
nlmsg_end(msg, nlh);
ib_device_put(device);
return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
err_fill:
rdma_counter_bind_qpn(device, port, qpn, cntn);
err_unbind:
nlmsg_free(msg);
err:
ib_device_put(device);
......
@@ -426,7 +426,7 @@ int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
int ret;
rdma_for_each_port (dev, i) {
is_ib = rdma_protocol_ib(dev, i++);
is_ib = rdma_protocol_ib(dev, i);
if (is_ib)
break;
}
......
@@ -451,8 +451,10 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
* that the hardware will not attempt to access the MR any more.
*/
if (!umem_odp->is_implicit_odp) {
mutex_lock(&umem_odp->umem_mutex);
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
mutex_unlock(&umem_odp->umem_mutex);
kvfree(umem_odp->dma_list);
kvfree(umem_odp->page_list);
}
@@ -719,6 +721,8 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
lockdep_assert_held(&umem_odp->umem_mutex);
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
/* Note that during the run of this function, the
@@ -726,7 +730,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
* faults from completion. We might be racing with other
* invalidations, so we must make sure we free each page only
* once. */
mutex_lock(&umem_odp->umem_mutex);
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
if (umem_odp->page_list[idx]) {
@@ -757,7 +760,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
umem_odp->npages--;
}
}
mutex_unlock(&umem_odp->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
......
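The umem_odp hunks above hoist umem_mutex out of ib_umem_odp_unmap_dma_pages() so the caller can hold it across a wider critical section, and document the new contract with lockdep_assert_held(). A hedged sketch of that refactoring pattern follows; my_umem, unmap_range() and release_umem() are invented names.

#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/types.h>

/* All names here are invented for illustration. */
struct my_umem {
	struct mutex mutex;
	u64 start, end;
};

/* The helper no longer takes the lock; it asserts the caller's lock. */
static void unmap_range(struct my_umem *umem, u64 start, u64 end)
{
	lockdep_assert_held(&umem->mutex);
	/* ... walk [start, end) and unmap pages under umem->mutex ... */
}

/* Callers now choose the critical-section boundaries. */
static void release_umem(struct my_umem *umem)
{
	mutex_lock(&umem->mutex);
	unmap_range(umem, umem->start, umem->end);
	mutex_unlock(&umem->mutex);
}
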
@@ -242,10 +242,13 @@ static void set_ep_sin6_addrs(struct c4iw_ep *ep,
}
}
static int dump_qp(struct c4iw_qp *qp, struct c4iw_debugfs_data *qpd)
static int dump_qp(unsigned long id, struct c4iw_qp *qp,
struct c4iw_debugfs_data *qpd)
{
int space;
int cc;
if (id != qp->wq.sq.qid)
return 0;
space = qpd->bufsize - qpd->pos - 1;
if (space == 0)
@@ -350,7 +353,7 @@ static int qp_open(struct inode *inode, struct file *file)
xa_lock_irq(&qpd->devp->qps);
xa_for_each(&qpd->devp->qps, index, qp)
dump_qp(qp, qpd);
dump_qp(index, qp, qpd);
xa_unlock_irq(&qpd->devp->qps);
qpd->buf[qpd->pos++] = 0;
......
@@ -275,13 +275,17 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
struct sk_buff *skb, struct c4iw_wr_wait *wr_waitp)
{
int err;
struct fw_ri_tpte tpt;
struct fw_ri_tpte *tpt;
u32 stag_idx;
static atomic_t key;
if (c4iw_fatal_error(rdev))
return -EIO;
tpt = kmalloc(sizeof(*tpt), GFP_KERNEL);
if (!tpt)
return -ENOMEM;
stag_state = stag_state > 0;
stag_idx = (*stag) >> 8;
@@ -291,6 +295,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
mutex_lock(&rdev->stats.lock);
rdev->stats.stag.fail++;
mutex_unlock(&rdev->stats.lock);
kfree(tpt);
return -ENOMEM;
}
mutex_lock(&rdev->stats.lock);
@@ -305,28 +310,28 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
/* write TPT entry */
if (reset_tpt_entry)
memset(&tpt, 0, sizeof(tpt));
memset(tpt, 0, sizeof(*tpt));
else {
tpt.valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F |
tpt->valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F |
FW_RI_TPTE_STAGKEY_V((*stag & FW_RI_TPTE_STAGKEY_M)) |
FW_RI_TPTE_STAGSTATE_V(stag_state) |
FW_RI_TPTE_STAGTYPE_V(type) | FW_RI_TPTE_PDID_V(pdid));
tpt.locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) |
tpt->locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) |
(bind_enabled ? FW_RI_TPTE_MWBINDEN_F : 0) |
FW_RI_TPTE_ADDRTYPE_V((zbva ? FW_RI_ZERO_BASED_TO :
FW_RI_VA_BASED_TO))|
FW_RI_TPTE_PS_V(page_size));
tpt.nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
tpt->nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
FW_RI_TPTE_PBLADDR_V(PBL_OFF(rdev, pbl_addr)>>3));
tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
tpt.va_hi = cpu_to_be32((u32)(to >> 32));
tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
tpt.dca_mwbcnt_pstag = cpu_to_be32(0);
tpt.len_hi = cpu_to_be32((u32)(len >> 32));
tpt->len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
tpt->va_hi = cpu_to_be32((u32)(to >> 32));
tpt->va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
tpt->dca_mwbcnt_pstag = cpu_to_be32(0);
tpt->len_hi = cpu_to_be32((u32)(len >> 32));
}
err = write_adapter_mem(rdev, stag_idx +
(rdev->lldi.vr->stag.start >> 5),
sizeof(tpt), &tpt, skb, wr_waitp);
sizeof(*tpt), tpt, skb, wr_waitp);
if (reset_tpt_entry) {
c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
@@ -334,6 +339,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
rdev->stats.stag.cur -= 32;
mutex_unlock(&rdev->stats.lock);
}
kfree(tpt);
return err;
}
......
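The write_tpt_entry() conversion above from an on-stack struct to kmalloc() exists because the buffer is handed down a path that DMA-maps it, and stack memory (especially with CONFIG_VMAP_STACK) must never be given to the DMA API. A sketch of the general rule under that assumption; struct my_hw_desc and send_desc() are illustrative, not cxgb4 API.

#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/types.h>

/* Invented descriptor type standing in for something like fw_ri_tpte. */
struct my_hw_desc {
	__be32 words[16];
};

static int send_desc(struct device *dev, const struct my_hw_desc *src)
{
	struct my_hw_desc *buf;
	dma_addr_t dma;
	int ret = 0;

	/* kmalloc memory is DMA-able; an on-stack struct may not be. */
	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	*buf = *src;

	dma = dma_map_single(dev, buf, sizeof(*buf), DMA_TO_DEVICE);
	if (dma_mapping_error(dev, dma)) {
		ret = -EIO;
		goto out_free;
	}

	/* ... hand "dma" to the hardware and wait for completion ... */

	dma_unmap_single(dev, dma, sizeof(*buf), DMA_TO_DEVICE);
out_free:
	kfree(buf);
	return ret;
}
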
@@ -2737,15 +2737,11 @@ int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs,
if (CHELSIO_CHIP_VERSION(rhp->rdev.lldi.adapter_type) > CHELSIO_T6)
srq->flags = T4_SRQ_LIMIT_SUPPORT;
ret = xa_insert_irq(&rhp->qps, srq->wq.qid, srq, GFP_KERNEL);
if (ret)
goto err_free_queue;
if (udata) {
srq_key_mm = kmalloc(sizeof(*srq_key_mm), GFP_KERNEL);
if (!srq_key_mm) {
ret = -ENOMEM;
goto err_remove_handle;
goto err_free_queue;
}
srq_db_key_mm = kmalloc(sizeof(*srq_db_key_mm), GFP_KERNEL);
if (!srq_db_key_mm) {
@@ -2789,8 +2785,6 @@ int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs,
kfree(srq_db_key_mm);
err_free_srq_key_mm:
kfree(srq_key_mm);
err_remove_handle:
xa_erase_irq(&rhp->qps, srq->wq.qid);
err_free_queue:
free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
srq->wr_waitp);
@@ -2813,8 +2807,6 @@ void c4iw_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
rhp = srq->rhp;
pr_debug("%s id %d\n", __func__, srq->wq.qid);
xa_erase_irq(&rhp->qps, srq->wq.qid);
ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
ibucontext);
free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
......
@@ -1526,8 +1526,11 @@ int sdma_init(struct hfi1_devdata *dd, u8 port)
}
ret = rhashtable_init(tmp_sdma_rht, &sdma_rht_params);
if (ret < 0)
if (ret < 0) {
kfree(tmp_sdma_rht);
goto bail;
}
dd->sdma_rht = tmp_sdma_rht;
dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
......
@@ -2773,6 +2773,10 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
return -ENOMEM;
iwibdev = iwdev->iwibdev;
rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
ret = ib_device_set_netdev(&iwibdev->ibdev, iwdev->netdev, 1);
if (ret)
goto error;
ret = ib_register_device(&iwibdev->ibdev, "i40iw%d");
if (ret)
goto error;
......
@@ -1298,29 +1298,6 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
return 0;
}
static void devx_free_indirect_mkey(struct rcu_head *rcu)
{
kfree(container_of(rcu, struct devx_obj, devx_mr.rcu));
}
/* This function to delete from the radix tree needs to be called before
* destroying the underlying mkey. Otherwise a race might occur in case that
* other thread will get the same mkey before this one will be deleted,
* in that case it will fail via inserting to the tree its own data.
*
* Note:
* An error in the destroy is not expected unless there is some other indirect
* mkey which points to this one. In a kernel cleanup flow it will be just
* destroyed in the iterative destruction call. In a user flow, in case
* the application didn't close in the expected order it's its own problem,
* the mkey won't be part of the tree, in both cases the kernel is safe.
*/
static void devx_cleanup_mkey(struct devx_obj *obj)
{
xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
mlx5_base_mkey(obj->devx_mr.mmkey.key));
}
static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
struct devx_event_subscription *sub)
{
@@ -1362,8 +1339,16 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
int ret;
dev = mlx5_udata_to_mdev(&attrs->driver_udata);
if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
devx_cleanup_mkey(obj);
if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
/*
* The pagefault_single_data_segment() does commands against
* the mmkey, we must wait for that to stop before freeing the
* mkey, as another allocation could get the same mkey #.
*/
xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
mlx5_base_mkey(obj->devx_mr.mmkey.key));
synchronize_srcu(&dev->mr_srcu);
}
if (obj->flags & DEVX_OBJ_FLAGS_DCT)
ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
@@ -1382,12 +1367,6 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
devx_cleanup_subscription(dev, sub_entry);
mutex_unlock(&devx_event_table->event_xa_lock);
if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
devx_free_indirect_mkey);
return ret;
}
kfree(obj);
return ret;
}
@@ -1491,26 +1470,21 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
&obj_id);
WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
if (err)
goto obj_destroy;
}
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len);
if (err)
goto err_copy;
goto obj_destroy;
if (opcode == MLX5_CMD_OP_CREATE_GENERAL_OBJECT)
obj_type = MLX5_GET(general_obj_in_cmd_hdr, cmd_in, obj_type);
obj->obj_id = get_enc_obj_id(opcode | obj_type << 16, obj_id);
if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
if (err)
goto obj_destroy;
}
return 0;
err_copy:
if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
devx_cleanup_mkey(obj);
obj_destroy:
if (obj->flags & DEVX_OBJ_FLAGS_DCT)
mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
......
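The devx cleanup rework above drops the call_srcu() callback in favor of an explicit unpublish-then-wait sequence: erase the mkey from the xarray so no new pagefault handler can find it, synchronize_srcu() to drain handlers already inside their read-side section, and only then free. A minimal standalone sketch of that SRCU pattern; my_obj, reader() and destroy() are invented names.

#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/xarray.h>

/* Invented object type for illustration. */
struct my_obj {
	int payload;
};

/* Reader: the object is only safe to use inside the SRCU read section. */
static int reader(struct srcu_struct *srcu, struct xarray *table,
		  unsigned long id)
{
	struct my_obj *obj;
	int val = -1;
	int idx;

	idx = srcu_read_lock(srcu);
	obj = xa_load(table, id);
	if (obj)
		val = obj->payload;
	srcu_read_unlock(srcu, idx);
	return val;
}

/* Writer: unpublish, wait out readers, then free. */
static void destroy(struct srcu_struct *srcu, struct xarray *table,
		    unsigned long id, struct my_obj *obj)
{
	xa_erase(table, id);	/* no new lookups can find obj  */
	synchronize_srcu(srcu);	/* readers already inside drain */
	kfree(obj);		/* nothing can reference obj now */
}
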
@@ -606,7 +606,7 @@ struct mlx5_ib_mr {
struct mlx5_ib_dev *dev;
u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
struct mlx5_core_sig_ctx *sig;
int live;
unsigned int live;
void *descs_alloc;
int access_flags; /* Needed for rereg MR */
@@ -639,7 +639,6 @@ struct mlx5_ib_mw {
struct mlx5_ib_devx_mr {
struct mlx5_core_mkey mmkey;
int ndescs;
struct rcu_head rcu;
};
struct mlx5_ib_umr_context {
......
@@ -84,32 +84,6 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
}
static void update_odp_mr(struct mlx5_ib_mr *mr)
{
if (is_odp_mr(mr)) {
/*
* This barrier prevents the compiler from moving the
* setting of umem->odp_data->private to point to our
* MR, before reg_umr finished, to ensure that the MR
* initialization have finished before starting to
* handle invalidations.
*/
smp_wmb();
to_ib_umem_odp(mr->umem)->private = mr;
/*
* Make sure we will see the new
* umem->odp_data->private value in the invalidation
* routines, before we can get page faults on the
* MR. Page faults can happen once we put the MR in
* the tree, below this line. Without the barrier,
* there can be a fault handling and an invalidation
* before umem->odp_data->private == mr is visible to
* the invalidation handler.
*/
smp_wmb();
}
}
static void reg_mr_callback(int status, struct mlx5_async_work *context)
{
struct mlx5_ib_mr *mr =
@@ -1346,8 +1320,6 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->umem = umem;
set_mr_fields(dev, mr, npages, length, access_flags);
update_odp_mr(mr);
if (use_umr) {
int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
@@ -1363,10 +1335,12 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
}
}
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
mr->live = 1;
if (is_odp_mr(mr)) {
to_ib_umem_odp(mr->umem)->private = mr;
atomic_set(&mr->num_pending_prefetch, 0);
}
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
smp_store_release(&mr->live, 1);
return &mr->ibmr;
error:
@@ -1441,6 +1415,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
if (!mr->umem)
return -EINVAL;
if (is_odp_mr(mr))
return -EOPNOTSUPP;
if (flags & IB_MR_REREG_TRANS) {
addr = virt_addr;
len = length;
@@ -1486,8 +1463,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
}
mr->allocated_from_cache = 0;
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
mr->live = 1;
} else {
/*
* Send a UMR WQE
@@ -1516,7 +1491,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
set_mr_fields(dev, mr, npages, len, access_flags);
update_odp_mr(mr);
return 0;
err:
@@ -1607,15 +1581,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
/* Prevent new page faults and
* prefetch requests from succeeding
*/
mr->live = 0;
WRITE_ONCE(mr->live, 0);
/* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu);
/* dequeue pending prefetch requests for the mr */
if (atomic_read(&mr->num_pending_prefetch))
flush_workqueue(system_unbound_wq);
WARN_ON(atomic_read(&mr->num_pending_prefetch));
/* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu);
/* Destroy all page mappings */
if (!umem_odp->is_implicit_odp)
mlx5_ib_invalidate_range(umem_odp,
@@ -1987,14 +1962,25 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
struct mlx5_ib_dev *dev = to_mdev(mw->device);
struct mlx5_ib_mw *mmw = to_mmw(mw);
int err;
err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
&mmw->mmkey);
if (!err)
kfree(mmw);
return err;
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
xa_erase(&dev->mdev->priv.mkey_table,
mlx5_base_mkey(mmw->mmkey.key));
/*
* pagefault_single_data_segment() may be accessing mmw under
* SRCU if the user bound an ODP MR to this MW.
*/
synchronize_srcu(&dev->mr_srcu);
}
err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
if (err)
return err;
kfree(mmw);
return 0;
}
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
......
@@ -178,6 +178,29 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
return;
}
/*
* The locking here is pretty subtle. Ideally the implicit children
* list would be protected by the umem_mutex, however that is not
* possible. Instead this uses a weaker update-then-lock pattern:
*
* srcu_read_lock()
* <change children list>
* mutex_lock(umem_mutex)
* mlx5_ib_update_xlt()
* mutex_unlock(umem_mutex)
* destroy lkey
*
* ie any change the children list must be followed by the locked
* update_xlt before destroying.
*
* The umem_mutex provides the acquire/release semantic needed to make
* the children list visible to a racing thread. While SRCU is not
* technically required, using it gives consistent use of the SRCU
* locking around the children list.
*/
lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex);
lockdep_assert_held(&mr->dev->mr_srcu);
odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
nentries * MLX5_IMR_MTT_SIZE, mr);
@@ -202,15 +225,22 @@ static void mr_leaf_free_action(struct work_struct *work)
struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
int srcu_key;
mr->parent = NULL;
synchronize_srcu(&mr->dev->mr_srcu);
ib_umem_odp_release(odp);
if (imr->live)
if (smp_load_acquire(&imr->live)) {
srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
mutex_lock(&odp_imr->umem_mutex);
mlx5_ib_update_xlt(imr, idx, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT |
MLX5_IB_UPD_XLT_ATOMIC);
mutex_unlock(&odp_imr->umem_mutex);
srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
}
ib_umem_odp_release(odp);
mlx5_mr_cache_free(mr->dev, mr);
if (atomic_dec_and_test(&imr->num_leaf_free))
@@ -278,7 +308,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
idx - blk_start_idx + 1, 0,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ATOMIC);
mutex_unlock(&umem_odp->umem_mutex);
/*
* We are now sure that the device will not access the
* memory. We can safely unmap it, and mark it as dirty if
@@ -289,10 +318,12 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
if (unlikely(!umem_odp->npages && mr->parent &&
!umem_odp->dying)) {
WRITE_ONCE(umem_odp->dying, 1);
WRITE_ONCE(mr->live, 0);
umem_odp->dying = 1;
atomic_inc(&mr->parent->num_leaf_free);
schedule_work(&umem_odp->work);
}
mutex_unlock(&umem_odp->umem_mutex);
}
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
@@ -429,8 +460,6 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
mr->live = 1;
mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
mr->mmkey.key, dev->mdev, mr);
@@ -484,6 +513,8 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
mtt->parent = mr;
INIT_WORK(&odp->work, mr_leaf_free_action);
smp_store_release(&mtt->live, 1);
if (!nentries)
start_idx = addr >> MLX5_IMR_MTT_SHIFT;
nentries++;
@@ -536,6 +567,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
init_waitqueue_head(&imr->q_leaf_free);
atomic_set(&imr->num_leaf_free, 0);
atomic_set(&imr->num_pending_prefetch, 0);
smp_store_release(&imr->live, 1);
return imr;
}
@@ -555,15 +587,19 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
if (mr->parent != imr)
continue;
mutex_lock(&umem_odp->umem_mutex);
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
if (umem_odp->dying)
if (umem_odp->dying) {
mutex_unlock(&umem_odp->umem_mutex);
continue;
}
WRITE_ONCE(umem_odp->dying, 1);
umem_odp->dying = 1;
atomic_inc(&imr->num_leaf_free);
schedule_work(&umem_odp->work);
mutex_unlock(&umem_odp->umem_mutex);
}
up_read(&per_mm->umem_rwsem);
@@ -773,7 +809,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
switch (mmkey->type) {
case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
if (!mr->live || !mr->ibmr.pd) {
if (!smp_load_acquire(&mr->live) || !mr->ibmr.pd) {
mlx5_ib_dbg(dev, "got dead MR\n");
ret = -EFAULT;
goto srcu_unlock;
@@ -1641,12 +1677,12 @@ static bool num_pending_prefetch_inc(struct ib_pd *pd,
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
if (mr->ibmr.pd != pd) {
if (!smp_load_acquire(&mr->live)) {
ret = false;
break;
}
if (!mr->live) {
if (mr->ibmr.pd != pd) {
ret = false;
break;
}
......
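Across the mr.c and odp.c hunks above, mr->live becomes an unsigned int written with smp_store_release() and read with smp_load_acquire(), replacing the removed smp_wmb() barriers in update_odp_mr(): the release store publishes all prior MR initialization (such as the odp->private assignment), and a fault handler whose acquire load observes live == 1 is guaranteed to see that initialization. A hedged sketch of the pairing; sketch_mr is an invented type.

#include <asm/barrier.h>
#include <linux/stddef.h>

/* Invented type; "live" mirrors mlx5_ib_mr.live in the hunks above. */
struct sketch_mr {
	unsigned int live;
	void *private;
};

/* Publisher: complete every initialization, then release-store.
 * All writes before the release are visible to any thread whose
 * acquire load observes live == 1.
 */
static void publish_mr(struct sketch_mr *mr, void *ctx)
{
	mr->private = ctx;
	smp_store_release(&mr->live, 1);
}

/* Consumer (e.g. a page-fault handler under SRCU): the acquire load
 * pairs with the release store, so mr->private is safe to use here.
 */
static void *mr_private_if_live(struct sketch_mr *mr)
{
	if (!smp_load_acquire(&mr->live))
		return NULL;
	return mr->private;
}
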
@@ -230,8 +230,6 @@ static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq)
pvrdma_page_dir_cleanup(dev, &srq->pdir);
kfree(srq);
atomic_dec(&dev->num_srqs);
}
......
@@ -182,12 +182,19 @@ void siw_qp_llp_close(struct siw_qp *qp)
*/
void siw_qp_llp_write_space(struct sock *sk)
{
struct siw_cep *cep = sk_to_cep(sk);
struct siw_cep *cep;
cep->sk_write_space(sk);
read_lock(&sk->sk_callback_lock);
cep = sk_to_cep(sk);
if (cep) {
cep->sk_write_space(sk);
if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
(void)siw_sq_start(cep->qp);
if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
(void)siw_sq_start(cep->qp);
}
read_unlock(&sk->sk_callback_lock);
}
static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
......
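The siw write_space() fix above takes sk->sk_callback_lock before dereferencing the context stashed in the socket, since connection teardown can clear it concurrently on another CPU. A minimal sketch of that callback pattern; struct my_ctx and my_write_space() are illustrative only.

#include <net/sock.h>
#include <linux/wait.h>

struct my_ctx {
	wait_queue_head_t waitq;
};

/* A ->sk_write_space() style callback must not trust sk_user_data
 * without sk_callback_lock: the owner may be tearing the connection
 * down and clearing the pointer concurrently.
 */
static void my_write_space(struct sock *sk)
{
	struct my_ctx *ctx;

	read_lock(&sk->sk_callback_lock);
	ctx = sk->sk_user_data;
	if (ctx)
		wake_up_interruptible(&ctx->waitq); /* illustrative work */
	read_unlock(&sk->sk_callback_lock);
}
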
@@ -112,17 +112,11 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0};
u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {0};
struct xarray *mkeys = &dev->priv.mkey_table;
struct mlx5_core_mkey *deleted_mkey;
unsigned long flags;
xa_lock_irqsave(mkeys, flags);
deleted_mkey = __xa_erase(mkeys, mlx5_base_mkey(mkey->key));
__xa_erase(mkeys, mlx5_base_mkey(mkey->key));
xa_unlock_irqrestore(mkeys, flags);
if (!deleted_mkey) {
mlx5_core_dbg(dev, "failed xarray delete of mkey 0x%x\n",
mlx5_base_mkey(mkey->key));
return -ENOENT;
}
MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key));
......