Commit c7398583 authored by Long Li's avatar Long Li Committed by Steve French

CIFS: SMBD: Implement RDMA memory registration

Memory registration is used for transferring payload via RDMA read or write.
After I/O is done, memory registrations are recovered and reused. This
process can be time consuming and is done in a work queue.
Signed-off-by: default avatarLong Li <longli@microsoft.com>
Reviewed-by: default avatarPavel Shilovsky <pshilov@microsoft.com>
Reviewed-by: default avatarRonnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: default avatarSteve French <smfrench@gmail.com>
parent 9762c2d0
......@@ -48,6 +48,9 @@ static int smbd_post_send_page(struct smbd_connection *info,
struct page *page, unsigned long offset,
size_t size, int remaining_data_length);
static void destroy_mr_list(struct smbd_connection *info);
static int allocate_mr_list(struct smbd_connection *info);
/* SMBD version number */
#define SMBD_V1 0x0100
......@@ -198,6 +201,12 @@ static void smbd_destroy_rdma_work(struct work_struct *work)
wait_event(info->wait_send_payload_pending,
atomic_read(&info->send_payload_pending) == 0);
log_rdma_event(INFO, "freeing mr list\n");
wake_up_interruptible_all(&info->wait_mr);
wait_event(info->wait_for_mr_cleanup,
atomic_read(&info->mr_used_count) == 0);
destroy_mr_list(info);
/* It's not posssible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
do {
......@@ -453,6 +462,16 @@ static bool process_negotiation_response(
}
info->max_fragmented_send_size =
le32_to_cpu(packet->max_fragmented_size);
info->rdma_readwrite_threshold =
rdma_readwrite_threshold > info->max_fragmented_send_size ?
info->max_fragmented_send_size :
rdma_readwrite_threshold;
info->max_readwrite_size = min_t(u32,
le32_to_cpu(packet->max_readwrite_size),
info->max_frmr_depth * PAGE_SIZE);
info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
return true;
}
......@@ -748,6 +767,12 @@ static int smbd_ia_open(
rc = -EPROTONOSUPPORT;
goto out2;
}
info->max_frmr_depth = min_t(int,
smbd_max_frmr_depth,
info->id->device->attrs.max_fast_reg_page_list_len);
info->mr_type = IB_MR_TYPE_MEM_REG;
if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
info->mr_type = IB_MR_TYPE_SG_GAPS;
info->pd = ib_alloc_pd(info->id->device, 0);
if (IS_ERR(info->pd)) {
......@@ -1582,6 +1607,8 @@ struct smbd_connection *_smbd_get_connection(
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
struct ib_port_immutable port_immutable;
u32 ird_ord_hdr[2];
info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
if (!info)
......@@ -1670,6 +1697,28 @@ struct smbd_connection *_smbd_get_connection(
memset(&conn_param, 0, sizeof(conn_param));
conn_param.initiator_depth = 0;
conn_param.responder_resources =
info->id->device->attrs.max_qp_rd_atom
< SMBD_CM_RESPONDER_RESOURCES ?
info->id->device->attrs.max_qp_rd_atom :
SMBD_CM_RESPONDER_RESOURCES;
info->responder_resources = conn_param.responder_resources;
log_rdma_mr(INFO, "responder_resources=%d\n",
info->responder_resources);
/* Need to send IRD/ORD in private data for iWARP */
info->id->device->get_port_immutable(
info->id->device, info->id->port_num, &port_immutable);
if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
ird_ord_hdr[0] = info->responder_resources;
ird_ord_hdr[1] = 1;
conn_param.private_data = ird_ord_hdr;
conn_param.private_data_len = sizeof(ird_ord_hdr);
} else {
conn_param.private_data = NULL;
conn_param.private_data_len = 0;
}
conn_param.retry_count = SMBD_CM_RETRY;
conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
conn_param.flow_control = 0;
......@@ -1734,8 +1783,19 @@ struct smbd_connection *_smbd_get_connection(
goto negotiation_failed;
}
rc = allocate_mr_list(info);
if (rc) {
log_rdma_mr(ERR, "memory registration allocation failed\n");
goto allocate_mr_failed;
}
return info;
allocate_mr_failed:
/* At this point, need to a full transport shutdown */
smbd_destroy(info);
return NULL;
negotiation_failed:
cancel_delayed_work_sync(&info->idle_timer_work);
destroy_caches_and_workqueue(info);
......@@ -2189,3 +2249,364 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
return rc;
}
static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbd_mr *mr;
struct ib_cqe *cqe;
if (wc->status) {
log_rdma_mr(ERR, "status=%d\n", wc->status);
cqe = wc->wr_cqe;
mr = container_of(cqe, struct smbd_mr, cqe);
smbd_disconnect_rdma_connection(mr->conn);
}
}
/*
* The work queue function that recovers MRs
* We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
* again. Both calls are slow, so finish them in a workqueue. This will not
* block I/O path.
* There is one workqueue that recovers MRs, there is no need to lock as the
* I/O requests calling smbd_register_mr will never update the links in the
* mr_list.
*/
static void smbd_mr_recovery_work(struct work_struct *work)
{
struct smbd_connection *info =
container_of(work, struct smbd_connection, mr_recovery_work);
struct smbd_mr *smbdirect_mr;
int rc;
list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
if (smbdirect_mr->state == MR_INVALIDATED ||
smbdirect_mr->state == MR_ERROR) {
if (smbdirect_mr->state == MR_INVALIDATED) {
ib_dma_unmap_sg(
info->id->device, smbdirect_mr->sgl,
smbdirect_mr->sgl_count,
smbdirect_mr->dir);
smbdirect_mr->state = MR_READY;
} else if (smbdirect_mr->state == MR_ERROR) {
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
if (rc) {
log_rdma_mr(ERR,
"ib_dereg_mr faield rc=%x\n",
rc);
smbd_disconnect_rdma_connection(info);
}
smbdirect_mr->mr = ib_alloc_mr(
info->pd, info->mr_type,
info->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR,
"ib_alloc_mr failed mr_type=%x "
"max_frmr_depth=%x\n",
info->mr_type,
info->max_frmr_depth);
smbd_disconnect_rdma_connection(info);
}
smbdirect_mr->state = MR_READY;
}
/* smbdirect_mr->state is updated by this function
* and is read and updated by I/O issuing CPUs trying
* to get a MR, the call to atomic_inc_return
* implicates a memory barrier and guarantees this
* value is updated before waking up any calls to
* get_mr() from the I/O issuing CPUs
*/
if (atomic_inc_return(&info->mr_ready_count) == 1)
wake_up_interruptible(&info->wait_mr);
}
}
}
static void destroy_mr_list(struct smbd_connection *info)
{
struct smbd_mr *mr, *tmp;
cancel_work_sync(&info->mr_recovery_work);
list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
if (mr->state == MR_INVALIDATED)
ib_dma_unmap_sg(info->id->device, mr->sgl,
mr->sgl_count, mr->dir);
ib_dereg_mr(mr->mr);
kfree(mr->sgl);
kfree(mr);
}
}
/*
* Allocate MRs used for RDMA read/write
* The number of MRs will not exceed hardware capability in responder_resources
* All MRs are kept in mr_list. The MR can be recovered after it's used
* Recovery is done in smbd_mr_recovery_work. The content of list entry changes
* as MRs are used and recovered for I/O, but the list links will not change
*/
static int allocate_mr_list(struct smbd_connection *info)
{
int i;
struct smbd_mr *smbdirect_mr, *tmp;
INIT_LIST_HEAD(&info->mr_list);
init_waitqueue_head(&info->wait_mr);
spin_lock_init(&info->mr_list_lock);
atomic_set(&info->mr_ready_count, 0);
atomic_set(&info->mr_used_count, 0);
init_waitqueue_head(&info->wait_for_mr_cleanup);
/* Allocate more MRs (2x) than hardware responder_resources */
for (i = 0; i < info->responder_resources * 2; i++) {
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
if (!smbdirect_mr)
goto out;
smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
info->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x "
"max_frmr_depth=%x\n",
info->mr_type, info->max_frmr_depth);
goto out;
}
smbdirect_mr->sgl = kcalloc(
info->max_frmr_depth,
sizeof(struct scatterlist),
GFP_KERNEL);
if (!smbdirect_mr->sgl) {
log_rdma_mr(ERR, "failed to allocate sgl\n");
ib_dereg_mr(smbdirect_mr->mr);
goto out;
}
smbdirect_mr->state = MR_READY;
smbdirect_mr->conn = info;
list_add_tail(&smbdirect_mr->list, &info->mr_list);
atomic_inc(&info->mr_ready_count);
}
INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
return 0;
out:
kfree(smbdirect_mr);
list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
ib_dereg_mr(smbdirect_mr->mr);
kfree(smbdirect_mr->sgl);
kfree(smbdirect_mr);
}
return -ENOMEM;
}
/*
* Get a MR from mr_list. This function waits until there is at least one
* MR available in the list. It may access the list while the
* smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
* as they never modify the same places. However, there may be several CPUs
* issueing I/O trying to get MR at the same time, mr_list_lock is used to
* protect this situation.
*/
static struct smbd_mr *get_mr(struct smbd_connection *info)
{
struct smbd_mr *ret;
int rc;
again:
rc = wait_event_interruptible(info->wait_mr,
atomic_read(&info->mr_ready_count) ||
info->transport_status != SMBD_CONNECTED);
if (rc) {
log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
return NULL;
}
if (info->transport_status != SMBD_CONNECTED) {
log_rdma_mr(ERR, "info->transport_status=%x\n",
info->transport_status);
return NULL;
}
spin_lock(&info->mr_list_lock);
list_for_each_entry(ret, &info->mr_list, list) {
if (ret->state == MR_READY) {
ret->state = MR_REGISTERED;
spin_unlock(&info->mr_list_lock);
atomic_dec(&info->mr_ready_count);
atomic_inc(&info->mr_used_count);
return ret;
}
}
spin_unlock(&info->mr_list_lock);
/*
* It is possible that we could fail to get MR because other processes may
* try to acquire a MR at the same time. If this is the case, retry it.
*/
goto again;
}
/*
* Register memory for RDMA read/write
* pages[]: the list of pages to register memory with
* num_pages: the number of pages to register
* tailsz: if non-zero, the bytes to register in the last page
* writing: true if this is a RDMA write (SMB read), false for RDMA read
* need_invalidate: true if this MR needs to be locally invalidated after I/O
* return value: the MR registered, NULL if failed.
*/
struct smbd_mr *smbd_register_mr(
struct smbd_connection *info, struct page *pages[], int num_pages,
int tailsz, bool writing, bool need_invalidate)
{
struct smbd_mr *smbdirect_mr;
int rc, i;
enum dma_data_direction dir;
struct ib_reg_wr *reg_wr;
struct ib_send_wr *bad_wr;
if (num_pages > info->max_frmr_depth) {
log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
num_pages, info->max_frmr_depth);
return NULL;
}
smbdirect_mr = get_mr(info);
if (!smbdirect_mr) {
log_rdma_mr(ERR, "get_mr returning NULL\n");
return NULL;
}
smbdirect_mr->need_invalidate = need_invalidate;
smbdirect_mr->sgl_count = num_pages;
sg_init_table(smbdirect_mr->sgl, num_pages);
for (i = 0; i < num_pages - 1; i++)
sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0);
sg_set_page(&smbdirect_mr->sgl[i], pages[i],
tailsz ? tailsz : PAGE_SIZE, 0);
dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
smbdirect_mr->dir = dir;
rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir);
if (!rc) {
log_rdma_mr(INFO, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
num_pages, dir, rc);
goto dma_map_error;
}
rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages,
NULL, PAGE_SIZE);
if (rc != num_pages) {
log_rdma_mr(INFO,
"ib_map_mr_sg failed rc = %x num_pages = %x\n",
rc, num_pages);
goto map_mr_error;
}
ib_update_fast_reg_key(smbdirect_mr->mr,
ib_inc_rkey(smbdirect_mr->mr->rkey));
reg_wr = &smbdirect_mr->wr;
reg_wr->wr.opcode = IB_WR_REG_MR;
smbdirect_mr->cqe.done = register_mr_done;
reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
reg_wr->wr.num_sge = 0;
reg_wr->wr.send_flags = IB_SEND_SIGNALED;
reg_wr->mr = smbdirect_mr->mr;
reg_wr->key = smbdirect_mr->mr->rkey;
reg_wr->access = writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
/*
* There is no need for waiting for complemtion on ib_post_send
* on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
* on the next ib_post_send when we actaully send I/O to remote peer
*/
rc = ib_post_send(info->id->qp, &reg_wr->wr, &bad_wr);
if (!rc)
return smbdirect_mr;
log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
rc, reg_wr->key);
/* If all failed, attempt to recover this MR by setting it MR_ERROR*/
map_mr_error:
ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl,
smbdirect_mr->sgl_count, smbdirect_mr->dir);
dma_map_error:
smbdirect_mr->state = MR_ERROR;
if (atomic_dec_and_test(&info->mr_used_count))
wake_up(&info->wait_for_mr_cleanup);
return NULL;
}
static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbd_mr *smbdirect_mr;
struct ib_cqe *cqe;
cqe = wc->wr_cqe;
smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
smbdirect_mr->state = MR_INVALIDATED;
if (wc->status != IB_WC_SUCCESS) {
log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
smbdirect_mr->state = MR_ERROR;
}
complete(&smbdirect_mr->invalidate_done);
}
/*
* Deregister a MR after I/O is done
* This function may wait if remote invalidation is not used
* and we have to locally invalidate the buffer to prevent data is being
* modified by remote peer after upper layer consumes it
*/
int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
{
struct ib_send_wr *wr, *bad_wr;
struct smbd_connection *info = smbdirect_mr->conn;
int rc = 0;
if (smbdirect_mr->need_invalidate) {
/* Need to finish local invalidation before returning */
wr = &smbdirect_mr->inv_wr;
wr->opcode = IB_WR_LOCAL_INV;
smbdirect_mr->cqe.done = local_inv_done;
wr->wr_cqe = &smbdirect_mr->cqe;
wr->num_sge = 0;
wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
wr->send_flags = IB_SEND_SIGNALED;
init_completion(&smbdirect_mr->invalidate_done);
rc = ib_post_send(info->id->qp, wr, &bad_wr);
if (rc) {
log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
smbd_disconnect_rdma_connection(info);
goto done;
}
wait_for_completion(&smbdirect_mr->invalidate_done);
smbdirect_mr->need_invalidate = false;
} else
/*
* For remote invalidation, just set it to MR_INVALIDATED
* and defer to mr_recovery_work to recover the MR for next use
*/
smbdirect_mr->state = MR_INVALIDATED;
/*
* Schedule the work to do MR recovery for future I/Os
* MR recovery is slow and we don't want it to block the current I/O
*/
queue_work(info->workqueue, &info->mr_recovery_work);
done:
if (atomic_dec_and_test(&info->mr_used_count))
wake_up(&info->wait_for_mr_cleanup);
return rc;
}
......@@ -90,6 +90,29 @@ struct smbd_connection {
int receive_credit_target;
int fragment_reassembly_remaining;
/* Memory registrations */
/* Maximum number of RDMA read/write outstanding on this connection */
int responder_resources;
/* Maximum number of SGEs in a RDMA write/read */
int max_frmr_depth;
/*
* If payload is less than or equal to the threshold,
* use RDMA send/recv to send upper layer I/O.
* If payload is more than the threshold,
* use RDMA read/write through memory registration for I/O.
*/
int rdma_readwrite_threshold;
enum ib_mr_type mr_type;
struct list_head mr_list;
spinlock_t mr_list_lock;
/* The number of available MRs ready for memory registration */
atomic_t mr_ready_count;
atomic_t mr_used_count;
wait_queue_head_t wait_mr;
struct work_struct mr_recovery_work;
/* Used by transport to wait until all MRs are returned */
wait_queue_head_t wait_for_mr_cleanup;
/* Activity accoutning */
/* Pending reqeusts issued from upper layer */
int smbd_send_pending;
......@@ -262,6 +285,36 @@ void smbd_destroy(struct smbd_connection *info);
int smbd_recv(struct smbd_connection *info, struct msghdr *msg);
int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst);
enum mr_state {
MR_READY,
MR_REGISTERED,
MR_INVALIDATED,
MR_ERROR
};
struct smbd_mr {
struct smbd_connection *conn;
struct list_head list;
enum mr_state state;
struct ib_mr *mr;
struct scatterlist *sgl;
int sgl_count;
enum dma_data_direction dir;
union {
struct ib_reg_wr wr;
struct ib_send_wr inv_wr;
};
struct ib_cqe cqe;
bool need_invalidate;
struct completion invalidate_done;
};
/* Interfaces to register and deregister MR for RDMA read/write */
struct smbd_mr *smbd_register_mr(
struct smbd_connection *info, struct page *pages[], int num_pages,
int tailsz, bool writing, bool need_invalidate);
int smbd_deregister_mr(struct smbd_mr *mr);
#else
#define cifs_rdma_enabled(server) 0
struct smbd_connection {};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment