Commit 32cf5a4e authored by Chuck Lever's avatar Chuck Lever

Revert "svcrdma: Add Write chunk WRs to the RPC's Send WR chain"

Performance regression reported with NFS/RDMA using Omnipath,
bisected to commit e084ee67 ("svcrdma: Add Write chunk WRs to
the RPC's Send WR chain").

Tracing on the server reports:

  nfsd-7771  [060]  1758.891809: svcrdma_sq_post_err:
	cq.id=205 cid=226 sc_sq_avail=13643/851 status=-12

sq_post_err reports ENOMEM, and the rdma->sc_sq_avail (13643) is
larger than rdma->sc_sq_depth (851). The number of available Send
Queue entries is always supposed to be smaller than the Send Queue
depth. That seems like a Send Queue accounting bug in svcrdma.

As it's getting to be late in the 6.9-rc cycle, revert this commit.
It can be revisited in a subsequent kernel release.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=218743
Fixes: e084ee67 ("svcrdma: Add Write chunk WRs to the RPC's Send WR chain")
Signed-off-by: default avatarChuck Lever <chuck.lever@oracle.com>
parent f488138b
...@@ -210,7 +210,6 @@ struct svc_rdma_recv_ctxt { ...@@ -210,7 +210,6 @@ struct svc_rdma_recv_ctxt {
*/ */
struct svc_rdma_write_info { struct svc_rdma_write_info {
struct svcxprt_rdma *wi_rdma; struct svcxprt_rdma *wi_rdma;
struct list_head wi_list;
const struct svc_rdma_chunk *wi_chunk; const struct svc_rdma_chunk *wi_chunk;
...@@ -239,10 +238,7 @@ struct svc_rdma_send_ctxt { ...@@ -239,10 +238,7 @@ struct svc_rdma_send_ctxt {
struct ib_cqe sc_cqe; struct ib_cqe sc_cqe;
struct xdr_buf sc_hdrbuf; struct xdr_buf sc_hdrbuf;
struct xdr_stream sc_stream; struct xdr_stream sc_stream;
struct list_head sc_write_info_list;
struct svc_rdma_write_info sc_reply_info; struct svc_rdma_write_info sc_reply_info;
void *sc_xprt_buf; void *sc_xprt_buf;
int sc_page_count; int sc_page_count;
int sc_cur_sge_no; int sc_cur_sge_no;
...@@ -274,14 +270,11 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, ...@@ -274,14 +270,11 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
struct svc_rdma_chunk_ctxt *cc, struct svc_rdma_chunk_ctxt *cc,
enum dma_data_direction dir); enum dma_data_direction dir);
extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt);
extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt); struct svc_rdma_send_ctxt *ctxt);
extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_recv_ctxt *rctxt,
struct svc_rdma_send_ctxt *sctxt, const struct xdr_buf *xdr);
const struct xdr_buf *xdr);
extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *write_pcl,
const struct svc_rdma_pcl *reply_pcl, const struct svc_rdma_pcl *reply_pcl,
......
...@@ -230,28 +230,6 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) ...@@ -230,28 +230,6 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
queue_work(svcrdma_wq, &info->wi_work); queue_work(svcrdma_wq, &info->wi_work);
} }
/**
* svc_rdma_write_chunk_release - Release Write chunk I/O resources
* @rdma: controlling transport
* @ctxt: Send context that is being released
*/
void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt)
{
struct svc_rdma_write_info *info;
struct svc_rdma_chunk_ctxt *cc;
while (!list_empty(&ctxt->sc_write_info_list)) {
info = list_first_entry(&ctxt->sc_write_info_list,
struct svc_rdma_write_info, wi_list);
list_del(&info->wi_list);
cc = &info->wi_cc;
svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
svc_rdma_write_info_free(info);
}
}
/** /**
* svc_rdma_reply_chunk_release - Release Reply chunk I/O resources * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources
* @rdma: controlling transport * @rdma: controlling transport
...@@ -308,11 +286,13 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) ...@@ -308,11 +286,13 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
struct ib_cqe *cqe = wc->wr_cqe; struct ib_cqe *cqe = wc->wr_cqe;
struct svc_rdma_chunk_ctxt *cc = struct svc_rdma_chunk_ctxt *cc =
container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
struct svc_rdma_write_info *info =
container_of(cc, struct svc_rdma_write_info, wi_cc);
switch (wc->status) { switch (wc->status) {
case IB_WC_SUCCESS: case IB_WC_SUCCESS:
trace_svcrdma_wc_write(&cc->cc_cid); trace_svcrdma_wc_write(&cc->cc_cid);
return; break;
case IB_WC_WR_FLUSH_ERR: case IB_WC_WR_FLUSH_ERR:
trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); trace_svcrdma_wc_write_flush(wc, &cc->cc_cid);
break; break;
...@@ -320,11 +300,12 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) ...@@ -320,11 +300,12 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
trace_svcrdma_wc_write_err(wc, &cc->cc_cid); trace_svcrdma_wc_write_err(wc, &cc->cc_cid);
} }
/* The RDMA Write has flushed, so the client won't get svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
* some of the outgoing RPC message. Signal the loss
* to the client by closing the connection. if (unlikely(wc->status != IB_WC_SUCCESS))
*/ svc_xprt_deferred_close(&rdma->sc_xprt);
svc_xprt_deferred_close(&rdma->sc_xprt);
svc_rdma_write_info_free(info);
} }
/** /**
...@@ -620,19 +601,13 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) ...@@ -620,19 +601,13 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
return xdr->len; return xdr->len;
} }
/* Link Write WRs for @chunk onto @sctxt's WR chain. static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
*/ const struct svc_rdma_chunk *chunk,
static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, const struct xdr_buf *xdr)
struct svc_rdma_send_ctxt *sctxt,
const struct svc_rdma_chunk *chunk,
const struct xdr_buf *xdr)
{ {
struct svc_rdma_write_info *info; struct svc_rdma_write_info *info;
struct svc_rdma_chunk_ctxt *cc; struct svc_rdma_chunk_ctxt *cc;
struct ib_send_wr *first_wr;
struct xdr_buf payload; struct xdr_buf payload;
struct list_head *pos;
struct ib_cqe *cqe;
int ret; int ret;
if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position,
...@@ -648,25 +623,10 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, ...@@ -648,25 +623,10 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma,
if (ret != payload.len) if (ret != payload.len)
goto out_err; goto out_err;
ret = -EINVAL;
if (unlikely(cc->cc_sqecount > rdma->sc_sq_depth))
goto out_err;
first_wr = sctxt->sc_wr_chain;
cqe = &cc->cc_cqe;
list_for_each(pos, &cc->cc_rwctxts) {
struct svc_rdma_rw_ctxt *rwc;
rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
rdma->sc_port_num, cqe, first_wr);
cqe = NULL;
}
sctxt->sc_wr_chain = first_wr;
sctxt->sc_sqecount += cc->cc_sqecount;
list_add(&info->wi_list, &sctxt->sc_write_info_list);
trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
ret = svc_rdma_post_chunk_ctxt(rdma, cc);
if (ret < 0)
goto out_err;
return 0; return 0;
out_err: out_err:
...@@ -675,27 +635,25 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, ...@@ -675,27 +635,25 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma,
} }
/** /**
* svc_rdma_prepare_write_list - Construct WR chain for sending Write list * svc_rdma_send_write_list - Send all chunks on the Write list
* @rdma: controlling RDMA transport * @rdma: controlling RDMA transport
* @write_pcl: Write list provisioned by the client * @rctxt: Write list provisioned by the client
* @sctxt: Send WR resources
* @xdr: xdr_buf containing an RPC Reply message * @xdr: xdr_buf containing an RPC Reply message
* *
* Returns zero on success, or a negative errno if one or more * Returns zero on success, or a negative errno if one or more
* Write chunks could not be sent. * Write chunks could not be sent.
*/ */
int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_recv_ctxt *rctxt,
struct svc_rdma_send_ctxt *sctxt, const struct xdr_buf *xdr)
const struct xdr_buf *xdr)
{ {
struct svc_rdma_chunk *chunk; struct svc_rdma_chunk *chunk;
int ret; int ret;
pcl_for_each_chunk(chunk, write_pcl) { pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
if (!chunk->ch_payload_length) if (!chunk->ch_payload_length)
break; break;
ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); ret = svc_rdma_send_write_chunk(rdma, chunk, xdr);
if (ret < 0) if (ret < 0)
return ret; return ret;
} }
......
...@@ -142,7 +142,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ...@@ -142,7 +142,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
ctxt->sc_cqe.done = svc_rdma_wc_send; ctxt->sc_cqe.done = svc_rdma_wc_send;
INIT_LIST_HEAD(&ctxt->sc_write_info_list);
ctxt->sc_xprt_buf = buffer; ctxt->sc_xprt_buf = buffer;
xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
rdma->sc_max_req_size); rdma->sc_max_req_size);
...@@ -228,7 +227,6 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, ...@@ -228,7 +227,6 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
struct ib_device *device = rdma->sc_cm_id->device; struct ib_device *device = rdma->sc_cm_id->device;
unsigned int i; unsigned int i;
svc_rdma_write_chunk_release(rdma, ctxt);
svc_rdma_reply_chunk_release(rdma, ctxt); svc_rdma_reply_chunk_release(rdma, ctxt);
if (ctxt->sc_page_count) if (ctxt->sc_page_count)
...@@ -1015,8 +1013,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ...@@ -1015,8 +1013,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (!p) if (!p)
goto put_ctxt; goto put_ctxt;
ret = svc_rdma_prepare_write_list(rdma, &rctxt->rc_write_pcl, sctxt, ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res);
&rqstp->rq_res);
if (ret < 0) if (ret < 0)
goto put_ctxt; goto put_ctxt;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment