Commit cd2abef6 authored by Chuck Lever, committed by Ben Hutchings

svcrdma: Tail iovec leaves an orphaned DMA mapping

commit cace564f upstream.

The ctxt's count field is overloaded to mean the number of pages in
the ctxt->page array and the number of SGEs in the ctxt->sge array.
Typically these two numbers are the same.

However, when an inline RPC reply is constructed from an xdr_buf
with a tail iovec, the head and tail often occupy the same page,
but each is DMA mapped independently. In that case, ->count equals
the number of pages, but it does not equal the number of SGEs.
There's one more SGE, for the tail iovec. Hence there is one more
DMA mapping than there are pages in the ctxt->page array.

This isn't a real problem until the server's iommu is enabled. Then
each RPC reply that has content in that iovec orphans a DMA mapping
that consists of real resources.

krb5i and krb5p always populate that tail iovec. After a couple
million sent krb5i/p RPC replies, the NFS server starts behaving
erratically. Reboot is needed to clear the problem.

Fixes: 9d11b51c ("svcrdma: Fix send_reply() scatter/gather set-up")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
[bwh: Backported to 3.16:
 - Adjust context
 - Drop changes to svc_rdma_bc_sendto()
 - s/xprt->sc_pd->local_dma_lkey/xprt->sc_dma_lkey/]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
parent 4f00b3d7
......@@ -83,6 +83,7 @@ struct svc_rdma_op_ctxt {
unsigned long flags;
enum dma_data_direction direction;
int count;
unsigned int mapped_sges;
struct ib_sge sge[RPCSVC_MAXPAGES];
struct page *pages[RPCSVC_MAXPAGES];
};
......@@ -178,6 +179,14 @@ struct svcxprt_rdma {
#define RPCRDMA_MAX_REQUESTS 16
#define RPCRDMA_MAX_REQ_SIZE 4096
/*
 * Account for one DMA mapping on behalf of a transport and an op context.
 *
 * @rdma: transport whose sc_dma_used counter is incremented
 * @ctxt: op context whose mapped_sges count is incremented
 *
 * Bumps both counters by exactly one; intended to be called once per
 * successfully created DMA mapping.
 */
static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
					   struct svc_rdma_op_ctxt *ctxt)
{
	atomic_inc(&rdma->sc_dma_used);
	ctxt->mapped_sges += 1;
}
/* svc_rdma_marshal.c */
extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
int *, int *);
......
......@@ -178,7 +178,7 @@ static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
ctxt->sge[pno].addr);
if (ret)
goto err;
atomic_inc(&xprt->sc_dma_used);
svc_rdma_count_mappings(xprt, ctxt);
/* The lkey here is either a local dma lkey or a dma_mr lkey */
ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
......
......@@ -184,7 +184,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
sge[sge_no].addr))
goto err;
atomic_inc(&xprt->sc_dma_used);
svc_rdma_count_mappings(xprt, ctxt);
sge[sge_no].lkey = xprt->sc_dma_lkey;
ctxt->count++;
sge_off = 0;
......@@ -411,7 +411,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[0].length, DMA_TO_DEVICE);
if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
goto err;
atomic_inc(&rdma->sc_dma_used);
svc_rdma_count_mappings(rdma, ctxt);
ctxt->direction = DMA_TO_DEVICE;
......@@ -427,7 +427,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
if (ib_dma_mapping_error(rdma->sc_cm_id->device,
ctxt->sge[sge_no].addr))
goto err;
atomic_inc(&rdma->sc_dma_used);
svc_rdma_count_mappings(rdma, ctxt);
ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
ctxt->sge[sge_no].length = sge_bytes;
}
......@@ -442,23 +442,9 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
ctxt->count++;
rqstp->rq_respages[page_no] = NULL;
/*
* If there are more pages than SGE, terminate SGE
* list so that svc_rdma_unmap_dma doesn't attempt to
* unmap garbage.
*/
if (page_no+1 >= sge_no)
ctxt->sge[page_no+1].length = 0;
}
rqstp->rq_next_page = rqstp->rq_respages + 1;
/* The loop above bumps sc_dma_used for each sge. The
* xdr_buf.tail gets a separate sge, but resides in the
* same page as xdr_buf.head. Don't count it twice.
*/
if (sge_no > ctxt->count)
atomic_dec(&rdma->sc_dma_used);
BUG_ON(sge_no > rdma->sc_max_sge);
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
......
......@@ -108,6 +108,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
ctxt->xprt = xprt;
INIT_LIST_HEAD(&ctxt->dto_q);
ctxt->count = 0;
ctxt->mapped_sges = 0;
ctxt->frmr = NULL;
atomic_inc(&xprt->sc_ctxt_used);
return ctxt;
......@@ -116,22 +117,27 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
{
struct svcxprt_rdma *xprt = ctxt->xprt;
int i;
for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
struct ib_device *device = xprt->sc_cm_id->device;
u32 lkey = xprt->sc_dma_lkey;
unsigned int i, count;
for (count = 0, i = 0; i < ctxt->mapped_sges; i++) {
/*
* Unmap the DMA addr in the SGE if the lkey matches
* the sc_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
atomic_dec(&xprt->sc_dma_used);
ib_dma_unmap_page(xprt->sc_cm_id->device,
if (ctxt->sge[i].lkey == lkey) {
count++;
ib_dma_unmap_page(device,
ctxt->sge[i].addr,
ctxt->sge[i].length,
ctxt->direction);
}
}
ctxt->mapped_sges = 0;
atomic_sub(count, &xprt->sc_dma_used);
}
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
......@@ -521,7 +527,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
goto err_put_ctxt;
atomic_inc(&xprt->sc_dma_used);
svc_rdma_count_mappings(xprt, ctxt);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
......@@ -1346,7 +1352,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
svc_rdma_put_context(ctxt, 1);
return;
}
atomic_inc(&xprt->sc_dma_used);
svc_rdma_count_mappings(xprt, ctxt);
ctxt->sge[0].lkey = xprt->sc_dma_lkey;
ctxt->sge[0].length = length;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment