Commit 972a2bf7 authored by Linus Torvalds

Merge tag 'nfs-for-5.4-1' of git://git.linux-nfs.org/projects/anna/linux-nfs

Pull NFS client updates from Anna Schumaker:
 "Stable bugfixes:
   - Dequeue the request from the receive queue while we're re-encoding
     # v4.20+
   - Fix buffer handling of GSS MIC without slack # 5.1

  Features:
   - Increase xprtrdma maximum transport header and slot table sizes
   - Add support for nfs4_call_sync() calls using a custom
     rpc_task_struct
   - Optimize the default readahead size
   - Enable pNFS filelayout LAYOUTGET on OPEN

  Other bugfixes and cleanups:
   - Fix possible null-pointer dereferences and memory leaks
   - Various NFS over RDMA cleanups
   - Various NFS over RDMA comment updates
   - Don't receive TCP data into a reset request buffer
   - Don't try to parse incomplete RPC messages
   - Fix congestion window race with disconnect
   - Clean up pNFS return-on-close error handling
   - Fixes for NFS4ERR_OLD_STATEID handling"

* tag 'nfs-for-5.4-1' of git://git.linux-nfs.org/projects/anna/linux-nfs: (53 commits)
  pNFS/filelayout: enable LAYOUTGET on OPEN
  NFS: Optimise the default readahead size
  NFSv4: Handle NFS4ERR_OLD_STATEID in LOCKU
  NFSv4: Handle NFS4ERR_OLD_STATEID in CLOSE/OPEN_DOWNGRADE
  NFSv4: Fix OPEN_DOWNGRADE error handling
  pNFS: Handle NFS4ERR_OLD_STATEID on layoutreturn by bumping the state seqid
  NFSv4: Add a helper to increment stateid seqids
  NFSv4: Handle RPC level errors in LAYOUTRETURN
  NFSv4: Handle NFS4ERR_DELAY correctly in return-on-close
  NFSv4: Clean up pNFS return-on-close error handling
  pNFS: Ensure we do clear the return-on-close layout stateid on fatal errors
  NFS: remove unused check for negative dentry
  NFSv3: use nfs_add_or_obtain() to create and reference inodes
  NFS: Refactor nfs_instantiate() for dentry referencing callers
  SUNRPC: Fix congestion window race with disconnect
  SUNRPC: Don't try to parse incomplete RPC messages
  SUNRPC: Rename xdr_buf_read_netobj to xdr_buf_read_mic
  SUNRPC: Fix buffer handling of GSS MIC without slack
  SUNRPC: RPC level errors should always set task->tk_rpc_status
  SUNRPC: Don't receive TCP data into a request buffer that has been reset
  ...
parents 7be3cb01 a8fd0fee
......@@ -1669,10 +1669,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
#endif /* CONFIG_NFSV4 */
/*
* Code common to create, mkdir, and mknod.
*/
int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
struct dentry *
nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
struct nfs_fattr *fattr,
struct nfs4_label *label)
{
......@@ -1680,13 +1678,10 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
struct inode *dir = d_inode(parent);
struct inode *inode;
struct dentry *d;
int error = -EACCES;
int error;
d_drop(dentry);
/* We may have been initialized further down */
if (d_really_is_positive(dentry))
goto out;
if (fhandle->size == 0) {
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
if (error)
......@@ -1702,18 +1697,32 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
}
inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
d = d_splice_alias(inode, dentry);
if (IS_ERR(d)) {
error = PTR_ERR(d);
goto out_error;
}
dput(d);
out:
dput(parent);
return 0;
return d;
out_error:
nfs_mark_for_revalidate(dir);
dput(parent);
return error;
d = ERR_PTR(error);
goto out;
}
EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
/*
* Code common to create, mkdir, and mknod.
*/
int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
struct nfs_fattr *fattr,
struct nfs4_label *label)
{
struct dentry *d;
d = nfs_add_or_obtain(dentry, fhandle, fattr, label);
if (IS_ERR(d))
return PTR_ERR(d);
/* Callers don't care */
dput(d);
return 0;
}
EXPORT_SYMBOL_GPL(nfs_instantiate);
......
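The hunk above splits the old nfs_instantiate() into nfs_add_or_obtain(), which hands back the dentry that was actually used (possibly an alias from d_splice_alias(), possibly NULL), while nfs_instantiate() remains a thin wrapper for callers that only want a status. A minimal caller sketch of the new convention follows; the function name is hypothetical and the fhandle/fattr are assumed to be already filled in, mirroring the NFSv3 callers further down.

static int example_do_create(struct dentry *dentry, struct nfs_fh *fh,
			     struct nfs_fattr *fattr)
{
	struct dentry *d;

	d = nfs_add_or_obtain(dentry, fh, fattr, NULL);
	if (IS_ERR(d))
		return PTR_ERR(d);

	/* d is NULL when the passed-in dentry was used as-is; otherwise it
	 * is the alias that was actually instantiated. */
	if (d)
		dentry = d;

	/* ... operate on d_inode(dentry) ... */

	dput(d);	/* dput(NULL) is a no-op */
	return 0;
}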
......@@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
.owner = THIS_MODULE,
.flags = PNFS_LAYOUTGET_ON_OPEN,
.max_layoutget_response = 4096, /* 1 page or so... */
.alloc_layout_hdr = filelayout_alloc_layout_hdr,
.free_layout_hdr = filelayout_free_layout_hdr,
......
......@@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops;
struct nfs_string;
/* Maximum number of readahead requests
* FIXME: this should really be a sysctl so that users may tune it to suit
* their needs. People that do NFS over a slow network, might for
* instance want to reduce it to something closer to 1 for improved
* interactive response.
*/
#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
{
if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
......
......@@ -279,15 +279,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void)
return data;
}
static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
static struct dentry *
nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
{
int status;
status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
nfs_post_op_update_inode(dir, data->res.dir_attr);
if (status == 0)
status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
return status;
if (status != 0)
return ERR_PTR(status);
return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL);
}
static void nfs3_free_createdata(struct nfs3_createdata *data)
......@@ -304,6 +306,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
struct dentry *d_alias;
int status = -ENOMEM;
dprintk("NFS call create %pd\n", dentry);
......@@ -330,7 +333,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
goto out;
for (;;) {
status = nfs3_do_create(dir, dentry, data);
d_alias = nfs3_do_create(dir, dentry, data);
status = PTR_ERR_OR_ZERO(d_alias);
if (status != -ENOTSUPP)
break;
......@@ -355,6 +359,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
if (status != 0)
goto out_release_acls;
if (d_alias)
dentry = d_alias;
/* When we created the file with exclusive semantics, make
* sure we set the attributes afterwards. */
if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
......@@ -372,11 +379,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
dprintk("NFS reply setattr (post-create): %d\n", status);
if (status != 0)
goto out_release_acls;
goto out_dput;
}
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
out_dput:
dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
......@@ -504,6 +513,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
unsigned int len, struct iattr *sattr)
{
struct nfs3_createdata *data;
struct dentry *d_alias;
int status = -ENOMEM;
if (len > NFS3_MAXPATHLEN)
......@@ -522,7 +532,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
data->arg.symlink.pathlen = len;
data->arg.symlink.sattr = sattr;
status = nfs3_do_create(dir, dentry, data);
d_alias = nfs3_do_create(dir, dentry, data);
status = PTR_ERR_OR_ZERO(d_alias);
if (status == 0)
dput(d_alias);
nfs3_free_createdata(data);
out:
......@@ -535,6 +549,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
struct dentry *d_alias;
int status = -ENOMEM;
dprintk("NFS call mkdir %pd\n", dentry);
......@@ -553,12 +568,18 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
data->arg.mkdir.len = dentry->d_name.len;
data->arg.mkdir.sattr = sattr;
status = nfs3_do_create(dir, dentry, data);
d_alias = nfs3_do_create(dir, dentry, data);
status = PTR_ERR_OR_ZERO(d_alias);
if (status != 0)
goto out_release_acls;
if (d_alias)
dentry = d_alias;
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
......@@ -660,6 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
struct dentry *d_alias;
int status = -ENOMEM;
dprintk("NFS call mknod %pd %u:%u\n", dentry,
......@@ -698,12 +720,17 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
goto out;
}
status = nfs3_do_create(dir, dentry, data);
d_alias = nfs3_do_create(dir, dentry, data);
status = PTR_ERR_OR_ZERO(d_alias);
if (status != 0)
goto out_release_acls;
if (d_alias)
dentry = d_alias;
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
......
......@@ -491,8 +491,6 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
const struct nfs_lock_context *, nfs4_stateid *,
const struct cred **);
extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst,
struct nfs4_state *state);
extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
struct nfs4_state *state);
......@@ -574,6 +572,15 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat
return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
}
static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1)
{
u32 seqid = be32_to_cpu(s1->seqid);
if (++seqid == 0)
++seqid;
s1->seqid = cpu_to_be32(seqid);
}
static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
{
return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
......
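The new nfs4_stateid_seqid_inc() helper above increments a stateid's big-endian seqid in place and skips zero on wraparound, since a seqid of zero carries special meaning in NFSv4.1 and must never be produced by an increment. A small self-contained sketch of that wraparound behaviour, outside the kernel, using htonl()/ntohl() as stand-ins for cpu_to_be32()/be32_to_cpu():

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Same logic as the helper: increment, but never yield seqid 0. */
static void stateid_seqid_inc(uint32_t *be_seqid)
{
	uint32_t seqid = ntohl(*be_seqid);

	if (++seqid == 0)
		++seqid;
	*be_seqid = htonl(seqid);
}

int main(void)
{
	uint32_t seqid = htonl(0xffffffffu);

	stateid_seqid_inc(&seqid);
	printf("next seqid: %u\n", ntohl(seqid));	/* prints 1, not 0 */
	return 0;
}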
This diff is collapsed.
......@@ -1015,22 +1015,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
return ret;
}
bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
{
bool ret;
int seq;
do {
ret = false;
seq = read_seqbegin(&state->seqlock);
if (nfs4_state_match_open_stateid_other(state, dst)) {
dst->seqid = state->open_stateid.seqid;
ret = true;
}
} while (read_seqretry(&state->seqlock, seq));
return ret;
}
bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
{
bool ret;
......@@ -2095,8 +2079,10 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
}
status = nfs4_begin_drain_session(clp);
if (status != 0)
return status;
if (status != 0) {
result = status;
goto out;
}
status = nfs4_replace_transport(server, locations);
if (status != 0) {
......
......@@ -1174,7 +1174,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
} else
*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) {
*p++ = cpu_to_be32(label->lfs);
*p++ = cpu_to_be32(label->pi);
*p++ = cpu_to_be32(label->len);
......
......@@ -359,9 +359,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
}
/*
* Update the seqid of a layout stateid
* Update the seqid of a layout stateid after receiving
* NFS4ERR_OLD_STATEID
*/
bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
struct pnfs_layout_range *dst_range,
struct inode *inode)
{
......@@ -377,7 +378,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
spin_lock(&inode->i_lock);
lo = NFS_I(inode)->layout;
if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
if (lo && pnfs_layout_is_valid(lo) &&
nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
/* Is our call using the most recent seqid? If so, bump it */
if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
nfs4_stateid_seqid_inc(dst);
ret = true;
goto out;
}
/* Try to update the seqid to the most recent */
err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
if (err != -EBUSY) {
dst->seqid = lo->plh_stateid.seqid;
......@@ -385,6 +394,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
ret = true;
}
}
out:
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
return ret;
......@@ -1440,6 +1450,52 @@ bool pnfs_roc(struct inode *ino,
return false;
}
int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
struct nfs4_layoutreturn_args **argpp,
struct nfs4_layoutreturn_res **respp,
int *ret)
{
struct nfs4_layoutreturn_args *arg = *argpp;
int retval = -EAGAIN;
if (!arg)
return 0;
/* Handle Layoutreturn errors */
switch (*ret) {
case 0:
retval = 0;
break;
case -NFS4ERR_NOMATCHING_LAYOUT:
/* Was there an RPC level error? If not, retry */
if (task->tk_rpc_status == 0)
break;
/* If the call was not sent, let caller handle it */
if (!RPC_WAS_SENT(task))
return 0;
/*
* Otherwise, assume the call succeeded and
* that we need to release the layout
*/
*ret = 0;
(*respp)->lrs_present = 0;
retval = 0;
break;
case -NFS4ERR_DELAY:
/* Let the caller handle the retry */
*ret = -NFS4ERR_NOMATCHING_LAYOUT;
return 0;
case -NFS4ERR_OLD_STATEID:
if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
&arg->range, inode))
break;
*ret = -NFS4ERR_NOMATCHING_LAYOUT;
return -EAGAIN;
}
*argpp = NULL;
*respp = NULL;
return retval;
}
void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
int ret)
......@@ -1449,10 +1505,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
const nfs4_stateid *res_stateid = NULL;
struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
if (ret == 0) {
arg_stateid = &args->stateid;
switch (ret) {
case -NFS4ERR_NOMATCHING_LAYOUT:
break;
case 0:
if (res->lrs_present)
res_stateid = &res->stateid;
/* Fallthrough */
default:
arg_stateid = &args->stateid;
}
pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
res_stateid);
......
......@@ -261,7 +261,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
bool is_recall);
int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
bool is_recall);
bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
struct pnfs_layout_range *dst_range,
struct inode *inode);
void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
......@@ -282,6 +282,10 @@ bool pnfs_roc(struct inode *ino,
struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
const struct cred *cred);
int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
struct nfs4_layoutreturn_args **argpp,
struct nfs4_layoutreturn_res **respp,
int *ret);
void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
int ret);
......@@ -701,6 +705,15 @@ pnfs_roc(struct inode *ino,
return false;
}
static inline int
pnfs_roc_done(struct rpc_task *task, struct inode *inode,
struct nfs4_layoutreturn_args **argpp,
struct nfs4_layoutreturn_res **respp,
int *ret)
{
return 0;
}
static inline void
pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
......@@ -785,7 +798,7 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
{
}
static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
struct pnfs_layout_range *dst_range,
struct inode *inode)
{
......
......@@ -2645,6 +2645,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
}
EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
static void nfs_set_readahead(struct backing_dev_info *bdi,
unsigned long iomax_pages)
{
bdi->ra_pages = VM_READAHEAD_PAGES;
bdi->io_pages = iomax_pages;
}
struct dentry *nfs_fs_mount_common(struct nfs_server *server,
int flags, const char *dev_name,
struct nfs_mount_info *mount_info,
......@@ -2687,7 +2694,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
mntroot = ERR_PTR(error);
goto error_splat_super;
}
s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
nfs_set_readahead(s->s_bdi, server->rpages);
server->super = s;
}
......
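For a sense of scale in the readahead change above: the old default multiplied the rsize-derived server->rpages by NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1), while nfs_set_readahead() now applies the VM default of VM_READAHEAD_PAGES. A rough worked example, assuming 4 KiB pages, a 1 MiB rsize, RPC_DEF_SLOT_TABLE = 16 and VM_READAHEAD_PAGES = SZ_128K / PAGE_SIZE; these values are illustrative, not guaranteed for every configuration:

#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;		/* assumed */
	const unsigned long rsize = 1024 * 1024;	/* assumed mount rsize */
	const unsigned long rpages = rsize / page_size;	/* 256 */

	/* Old default: server->rpages * (RPC_DEF_SLOT_TABLE - 1). */
	unsigned long old_ra = rpages * (16 - 1);

	/* New default: 128 KiB worth of pages, independent of rsize. */
	unsigned long new_ra = (128 * 1024) / page_size;

	printf("old ra_pages=%lu (%lu KiB), new ra_pages=%lu (%lu KiB)\n",
	       old_ra, old_ra * page_size / 1024,
	       new_ra, new_ra * page_size / 1024);
	/* old ra_pages=3840 (15360 KiB), new ra_pages=32 (128 KiB) */
	return 0;
}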
......@@ -490,6 +490,9 @@ extern const struct file_operations nfs_dir_operations;
extern const struct dentry_operations nfs_dentry_operations;
extern void nfs_force_lookup_revalidate(struct inode *dir);
extern struct dentry *nfs_add_or_obtain(struct dentry *dentry,
struct nfs_fh *fh, struct nfs_fattr *fattr,
struct nfs4_label *label);
extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh,
struct nfs_fattr *fattr, struct nfs4_label *label);
extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags);
......
......@@ -242,9 +242,6 @@ void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *queue,
void rpc_sleep_on_priority(struct rpc_wait_queue *,
struct rpc_task *,
int priority);
void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
struct rpc_wait_queue *queue,
struct rpc_task *task);
void rpc_wake_up_queued_task(struct rpc_wait_queue *,
struct rpc_task *);
void rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *,
......
......@@ -186,7 +186,7 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p)
extern void xdr_shift_buf(struct xdr_buf *, size_t);
extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, unsigned int);
extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int);
extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
......
......@@ -352,6 +352,7 @@ bool xprt_prepare_transmit(struct rpc_task *task);
void xprt_request_enqueue_transmit(struct rpc_task *task);
void xprt_request_enqueue_receive(struct rpc_task *task);
void xprt_request_wait_receive(struct rpc_task *task);
void xprt_request_dequeue_xprt(struct rpc_task *task);
bool xprt_request_need_retransmit(struct rpc_task *task);
void xprt_transmit(struct rpc_task *task);
void xprt_end_transmit(struct rpc_task *task);
......
......@@ -49,9 +49,9 @@
* fully-chunked NFS message (read chunks are the largest). Note only
* a single chunk type per message is supported currently.
*/
#define RPCRDMA_MIN_SLOT_TABLE (2U)
#define RPCRDMA_MIN_SLOT_TABLE (4U)
#define RPCRDMA_DEF_SLOT_TABLE (128U)
#define RPCRDMA_MAX_SLOT_TABLE (256U)
#define RPCRDMA_MAX_SLOT_TABLE (16384U)
#define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */
#define RPCRDMA_DEF_INLINE (4096) /* default inline thresh */
......
......@@ -451,20 +451,81 @@ TRACE_EVENT(xprtrdma_createmrs,
TP_STRUCT__entry(
__field(const void *, r_xprt)
__string(addr, rpcrdma_addrstr(r_xprt))
__string(port, rpcrdma_portstr(r_xprt))
__field(unsigned int, count)
),
TP_fast_assign(
__entry->r_xprt = r_xprt;
__entry->count = count;
__assign_str(addr, rpcrdma_addrstr(r_xprt));
__assign_str(port, rpcrdma_portstr(r_xprt));
),
TP_printk("r_xprt=%p: created %u MRs",
__entry->r_xprt, __entry->count
TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs",
__get_str(addr), __get_str(port), __entry->r_xprt,
__entry->count
)
);
DEFINE_RXPRT_EVENT(xprtrdma_nomrs);
TRACE_EVENT(xprtrdma_mr_get,
TP_PROTO(
const struct rpcrdma_req *req
),
TP_ARGS(req),
TP_STRUCT__entry(
__field(const void *, req)
__field(unsigned int, task_id)
__field(unsigned int, client_id)
__field(u32, xid)
),
TP_fast_assign(
const struct rpc_rqst *rqst = &req->rl_slot;
__entry->req = req;
__entry->task_id = rqst->rq_task->tk_pid;
__entry->client_id = rqst->rq_task->tk_client->cl_clid;
__entry->xid = be32_to_cpu(rqst->rq_xid);
),
TP_printk("task:%u@%u xid=0x%08x req=%p",
__entry->task_id, __entry->client_id, __entry->xid,
__entry->req
)
);
TRACE_EVENT(xprtrdma_nomrs,
TP_PROTO(
const struct rpcrdma_req *req
),
TP_ARGS(req),
TP_STRUCT__entry(
__field(const void *, req)
__field(unsigned int, task_id)
__field(unsigned int, client_id)
__field(u32, xid)
),
TP_fast_assign(
const struct rpc_rqst *rqst = &req->rl_slot;
__entry->req = req;
__entry->task_id = rqst->rq_task->tk_pid;
__entry->client_id = rqst->rq_task->tk_client->cl_clid;
__entry->xid = be32_to_cpu(rqst->rq_xid);
),
TP_printk("task:%u@%u xid=0x%08x req=%p",
__entry->task_id, __entry->client_id, __entry->xid,
__entry->req
)
);
DEFINE_RDCH_EVENT(read);
DEFINE_WRCH_EVENT(write);
......@@ -623,21 +684,21 @@ TRACE_EVENT(xprtrdma_post_send,
TRACE_EVENT(xprtrdma_post_recv,
TP_PROTO(
const struct ib_cqe *cqe
const struct rpcrdma_rep *rep
),
TP_ARGS(cqe),
TP_ARGS(rep),
TP_STRUCT__entry(
__field(const void *, cqe)
__field(const void *, rep)
),
TP_fast_assign(
__entry->cqe = cqe;
__entry->rep = rep;
),
TP_printk("cqe=%p",
__entry->cqe
TP_printk("rep=%p",
__entry->rep
)
);
......@@ -715,14 +776,15 @@ TRACE_EVENT(xprtrdma_wc_receive,
TP_ARGS(wc),
TP_STRUCT__entry(
__field(const void *, cqe)
__field(const void *, rep)
__field(u32, byte_len)
__field(unsigned int, status)
__field(u32, vendor_err)
),
TP_fast_assign(
__entry->cqe = wc->wr_cqe;
__entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep,
rr_cqe);
__entry->status = wc->status;
if (wc->status) {
__entry->byte_len = 0;
......@@ -733,8 +795,8 @@ TRACE_EVENT(xprtrdma_wc_receive,
}
),
TP_printk("cqe=%p %u bytes: %s (%u/0x%x)",
__entry->cqe, __entry->byte_len,
TP_printk("rep=%p %u bytes: %s (%u/0x%x)",
__entry->rep, __entry->byte_len,
rdma_show_wc_status(__entry->status),
__entry->status, __entry->vendor_err
)
......
......@@ -1960,7 +1960,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
goto unwrap_failed;
if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset))
if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
goto unwrap_failed;
maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
......
......@@ -1837,7 +1837,7 @@ call_allocate(struct rpc_task *task)
return;
}
rpc_exit(task, -ERESTARTSYS);
rpc_call_rpcerror(task, -ERESTARTSYS);
}
static int
......@@ -1862,6 +1862,7 @@ rpc_xdr_encode(struct rpc_task *task)
req->rq_rbuffer,
req->rq_rcvsize);
req->rq_reply_bytes_recvd = 0;
req->rq_snd_buf.head[0].iov_len = 0;
xdr_init_encode(&xdr, &req->rq_snd_buf,
req->rq_snd_buf.head[0].iov_base, req);
......@@ -1881,6 +1882,8 @@ call_encode(struct rpc_task *task)
if (!rpc_task_need_encode(task))
goto out;
dprint_status(task);
/* Dequeue task from the receive queue while we're encoding */
xprt_request_dequeue_xprt(task);
/* Encode here so that rpcsec_gss can use correct sequence number. */
rpc_xdr_encode(task);
/* Did the encode result in an error condition? */
......@@ -2479,6 +2482,7 @@ call_decode(struct rpc_task *task)
struct rpc_clnt *clnt = task->tk_client;
struct rpc_rqst *req = task->tk_rqstp;
struct xdr_stream xdr;
int err;
dprint_status(task);
......@@ -2501,6 +2505,15 @@ call_decode(struct rpc_task *task)
* before it changed req->rq_reply_bytes_recvd.
*/
smp_rmb();
/*
* Did we ever call xprt_complete_rqst()? If not, we should assume
* the message is incomplete.
*/
err = -EAGAIN;
if (!req->rq_reply_bytes_recvd)
goto out;
req->rq_rcv_buf.len = req->rq_private_buf.len;
/* Check that the softirq receive buffer is valid */
......@@ -2509,7 +2522,9 @@ call_decode(struct rpc_task *task)
xdr_init_decode(&xdr, &req->rq_rcv_buf,
req->rq_rcv_buf.head[0].iov_base, req);
switch (rpc_decode_header(task, &xdr)) {
err = rpc_decode_header(task, &xdr);
out:
switch (err) {
case 0:
task->tk_action = rpc_exit_task;
task->tk_status = rpcauth_unwrap_resp(task, &xdr);
......@@ -2518,9 +2533,6 @@ call_decode(struct rpc_task *task)
return;
case -EAGAIN:
task->tk_status = 0;
xdr_free_bvec(&req->rq_rcv_buf);
req->rq_reply_bytes_recvd = 0;
req->rq_rcv_buf.len = 0;
if (task->tk_client->cl_discrtry)
xprt_conditional_disconnect(req->rq_xprt,
req->rq_connect_cookie);
......@@ -2561,7 +2573,7 @@ rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr)
return 0;
out_fail:
trace_rpc_bad_callhdr(task);
rpc_exit(task, error);
rpc_call_rpcerror(task, error);
return error;
}
......@@ -2628,7 +2640,7 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr)
return -EAGAIN;
}
out_err:
rpc_exit(task, error);
rpc_call_rpcerror(task, error);
return error;
out_unparsable:
......
......@@ -541,33 +541,14 @@ rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq,
return NULL;
}
static void
rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
struct rpc_wait_queue *queue, struct rpc_task *task)
{
rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL);
}
/*
* Wake up a queued task while the queue lock is being held
*/
static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
{
rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
}
/*
* Wake up a task on a specific queue
*/
void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
struct rpc_wait_queue *queue,
struct rpc_task *task)
static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue,
struct rpc_task *task)
{
if (!RPC_IS_QUEUED(task))
return;
spin_lock(&queue->lock);
rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
spin_unlock(&queue->lock);
rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue,
task, NULL, NULL);
}
/*
......@@ -930,8 +911,10 @@ static void __rpc_execute(struct rpc_task *task)
/*
* Signalled tasks should exit rather than sleep.
*/
if (RPC_SIGNALLED(task))
if (RPC_SIGNALLED(task)) {
task->tk_rpc_status = -ERESTARTSYS;
rpc_exit(task, -ERESTARTSYS);
}
/*
* The queue->lock protects against races with
......@@ -967,6 +950,7 @@ static void __rpc_execute(struct rpc_task *task)
*/
dprintk("RPC: %5u got signal\n", task->tk_pid);
set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
task->tk_rpc_status = -ERESTARTSYS;
rpc_exit(task, -ERESTARTSYS);
}
dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
......
......@@ -560,7 +560,7 @@ EXPORT_SYMBOL_GPL(xdr_init_encode);
* required at the end of encoding, or any other time when the xdr_buf
* data might be read.
*/
void xdr_commit_encode(struct xdr_stream *xdr)
inline void xdr_commit_encode(struct xdr_stream *xdr)
{
int shift = xdr->scratch.iov_len;
void *page;
......@@ -1236,43 +1236,60 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
}
EXPORT_SYMBOL_GPL(xdr_encode_word);
/* If the netobj starting offset bytes from the start of xdr_buf is contained
* entirely in the head or the tail, set object to point to it; otherwise
* try to find space for it at the end of the tail, copy it there, and
* set obj to point to it. */
int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset)
/**
* xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
* @buf: pointer to buffer containing a mic
* @mic: on success, returns the address of the mic
* @offset: the offset in buf where mic may be found
*
* This function may modify the xdr buf if the mic is found to be straddling
* a boundary between head, pages, and tail. On success the mic can be read
* from the address returned. There is no need to free the mic.
*
* Return: Success returns 0, otherwise an integer error.
*/
int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
{
struct xdr_buf subbuf;
unsigned int boundary;
if (xdr_decode_word(buf, offset, &obj->len))
if (xdr_decode_word(buf, offset, &mic->len))
return -EFAULT;
if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len))
offset += 4;
/* Is the mic partially in the head? */
boundary = buf->head[0].iov_len;
if (offset < boundary && (offset + mic->len) > boundary)
xdr_shift_buf(buf, boundary - offset);
/* Is the mic partially in the pages? */
boundary += buf->page_len;
if (offset < boundary && (offset + mic->len) > boundary)
xdr_shrink_pagelen(buf, boundary - offset);
if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
return -EFAULT;
/* Is the obj contained entirely in the head? */
obj->data = subbuf.head[0].iov_base;
if (subbuf.head[0].iov_len == obj->len)
/* Is the mic contained entirely in the head? */
mic->data = subbuf.head[0].iov_base;
if (subbuf.head[0].iov_len == mic->len)
return 0;
/* ..or is the obj contained entirely in the tail? */
obj->data = subbuf.tail[0].iov_base;
if (subbuf.tail[0].iov_len == obj->len)
/* ..or is the mic contained entirely in the tail? */
mic->data = subbuf.tail[0].iov_base;
if (subbuf.tail[0].iov_len == mic->len)
return 0;
/* use end of tail as storage for obj:
* (We don't copy to the beginning because then we'd have
* to worry about doing a potentially overlapping copy.
* This assumes the object is at most half the length of the
* tail.) */
if (obj->len > buf->buflen - buf->len)
/* Find a contiguous area in @buf to hold all of @mic */
if (mic->len > buf->buflen - buf->len)
return -ENOMEM;
if (buf->tail[0].iov_len != 0)
obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
else
obj->data = buf->head[0].iov_base + buf->head[0].iov_len;
__read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len);
mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
__read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
return 0;
}
EXPORT_SYMBOL_GPL(xdr_buf_read_netobj);
EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
/* Returns 0 on success, or else a negative error code. */
static int
......
......@@ -456,6 +456,12 @@ void xprt_release_rqst_cong(struct rpc_task *task)
}
EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt)
{
if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state))
__xprt_lock_write_next_cong(xprt);
}
/*
* Clear the congestion window wait flag and wake up the next
* entry on xprt->sending
......@@ -671,6 +677,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
spin_lock(&xprt->transport_lock);
xprt_clear_connected(xprt);
xprt_clear_write_space_locked(xprt);
xprt_clear_congestion_window_wait_locked(xprt);
xprt_wake_pending_tasks(xprt, -ENOTCONN);
spin_unlock(&xprt->transport_lock);
}
......@@ -1323,6 +1330,36 @@ xprt_request_dequeue_transmit(struct rpc_task *task)
spin_unlock(&xprt->queue_lock);
}
/**
* xprt_request_dequeue_xprt - remove a task from the transmit+receive queue
* @task: pointer to rpc_task
*
* Remove a task from the transmit and receive queues, and ensure that
* it is not pinned by the receive work item.
*/
void
xprt_request_dequeue_xprt(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
xprt_is_pinned_rqst(req)) {
spin_lock(&xprt->queue_lock);
xprt_request_dequeue_transmit_locked(task);
xprt_request_dequeue_receive_locked(task);
while (xprt_is_pinned_rqst(req)) {
set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
spin_unlock(&xprt->queue_lock);
xprt_wait_on_pinned_rqst(req);
spin_lock(&xprt->queue_lock);
clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
}
spin_unlock(&xprt->queue_lock);
}
}
/**
* xprt_request_prepare - prepare an encoded request for transport
* @req: pointer to rpc_rqst
......@@ -1747,28 +1784,6 @@ void xprt_retry_reserve(struct rpc_task *task)
xprt_do_reserve(xprt, task);
}
static void
xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req)
{
struct rpc_xprt *xprt = req->rq_xprt;
if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
xprt_is_pinned_rqst(req)) {
spin_lock(&xprt->queue_lock);
xprt_request_dequeue_transmit_locked(task);
xprt_request_dequeue_receive_locked(task);
while (xprt_is_pinned_rqst(req)) {
set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
spin_unlock(&xprt->queue_lock);
xprt_wait_on_pinned_rqst(req);
spin_lock(&xprt->queue_lock);
clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
}
spin_unlock(&xprt->queue_lock);
}
}
/**
* xprt_release - release an RPC request slot
* @task: task which is finished with the slot
......@@ -1788,7 +1803,7 @@ void xprt_release(struct rpc_task *task)
}
xprt = req->rq_xprt;
xprt_request_dequeue_all(task, req);
xprt_request_dequeue_xprt(task);
spin_lock(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
if (xprt->ops->release_request)
......
......@@ -54,9 +54,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
return r_xprt->rx_buf.rb_bc_srv_max_requests;
return RPCRDMA_BACKWARD_WRS >> 1;
}
static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
......
This diff is collapsed.
......@@ -342,6 +342,32 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
return 0;
}
static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing,
struct rpcrdma_mr **mr)
{
*mr = rpcrdma_mr_pop(&req->rl_free_mrs);
if (!*mr) {
*mr = rpcrdma_mr_get(r_xprt);
if (!*mr)
goto out_getmr_err;
trace_xprtrdma_mr_get(req);
(*mr)->mr_req = req;
}
rpcrdma_mr_push(*mr, &req->rl_registered);
return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
out_getmr_err:
trace_xprtrdma_nomrs(req);
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
if (r_xprt->rx_ep.rep_connected != -ENODEV)
schedule_work(&r_xprt->rx_buf.rb_refresh_worker);
return ERR_PTR(-EAGAIN);
}
/* Register and XDR encode the Read list. Supports encoding a list of read
* segments that belong to a single read chunk.
*
......@@ -356,9 +382,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
*
* Only a single @pos value is currently supported.
*/
static noinline int
rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req,
struct rpc_rqst *rqst,
enum rpcrdma_chunktype rtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
......@@ -379,10 +406,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
return nsegs;
do {
seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr);
seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_read_segment(xdr, mr, pos) < 0)
return -EMSGSIZE;
......@@ -411,9 +437,10 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
*
* Only a single Write chunk is currently supported.
*/
static noinline int
rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req,
struct rpc_rqst *rqst,
enum rpcrdma_chunktype wtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
......@@ -440,10 +467,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nchunks = 0;
do {
seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
return -EMSGSIZE;
......@@ -474,9 +500,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
* Returns zero on success, or a negative errno if a failure occurred.
* @xdr is advanced to the next position in the stream.
*/
static noinline int
rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req,
struct rpc_rqst *rqst,
enum rpcrdma_chunktype wtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
......@@ -501,10 +528,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nchunks = 0;
do {
seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
return -EMSGSIZE;
......@@ -841,12 +867,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
* chunks. Very likely the connection has been replaced,
* so these registrations are invalid and unusable.
*/
while (unlikely(!list_empty(&req->rl_registered))) {
struct rpcrdma_mr *mr;
mr = rpcrdma_mr_pop(&req->rl_registered);
rpcrdma_mr_recycle(mr);
}
frwr_recycle(req);
/* This implementation supports the following combinations
* of chunk lists in one RPC-over-RDMA Call message:
......@@ -1240,8 +1261,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
struct rpc_rqst *rqst = rep->rr_rqst;
int status;
xprt->reestablish_timeout = 0;
switch (rep->rr_proc) {
case rdma_msg:
status = rpcrdma_decode_msg(r_xprt, rep, rqst);
......@@ -1300,6 +1319,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
u32 credits;
__be32 *p;
/* Any data means we had a useful conversation, so
* then we don't need to delay the next reconnect.
*/
if (xprt->reestablish_timeout)
xprt->reestablish_timeout = 0;
/* Fixed transport header fields */
xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
rep->rr_hdrbuf.head[0].iov_base, NULL);
......
......@@ -423,8 +423,6 @@ void xprt_rdma_close(struct rpc_xprt *xprt)
if (ep->rep_connected == -ENODEV)
return;
if (ep->rep_connected > 0)
xprt->reestablish_timeout = 0;
rpcrdma_ep_disconnect(ep, ia);
/* Prepare @xprt for the next connection by reinitializing
......@@ -434,6 +432,7 @@ void xprt_rdma_close(struct rpc_xprt *xprt)
xprt->cwnd = RPC_CWNDSHIFT;
out:
xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
xprt_disconnect_done(xprt);
}
......@@ -494,9 +493,9 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
* @reconnect_timeout: reconnect timeout after server disconnects
*
*/
static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt,
unsigned long connect_timeout,
unsigned long reconnect_timeout)
static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt,
unsigned long connect_timeout,
unsigned long reconnect_timeout)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
......@@ -571,6 +570,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
return;
out_sleep:
set_bit(XPRT_CONGESTED, &xprt->state);
rpc_sleep_on(&xprt->backlog, task, NULL);
task->tk_status = -EAGAIN;
}
......@@ -589,7 +589,8 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
memset(rqst, 0, sizeof(*rqst));
rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
rpc_wake_up_next(&xprt->backlog);
if (unlikely(!rpc_wake_up_next(&xprt->backlog)))
clear_bit(XPRT_CONGESTED, &xprt->state);
}
static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
......@@ -803,7 +804,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
.send_request = xprt_rdma_send_request,
.close = xprt_rdma_close,
.destroy = xprt_rdma_destroy,
.set_connect_timeout = xprt_rdma_tcp_set_connect_timeout,
.set_connect_timeout = xprt_rdma_set_connect_timeout,
.print_stats = xprt_rdma_print_stats,
.enable_swap = xprt_rdma_enable_swap,
.disable_swap = xprt_rdma_disable_swap,
......
This diff is collapsed.
......@@ -47,6 +47,7 @@
#include <linux/atomic.h> /* atomic_t, etc */
#include <linux/kref.h> /* struct kref */
#include <linux/workqueue.h> /* struct work_struct */
#include <linux/llist.h>
#include <rdma/rdma_cm.h> /* RDMA connection api */
#include <rdma/ib_verbs.h> /* RDMA verbs api */
......@@ -117,9 +118,6 @@ struct rpcrdma_ep {
#endif
/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
*
* The below structure appears at the front of a large region of kmalloc'd
* memory, which always starts on a good alignment boundary.
*/
struct rpcrdma_regbuf {
......@@ -158,25 +156,22 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb)
/* To ensure a transport can always make forward progress,
* the number of RDMA segments allowed in header chunk lists
* is capped at 8. This prevents less-capable devices and
* memory registrations from overrunning the Send buffer
* while building chunk lists.
* is capped at 16. This prevents less-capable devices from
* overrunning the Send buffer while building chunk lists.
*
* Elements of the Read list take up more room than the
* Write list or Reply chunk. 8 read segments means the Read
* list (or Write list or Reply chunk) cannot consume more
* than
*
* ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
* Write list or Reply chunk. 16 read segments means the
* chunk lists cannot consume more than
*
* And the fixed part of the header is another 24 bytes.
* ((16 + 2) * read segment size) + 1 XDR words,
*
* The smallest inline threshold is 1024 bytes, ensuring that
* at least 750 bytes are available for RPC messages.
* or about 400 bytes. The fixed part of the header is
* another 24 bytes. Thus when the inline threshold is
* 1024 bytes, at least 600 bytes are available for RPC
* message bodies.
*/
enum {
RPCRDMA_MAX_HDR_SEGS = 8,
RPCRDMA_HDRBUF_SIZE = 256,
RPCRDMA_MAX_HDR_SEGS = 16,
};
/*
......@@ -206,7 +201,7 @@ struct rpcrdma_rep {
struct rpc_rqst *rr_rqst;
struct xdr_buf rr_hdrbuf;
struct xdr_stream rr_stream;
struct list_head rr_list;
struct llist_node rr_node;
struct ib_recv_wr rr_recv_wr;
};
......@@ -240,20 +235,20 @@ struct rpcrdma_sendctx {
* An external memory region is any buffer or page that is registered
* on the fly (ie, not pre-registered).
*/
struct rpcrdma_req;
struct rpcrdma_frwr {
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
struct completion fr_linv_done;
struct rpcrdma_req *fr_req;
union {
struct ib_reg_wr fr_regwr;
struct ib_send_wr fr_invwr;
};
};
struct rpcrdma_req;
struct rpcrdma_mr {
struct list_head mr_list;
struct rpcrdma_req *mr_req;
struct scatterlist *mr_sg;
int mr_nents;
enum dma_data_direction mr_dir;
......@@ -331,7 +326,8 @@ struct rpcrdma_req {
struct list_head rl_all;
struct kref rl_kref;
struct list_head rl_registered; /* registered segments */
struct list_head rl_free_mrs;
struct list_head rl_registered;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
......@@ -344,7 +340,7 @@ rpcr_to_rdmar(const struct rpc_rqst *rqst)
static inline void
rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list)
{
list_add_tail(&mr->mr_list, list);
list_add(&mr->mr_list, list);
}
static inline struct rpcrdma_mr *
......@@ -352,8 +348,9 @@ rpcrdma_mr_pop(struct list_head *list)
{
struct rpcrdma_mr *mr;
mr = list_first_entry(list, struct rpcrdma_mr, mr_list);
list_del_init(&mr->mr_list);
mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list);
if (mr)
list_del_init(&mr->mr_list);
return mr;
}
......@@ -364,19 +361,19 @@ rpcrdma_mr_pop(struct list_head *list)
* One of these is associated with a transport instance
*/
struct rpcrdma_buffer {
spinlock_t rb_mrlock; /* protect rb_mrs list */
spinlock_t rb_lock;
struct list_head rb_send_bufs;
struct list_head rb_mrs;
struct list_head rb_all;
unsigned long rb_sc_head;
unsigned long rb_sc_tail;
unsigned long rb_sc_last;
struct rpcrdma_sendctx **rb_sc_ctxs;
spinlock_t rb_lock; /* protect buf lists */
struct list_head rb_send_bufs;
struct list_head rb_recv_bufs;
struct list_head rb_allreqs;
struct list_head rb_all_mrs;
struct llist_head rb_free_reps;
u32 rb_max_requests;
u32 rb_credits; /* most recent credit grant */
......@@ -384,7 +381,7 @@ struct rpcrdma_buffer {
u32 rb_bc_srv_max_requests;
u32 rb_bc_max_requests;
struct delayed_work rb_refresh_worker;
struct work_struct rb_refresh_worker;
};
/*
......@@ -490,7 +487,6 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
void rpcrdma_mr_put(struct rpcrdma_mr *mr);
void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
static inline void
rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
......@@ -546,6 +542,7 @@ rpcrdma_data_dir(bool writing)
/* Memory registration calls xprtrdma/frwr_ops.c
*/
bool frwr_is_supported(struct ib_device *device);
void frwr_recycle(struct rpcrdma_req *req);
void frwr_reset(struct rpcrdma_req *req);
int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
......@@ -554,7 +551,7 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr **mr);
struct rpcrdma_mr *mr);
int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
......
......@@ -562,10 +562,14 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
printk(KERN_WARNING "Callback slot table overflowed\n");
return -ESHUTDOWN;
}
if (transport->recv.copied && !req->rq_private_buf.len)
return -ESHUTDOWN;
ret = xs_read_stream_request(transport, msg, flags, req);
if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
xprt_complete_bc_request(req, transport->recv.copied);
else
req->rq_private_buf.len = transport->recv.copied;
return ret;
}
......@@ -587,7 +591,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
/* Look up and lock the request corresponding to the given XID */
spin_lock(&xprt->queue_lock);
req = xprt_lookup_rqst(xprt, transport->recv.xid);
if (!req) {
if (!req || (transport->recv.copied && !req->rq_private_buf.len)) {
msg->msg_flags |= MSG_TRUNC;
goto out;
}
......@@ -599,6 +603,8 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
spin_lock(&xprt->queue_lock);
if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
xprt_complete_rqst(req->rq_task, transport->recv.copied);
else
req->rq_private_buf.len = transport->recv.copied;
xprt_unpin_rqst(req);
out:
spin_unlock(&xprt->queue_lock);
......