Commit 00833408 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag '9p-for-6.1' of https://github.com/martinetd/linux

Pull 9p updates from Dominique Martinet:
 "Smaller buffers for small messages and fixes.

  The highlight of this is Christian's patch to allocate smaller buffers
  for most metadata requests: 9p with a big msize would try to allocate
  large buffers when just 4 or 8k would be more than enough; this brings
  in nice performance improvements.

  There's also a few fixes for problems reported by syzkaller (thanks to
  Schspa Shi, Tetsuo Handa for tests and feedback/patches) as well as
  some minor cleanup"

* tag '9p-for-6.1' of https://github.com/martinetd/linux:
  net/9p: clarify trans_fd parse_opt failure handling
  net/9p: add __init/__exit annotations to module init/exit funcs
  net/9p: use a dedicated spinlock for trans_fd
  9p/trans_fd: always use O_NONBLOCK read/write
  net/9p: allocate appropriate reduced message buffers
  net/9p: add 'pooled_rbuffers' flag to struct p9_trans_module
  net/9p: add p9_msg_buf_size()
  9p: add P9_ERRMAX for 9p2000 and 9p2000.u
  net/9p: split message size argument into 't_size' and 'r_size' pair
  9p: trans_fd/p9_conn_cancel: drop client lock earlier
parents 288fc860 a8e633c6
......@@ -331,6 +331,9 @@ enum p9_qid_t {
/* size of header for zero copy read/write */
#define P9_ZC_HDR_SZ 4096
/* maximum length of an error string */
#define P9_ERRMAX 128
/**
* struct p9_qid - file system entity information
* @type: 8-bit type &p9_qid_t
......
......@@ -19,6 +19,10 @@
* @list: used to maintain a list of currently available transports
* @name: the human-readable name of the transport
* @maxsize: transport provided maximum packet size
* @pooled_rbuffers: currently only set for RDMA transport which pulls the
* response buffers from a shared pool, and accordingly
* we're less flexible when choosing the response message
* size in this case
* @def: set if this transport should be considered the default
* @create: member function to create a new connection on this transport
* @close: member function to discard a connection on this transport
......@@ -38,6 +42,7 @@ struct p9_trans_module {
struct list_head list;
char *name; /* name of transport */
int maxsize; /* max message size of transport */
bool pooled_rbuffers;
int def; /* this transport should be default */
struct module *owner;
int (*create)(struct p9_client *client,
......
......@@ -255,24 +255,42 @@ static struct kmem_cache *p9_req_cache;
* p9_tag_alloc - Allocate a new request.
* @c: Client session.
* @type: Transaction type.
* @max_size: Maximum packet size for this request.
* @t_size: Buffer size for holding this request
* (automatic calculation by format template if 0).
* @r_size: Buffer size for holding server's reply on this request
* (automatic calculation by format template if 0).
* @fmt: Format template for assembling 9p request message
* (see p9pdu_vwritef).
* @ap: Variable arguments to be fed to passed format template
* (see p9pdu_vwritef).
*
* Context: Process context.
* Return: Pointer to new request.
*/
static struct p9_req_t *
p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
const char *fmt, va_list ap)
{
struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS);
int alloc_msize = min(c->msize, max_size);
int alloc_tsize;
int alloc_rsize;
int tag;
va_list apc;
va_copy(apc, ap);
alloc_tsize = min_t(size_t, c->msize,
t_size ?: p9_msg_buf_size(c, type, fmt, apc));
va_end(apc);
alloc_rsize = min_t(size_t, c->msize,
r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap));
if (!req)
return ERR_PTR(-ENOMEM);
if (p9_fcall_init(c, &req->tc, alloc_msize))
if (p9_fcall_init(c, &req->tc, alloc_tsize))
goto free_req;
if (p9_fcall_init(c, &req->rc, alloc_msize))
if (p9_fcall_init(c, &req->rc, alloc_rsize))
goto free;
p9pdu_reset(&req->tc);
......@@ -592,11 +610,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
}
static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
int8_t type, int req_size,
int8_t type, uint t_size, uint r_size,
const char *fmt, va_list ap)
{
int err;
struct p9_req_t *req;
va_list apc;
p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
......@@ -608,7 +627,9 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
if (c->status == BeginDisconnect && type != P9_TCLUNK)
return ERR_PTR(-EIO);
req = p9_tag_alloc(c, type, req_size);
va_copy(apc, ap);
req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc);
va_end(apc);
if (IS_ERR(req))
return req;
......@@ -643,9 +664,18 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
int sigpending, err;
unsigned long flags;
struct p9_req_t *req;
/* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to
* auto determine an appropriate (small) request/response size
* according to actual message data being sent. Currently RDMA
* transport is excluded from this response message size optimization,
* as it would not cope with it, due to its pooled response buffers
* (using an optimized request size for RDMA as well though).
*/
const uint tsize = 0;
const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0;
va_start(ap, fmt);
req = p9_client_prepare_req(c, type, c->msize, fmt, ap);
req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap);
va_end(ap);
if (IS_ERR(req))
return req;
......@@ -743,7 +773,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
/* We allocate a inline protocol data of only 4k bytes.
* The actual content is passed in zero-copy fashion.
*/
req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, fmt, ap);
req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, P9_ZC_HDR_SZ, fmt, ap);
va_end(ap);
if (IS_ERR(req))
return req;
......
......@@ -23,6 +23,173 @@
#include <trace/events/9p.h>
/* len[2] text[len] */
#define P9_STRLEN(s) \
(2 + min_t(size_t, s ? strlen(s) : 0, USHRT_MAX))
/**
* p9_msg_buf_size - Returns a buffer size sufficiently large to hold the
* intended 9p message.
* @c: client
* @type: message type
* @fmt: format template for assembling request message
* (see p9pdu_vwritef)
* @ap: variable arguments to be fed to passed format template
* (see p9pdu_vwritef)
*
* Note: Even for response types (P9_R*) the format template and variable
* arguments must always be for the originating request type (P9_T*).
*/
size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type,
const char *fmt, va_list ap)
{
/* size[4] type[1] tag[2] */
const int hdr = 4 + 1 + 2;
/* ename[s] errno[4] */
const int rerror_size = hdr + P9_ERRMAX + 4;
/* ecode[4] */
const int rlerror_size = hdr + 4;
const int err_size =
c->proto_version == p9_proto_2000L ? rlerror_size : rerror_size;
static_assert(NAME_MAX <= 4*1024, "p9_msg_buf_size() currently assumes "
"a max. allowed directory entry name length of 4k");
switch (type) {
/* message types not used at all */
case P9_TERROR:
case P9_TLERROR:
case P9_TAUTH:
case P9_RAUTH:
BUG();
/* variable length & potentially large message types */
case P9_TATTACH:
BUG_ON(strcmp("ddss?u", fmt));
va_arg(ap, int32_t);
va_arg(ap, int32_t);
{
const char *uname = va_arg(ap, const char *);
const char *aname = va_arg(ap, const char *);
/* fid[4] afid[4] uname[s] aname[s] n_uname[4] */
return hdr + 4 + 4 + P9_STRLEN(uname) + P9_STRLEN(aname) + 4;
}
case P9_TWALK:
BUG_ON(strcmp("ddT", fmt));
va_arg(ap, int32_t);
va_arg(ap, int32_t);
{
uint i, nwname = va_arg(ap, int);
size_t wname_all;
const char **wnames = va_arg(ap, const char **);
for (i = 0, wname_all = 0; i < nwname; ++i) {
wname_all += P9_STRLEN(wnames[i]);
}
/* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
return hdr + 4 + 4 + 2 + wname_all;
}
case P9_RWALK:
BUG_ON(strcmp("ddT", fmt));
va_arg(ap, int32_t);
va_arg(ap, int32_t);
{
uint nwname = va_arg(ap, int);
/* nwqid[2] nwqid*(wqid[13]) */
return max_t(size_t, hdr + 2 + nwname * 13, err_size);
}
case P9_TCREATE:
BUG_ON(strcmp("dsdb?s", fmt));
va_arg(ap, int32_t);
{
const char *name = va_arg(ap, const char *);
if (c->proto_version == p9_proto_legacy) {
/* fid[4] name[s] perm[4] mode[1] */
return hdr + 4 + P9_STRLEN(name) + 4 + 1;
} else {
va_arg(ap, int32_t);
va_arg(ap, int);
{
const char *ext = va_arg(ap, const char *);
/* fid[4] name[s] perm[4] mode[1] extension[s] */
return hdr + 4 + P9_STRLEN(name) + 4 + 1 + P9_STRLEN(ext);
}
}
}
case P9_TLCREATE:
BUG_ON(strcmp("dsddg", fmt));
va_arg(ap, int32_t);
{
const char *name = va_arg(ap, const char *);
/* fid[4] name[s] flags[4] mode[4] gid[4] */
return hdr + 4 + P9_STRLEN(name) + 4 + 4 + 4;
}
case P9_RREAD:
case P9_RREADDIR:
BUG_ON(strcmp("dqd", fmt));
va_arg(ap, int32_t);
va_arg(ap, int64_t);
{
const int32_t count = va_arg(ap, int32_t);
/* count[4] data[count] */
return max_t(size_t, hdr + 4 + count, err_size);
}
case P9_TWRITE:
BUG_ON(strcmp("dqV", fmt));
va_arg(ap, int32_t);
va_arg(ap, int64_t);
{
const int32_t count = va_arg(ap, int32_t);
/* fid[4] offset[8] count[4] data[count] */
return hdr + 4 + 8 + 4 + count;
}
case P9_TRENAMEAT:
BUG_ON(strcmp("dsds", fmt));
va_arg(ap, int32_t);
{
const char *oldname, *newname;
oldname = va_arg(ap, const char *);
va_arg(ap, int32_t);
newname = va_arg(ap, const char *);
/* olddirfid[4] oldname[s] newdirfid[4] newname[s] */
return hdr + 4 + P9_STRLEN(oldname) + 4 + P9_STRLEN(newname);
}
case P9_TSYMLINK:
BUG_ON(strcmp("dssg", fmt));
va_arg(ap, int32_t);
{
const char *name = va_arg(ap, const char *);
const char *symtgt = va_arg(ap, const char *);
/* fid[4] name[s] symtgt[s] gid[4] */
return hdr + 4 + P9_STRLEN(name) + P9_STRLEN(symtgt) + 4;
}
case P9_RERROR:
return rerror_size;
case P9_RLERROR:
return rlerror_size;
/* small message types */
case P9_TWSTAT:
case P9_RSTAT:
case P9_RREADLINK:
case P9_TXATTRWALK:
case P9_TXATTRCREATE:
case P9_TLINK:
case P9_TMKDIR:
case P9_TMKNOD:
case P9_TRENAME:
case P9_TUNLINKAT:
case P9_TLOCK:
return 8 * 1024;
/* tiny message types */
default:
return 4 * 1024;
}
}
static int
p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
......
......@@ -8,6 +8,8 @@
* Copyright (C) 2008 by IBM, Corp.
*/
size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type,
const char *fmt, va_list ap);
int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
va_list ap);
int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
......
......@@ -91,6 +91,7 @@ struct p9_poll_wait {
* @mux_list: list link for mux to manage multiple connections (?)
* @client: reference to client instance for this connection
* @err: error state
* @req_lock: lock protecting req_list and requests statuses
* @req_list: accounting for requests which have been sent
* @unsent_req_list: accounting for requests that haven't been sent
* @rreq: read request
......@@ -114,6 +115,7 @@ struct p9_conn {
struct list_head mux_list;
struct p9_client *client;
int err;
spinlock_t req_lock;
struct list_head req_list;
struct list_head unsent_req_list;
struct p9_req_t *rreq;
......@@ -189,10 +191,10 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
spin_lock(&m->client->lock);
spin_lock(&m->req_lock);
if (m->err) {
spin_unlock(&m->client->lock);
spin_unlock(&m->req_lock);
return;
}
......@@ -205,6 +207,8 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
list_move(&req->req_list, &cancel_list);
}
spin_unlock(&m->req_lock);
list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
list_del(&req->req_list);
......@@ -212,7 +216,6 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
req->t_err = err;
p9_client_cb(m->client, req, REQ_STATUS_ERROR);
}
spin_unlock(&m->client->lock);
}
static __poll_t
......@@ -359,7 +362,7 @@ static void p9_read_work(struct work_struct *work)
if ((m->rreq) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new packet\n");
m->rreq->rc.size = m->rc.offset;
spin_lock(&m->client->lock);
spin_lock(&m->req_lock);
if (m->rreq->status == REQ_STATUS_SENT) {
list_del(&m->rreq->req_list);
p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD);
......@@ -368,14 +371,14 @@ static void p9_read_work(struct work_struct *work)
p9_debug(P9_DEBUG_TRANS,
"Ignore replies associated with a cancelled request\n");
} else {
spin_unlock(&m->client->lock);
spin_unlock(&m->req_lock);
p9_debug(P9_DEBUG_ERROR,
"Request tag %d errored out while we were reading the reply\n",
m->rc.tag);
err = -EIO;
goto error;
}
spin_unlock(&m->client->lock);
spin_unlock(&m->req_lock);
m->rc.sdata = NULL;
m->rc.offset = 0;
m->rc.capacity = 0;
......@@ -453,10 +456,10 @@ static void p9_write_work(struct work_struct *work)
}
if (!m->wsize) {
spin_lock(&m->client->lock);
spin_lock(&m->req_lock);
if (list_empty(&m->unsent_req_list)) {
clear_bit(Wworksched, &m->wsched);
spin_unlock(&m->client->lock);
spin_unlock(&m->req_lock);
return;
}
......@@ -471,7 +474,7 @@ static void p9_write_work(struct work_struct *work)
m->wpos = 0;
p9_req_get(req);
m->wreq = req;
spin_unlock(&m->client->lock);
spin_unlock(&m->req_lock);
}
p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n",
......@@ -588,6 +591,7 @@ static void p9_conn_create(struct p9_client *client)
INIT_LIST_HEAD(&m->mux_list);
m->client = client;
spin_lock_init(&m->req_lock);
INIT_LIST_HEAD(&m->req_list);
INIT_LIST_HEAD(&m->unsent_req_list);
INIT_WORK(&m->rq, p9_read_work);
......@@ -669,10 +673,10 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
if (m->err < 0)
return m->err;
spin_lock(&client->lock);
spin_lock(&m->req_lock);
req->status = REQ_STATUS_UNSENT;
list_add_tail(&req->req_list, &m->unsent_req_list);
spin_unlock(&client->lock);
spin_unlock(&m->req_lock);
if (test_and_clear_bit(Wpending, &m->wsched))
n = EPOLLOUT;
......@@ -687,11 +691,13 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
{
struct p9_trans_fd *ts = client->trans;
struct p9_conn *m = &ts->conn;
int ret = 1;
p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
spin_lock(&client->lock);
spin_lock(&m->req_lock);
if (req->status == REQ_STATUS_UNSENT) {
list_del(&req->req_list);
......@@ -699,21 +705,24 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
p9_req_put(client, req);
ret = 0;
}
spin_unlock(&client->lock);
spin_unlock(&m->req_lock);
return ret;
}
static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
{
struct p9_trans_fd *ts = client->trans;
struct p9_conn *m = &ts->conn;
p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
spin_lock(&client->lock);
spin_lock(&m->req_lock);
/* Ignore cancelled request if message has been received
* before lock.
*/
if (req->status == REQ_STATUS_RCVD) {
spin_unlock(&client->lock);
spin_unlock(&m->req_lock);
return 0;
}
......@@ -722,7 +731,8 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
*/
list_del(&req->req_list);
req->status = REQ_STATUS_FLSHD;
spin_unlock(&client->lock);
spin_unlock(&m->req_lock);
p9_req_put(client, req);
return 0;
......@@ -821,11 +831,14 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
goto out_free_ts;
if (!(ts->rd->f_mode & FMODE_READ))
goto out_put_rd;
/* prevent workers from hanging on IO when fd is a pipe */
ts->rd->f_flags |= O_NONBLOCK;
ts->wr = fget(wfd);
if (!ts->wr)
goto out_put_rd;
if (!(ts->wr->f_mode & FMODE_WRITE))
goto out_put_wr;
ts->wr->f_flags |= O_NONBLOCK;
client->trans = ts;
client->status = Connected;
......@@ -1061,7 +1074,9 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args)
int err;
struct p9_fd_opts opts;
parse_opts(args, &opts);
err = parse_opts(args, &opts);
if (err < 0)
return err;
client->trans_opts.fd.rfd = opts.rfd;
client->trans_opts.fd.wfd = opts.wfd;
......@@ -1082,6 +1097,7 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args)
static struct p9_trans_module p9_tcp_trans = {
.name = "tcp",
.maxsize = MAX_SOCK_BUF,
.pooled_rbuffers = false,
.def = 0,
.create = p9_fd_create_tcp,
.close = p9_fd_close,
......
......@@ -739,6 +739,7 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
static struct p9_trans_module p9_rdma_trans = {
.name = "rdma",
.maxsize = P9_RDMA_MAXSIZE,
.pooled_rbuffers = true,
.def = 0,
.owner = THIS_MODULE,
.create = rdma_create_trans,
......
......@@ -802,6 +802,7 @@ static struct p9_trans_module p9_virtio_trans = {
* page in zero copy.
*/
.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
.pooled_rbuffers = false,
.def = 1,
.owner = THIS_MODULE,
};
......
......@@ -246,6 +246,7 @@ static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r)
static struct p9_trans_module p9_xen_trans = {
.name = "xen",
.maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2),
.pooled_rbuffers = false,
.def = 1,
.create = p9_xen_create,
.close = p9_xen_close,
......@@ -510,7 +511,7 @@ static struct xenbus_driver xen_9pfs_front_driver = {
.otherend_changed = xen_9pfs_front_changed,
};
static int p9_trans_xen_init(void)
static int __init p9_trans_xen_init(void)
{
int rc;
......@@ -529,7 +530,7 @@ static int p9_trans_xen_init(void)
module_init(p9_trans_xen_init);
MODULE_ALIAS_9P("xen");
static void p9_trans_xen_exit(void)
static void __exit p9_trans_xen_exit(void)
{
v9fs_unregister_trans(&p9_xen_trans);
return xenbus_unregister_driver(&xen_9pfs_front_driver);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment