Commit 0d63a981 authored by Eric Dumazet, committed by Ben Hutchings

tcp: allow splice() to build full TSO packets

[ This combines upstream commit
  2f533844 and the follow-on bug fix
  commit 35f9c09f ]

vmsplice()/splice(pipe, socket) call do_tcp_sendpages() one page at a
time, adding at most 4096 bytes to an skb. (assuming PAGE_SIZE=4096)

The call to tcp_push() at the end of do_tcp_sendpages() forces an
immediate xmit when the pipe is not already filled, and tso_fragment()
tries to split these skbs into MSS multiples.

4096 bytes are usually split into an skb with 2 MSS, and a remaining
sub-MSS skb (assuming MTU=1500).

This makes slow start suboptimal because many small frames are sent to
qdisc/driver layers instead of big ones (constrained by cwnd and packets
in flight of course)

In fact, applications using sendmsg() (adding an additional memory copy)
instead of vmsplice()/splice()/sendfile() are a bit faster because of
this anomaly, especially if serving small files in environments with
large initial [c]wnd.

Call tcp_push() only if MSG_MORE is not set in the flags parameter.

This bit is automatically set by splice() internals on every page except
the last one, or on all pages if the user specified the SPLICE_F_MORE
splice() flag.

In some workloads, this can reduce the number of sent logical packets by
an order of magnitude, making zero-copy TCP actually faster than
one-copy :)
Reported-by: Tom Herbert <therbert@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: H.K. Jerry Chu <hkchu@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
parent 410322fe
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/socket.h>
/* /*
* Attempt to steal a page from a pipe buffer. This should perhaps go into * Attempt to steal a page from a pipe buffer. This should perhaps go into
...@@ -691,7 +692,9 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, ...@@ -691,7 +692,9 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
if (!likely(file->f_op && file->f_op->sendpage)) if (!likely(file->f_op && file->f_op->sendpage))
return -EINVAL; return -EINVAL;
more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
if (sd->len < sd->total_len)
more |= MSG_SENDPAGE_NOTLAST;
return file->f_op->sendpage(file, buf->page, buf->offset, return file->f_op->sendpage(file, buf->page, buf->offset,
sd->len, &pos, more); sd->len, &pos, more);
} }
......
...@@ -265,7 +265,7 @@ struct ucred { ...@@ -265,7 +265,7 @@ struct ucred {
#define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */
#define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_MORE 0x8000 /* Sender will send more */
#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */
#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
#define MSG_EOF MSG_FIN #define MSG_EOF MSG_FIN
#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exit for file #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exit for file
......
...@@ -860,7 +860,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse ...@@ -860,7 +860,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
} }
out: out:
if (copied) if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
tcp_push(sk, flags, mss_now, tp->nonagle); tcp_push(sk, flags, mss_now, tp->nonagle);
return copied; return copied;
......
...@@ -791,9 +791,9 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, ...@@ -791,9 +791,9 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
sock = file->private_data; sock = file->private_data;
flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
if (more) /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
flags |= MSG_MORE; flags |= more;
return kernel_sendpage(sock, page, offset, size, flags); return kernel_sendpage(sock, page, offset, size, flags);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment