Commit fd5f4d7d authored by Jakub Kicinski

Merge branch 'splice-net-rewrite-splice-to-socket-fix-splice_f_more-and-handle-msg_splice_pages-in-af_tls'

David Howells says:

====================
splice, net: Rewrite splice-to-socket, fix SPLICE_F_MORE and handle MSG_SPLICE_PAGES in AF_TLS

Here are patches to do the following:

 (1) Block MSG_SENDPAGE_* flags from leaking into ->sendmsg() from
     userspace, whilst allowing splice_to_socket() to pass them in.
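
     For illustration, the guard amounts to masking the internal flags on
     entry to the sendmsg() and sendmmsg() syscalls.  A minimal sketch
     (the helper name here is invented, but the mask is the one defined
     by the patch below):

	/* Flags in MSG_INTERNAL_SENDMSG_FLAGS may only be set by in-kernel
	 * callers such as splice_to_socket(); the syscall entry points
	 * clear them before the protocol ever sees them.
	 */
	static inline void scrub_user_msg_flags(struct msghdr *msg)
	{
		msg->msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	}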

 (2) Allow MSG_SPLICE_PAGES to be passed into tls_*_sendmsg().  Until
     support is added, it will be ignored and a splice-driven sendmsg()
     will be treated like a normal sendmsg().  TCP, UDP, AF_UNIX and
     Chelsio-TLS already handle the flag in net-next.
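
     Until then, a protocol can simply drop the hint and fall back to
     copying, e.g. (fragment mirroring the tls_device change below):

	/* MSG_SPLICE_PAGES is advisory: if this configuration cannot
	 * splice the caller's pages directly, drop the hint and copy the
	 * data as for a normal sendmsg().
	 */
	if (!tls_ctx->zerocopy_sendfile)
		msg->msg_flags &= ~MSG_SPLICE_PAGES;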

 (3) Replace the chain of functions that implement splice-to-sendpage
     with a single function that splices via sendmsg() with
     MSG_SPLICE_PAGES.  This allows a bunch of pages to be spliced from a
     pipe in a single call using a bio_vec[] and pushes the main
     processing loop down into the bowels of the protocol driver rather
     than repeatedly calling in with a page at a time.
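
     The heart of the new splice_to_socket() is roughly the following
     (a compressed sketch of the function added below, with the pipe
     locking, partial-page handling and error paths elided; the helper
     name is hypothetical):

	static ssize_t push_pipe_to_sock(struct socket *sock,
					 struct pipe_inode_info *pipe,
					 unsigned int head, unsigned int tail,
					 unsigned int mask)
	{
		struct bio_vec bvec[16];
		struct msghdr msg = {};
		unsigned int bc = 0;
		size_t pushed = 0;

		/* Gather up to 16 pipe buffers into a bio_vec array... */
		while (!pipe_empty(head, tail) && bc < ARRAY_SIZE(bvec)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];

			bvec_set_page(&bvec[bc++], buf->page, buf->len,
				      buf->offset);
			pushed += buf->len;
			tail++;
		}

		/* ...and hand the whole batch to the protocol in a single
		 * sendmsg() call rather than one call per page.
		 */
		msg.msg_flags = MSG_SPLICE_PAGES;
		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, pushed);
		return sock_sendmsg(sock, &msg);
	}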

 (4) Provide a ->splice_eof() op[2] that allows splice to signal to its
     output that the input observed a premature EOF and that the caller
     didn't flag SPLICE_F_MORE, thereby allowing a corked socket to be
     flushed.  This attempts to maintain the current behaviour.  It is also
     not called if we didn't manage to read any data and so never called
     the actor function.

     This needs routing through several layers to get it down to the network
     protocol.

     [!] Note that I chose not to pass in any flags - I'm not sure it's
     	 particularly useful to pass in the splice flags; I also elected
     	 not to return any error code - though we might actually want to do
     	 that.
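
     The hook is a void op at each layer (file_operations, proto_ops and
     proto all gain one); the socket-level file op just forwards to the
     protocol, as in the net/socket.c change below:

	static void sock_splice_eof(struct file *file)
	{
		struct socket *sock = file->private_data;

		if (sock->ops->splice_eof)
			sock->ops->splice_eof(sock);
	}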

 (5) Provide tls_{device,sw}_splice_eof() to flush a pending TLS record if
     there is one.

 (6) Provide splice_eof() for UDP, TCP, Chelsio-TLS and AF_KCM.  AF_UNIX
     doesn't seem to pay attention to the MSG_MORE or MSG_SENDPAGE_NOTLAST
     flags.

 (7) Alter the behaviour of sendfile() and fix the SPLICE_F_MORE/MSG_MORE
     signalling[1] such that SPLICE_F_MORE is always signalled until we
     have read sufficient data to fulfil the request.  If we get a
     zero-length read before we've managed to splice sufficient data, we
     now leave the socket expecting more data and leave it to userspace
     to deal with it.
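
     As a sketch (this helper is hypothetical; the real logic lives
     inline in splice_direct_to_actor(), see the patch below):

	/* Decide whether the output should still expect more data after
	 * reading @read_len bytes against @remaining outstanding bytes.
	 * @more: whether the caller set SPLICE_F_MORE on entry.
	 */
	static bool sendfile_keep_more(size_t read_len, size_t remaining,
				       bool more)
	{
		/* MORE stays asserted until enough has been read to
		 * fulfil the request; beyond that point it survives only
		 * if the caller asked for it.
		 */
		return more || read_len < remaining;
	}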

 (8) Make AF_TLS handle the MSG_SPLICE_PAGES internal sendmsg flag.
     MSG_SPLICE_PAGES is an internal hint that tells the protocol that it
     should splice the pages supplied if it can.  Its sendpage
     implementations are then turned into wrappers around that.
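
     The wrapper pattern looks like this (this is the new
     tls_device_sendpage() from the patch below):

	int tls_device_sendpage(struct sock *sk, struct page *page,
				int offset, size_t size, int flags)
	{
		struct bio_vec bvec;
		struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };

		if (flags & MSG_SENDPAGE_NOTLAST)
			msg.msg_flags |= MSG_MORE;

		if (flags & MSG_OOB)
			return -EOPNOTSUPP;

		/* Describe the page as a single-element bio_vec iterator
		 * and push it through the sendmsg() path.
		 */
		bvec_set_page(&bvec, page, size, offset);
		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
		return tls_device_sendmsg(sk, &msg, size);
	}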

Link: https://lore.kernel.org/r/499791.1685485603@warthog.procyon.org.uk/ [1]
Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ [2]
Link: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=51c78a4d532efe9543a4df019ff405f05c6157f6 # part 1
Link: https://lore.kernel.org/r/20230524153311.3625329-1-dhowells@redhat.com/ # v1
====================

Link: https://lore.kernel.org/r/20230607181920.2294972-1-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 73601329 3dc8976c
......@@ -568,6 +568,7 @@ void chtls_destroy_sock(struct sock *sk);
int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int chtls_recvmsg(struct sock *sk, struct msghdr *msg,
size_t len, int flags, int *addr_len);
void chtls_splice_eof(struct socket *sock);
int chtls_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int send_tx_flowc_wr(struct sock *sk, int compl,
......
......@@ -1237,6 +1237,15 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
goto done;
}
void chtls_splice_eof(struct socket *sock)
{
struct sock *sk = sock->sk;
lock_sock(sk);
chtls_tcp_push(sk, 0);
release_sock(sk);
}
int chtls_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
......
......@@ -606,6 +606,7 @@ static void __init chtls_init_ulp_ops(void)
chtls_cpl_prot.destroy = chtls_destroy_sock;
chtls_cpl_prot.shutdown = chtls_shutdown;
chtls_cpl_prot.sendmsg = chtls_sendmsg;
chtls_cpl_prot.splice_eof = chtls_splice_eof;
chtls_cpl_prot.sendpage = chtls_sendpage;
chtls_cpl_prot.recvmsg = chtls_recvmsg;
chtls_cpl_prot.setsockopt = chtls_setsockopt;
......
......@@ -33,6 +33,7 @@
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/sched/signal.h>
......@@ -448,30 +449,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = {
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
/*
* Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
* using sendpage(). Return the number of bytes sent.
*/
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
struct pipe_buffer *buf, struct splice_desc *sd)
{
struct file *file = sd->u.file;
loff_t pos = sd->pos;
int more;
if (!likely(file->f_op->sendpage))
return -EINVAL;
more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
if (sd->len < sd->total_len &&
pipe_occupancy(pipe->head, pipe->tail) > 1)
more |= MSG_SENDPAGE_NOTLAST;
return file->f_op->sendpage(file, buf->page, buf->offset,
sd->len, &pos, more);
}
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
smp_mb();
......@@ -652,7 +629,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des
* Description:
* This function does little more than loop over the pipe and call
* @actor to do the actual moving of a single struct pipe_buffer to
* the desired destination. See pipe_to_file, pipe_to_sendpage, or
* the desired destination. See pipe_to_file, pipe_to_sendmsg, or
* pipe_to_user.
*
*/
......@@ -833,8 +810,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
EXPORT_SYMBOL(iter_file_splice_write);
#ifdef CONFIG_NET
/**
* generic_splice_sendpage - splice data from a pipe to a socket
* splice_to_socket - splice data from a pipe to a socket
* @pipe: pipe to splice from
* @out: socket to write to
* @ppos: position in @out
......@@ -846,13 +824,131 @@ EXPORT_SYMBOL(iter_file_splice_write);
* is involved.
*
*/
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}
EXPORT_SYMBOL(generic_splice_sendpage);
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
struct socket *sock = sock_from_file(out);
struct bio_vec bvec[16];
struct msghdr msg = {};
ssize_t ret = 0;
size_t spliced = 0;
bool need_wakeup = false;
pipe_lock(pipe);
while (len > 0) {
unsigned int head, tail, mask, bc = 0;
size_t remain = len;
/*
* Check for signal early to make process killable when there
* are always buffers available
*/
ret = -ERESTARTSYS;
if (signal_pending(current))
break;
while (pipe_empty(pipe->head, pipe->tail)) {
ret = 0;
if (!pipe->writers)
goto out;
if (spliced)
goto out;
ret = -EAGAIN;
if (flags & SPLICE_F_NONBLOCK)
goto out;
ret = -ERESTARTSYS;
if (signal_pending(current))
goto out;
if (need_wakeup) {
wakeup_pipe_writers(pipe);
need_wakeup = false;
}
pipe_wait_readable(pipe);
}
head = pipe->head;
tail = pipe->tail;
mask = pipe->ring_size - 1;
while (!pipe_empty(head, tail)) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t seg;
if (!buf->len) {
tail++;
continue;
}
seg = min_t(size_t, remain, buf->len);
seg = min_t(size_t, seg, PAGE_SIZE);
ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
if (ret == -ENODATA)
ret = 0;
break;
}
bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
remain -= seg;
if (seg >= buf->len)
tail++;
if (bc >= ARRAY_SIZE(bvec))
break;
}
if (!bc)
break;
msg.msg_flags = MSG_SPLICE_PAGES;
if (flags & SPLICE_F_MORE)
msg.msg_flags |= MSG_MORE;
if (remain && pipe_occupancy(pipe->head, tail) > 0)
msg.msg_flags |= MSG_MORE;
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
len - remain);
ret = sock_sendmsg(sock, &msg);
if (ret <= 0)
break;
spliced += ret;
len -= ret;
tail = pipe->tail;
while (ret > 0) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t seg = min_t(size_t, ret, buf->len);
buf->offset += seg;
buf->len -= seg;
ret -= seg;
if (!buf->len) {
pipe_buf_release(pipe, buf);
tail++;
}
}
if (tail != pipe->tail) {
pipe->tail = tail;
if (pipe->files)
need_wakeup = true;
}
}
out:
pipe_unlock(pipe);
if (need_wakeup)
wakeup_pipe_writers(pipe);
return spliced ?: ret;
}
#endif
static int warn_unsupported(struct file *file, const char *op)
{
......@@ -873,6 +969,17 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
return out->f_op->splice_write(pipe, out, ppos, len, flags);
}
/*
* Indicate to the caller that there was a premature EOF when reading from the
* source and the caller didn't indicate they would be sending more data after
* this.
*/
static void do_splice_eof(struct splice_desc *sd)
{
if (sd->splice_eof)
sd->splice_eof(sd);
}
/*
* Attempt to initiate a splice from a file to a pipe.
*/
......@@ -956,13 +1063,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
*/
bytes = 0;
len = sd->total_len;
/* Don't block on output, we have to drain the direct pipe. */
flags = sd->flags;
sd->flags &= ~SPLICE_F_NONBLOCK;
/*
* Don't block on output, we have to drain the direct pipe.
* We signal MORE until we've read sufficient data to fulfill the
* request and we keep signalling it if the caller set it.
*/
sd->flags &= ~SPLICE_F_NONBLOCK;
more = sd->flags & SPLICE_F_MORE;
sd->flags |= SPLICE_F_MORE;
WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
......@@ -972,20 +1083,18 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
ret = do_splice_to(in, &pos, pipe, len, flags);
if (unlikely(ret <= 0))
goto out_release;
goto read_failure;
read_len = ret;
sd->total_len = read_len;
/*
* If more data is pending, set SPLICE_F_MORE
* If this is the last data and SPLICE_F_MORE was not set
* initially, clears it.
* If we now have sufficient data to fulfill the request then
* we clear SPLICE_F_MORE if it was not set initially.
*/
if (read_len < len)
sd->flags |= SPLICE_F_MORE;
else if (!more)
if (read_len >= len && !more)
sd->flags &= ~SPLICE_F_MORE;
/*
* NOTE: nonblocking mode only applies to the input. We
* must not do the output in nonblocking mode as then we
......@@ -1012,6 +1121,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
file_accessed(in);
return bytes;
read_failure:
/*
* If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
* "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
* "->splice_in()" that returned EOF (ie zero) *and* we have sent at
* least 1 byte *then* we will also do the ->splice_eof() call.
*/
if (ret == 0 && !more && len > 0 && bytes)
do_splice_eof(sd);
out_release:
/*
* If we did an incomplete transfer we must release
......@@ -1040,6 +1158,14 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
sd->flags);
}
static void direct_file_splice_eof(struct splice_desc *sd)
{
struct file *file = sd->u.file;
if (file->f_op->splice_eof)
file->f_op->splice_eof(file);
}
/**
* do_splice_direct - splices data directly between two files
* @in: file to splice from
......@@ -1065,6 +1191,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
.flags = flags,
.pos = *ppos,
.u.file = out,
.splice_eof = direct_file_splice_eof,
.opos = opos,
};
long ret;
......
......@@ -1796,6 +1796,7 @@ struct file_operations {
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
void (*splice_eof)(struct file *file);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
......@@ -2759,8 +2760,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
struct file *, loff_t *, size_t, unsigned int);
extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
struct file *out, loff_t *, size_t len, unsigned int flags);
extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
loff_t *opos, size_t len, unsigned int flags);
......
......@@ -210,6 +210,7 @@ struct proto_ops {
int offset, size_t size, int flags);
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
void (*splice_eof)(struct socket *sock);
int (*set_peek_off)(struct sock *sk, int val);
int (*peek_len)(struct socket *sock);
......
......@@ -339,7 +339,9 @@ struct ucred {
#endif
/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */
#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES)
#define MSG_INTERNAL_SENDMSG_FLAGS \
(MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_NOTLAST | \
MSG_SENDPAGE_DECRYPTED)
/* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
#define SOL_IP 0
......
......@@ -38,6 +38,7 @@ struct splice_desc {
struct file *file; /* file to read/write */
void *data; /* cookie */
} u;
void (*splice_eof)(struct splice_desc *sd); /* Unexpected EOF handler */
loff_t pos; /* file position */
loff_t *opos; /* sendfile: output position */
size_t num_spliced; /* number of bytes already spliced */
......@@ -84,6 +85,8 @@ extern long do_splice(struct file *in, loff_t *off_in,
extern long do_tee(struct file *in, struct file *out, size_t len,
unsigned int flags);
extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags);
/*
* for dynamic pipe sizing
......
......@@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock,
struct sock *newsk);
int inet_send_prepare(struct sock *sk);
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
void inet_splice_eof(struct socket *sock);
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags);
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
......
......@@ -1279,6 +1279,7 @@ struct proto {
size_t len, int flags, int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
void (*splice_eof)(struct socket *sock);
int (*bind)(struct sock *sk,
struct sockaddr *addr, int addr_len);
int (*bind_add)(struct sock *sk,
......
......@@ -327,6 +327,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
size_t size, struct ubuf_info *uarg);
void tcp_splice_eof(struct socket *sock);
int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
......
......@@ -278,6 +278,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
void udp_splice_eof(struct socket *sock);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
......
......@@ -831,6 +831,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}
EXPORT_SYMBOL(inet_sendmsg);
void inet_splice_eof(struct socket *sock)
{
const struct proto *prot;
struct sock *sk = sock->sk;
if (unlikely(inet_send_prepare(sk)))
return;
/* IPV6_ADDRFORM can change sk->sk_prot under us. */
prot = READ_ONCE(sk->sk_prot);
if (prot->splice_eof)
prot->splice_eof(sock);
}
EXPORT_SYMBOL_GPL(inet_splice_eof);
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags)
{
......@@ -1050,6 +1065,7 @@ const struct proto_ops inet_stream_ops = {
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
.splice_eof = inet_splice_eof,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
......@@ -1084,6 +1100,7 @@ const struct proto_ops inet_dgram_ops = {
.read_skb = udp_read_skb,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.splice_eof = inet_splice_eof,
.sendpage = inet_sendpage,
.set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
......@@ -1115,6 +1132,7 @@ static const struct proto_ops inet_sockraw_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.splice_eof = inet_splice_eof,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet_compat_ioctl,
......
......@@ -1371,6 +1371,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
EXPORT_SYMBOL(tcp_sendmsg);
void tcp_splice_eof(struct socket *sock)
{
struct sock *sk = sock->sk;
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
if (!tcp_write_queue_tail(sk))
return;
lock_sock(sk);
mss_now = tcp_send_mss(sk, &size_goal, 0);
tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
release_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_splice_eof);
/*
* Handle reading urgent data. BSD has very simple semantics for
* this, no blocking and very strange errors 8)
......
......@@ -3116,6 +3116,7 @@ struct proto tcp_prot = {
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.splice_eof = tcp_splice_eof,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
......
......@@ -1324,6 +1324,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
EXPORT_SYMBOL(udp_sendmsg);
void udp_splice_eof(struct socket *sock)
{
struct sock *sk = sock->sk;
struct udp_sock *up = udp_sk(sk);
if (!up->pending || READ_ONCE(up->corkflag))
return;
lock_sock(sk);
if (up->pending && !READ_ONCE(up->corkflag))
udp_push_pending_frames(sk);
release_sock(sk);
}
EXPORT_SYMBOL_GPL(udp_splice_eof);
int udp_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
......@@ -2918,6 +2933,7 @@ struct proto udp_prot = {
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.splice_eof = udp_splice_eof,
.sendpage = udp_sendpage,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
......
......@@ -695,6 +695,7 @@ const struct proto_ops inet6_stream_ops = {
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
.splice_eof = inet_splice_eof,
.sendpage = inet_sendpage,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
......
......@@ -2150,6 +2150,7 @@ struct proto tcpv6_prot = {
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.splice_eof = tcp_splice_eof,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v6_do_rcv,
.release_cb = tcp_release_cb,
......
......@@ -1653,6 +1653,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
EXPORT_SYMBOL(udpv6_sendmsg);
static void udpv6_splice_eof(struct socket *sock)
{
struct sock *sk = sock->sk;
struct udp_sock *up = udp_sk(sk);
if (!up->pending || READ_ONCE(up->corkflag))
return;
lock_sock(sk);
if (up->pending && !READ_ONCE(up->corkflag))
udp_v6_push_pending_frames(sk);
release_sock(sk);
}
void udpv6_destroy_sock(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
......@@ -1764,6 +1778,7 @@ struct proto udpv6_prot = {
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
.splice_eof = udpv6_splice_eof,
.release_cb = ip6_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
......
......@@ -968,6 +968,19 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
return err;
}
static void kcm_splice_eof(struct socket *sock)
{
struct sock *sk = sock->sk;
struct kcm_sock *kcm = kcm_sk(sk);
if (skb_queue_empty_lockless(&sk->sk_write_queue))
return;
lock_sock(sk);
kcm_write_msgs(kcm);
release_sock(sk);
}
static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
int offset, size_t size, int flags)
......@@ -1773,6 +1786,7 @@ static const struct proto_ops kcm_dgram_ops = {
.sendmsg = kcm_sendmsg,
.recvmsg = kcm_recvmsg,
.mmap = sock_no_mmap,
.splice_eof = kcm_splice_eof,
.sendpage = kcm_sendpage,
};
......@@ -1794,6 +1808,7 @@ static const struct proto_ops kcm_seqpacket_ops = {
.sendmsg = kcm_sendmsg,
.recvmsg = kcm_recvmsg,
.mmap = sock_no_mmap,
.splice_eof = kcm_splice_eof,
.sendpage = kcm_sendpage,
.splice_read = kcm_splice_read,
};
......
......@@ -57,6 +57,7 @@
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/thread_info.h>
......@@ -126,11 +127,10 @@ static long compat_sock_ioctl(struct file *file,
unsigned int cmd, unsigned long arg);
#endif
static int sock_fasync(int fd, struct file *filp, int on);
static ssize_t sock_sendpage(struct file *file, struct page *page,
int offset, size_t size, loff_t *ppos, int more);
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
static void sock_splice_eof(struct file *file);
#ifdef CONFIG_PROC_FS
static void sock_show_fdinfo(struct seq_file *m, struct file *f)
......@@ -162,9 +162,9 @@ static const struct file_operations socket_file_ops = {
.mmap = sock_mmap,
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_write = splice_to_socket,
.splice_read = sock_splice_read,
.splice_eof = sock_splice_eof,
.show_fdinfo = sock_show_fdinfo,
};
......@@ -1066,26 +1066,6 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
}
EXPORT_SYMBOL(kernel_recvmsg);
static ssize_t sock_sendpage(struct file *file, struct page *page,
int offset, size_t size, loff_t *ppos, int more)
{
struct socket *sock;
int flags;
int ret;
sock = file->private_data;
flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
flags |= more;
ret = kernel_sendpage(sock, page, offset, size, flags);
if (trace_sock_send_length_enabled())
call_trace_sock_send_length(sock->sk, ret, 0);
return ret;
}
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
......@@ -1098,6 +1078,14 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
return sock->ops->splice_read(sock, ppos, pipe, len, flags);
}
static void sock_splice_eof(struct file *file)
{
struct socket *sock = file->private_data;
if (sock->ops->splice_eof)
sock->ops->splice_eof(sock);
}
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
......
......@@ -97,6 +97,7 @@ void tls_update_rx_zc_capable(struct tls_context *tls_ctx);
void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
void tls_sw_strparser_done(struct tls_context *tls_ctx);
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
void tls_sw_splice_eof(struct socket *sock);
int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int tls_sw_sendpage(struct sock *sk, struct page *page,
......@@ -115,6 +116,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
size_t len, unsigned int flags);
int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
void tls_device_splice_eof(struct socket *sock);
int tls_device_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int tls_tx_records(struct sock *sk, int flags);
......
......@@ -422,16 +422,10 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i)
return 0;
}
union tls_iter_offset {
struct iov_iter *msg_iter;
int offset;
};
static int tls_push_data(struct sock *sk,
union tls_iter_offset iter_offset,
struct iov_iter *iter,
size_t size, int flags,
unsigned char record_type,
struct page *zc_page)
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_prot_info *prot = &tls_ctx->prot_info;
......@@ -447,7 +441,8 @@ static int tls_push_data(struct sock *sk,
long timeo;
if (flags &
~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST))
~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST |
MSG_SPLICE_PAGES))
return -EOPNOTSUPP;
if (unlikely(sk->sk_err))
......@@ -499,21 +494,35 @@ static int tls_push_data(struct sock *sk,
record = ctx->open_record;
copy = min_t(size_t, size, max_open_record_len - record->len);
if (copy && zc_page) {
if (copy && (flags & MSG_SPLICE_PAGES)) {
struct page_frag zc_pfrag;
struct page **pages = &zc_pfrag.page;
size_t off;
rc = iov_iter_extract_pages(iter, &pages,
copy, 1, 0, &off);
if (rc <= 0) {
if (rc == 0)
rc = -EIO;
goto handle_error;
}
copy = rc;
zc_pfrag.page = zc_page;
zc_pfrag.offset = iter_offset.offset;
if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) {
iov_iter_revert(iter, copy);
rc = -EIO;
goto handle_error;
}
zc_pfrag.offset = off;
zc_pfrag.size = copy;
tls_append_frag(record, &zc_pfrag, copy);
iter_offset.offset += copy;
} else if (copy) {
copy = min_t(size_t, copy, pfrag->size - pfrag->offset);
rc = tls_device_copy_data(page_address(pfrag->page) +
pfrag->offset, copy,
iter_offset.msg_iter);
iter);
if (rc)
goto handle_error;
tls_append_frag(record, pfrag, copy);
......@@ -568,9 +577,11 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
unsigned char record_type = TLS_RECORD_TYPE_DATA;
struct tls_context *tls_ctx = tls_get_ctx(sk);
union tls_iter_offset iter;
int rc;
if (!tls_ctx->zerocopy_sendfile)
msg->msg_flags &= ~MSG_SPLICE_PAGES;
mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
......@@ -580,8 +591,8 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
goto out;
}
iter.msg_iter = &msg->msg_iter;
rc = tls_push_data(sk, iter, size, msg->msg_flags, record_type, NULL);
rc = tls_push_data(sk, &msg->msg_iter, size, msg->msg_flags,
record_type);
out:
release_sock(sk);
......@@ -589,47 +600,42 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
return rc;
}
int tls_device_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
union tls_iter_offset iter_offset;
struct iov_iter msg_iter;
char *kaddr;
struct kvec iov;
int rc;
if (flags & MSG_SENDPAGE_NOTLAST)
flags |= MSG_MORE;
mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
if (flags & MSG_OOB) {
rc = -EOPNOTSUPP;
goto out;
}
if (tls_ctx->zerocopy_sendfile) {
iter_offset.offset = offset;
rc = tls_push_data(sk, iter_offset, size,
flags, TLS_RECORD_TYPE_DATA, page);
goto out;
}
kaddr = kmap(page);
iov.iov_base = kaddr + offset;
iov.iov_len = size;
iov_iter_kvec(&msg_iter, ITER_SOURCE, &iov, 1, size);
iter_offset.msg_iter = &msg_iter;
rc = tls_push_data(sk, iter_offset, size, flags, TLS_RECORD_TYPE_DATA,
NULL);
kunmap(page);
out:
release_sock(sk);
mutex_unlock(&tls_ctx->tx_lock);
return rc;
}
void tls_device_splice_eof(struct socket *sock)
{
struct sock *sk = sock->sk;
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct iov_iter iter = {};
if (!tls_is_partially_sent_record(tls_ctx))
return;
mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
if (tls_is_partially_sent_record(tls_ctx)) {
iov_iter_bvec(&iter, ITER_SOURCE, NULL, 0, 0);
tls_push_data(sk, &iter, 0, 0, TLS_RECORD_TYPE_DATA);
}
release_sock(sk);
mutex_unlock(&tls_ctx->tx_lock);
}
int tls_device_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
struct bio_vec bvec;
struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };
if (flags & MSG_SENDPAGE_NOTLAST)
msg.msg_flags |= MSG_MORE;
if (flags & MSG_OOB)
return -EOPNOTSUPP;
bvec_set_page(&bvec, page, size, offset);
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
return tls_device_sendmsg(sk, &msg, size);
}
struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
......@@ -694,12 +700,10 @@ EXPORT_SYMBOL(tls_get_record);
static int tls_device_push_pending_record(struct sock *sk, int flags)
{
union tls_iter_offset iter;
struct iov_iter msg_iter;
struct iov_iter iter;
iov_iter_kvec(&msg_iter, ITER_SOURCE, NULL, 0, 0);
iter.msg_iter = &msg_iter;
return tls_push_data(sk, iter, 0, flags, TLS_RECORD_TYPE_DATA, NULL);
iov_iter_kvec(&iter, ITER_SOURCE, NULL, 0, 0);
return tls_push_data(sk, &iter, 0, flags, TLS_RECORD_TYPE_DATA);
}
void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
......
......@@ -957,6 +957,7 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG]
ops[TLS_BASE][TLS_BASE] = *base;
ops[TLS_SW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE];
ops[TLS_SW ][TLS_BASE].splice_eof = tls_sw_splice_eof;
ops[TLS_SW ][TLS_BASE].sendpage_locked = tls_sw_sendpage_locked;
ops[TLS_BASE][TLS_SW ] = ops[TLS_BASE][TLS_BASE];
......@@ -1027,6 +1028,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg;
prot[TLS_SW][TLS_BASE].splice_eof = tls_sw_splice_eof;
prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
......@@ -1042,10 +1044,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
#ifdef CONFIG_TLS_DEVICE
prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg;
prot[TLS_HW][TLS_BASE].splice_eof = tls_device_splice_eof;
prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage;
prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
prot[TLS_HW][TLS_SW].splice_eof = tls_device_splice_eof;
prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW];
......
......@@ -931,7 +931,37 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags)
&copied, flags);
}
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
static int tls_sw_sendmsg_splice(struct sock *sk, struct msghdr *msg,
struct sk_msg *msg_pl, size_t try_to_copy,
ssize_t *copied)
{
struct page *page = NULL, **pages = &page;
do {
ssize_t part;
size_t off;
part = iov_iter_extract_pages(&msg->msg_iter, &pages,
try_to_copy, 1, 0, &off);
if (part <= 0)
return part ?: -EIO;
if (WARN_ON_ONCE(!sendpage_ok(page))) {
iov_iter_revert(&msg->msg_iter, part);
return -EIO;
}
sk_msg_page_add(msg_pl, page, part, off);
sk_mem_charge(sk, part);
*copied += part;
try_to_copy -= part;
} while (try_to_copy && !sk_msg_full(msg_pl));
return 0;
}
static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
size_t size)
{
long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
struct tls_context *tls_ctx = tls_get_ctx(sk);
......@@ -954,15 +984,6 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
int ret = 0;
int pending;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_CMSG_COMPAT))
return -EOPNOTSUPP;
ret = mutex_lock_interruptible(&tls_ctx->tx_lock);
if (ret)
return ret;
lock_sock(sk);
if (unlikely(msg->msg_controllen)) {
ret = tls_process_cmsg(sk, msg, &record_type);
if (ret) {
......@@ -1020,6 +1041,17 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
full_record = true;
}
if (try_to_copy && (msg->msg_flags & MSG_SPLICE_PAGES)) {
ret = tls_sw_sendmsg_splice(sk, msg, msg_pl,
try_to_copy, &copied);
if (ret < 0)
goto send_end;
tls_ctx->pending_open_record_frags = true;
if (full_record || eor || sk_msg_full(msg_pl))
goto copied;
continue;
}
if (!is_kvec && (full_record || eor) && !async_capable) {
u32 first = msg_pl->sg.end;
......@@ -1084,6 +1116,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
*/
tls_ctx->pending_open_record_frags = true;
copied += try_to_copy;
copied:
if (full_record || eor) {
ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
record_type, &copied,
......@@ -1151,157 +1184,136 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
send_end:
ret = sk_stream_error(sk, msg->msg_flags, ret);
return copied > 0 ? copied : ret;
}
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
int ret;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_CMSG_COMPAT | MSG_SPLICE_PAGES |
MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY))
return -EOPNOTSUPP;
ret = mutex_lock_interruptible(&tls_ctx->tx_lock);
if (ret)
return ret;
lock_sock(sk);
ret = tls_sw_sendmsg_locked(sk, msg, size);
release_sock(sk);
mutex_unlock(&tls_ctx->tx_lock);
return copied > 0 ? copied : ret;
return ret;
}
static int tls_sw_do_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_prot_info *prot = &tls_ctx->prot_info;
unsigned char record_type = TLS_RECORD_TYPE_DATA;
struct sk_msg *msg_pl;
struct tls_rec *rec;
int num_async = 0;
ssize_t copied = 0;
bool full_record;
int record_room;
int ret = 0;
bool eor;
eor = !(flags & MSG_SENDPAGE_NOTLAST);
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
/* Call the sk_stream functions to manage the sndbuf mem. */
while (size > 0) {
size_t copy, required_size;
if (sk->sk_err) {
ret = -sk->sk_err;
goto sendpage_end;
}
if (ctx->open_rec)
rec = ctx->open_rec;
else
rec = ctx->open_rec = tls_get_rec(sk);
if (!rec) {
ret = -ENOMEM;
goto sendpage_end;
}
msg_pl = &rec->msg_plaintext;
full_record = false;
record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
copy = size;
if (copy >= record_room) {
copy = record_room;
full_record = true;
}
required_size = msg_pl->sg.size + copy + prot->overhead_size;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
alloc_payload:
ret = tls_alloc_encrypted_msg(sk, required_size);
if (ret) {
if (ret != -ENOSPC)
goto wait_for_memory;
/* Adjust copy according to the amount that was
 * actually allocated. The difference is due
 * to max sg elements limit
 */
copy -= required_size - msg_pl->sg.size;
full_record = true;
}
sk_msg_page_add(msg_pl, page, copy, offset);
sk_mem_charge(sk, copy);
offset += copy;
size -= copy;
copied += copy;
tls_ctx->pending_open_record_frags = true;
if (full_record || eor || sk_msg_full(msg_pl)) {
ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
record_type, &copied, flags);
if (ret) {
if (ret == -EINPROGRESS)
num_async++;
else if (ret == -ENOMEM)
goto wait_for_memory;
else if (ret != -EAGAIN) {
if (ret == -ENOSPC)
ret = 0;
goto sendpage_end;
}
}
}
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
ret = sk_stream_wait_memory(sk, &timeo);
if (ret) {
if (ctx->open_rec)
tls_trim_both_msgs(sk, msg_pl->sg.size);
goto sendpage_end;
}
if (ctx->open_rec)
goto alloc_payload;
}
if (num_async) {
/* Transmit if any encryptions have completed */
if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
cancel_delayed_work(&ctx->tx_work.work);
tls_tx_records(sk, flags);
}
}
sendpage_end:
ret = sk_stream_error(sk, flags, ret);
return copied > 0 ? copied : ret;
}
/*
 * Handle unexpected EOF during splice without SPLICE_F_MORE set.
 */
void tls_sw_splice_eof(struct socket *sock)
{
struct sock *sk = sock->sk;
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec;
struct sk_msg *msg_pl;
ssize_t copied = 0;
bool retrying = false;
int ret = 0;
int pending;
if (!ctx->open_rec)
return;
mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
retry:
rec = ctx->open_rec;
if (!rec)
goto unlock;
msg_pl = &rec->msg_plaintext;
/* Check the BPF advisor and perform transmission. */
ret = bpf_exec_tx_verdict(msg_pl, sk, false, TLS_RECORD_TYPE_DATA,
&copied, 0);
switch (ret) {
case 0:
case -EAGAIN:
if (retrying)
goto unlock;
retrying = true;
goto retry;
case -EINPROGRESS:
break;
default:
goto unlock;
}
/* Wait for pending encryptions to get completed */
spin_lock_bh(&ctx->encrypt_compl_lock);
ctx->async_notify = true;
pending = atomic_read(&ctx->encrypt_pending);
spin_unlock_bh(&ctx->encrypt_compl_lock);
if (pending)
crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
else
reinit_completion(&ctx->async_wait.completion);
/* There can be no concurrent accesses, since we have no pending
 * encrypt operations
 */
WRITE_ONCE(ctx->async_notify, false);
if (ctx->async_wait.err)
goto unlock;
/* Transmit if any encryptions have completed */
if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
cancel_delayed_work(&ctx->tx_work.work);
tls_tx_records(sk, 0);
}
unlock:
release_sock(sk);
mutex_unlock(&tls_ctx->tx_lock);
}
int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
struct bio_vec bvec;
struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };
if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY |
MSG_NO_SHARED_FRAGS))
return -EOPNOTSUPP;
if (flags & MSG_SENDPAGE_NOTLAST)
msg.msg_flags |= MSG_MORE;
return tls_sw_do_sendpage(sk, page, offset, size, flags);
bvec_set_page(&bvec, page, size, offset);
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
return tls_sw_sendmsg_locked(sk, &msg, size);
}
int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
int ret;
struct bio_vec bvec;
struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };
if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY))
return -EOPNOTSUPP;
if (flags & MSG_SENDPAGE_NOTLAST)
msg.msg_flags |= MSG_MORE;
ret = mutex_lock_interruptible(&tls_ctx->tx_lock);
if (ret)
return ret;
lock_sock(sk);
ret = tls_sw_do_sendpage(sk, page, offset, size, flags);
release_sock(sk);
mutex_unlock(&tls_ctx->tx_lock);
return ret;
bvec_set_page(&bvec, page, size, offset);
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
return tls_sw_sendmsg(sk, &msg, size);
}
static int
......