Commit 18fb76ed authored by Arjun Roy's avatar Arjun Roy Committed by Jakub Kicinski

net-zerocopy: Copy straggler unaligned data for TCP Rx. zerocopy.

When TCP receive zerocopy does not successfully map the entire
requested space, it outputs a 'hint' that the caller should recvmsg().

Augment zerocopy to accept a user buffer that it tries to copy this
hint into - if it is possible to copy the entire hint, it will do so.
This elides a recvmsg() call for received traffic that isn't exactly
page-aligned in size.

This was tested with RPC-style traffic of arbitrary sizes. Normally,
each received message required at least one getsockopt() call, and one
recvmsg() call for the remaining unaligned data.

With this change, almost all of the recvmsg() calls are eliminated,
leading to a savings of about 25%-50% in number of system calls
for RPC-style workloads.
Signed-off-by: default avatarArjun Roy <arjunroy@google.com>
Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarSoheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent 4be986c8
...@@ -349,5 +349,7 @@ struct tcp_zerocopy_receive { ...@@ -349,5 +349,7 @@ struct tcp_zerocopy_receive {
__u32 recv_skip_hint; /* out: amount of bytes to skip */ __u32 recv_skip_hint; /* out: amount of bytes to skip */
__u32 inq; /* out: amount of bytes in read queue */ __u32 inq; /* out: amount of bytes in read queue */
__s32 err; /* out: socket error */ __s32 err; /* out: socket error */
__u64 copybuf_address; /* in: copybuf address (small reads) */
__s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
}; };
#endif /* _UAPI_LINUX_TCP_H */ #endif /* _UAPI_LINUX_TCP_H */
...@@ -1758,6 +1758,52 @@ int tcp_mmap(struct file *file, struct socket *sock, ...@@ -1758,6 +1758,52 @@ int tcp_mmap(struct file *file, struct socket *sock,
} }
EXPORT_SYMBOL(tcp_mmap); EXPORT_SYMBOL(tcp_mmap);
static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
struct sk_buff *skb, u32 copylen,
u32 *offset, u32 *seq)
{
unsigned long copy_address = (unsigned long)zc->copybuf_address;
struct msghdr msg = {};
struct iovec iov;
int err;
if (copy_address != zc->copybuf_address)
return -EINVAL;
err = import_single_range(READ, (void __user *)copy_address,
copylen, &iov, &msg.msg_iter);
if (err)
return err;
err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
if (err)
return err;
zc->recv_skip_hint -= copylen;
*offset += copylen;
*seq += copylen;
return (__s32)copylen;
}
static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
struct sock *sk,
struct sk_buff *skb,
u32 *seq,
s32 copybuf_len)
{
u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
if (!copylen)
return 0;
/* skb is null if inq < PAGE_SIZE. */
if (skb)
offset = *seq - TCP_SKB_CB(skb)->seq;
else
skb = tcp_recv_skb(sk, *seq, &offset);
zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
seq);
return zc->copybuf_len < 0 ? 0 : copylen;
}
static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
struct page **pages, struct page **pages,
unsigned long pages_to_map, unsigned long pages_to_map,
...@@ -1791,8 +1837,10 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, ...@@ -1791,8 +1837,10 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
static int tcp_zerocopy_receive(struct sock *sk, static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc) struct tcp_zerocopy_receive *zc)
{ {
u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0;
unsigned long address = (unsigned long)zc->address; unsigned long address = (unsigned long)zc->address;
u32 length = 0, seq, offset, zap_len; s32 copybuf_len = zc->copybuf_len;
struct tcp_sock *tp = tcp_sk(sk);
#define PAGE_BATCH_SIZE 8 #define PAGE_BATCH_SIZE 8
struct page *pages[PAGE_BATCH_SIZE]; struct page *pages[PAGE_BATCH_SIZE];
const skb_frag_t *frags = NULL; const skb_frag_t *frags = NULL;
...@@ -1800,10 +1848,12 @@ static int tcp_zerocopy_receive(struct sock *sk, ...@@ -1800,10 +1848,12 @@ static int tcp_zerocopy_receive(struct sock *sk,
struct sk_buff *skb = NULL; struct sk_buff *skb = NULL;
unsigned long pg_idx = 0; unsigned long pg_idx = 0;
unsigned long curr_addr; unsigned long curr_addr;
struct tcp_sock *tp; u32 seq = tp->copied_seq;
int inq; int inq = tcp_inq(sk);
int ret; int ret;
zc->copybuf_len = 0;
if (address & (PAGE_SIZE - 1) || address != zc->address) if (address & (PAGE_SIZE - 1) || address != zc->address)
return -EINVAL; return -EINVAL;
...@@ -1812,8 +1862,6 @@ static int tcp_zerocopy_receive(struct sock *sk, ...@@ -1812,8 +1862,6 @@ static int tcp_zerocopy_receive(struct sock *sk,
sock_rps_record_flow(sk); sock_rps_record_flow(sk);
tp = tcp_sk(sk);
mmap_read_lock(current->mm); mmap_read_lock(current->mm);
vma = find_vma(current->mm, address); vma = find_vma(current->mm, address);
...@@ -1821,17 +1869,16 @@ static int tcp_zerocopy_receive(struct sock *sk, ...@@ -1821,17 +1869,16 @@ static int tcp_zerocopy_receive(struct sock *sk,
mmap_read_unlock(current->mm); mmap_read_unlock(current->mm);
return -EINVAL; return -EINVAL;
} }
zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
avail_len = min_t(u32, vma_len, inq);
seq = tp->copied_seq; aligned_len = avail_len & ~(PAGE_SIZE - 1);
inq = tcp_inq(sk); if (aligned_len) {
zc->length = min_t(u32, zc->length, inq); zap_page_range(vma, address, aligned_len);
zap_len = zc->length & ~(PAGE_SIZE - 1); zc->length = aligned_len;
if (zap_len) {
zap_page_range(vma, address, zap_len);
zc->recv_skip_hint = 0; zc->recv_skip_hint = 0;
} else { } else {
zc->recv_skip_hint = zc->length; zc->length = avail_len;
zc->recv_skip_hint = avail_len;
} }
ret = 0; ret = 0;
curr_addr = address; curr_addr = address;
...@@ -1900,13 +1947,18 @@ static int tcp_zerocopy_receive(struct sock *sk, ...@@ -1900,13 +1947,18 @@ static int tcp_zerocopy_receive(struct sock *sk,
} }
out: out:
mmap_read_unlock(current->mm); mmap_read_unlock(current->mm);
if (length) { /* Try to copy straggler data. */
if (!ret)
copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
copybuf_len);
if (length + copylen) {
WRITE_ONCE(tp->copied_seq, seq); WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk); tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */ /* Clean up data we have read: This will do ACK frames. */
tcp_recv_skb(sk, seq, &offset); tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, length); tcp_cleanup_rbuf(sk, length + copylen);
ret = 0; ret = 0;
if (length == zc->length) if (length == zc->length)
zc->recv_skip_hint = 0; zc->recv_skip_hint = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment