Commit 3763a24c authored by Arjun Roy, committed by David S. Miller

net-zerocopy: use vm_insert_pages() for tcp rcv zerocopy

Use vm_insert_pages() for tcp receive zerocopy.  Spin lock cycles (as
reported by perf) drop from a couple of percentage points to a fraction of
a percent.  This results in a roughly 6% increase in efficiency, measured
roughly as zerocopy receive count divided by CPU utilization.

The intention of this patchset is to reduce atomic ops for tcp zerocopy
receives, which normally hit the same spinlock multiple times
consecutively.
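
For illustration only (not part of this patch): the pattern being replaced
issues one vm_insert_page() call per page, while the batched path stages
page pointers in a small array and maps them all with a single
vm_insert_pages() call. A minimal sketch, assuming <linux/mm.h> and using
hypothetical helpers map_pages_one_by_one()/map_pages_batched() with
caller-provided vma, addr and pages[]:

	/* Before: one insertion (and the page-table locking it implies)
	 * per page.
	 */
	static int map_pages_one_by_one(struct vm_area_struct *vma,
					unsigned long addr,
					struct page **pages, unsigned long nr)
	{
		unsigned long i;
		int err;

		for (i = 0; i < nr; i++) {
			err = vm_insert_page(vma, addr + i * PAGE_SIZE,
					     pages[i]);
			if (err)
				return err;
		}
		return 0;
	}

	/* After: one vm_insert_pages() call for the whole batch.  On return,
	 * pages_remaining is the number of pages that were NOT mapped, so
	 * partial success is accounted for even when an error is returned.
	 */
	static int map_pages_batched(struct vm_area_struct *vma,
				     unsigned long addr,
				     struct page **pages, unsigned long nr,
				     unsigned long *bytes_mapped)
	{
		unsigned long pages_remaining = nr;
		int err;

		err = vm_insert_pages(vma, addr, pages, &pages_remaining);
		*bytes_mapped = PAGE_SIZE * (nr - pages_remaining);
		return err;
	}

In the patch itself the batch is flushed once PAGE_BATCH_SIZE pages have
been collected, again when an skb boundary is reached, and one final time
after the loop exits.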

[akpm@linux-foundation.org: suppress gcc-7.2.0 warning]
Link: http://lkml.kernel.org/r/20200128025958.43490-3-arjunroy.kdev@gmail.com
Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Cc: David Miller <davem@davemloft.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 0e6fbe39
net/ipv4/tcp.c
@@ -1742,14 +1742,48 @@ int tcp_mmap(struct file *file, struct socket *sock,
 }
 EXPORT_SYMBOL(tcp_mmap);
 
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+					struct page **pages,
+					unsigned long pages_to_map,
+					unsigned long *insert_addr,
+					u32 *length_with_pending,
+					u32 *seq,
+					struct tcp_zerocopy_receive *zc)
+{
+	unsigned long pages_remaining = pages_to_map;
+	int bytes_mapped;
+	int ret;
+
+	ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
+	bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+	/* Even if vm_insert_pages fails, it may have partially succeeded in
+	 * mapping (some but not all of the pages).
+	 */
+	*seq += bytes_mapped;
+	*insert_addr += bytes_mapped;
+	if (ret) {
+		/* But if vm_insert_pages did fail, we have to unroll some state
+		 * we speculatively touched before.
+		 */
+		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+		*length_with_pending -= bytes_not_mapped;
+		zc->recv_skip_hint += bytes_not_mapped;
+	}
+	return ret;
+}
+
 static int tcp_zerocopy_receive(struct sock *sk,
 				struct tcp_zerocopy_receive *zc)
 {
 	unsigned long address = (unsigned long)zc->address;
 	u32 length = 0, seq, offset, zap_len;
+	#define PAGE_BATCH_SIZE 8
+	struct page *pages[PAGE_BATCH_SIZE];
 	const skb_frag_t *frags = NULL;
 	struct vm_area_struct *vma;
 	struct sk_buff *skb = NULL;
+	unsigned long pg_idx = 0;
+	unsigned long curr_addr;
 	struct tcp_sock *tp;
 	int inq;
 	int ret;
@@ -1762,6 +1796,8 @@ static int tcp_zerocopy_receive(struct sock *sk,
 
 	sock_rps_record_flow(sk);
 
+	tp = tcp_sk(sk);
+
 	down_read(&current->mm->mmap_sem);
 
 	vma = find_vma(current->mm, address);
@@ -1771,7 +1807,6 @@ static int tcp_zerocopy_receive(struct sock *sk,
 	}
 	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
 
-	tp = tcp_sk(sk);
 	seq = tp->copied_seq;
 	inq = tcp_inq(sk);
 	zc->length = min_t(u32, zc->length, inq);
@@ -1783,8 +1818,20 @@ static int tcp_zerocopy_receive(struct sock *sk,
 		zc->recv_skip_hint = zc->length;
 	}
 	ret = 0;
+	curr_addr = address;
 	while (length + PAGE_SIZE <= zc->length) {
 		if (zc->recv_skip_hint < PAGE_SIZE) {
+			/* If we're here, finish the current batch. */
+			if (pg_idx) {
+				ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+								   pg_idx,
+								   &curr_addr,
+								   &length,
+								   &seq, zc);
+				if (ret)
+					goto out;
+				pg_idx = 0;
+			}
 			if (skb) {
 				if (zc->recv_skip_hint > 0)
 					break;
@@ -1793,7 +1840,6 @@ static int tcp_zerocopy_receive(struct sock *sk,
 			} else {
 				skb = tcp_recv_skb(sk, seq, &offset);
 			}
-
 			zc->recv_skip_hint = skb->len - offset;
 			offset -= skb_headlen(skb);
 			if ((int)offset < 0 || skb_has_frag_list(skb))
@@ -1817,14 +1863,24 @@ static int tcp_zerocopy_receive(struct sock *sk,
 			zc->recv_skip_hint -= remaining;
 			break;
 		}
-		ret = vm_insert_page(vma, address + length,
-				     skb_frag_page(frags));
-		if (ret)
-			break;
+		pages[pg_idx] = skb_frag_page(frags);
+		pg_idx++;
 		length += PAGE_SIZE;
-		seq += PAGE_SIZE;
 		zc->recv_skip_hint -= PAGE_SIZE;
 		frags++;
+		if (pg_idx == PAGE_BATCH_SIZE) {
+			ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+							   &curr_addr, &length,
+							   &seq, zc);
+			if (ret)
+				goto out;
+			pg_idx = 0;
+		}
+	}
+	if (pg_idx) {
+		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+						   &curr_addr, &length, &seq,
+						   zc);
 	}
 out:
 	up_read(&current->mm->mmap_sem);