Commit 8b27dae5 authored by Eric Dumazet, committed by David S. Miller

tcp: add one skb cache for rx

Oftentimes, recvmsg() system calls and BH handling for a particular
TCP socket are done on different cpus.

This means an incoming skb is allocated on one cpu
but freed on another.

This incurs high spinlock contention in the slab layer for small RPCs,
and a high number of cache line ping-pongs for larger packets.

A full-size GRO packet might use 45 page fragments, meaning
that up to 45 put_page() calls can be involved.
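
(Rough, assumed arithmetic behind that figure: a maximally sized
~64 KB GRO skb carrying 1448-byte MSS segments, each in its own
page fragment, gives 65535 / 1448 ≈ 45.)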

Moreover, performing the __kfree_skb() in the recvmsg() context
adds latency for user applications, and increases the probability
of trapping them in backlog processing, since the BH handler
might find the socket owned by the user.

This patch, combined with the prior one, increases RPC
performance by about 10% on servers with a large number of cores.

(a tcp_rr workload with 10,000 flows and 112 threads reaches 9 Mpps
 instead of 8 Mpps)

This also increases single bulk flow performance on 40Gbit+ links,
since in this case there are often two cpus working in tandem:

 - the CPU handling the NIC rx interrupts, feeding the receive queue,
   and (after this patch) freeing the skbs that were consumed.

 - the CPU in the recvmsg() system call, essentially 100% busy copying
   data out to user space.

Having at most one skb in a per-socket cache carries very little risk
of memory exhaustion, and since it is protected by the socket lock,
its management is essentially free.
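
Purely as an illustration (a user-space analogue with hypothetical names,
not the kernel code), the handoff pattern looks roughly like this: the
consuming thread parks the buffer it just finished with in a one-slot
cache instead of freeing it, and the allocating thread frees that parked
buffer on its next pass, so malloc() and free() stay on the same thread,
much like skb allocation and __kfree_skb() stay on the same cpu here.

/* one_slot_cache.c -- hedged user-space sketch, not kernel code */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

enum { ROUNDS = 100000 };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* plays the socket lock */
static char *rx_slot;   /* one-element stand-in for sk_receive_queue */
static char *rx_cache;  /* stand-in for sk_rx_skb_cache */

static void *alloc_side(void *arg)   /* like the BH handler on the NIC cpu */
{
	for (int i = 0; i < ROUNDS; i++) {
		char *buf = malloc(2048);

		for (;;) {
			pthread_mutex_lock(&lock);
			if (rx_cache) {         /* reclaim what the consumer parked: */
				free(rx_cache); /* the free runs on the allocating thread */
				rx_cache = NULL;
			}
			if (!rx_slot) {         /* hand the new buffer over */
				rx_slot = buf;
				pthread_mutex_unlock(&lock);
				break;
			}
			pthread_mutex_unlock(&lock);
			sched_yield();          /* consumer has not drained the slot yet */
		}
	}
	return NULL;
}

static void *consume_side(void *arg) /* like recvmsg(): use the buffer, then park it */
{
	for (int i = 0; i < ROUNDS; i++) {
		for (;;) {
			pthread_mutex_lock(&lock);
			char *buf = rx_slot;

			if (buf) {
				rx_slot = NULL;
				if (!rx_cache)
					rx_cache = buf; /* park instead of free, as sk_eat_skb() does */
				else
					free(buf);      /* cache already occupied: free locally */
				pthread_mutex_unlock(&lock);
				break;
			}
			pthread_mutex_unlock(&lock);
			sched_yield();
		}
	}
	return NULL;
}

int main(void)
{
	pthread_t a, c;

	pthread_create(&a, NULL, alloc_side, NULL);
	pthread_create(&c, NULL, consume_side, NULL);
	pthread_join(a, NULL);
	pthread_join(c, NULL);
	free(rx_cache); /* a last parked buffer may remain */
	puts("done");
	return 0;
}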

Note that if rps/rfs is used, we do not enable this feature, because
there is a high chance that the same cpu is handling both the recvmsg()
system call and the TCP rx path, while another cpu did the skb
allocations in the device driver right before the RPS/RFS logic.

To properly handle this case, it seems we would need to record
on which cpu the skb was allocated, and use a different channel
to give skbs back to that cpu.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 472c2e07
@@ -368,6 +368,7 @@ struct sock {
 	atomic_t		sk_drops;
 	int			sk_rcvlowat;
 	struct sk_buff_head	sk_error_queue;
+	struct sk_buff		*sk_rx_skb_cache;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -2438,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (
+#ifdef CONFIG_RPS
+	    !static_branch_unlikely(&rps_needed) &&
+#endif
+	    !sk->sk_rx_skb_cache) {
+		sk->sk_rx_skb_cache = skb;
+		skb_orphan(skb);
+		return;
+	}
 	__kfree_skb(skb);
 }
...
@@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	__skb_queue_purge(&sk->sk_error_queue);
 	sk_mem_reclaim(sk);
...
@@ -2583,6 +2583,10 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	tp->copied_seq = tp->rcv_nxt;
 	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);
...
@@ -1774,6 +1774,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct sk_buff *skb_to_free;
 	int sdif = inet_sdif(skb);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
@@ -1905,11 +1906,17 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v4_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 put_and_return:
 	if (refcounted)
...
@@ -1436,6 +1436,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
+	struct sk_buff *skb_to_free;
 	int sdif = inet6_sdif(skb);
 	const struct tcphdr *th;
 	const struct ipv6hdr *hdr;
@@ -1562,12 +1563,17 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v6_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 put_and_return:
 	if (refcounted)
 		sock_put(sk);
...