Merge branch 'tcp-rx-tx-cache'

Eric Dumazet says: ==================== tcp: add rx/tx cache to reduce lock contention On hosts with many cpus we can observe a very serious contention on spinlocks used in mm slab layer. The following can happen quite often : 1) TX path sendmsg() allocates one (fclone) skb on CPU A, sends a clone. ACK is received on CPU B, and consumes the skb that was in the retransmit queue. 2) RX path network driver allocates skb on CPU C recvmsg() happens on CPU D, freeing the skb after it has been delivered to user space. In both cases, we are hitting the asymetric alloc/free pattern for which slab has to drain alien caches. At 8 Mpps per second, this represents 16 Mpps alloc/free per second and has a huge penalty. In an interesting experiment, I tried to use a single kmem_cache for all the skbs (in skb_init() : skbuff_fclone_cache = skbuff_head_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones),); qnd most of the contention disappeared, since cpus could better use their local slab per-cpu cache. But we can do actually better, in the following patches. TX : at ACK time, no longer free the skb but put it back in a tcp socket cache, so that next sendmsg() can reuse it immediately. RX : at recvmsg() time, do not free the skb but put it in a tcp socket cache so that it can be freed by the cpu feeding the incoming packets in BH. This increased the performance of small RPC benchmark by about 10 % on a host with 112 hyperthreads. v2 : - Solved a race condition : sk_stream_alloc_skb() to make sure the prior clone has been freed. - Really test rps_needed in sk_eat_skb() as claimed. - Fixed rps_needed use in drivers/net/tun.c v3: Added a #ifdef CONFIG_RPS, to avoid compile error (kbuild robot) ==================== Signed-off-by: David S. Miller <davem@davemloft.net>

Merge branch 'tcp-rx-tx-cache'
Eric Dumazet says: ==================== tcp: add rx/tx cache to reduce lock contention On hosts with many cpus we can observe a very serious contention on spinlocks used in mm slab layer. The following can happen quite often : 1) TX path sendmsg() allocates one (fclone) skb on CPU A, sends a clone. ACK is received on CPU B, and consumes the skb that was in the retransmit queue. 2) RX path network driver allocates skb on CPU C recvmsg() happens on CPU D, freeing the skb after it has been delivered to user space. In both cases, we are hitting the asymetric alloc/free pattern for which slab has to drain alien caches. At 8 Mpps per second, this represents 16 Mpps alloc/free per second and has a huge penalty. In an interesting experiment, I tried to use a single kmem_cache for all the skbs (in skb_init() : skbuff_fclone_cache = skbuff_head_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones),); qnd most of the contention disappeared, since cpus could better use their local slab per-cpu cache. But we can do actually better, in the following patches. TX : at ACK time, no longer free the skb but put it back in a tcp socket cache, so that next sendmsg() can reuse it immediately. RX : at recvmsg() time, do not free the skb but put it in a tcp socket cache so that it can be freed by the cpu feeding the incoming packets in BH. This increased the performance of small RPC benchmark by about 10 % on a host with 112 hyperthreads. v2 : - Solved a race condition : sk_stream_alloc_skb() to make sure the prior clone has been freed. - Really test rps_needed in sk_eat_skb() as claimed. - Fixed rps_needed use in drivers/net/tun.c v3: Added a #ifdef CONFIG_RPS, to avoid compile error (kbuild robot) ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
bdaba895 · David S. Miller · 7c1508e5 · 8b27dae5 · bdaba895 · bdaba895
Commit bdaba895 authored Mar 23, 2019 by David S. Miller
10 changed files
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1042,7 +1042,7 @@ static int tun_net_close(struct net_device *dev)
 static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-	if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
+	if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
 		/* Select queue was not called for the skbuff, so we extract the
 		 * RPS hash and save it into the flow_table here.
 		 */

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -194,8 +194,8 @@ struct net_device_stats {

 #ifdef CONFIG_RPS
 #include <linux/static_key.h>
-extern struct static_key rps_needed;
-extern struct static_key rfs_needed;
+extern struct static_key_false rps_needed;
+extern struct static_key_false rfs_needed;
 #endif

 struct neighbour;

--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -368,6 +368,7 @@ struct sock {
 	atomic_t		sk_drops;
 	int			sk_rcvlowat;
 	struct sk_buff_head	sk_error_queue;
+	struct sk_buff		*sk_rx_skb_cache;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -414,6 +415,7 @@ struct sock {
 		struct sk_buff	*sk_send_head;
 		struct rb_root	tcp_rtx_queue;
 	};
+	struct sk_buff		*sk_tx_skb_cache;
 	struct sk_buff_head	sk_write_queue;
 	__s32			sk_peek_off;
 	int			sk_write_pending;
@@ -966,7 +968,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	if (static_key_false(&rfs_needed)) {
+	if (static_branch_unlikely(&rfs_needed)) {
 		/* Reading sk->sk_rxhash might incur an expensive cache line
 		 * miss.
 		 *
@@ -1463,6 +1465,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)

 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
+	if (!sk->sk_tx_skb_cache) {
+		sk->sk_tx_skb_cache = skb;
+		return;
+	}
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
@@ -2433,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (
+#ifdef CONFIG_RPS
+	    !static_branch_unlikely(&rps_needed) &&
+#endif
+	    !sk->sk_rx_skb_cache) {
+		sk->sk_rx_skb_cache = skb;
+		skb_orphan(skb);
+		return;
+	}
 	__kfree_skb(skb);
 }


--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3982,9 +3982,9 @@ EXPORT_SYMBOL(rps_sock_flow_table);
 u32 rps_cpu_mask __read_mostly;
 EXPORT_SYMBOL(rps_cpu_mask);

-struct static_key rps_needed __read_mostly;
+struct static_key_false rps_needed __read_mostly;
 EXPORT_SYMBOL(rps_needed);
-struct static_key rfs_needed __read_mostly;
+struct static_key_false rfs_needed __read_mostly;
 EXPORT_SYMBOL(rfs_needed);

 static struct rps_dev_flow *
@@ -4510,7 +4510,7 @@ static int netif_rx_internal(struct sk_buff *skb)
 	}

 #ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu;

@@ -5179,7 +5179,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)

 	rcu_read_lock();
 #ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

@@ -5227,7 +5227,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)

 	rcu_read_lock();
 #ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
 		list_for_each_entry_safe(skb, next, head, list) {
 			struct rps_dev_flow voidflow, *rflow = &voidflow;
 			int cpu = get_rps_cpu(skb->dev, skb, &rflow);

--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -754,9 +754,9 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
 	rcu_assign_pointer(queue->rps_map, map);

 	if (map)
-		static_key_slow_inc(&rps_needed);
+		static_branch_inc(&rps_needed);
 	if (old_map)
-		static_key_slow_dec(&rps_needed);
+		static_branch_dec(&rps_needed);

 	mutex_unlock(&rps_map_mutex);


--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -95,12 +95,12 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 		if (sock_table != orig_sock_table) {
 			rcu_assign_pointer(rps_sock_flow_table, sock_table);
 			if (sock_table) {
-				static_key_slow_inc(&rps_needed);
-				static_key_slow_inc(&rfs_needed);
+				static_branch_inc(&rps_needed);
+				static_branch_inc(&rfs_needed);
 			}
 			if (orig_sock_table) {
-				static_key_slow_dec(&rps_needed);
-				static_key_slow_dec(&rfs_needed);
+				static_branch_dec(&rps_needed);
+				static_branch_dec(&rfs_needed);
 				synchronize_rcu();
 				vfree(orig_sock_table);
 			}

--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);

 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	__skb_queue_purge(&sk->sk_error_queue);

 	sk_mem_reclaim(sk);

--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -865,6 +865,21 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 {
 	struct sk_buff *skb;

+	skb = sk->sk_tx_skb_cache;
+	if (skb && !size) {
+		const struct sk_buff_fclones *fclones;
+
+		fclones = container_of(skb, struct sk_buff_fclones, skb1);
+		if (refcount_read(&fclones->fclone_ref) == 1) {
+			sk->sk_wmem_queued -= skb->truesize;
+			sk_mem_uncharge(sk, skb->truesize);
+			skb->truesize -= skb->data_len;
+			sk->sk_tx_skb_cache = NULL;
+			pskb_trim(skb, 0);
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+			return skb;
+		}
+	}
 	/* The TCP header must be at least 32-bit aligned.  */
 	size = ALIGN(size, 4);

@@ -1098,30 +1113,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 }
 EXPORT_SYMBOL(tcp_sendpage);

-/* Do not bother using a page frag for very small frames.
- * But use this heuristic only for the first skb in write queue.
- *
- * Having no payload in skb->head allows better SACK shifting
- * in tcp_shift_skb_data(), reducing sack/rack overhead, because
- * write queue has less skbs.
- * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
- * This also speeds up tso_fragment(), since it wont fallback
- * to tcp_fragment().
- */
-static int linear_payload_sz(bool first_skb)
-{
-	if (first_skb)
-		return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
-	return 0;
-}
-
-static int select_size(bool first_skb, bool zc)
-{
-	if (zc)
-		return 0;
-	return linear_payload_sz(first_skb);
-}
-
 void tcp_free_fastopen_req(struct tcp_sock *tp)
 {
 	if (tp->fastopen_req) {
@@ -1272,7 +1263,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)

 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
-			int linear;

 new_segment:
 			if (!sk_stream_memory_free(sk))
@@ -1283,8 +1273,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 				goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
-			linear = select_size(first_skb, zc);
-			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
+			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
 				goto wait_for_memory;
@@ -2552,6 +2541,13 @@ void tcp_write_queue_purge(struct sock *sk)
 		sk_wmem_free_skb(sk, skb);
 	}
 	tcp_rtx_queue_purge(sk);
+	skb = sk->sk_tx_skb_cache;
+	if (skb) {
+		sk->sk_wmem_queued -= skb->truesize;
+		sk_mem_uncharge(sk, skb->truesize);
+		__kfree_skb(skb);
+		sk->sk_tx_skb_cache = NULL;
+	}
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2587,6 +2583,10 @@ int tcp_disconnect(struct sock *sk, int flags)

 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	tp->copied_seq = tp->rcv_nxt;
 	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);

--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1774,6 +1774,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct sk_buff *skb_to_free;
 	int sdif = inet_sdif(skb);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
@@ -1905,11 +1906,17 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v4_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
+	} else {
+		if (tcp_add_backlog(sk, skb))
 			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);

 put_and_return:
 	if (refcounted)

--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1436,6 +1436,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,

 static int tcp_v6_rcv(struct sk_buff *skb)
 {
+	struct sk_buff *skb_to_free;
 	int sdif = inet6_sdif(skb);
 	const struct tcphdr *th;
 	const struct ipv6hdr *hdr;
@@ -1562,12 +1563,17 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v6_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
+	} else {
+		if (tcp_add_backlog(sk, skb))
 			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
-
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 put_and_return:
 	if (refcounted)
 		sock_put(sk);