Commit 133c4c0d authored by Eric Dumazet, committed by Paolo Abeni

tcp: defer regular ACK while processing socket backlog

This idea came up after a particular workload requested the quickack
attribute to be set on routes, and a performance drop was noticed
for large bulk transfers.

For high-throughput flows, it is best to use one cpu for the user
thread issuing socket system calls, and a separate cpu to process
incoming packets from BH context.
(With TSO/GRO, the bottleneck is usually the 'user' cpu.)

The problem is that the user thread can spend a lot of time holding
the socket lock, forcing the BH handler to queue most incoming
packets in the socket backlog.

Whenever the user thread releases the socket lock, it must first
process all packets accumulated in the backlog, potentially
adding latency spikes. Due to flood mitigation, having too many
packets in the backlog also increases the chance of unexpected drops.

Backlog processing unfortunately shifts a fair amount of cpu cycles
from the BH cpu to the 'user' cpu, thus reducing max throughput.
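
To make the lock/backlog handoff concrete, here is a rough userspace
model of the pattern described above (plain C with pthreads, not kernel
code; all names and numbers are illustrative, not kernel symbols): a
'BH' thread that finds the socket lock busy queues work to a backlog
counter, and the 'user' thread drains that backlog just before
releasing the lock, shifting those cycles onto its own cpu.

/* Userspace sketch of the socket-lock / backlog handoff (illustrative
 * only).  Build with: cc -O2 -pthread backlog_model.c -o backlog_model
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t sock_lock = PTHREAD_MUTEX_INITIALIZER;
static int backlog_len;			/* segments waiting for the user thread */
static long done_by_bh, done_by_user;

/* "BH" side: process a segment directly if the socket lock is free,
 * otherwise queue it to the backlog (the sk_add_backlog() role). */
static void *bh_thread(void *arg)
{
	(void)arg;
	for (int seg = 0; seg < 200000; seg++) {
		if (pthread_mutex_trylock(&sock_lock) == 0) {
			done_by_bh++;
			pthread_mutex_unlock(&sock_lock);
		} else {
			__atomic_fetch_add(&backlog_len, 1, __ATOMIC_RELAXED);
		}
	}
	return NULL;
}

/* "User" side: hold the lock for a while (the syscall), then drain the
 * backlog before releasing it (the __release_sock() role). */
static void *user_thread(void *arg)
{
	(void)arg;
	for (int call = 0; call < 200; call++) {
		pthread_mutex_lock(&sock_lock);
		usleep(100);		/* time spent in the syscall */
		done_by_user += __atomic_exchange_n(&backlog_len, 0,
						    __ATOMIC_RELAXED);
		pthread_mutex_unlock(&sock_lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t bh, user;

	pthread_create(&user, NULL, user_thread, NULL);
	pthread_create(&bh, NULL, bh_thread, NULL);
	pthread_join(bh, NULL);
	pthread_join(user, NULL);
	printf("segments handled in 'BH' context : %ld\n", done_by_bh);
	printf("segments shifted to the user cpu : %ld\n",
	       done_by_user + backlog_len);
	return 0;
}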

This patch takes advantage of backlog processing
and of the fact that ACKs are mostly cumulative.

The idea is to detect that we are in backlog processing
and to defer all eligible ACKs into a single one,
sent from tcp_release_cb().

This saves cpu cycles on both sides, and network resources.
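
Because ACKs are cumulative, one ACK sent after the whole backlog has
been drained acknowledges everything the drain consumed. The
back-of-the-envelope sketch below illustrates the arithmetic; the
per-drain packet count and drain rate are assumptions picked to roughly
match the figures reported below, not measurements.

/* Illustrative model of the ACK reduction, not kernel code. */
#include <stdio.h>

int main(void)
{
	/* Assumed numbers: a ~100Gbit flow of ~60KB GRO packets, with the
	 * user thread releasing the socket lock about 40,000 times/sec. */
	long drains_per_sec = 40000;	/* release_sock() calls per second */
	long pkts_per_drain = 6;	/* GRO packets found in the backlog */

	/* Before the patch: each GRO packet covers far more than 2*MSS,
	 * so an immediate ACK goes out for nearly every packet processed
	 * from the backlog. */
	long acks_classic = drains_per_sec * pkts_per_drain;

	/* With deferral: backlog processing only sets a flag; a single
	 * cumulative ACK goes out from tcp_release_cb() per drain. */
	long acks_deferred = drains_per_sec;

	printf("ACKs/sec without deferral: %ld\n", acks_classic);
	printf("ACKs/sec with deferral   : %ld\n", acks_deferred);
	return 0;
}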

Performance of a single TCP flow on a 200Gbit NIC:

- Throughput is increased by 20% (100Gbit -> 120Gbit).
- Number of generated ACK per second shrinks from 240,000 to 40,000.
- Number of backlog drops per second shrinks from 230 to 0.

Benchmark context:
 - Regular netperf TCP_STREAM (no zerocopy)
 - Intel(R) Xeon(R) Platinum 8481C (Sapphire Rapids)
 - MAX_SKB_FRAGS = 17 (~60KB per GRO packet)

This feature is guarded by a new sysctl, and enabled by default:
 /proc/sys/net/ipv4/tcp_backlog_ack_defer
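
For completeness, here is a minimal (hypothetical) helper to flip the
knob at runtime from C, equivalent to writing 0 or 1 to the /proc path
above; it needs root and affects the current network namespace only.

/* Toggle tcp_backlog_ack_defer: ./toggle 0 disables, ./toggle 1 enables. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *val = (argc > 1) ? argv[1] : "1";
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_backlog_ack_defer", "w");

	if (!f) {
		perror("tcp_backlog_ack_defer");
		return EXIT_FAILURE;
	}
	fprintf(f, "%s\n", val);
	return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
}
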
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent 4505dc2a
@@ -745,6 +745,13 @@ tcp_comp_sack_nr - INTEGER

 	Default : 44

+tcp_backlog_ack_defer - BOOLEAN
+	If set, user thread processing socket backlog tries sending
+	one ACK for the whole queue. This helps to avoid potential
+	long latencies at end of a TCP socket syscall.
+
+	Default : true
+
 tcp_slow_start_after_idle - BOOLEAN
 	If set, provide RFC2861 behavior and time out the congestion
 	window after an idle period. An idle period is defined at
......
@@ -463,15 +463,17 @@ enum tsq_enum {
 	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
 				    * tcp_v{4|6}_mtu_reduced()
 				    */
+	TCP_ACK_DEFERRED,	/* TX pure ack is deferred */
 };

 enum tsq_flags {
-	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
-	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
-	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
-	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
-	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
-	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
+	TSQF_THROTTLED			= BIT(TSQ_THROTTLED),
+	TSQF_QUEUED			= BIT(TSQ_QUEUED),
+	TCPF_TSQ_DEFERRED		= BIT(TCP_TSQ_DEFERRED),
+	TCPF_WRITE_TIMER_DEFERRED	= BIT(TCP_WRITE_TIMER_DEFERRED),
+	TCPF_DELACK_TIMER_DEFERRED	= BIT(TCP_DELACK_TIMER_DEFERRED),
+	TCPF_MTU_REDUCED_DEFERRED	= BIT(TCP_MTU_REDUCED_DEFERRED),
+	TCPF_ACK_DEFERRED		= BIT(TCP_ACK_DEFERRED),
 };

 #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
......
@@ -132,6 +132,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_syncookies;
 	u8 sysctl_tcp_migrate_req;
 	u8 sysctl_tcp_comp_sack_nr;
+	u8 sysctl_tcp_backlog_ack_defer;
 	int sysctl_tcp_reordering;
 	u8 sysctl_tcp_retries1;
 	u8 sysctl_tcp_retries2;
......
@@ -1366,6 +1366,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dou8vec_minmax,
 		.extra1		= SYSCTL_ZERO,
 	},
+	{
+		.procname	= "tcp_backlog_ack_defer",
+		.data		= &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{
 		.procname	= "tcp_reflect_tos",
 		.data		= &init_net.ipv4.sysctl_tcp_reflect_tos,
......
@@ -5553,6 +5553,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    tcp_in_quickack_mode(sk) ||
 	    /* Protocol state mandates a one-time immediate ACK */
 	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+		/* If we are running from __release_sock() in user context,
+		 * Defer the ack until tcp_release_cb().
+		 */
+		if (sock_owned_by_user_nocheck(sk) &&
+		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+			set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+			return;
+		}
 send_now:
 		tcp_send_ack(sk);
 		return;
......
@@ -3263,6 +3263,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
......
@@ -1077,7 +1077,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
 #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
 			  TCPF_WRITE_TIMER_DEFERRED |	\
 			  TCPF_DELACK_TIMER_DEFERRED |	\
-			  TCPF_MTU_REDUCED_DEFERRED)
+			  TCPF_MTU_REDUCED_DEFERRED |	\
+			  TCPF_ACK_DEFERRED)

 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -1114,6 +1115,8 @@ void tcp_release_cb(struct sock *sk)
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
+	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+		tcp_send_ack(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
......