Commit 10d3be56 authored by Eric Dumazet, committed by David S. Miller

tcp-tso: do not split TSO packets at retransmit time

Linux TCP stack painfully segments all TSO/GSO packets before retransmits.

This was fine back in the days when TSO/GSO were emerging, with their
bugs, but we believe the dark age is over.

Keeping big packets in write queues, but also during stack traversal,
has a lot of benefits.
 - Less memory overhead, because write queues have fewer skbs
 - Less cpu overhead at ACK processing.
 - Better SACK processing, as a lot of studies mentioned how
   awful Linux was at this ;)
 - Less cpu overhead to send the rtx packets
   (IP stack traversal, netfilter traversal, drivers...)
 - Better latencies in presence of losses.
 - Smaller spikes in fq-like packet schedulers, as retransmits
   are not constrained by TCP Small Queues.

1% packet losses are common today, and at 100Gbit speeds this
translates to ~80,000 losses per second.
Losses are often correlated, and we see many retransmit events
leading to trains of 1-MSS packets, at a time when hosts are already
under stress.
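
A rough sanity check of that figure (assuming full-sized ~1500 byte
frames, which the commit message does not state explicitly):

    100 Gbit/s / (1500 bytes * 8 bits/byte) ~= 8.3 Mpps
    1 % of 8.3 Mpps                         ~= 83,000 lost segments/sec

which is in line with the ~80,000 quoted above.
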
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 8cee83dd
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -538,8 +538,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle);
 bool tcp_may_send_now(struct sock *sk);
-int __tcp_retransmit_skb(struct sock *, struct sk_buff *);
-int tcp_retransmit_skb(struct sock *, struct sk_buff *);
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5545,7 +5545,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	if (data) { /* Retransmit unacked data in SYN */
 		tcp_for_write_queue_from(data, sk) {
 			if (data == tcp_send_head(sk) ||
-			    __tcp_retransmit_skb(sk, data))
+			    __tcp_retransmit_skb(sk, data, 1))
 				break;
 		}
 		tcp_rearm_rto(sk);
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2268,7 +2268,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	if (__tcp_retransmit_skb(sk, skb))
+	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
 	/* Record snd_nxt for loss detection. */
@@ -2571,17 +2571,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  * state updates are done by the caller. Returns non-zero if an
  * error occurred which prevented the send.
  */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cur_mss;
-	int err;
+	int diff, len, err;
+
 
-	/* Inconslusive MTU probe */
-	if (icsk->icsk_mtup.probe_size) {
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size)
 		icsk->icsk_mtup.probe_size = 0;
-	}
 
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
@@ -2614,30 +2614,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    TCP_SKB_CB(skb)->seq != tp->snd_una)
 		return -EAGAIN;
 
-	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+	len = cur_mss * segs;
+	if (skb->len > len) {
+		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
-		int oldpcount = tcp_skb_pcount(skb);
+		if (skb_unclone(skb, GFP_ATOMIC))
+			return -ENOMEM;
 
-		if (unlikely(oldpcount > 1)) {
-			if (skb_unclone(skb, GFP_ATOMIC))
-				return -ENOMEM;
-			tcp_init_tso_segs(skb, cur_mss);
-			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
-		}
+		diff = tcp_skb_pcount(skb);
+		tcp_set_skb_tso_segs(skb, cur_mss);
+		diff -= tcp_skb_pcount(skb);
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+		if (skb->len < cur_mss)
+			tcp_retrans_try_collapse(sk, skb, cur_mss);
 	}
 
 	/* RFC3168, section 6.1.1.1. ECN fallback */
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
-	tcp_retrans_try_collapse(sk, skb, cur_mss);
-
-	/* Make a copy, if the first transmission SKB clone we made
-	 * is still in somebody's hands, else make a clone.
-	 */
-
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
@@ -2653,20 +2650,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (likely(!err)) {
+		segs = tcp_skb_pcount(skb);
+
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		/* Update global TCP statistics. */
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans++;
+		tp->total_retrans += segs;
 	}
 	return err;
 }
 
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb);
+	int err = __tcp_retransmit_skb(sk, skb, segs);
 
 	if (err == 0) {
 #if FASTRETRANS_DEBUG > 0
@@ -2757,6 +2756,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 	tcp_for_write_queue_from(skb, sk) {
 		__u8 sacked = TCP_SKB_CB(skb)->sacked;
+		int segs;
 
 		if (skb == tcp_send_head(sk))
 			break;
@@ -2764,14 +2764,8 @@
 		if (!hole)
 			tp->retransmit_skb_hint = skb;
 
-		/* Assume this retransmit will generate
-		 * only one packet for congestion window
-		 * calculation purposes. This works because
-		 * tcp_retransmit_skb() will chop up the
-		 * packet to be MSS sized and all the
-		 * packet counting works out.
-		 */
-		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+		if (segs <= 0)
 			return;
 
 		if (fwd_rexmitting) {
@@ -2808,7 +2802,7 @@
 		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_retransmit_skb(sk, skb))
+		if (tcp_retransmit_skb(sk, skb, segs))
 			return;
 
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -404,7 +404,7 @@ void tcp_retransmit_timer(struct sock *sk)
 			goto out;
 		}
 		tcp_enter_loss(sk);
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+		tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
 	}
@@ -436,7 +436,7 @@
 
 	tcp_enter_loss(sk);
 
-	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */
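
For readers skimming the diff: the core behavioural change is that a
retransmit call now receives a budget of segs = snd_cwnd -
packets_in_flight segments, and the skb is fragmented only if it is
larger than segs * cur_mss bytes, instead of always being chopped down
to a single MSS. The following user-space sketch is purely
illustrative (it is not kernel code, and all names are hypothetical);
it only shows how the sizing rule behaves for one example skb.

/* Illustrative user-space sketch of the retransmit sizing rule added
 * by this patch. Not kernel code; names only mirror the diff above.
 */
#include <stdio.h>

/* Bytes of one queued skb that a single retransmit call may send,
 * given the congestion-window headroom expressed in segments.
 */
static unsigned int rtx_bytes(unsigned int skb_len, unsigned int cur_mss,
			      unsigned int snd_cwnd,
			      unsigned int packets_in_flight)
{
	if (packets_in_flight >= snd_cwnd)
		return 0;			/* segs <= 0: stop retransmitting */

	unsigned int segs = snd_cwnd - packets_in_flight;
	unsigned int len = segs * cur_mss;	/* new fragmentation boundary */

	return skb_len > len ? len : skb_len;	/* fragment only when needed */
}

int main(void)
{
	/* Example: a 44-segment TSO skb, MSS 1460, cwnd 20, 10 packets in flight. */
	unsigned int old_rule = 1460;		/* old behaviour: one MSS per call */
	unsigned int new_rule = rtx_bytes(44 * 1460, 1460, 20, 10);

	printf("old rule: %u bytes per retransmit call\n", old_rule);
	printf("new rule: %u bytes per retransmit call\n", new_rule);
	return 0;
}

With the numbers above, the old rule would retransmit 1460 bytes per
call, while the new rule sends 14600 bytes (10 MSS) as one TSO skb,
which is exactly the cwnd headroom computed in tcp_xmit_retransmit_queue().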