Commit e33099f9 authored by Yuchung Cheng's avatar Yuchung Cheng Committed by David S. Miller

tcp: implement RFC5682 F-RTO

This patch implements F-RTO (foward RTO recovery):

When the first retransmission after timeout is acknowledged, F-RTO
sends new data instead of old data. If the next ACK acknowledges
some never-retransmitted data, then the timeout was spurious and the
congestion state is reverted.  Otherwise if the next ACK selectively
acknowledges the new data, then the timeout was genuine and the
loss recovery continues. This idea applies to recurring timeouts
as well. While F-RTO sends different data during timeout recovery,
it does not (and should not) change the congestion control.

The implementaion follows the three steps of SACK enhanced algorithm
(section 3) in RFC5682. Step 1 is in tcp_enter_loss(). Step 2 and
3 are in tcp_process_loss().  The basic version is not supported
because SACK enhanced version also works for non-SACK connections.

The new implementation is functionally in parity with the old F-RTO
implementation except the one case where it increases undo events:
In addition to the RFC algorithm, a spurious timeout may be detected
without sending data in step 2, as long as the SACK confirms not
all the original data are dropped. When this happens, the sender
will undo the cwnd and perhaps enter fast recovery instead. This
additional check increases the F-RTO undo events by 5x compared
to the prior implementation on Google Web servers, since the sender
often does not have new data to send for HTTP.

Note F-RTO may detect spurious timeout before Eifel with timestamps
does so.
Signed-off-by: default avatarYuchung Cheng <ycheng@google.com>
Acked-by: default avatarEric Dumazet <edumazet@google.com>
Acked-by: default avatarNeal Cardwell <ncardwell@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent ab42d9ee
...@@ -225,19 +225,13 @@ tcp_fin_timeout - INTEGER ...@@ -225,19 +225,13 @@ tcp_fin_timeout - INTEGER
Default: 60 seconds Default: 60 seconds
tcp_frto - INTEGER tcp_frto - INTEGER
Enables Forward RTO-Recovery (F-RTO) defined in RFC4138. Enables Forward RTO-Recovery (F-RTO) defined in RFC5682.
F-RTO is an enhanced recovery algorithm for TCP retransmission F-RTO is an enhanced recovery algorithm for TCP retransmission
timeouts. It is particularly beneficial in wireless environments timeouts. It is particularly beneficial in networks where the
where packet loss is typically due to random radio interference RTT fluctuates (e.g., wireless). F-RTO is sender-side only
rather than intermediate router congestion. F-RTO is sender-side modification. It does not require any support from the peer.
only modification. Therefore it does not require any support from
the peer. By default it's enabled with a non-zero value. 0 disables F-RTO.
If set to 1, basic version is enabled. 2 enables SACK enhanced
F-RTO if flow uses SACK. The basic version can be used also when
SACK is in use though scenario(s) with it exists where F-RTO
interacts badly with the packet counting of the SACK enabled TCP
flow.
tcp_keepalive_time - INTEGER tcp_keepalive_time - INTEGER
How often TCP sends out keepalive messages when keepalive is enabled. How often TCP sends out keepalive messages when keepalive is enabled.
......
...@@ -192,7 +192,8 @@ struct tcp_sock { ...@@ -192,7 +192,8 @@ struct tcp_sock {
u8 nonagle : 4,/* Disable Nagle algorithm? */ u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */ thin_lto : 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack */ thin_dupack : 1,/* Fast retransmit on first dupack */
repair : 1; repair : 1,
frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8 repair_queue; u8 repair_queue;
u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */
syn_data:1, /* SYN includes data */ syn_data:1, /* SYN includes data */
......
...@@ -107,6 +107,7 @@ int sysctl_tcp_early_retrans __read_mostly = 3; ...@@ -107,6 +107,7 @@ int sysctl_tcp_early_retrans __read_mostly = 3;
#define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
...@@ -1155,6 +1156,8 @@ static u8 tcp_sacktag_one(struct sock *sk, ...@@ -1155,6 +1156,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
tcp_highest_sack_seq(tp))) tcp_highest_sack_seq(tp)))
state->reord = min(fack_count, state->reord = min(fack_count,
state->reord); state->reord);
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
} }
if (sacked & TCPCB_LOST) { if (sacked & TCPCB_LOST) {
...@@ -1835,10 +1838,13 @@ void tcp_enter_loss(struct sock *sk, int how) ...@@ -1835,10 +1838,13 @@ void tcp_enter_loss(struct sock *sk, int how)
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
bool new_recovery = false;
/* Reduce ssthresh if it has not yet been made inside this window. */ /* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
new_recovery = true;
tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS); tcp_ca_event(sk, CA_EVENT_LOSS);
...@@ -1883,6 +1889,14 @@ void tcp_enter_loss(struct sock *sk, int how) ...@@ -1883,6 +1889,14 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_set_ca_state(sk, TCP_CA_Loss); tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt; tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp); TCP_ECN_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
* loss recovery is underway except recurring timeout(s) on
* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
*/
tp->frto = sysctl_tcp_frto &&
(new_recovery || icsk->icsk_retransmits) &&
!inet_csk(sk)->icsk_mtup.probe_size;
} }
/* If ACK arrived pointing to a remembered SACK, it means that our /* If ACK arrived pointing to a remembered SACK, it means that our
...@@ -2426,12 +2440,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) ...@@ -2426,12 +2440,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
return failed; return failed;
} }
/* Undo during loss recovery after partial ACK. */ /* Undo during loss recovery after partial ACK or using F-RTO. */
static bool tcp_try_undo_loss(struct sock *sk) static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (tcp_may_undo(tp)) { if (frto_undo || tcp_may_undo(tp)) {
struct sk_buff *skb; struct sk_buff *skb;
tcp_for_write_queue(skb, sk) { tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk)) if (skb == tcp_send_head(sk))
...@@ -2445,9 +2459,12 @@ static bool tcp_try_undo_loss(struct sock *sk) ...@@ -2445,9 +2459,12 @@ static bool tcp_try_undo_loss(struct sock *sk)
tp->lost_out = 0; tp->lost_out = 0;
tcp_undo_cwr(sk, true); tcp_undo_cwr(sk, true);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
if (frto_undo)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0; inet_csk(sk)->icsk_retransmits = 0;
tp->undo_marker = 0; tp->undo_marker = 0;
if (tcp_is_sack(tp)) if (frto_undo || tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open); tcp_set_ca_state(sk, TCP_CA_Open);
return true; return true;
} }
...@@ -2667,24 +2684,52 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) ...@@ -2667,24 +2684,52 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs. * recovered or spurious. Otherwise retransmits more on partial ACKs.
*/ */
static void tcp_process_loss(struct sock *sk, int flag) static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
if (!before(tp->snd_una, tp->high_seq)) { if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
if (flag & FLAG_ORIG_SACK_ACKED) {
/* Step 3.b. A timeout is spurious if not all data are
* lost, i.e., never-retransmitted data are (s)acked.
*/
tcp_try_undo_loss(sk, true);
return;
}
if (after(tp->snd_nxt, tp->high_seq) &&
(flag & FLAG_DATA_SACKED || is_dupack)) {
tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
TCP_NAGLE_OFF);
if (after(tp->snd_nxt, tp->high_seq))
return; /* Step 2.b */
tp->frto = 0;
}
}
if (recovered) {
/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
icsk->icsk_retransmits = 0; icsk->icsk_retransmits = 0;
tcp_try_undo_recovery(sk); tcp_try_undo_recovery(sk);
return; return;
} }
if (flag & FLAG_DATA_ACKED) if (flag & FLAG_DATA_ACKED)
icsk->icsk_retransmits = 0; icsk->icsk_retransmits = 0;
if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) if (tcp_is_reno(tp)) {
tcp_reset_reno_sack(tp); /* A Reno DUPACK means new data in F-RTO step 2.b above are
if (tcp_try_undo_loss(sk)) * delivered. Lower inflight to clock out (re)tranmissions.
*/
if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
tcp_add_reno_sack(sk);
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
if (tcp_try_undo_loss(sk, false))
return; return;
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk); tcp_xmit_retransmit_queue(sk);
} }
...@@ -2764,7 +2809,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, ...@@ -2764,7 +2809,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
break; break;
case TCP_CA_Loss: case TCP_CA_Loss:
tcp_process_loss(sk, flag); tcp_process_loss(sk, flag, is_dupack);
if (icsk->icsk_ca_state != TCP_CA_Open) if (icsk->icsk_ca_state != TCP_CA_Open)
return; return;
/* Fall through to processing in Open state. */ /* Fall through to processing in Open state. */
...@@ -3003,6 +3048,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ...@@ -3003,6 +3048,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
} }
if (!(sacked & TCPCB_SACKED_ACKED)) if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord); reord = min(pkts_acked, reord);
if (!after(scb->end_seq, tp->high_seq))
flag |= FLAG_ORIG_SACK_ACKED;
} }
if (sacked & TCPCB_SACKED_ACKED) if (sacked & TCPCB_SACKED_ACKED)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment