Commit ed66dfaf authored by Neal Cardwell's avatar Neal Cardwell Committed by David S. Miller

tcp: when scheduling TLP, time of RTO should account for current ACK

Fix the TLP scheduling logic so that when scheduling a TLP probe, we
ensure that the estimated time at which an RTO would fire accounts for
the fact that ACKs indicating forward progress should push back RTO
times.

After the following fix:

df92c839 ("tcp: fix xmit timer to only be reset if data ACKed/SACKed")

we had an unintentional behavior change in the following kind of
scenario: suppose the RTT variance has been very low recently. Then
suppose we send out a flight of N packets and our RTT is 100ms:

t=0: send a flight of N packets
t=100ms: receive an ACK for N-1 packets

The response before df92c839 that was:
  -> schedule a TLP for now + RTO_interval

The response after df92c839 is:
  -> schedule a TLP for t=0 + RTO_interval

Since RTO_interval = srtt + RTT_variance, this means that we have
scheduled a TLP timer at a point in the future that only accounts for
RTT_variance. If the RTT_variance term is small, this means that the
timer fires soon.

Before df92c839 this would not happen, because in that code, when
we receive an ACK for a prefix of flight, we did:

    1) Near the top of tcp_ack(), switch from TLP timer to RTO
       at write_queue_head->paket_tx_time + RTO_interval:
            if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                   tcp_rearm_rto(sk);

    2) In tcp_clean_rtx_queue(), update the RTO to now + RTO_interval:
            if (flag & FLAG_ACKED) {
                   tcp_rearm_rto(sk);

    3) In tcp_ack() after tcp_fastretrans_alert() switch from RTO
       to TLP at now + RTO_interval:
            if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                   tcp_schedule_loss_probe(sk);

In df92c839 we removed that 3-phase dance, and instead directly
set the TLP timer once: we set the TLP timer in cases like this to
write_queue_head->packet_tx_time + RTO_interval. So if the RTT
variance is small, then this means that this is setting the TLP timer
to fire quite soon. This means if the ACK for the tail of the flight
takes longer than an RTT to arrive (often due to delayed ACKs), then
the TLP timer fires too quickly.

Fixes: df92c839 ("tcp: fix xmit timer to only be reset if data ACKed/SACKed")
Signed-off-by: default avatarNeal Cardwell <ncardwell@google.com>
Signed-off-by: default avatarYuchung Cheng <ycheng@google.com>
Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Acked-by: default avatarSoheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 61c59355
...@@ -539,7 +539,7 @@ void tcp_push_one(struct sock *, unsigned int mss_now); ...@@ -539,7 +539,7 @@ void tcp_push_one(struct sock *, unsigned int mss_now);
void tcp_send_ack(struct sock *sk); void tcp_send_ack(struct sock *sk);
void tcp_send_delayed_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk);
void tcp_send_loss_probe(struct sock *sk); void tcp_send_loss_probe(struct sock *sk);
bool tcp_schedule_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
void tcp_skb_collapse_tstamp(struct sk_buff *skb, void tcp_skb_collapse_tstamp(struct sk_buff *skb,
const struct sk_buff *next_skb); const struct sk_buff *next_skb);
......
...@@ -2964,7 +2964,7 @@ void tcp_rearm_rto(struct sock *sk) ...@@ -2964,7 +2964,7 @@ void tcp_rearm_rto(struct sock *sk)
/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
static void tcp_set_xmit_timer(struct sock *sk) static void tcp_set_xmit_timer(struct sock *sk)
{ {
if (!tcp_schedule_loss_probe(sk)) if (!tcp_schedule_loss_probe(sk, true))
tcp_rearm_rto(sk); tcp_rearm_rto(sk);
} }
......
...@@ -2391,7 +2391,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2391,7 +2391,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
/* Send one loss probe per tail loss episode. */ /* Send one loss probe per tail loss episode. */
if (push_one != 2) if (push_one != 2)
tcp_schedule_loss_probe(sk); tcp_schedule_loss_probe(sk, false);
is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited); tcp_cwnd_validate(sk, is_cwnd_limited);
return false; return false;
...@@ -2399,7 +2399,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2399,7 +2399,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
return !tp->packets_out && !tcp_write_queue_empty(sk); return !tp->packets_out && !tcp_write_queue_empty(sk);
} }
bool tcp_schedule_loss_probe(struct sock *sk) bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
...@@ -2440,7 +2440,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) ...@@ -2440,7 +2440,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
} }
/* If the RTO formula yields an earlier time, then use that time. */ /* If the RTO formula yields an earlier time, then use that time. */
rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ rto_delta_us = advancing_rto ?
jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
tcp_rto_delta_us(sk); /* How far in future is RTO? */
if (rto_delta_us > 0) if (rto_delta_us > 0)
timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment