Commit 337f1b07 authored by David S. Miller

Merge branch 'tcp-xmit-timer-rearming'

Neal Cardwell says:

====================
tcp: fix xmit timer rearming to avoid stalls

This patch series is a bug fix for a TCP loss recovery performance bug
reported independently in recent netdev threads:

 (i)  July 26, 2017: netdev thread "TCP fast retransmit issues"
 (ii) July 26, 2017: netdev thread:
       "[PATCH V2 net-next] TLP: Don't reschedule PTO when there's one
       outstanding TLP retransmission"

Many thanks to Klavs Klavsen and Mao Wenan for the detailed reports,
traces, and packetdrill test cases, which enabled us to root-cause
this issue and verify the fix.

- v1 -> v2:
 - In patch 2/3, clarified an unclear comment in the pre-existing code
   in tcp_schedule_loss_probe() (thanks to Eric Dumazet for suggesting
   we improve this).
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents b91d5329 df92c839
@@ -1916,6 +1916,16 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
 
+/* At how many usecs into the future should the RTO fire? */
+static inline s64 tcp_rto_delta_us(const struct sock *sk)
+{
+	const struct sk_buff *skb = tcp_write_queue_head(sk);
+	u32 rto = inet_csk(sk)->icsk_rto;
+	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+
+	return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
+}
+
 /*
  * Save and compile IPv4 options, return a pointer to it
  */
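The new tcp_rto_delta_us() helper centralizes the "how far away is the RTO?" arithmetic used below. As a rough illustration of that arithmetic outside the kernel, here is a minimal userspace C sketch; the struct fields and the microsecond clock are simplified stand-ins for skb->skb_mstamp, icsk_rto and tp->tcp_mstamp, not the real kernel APIs. Note the result can be negative when the head of the write queue is already overdue for retransmission.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for the kernel state read by tcp_rto_delta_us(). */
struct fake_conn {
	uint64_t head_xmit_us;	/* like skb->skb_mstamp: when the head skb was (re)sent */
	uint64_t now_us;	/* like tp->tcp_mstamp: current TCP clock, in usecs */
	uint64_t rto_us;	/* like icsk_rto, already converted to usecs */
};

/* How many usecs into the future should the RTO fire? (negative = overdue) */
static int64_t rto_delta_us(const struct fake_conn *c)
{
	return (int64_t)(c->head_xmit_us + c->rto_us) - (int64_t)c->now_us;
}

int main(void)
{
	struct fake_conn fresh = { .head_xmit_us = 1000000, .rto_us = 200000, .now_us = 1050000 };
	struct fake_conn stale = { .head_xmit_us = 1000000, .rto_us = 200000, .now_us = 1300000 };

	printf("fresh: RTO fires in %lld us\n", (long long)rto_delta_us(&fresh));   /* 150000 */
	printf("stale: RTO overdue by %lld us\n", (long long)-rto_delta_us(&stale)); /* 100000 */
	return 0;
}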
@@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked */
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
 #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
 #define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
 #define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack() */
@@ -3004,10 +3005,7 @@ void tcp_rearm_rto(struct sock *sk)
 	/* Offset the time elapsed after installing regular RTO */
 	if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
-		struct sk_buff *skb = tcp_write_queue_head(sk);
-		u64 rto_time_stamp = skb->skb_mstamp +
-				     jiffies_to_usecs(rto);
-		s64 delta_us = rto_time_stamp - tp->tcp_mstamp;
+		s64 delta_us = tcp_rto_delta_us(sk);
 		/* delta_us may not be positive if the socket is locked
 		 * when the retrans timer fires and is rescheduled.
 		 */
@@ -3019,6 +3017,13 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+	if (!tcp_schedule_loss_probe(sk))
+		tcp_rearm_rto(sk);
+}
+
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3180,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 					ca_rtt_us, sack->rate);
 
 	if (flag & FLAG_ACKED) {
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 		if (unlikely(icsk->icsk_mtup.probe_size &&
 			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
 			tcp_mtup_probe_success(sk);
@@ -3208,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
 		 */
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 	}
 
 	if (icsk->icsk_ca_ops->pkts_acked) {
@@ -3580,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
-		tcp_rearm_rto(sk);
-
 	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
 		icsk->icsk_retransmits = 0;
@@ -3647,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 				    &sack_state);
 
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
+	/* If needed, reset TLP/RTO timer; RACK may later override this. */
+	if (flag & FLAG_SET_XMIT_TIMER)
+		tcp_set_xmit_timer(sk);
+
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
 	}
-	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
 		sk_dst_confirm(sk);
 
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
-		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
 	tcp_rate_gen(sk, delivered, lost, sack_state.rate);
@@ -2377,24 +2377,15 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 timeout, tlp_time_stamp, rto_time_stamp;
 	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+	u32 timeout, rto_delta_us;
 
-	/* No consecutive loss probes. */
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
-		tcp_rearm_rto(sk);
-		return false;
-	}
 	/* Don't do any loss probe on a Fast Open connection before 3WHS
 	 * finishes.
 	 */
 	if (tp->fastopen_rsk)
 		return false;
 
-	/* TLP is only scheduled when next timer event is RTO. */
-	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
-		return false;
-
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
@@ -2417,14 +2408,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 		       (rtt + (rtt >> 1) + TCP_DELACK_MAX));
 	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
 
-	/* If RTO is shorter, just schedule TLP in its place. */
-	tlp_time_stamp = tcp_jiffies32 + timeout;
-	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
-	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
-		s32 delta = rto_time_stamp - tcp_jiffies32;
-		if (delta > 0)
-			timeout = delta;
-	}
+	/* If the RTO formula yields an earlier time, then use that time. */
+	rto_delta_us = tcp_rto_delta_us(sk);  /* How far in future is RTO? */
+	if (rto_delta_us > 0)
+		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
 
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
 				  TCP_RTO_MAX);
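Putting the pieces together, the new scheduling rule in tcp_schedule_loss_probe() is effectively "arm the TLP timer, but never later than the RTO would have fired." Below is a hedged userspace C sketch of just that selection step, working directly in microseconds rather than jiffies and using made-up inputs; it omits the earlier parts of the function (the 2*RTT timeout formula and the 10 ms floor) and is not the kernel implementation.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the timeout selection at the end of tcp_schedule_loss_probe()
 * after this series: cap the TLP timeout by how far in the future the RTO
 * would fire, so the probe never lands later than the RTO would have.
 */
static uint64_t pick_tlp_timeout_us(uint64_t tlp_timeout_us, int64_t rto_delta_us)
{
	if (rto_delta_us > 0 && (uint64_t)rto_delta_us < tlp_timeout_us)
		tlp_timeout_us = (uint64_t)rto_delta_us;	/* RTO is sooner: use it */
	return tlp_timeout_us;
}

int main(void)
{
	/* TLP wants to fire in 120 ms, but the RTO is only 80 ms away. */
	printf("armed for %llu us\n",
	       (unsigned long long)pick_tlp_timeout_us(120000, 80000));	/* 80000 */
	/* RTO delta not positive (already overdue): keep the TLP timeout. */
	printf("armed for %llu us\n",
	       (unsigned long long)pick_tlp_timeout_us(120000, -5000));	/* 120000 */
	return 0;
}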