Commit eb36be0f authored by Yuchung Cheng's avatar Yuchung Cheng Committed by David S. Miller

tcp: avoid min-RTT overestimation from delayed ACKs

This patch avoids having TCP sender or congestion control
overestimate the min RTT by orders of magnitude. This happens when
all the samples in the windowed filter are one-packet transfer
like small request and health-check like chit-chat, which is farily
common for applications using persistent connections. This patch
tries to conservatively labels and skip RTT samples obtained from
this type of workload.
Signed-off-by: default avatarYuchung Cheng <ycheng@google.com>
Signed-off-by: default avatarSoheil Hassas Yeganeh <soheil@google.com>
Acked-by: default avatarNeal Cardwell <ncardwell@google.com>
Acked-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 60c25306
...@@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; ...@@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
...@@ -2857,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, ...@@ -2857,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
*rexmit = REXMIT_LOST; *rexmit = REXMIT_LOST;
} }
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
{ {
u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
/* If the remote keeps returning delayed ACKs, eventually
* the min filter would pick it up and overestimate the
* prop. delay when it expires. Skip suspected delayed ACKs.
*/
return;
}
minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
rtt_us ? : jiffies_to_usecs(1)); rtt_us ? : jiffies_to_usecs(1));
} }
...@@ -2901,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, ...@@ -2901,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* always taken together with ACK, SACK, or TS-opts. Any negative * always taken together with ACK, SACK, or TS-opts. Any negative
* values will be skipped with the seq_rtt_us < 0 check above. * values will be skipped with the seq_rtt_us < 0 check above.
*/ */
tcp_update_rtt_min(sk, ca_rtt_us); tcp_update_rtt_min(sk, ca_rtt_us, flag);
tcp_rtt_estimator(sk, seq_rtt_us); tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk); tcp_set_rto(sk);
...@@ -3125,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, ...@@ -3125,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
last_in_flight && !prior_sacked && fully_acked &&
sack->rate->prior_delivered + 1 == tp->delivered &&
!(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
/* Conservatively mark a delayed ACK. It's typically
* from a lone runt packet over the round trip to
* a receiver w/o out-of-order or CE events.
*/
flag |= FLAG_ACK_MAYBE_DELAYED;
}
} }
if (sack->first_sackt) { if (sack->first_sackt) {
sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment