Commit eb9fae32 authored by David S. Miller's avatar David S. Miller

Merge branch 'tcp-rack'

Yuchung Cheng says:

====================
RACK loss detection

RACK (Recent ACK) loss recovery uses the notion of time instead of
packet sequence (FACK) or counts (dupthresh).

It's inspired by the FACK heuristic in tcp_mark_lost_retrans(): when a
limited transmit (new data packet) is sacked in recovery, then any
retransmission sent before that newly sacked packet was sent must have
been lost, since at least one round trip time has elapsed.

But that existing heuristic from tcp_mark_lost_retrans()
has several limitations:
  1) it can't detect tail drops since it depends on limited transmit
  2) it's disabled upon reordering (assumes no reordering)
  3) it's only enabled in fast recovery but not timeout recovery

RACK addresses these limitations with a core idea: an unacknowledged
packet P1 is deemed lost if a packet P2 that was sent later is is
s/acked, since at least one round trip has passed.

Since RACK cares about the time sequence instead of the data sequence
of packets, it can detect tail drops when a later retransmission is
s/acked, while FACK or dupthresh can't. For reordering RACK uses a
dynamically adjusted reordering window ("reo_wnd") to reduce false
positives on ever (small) degree of reordering, similar to the delayed
Early Retransmit.

In the current patch set RACK is only a supplemental loss detection
and does not trigger fast recovery. However we are developing RACK
to replace or consolidate FACK/dupthresh, early retransmit, and
thin-dupack. These heuristics all implicitly bear the time notion.
For example, the delayed Early Retransmit is simply applying RACK
to trigger the fast recovery with small inflight.

RACK requires measuring the minimum RTT. Tracking a global min is less
robust due to traffic engineering pathing changes. Therefore it uses a
windowed filter by Kathleen Nichols. The min RTT can also be useful
for various other purposes like congestion control or stat monitoring.

This patch has been used on Google servers for well over 1 year. RACK
has also been implemented in the QUIC protocol. We are submitting an
IETF draft as well.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents c8fdc324 4f41b1c5
...@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max ...@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
Defaults are calculated at boot time from amount of available Defaults are calculated at boot time from amount of available
memory. memory.
tcp_min_rtt_wlen - INTEGER
The window length of the windowed min filter to track the minimum RTT.
A shorter window lets a flow more quickly pick up new (higher)
minimum RTT when it is moved to a longer path (e.g., due to traffic
engineering). A longer window makes the filter more resistant to RTT
inflations such as transient congestion. The unit is seconds.
Default: 300
tcp_moderate_rcvbuf - BOOLEAN tcp_moderate_rcvbuf - BOOLEAN
If set, TCP performs receive buffer auto-tuning, attempting to If set, TCP performs receive buffer auto-tuning, attempting to
automatically size the buffer (no greater than tcp_rmem[2]) to automatically size the buffer (no greater than tcp_rmem[2]) to
...@@ -425,6 +433,15 @@ tcp_orphan_retries - INTEGER ...@@ -425,6 +433,15 @@ tcp_orphan_retries - INTEGER
you should think about lowering this value, such sockets you should think about lowering this value, such sockets
may consume significant resources. Cf. tcp_max_orphans. may consume significant resources. Cf. tcp_max_orphans.
tcp_recovery - INTEGER
This value is a bitmap to enable various experimental loss recovery
features.
RACK: 0x1 enables the RACK loss detection for fast detection of lost
retransmissions and tail drops.
Default: 0x1
tcp_reordering - INTEGER tcp_reordering - INTEGER
Initial reordering level of packets in a TCP stream. Initial reordering level of packets in a TCP stream.
TCP stack can then dynamically adjust flow reordering level TCP stack can then dynamically adjust flow reordering level
......
...@@ -463,6 +463,15 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, ...@@ -463,6 +463,15 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
return delta_us; return delta_us;
} }
static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
const struct skb_mstamp *t0)
{
s32 diff = t1->stamp_jiffies - t0->stamp_jiffies;
if (!diff)
diff = t1->stamp_us - t0->stamp_us;
return diff > 0;
}
/** /**
* struct sk_buff - socket buffer * struct sk_buff - socket buffer
......
...@@ -194,6 +194,12 @@ struct tcp_sock { ...@@ -194,6 +194,12 @@ struct tcp_sock {
u32 window_clamp; /* Maximal window to advertise */ u32 window_clamp; /* Maximal window to advertise */
u32 rcv_ssthresh; /* Current window clamp */ u32 rcv_ssthresh; /* Current window clamp */
/* Information of the most recently (s)acked skb */
struct tcp_rack {
struct skb_mstamp mstamp; /* (Re)sent time of the skb */
u8 advanced; /* mstamp advanced since last lost marking */
u8 reord; /* reordering detected */
} rack;
u16 advmss; /* Advertised MSS */ u16 advmss; /* Advertised MSS */
u8 unused; u8 unused;
u8 nonagle : 4,/* Disable Nagle algorithm? */ u8 nonagle : 4,/* Disable Nagle algorithm? */
...@@ -217,6 +223,9 @@ struct tcp_sock { ...@@ -217,6 +223,9 @@ struct tcp_sock {
u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 mdev_max_us; /* maximal mdev for the last rtt period */
u32 rttvar_us; /* smoothed mdev_max */ u32 rttvar_us; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */ u32 rtt_seq; /* sequence number to update rttvar */
struct rtt_meas {
u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */
} rtt_min[3];
u32 packets_out; /* Packets which are "in flight" */ u32 packets_out; /* Packets which are "in flight" */
u32 retrans_out; /* Retransmitted packets out */ u32 retrans_out; /* Retransmitted packets out */
...@@ -280,8 +289,6 @@ struct tcp_sock { ...@@ -280,8 +289,6 @@ struct tcp_sock {
int lost_cnt_hint; int lost_cnt_hint;
u32 retransmit_high; /* L-bits may be on up to this seqno */ u32 retransmit_high; /* L-bits may be on up to this seqno */
u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */
u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 prior_ssthresh; /* ssthresh saved at recovery start */
u32 high_seq; /* snd_nxt at onset of congestion */ u32 high_seq; /* snd_nxt at onset of congestion */
......
...@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes; ...@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_challenge_ack_limit;
extern unsigned int sysctl_tcp_notsent_lowat; extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_min_rtt_wlen;
extern int sysctl_tcp_autocorking; extern int sysctl_tcp_autocorking;
extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_invalid_ratelimit;
extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ss_ratio;
...@@ -566,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk); ...@@ -566,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk);
void tcp_rearm_rto(struct sock *sk); void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk); void tcp_reset(struct sock *sk);
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
/* tcp_timer.c */ /* tcp_timer.c */
void tcp_init_xmit_timers(struct sock *); void tcp_init_xmit_timers(struct sock *);
...@@ -671,6 +673,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) ...@@ -671,6 +673,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
return dst_metric_locked(dst, RTAX_CC_ALGO); return dst_metric_locked(dst, RTAX_CC_ALGO);
} }
/* Minimum RTT in usec. ~0 means not available. */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
return tp->rtt_min[0].rtt;
}
/* Compute the actual receive window we are currently advertising. /* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data * Rcv_nxt can be after the window if our peer push more data
* than the offered window. * than the offered window.
...@@ -1743,6 +1751,19 @@ int tcpv4_offload_init(void); ...@@ -1743,6 +1751,19 @@ int tcpv4_offload_init(void);
void tcp_v4_init(void); void tcp_v4_init(void);
void tcp_init(void); void tcp_init(void);
/* tcp_recovery.c */
/* Flags to enable various loss recovery features. See below */
extern int sysctl_tcp_recovery;
/* Use TCP RACK to detect (some) tail and retransmit losses */
#define TCP_RACK_LOST_RETRANS 0x1
extern int tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp,
const struct skb_mstamp *xmit_time, u8 sacked);
/* /*
* Save and compile IPv4 options, return a pointer to it * Save and compile IPv4 options, return a pointer to it
*/ */
......
...@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ ...@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \ inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \ fib_frontend.o fib_semantics.o fib_trie.o \
......
...@@ -495,6 +495,13 @@ static struct ctl_table ipv4_table[] = { ...@@ -495,6 +495,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "tcp_recovery",
.data = &sysctl_tcp_recovery,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ {
.procname = "tcp_reordering", .procname = "tcp_reordering",
.data = &sysctl_tcp_reordering, .data = &sysctl_tcp_reordering,
...@@ -576,6 +583,13 @@ static struct ctl_table ipv4_table[] = { ...@@ -576,6 +583,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "tcp_min_rtt_wlen",
.data = &sysctl_tcp_min_rtt_wlen,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{ {
.procname = "tcp_low_latency", .procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency, .data = &sysctl_tcp_low_latency,
......
...@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk) ...@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
tp->rtt_min[0].rtt = ~0U;
/* So many TCP implementations out there (incorrectly) count the /* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control * initial SYN frame in their delayed-ACK and congestion control
......
...@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly; ...@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_thin_dupack __read_mostly;
...@@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, ...@@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
if (metric > 0) if (metric > 0)
tcp_disable_early_retrans(tp); tcp_disable_early_retrans(tp);
tp->rack.reord = 1;
} }
/* This must be called before lost_out is incremented */ /* This must be called before lost_out is incremented */
...@@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) ...@@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
} }
} }
static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
struct sk_buff *skb)
{ {
tcp_verify_retransmit_hint(tp, skb); tcp_verify_retransmit_hint(tp, skb);
...@@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, ...@@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
return !before(start_seq, end_seq - tp->max_window); return !before(start_seq, end_seq - tp->max_window);
} }
/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
* Event "B". Later note: FACK people cheated me again 8), we have to account
* for reordering! Ugly, but should help.
*
* Search retransmitted skbs from write_queue that were sent when snd_nxt was
* less than what is now known to be received by the other end (derived from
* highest SACK block). Also calculate the lowest snd_nxt among the remaining
* retransmitted skbs to avoid some costly processing per ACKs.
*/
static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt = 0;
u32 new_low_seq = tp->snd_nxt;
u32 received_upto = tcp_highest_sack_seq(tp);
if (!tcp_is_fack(tp) || !tp->retrans_out ||
!after(received_upto, tp->lost_retrans_low) ||
icsk->icsk_ca_state != TCP_CA_Recovery)
return;
tcp_for_write_queue(skb, sk) {
u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
if (skb == tcp_send_head(sk))
break;
if (cnt == tp->retrans_out)
break;
if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
continue;
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
continue;
/* TODO: We would like to get rid of tcp_is_fack(tp) only
* constraint here (see above) but figuring out that at
* least tp->reordering SACK blocks reside between ack_seq
* and received_upto is not easy task to do cheaply with
* the available datastructures.
*
* Whether FACK should check here for tp->reordering segs
* in-between one could argue for either way (it would be
* rather simple to implement as we could count fack_count
* during the walk and do tp->fackets_out - fack_count).
*/
if (after(received_upto, ack_seq)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
*flag |= FLAG_LOST_RETRANS;
tcp_skb_mark_lost_uncond_verify(tp, skb);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
} else {
if (before(ack_seq, new_low_seq))
new_low_seq = ack_seq;
cnt += tcp_skb_pcount(skb);
}
}
if (tp->retrans_out)
tp->lost_retrans_low = new_low_seq;
}
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks, struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una) u32 prior_snd_una)
...@@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk, ...@@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked; return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) { if (!(sacked & TCPCB_SACKED_ACKED)) {
tcp_rack_advance(tp, xmit_time, sacked);
if (sacked & TCPCB_SACKED_RETRANS) { if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost, /* If the segment is not tagged as lost,
* we do not clear RETRANS, believing * we do not clear RETRANS, believing
...@@ -1837,7 +1776,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, ...@@ -1837,7 +1776,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
tcp_mark_lost_retrans(sk, &state->flag);
tcp_verify_left_out(tp); tcp_verify_left_out(tp);
out: out:
...@@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) ...@@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
tp->snd_cwnd_stamp = tcp_time_stamp; tp->snd_cwnd_stamp = tcp_time_stamp;
} }
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
before(tp->rx_opt.rcv_tsecr, when);
}
/* skb is spurious retransmitted if the returned timestamp echo
* reply is prior to the skb transmission time
*/
static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
const struct sk_buff *skb)
{
return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
}
/* Nothing was retransmitted or returned timestamp is less /* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission. * than timestamp of the first retransmission.
*/ */
static inline bool tcp_packet_delayed(const struct tcp_sock *tp) static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{ {
return !tp->retrans_stamp || return !tp->retrans_stamp ||
(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
} }
/* Undo procedures. */ /* Undo procedures. */
...@@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, ...@@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
} }
} }
/* Use RACK to detect loss */
if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
tcp_rack_mark_lost(sk))
flag |= FLAG_LOST_RETRANS;
/* E. Process state. */ /* E. Process state. */
switch (icsk->icsk_ca_state) { switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery: case TCP_CA_Recovery:
...@@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, ...@@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_xmit_retransmit_queue(sk); tcp_xmit_retransmit_queue(sk);
} }
/* Kathleen Nichols' algorithm for tracking the minimum value of
* a data stream over some fixed time interval. (E.g., the minimum
* RTT over the past five minutes.) It uses constant space and constant
* time per update yet almost always delivers the same minimum as an
* implementation that has to keep all the data in the window.
*
* The algorithm keeps track of the best, 2nd best & 3rd best min
* values, maintaining an invariant that the measurement time of the
* n'th best >= n-1'th best. It also makes sure that the three values
* are widely separated in the time window since that bounds the worse
* case error when that data is monotonically increasing over the window.
*
* Upon getting a new min, we can forget everything earlier because it
* has no value - the new min is <= everything else in the window by
* definition and it's the most recent. So we restart fresh on every new min
* and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
* best.
*/
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
struct rtt_meas *m = tcp_sk(sk)->rtt_min;
struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
u32 elapsed;
/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
if (unlikely(rttm.rtt <= m[0].rtt))
m[0] = m[1] = m[2] = rttm;
else if (rttm.rtt <= m[1].rtt)
m[1] = m[2] = rttm;
else if (rttm.rtt <= m[2].rtt)
m[2] = rttm;
elapsed = now - m[0].ts;
if (unlikely(elapsed > wlen)) {
/* Passed entire window without a new min so make 2nd choice
* the new min & 3rd choice the new 2nd. So forth and so on.
*/
m[0] = m[1];
m[1] = m[2];
m[2] = rttm;
if (now - m[0].ts > wlen) {
m[0] = m[1];
m[1] = rttm;
if (now - m[0].ts > wlen)
m[0] = rttm;
}
} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
/* Passed a quarter of the window without a new min so
* take 2nd choice from the 2nd quarter of the window.
*/
m[2] = m[1] = rttm;
} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
/* Passed half the window without a new min so take the 3rd
* choice from the last half of the window.
*/
m[2] = rttm;
}
}
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
long seq_rtt_us, long sack_rtt_us) long seq_rtt_us, long sack_rtt_us,
long ca_rtt_us)
{ {
const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_sock *tp = tcp_sk(sk);
...@@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, ...@@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* Karn's algorithm forbids taking RTT if some retransmitted data * Karn's algorithm forbids taking RTT if some retransmitted data
* is acked (RFC6298). * is acked (RFC6298).
*/ */
if (flag & FLAG_RETRANS_DATA_ACKED)
seq_rtt_us = -1L;
if (seq_rtt_us < 0) if (seq_rtt_us < 0)
seq_rtt_us = sack_rtt_us; seq_rtt_us = sack_rtt_us;
...@@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, ...@@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/ */
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED) flag & FLAG_ACKED)
seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
tp->rx_opt.rcv_tsecr);
if (seq_rtt_us < 0) if (seq_rtt_us < 0)
return false; return false;
/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
* always taken together with ACK, SACK, or TS-opts. Any negative
* values will be skipped with the seq_rtt_us < 0 check above.
*/
tcp_update_rtt_min(sk, ca_rtt_us);
tcp_rtt_estimator(sk, seq_rtt_us); tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk); tcp_set_rto(sk);
...@@ -2964,7 +2985,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) ...@@ -2964,7 +2985,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
} }
tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L); tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
} }
...@@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ...@@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_ACKED) if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount; tp->sacked_out -= acked_pcount;
else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
if (sacked & TCPCB_LOST) if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount; tp->lost_out -= acked_pcount;
...@@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ...@@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_SACK_RENEGING; flag |= FLAG_SACK_RENEGING;
skb_mstamp_get(&now); skb_mstamp_get(&now);
if (likely(first_ackt.v64)) { if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
} }
...@@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ...@@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
} }
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
ca_rtt_us);
if (flag & FLAG_ACKED) { if (flag & FLAG_ACKED) {
tcp_rearm_rto(sk); tcp_rearm_rto(sk);
......
...@@ -470,6 +470,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, ...@@ -470,6 +470,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->srtt_us = 0; newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
newtp->rtt_min[0].rtt = ~0U;
newicsk->icsk_rto = TCP_TIMEOUT_INIT; newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0; newtp->packets_out = 0;
...@@ -547,6 +548,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, ...@@ -547,6 +548,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
tcp_ecn_openreq_child(newtp, req); tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_rsk = NULL; newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0; newtp->syn_data_acked = 0;
newtp->rack.mstamp.v64 = 0;
newtp->rack.advanced = 0;
newtp->saved_syn = req->saved_syn; newtp->saved_syn = req->saved_syn;
req->saved_syn = NULL; req->saved_syn = NULL;
......
...@@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) ...@@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
net_dbg_ratelimited("retrans_out leaked\n"); net_dbg_ratelimited("retrans_out leaked\n");
} }
#endif #endif
if (!tp->retrans_out)
tp->lost_retrans_low = tp->snd_nxt;
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb); tp->retrans_out += tcp_skb_pcount(skb);
...@@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) ...@@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp) if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb); tp->retrans_stamp = tcp_skb_timestamp(skb);
/* snd_nxt is stored to detect loss of retransmitted segment,
* see tcp_input.c tcp_sacktag_write_queue().
*/
TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
} else if (err != -EBUSY) { } else if (err != -EBUSY) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
} }
......
#include <linux/tcp.h>
#include <net/tcp.h>
int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
/* Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics:
*
* dupthresh: 3 OOO packets delivered (packet count)
* FACK: sequence delta to highest sacked sequence (sequence space)
* RACK: sent time delta to the latest delivered packet (time domain)
*
* The advantage of RACK is it applies to both original and retransmitted
* packet and therefore is robust against tail losses. Another advantage
* is being more resilient to reordering by simply allowing some
* "settling delay", instead of tweaking the dupthresh.
*
* The current version is only used after recovery starts but can be
* easily extended to detect the first loss.
*/
int tcp_rack_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
u32 reo_wnd, prior_retrans = tp->retrans_out;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
return 0;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs.
*
* TODO: measure and adapt to the observed reordering delay, and
* use a timer to retransmit like the delayed early retransmit.
*/
reo_wnd = 1000;
if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
tcp_for_write_queue(skb, sk) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
if (skb == tcp_send_head(sk))
break;
/* Skip ones already (s)acked */
if (!after(scb->end_seq, tp->snd_una) ||
scb->sacked & TCPCB_SACKED_ACKED)
continue;
if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
if (skb_mstamp_us_delta(&tp->rack.mstamp,
&skb->skb_mstamp) <= reo_wnd)
continue;
/* skb is lost if packet sent later is sacked */
tcp_skb_mark_lost_uncond_verify(tp, skb);
if (scb->sacked & TCPCB_SACKED_RETRANS) {
scb->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPLOSTRETRANSMIT);
}
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
*/
break;
}
}
return prior_retrans - tp->retrans_out;
}
/* Record the most recently (re)sent time among the (s)acked packets */
void tcp_rack_advance(struct tcp_sock *tp,
const struct skb_mstamp *xmit_time, u8 sacked)
{
if (tp->rack.mstamp.v64 &&
!skb_mstamp_after(xmit_time, &tp->rack.mstamp))
return;
if (sacked & TCPCB_RETRANS) {
struct skb_mstamp now;
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
* retransmission) was sacked.
*
* If the original is lost, there is no ambiguity. Otherwise
* we assume the original can be delayed up to aRTT + min_rtt.
* the aRTT term is bounded by the fast recovery or timeout,
* so it's at least one RTT (i.e., retransmission is at least
* an RTT later).
*/
skb_mstamp_get(&now);
if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
return;
}
tp->rack.mstamp = *xmit_time;
tp->rack.advanced = 1;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment