Commit 20ff44aa authored by David S. Miller

Merge branch 'tcp'

Yuchung Cheng says:

====================
This patch series improves RTT sampling in three ways:
1. Sample RTT during fast recovery and reordering events.
2. Favor ACK-based RTT over timestamp-based RTT, because broken middleboxes or peers may corrupt TS ECR fields.
3. Consolidate the RTT measurement logic.
====================
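
For reference, a minimal sketch of the sampling preference described above; the helper name and plain C types below are illustrative only and are not part of this series:

/* Illustrative sketch only, not kernel code: choose at most one RTT
 * sample per incoming ACK, in the order the series prefers. All
 * arguments are elapsed times in the same clock units; -1 means
 * "no sample available".
 */
static long pick_rtt_sample(int retrans_data_acked, long ack_rtt,
			    long sack_rtt, long tsecr_rtt)
{
	if (retrans_data_acked)
		ack_rtt = -1;		/* Karn's rule: never time retransmitted data */
	if (ack_rtt < 0)
		ack_rtt = sack_rtt;	/* fall back to newly SACKed, never-retransmitted data */
	if (ack_rtt < 0)
		ack_rtt = tsecr_rtt;	/* last resort: TS ECR echo, which may be corrupted */
	return ack_rtt;			/* -1 if this ACK yields no valid sample */
}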
Signed-off-by: David S. Miller <davem@davemloft.net>
parents c3f51d5f ed08495c
@@ -591,7 +591,6 @@ extern void tcp_initialize_rcv_mss(struct sock *sk);
 extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
-extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
 extern void tcp_init_buffer_space(struct sock *sk);
 
 static inline void tcp_bound_rto(const struct sock *sk)
@@ -1094,15 +1093,6 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	ireq->loc_port = tcp_hdr(skb)->dest;
 }
 
-/* Compute time elapsed between SYNACK and the ACK completing 3WHS */
-static inline void tcp_synack_rtt_meas(struct sock *sk,
-				       struct request_sock *req)
-{
-	if (tcp_rsk(req)->snt_synack)
-		tcp_valid_rtt_meas(sk,
-				   tcp_time_stamp - tcp_rsk(req)->snt_synack);
-}
-
 extern void tcp_enter_memory_pressure(struct sock *sk);
 
 static inline int keepalive_intvl_when(const struct tcp_sock *tp)
...
@@ -1048,6 +1048,7 @@ struct tcp_sacktag_state {
 	int reord;
 	int fack_count;
 	int flag;
+	s32 rtt; /* RTT measured by SACKing never-retransmitted data */
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1108,7 +1109,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 static u8 tcp_sacktag_one(struct sock *sk,
 			  struct tcp_sacktag_state *state, u8 sacked,
 			  u32 start_seq, u32 end_seq,
-			  bool dup_sack, int pcount)
+			  int dup_sack, int pcount, u32 xmit_time)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fack_count = state->fack_count;
@@ -1148,6 +1149,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
 						   state->reord);
 			if (!after(end_seq, tp->high_seq))
 				state->flag |= FLAG_ORIG_SACK_ACKED;
+			/* Pick the earliest sequence sacked for RTT */
+			if (state->rtt < 0)
+				state->rtt = tcp_time_stamp - xmit_time;
 		}
 
 		if (sacked & TCPCB_LOST) {
@@ -1205,7 +1209,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	 * tcp_highest_sack_seq() when skb is highest_sack.
 	 */
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
-			start_seq, end_seq, dup_sack, pcount);
+			start_seq, end_seq, dup_sack, pcount,
+			TCP_SKB_CB(skb)->when);
 
 	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
@@ -1479,7 +1484,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						TCP_SKB_CB(skb)->seq,
 						TCP_SKB_CB(skb)->end_seq,
 						dup_sack,
-						tcp_skb_pcount(skb));
+						tcp_skb_pcount(skb),
+						TCP_SKB_CB(skb)->when);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -1536,7 +1542,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
 
 static int
 tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
-			u32 prior_snd_una)
+			u32 prior_snd_una, s32 *sack_rtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1554,6 +1560,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 	state.flag = 0;
 	state.reord = tp->packets_out;
+	state.rtt = -1;
 
 	if (!tp->sacked_out) {
 		if (WARN_ON(tp->fackets_out))
@@ -1737,6 +1744,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 	WARN_ON((int)tp->retrans_out < 0);
 	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
 #endif
+	*sack_rtt = state.rtt;
 	return state.flag;
 }
@@ -2792,65 +2800,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	tcp_xmit_retransmit_queue(sk);
 }
 
-void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
-{
-	tcp_rtt_estimator(sk, seq_rtt);
-	tcp_set_rto(sk);
-	inet_csk(sk)->icsk_backoff = 0;
-}
-EXPORT_SYMBOL(tcp_valid_rtt_meas);
-
-/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Supersedes RFC1323)
- */
-static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
-{
-	/* RTTM Rule: A TSecr value received in a segment is used to
-	 * update the averaged RTT measurement only if the segment
-	 * acknowledges some new data, i.e., only if it advances the
-	 * left edge of the send window.
-	 *
-	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
-	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
-	 *
-	 * Changed: reset backoff as soon as we see the first valid sample.
-	 * If we do not, we get strongly overestimated rto. With timestamps
-	 * samples are accepted even from very old segments: f.e., when rtt=1
-	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
-	 * answer arrives rto becomes 120 seconds! If at least one of segments
-	 * in window is lost... Voila. --ANK (010210)
-	 */
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-}
-
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
-{
-	/* We don't have a timestamp. Can only use
-	 * packets that are not retransmitted to determine
-	 * rtt estimates. Also, we must not reset the
-	 * backoff for rto until we get a non-retransmitted
-	 * packet. This allows us to deal with a situation
-	 * where the network delay has increased suddenly.
-	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
-	 */
-	if (flag & FLAG_RETRANS_DATA_ACKED)
-		return;
-
-	tcp_valid_rtt_meas(sk, seq_rtt);
-}
-
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      const s32 seq_rtt)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
-	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(sk, flag);
-	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(sk, seq_rtt, flag);
+static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      s32 seq_rtt, s32 sack_rtt)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+	 * broken middle-boxes or peers may corrupt TS-ECR fields. But
+	 * Karn's algorithm forbids taking RTT if some retransmitted data
+	 * is acked (RFC6298).
+	 */
+	if (flag & FLAG_RETRANS_DATA_ACKED)
+		seq_rtt = -1;
+
+	if (seq_rtt < 0)
+		seq_rtt = sack_rtt;
+
+	/* RTTM Rule: A TSecr value received in a segment is used to
+	 * update the averaged RTT measurement only if the segment
+	 * acknowledges some new data, i.e., only if it advances the
+	 * left edge of the send window.
+	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
+	 */
+	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+
+	if (seq_rtt < 0)
+		return false;
+
+	tcp_rtt_estimator(sk, seq_rtt);
+	tcp_set_rto(sk);
+
+	/* RFC6298: only reset backoff on valid RTT measurement. */
+	inet_csk(sk)->icsk_backoff = 0;
+	return true;
+}
+
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 seq_rtt = -1;
+
+	if (tp->lsndtime && !tp->total_retrans)
+		seq_rtt = tcp_time_stamp - tp->lsndtime;
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
 }
 
 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
@@ -2939,7 +2933,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  * arrived at the other end.
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-			       u32 prior_snd_una)
+			       u32 prior_snd_una, s32 sack_rtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2978,8 +2972,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			if (sacked & TCPCB_SACKED_RETRANS)
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
-			ca_seq_rtt = -1;
-			seq_rtt = -1;
 		} else {
 			ca_seq_rtt = now - scb->when;
 			last_ackt = skb->tstamp;
@@ -3031,6 +3023,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
+	if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
+	    (flag & FLAG_ACKED))
+		tcp_rearm_rto(sk);
+
 	if (flag & FLAG_ACKED) {
 		const struct tcp_congestion_ops *ca_ops
 			= inet_csk(sk)->icsk_ca_ops;
@@ -3040,9 +3036,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			tcp_mtup_probe_success(sk);
 		}
 
-		tcp_ack_update_rtt(sk, flag, seq_rtt);
-		tcp_rearm_rto(sk);
-
 		if (tcp_is_reno(tp)) {
 			tcp_remove_reno_sacks(sk, pkts_acked);
 		} else {
@@ -3274,6 +3267,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
 	int acked = 0; /* Number of packets newly acked */
+	s32 sack_rtt = -1;
 
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
@@ -3330,7 +3324,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
 
 		if (TCP_SKB_CB(skb)->sacked)
-			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+							&sack_rtt);
 
 		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
@@ -3349,7 +3344,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	acked = tp->packets_out;
-	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
+	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
 	acked -= tp->packets_out;
 
 	if (tcp_ack_is_dubious(sk, flag)) {
@@ -3402,7 +3397,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	 * If data was DSACKed, see if we can undo a cwnd reduction.
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
-		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+						&sack_rtt);
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
 				      is_dupack, flag);
 	}
@@ -5624,9 +5620,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * so release it.
 		 */
 		if (req) {
-			tcp_synack_rtt_meas(sk, req);
-			tp->total_retrans = req->num_retrans;
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
 			/* Make sure socket is routed, for correct metrics. */
@@ -5651,6 +5645,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+		tcp_synack_rtt_meas(sk, req);
 
 		if (tp->rx_opt.tstamp_ok)
 			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
...
@@ -1671,8 +1671,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
-	tcp_synack_rtt_meas(newsk, req);
-	newtp->total_retrans = req->num_retrans;
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Copy over the MD5 key from the original socket */
...
@@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 		tcp_enable_early_retrans(newtp);
 		newtp->tlp_high_seq = 0;
+		newtp->lsndtime = treq->snt_synack;
+		newtp->total_retrans = req->num_retrans;
 
 		/* So many TCP implementations out there (incorrectly) count the
 		 * initial SYN frame in their delayed-ACK and congestion control
@@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 
-	/* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
-	if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
-		tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
-	else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
-		tcp_rsk(req)->snt_synack = 0;
-
 	/* For Fast Open no more processing is needed (sk is the
 	 * child socket).
 	 */
...
@@ -1237,8 +1237,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
-	tcp_synack_rtt_meas(newsk, req);
-	newtp->total_retrans = req->num_retrans;
 
 	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
 	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
...