Commit 16a76bdb authored by David S. Miller

[TCP]: Add vegas congestion avoidance support.

A forward port of an old 2.3.x kernel hack done
years ago.  I (DaveM) did the first rough port;
Stephen Hemminger cleaned it up and made it usable.
parent 91a79387
@@ -327,6 +327,10 @@ enum
NET_TCP_WESTWOOD=95,
NET_IPV4_IGMP_MAX_MSF=96,
NET_TCP_NO_METRICS_SAVE=97,
NET_TCP_VEGAS=98,
NET_TCP_VEGAS_ALPHA=99,
NET_TCP_VEGAS_BETA=100,
NET_TCP_VEGAS_GAMMA=101,
};
enum {
@@ -388,6 +388,18 @@ struct tcp_opt {
__u32 rtt;
__u32 rtt_min; /* minimum observed RTT */
} westwood;
/* Vegas variables */
struct {
__u32 beg_snd_nxt; /* right edge during last RTT */
__u32 beg_snd_una; /* left edge during last RTT */
__u32 beg_snd_cwnd; /* saves the size of the cwnd */
__u8 do_vegas; /* do vegas for this connection */
__u8 doing_vegas_now; /* if true, do vegas for this RTT */
__u16 cntRTT; /* # of RTTs measured within last RTT */
__u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
__u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
} vegas;
};
/* WARNING: don't change the layout of the members in tcp_sock! */
@@ -583,6 +583,10 @@ extern int sysctl_tcp_tw_reuse;
extern int sysctl_tcp_frto;
extern int sysctl_tcp_low_latency;
extern int sysctl_tcp_westwood;
extern int sysctl_tcp_vegas_cong_avoid;
extern int sysctl_tcp_vegas_alpha;
extern int sysctl_tcp_vegas_beta;
extern int sysctl_tcp_vegas_gamma;
extern int sysctl_tcp_nometrics_save;
extern atomic_t tcp_memory_allocated;
@@ -1212,8 +1216,56 @@ static inline __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
return max(tp->snd_cwnd >> 1U, 2U);
}
/* Stop taking Vegas samples for now. */
#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0)
/* Is this TCP connection using Vegas (regardless of whether it is taking
* Vegas measurements at the current time)?
*/
#define tcp_is_vegas(__tp) ((__tp)->vegas.do_vegas)
static inline void tcp_vegas_enable(struct tcp_opt *tp)
{
/* There are several situations when we must "re-start" Vegas:
*
* o when a connection is established
* o after an RTO
* o after fast recovery
* o when we send a packet and there is no outstanding
* unacknowledged data (restarting an idle connection)
*
* In these circumstances we cannot do a Vegas calculation at the
* end of the first RTT, because any calculation we do is using
* stale info -- both the saved cwnd and congestion feedback are
* stale.
*
* Instead we must wait until the completion of an RTT during
* which we actually receive ACKs.
*/
/* Begin taking Vegas samples next time we send something. */
tp->vegas.doing_vegas_now = 1;
/* Set the beginning of the next send window. */
tp->vegas.beg_snd_nxt = tp->snd_nxt;
tp->vegas.cntRTT = 0;
tp->vegas.minRTT = 0x7fffffff;
}
/* Should we be taking Vegas samples right now? */
#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now)
extern void tcp_vegas_init(struct tcp_opt *tp);
static inline void tcp_set_ca_state(struct tcp_opt *tp, u8 ca_state)
{
if (tcp_is_vegas(tp)) {
if (ca_state == TCP_CA_Open)
tcp_vegas_enable(tp);
else
tcp_vegas_disable(tp);
}
tp->ca_state = ca_state;
}
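For illustration, a minimal userspace sketch of the gating that tcp_set_ca_state() performs (not kernel code: the struct is trimmed to the two Vegas flags and the state constants are placeholders, not the kernel's definitions):

/* Hedged sketch: Vegas sampling is on only while the connection
 * is in the Open state; loss/recovery states suspend it.
 */
#include <stdio.h>

enum { CA_OPEN = 0, CA_RECOVERY = 3 };	/* placeholder values */

struct vegas_flags {
	int do_vegas;		/* Vegas chosen for this connection */
	int doing_vegas_now;	/* currently taking samples */
};

static void set_ca_state(struct vegas_flags *v, int ca_state)
{
	if (v->do_vegas)
		v->doing_vegas_now = (ca_state == CA_OPEN);
}

int main(void)
{
	struct vegas_flags v = { 1, 1 };

	set_ca_state(&v, CA_RECOVERY);
	printf("recovery: sampling=%d\n", v.doing_vegas_now);	/* 0 */
	set_ca_state(&v, CA_OPEN);
	printf("open:     sampling=%d\n", v.doing_vegas_now);	/* 1 */
	return 0;
}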
@@ -609,7 +609,38 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = NET_TCP_VEGAS,
.procname = "tcp_vegas_cong_avoid",
.data = &sysctl_tcp_vegas_cong_avoid,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = NET_TCP_VEGAS_ALPHA,
.procname = "tcp_vegas_alpha",
.data = &sysctl_tcp_vegas_alpha,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = NET_TCP_VEGAS_BETA,
.procname = "tcp_vegas_beta",
.data = &sysctl_tcp_vegas_beta,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = NET_TCP_VEGAS_GAMMA,
.procname = "tcp_vegas_gamma",
.data = &sysctl_tcp_vegas_gamma,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{ .ctl_name = 0 }
};
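As a usage note, these knobs surface under /proc/sys/net/ipv4/ via the procnames above. A hedged userspace sketch (assuming a kernel carrying this patch; error handling kept minimal):

/* Sketch: turn Vegas on by writing the sysctl registered above. */
#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* alpha/beta/gamma keep their built-in defaults (2, 6, 2 in
	 * V_PARAM_SHIFT fixed point) unless also written here. */
	return write_sysctl("/proc/sys/net/ipv4/tcp_vegas_cong_avoid", "1");
}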
@@ -86,8 +86,17 @@ int sysctl_tcp_stdurg;
int sysctl_tcp_rfc1337;
int sysctl_tcp_max_orphans = NR_FILE;
int sysctl_tcp_frto;
-int sysctl_tcp_westwood;
int sysctl_tcp_nometrics_save;
+int sysctl_tcp_westwood;
int sysctl_tcp_vegas_cong_avoid;
/* Default values of the Vegas variables, in fixed-point representation
* with V_PARAM_SHIFT bits to the right of the binary point.
*/
#define V_PARAM_SHIFT 1
int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
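To make the fixed-point encoding concrete: with V_PARAM_SHIFT = 1, a stored value v means v/2 segments, so the defaults above are alpha = 1.0, beta = 3.0, and gamma = 1.0 segments. A minimal sketch of the decode, using only the definitions above:

/* Sketch: decoding the V_PARAM_SHIFT fixed-point defaults. */
#include <stdio.h>

#define V_PARAM_SHIFT 1

int main(void)
{
	int alpha = 1 << V_PARAM_SHIFT;	/* stores 2 -> 1.0 segment  */
	int beta  = 3 << V_PARAM_SHIFT;	/* stores 6 -> 3.0 segments */
	int gamma = 1 << V_PARAM_SHIFT;	/* stores 2 -> 1.0 segment  */

	printf("alpha=%g beta=%g gamma=%g segments\n",
	       (double)alpha / (1 << V_PARAM_SHIFT),
	       (double)beta  / (1 << V_PARAM_SHIFT),
	       (double)gamma / (1 << V_PARAM_SHIFT));
	return 0;
}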
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -404,6 +413,42 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_opt *tp, struct sk_b
tcp_grow_window(sk, tp, skb);
}
/* Set up a new TCP connection, depending on whether it should be
* using Vegas or not.
*/
void tcp_vegas_init(struct tcp_opt *tp)
{
if (sysctl_tcp_vegas_cong_avoid) {
tp->vegas.do_vegas = 1;
tp->vegas.baseRTT = 0x7fffffff;
tcp_vegas_enable(tp);
} else
tcp_vegas_disable(tp);
}
/* Do RTT sampling needed for Vegas.
* Basically we:
* o min-filter RTT samples from within an RTT to get the current
* propagation delay + queuing delay (we are min-filtering to try to
* avoid the effects of delayed ACKs)
* o min-filter RTT samples from a much longer window (forever for now)
* to find the propagation delay (baseRTT)
*/
static inline void vegas_rtt_calc(struct tcp_opt *tp, __u32 rtt)
{
__u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
/* Filter to find propagation delay: */
if (vrtt < tp->vegas.baseRTT)
tp->vegas.baseRTT = vrtt;
/* Find the min RTT during the last RTT to find
* the current prop. delay + queuing delay:
*/
tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
tp->vegas.cntRTT++;
}
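A standalone trace of the two min-filters, as a sketch (the struct is reduced to the fields the filter touches; the sample values are illustrative only):

/* Sketch: vegas_rtt_calc()'s filters in isolation. */
#include <stdio.h>

struct vegas { unsigned cntRTT, minRTT, baseRTT; };

static void rtt_calc(struct vegas *v, unsigned rtt)
{
	unsigned vrtt = rtt + 1;	/* never allow a zero RTT */

	if (vrtt < v->baseRTT)		/* long-term min: propagation delay */
		v->baseRTT = vrtt;
	if (vrtt < v->minRTT)		/* per-RTT min: prop. + queuing delay */
		v->minRTT = vrtt;
	v->cntRTT++;
}

int main(void)
{
	struct vegas v = { 0, 0x7fffffff, 0x7fffffff };
	unsigned samples[] = { 120, 150, 110, 180 };	/* usec */

	for (int i = 0; i < 4; i++)
		rtt_calc(&v, samples[i]);
	/* Prints cnt=4 min=111 base=111; once queues build, minRTT for
	 * a given RTT rises above baseRTT and that gap drives Vegas. */
	printf("cnt=%u min=%u base=%u\n", v.cntRTT, v.minRTT, v.baseRTT);
	return 0;
}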
/* Called to compute a smoothed rtt estimate. The data fed to this
* routine either comes from timestamps, or from segments that were
* known _not_ to have been retransmitted [see Karn/Partridge
@@ -417,6 +462,9 @@ static void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
long m = mrtt; /* RTT */
if (tcp_vegas_enabled(tp))
vegas_rtt_calc(tp, mrtt);
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
@@ -1810,11 +1858,10 @@ tcp_ack_update_rtt(struct tcp_opt *tp, int flag, s32 seq_rtt)
else if (seq_rtt >= 0)
tcp_ack_no_tstamp(tp, seq_rtt, flag);
}
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
+static __inline__ void reno_cong_avoid(struct tcp_opt *tp)
{
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* In "safe" area, increase. */
@@ -1834,6 +1881,236 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
/* This is based on the congestion detection/avoidance scheme described in
* Lawrence S. Brakmo and Larry L. Peterson.
* "TCP Vegas: End to end congestion avoidance on a global internet."
* IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
* October 1995. Available from:
* ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
*
* See http://www.cs.arizona.edu/xkernel/ for their implementation.
* The main aspects that distinguish this implementation from the
* Arizona Vegas implementation are:
* o We do not change the loss detection or recovery mechanisms of
* Linux in any way. Linux already recovers from losses quite well,
* using fine-grained timers, NewReno, and FACK.
* o To avoid the performance penalty imposed by increasing cwnd
* only every-other RTT during slow start, we increase during
* every RTT during slow start, just like Reno.
* o Largely to allow continuous cwnd growth during slow start,
* we use the rate at which ACKs come back as the "actual"
* rate, rather than the rate at which data is sent.
* o To speed convergence to the right rate, we set the cwnd
* to achieve the right ("actual") rate when we exit slow start.
* o To filter out the noise caused by delayed ACKs, we use the
* minimum RTT sample observed during the last RTT to calculate
* the actual rate.
* o When the sender re-starts from idle, it waits until it has
* received ACKs for an entire flight of new data before making
* a cwnd adjustment decision. The original Vegas implementation
* assumed senders never went idle.
*/
static void vegas_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt)
{
/* The key players are v_beg_snd_una and v_beg_snd_nxt.
*
* These are so named because they represent the approximate values
* of snd_una and snd_nxt at the beginning of the current RTT. More
* precisely, they represent the amount of data sent during the RTT.
* At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
* we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
* bytes of data have been ACKed during the course of the RTT, giving
* an "actual" rate of:
*
* (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
*
* Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
* because delayed ACKs can cover more than one segment, so they
* don't line up nicely with the boundaries of RTTs.
*
* Another unfortunate fact of life is that delayed ACKs delay the
* advance of the left edge of our send window, so that the number
* of bytes we send in an RTT is often less than our cwnd will allow.
* So we keep track of our cwnd separately, in v_beg_snd_cwnd.
*/
if (after(ack, tp->vegas.beg_snd_nxt)) {
/* Do the Vegas once-per-RTT cwnd adjustment. */
u32 old_wnd, old_snd_cwnd;
/* Here old_wnd is essentially the window of data that was
* sent during the previous RTT, and has all
* been acknowledged in the course of the RTT that ended
* with the ACK we just received. Likewise, old_snd_cwnd
* is the cwnd during the previous RTT.
*/
old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
tp->mss_cache;
old_snd_cwnd = tp->vegas.beg_snd_cwnd;
/* Save the extent of the current window so we can use this
* at the end of the next RTT.
*/
tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
tp->vegas.beg_snd_nxt = tp->snd_nxt;
tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
/* Take into account the current RTT sample too, to
* decrease the impact of delayed acks. This double counts
* this sample since we count it for the next window as well,
* but that's not too awful, since we're taking the min,
* rather than averaging.
*/
vegas_rtt_calc(tp, seq_rtt);
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
* at least one RTT sample that wasn't from a delayed ACK.
* If we only had 2 samples total,
* then that means we're getting only 1 ACK per RTT, which
* means they're almost certainly delayed ACKs.
* If we have 3 samples, we should be OK.
*/
if (tp->vegas.cntRTT <= 2) {
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd++;
} else {
u32 rtt, target_cwnd, diff;
/* We have enough RTT samples, so, using the Vegas
* algorithm, we determine if we should increase or
* decrease cwnd, and by how much.
*/
/* Pluck out the RTT we are using for the Vegas
* calculations. This is the min RTT seen during the
* last RTT. Taking the min filters out the effects
* of delayed ACKs, at the cost of noticing congestion
* a bit later.
*/
rtt = tp->vegas.minRTT;
/* Calculate the cwnd we should have, if we weren't
* going too fast.
*
* This is:
* (actual rate in segments) * baseRTT
* We keep it as a fixed point number with
* V_PARAM_SHIFT bits to the right of the binary point.
*/
target_cwnd = ((old_wnd * tp->vegas.baseRTT)
<< V_PARAM_SHIFT) / rtt;
/* Calculate the difference between the window we had,
* and the window we would like to have. This quantity
* is the "Diff" from the Arizona Vegas papers.
*
* Again, this is a fixed point number with
* V_PARAM_SHIFT bits to the right of the binary
* point.
*/
diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
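/* Worked example with illustrative numbers: old_wnd = 10 segments,
 * baseRTT = 100 usec, rtt = 125 usec.
 *
 *   target_cwnd = ((10 * 100) << 1) / 125 = 16  -> 8.0 segments
 *   diff        = (10 << 1) - 16          =  4  -> 2.0 segments queued
 *
 * With the defaults alpha = 2 (1.0) and beta = 6 (3.0) we have
 * alpha < diff < beta, so congestion avoidance would hold cwnd
 * steady; in slow start, diff = 4 > gamma = 2 would trigger the
 * switch to congestion avoidance below.
 */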
if (tp->snd_cwnd < tp->snd_ssthresh) {
/* Slow start. */
if (diff > sysctl_tcp_vegas_gamma) {
/* Going too fast. Time to slow down
* and switch to congestion avoidance.
*/
tp->snd_ssthresh = 2;
/* Set cwnd to match the actual rate
* exactly:
* cwnd = (actual rate) * baseRTT
* Then we add 1 because the integer
* truncation robs us of full link
* utilization.
*/
tp->snd_cwnd = min(tp->snd_cwnd,
(target_cwnd >>
V_PARAM_SHIFT)+1);
}
} else {
/* Congestion avoidance. */
u32 next_snd_cwnd;
/* Figure out where we would like cwnd
* to be.
*/
if (diff > sysctl_tcp_vegas_beta) {
/* The old window was too fast, so
* we slow down.
*/
next_snd_cwnd = old_snd_cwnd - 1;
} else if (diff < sysctl_tcp_vegas_alpha) {
/* We don't have enough extra packets
* in the network, so speed up.
*/
next_snd_cwnd = old_snd_cwnd + 1;
} else {
/* Sending just as fast as we
* should be.
*/
next_snd_cwnd = old_snd_cwnd;
}
/* Adjust cwnd upward or downward, toward the
* desired value.
*/
if (next_snd_cwnd > tp->snd_cwnd)
tp->snd_cwnd++;
else if (next_snd_cwnd < tp->snd_cwnd)
tp->snd_cwnd--;
}
}
/* Wipe the slate clean for the next RTT. */
tp->vegas.cntRTT = 0;
tp->vegas.minRTT = 0x7fffffff;
}
/* The following code is executed for every ack we receive,
* except for conditions checked in should_advance_cwnd()
* before the call to tcp_cong_avoid(). Mainly this means that
* we only execute this code if the ack actually acked some
* data.
*/
/* If we are in slow start, increase our cwnd in response to this ACK.
* (If we are not in slow start then we are in congestion avoidance,
* and adjust our congestion window only once per RTT. See the code
* above.)
*/
if (tp->snd_cwnd <= tp->snd_ssthresh)
tp->snd_cwnd++;
/* to keep cwnd from growing without bound */
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
/* Make sure that we are never so timid as to reduce our cwnd below
* 2 MSS.
*
* Going below 2 MSS would risk huge delayed ACKs from our receiver.
*/
tp->snd_cwnd = max(tp->snd_cwnd, 2U);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
static inline void tcp_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt)
{
if (tcp_vegas_enabled(tp))
vegas_cong_avoid(tp, ack, seq_rtt);
else
reno_cong_avoid(tp);
}
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
@@ -1848,7 +2125,7 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
}
/* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
{
struct tcp_opt *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1934,6 +2211,7 @@ static int tcp_clean_rtx_queue(struct sock *sk)
}
}
#endif
*seq_rtt_p = seq_rtt;
return acked;
}
@@ -2302,6 +2580,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight;
s32 seq_rtt;
int prior_packets;
/* If the ack is newer than sent or older than previous acks
@@ -2353,7 +2632,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
prior_in_flight = tcp_packets_in_flight(tp);
/* See if we can take anything off of the retransmit queue. */
-flag |= tcp_clean_rtx_queue(sk);
+flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
if (tp->frto_counter)
tcp_process_frto(sk, prior_snd_una);
@@ -2361,13 +2640,14 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (tcp_ack_is_dubious(tp, flag)) {
/* Advance CWND, if state allows this. */
if ((flag & FLAG_DATA_ACKED) &&
-prior_in_flight >= tp->snd_cwnd &&
+(tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
tcp_may_raise_cwnd(tp, flag))
-tcp_cong_avoid(tp);
+tcp_cong_avoid(tp, ack, seq_rtt);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
} else {
-if ((flag & FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd)
-tcp_cong_avoid(tp);
+if ((flag & FLAG_DATA_ACKED) &&
+(tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
+tcp_cong_avoid(tp, ack, seq_rtt);
}
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -841,6 +841,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
if (newtp->ecn_flags&TCP_ECN_OK)
newsk->sk_no_largesend = 1;
tcp_vegas_init(newtp);
TCP_INC_STATS_BH(TcpPassiveOpens);
}
return newsk;
@@ -106,6 +106,9 @@ static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
u32 restart_cwnd = tcp_init_cwnd(tp, dst);
u32 cwnd = tp->snd_cwnd;
if (tcp_is_vegas(tp))
tcp_vegas_enable(tp);
tp->snd_ssthresh = tcp_current_ssthresh(tp);
restart_cwnd = min(restart_cwnd, cwnd);
@@ -225,6 +228,19 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
(tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
}
/*
* If the connection is idle and we are restarting,
* then we don't want to do any Vegas calculations
* until we get fresh RTT samples. So when we
* restart, we reset our Vegas state to a clean
* slate. After we get acks for this flight of
* packets, _then_ we can make Vegas calculations
* again.
*/
if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
tcp_vegas_enable(tp);
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
skb_set_owner_w(skb, sk);
@@ -1268,6 +1284,7 @@ static inline void tcp_connect_init(struct sock *sk)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
tp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(sk);
tcp_vegas_init(tp);
tcp_select_initial_window(tcp_full_space(sk),
tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1318,6 +1335,7 @@ int tcp_connect(struct sock *sk)
TCP_SKB_CB(buff)->end_seq = tp->write_seq;
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
tcp_vegas_init(tp);
/* Send it off. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;