Commit 1f255691, authored by Priyaranjan Jha, committed by David S. Miller

tcp: higher throughput under reordering with adaptive RACK reordering wnd

Currently TCP RACK loss detection does not work well if packets are
being reordered beyond its static reordering window (min_rtt/4). Under
such reordering it may falsely trigger loss recoveries and reduce TCP
throughput significantly.

This patch improves that by increasing and reducing the reordering
window based on DSACK, which is now supported in major TCP implementations.
It makes RACK's reo_wnd adaptive based on DSACK and no. of recoveries.

- If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
  by srtt), since there is possibility that spurious retransmission was
  due to reordering delay longer than reo_wnd.

- Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
  no. of successful recoveries (accounts for full DSACK-based loss
  recovery undo). After that, reset it to default (min_rtt/4).

- At most, reo_wnd is incremented once per rtt, so that the new DSACK
  we are reacting to is (approximately) due to a spurious retx sent
  after reo_wnd was last updated.

- reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
  absolute value to account for change in rtt.

In our internal testing, we observed significant increase in throughput,
in scenarios where reordering exceeds min_rtt/4 (previous static value).
Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 6c49b5e2
...@@ -454,6 +454,7 @@ tcp_recovery - INTEGER ...@@ -454,6 +454,7 @@ tcp_recovery - INTEGER
RACK: 0x1 enables the RACK loss detection for fast detection of lost RACK: 0x1 enables the RACK loss detection for fast detection of lost
retransmissions and tail drops. retransmissions and tail drops.
RACK: 0x2 makes RACK's reordering window static (min_rtt/4).
Default: 0x1 Default: 0x1
......
...@@ -210,8 +210,13 @@ struct tcp_sock { ...@@ -210,8 +210,13 @@ struct tcp_sock {
u64 mstamp; /* (Re)sent time of the skb */ u64 mstamp; /* (Re)sent time of the skb */
u32 rtt_us; /* Associated RTT */ u32 rtt_us; /* Associated RTT */
u32 end_seq; /* Ending TCP sequence of the skb */ u32 end_seq; /* Ending TCP sequence of the skb */
u8 advanced; /* mstamp advanced since last lost marking */ u32 last_delivered; /* tp->delivered at last reo_wnd adj */
u8 reord; /* reordering detected */ u8 reo_wnd_steps; /* Allowed reordering window */
#define TCP_RACK_RECOVERY_THRESH 16
u8 reo_wnd_persist:5, /* No. of recovery since last adj */
dsack_seen:1, /* Whether DSACK seen after last adj */
advanced:1, /* mstamp advanced since last lost marking */
reord:1; /* reordering detected */
} rack; } rack;
u16 advmss; /* Advertised MSS */ u16 advmss; /* Advertised MSS */
u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */
......
...@@ -246,6 +246,7 @@ extern int sysctl_tcp_wmem[3]; ...@@ -246,6 +246,7 @@ extern int sysctl_tcp_wmem[3];
extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_rmem[3];
#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
extern atomic_long_t tcp_memory_allocated; extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated; extern struct percpu_counter tcp_sockets_allocated;
...@@ -1901,6 +1902,7 @@ extern void tcp_rack_mark_lost(struct sock *sk); ...@@ -1901,6 +1902,7 @@ extern void tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
u64 xmit_time); u64 xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_reo_timeout(struct sock *sk);
extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
/* At how many usecs into the future should the RTO fire? */ /* At how many usecs into the future should the RTO fire? */
static inline s64 tcp_rto_delta_us(const struct sock *sk) static inline s64 tcp_rto_delta_us(const struct sock *sk)
......
...@@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk) ...@@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk)
tcp_assign_congestion_control(sk); tcp_assign_congestion_control(sk);
tp->tsoffset = 0; tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;
sk->sk_state = TCP_CLOSE; sk->sk_state = TCP_CLOSE;
......
...@@ -856,6 +856,7 @@ void tcp_disable_fack(struct tcp_sock *tp) ...@@ -856,6 +856,7 @@ void tcp_disable_fack(struct tcp_sock *tp)
static void tcp_dsack_seen(struct tcp_sock *tp) static void tcp_dsack_seen(struct tcp_sock *tp)
{ {
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
tp->rack.dsack_seen = 1;
} }
static void tcp_update_reordering(struct sock *sk, const int metric, static void tcp_update_reordering(struct sock *sk, const int metric,
...@@ -2408,6 +2409,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) ...@@ -2408,6 +2409,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
mib_idx = LINUX_MIB_TCPFULLUNDO; mib_idx = LINUX_MIB_TCPFULLUNDO;
NET_INC_STATS(sock_net(sk), mib_idx); NET_INC_STATS(sock_net(sk), mib_idx);
} else if (tp->rack.reo_wnd_persist) {
tp->rack.reo_wnd_persist--;
} }
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq /* Hold old state until something *above* high_seq
...@@ -2427,6 +2430,8 @@ static bool tcp_try_undo_dsack(struct sock *sk) ...@@ -2427,6 +2430,8 @@ static bool tcp_try_undo_dsack(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (tp->undo_marker && !tp->undo_retrans) { if (tp->undo_marker && !tp->undo_retrans) {
tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
tp->rack.reo_wnd_persist + 1);
DBGUNDO(sk, "D-SACK"); DBGUNDO(sk, "D-SACK");
tcp_undo_cwnd_reduction(sk, false); tcp_undo_cwnd_reduction(sk, false);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
...@@ -3644,6 +3649,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ...@@ -3644,6 +3649,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
&sack_state); &sack_state);
tcp_rack_update_reo_wnd(sk, &rs);
if (tp->tlp_high_seq) if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag); tcp_process_tlp_ack(sk, ack, flag);
/* If needed, reset TLP/RTO timer; RACK may later override this. */ /* If needed, reset TLP/RTO timer; RACK may later override this. */
......
...@@ -551,6 +551,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, ...@@ -551,6 +551,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->syn_data_acked = 0; newtp->syn_data_acked = 0;
newtp->rack.mstamp = 0; newtp->rack.mstamp = 0;
newtp->rack.advanced = 0; newtp->rack.advanced = 0;
newtp->rack.reo_wnd_steps = 1;
newtp->rack.last_delivered = 0;
newtp->rack.reo_wnd_persist = 0;
newtp->rack.dsack_seen = 0;
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
} }
......
...@@ -44,6 +44,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) ...@@ -44,6 +44,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
u32 min_rtt = tcp_min_rtt(tp);
struct sk_buff *skb, *n; struct sk_buff *skb, *n;
u32 reo_wnd; u32 reo_wnd;
...@@ -54,8 +55,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) ...@@ -54,8 +55,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
* to queuing or delayed ACKs. * to queuing or delayed ACKs.
*/ */
reo_wnd = 1000; reo_wnd = 1000;
if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) {
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
}
list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
tcp_tsorted_anchor) { tcp_tsorted_anchor) {
...@@ -160,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk) ...@@ -160,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk)
if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
tcp_rearm_rto(sk); tcp_rearm_rto(sk);
} }
/* Adapt RACK's reordering window using DSACK feedback and the number of
 * recoveries since the last adjustment.
 *
 * The window is kept as a count of steps, each worth min_rtt/4, rather
 * than as an absolute time, so that it follows changes in the rtt.
 *
 * - A pending DSACK hints that a spurious retransmission was caused by
 *   reordering longer than the current window, so one more step is
 *   added (the step count saturates at 0xFF).
 *
 * - A DSACK is disregarded when a rtt has not yet passed since the
 *   previous adjustment, limiting growth to at most one step per rtt.
 *
 * - After growing, the window is persisted for TCP_RACK_RECOVERY_THRESH
 *   (16) recoveries; once that persist count is zero the window goes
 *   back to the default of a single step (min_rtt/4).
 *
 * Nothing is done when TCP_RACK_STATIC_REO_WND is set in the
 * tcp_recovery sysctl, or when rs->prior_delivered is zero.
 */
void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Static window requested via sysctl: never adapt */
	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND)
		return;

	if (!rs->prior_delivered)
		return;

	/* A DSACK seen before a rtt has passed since the last adjustment
	 * is ignored.
	 */
	if (before(rs->prior_delivered, tp->rack.last_delivered))
		tp->rack.dsack_seen = 0;

	if (!tp->rack.dsack_seen) {
		/* No update pending; fall back to the default window once
		 * the persist count has run out.
		 */
		if (!tp->rack.reo_wnd_persist)
			tp->rack.reo_wnd_steps = 1;
		return;
	}

	/* Update pending: widen by one step and restart the persist count */
	tp->rack.reo_wnd_steps = min_t(u32, 0xFF, tp->rack.reo_wnd_steps + 1);
	tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
	tp->rack.last_delivered = tp->delivered;
	tp->rack.dsack_seen = 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment