Commit 43e122b0, authored by Eric Dumazet, committed by David S. Miller

tcp: refine pacing rate determination

When TCP pacing was added back in linux-3.12, we chose
to apply a fixed ratio of 200 % against current rate,
to allow probing for optimal throughput even during
slow start phase, where cwnd can be doubled every other RTT.

At Google, we found it was better to apply a different ratio
while in Congestion Avoidance phase.
This ratio was set to 120 %.

We've used the normal tcp_in_slow_start() helper for a while,
then tuned the condition to select the conservative ratio
as soon as cwnd >= ssthresh/2 :

- After cwnd reduction, it is safer to ramp up more slowly,
  as we approach optimal cwnd.
- Initial ramp up (ssthresh == INFINITY) still allows doubling
  cwnd every other RTT.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 4ec3b28c
...@@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER ...@@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER
if available window is too small. if available window is too small.
Default: 2 Default: 2
tcp_pacing_ss_ratio - INTEGER
sk->sk_pacing_rate is set by TCP stack using a ratio applied
to current rate. (current_rate = cwnd * mss / srtt)
If TCP is in slow start, tcp_pacing_ss_ratio is applied
to let TCP probe for bigger speeds, assuming cwnd can be
doubled every other RTT.
Default: 200
tcp_pacing_ca_ratio - INTEGER
sk->sk_pacing_rate is set by TCP stack using a ratio applied
to current rate. (current_rate = cwnd * mss / srtt)
If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio
is applied to conservatively probe for bigger throughput.
Default: 120
tcp_tso_win_divisor - INTEGER tcp_tso_win_divisor - INTEGER
This allows control over what percentage of the congestion window This allows control over what percentage of the congestion window
can be consumed by a single TSO frame. can be consumed by a single TSO frame.
......
...@@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat; ...@@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_autocorking; extern int sysctl_tcp_autocorking;
extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_invalid_ratelimit;
extern int sysctl_tcp_pacing_ss_ratio;
extern int sysctl_tcp_pacing_ca_ratio;
extern atomic_long_t tcp_memory_allocated; extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated; extern struct percpu_counter tcp_sockets_allocated;
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
static int zero; static int zero;
static int one = 1; static int one = 1;
static int four = 4; static int four = 4;
static int thousand = 1000;
static int gso_max_segs = GSO_MAX_SEGS; static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255; static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_min[] = { 1, 1 };
...@@ -711,6 +712,24 @@ static struct ctl_table ipv4_table[] = { ...@@ -711,6 +712,24 @@ static struct ctl_table ipv4_table[] = {
.extra1 = &one, .extra1 = &one,
.extra2 = &gso_max_segs, .extra2 = &gso_max_segs,
}, },
{
.procname = "tcp_pacing_ss_ratio",
.data = &sysctl_tcp_pacing_ss_ratio,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &thousand,
},
{
.procname = "tcp_pacing_ca_ratio",
.data = &sysctl_tcp_pacing_ca_ratio,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &thousand,
},
{ {
.procname = "tcp_autocorking", .procname = "tcp_autocorking",
.data = &sysctl_tcp_autocorking, .data = &sysctl_tcp_autocorking,
......
...@@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) ...@@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
* TCP pacing, to smooth the burst on large writes when packets * TCP pacing, to smooth the burst on large writes when packets
* in flight is significantly lower than cwnd (or rwin) * in flight is significantly lower than cwnd (or rwin)
*/ */
int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
static void tcp_update_pacing_rate(struct sock *sk) static void tcp_update_pacing_rate(struct sock *sk)
{ {
const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_sock *tp = tcp_sk(sk);
u64 rate; u64 rate;
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
/* current rate is (cwnd * mss) / srtt
* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
* In Congestion Avoidance phase, set it to 120 % the current rate.
*
* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
* end of slow start and should slow down.
*/
if (tp->snd_cwnd < tp->snd_ssthresh / 2)
rate *= sysctl_tcp_pacing_ss_ratio;
else
rate *= sysctl_tcp_pacing_ca_ratio;
rate *= max(tp->snd_cwnd, tp->packets_out); rate *= max(tp->snd_cwnd, tp->packets_out);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment