Commit f0fdc80b authored by David S. Miller

Merge branch 'pmtu-probe'

Fan Du says:

====================
Improvements for TCP PMTU

This patchset performs some improvements and enhancement
for current TCP PMTU as per RFC4821 with the aim to find
optimal mms size quickly, and also be adaptive to route
changes like enlarged path MTU. Then TCP PMTU could be
used to probe an effective pmtu in the absence of ICMP messages
for tunnels(e.g. vxlan) across different networking stack.

Patch1/4: Set probe mss base to 1024 Bytes per RFC4821
Patch2/4: Do not double probe_size for each probing,
          use a simple binary search to choose the mss
          for the next probing and gain maximum performance.
Patch3/4: Create a probe timer to detect enlarged path MTU.
Patch4/4: Update ip-sysctl.txt for new sysctl knobs.

Changelog:
v5:
  - Zero probe_size before resetting search range.
  - Update ip-sysctl.txt for new sysctl knobs.
v4:
  - Convert probe_size to mss, not directly from search_low/high
  - Clamp probe_threshold
  - Don't adjust search_high in blackhole probe, so drop original patch3
v3:
  - Update commit message for patch2
  - Fix pseudo timer delta calculation in patch4
v2:
  - Introduce sysctl_tcp_probe_threshold to control when
    probing will stop, as suggested by John Heffner.
  - Add patch3 to shrink current mss value for search low boundary.
  - Drop canonical timer usage, implement a pseudo timer based on
    32-bit jiffies tcp_time_stamp, as suggested by Eric Dumazet.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents aaa4e704 fab42760
...@@ -388,6 +388,16 @@ tcp_mtu_probing - INTEGER ...@@ -388,6 +388,16 @@ tcp_mtu_probing - INTEGER
1 - Disabled by default, enabled when an ICMP black hole detected 1 - Disabled by default, enabled when an ICMP black hole detected
2 - Always enabled, use initial MSS of tcp_base_mss. 2 - Always enabled, use initial MSS of tcp_base_mss.
tcp_probe_interval - INTEGER
Controls how often to start TCP Packetization-Layer Path MTU
Discovery reprobe. The default is reprobing every 10 minutes as
per RFC4821.
tcp_probe_threshold - INTEGER
Controls when TCP Packetization-Layer Path MTU Discovery probing
will stop with respect to the width of the search range in bytes. Default
is 8 bytes.
tcp_no_metrics_save - BOOLEAN tcp_no_metrics_save - BOOLEAN
By default, TCP saves various connection metrics in the route cache By default, TCP saves various connection metrics in the route cache
when the connection closes, so that connections established in the when the connection closes, so that connections established in the
......
...@@ -126,6 +126,8 @@ struct inet_connection_sock { ...@@ -126,6 +126,8 @@ struct inet_connection_sock {
/* Information on the current probe. */ /* Information on the current probe. */
int probe_size; int probe_size;
u32 probe_timestamp;
} icsk_mtup; } icsk_mtup;
u32 icsk_ca_priv[16]; u32 icsk_ca_priv[16];
u32 icsk_user_timeout; u32 icsk_user_timeout;
......
...@@ -87,6 +87,8 @@ struct netns_ipv4 { ...@@ -87,6 +87,8 @@ struct netns_ipv4 {
int sysctl_tcp_fwmark_accept; int sysctl_tcp_fwmark_accept;
int sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probing;
int sysctl_tcp_base_mss; int sysctl_tcp_base_mss;
int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;
struct ping_group_range ping_group_range; struct ping_group_range ping_group_range;
......
...@@ -65,7 +65,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -65,7 +65,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_MIN_MSS 88U #define TCP_MIN_MSS 88U
/* The least MTU to use for probing */ /* The least MTU to use for probing */
#define TCP_BASE_MSS 512 #define TCP_BASE_MSS 1024
/* probing interval, default to 10 minutes as per RFC4821 */
#define TCP_PROBE_INTERVAL 600
/* Specify interval when tcp mtu probing will stop */
#define TCP_PROBE_THRESHOLD 8
/* After receiving this amount of duplicate ACKs fast retransmit starts. */ /* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3 #define TCP_FASTRETRANS_THRESH 3
......
...@@ -883,6 +883,20 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -883,6 +883,20 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec, .proc_handler = proc_dointvec,
}, },
{
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "tcp_probe_interval",
.data = &init_net.ipv4.sysctl_tcp_probe_interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ } { }
}; };
......
...@@ -2460,6 +2460,8 @@ static int __net_init tcp_sk_init(struct net *net) ...@@ -2460,6 +2460,8 @@ static int __net_init tcp_sk_init(struct net *net)
} }
net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
return 0; return 0;
fail: fail:
......
...@@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk) ...@@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_af_ops->net_header_len; icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0; icsk->icsk_mtup.probe_size = 0;
if (icsk->icsk_mtup.enabled)
icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
} }
EXPORT_SYMBOL(tcp_mtup_init); EXPORT_SYMBOL(tcp_mtup_init);
...@@ -1828,6 +1830,31 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, ...@@ -1828,6 +1830,31 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
return false; return false;
} }
/* Restart MTU probing once the configured reprobe interval has passed.
 *
 * Measures the time elapsed since icsk_mtup.probe_timestamp and compares
 * it with sysctl_tcp_probe_interval scaled by HZ.  When the interval has
 * elapsed, the pending probe size is cleared, the search range is reset
 * (upper bound from mss_clamp plus TCP and network header lengths, lower
 * bound from the current MSS) and the probe timestamp is refreshed.
 *
 * NOTE(review): the elapsed time is held in an s32 but compared against a
 * u32 product, so the comparison is presumably carried out unsigned; the
 * product may also wrap for very large sysctl values -- confirm.
 */
static inline void tcp_mtu_check_reprobe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	u32 interval = net->ipv4.sysctl_tcp_probe_interval;
	s32 delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;

	if (unlikely(delta >= interval * HZ)) {
		/* Sample the current MSS before the search range is touched. */
		int cur_mss = tcp_current_mss(sk);

		/* No probe is in flight for the fresh search range. */
		icsk->icsk_mtup.probe_size = 0;

		/* Upper bound: clamped MSS plus TCP and network headers. */
		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
					      sizeof(struct tcphdr) +
					      icsk->icsk_af_ops->net_header_len;

		/* Lower bound: the MTU matching the MSS in use right now. */
		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, cur_mss);

		/* Start timing the next reprobe interval from here. */
		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
	}
}
/* Create a new MTU probe if we are ready. /* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by * MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing * deliberately sending larger packets. This discovers routing
...@@ -1842,11 +1869,13 @@ static int tcp_mtu_probe(struct sock *sk) ...@@ -1842,11 +1869,13 @@ static int tcp_mtu_probe(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *nskb, *next; struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
int len; int len;
int probe_size; int probe_size;
int size_needed; int size_needed;
int copy; int copy;
int mss_now; int mss_now;
int interval;
/* Not currently probing/verifying, /* Not currently probing/verifying,
* not in recovery, * not in recovery,
...@@ -1859,12 +1888,25 @@ static int tcp_mtu_probe(struct sock *sk) ...@@ -1859,12 +1888,25 @@ static int tcp_mtu_probe(struct sock *sk)
tp->rx_opt.num_sacks || tp->rx_opt.dsack) tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1; return -1;
/* Very simple search strategy: just double the MSS. */ /* Use binary search for probe_size between tcp_mss_base,
* and current mss_clamp. if (search_high - search_low)
* smaller than a threshold, backoff from probing.
*/
mss_now = tcp_current_mss(sk); mss_now = tcp_current_mss(sk);
probe_size = 2 * tp->mss_cache; probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
/* TODO: set timer for probe_converge_event */ /* When misfortune happens, we are reprobing actively,
* and then reprobe timer has expired. We stick with current
* probing process by not resetting search range to its original.
*/
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
interval < net->ipv4.sysctl_tcp_probe_threshold) {
/* Check whether enough time has elapsed for
* another round of probing.
*/
tcp_mtu_check_reprobe(sk);
return -1; return -1;
} }
......
...@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) ...@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
if (net->ipv4.sysctl_tcp_mtu_probing) { if (net->ipv4.sysctl_tcp_mtu_probing) {
if (!icsk->icsk_mtup.enabled) { if (!icsk->icsk_mtup.enabled) {
icsk->icsk_mtup.enabled = 1; icsk->icsk_mtup.enabled = 1;
icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else { } else {
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment