Commit 3ec21b65 authored by David S. Miller

Merge branch 'tcp-fastopen-middlebox-fixes'

Wei Wang says:

====================
net/tcp_fastopen: Fix for various TFO firewall issues

Currently there are still firewall issues in some middleboxes that make
them silently drop packets for TFO sockets. This kind of issue is hard
for the end client to detect.

This patch series tries to detect such issues in the kernel and disable
TFO temporarily.
More details about the issues and the fixes are included in the following
patches.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents bc95cd8e 59450f8d
@@ -602,6 +602,14 @@ tcp_fastopen - INTEGER
 	Note that that additional client or server features are only
 	effective if the basic support (0x1 and 0x2) are enabled respectively.
 
+tcp_fastopen_blackhole_timeout_sec - INTEGER
+	Initial time period in seconds to disable Fastopen on active TCP sockets
+	when a TFO firewall blackhole issue happens.
+	This time period will grow exponentially when more blackhole issues
+	get detected right after Fastopen is re-enabled and will reset to
+	initial value when the blackhole issue goes away.
+	By default, it is set to 1hr.
+
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
 	will be retransmitted. Should not be higher than 127. Default value
......
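The disable period described by the new sysctl follows a capped exponential backoff: each further blackhole detection doubles the period, up to 2^6 times the configured base (the same cap used in the tcp_fastopen.c hunk below). The following is a minimal user-space sketch of that arithmetic only, assuming the default of 3600 seconds; the helper name and the program itself are illustrative and not part of the patch.

#include <stdio.h>

/* Illustrative restatement of the backoff described above: the n-th
 * consecutive blackhole detection disables active TFO for
 * base * 2^(n-1) seconds, capped at base * 2^6.
 */
static unsigned int tfo_disable_period(unsigned int base_sec, int detections)
{
	int shift = detections - 1;

	if (detections <= 0)
		return 0;		/* TFO is not disabled */
	if (shift > 6)
		shift = 6;		/* cap mirrors the kernel's 2^6 limit */
	return base_sec * (1u << shift);
}

int main(void)
{
	/* With the default of 3600s: 1h, 2h, 4h, ... capped at 64h. */
	for (int n = 1; n <= 8; n++)
		printf("detection %d -> disabled for %u sec\n",
		       n, tfo_disable_period(3600, n));
	return 0;
}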
@@ -233,6 +233,7 @@ struct tcp_sock {
 	u8	syn_data:1,	/* SYN includes data */
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
+		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
 		save_syn:1,	/* Save headers of SYN packet */
 		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
......
@@ -1506,6 +1506,12 @@ struct tcp_fastopen_context {
 	struct rcu_head	rcu;
 };
 
+extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
+void tcp_fastopen_active_disable(struct sock *sk);
+bool tcp_fastopen_active_should_disable(struct sock *sk);
+void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
+void tcp_fastopen_active_timeout_reset(void);
+
 /* Latencies incurred by various limits for a sender. They are
  * chronograph-like stats that are mutually exclusive.
  */
......
@@ -259,6 +259,7 @@ enum
 	LINUX_MIB_TCPFASTOPENPASSIVEFAIL,	/* TCPFastOpenPassiveFail */
 	LINUX_MIB_TCPFASTOPENLISTENOVERFLOW,	/* TCPFastOpenListenOverflow */
 	LINUX_MIB_TCPFASTOPENCOOKIEREQD,	/* TCPFastOpenCookieReqd */
+	LINUX_MIB_TCPFASTOPENBLACKHOLE,		/* TCPFastOpenBlackholeDetect */
 	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES,	/* TCPSpuriousRtxHostQueues */
 	LINUX_MIB_BUSYPOLLRXPACKETS,		/* BusyPollRxPackets */
 	LINUX_MIB_TCPAUTOCORKING,		/* TCPAutoCorking */
......
@@ -281,6 +281,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
 	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
 	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
+	SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE),
 	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
 	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
 	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
......
@@ -350,6 +350,19 @@ static int proc_udp_early_demux(struct ctl_table *table, int write,
 	return ret;
 }
 
+static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
+					     int write,
+					     void __user *buffer,
+					     size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && ret == 0)
+		tcp_fastopen_active_timeout_reset();
+
+	return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname	= "tcp_timestamps",
@@ -399,6 +412,14 @@ static struct ctl_table ipv4_table[] = {
 		.maxlen		= ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
 		.proc_handler	= proc_tcp_fastopen_key,
 	},
+	{
+		.procname	= "tcp_fastopen_blackhole_timeout_sec",
+		.data		= &sysctl_tcp_fastopen_blackhole_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_tfo_blackhole_detect_timeout,
+		.extra1		= &zero,
+	},
 	{
 		.procname	= "tcp_abort_on_overflow",
 		.data		= &sysctl_tcp_abort_on_overflow,
......
@@ -2296,6 +2296,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
 	tcp_write_queue_purge(sk);
+	tcp_fastopen_active_disable_ofo_check(sk);
 	skb_rbtree_purge(&tp->out_of_order_queue);
 
 	inet->inet_dport = 0;
......
@@ -341,6 +341,13 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 		cookie->len = -1;
 		return false;
 	}
+
+	/* Firewall blackhole issue check */
+	if (tcp_fastopen_active_should_disable(sk)) {
+		cookie->len = -1;
+		return false;
+	}
+
 	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
 		cookie->len = -1;
 		return true;
@@ -380,3 +387,98 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
 	return false;
 }
 EXPORT_SYMBOL(tcp_fastopen_defer_connect);
+
+/*
+ * The following code block is to deal with middlebox issues with TFO:
+ * Middlebox firewall issues can potentially cause the server's data to be
+ * blackholed after a successful 3WHS using TFO.
+ * The proposed solution is to disable active TFO globally under the
+ * following circumstances:
+ *	1. client side TFO socket receives out of order FIN
+ *	2. client side TFO socket receives out of order RST
+ * We disable active side TFO globally for 1hr at first. Then if it
+ * happens again, we disable it for 2h, then 4h, 8h, ...
+ * And we reset the timeout back to 1hr when we see a successful active
+ * TFO connection with data exchanges.
+ */
+
+/* Default to 1hr */
+unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60;
+
+static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0);
+static unsigned long tfo_active_disable_stamp __read_mostly;
+
+/* Disable active TFO and record current jiffies and
+ * tfo_active_disable_times
+ */
+void tcp_fastopen_active_disable(struct sock *sk)
+{
+	atomic_inc(&tfo_active_disable_times);
+	tfo_active_disable_stamp = jiffies;
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE);
+}
+
+/* Reset tfo_active_disable_times to 0 */
+void tcp_fastopen_active_timeout_reset(void)
+{
+	atomic_set(&tfo_active_disable_times, 0);
+}
+
+/* Calculate timeout for tfo active disable
+ * Return true if we are still in the active TFO disable period
+ * Return false if timeout already expired and we should use active TFO
+ */
+bool tcp_fastopen_active_should_disable(struct sock *sk)
+{
+	int tfo_da_times = atomic_read(&tfo_active_disable_times);
+	int multiplier;
+	unsigned long timeout;
+
+	if (!tfo_da_times)
+		return false;
+
+	/* Limit timeout to max: 2^6 * initial timeout */
+	multiplier = 1 << min(tfo_da_times - 1, 6);
+	timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ;
+	if (time_before(jiffies, tfo_active_disable_stamp + timeout))
+		return true;
+
+	/* Mark check bit so we can check for successful active TFO
+	 * condition and reset tfo_active_disable_times
+	 */
+	tcp_sk(sk)->syn_fastopen_ch = 1;
+	return false;
+}
+
+/* Disable active TFO if FIN is the only packet in the ofo queue
+ * and no data is received.
+ * Also check if we can reset tfo_active_disable_times if data is
+ * received successfully on a marked active TFO socket opened on
+ * a non-loopback interface.
+ */
+void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct rb_node *p;
+	struct sk_buff *skb;
+	struct dst_entry *dst;
+
+	if (!tp->syn_fastopen)
+		return;
+
+	if (!tp->data_segs_in) {
+		p = rb_first(&tp->out_of_order_queue);
+		if (p && !rb_next(p)) {
+			skb = rb_entry(p, struct sk_buff, rbnode);
+			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+				tcp_fastopen_active_disable(sk);
+				return;
+			}
+		}
+	} else if (tp->syn_fastopen_ch &&
+		   atomic_read(&tfo_active_disable_times)) {
+		dst = sk_dst_get(sk);
+		if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
+			tcp_fastopen_active_timeout_reset();
+		dst_release(dst);
+	}
+}
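To make the interplay of the three helpers above easier to follow, here is a small user-space sketch that mocks the relevant state (the detection counter, the disable timestamp, and the per-socket probe bit) and walks one blackhole event through its lifecycle: detection disables active TFO, cookie checks fail during the backoff window, and the first successful data exchange after the window resets the counter. All names and the plain seconds clock are illustrative, not kernel API, and the sketch omits the non-loopback-route check that the real reset path performs.

#include <stdbool.h>
#include <stdio.h>

/* Mocked globals standing in for tfo_active_disable_times and
 * tfo_active_disable_stamp; time is in plain seconds, not jiffies.
 */
static int disable_times;
static long disable_stamp;
static long now;				/* fake clock */
static const unsigned int base_timeout = 3600;	/* sysctl default */

struct fake_sock { bool probe; };		/* stands in for syn_fastopen_ch */

static void active_disable(void)		/* ~ tcp_fastopen_active_disable() */
{
	disable_times++;
	disable_stamp = now;
}

static bool should_disable(struct fake_sock *sk)  /* ~ ..._should_disable() */
{
	int shift = disable_times - 1;

	if (!disable_times)
		return false;
	if (shift > 6)
		shift = 6;			/* capped exponential backoff */
	if (now < disable_stamp + (long)(base_timeout << shift))
		return true;
	sk->probe = true;	/* next successful connection may reset the counter */
	return false;
}

static void data_received_ok(struct fake_sock *sk)  /* success path */
{
	if (sk->probe && disable_times)
		disable_times = 0;	/* ~ tcp_fastopen_active_timeout_reset() */
}

int main(void)
{
	struct fake_sock sk = { false };

	active_disable();			/* out-of-order FIN/RST seen */
	printf("right after detection:  use TFO? %s\n",
	       should_disable(&sk) ? "no" : "yes");

	now += 3601;				/* backoff window has passed */
	printf("after the 1h window:    use TFO? %s (probe=%d)\n",
	       should_disable(&sk) ? "no" : "yes", sk.probe);

	data_received_ok(&sk);			/* probe connection succeeded */
	printf("after successful probe: disable_times=%d\n", disable_times);
	return 0;
}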
@@ -5300,8 +5300,16 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 		if (rst_seq_match)
 			tcp_reset(sk);
-		else
+		else {
+			/* Disable TFO if RST is out-of-order
+			 * and no data has been received
+			 * for current active TFO socket
+			 */
+			if (tp->syn_fastopen && !tp->data_segs_in &&
+			    sk->sk_state == TCP_ESTABLISHED)
+				tcp_fastopen_active_disable(sk);
 			tcp_send_challenge_ack(sk, skb);
+		}
 		goto discard;
 	}
@@ -6044,9 +6052,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 			break;
 		}
 
-		if (tp->linger2 < 0 ||
-		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
-		     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+		if (tp->linger2 < 0) {
+			tcp_done(sk);
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+			return 1;
+		}
+		if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+		    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+			/* Receive out of order FIN after close() */
+			if (tp->syn_fastopen && th->fin)
+				tcp_fastopen_active_disable(sk);
 			tcp_done(sk);
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
 			return 1;
......
@@ -1855,6 +1855,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	/* Cleanup up the write buffer. */
 	tcp_write_queue_purge(sk);
 
+	/* Check if we want to disable active TFO */
+	tcp_fastopen_active_disable_ofo_check(sk);
+
 	/* Cleans up our, hopefully empty, out_of_order_queue. */
 	skb_rbtree_purge(&tp->out_of_order_queue);
......
@@ -201,11 +201,10 @@ static int tcp_write_timeout(struct sock *sk)
 		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) {
 			/* Some middle-boxes may black-hole Fast Open _after_
 			 * the handshake. Therefore we conservatively disable
-			 * Fast Open on this path on recurring timeouts with
-			 * few or zero bytes acked after Fast Open.
+			 * Fast Open on this path on recurring timeouts after
+			 * successful Fast Open.
 			 */
-			if (tp->syn_data_acked &&
-			    tp->bytes_acked <= tp->rx_opt.mss_clamp) {
+			if (tp->syn_data_acked) {
 				tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
 				if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
 					NET_INC_STATS(sock_net(sk),
......