Commit 9d9b1ee0, authored by Enke Chen, committed by Jakub Kicinski

tcp: fix TCP_USER_TIMEOUT with zero window

The TCP session does not terminate with TCP_USER_TIMEOUT when data
remain untransmitted due to zero window.

The number of unanswered zero-window probes (tcp_probes_out) is
reset to zero with incoming acks irrespective of the window size,
as described in tcp_probe_timer():

    RFC 1122 4.2.2.17 requires the sender to stay open indefinitely
    as long as the receiver continues to respond probes. We support
    this by default and reset icsk_probes_out with incoming ACKs.

This counter, however, is the wrong one to be used in calculating the
duration that the window remains closed and data remain untransmitted.
Thanks to Jonathan Maxwell <jmaxwell37@gmail.com> for diagnosing the
actual issue.

In this patch a new timestamp is introduced for the socket in order to
track the elapsed time for the zero-window probes that have not been
answered with any non-zero window ack.

Fixes: 9721e709 ("tcp: simplify window probe aborting on USER_TIMEOUT")
Reported-by: William McCall <william.mccall@gmail.com>
Co-developed-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Enke Chen <enchen@paloaltonetworks.com>
Reviewed-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20210115223058.GA39267@localhost.localdomain
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent b889c7c8
...@@ -76,6 +76,8 @@ struct inet_connection_sock_af_ops { ...@@ -76,6 +76,8 @@ struct inet_connection_sock_af_ops {
* @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options)
* @icsk_ack: Delayed ACK control data * @icsk_ack: Delayed ACK control data
* @icsk_mtup; MTU probing control data * @icsk_mtup; MTU probing control data
* @icsk_probes_tstamp: Probe timestamp (cleared by non-zero window ack)
* @icsk_user_timeout: TCP_USER_TIMEOUT value
*/ */
struct inet_connection_sock { struct inet_connection_sock {
/* inet_sock has to be the first member! */ /* inet_sock has to be the first member! */
...@@ -129,6 +131,7 @@ struct inet_connection_sock { ...@@ -129,6 +131,7 @@ struct inet_connection_sock {
u32 probe_timestamp; u32 probe_timestamp;
} icsk_mtup; } icsk_mtup;
u32 icsk_probes_tstamp;
u32 icsk_user_timeout; u32 icsk_user_timeout;
u64 icsk_ca_priv[104 / sizeof(u64)]; u64 icsk_ca_priv[104 / sizeof(u64)];
......
...@@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, ...@@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
newicsk->icsk_retransmits = 0; newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0; newicsk->icsk_backoff = 0;
newicsk->icsk_probes_out = 0; newicsk->icsk_probes_out = 0;
newicsk->icsk_probes_tstamp = 0;
/* Deinitialize accept_queue to trap illegal accesses. */ /* Deinitialize accept_queue to trap illegal accesses. */
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
......
...@@ -2937,6 +2937,7 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -2937,6 +2937,7 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_backoff = 0; icsk->icsk_backoff = 0;
icsk->icsk_probes_out = 0; icsk->icsk_probes_out = 0;
icsk->icsk_probes_tstamp = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto = TCP_TIMEOUT_INIT;
icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_rto_min = TCP_RTO_MIN;
icsk->icsk_delack_max = TCP_DELACK_MAX; icsk->icsk_delack_max = TCP_DELACK_MAX;
......
...@@ -3384,6 +3384,7 @@ static void tcp_ack_probe(struct sock *sk) ...@@ -3384,6 +3384,7 @@ static void tcp_ack_probe(struct sock *sk)
return; return;
if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0; icsk->icsk_backoff = 0;
icsk->icsk_probes_tstamp = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check(). /* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using! * This function is not for random using!
......
...@@ -4084,6 +4084,7 @@ void tcp_send_probe0(struct sock *sk) ...@@ -4084,6 +4084,7 @@ void tcp_send_probe0(struct sock *sk)
/* Cancel probe timer, if it is not required. */ /* Cancel probe timer, if it is not required. */
icsk->icsk_probes_out = 0; icsk->icsk_probes_out = 0;
icsk->icsk_backoff = 0; icsk->icsk_backoff = 0;
icsk->icsk_probes_tstamp = 0;
return; return;
} }
......
...@@ -349,6 +349,7 @@ static void tcp_probe_timer(struct sock *sk) ...@@ -349,6 +349,7 @@ static void tcp_probe_timer(struct sock *sk)
if (tp->packets_out || !skb) { if (tp->packets_out || !skb) {
icsk->icsk_probes_out = 0; icsk->icsk_probes_out = 0;
icsk->icsk_probes_tstamp = 0;
return; return;
} }
...@@ -360,13 +361,12 @@ static void tcp_probe_timer(struct sock *sk) ...@@ -360,13 +361,12 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when * corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer(). * we use RTO to probe window in tcp_retransmit_timer().
*/ */
if (icsk->icsk_user_timeout) { if (!icsk->icsk_probes_tstamp)
u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, icsk->icsk_probes_tstamp = tcp_jiffies32;
tcp_probe0_base(sk)); else if (icsk->icsk_user_timeout &&
(s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
if (elapsed >= icsk->icsk_user_timeout) msecs_to_jiffies(icsk->icsk_user_timeout))
goto abort; goto abort;
}
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) { if (sock_flag(sk, SOCK_DEAD)) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment