Commit 43b54c6e authored by Mat Martineau's avatar Mat Martineau Committed by David S. Miller

mptcp: Use full MPTCP-level disconnect state machine

RFC 8684 appendix D describes the connection state machine for
MPTCP. This patch implements the DATA_FIN / DATA_ACK exchanges and
MPTCP-level socket state changes described in that appendix, rather than
simply sending DATA_FIN along with TCP FIN when disconnecting subflows.

DATA_FIN is now sent and acknowledged before shutting down the
subflows. Received DATA_FIN information (if not part of a data packet)
is written to the MPTCP socket when the incoming DSS option is parsed by
the subflow, and the MPTCP worker is scheduled to process the
flag. DATA_FIN received as part of a full DSS mapping will be handled
when the mapping is processed.

The DATA_FIN is acknowledged by the worker if the reader is caught
up. If there is still data to be moved to the MPTCP-level queue, ack_seq
will be incremented to account for the DATA_FIN when it reaches the end
of the stream and a DATA_ACK will be sent to the peer.
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 16a9a9da
...@@ -868,6 +868,17 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, ...@@ -868,6 +868,17 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
if (mp_opt.use_ack) if (mp_opt.use_ack)
update_una(msk, &mp_opt); update_una(msk, &mp_opt);
/* Zero-length packets, like bare ACKs carrying a DATA_FIN, are
* dropped by the caller and not propagated to the MPTCP layer.
* Copy the DATA_FIN information now.
*/
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
if (mp_opt.data_fin && mp_opt.data_len == 1 &&
mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) &&
schedule_work(&msk->work))
sock_hold(subflow->conn);
}
mpext = skb_ext_add(skb, SKB_EXT_MPTCP); mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext) if (!mpext)
return; return;
......
...@@ -381,6 +381,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, ...@@ -381,6 +381,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
*bytes = moved; *bytes = moved;
/* If the moves have caught up with the DATA_FIN sequence number
* it's time to ack the DATA_FIN and change socket state, but
* this is not a good place to change state. Let the workqueue
* do it.
*/
if (mptcp_pending_data_fin(sk, NULL) &&
schedule_work(&msk->work))
sock_hold(sk);
return done; return done;
} }
...@@ -466,7 +475,8 @@ void mptcp_data_acked(struct sock *sk) ...@@ -466,7 +475,8 @@ void mptcp_data_acked(struct sock *sk)
{ {
mptcp_reset_timer(sk); mptcp_reset_timer(sk);
if (!sk_stream_is_writeable(sk) && if ((!sk_stream_is_writeable(sk) ||
(inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
schedule_work(&mptcp_sk(sk)->work)) schedule_work(&mptcp_sk(sk)->work))
sock_hold(sk); sock_hold(sk);
} }
...@@ -1384,6 +1394,7 @@ static void mptcp_worker(struct work_struct *work) ...@@ -1384,6 +1394,7 @@ static void mptcp_worker(struct work_struct *work)
lock_sock(sk); lock_sock(sk);
mptcp_clean_una(sk); mptcp_clean_una(sk);
mptcp_check_data_fin_ack(sk);
__mptcp_flush_join_list(msk); __mptcp_flush_join_list(msk);
__mptcp_move_skbs(msk); __mptcp_move_skbs(msk);
...@@ -1393,6 +1404,8 @@ static void mptcp_worker(struct work_struct *work) ...@@ -1393,6 +1404,8 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk); mptcp_check_for_eof(msk);
mptcp_check_data_fin(sk);
if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
goto unlock; goto unlock;
...@@ -1515,7 +1528,7 @@ static void mptcp_cancel_work(struct sock *sk) ...@@ -1515,7 +1528,7 @@ static void mptcp_cancel_work(struct sock *sk)
sock_put(sk); sock_put(sk);
} }
static void mptcp_subflow_shutdown(struct sock *ssk, int how) static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{ {
lock_sock(ssk); lock_sock(ssk);
...@@ -1528,8 +1541,15 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how) ...@@ -1528,8 +1541,15 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how)
tcp_disconnect(ssk, O_NONBLOCK); tcp_disconnect(ssk, O_NONBLOCK);
break; break;
default: default:
if (__mptcp_check_fallback(mptcp_sk(sk))) {
pr_debug("Fallback");
ssk->sk_shutdown |= how; ssk->sk_shutdown |= how;
tcp_shutdown(ssk, how); tcp_shutdown(ssk, how);
} else {
pr_debug("Sending DATA_FIN on subflow %p", ssk);
mptcp_set_timeout(sk, ssk);
tcp_send_ack(ssk);
}
break; break;
} }
...@@ -1570,9 +1590,35 @@ static void mptcp_close(struct sock *sk, long timeout) ...@@ -1570,9 +1590,35 @@ static void mptcp_close(struct sock *sk, long timeout)
LIST_HEAD(conn_list); LIST_HEAD(conn_list);
lock_sock(sk); lock_sock(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
if (sk->sk_state == TCP_LISTEN) {
inet_sk_state_store(sk, TCP_CLOSE); inet_sk_state_store(sk, TCP_CLOSE);
goto cleanup;
} else if (sk->sk_state == TCP_CLOSE) {
goto cleanup;
}
if (__mptcp_check_fallback(msk)) {
goto update_state;
} else if (mptcp_close_state(sk)) {
pr_debug("Sending DATA_FIN sk=%p", sk);
WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
WRITE_ONCE(msk->snd_data_fin_enable, 1);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK);
}
}
sk_stream_wait_close(sk, timeout);
update_state:
inet_sk_state_store(sk, TCP_CLOSE);
cleanup:
/* be sure to always acquire the join list lock, to sync vs /* be sure to always acquire the join list lock, to sync vs
* mptcp_finish_join(). * mptcp_finish_join().
*/ */
...@@ -1581,8 +1627,6 @@ static void mptcp_close(struct sock *sk, long timeout) ...@@ -1581,8 +1627,6 @@ static void mptcp_close(struct sock *sk, long timeout)
spin_unlock_bh(&msk->join_list_lock); spin_unlock_bh(&msk->join_list_lock);
list_splice_init(&msk->conn_list, &conn_list); list_splice_init(&msk->conn_list, &conn_list);
msk->snd_data_fin_enable = 1;
__mptcp_clear_xmit(sk); __mptcp_clear_xmit(sk);
release_sock(sk); release_sock(sk);
...@@ -2265,11 +2309,8 @@ static int mptcp_shutdown(struct socket *sock, int how) ...@@ -2265,11 +2309,8 @@ static int mptcp_shutdown(struct socket *sock, int how)
pr_debug("sk=%p, how=%d", msk, how); pr_debug("sk=%p, how=%d", msk, how);
lock_sock(sock->sk); lock_sock(sock->sk);
if (how == SHUT_WR || how == SHUT_RDWR)
inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
how++; how++;
if ((how & ~SHUTDOWN_MASK) || !how) { if ((how & ~SHUTDOWN_MASK) || !how) {
ret = -EINVAL; ret = -EINVAL;
goto out_unlock; goto out_unlock;
...@@ -2283,13 +2324,31 @@ static int mptcp_shutdown(struct socket *sock, int how) ...@@ -2283,13 +2324,31 @@ static int mptcp_shutdown(struct socket *sock, int how)
sock->state = SS_CONNECTED; sock->state = SS_CONNECTED;
} }
/* If we've already sent a FIN, or it's a closed state, skip this. */
if (__mptcp_check_fallback(msk)) {
if (how == SHUT_WR || how == SHUT_RDWR)
inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
}
} else if ((how & SEND_SHUTDOWN) &&
((1 << sock->sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_SENT |
TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) &&
mptcp_close_state(sock->sk)) {
__mptcp_flush_join_list(msk); __mptcp_flush_join_list(msk);
msk->snd_data_fin_enable = 1;
WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
WRITE_ONCE(msk->snd_data_fin_enable, 1);
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
mptcp_subflow_shutdown(tcp_sk, how); mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
}
} }
/* Wake up anyone sleeping in poll. */ /* Wake up anyone sleeping in poll. */
......
...@@ -598,7 +598,8 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) ...@@ -598,7 +598,8 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
return true; return true;
} }
static enum mapping_status get_mapping_status(struct sock *ssk) static enum mapping_status get_mapping_status(struct sock *ssk,
struct mptcp_sock *msk)
{ {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_ext *mpext; struct mptcp_ext *mpext;
...@@ -648,7 +649,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk) ...@@ -648,7 +649,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
if (mpext->data_fin == 1) { if (mpext->data_fin == 1) {
if (data_len == 1) { if (data_len == 1) {
pr_debug("DATA_FIN with no payload"); mptcp_update_rcv_data_fin(msk, mpext->data_seq);
pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
if (subflow->map_valid) { if (subflow->map_valid) {
/* A DATA_FIN might arrive in a DSS /* A DATA_FIN might arrive in a DSS
* option before the previous mapping * option before the previous mapping
...@@ -660,6 +662,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) ...@@ -660,6 +662,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
} else { } else {
return MAPPING_DATA_FIN; return MAPPING_DATA_FIN;
} }
} else {
mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len);
pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len);
} }
/* Adjust for DATA_FIN using 1 byte of sequence space */ /* Adjust for DATA_FIN using 1 byte of sequence space */
...@@ -748,7 +753,7 @@ static bool subflow_check_data_avail(struct sock *ssk) ...@@ -748,7 +753,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
u64 ack_seq; u64 ack_seq;
u64 old_ack; u64 old_ack;
status = get_mapping_status(ssk); status = get_mapping_status(ssk, msk);
pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
if (status == MAPPING_INVALID) { if (status == MAPPING_INVALID) {
ssk->sk_err = EBADMSG; ssk->sk_err = EBADMSG;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment