Commit 13f1555c authored by David S. Miller

Merge branch 'MPTCP-improve-fallback-to-TCP'

Davide Caratti says:

====================
MPTCP: improve fallback to TCP

there are situations where MPTCP sockets should fall back to regular TCP:
this series reworks the fallback code to pursue the following goals:

1) cleanup the non fallback code, removing most of 'if (<fallback>)' in
   the data path
2) improve performance for non-fallback sockets, avoiding locks in poll()

further work will also leverage these changes to achieve:

a) more consistent behavior of getsockopt()/setsockopt() on passive sockets
   after fallback
b) support for "infinite maps" as per RFC8684, section 3.7

the series is made of the following items:

- patch 1 lets sendmsg() / recvmsg() / poll() use the main socket also
  after fallback
- patch 2 fixes 'simultaneous connect' scenario after fallback. The
  problem was present also before the rework, but the fix is much easier
  to implement after patch 1
- patches 3, 4, 5 are clean-ups for code that is no longer needed after the
  fallback rework
- patch 6 fixes a race condition between close() and poll(). The problem
  was theoretically present before the rework, but it became almost
  systematic after patch 1
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents e1170333 8a05661b
...@@ -624,6 +624,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, ...@@ -624,6 +624,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
opts->suboptions = 0; opts->suboptions = 0;
if (unlikely(mptcp_check_fallback(sk)))
return false;
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true; ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
...@@ -714,7 +717,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, ...@@ -714,7 +717,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
*/ */
if (!mp_opt->mp_capable) { if (!mp_opt->mp_capable) {
subflow->mp_capable = 0; subflow->mp_capable = 0;
tcp_sk(sk)->is_mptcp = 0; pr_fallback(msk);
__mptcp_do_fallback(msk);
return false; return false;
} }
...@@ -814,6 +818,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, ...@@ -814,6 +818,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
struct mptcp_options_received mp_opt; struct mptcp_options_received mp_opt;
struct mptcp_ext *mpext; struct mptcp_ext *mpext;
if (__mptcp_check_fallback(msk))
return;
mptcp_get_options(skb, &mp_opt); mptcp_get_options(skb, &mp_opt);
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
return; return;
......
This diff is collapsed.
...@@ -89,6 +89,7 @@ ...@@ -89,6 +89,7 @@
#define MPTCP_SEND_SPACE 1 #define MPTCP_SEND_SPACE 1
#define MPTCP_WORK_RTX 2 #define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3 #define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4
struct mptcp_options_received { struct mptcp_options_received {
u64 sndr_key; u64 sndr_key;
...@@ -457,4 +458,46 @@ static inline bool before64(__u64 seq1, __u64 seq2) ...@@ -457,4 +458,46 @@ static inline bool before64(__u64 seq1, __u64 seq2)
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
/* Report whether the MPTCP_FALLBACK_DONE bit is set in msk->flags. */
static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
{
	bool fallback_done = test_bit(MPTCP_FALLBACK_DONE, &msk->flags);

	return fallback_done;
}
static inline bool mptcp_check_fallback(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
return __mptcp_check_fallback(msk);
}
/*
 * Record that @msk has fallen back to plain TCP by setting
 * MPTCP_FALLBACK_DONE in msk->flags.
 *
 * If the bit is already set, only a debug message is emitted and the
 * flags are left untouched.
 *
 * NOTE(review): the test_bit()/set_bit() pair is not a single atomic
 * operation; two racing callers may both reach set_bit(). That looks
 * harmless because both would set the same bit - confirm against callers.
 */
static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
{
	if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
		/* terminate the message with '\n' so the log line is complete */
		pr_debug("TCP fallback already done (msk=%p)\n", msk);
		return;
	}
	set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
static inline void mptcp_do_fallback(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
__mptcp_do_fallback(msk);
}
/*
 * Debug-log a fallback to TCP for msk pointer @a, prefixed with the name
 * of the calling function. The message is newline-terminated.
 */
#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, (a))
static inline bool subflow_simultaneous_connect(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
return sk->sk_state == TCP_ESTABLISHED &&
!mptcp_sk(parent)->pm.server_side &&
!subflow->conn_finished;
}
#endif /* __MPTCP_PROTOCOL_H */ #endif /* __MPTCP_PROTOCOL_H */
...@@ -216,7 +216,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) ...@@ -216,7 +216,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_options_received mp_opt; struct mptcp_options_received mp_opt;
struct sock *parent = subflow->conn; struct sock *parent = subflow->conn;
struct tcp_sock *tp = tcp_sk(sk);
subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
...@@ -230,6 +229,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) ...@@ -230,6 +229,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
return; return;
subflow->conn_finished = 1; subflow->conn_finished = 1;
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
mptcp_get_options(skb, &mp_opt); mptcp_get_options(skb, &mp_opt);
if (subflow->request_mptcp && mp_opt.mp_capable) { if (subflow->request_mptcp && mp_opt.mp_capable) {
...@@ -245,21 +246,20 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) ...@@ -245,21 +246,20 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
subflow->thmac, subflow->remote_nonce); subflow->thmac, subflow->remote_nonce);
} else { } else {
tp->is_mptcp = 0; if (subflow->request_mptcp)
MPTCP_INC_STATS(sock_net(sk),
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
mptcp_do_fallback(sk);
pr_fallback(mptcp_sk(subflow->conn));
} }
if (!tp->is_mptcp) if (mptcp_check_fallback(sk))
return; return;
if (subflow->mp_capable) { if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
subflow->remote_key); subflow->remote_key);
mptcp_finish_connect(sk); mptcp_finish_connect(sk);
if (skb) {
pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
}
} else if (subflow->mp_join) { } else if (subflow->mp_join) {
u8 hmac[SHA256_DIGEST_SIZE]; u8 hmac[SHA256_DIGEST_SIZE];
...@@ -279,9 +279,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) ...@@ -279,9 +279,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
if (skb)
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
if (!mptcp_finish_join(sk)) if (!mptcp_finish_join(sk))
goto do_reset; goto do_reset;
...@@ -557,7 +554,8 @@ enum mapping_status { ...@@ -557,7 +554,8 @@ enum mapping_status {
MAPPING_OK, MAPPING_OK,
MAPPING_INVALID, MAPPING_INVALID,
MAPPING_EMPTY, MAPPING_EMPTY,
MAPPING_DATA_FIN MAPPING_DATA_FIN,
MAPPING_DUMMY
}; };
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
...@@ -621,6 +619,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) ...@@ -621,6 +619,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
if (!skb) if (!skb)
return MAPPING_EMPTY; return MAPPING_EMPTY;
if (mptcp_check_fallback(ssk))
return MAPPING_DUMMY;
mpext = mptcp_get_ext(skb); mpext = mptcp_get_ext(skb);
if (!mpext || !mpext->use_map) { if (!mpext || !mpext->use_map) {
if (!subflow->map_valid && !skb->len) { if (!subflow->map_valid && !skb->len) {
...@@ -762,6 +763,16 @@ static bool subflow_check_data_avail(struct sock *ssk) ...@@ -762,6 +763,16 @@ static bool subflow_check_data_avail(struct sock *ssk)
ssk->sk_err = EBADMSG; ssk->sk_err = EBADMSG;
goto fatal; goto fatal;
} }
if (status == MAPPING_DUMMY) {
__mptcp_do_fallback(msk);
skb = skb_peek(&ssk->sk_receive_queue);
subflow->map_valid = 1;
subflow->map_seq = READ_ONCE(msk->ack_seq);
subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
subflow->ssn_offset;
return true;
}
if (status != MAPPING_OK) if (status != MAPPING_OK)
return false; return false;
...@@ -885,14 +896,18 @@ static void subflow_data_ready(struct sock *sk) ...@@ -885,14 +896,18 @@ static void subflow_data_ready(struct sock *sk)
{ {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn; struct sock *parent = subflow->conn;
struct mptcp_sock *msk;
if (!subflow->mp_capable && !subflow->mp_join) { msk = mptcp_sk(parent);
subflow->tcp_data_ready(sk); if (inet_sk_state_load(sk) == TCP_LISTEN) {
set_bit(MPTCP_DATA_READY, &msk->flags);
parent->sk_data_ready(parent); parent->sk_data_ready(parent);
return; return;
} }
WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
!subflow->mp_join);
if (mptcp_subflow_data_available(sk)) if (mptcp_subflow_data_available(sk))
mptcp_data_ready(parent, sk); mptcp_data_ready(parent, sk);
} }
...@@ -1113,11 +1128,21 @@ static void subflow_state_change(struct sock *sk) ...@@ -1113,11 +1128,21 @@ static void subflow_state_change(struct sock *sk)
__subflow_state_change(sk); __subflow_state_change(sk);
if (subflow_simultaneous_connect(sk)) {
mptcp_do_fallback(sk);
pr_fallback(mptcp_sk(parent));
subflow->conn_finished = 1;
if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
inet_sk_state_store(parent, TCP_ESTABLISHED);
parent->sk_state_change(parent);
}
}
/* as recvmsg() does not acquire the subflow socket for ssk selection /* as recvmsg() does not acquire the subflow socket for ssk selection
* a fin packet carrying a DSS can be unnoticed if we don't trigger * a fin packet carrying a DSS can be unnoticed if we don't trigger
* the data available machinery here. * the data available machinery here.
*/ */
if (subflow->mp_capable && mptcp_subflow_data_available(sk)) if (mptcp_subflow_data_available(sk))
mptcp_data_ready(parent, sk); mptcp_data_ready(parent, sk);
if (!(parent->sk_shutdown & RCV_SHUTDOWN) && if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment