Commit a3c8c7f4 authored by David S. Miller

Merge branch 'mptcp-non-backup-subflows-pre-reqs'

Paolo Abeni says:

====================
mptcp: non backup subflows pre-reqs

This series contains a bunch of MPTCP improvements loosely related to
concurrent subflows xmit usage, currently under development.

The first 3 patches are actually bugfixes for issues that will become apparent
as soon as we enable the above feature.

The later patches improve the handling of incoming additional subflows,
significantly improving performance in stress tests based on a high rate of
new connections.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 205a55f4 4cf8b7e4
...@@ -709,6 +709,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, ...@@ -709,6 +709,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
* additional ack. * additional ack.
*/ */
subflow->fully_established = 1; subflow->fully_established = 1;
WRITE_ONCE(msk->fully_established, true);
goto fully_established; goto fully_established;
} }
...@@ -724,9 +725,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, ...@@ -724,9 +725,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
if (unlikely(!READ_ONCE(msk->pm.server_side))) if (unlikely(!READ_ONCE(msk->pm.server_side)))
pr_warn_once("bogus mpc option on established client sk"); pr_warn_once("bogus mpc option on established client sk");
subflow->fully_established = 1; mptcp_subflow_fully_established(subflow, mp_opt);
subflow->remote_key = mp_opt->sndr_key;
subflow->can_ack = 1;
fully_established: fully_established:
if (likely(subflow->pm_notified)) if (likely(subflow->pm_notified))
......
...@@ -460,15 +460,20 @@ static void mptcp_clean_una(struct sock *sk) ...@@ -460,15 +460,20 @@ static void mptcp_clean_una(struct sock *sk)
dfrag = mptcp_rtx_head(sk); dfrag = mptcp_rtx_head(sk);
if (dfrag && after64(snd_una, dfrag->data_seq)) { if (dfrag && after64(snd_una, dfrag->data_seq)) {
u64 delta = dfrag->data_seq + dfrag->data_len - snd_una; u64 delta = snd_una - dfrag->data_seq;
if (WARN_ON_ONCE(delta > dfrag->data_len))
goto out;
dfrag->data_seq += delta; dfrag->data_seq += delta;
dfrag->offset += delta;
dfrag->data_len -= delta; dfrag->data_len -= delta;
dfrag_uncharge(sk, delta); dfrag_uncharge(sk, delta);
cleaned = true; cleaned = true;
} }
out:
if (cleaned) { if (cleaned) {
sk_mem_reclaim_partial(sk); sk_mem_reclaim_partial(sk);
...@@ -1517,6 +1522,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, ...@@ -1517,6 +1522,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->local_key = subflow_req->local_key; msk->local_key = subflow_req->local_key;
msk->token = subflow_req->token; msk->token = subflow_req->token;
msk->subflow = NULL; msk->subflow = NULL;
WRITE_ONCE(msk->fully_established, false);
msk->write_seq = subflow_req->idsn + 1; msk->write_seq = subflow_req->idsn + 1;
atomic64_set(&msk->snd_una, msk->write_seq); atomic64_set(&msk->snd_una, msk->write_seq);
...@@ -1600,7 +1606,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, ...@@ -1600,7 +1606,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
newsk = new_mptcp_sock; newsk = new_mptcp_sock;
mptcp_copy_inaddrs(newsk, ssk); mptcp_copy_inaddrs(newsk, ssk);
list_add(&subflow->node, &msk->conn_list); list_add(&subflow->node, &msk->conn_list);
inet_sk_state_store(newsk, TCP_ESTABLISHED);
mptcp_rcv_space_init(msk, ssk); mptcp_rcv_space_init(msk, ssk);
bh_unlock_sock(new_mptcp_sock); bh_unlock_sock(new_mptcp_sock);
...@@ -1814,7 +1819,6 @@ void mptcp_finish_connect(struct sock *ssk) ...@@ -1814,7 +1819,6 @@ void mptcp_finish_connect(struct sock *ssk)
ack_seq++; ack_seq++;
subflow->map_seq = ack_seq; subflow->map_seq = ack_seq;
subflow->map_subflow_seq = 1; subflow->map_subflow_seq = 1;
subflow->rel_write_seq = 1;
/* the socket is not connected yet, no msk/subflow ops can access/race /* the socket is not connected yet, no msk/subflow ops can access/race
* accessing the field below * accessing the field below
...@@ -1851,7 +1855,7 @@ bool mptcp_finish_join(struct sock *sk) ...@@ -1851,7 +1855,7 @@ bool mptcp_finish_join(struct sock *sk)
pr_debug("msk=%p, subflow=%p", msk, subflow); pr_debug("msk=%p, subflow=%p", msk, subflow);
/* mptcp socket already closing? */ /* mptcp socket already closing? */
if (inet_sk_state_load(parent) != TCP_ESTABLISHED) if (!mptcp_is_fully_established(parent))
return false; return false;
if (!msk->pm.server_side) if (!msk->pm.server_side)
...@@ -1940,6 +1944,13 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ...@@ -1940,6 +1944,13 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return err; return err;
} }
/* Give up on MPTCP for a subflow before the connect is issued.
 *
 * Clears the subflow's request_mptcp flag and then invokes
 * __mptcp_do_fallback() on the owning msk. The visible callers use this
 * from mptcp_stream_connect() when an MD5 signature key is configured on
 * the subflow socket or when mptcp_token_new_connect() reports failure.
 */
static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
					 struct mptcp_subflow_context *subflow)
{
	/* stop requesting MP_CAPABLE on this subflow ... */
	subflow->request_mptcp = 0;

	/* ... and record the fallback at the msk level */
	__mptcp_do_fallback(msk);
}
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags) int addr_len, int flags)
{ {
...@@ -1971,10 +1982,10 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, ...@@ -1971,10 +1982,10 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
* TCP option space. * TCP option space.
*/ */
if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
subflow->request_mptcp = 0; mptcp_subflow_early_fallback(msk, subflow);
#endif #endif
if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
subflow->request_mptcp = 0; mptcp_subflow_early_fallback(msk, subflow);
do_connect: do_connect:
err = ssock->ops->connect(ssock, uaddr, addr_len, flags); err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
......
...@@ -198,6 +198,7 @@ struct mptcp_sock { ...@@ -198,6 +198,7 @@ struct mptcp_sock {
u32 token; u32 token;
unsigned long flags; unsigned long flags;
bool can_ack; bool can_ack;
bool fully_established;
spinlock_t join_list_lock; spinlock_t join_list_lock;
struct work_struct work; struct work_struct work;
struct list_head conn_list; struct list_head conn_list;
...@@ -342,6 +343,8 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) ...@@ -342,6 +343,8 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
} }
int mptcp_is_enabled(struct net *net); int mptcp_is_enabled(struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool mptcp_subflow_data_available(struct sock *sk); bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void); void __init mptcp_subflow_init(void);
...@@ -373,6 +376,11 @@ void mptcp_get_options(const struct sk_buff *skb, ...@@ -373,6 +376,11 @@ void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt); struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk); void mptcp_finish_connect(struct sock *sk);
/* Tell whether the MPTCP socket @sk is fully established.
 *
 * Returns true only when the socket state obtained through
 * inet_sk_state_load() is TCP_ESTABLISHED and the msk-level
 * fully_established flag reads as set; false otherwise.
 */
static inline bool mptcp_is_fully_established(struct sock *sk)
{
	/* any state other than ESTABLISHED rules the socket out right away */
	if (inet_sk_state_load(sk) != TCP_ESTABLISHED)
		return false;

	/* the flag is read with READ_ONCE(); writers in this change use WRITE_ONCE() */
	return READ_ONCE(mptcp_sk(sk)->fully_established);
}
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk); bool mptcp_finish_join(struct sock *sk);
......
...@@ -53,6 +53,12 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, ...@@ -53,6 +53,12 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
} }
/* Check whether @msk may take an additional subflow.
 *
 * Returns true when the msk passes mptcp_is_fully_established() and the
 * path manager's accept_subflow flag reads as set; false otherwise.
 */
static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	/* NOTE(review): the cast drops const and hands the msk to a helper
	 * taking a struct sock pointer - confirm this matches the layout
	 * assumptions made elsewhere in the project.
	 */
	if (!mptcp_is_fully_established((void *)msk))
		return false;

	return READ_ONCE(msk->pm.accept_subflow);
}
/* validate received token and create truncated hmac and nonce for SYN-ACK */ /* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
const struct sk_buff *skb) const struct sk_buff *skb)
...@@ -200,49 +206,40 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) ...@@ -200,49 +206,40 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (subflow->conn_finished) if (subflow->conn_finished)
return; return;
subflow->rel_write_seq = 1;
subflow->conn_finished = 1; subflow->conn_finished = 1;
subflow->ssn_offset = TCP_SKB_CB(skb)->seq; subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
mptcp_get_options(skb, &mp_opt); mptcp_get_options(skb, &mp_opt);
if (subflow->request_mptcp && mp_opt.mp_capable) { if (subflow->request_mptcp) {
if (!mp_opt.mp_capable) {
MPTCP_INC_STATS(sock_net(sk),
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
mptcp_do_fallback(sk);
pr_fallback(mptcp_sk(subflow->conn));
goto fallback;
}
subflow->mp_capable = 1; subflow->mp_capable = 1;
subflow->can_ack = 1; subflow->can_ack = 1;
subflow->remote_key = mp_opt.sndr_key; subflow->remote_key = mp_opt.sndr_key;
pr_debug("subflow=%p, remote_key=%llu", subflow, pr_debug("subflow=%p, remote_key=%llu", subflow,
subflow->remote_key); subflow->remote_key);
} else if (subflow->request_join && mp_opt.mp_join) { mptcp_finish_connect(sk);
subflow->mp_join = 1; } else if (subflow->request_join) {
u8 hmac[SHA256_DIGEST_SIZE];
if (!mp_opt.mp_join)
goto do_reset;
subflow->thmac = mp_opt.thmac; subflow->thmac = mp_opt.thmac;
subflow->remote_nonce = mp_opt.nonce; subflow->remote_nonce = mp_opt.nonce;
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
subflow->thmac, subflow->remote_nonce); subflow->thmac, subflow->remote_nonce);
} else {
if (subflow->request_mptcp)
MPTCP_INC_STATS(sock_net(sk),
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
mptcp_do_fallback(sk);
pr_fallback(mptcp_sk(subflow->conn));
}
if (mptcp_check_fallback(sk)) {
mptcp_rcv_space_init(mptcp_sk(parent), sk);
return;
}
if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
subflow->remote_key);
mptcp_finish_connect(sk);
} else if (subflow->mp_join) {
u8 hmac[SHA256_DIGEST_SIZE];
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
subflow, subflow->thmac,
subflow->remote_nonce);
if (!subflow_thmac_valid(subflow)) { if (!subflow_thmac_valid(subflow)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
subflow->mp_join = 0;
goto do_reset; goto do_reset;
} }
...@@ -250,18 +247,22 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) ...@@ -250,18 +247,22 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->local_nonce, subflow->local_nonce,
subflow->remote_nonce, subflow->remote_nonce,
hmac); hmac);
memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
if (!mptcp_finish_join(sk)) if (!mptcp_finish_join(sk))
goto do_reset; goto do_reset;
subflow->mp_join = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
} else { } else if (mptcp_check_fallback(sk)) {
do_reset: fallback:
tcp_send_active_reset(sk, GFP_ATOMIC); mptcp_rcv_space_init(mptcp_sk(parent), sk);
tcp_done(sk);
} }
return;
do_reset:
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_done(sk);
} }
static struct request_sock_ops subflow_request_sock_ops; static struct request_sock_ops subflow_request_sock_ops;
...@@ -386,6 +387,17 @@ static void subflow_drop_ctx(struct sock *ssk) ...@@ -386,6 +387,17 @@ static void subflow_drop_ctx(struct sock *ssk)
kfree_rcu(ctx, rcu); kfree_rcu(ctx, rcu);
} }
/* Mark @subflow, and the msk that owns it, as fully established.
 *
 * @subflow: subflow context to update
 * @mp_opt:  received MPTCP options; sndr_key is stored as the subflow's
 *           remote key
 *
 * Besides storing the remote key, sets the subflow's fully_established
 * and can_ack flags and then sets fully_established on the msk reached
 * through subflow->conn.
 */
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	/* per-subflow state first */
	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;

	/* then publish the msk-level flag with WRITE_ONCE() */
	WRITE_ONCE(mptcp_sk(subflow->conn)->fully_established, true);
}
static struct sock *subflow_syn_recv_sock(const struct sock *sk, static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb, struct sk_buff *skb,
struct request_sock *req, struct request_sock *req,
...@@ -409,7 +421,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -409,7 +421,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
/* hopefully temporary handling for MP_JOIN+syncookie */ /* hopefully temporary handling for MP_JOIN+syncookie */
subflow_req = mptcp_subflow_rsk(req); subflow_req = mptcp_subflow_rsk(req);
fallback_is_fatal = subflow_req->mp_join; fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
fallback = !tcp_rsk(req)->is_mptcp; fallback = !tcp_rsk(req)->is_mptcp;
if (fallback) if (fallback)
goto create_child; goto create_child;
...@@ -437,6 +449,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -437,6 +449,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
} else if (subflow_req->mp_join) { } else if (subflow_req->mp_join) {
mptcp_get_options(skb, &mp_opt); mptcp_get_options(skb, &mp_opt);
if (!mp_opt.mp_join || if (!mp_opt.mp_join ||
!mptcp_can_accept_new_subflow(subflow_req->msk) ||
!subflow_hmac_valid(req, &mp_opt)) { !subflow_hmac_valid(req, &mp_opt)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
fallback = true; fallback = true;
...@@ -465,6 +478,11 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -465,6 +478,11 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
} }
if (ctx->mp_capable) { if (ctx->mp_capable) {
/* this can't race with mptcp_close(), as the msk is
* not yet exposed to user-space
*/
inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
/* new mpc subflow takes ownership of the newly /* new mpc subflow takes ownership of the newly
* created mptcp socket * created mptcp socket
*/ */
...@@ -477,9 +495,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -477,9 +495,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
/* with OoO packets we can reach here without ingress /* with OoO packets we can reach here without ingress
* mpc option * mpc option
*/ */
ctx->remote_key = mp_opt.sndr_key; if (mp_opt.mp_capable)
ctx->fully_established = mp_opt.mp_capable; mptcp_subflow_fully_established(ctx, &mp_opt);
ctx->can_ack = mp_opt.mp_capable;
} else if (ctx->mp_join) { } else if (ctx->mp_join) {
struct mptcp_sock *owner; struct mptcp_sock *owner;
...@@ -514,9 +531,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -514,9 +531,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
dispose_child: dispose_child:
subflow_drop_ctx(child); subflow_drop_ctx(child);
tcp_rsk(req)->drop_req = true; tcp_rsk(req)->drop_req = true;
tcp_send_active_reset(child, GFP_ATOMIC);
inet_csk_prepare_for_destroy_sock(child); inet_csk_prepare_for_destroy_sock(child);
tcp_done(child); tcp_done(child);
req->rsk_ops->send_reset(sk, skb);
/* The last child reference will be released by the caller */ /* The last child reference will be released by the caller */
return child; return child;
...@@ -966,7 +983,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, ...@@ -966,7 +983,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
int addrlen; int addrlen;
int err; int err;
if (sk->sk_state != TCP_ESTABLISHED) if (!mptcp_is_fully_established(sk))
return -ENOTCONN; return -ENOTCONN;
err = mptcp_subflow_create_socket(sk, &sf); err = mptcp_subflow_create_socket(sk, &sf);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment