Commit f3589be0, authored by Paolo Abeni, committed by Jakub Kicinski

mptcp: never shrink offered window

As per RFC, the offered MPTCP-level window should never shrink.
While we currently track the right edge, we don't enforce the
above constraint on the wire.
Additionally, concurrent xmit on different subflows can end up in
an erroneous right edge update.
Address the above by explicitly updating the announced window and
protecting the update with an additional atomic operation (sic).
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent ea66758c
...@@ -1224,20 +1224,58 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) ...@@ -1224,20 +1224,58 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
return true; return true;
} }
static void mptcp_set_rwin(const struct tcp_sock *tp) static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
{ {
const struct sock *ssk = (const struct sock *)tp; const struct sock *ssk = (const struct sock *)tp;
const struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
struct mptcp_sock *msk; struct mptcp_sock *msk;
u64 ack_seq; u32 new_win;
u64 win;
subflow = mptcp_subflow_ctx(ssk); subflow = mptcp_subflow_ctx(ssk);
msk = mptcp_sk(subflow->conn); msk = mptcp_sk(subflow->conn);
ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; ack_seq = READ_ONCE(msk->ack_seq);
rcv_wnd_new = ack_seq + tp->rcv_wnd;
rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
if (after64(rcv_wnd_new, rcv_wnd_old)) {
u64 rcv_wnd;
if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) for (;;) {
WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);
if (rcv_wnd == rcv_wnd_old)
break;
if (before64(rcv_wnd_new, rcv_wnd))
goto raise_win;
rcv_wnd_old = rcv_wnd;
}
return;
}
if (rcv_wnd_new != rcv_wnd_old) {
raise_win:
win = rcv_wnd_old - ack_seq;
tp->rcv_wnd = min_t(u64, win, U32_MAX);
new_win = tp->rcv_wnd;
/* Make sure we do not exceed the maximum possible
* scaled window.
*/
if (unlikely(th->syn))
new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
if (!tp->rx_opt.rcv_wscale &&
sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
/* RFC1323 scaling applied */
new_win >>= tp->rx_opt.rcv_wscale;
th->window = htons(new_win);
}
} }
u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
...@@ -1554,7 +1592,7 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, ...@@ -1554,7 +1592,7 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
} }
if (tp) if (tp)
mptcp_set_rwin(tp); mptcp_set_rwin(tp, th);
} }
__be32 mptcp_get_reset_option(const struct sk_buff *skb) __be32 mptcp_get_reset_option(const struct sk_buff *skb)
......
...@@ -216,7 +216,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) ...@@ -216,7 +216,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
seq = MPTCP_SKB_CB(skb)->map_seq; seq = MPTCP_SKB_CB(skb)->map_seq;
end_seq = MPTCP_SKB_CB(skb)->end_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq;
max_seq = READ_ONCE(msk->rcv_wnd_sent); max_seq = atomic64_read(&msk->rcv_wnd_sent);
pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
RB_EMPTY_ROOT(&msk->out_of_order_queue)); RB_EMPTY_ROOT(&msk->out_of_order_queue));
...@@ -225,7 +225,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) ...@@ -225,7 +225,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
mptcp_drop(sk, skb); mptcp_drop(sk, skb);
pr_debug("oow by %lld, rcv_wnd_sent %llu\n", pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
(unsigned long long)end_seq - (unsigned long)max_seq, (unsigned long long)end_seq - (unsigned long)max_seq,
(unsigned long long)msk->rcv_wnd_sent); (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
return; return;
} }
...@@ -3004,7 +3004,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, ...@@ -3004,7 +3004,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
ack_seq++; ack_seq++;
WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); atomic64_set(&msk->rcv_wnd_sent, ack_seq);
} }
sock_reset_flag(nsk, SOCK_RCU_FREE); sock_reset_flag(nsk, SOCK_RCU_FREE);
...@@ -3297,9 +3297,9 @@ void mptcp_finish_connect(struct sock *ssk) ...@@ -3297,9 +3297,9 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->snd_nxt, msk->write_seq);
WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->can_ack, 1);
WRITE_ONCE(msk->snd_una, msk->write_seq); WRITE_ONCE(msk->snd_una, msk->write_seq);
atomic64_set(&msk->rcv_wnd_sent, ack_seq);
mptcp_pm_new_connection(msk, ssk, 0); mptcp_pm_new_connection(msk, ssk, 0);
......
...@@ -257,7 +257,7 @@ struct mptcp_sock { ...@@ -257,7 +257,7 @@ struct mptcp_sock {
u64 write_seq; u64 write_seq;
u64 snd_nxt; u64 snd_nxt;
u64 ack_seq; u64 ack_seq;
u64 rcv_wnd_sent; atomic64_t rcv_wnd_sent;
u64 rcv_data_fin_seq; u64 rcv_data_fin_seq;
int rmem_fwd_alloc; int rmem_fwd_alloc;
struct sock *last_snd; struct sock *last_snd;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment