Commit 949dfdcf authored by Jakub Kicinski

Merge branch 'mptcp-improve-mptcp-level-window-tracking'

Mat Martineau says:

====================
mptcp: Improve MPTCP-level window tracking

This series improves MPTCP receive window compliance with RFC 8684 and
helps increase throughput on high-speed links. Note that patch 3 makes a
change in tcp_output.c.

For the details, Paolo says:

I've been chasing bad/unstable performance with multiple subflows
on very high speed links.

It looks like the root cause lies in the current mptcp-level
receive window handling. There are apparently a few different
sub-issues:

- the rcv_wnd is not effectively shared on the tx side, as each
  subflow takes into account only the value received by the underlying
  TCP connection. This is addressed in patch 1/5 (see the first
  sketch after this list)

- The mptcp-level offered wnd right edge is currently allowed to shrink.
  Reading RFC 8684 section 3.3.4:

"""
   The receive window is relative to the DATA_ACK.  As in TCP, a
   receiver MUST NOT shrink the right edge of the receive window (i.e.,
   DATA_ACK + receive window).  The receiver will use the data sequence
   number to tell if a packet should be accepted at the connection
   level.
"""

I read the above as meaning that we need to reflect window right-edge
tracking on the wire; see patch 4/5.

- The offered window right-edge tracking can happen concurrently on
  multiple subflows, but there is no mutex protection, so an
  additional atomic operation is needed - still patch 4/5 (see the
  second sketch after this list)
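
To make the first point concrete, below is a minimal userspace sketch of
the tx-side sharing idea. The clamping arithmetic mirrors
mptcp_check_allowed_size() from the diff further down; the subflow
struct, main() and the numbers are illustrative only, not kernel code:

#include <stdint.h>
#include <stdio.h>

/* Userspace model of patch 1/5. The amount of data a subflow may send
 * is clamped to the MPTCP-level window, and the subflow's own send
 * window is raised to the connection-level one when it lags behind,
 * so a stale per-subflow window no longer throttles the connection.
 */
struct subflow {
	uint32_t snd_wnd;	/* window announced by the peer on this subflow */
};

static int check_allowed_size(uint64_t wnd_end, uint64_t data_seq,
			      struct subflow *ssk, int avail_size)
{
	uint64_t mptcp_snd_wnd = wnd_end - data_seq;

	if ((uint64_t)avail_size > mptcp_snd_wnd)
		avail_size = (int)mptcp_snd_wnd;

	/* share the wider MPTCP-level window with a lagging subflow */
	if (ssk->snd_wnd < mptcp_snd_wnd)
		ssk->snd_wnd = mptcp_snd_wnd > UINT32_MAX ?
			       UINT32_MAX : (uint32_t)mptcp_snd_wnd;

	return avail_size;
}

int main(void)
{
	struct subflow ssk = { .snd_wnd = 1000 };	/* stale, tiny window */

	/* connection-level window extends 64 KiB past the next data seq */
	int copy = check_allowed_size(65536, 0, &ssk, 32768);

	printf("can send %d bytes, subflow snd_wnd raised to %u\n",
	       copy, ssk.snd_wnd);	/* 32768 bytes, snd_wnd 65536 */
	return 0;
}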
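For the right-edge handling (second and third points), here is a minimal
userspace model of the lock-free update from patch 4/5, with C11
stdatomic standing in for the kernel's atomic64_cmpxchg() and plain >
standing in for the wraparound-safe after64()/before64(): the advertised
right edge only ever moves forward, and a writer that loses the race
adopts the larger edge rather than shrinking it.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Shared, connection-level right edge of the offered window
 * (msk->rcv_wnd_sent in the patch).
 */
static _Atomic uint64_t rcv_wnd_sent;

/* Model of the cmpxchg loop in mptcp_set_rwin(): try to advance the
 * right edge to rcv_wnd_new; if another subflow advanced it further
 * in the meantime, keep the larger value instead of shrinking it.
 * Returns the edge that ends up being advertised on the wire.
 */
static uint64_t update_right_edge(uint64_t rcv_wnd_new)
{
	uint64_t old = atomic_load(&rcv_wnd_sent);

	while (rcv_wnd_new > old) {
		/* on success we are done; on failure 'old' is
		 * refreshed to the edge a concurrent writer installed
		 */
		if (atomic_compare_exchange_weak(&rcv_wnd_sent, &old,
						 rcv_wnd_new))
			return rcv_wnd_new;
	}
	/* a concurrent update won with a larger edge: advertise that */
	return old;
}

int main(void)
{
	atomic_store(&rcv_wnd_sent, 1000);
	printf("advertised edge: %llu\n",
	       (unsigned long long)update_right_edge(2000));	/* 2000 */
	printf("advertised edge: %llu\n",
	       (unsigned long long)update_right_edge(1500));	/* still 2000 */
	return 0;
}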

This series additionally adds a few new MIBs to track all the above
(to ensure/observe that the suspected races actually take place).
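(Like the existing MPTCP counters, the new ones show up on the MPTcpExt
line of /proc/net/netstat, e.g. via nstat, so the races can be observed
while reproducing the issue.)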

I could not regain access to the host where the issue was most
noticeable, but in the current setup the throughput changes from an
unstable 6-18 Gbps to a very stable 19 Gbps.
====================

Link: https://lore.kernel.org/r/20220504215408.349318-1-mathew.j.martineau@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 10b4a11f 38acb626
@@ -125,7 +125,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 			       struct mptcp_out_options *opts);
 bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
-void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
+void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
 			 struct mptcp_out_options *opts);
 void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info);
...
@@ -445,12 +445,13 @@ struct tcp_out_options {
 	struct mptcp_out_options mptcp;
 };

-static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
+static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
+				struct tcp_sock *tp,
 				struct tcp_out_options *opts)
 {
 #if IS_ENABLED(CONFIG_MPTCP)
 	if (unlikely(OPTION_MPTCP & opts->options))
-		mptcp_write_options(ptr, tp, &opts->mptcp);
+		mptcp_write_options(th, ptr, tp, &opts->mptcp);
 #endif
 }

@@ -606,9 +607,10 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
  * At least SACK_PERM as the first option is known to lead to a disaster
  * (but it may well be that other scenarios fail similarly).
  */
-static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
 			      struct tcp_out_options *opts)
 {
+	__be32 *ptr = (__be32 *)(th + 1);
 	u16 options = opts->options;	/* mungable copy */

 	if (unlikely(OPTION_MD5 & options)) {
@@ -702,7 +704,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 	smc_options_write(ptr, &options);

-	mptcp_options_write(ptr, tp, opts);
+	mptcp_options_write(th, ptr, tp, opts);
 }

 static void smc_set_option(const struct tcp_sock *tp,
@@ -1355,7 +1357,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		th->window	= htons(min(tp->rcv_wnd, 65535U));
 	}

-	tcp_options_write((__be32 *)(th + 1), tp, &opts);
+	tcp_options_write(th, tp, &opts);

 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
@@ -3591,7 +3593,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
+	tcp_options_write(th, NULL, &opts);
 	th->doff = (tcp_header_size >> 2);
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
...
@@ -56,6 +56,10 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
 	SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
 	SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
+	SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED),
+	SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED),
+	SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE),
+	SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT),
 	SNMP_MIB_SENTINEL
 };
...
@@ -49,6 +49,12 @@ enum linux_mptcp_mib_field {
 	MPTCP_MIB_RCVPRUNED,		/* Incoming packet dropped due to memory limit */
 	MPTCP_MIB_SUBFLOWSTALE,		/* Subflows entered 'stale' status */
 	MPTCP_MIB_SUBFLOWRECOVER,	/* Subflows returned to active status after being stale */
+	MPTCP_MIB_SNDWNDSHARED,		/* Subflow snd wnd is overridden by msk's one */
+	MPTCP_MIB_RCVWNDSHARED,		/* Subflow rcv wnd is overridden by msk's one */
+	MPTCP_MIB_RCVWNDCONFLICTUPDATE,	/* Subflow rcv wnd is overridden by msk's one due to
+					 * conflict with another subflow while updating msk rcv wnd
+					 */
+	MPTCP_MIB_RCVWNDCONFLICT,	/* Conflict while updating msk rcv wnd */
 	__MPTCP_MIB_MAX
 };
...
@@ -1224,20 +1224,62 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 	return true;
 }

-static void mptcp_set_rwin(const struct tcp_sock *tp)
+static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
 {
 	const struct sock *ssk = (const struct sock *)tp;
-	const struct mptcp_subflow_context *subflow;
+	struct mptcp_subflow_context *subflow;
+	u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
 	struct mptcp_sock *msk;
-	u64 ack_seq;
+	u32 new_win;
+	u64 win;

 	subflow = mptcp_subflow_ctx(ssk);
 	msk = mptcp_sk(subflow->conn);

-	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
+	ack_seq = READ_ONCE(msk->ack_seq);
+	rcv_wnd_new = ack_seq + tp->rcv_wnd;
+
+	rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
+	if (after64(rcv_wnd_new, rcv_wnd_old)) {
+		u64 rcv_wnd;
+
+		for (;;) {
+			rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);
+
+			if (rcv_wnd == rcv_wnd_old)
+				break;
+			if (before64(rcv_wnd_new, rcv_wnd)) {
+				MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
+				goto raise_win;
+			}
+			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
+			rcv_wnd_old = rcv_wnd;
+		}
+		return;
+	}

-	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
-		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+	if (rcv_wnd_new != rcv_wnd_old) {
+raise_win:
+		win = rcv_wnd_old - ack_seq;
+		tp->rcv_wnd = min_t(u64, win, U32_MAX);
+		new_win = tp->rcv_wnd;
+
+		/* Make sure we do not exceed the maximum possible
+		 * scaled window.
+		 */
+		if (unlikely(th->syn))
+			new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
+		if (!tp->rx_opt.rcv_wscale &&
+		    sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)
+			new_win = min(new_win, MAX_TCP_WINDOW);
+		else
+			new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+		/* RFC1323 scaling applied */
+		new_win >>= tp->rx_opt.rcv_wscale;
+		th->window = htons(new_win);
+
+		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED);
+	}
 }

 u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
@@ -1265,7 +1307,7 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
 				 ~csum_unfold(mpext->csum));
 }

-void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
+void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
 			 struct mptcp_out_options *opts)
 {
 	const struct sock *ssk = (const struct sock *)tp;
@@ -1554,7 +1596,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
 	}

 	if (tp)
-		mptcp_set_rwin(tp);
+		mptcp_set_rwin(tp, th);
 }

 __be32 mptcp_get_reset_option(const struct sk_buff *skb)
...
@@ -216,7 +216,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 	seq = MPTCP_SKB_CB(skb)->map_seq;
 	end_seq = MPTCP_SKB_CB(skb)->end_seq;
-	max_seq = READ_ONCE(msk->rcv_wnd_sent);
+	max_seq = atomic64_read(&msk->rcv_wnd_sent);

 	pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
 		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
@@ -225,7 +225,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 		mptcp_drop(sk, skb);
 		pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
 			 (unsigned long long)end_seq - (unsigned long)max_seq,
-			 (unsigned long long)msk->rcv_wnd_sent);
+			 (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
 		return;
 	}
@@ -1141,18 +1141,21 @@ struct mptcp_sendmsg_info {
 	bool data_lock_held;
 };

-static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
-				    int avail_size)
+static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
+				    u64 data_seq, int avail_size)
 {
 	u64 window_end = mptcp_wnd_end(msk);
+	u64 mptcp_snd_wnd;

 	if (__mptcp_check_fallback(msk))
 		return avail_size;

-	if (!before64(data_seq + avail_size, window_end)) {
-		u64 allowed_size = window_end - data_seq;
+	mptcp_snd_wnd = window_end - data_seq;
+	avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size);

-		return min_t(unsigned int, allowed_size, avail_size);
+	if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) {
+		tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd);
+		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED);
 	}

 	return avail_size;
@@ -1305,7 +1308,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	}

 	/* Zero window and all data acked? Probe. */
-	copy = mptcp_check_allowed_size(msk, data_seq, copy);
+	copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy);
 	if (copy == 0) {
 		u64 snd_una = READ_ONCE(msk->snd_una);
@@ -1498,11 +1501,16 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	 * to check that subflow has a non empty cwin.
 	 */
 	ssk = send_info[SSK_MODE_ACTIVE].ssk;
-	if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
+	if (!ssk || !sk_stream_memory_free(ssk))
 		return NULL;

-	burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
+	burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
 	wmem = READ_ONCE(ssk->sk_wmem_queued);
+	if (!burst) {
+		msk->last_snd = NULL;
+		return ssk;
+	}
+
 	subflow = mptcp_subflow_ctx(ssk);
 	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
 					   READ_ONCE(ssk->sk_pacing_rate) * burst,
@@ -2996,7 +3004,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
 		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
 		ack_seq++;
 		WRITE_ONCE(msk->ack_seq, ack_seq);
-		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+		atomic64_set(&msk->rcv_wnd_sent, ack_seq);
 	}

 	sock_reset_flag(nsk, SOCK_RCU_FREE);
@@ -3289,9 +3297,9 @@ void mptcp_finish_connect(struct sock *ssk)
 	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
 	WRITE_ONCE(msk->snd_nxt, msk->write_seq);
 	WRITE_ONCE(msk->ack_seq, ack_seq);
-	WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
 	WRITE_ONCE(msk->can_ack, 1);
 	WRITE_ONCE(msk->snd_una, msk->write_seq);
+	atomic64_set(&msk->rcv_wnd_sent, ack_seq);

 	mptcp_pm_new_connection(msk, ssk, 0);
...
@@ -257,7 +257,7 @@ struct mptcp_sock {
 	u64		write_seq;
 	u64		snd_nxt;
 	u64		ack_seq;
-	u64		rcv_wnd_sent;
+	atomic64_t	rcv_wnd_sent;
 	u64		rcv_data_fin_seq;
 	int		rmem_fwd_alloc;
 	struct sock	*last_snd;