Commit ae23b55c authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'mptcp-miscellaneous-mptcp-fixes'

Mat Martineau says:

====================
mptcp: Miscellaneous MPTCP fixes

This is a collection of small fixup and minor enhancement patches that
have accumulated in the MPTCP tree while net-next was closed. These are
prerequisites for larger changes we have queued up.

Patch 1 refines receive buffer autotuning.

Patches 2 and 4 are some minor locking and refactoring changes.

Patch 3 improves GRO and RX coalescing with MPTCP skbs.

Patches 5-7 add a sysctl for tuning ADD_ADDR retransmission timeout,
corresponding test code, and documentation.

v2: Add sysctl documentation and fix signoff tags.
====================

Link: https://lore.kernel.org/r/20201103190509.27416-1-mathew.j.martineau@linux.intel.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 82728b91 8d014eaa
......@@ -70,6 +70,7 @@ Contents:
lapb-module
mac80211-injection
mpls-sysctl
mptcp-sysctl
multiqueue
netconsole
netdev-features
......
.. SPDX-License-Identifier: GPL-2.0
=====================
MPTCP Sysfs variables
=====================
/proc/sys/net/mptcp/* Variables
===============================
enabled - INTEGER
Control whether MPTCP sockets can be created.
MPTCP sockets can be created if the value is nonzero. This is
a per-namespace sysctl.
Default: 1
add_addr_timeout - INTEGER (seconds)
Set the timeout after which an ADD_ADDR control message will be
resent to an MPTCP peer that has not acknowledged a previous
ADD_ADDR message.
The default value matches TCP_RTO_MAX. This is a per-namespace
sysctl.
Default: 120
......@@ -12265,6 +12265,7 @@ L: mptcp@lists.01.org
S: Maintained
W: https://github.com/multipath-tcp/mptcp_net-next/wiki
B: https://github.com/multipath-tcp/mptcp_net-next/issues
F: Documentation/networking/mptcp-sysctl.rst
F: include/net/mptcp.h
F: include/uapi/linux/mptcp.h
F: net/mptcp/
......
......@@ -29,7 +29,8 @@ struct mptcp_ext {
use_ack:1,
ack64:1,
mpc_map:1,
__unused:2;
frozen:1,
__unused:1;
/* one byte hole */
};
......@@ -106,6 +107,19 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to,
from->active_extensions = 0;
}
static inline void mptcp_skb_ext_copy(struct sk_buff *to,
struct sk_buff *from)
{
struct mptcp_ext *from_ext;
from_ext = skb_ext_find(from, SKB_EXT_MPTCP);
if (!from_ext)
return;
from_ext->frozen = 1;
skb_ext_copy(to, from);
}
static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext,
const struct mptcp_ext *from_ext)
{
......@@ -193,6 +207,11 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to,
{
}
static inline void mptcp_skb_ext_copy(struct sk_buff *to,
struct sk_buff *from)
{
}
static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
const struct sk_buff *from)
{
......
......@@ -1569,6 +1569,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
if (!buff)
return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
mptcp_skb_ext_copy(buff, skb);
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
......@@ -2123,6 +2124,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (unlikely(!buff))
return -ENOMEM;
skb_copy_decrypted(buff, skb);
mptcp_skb_ext_copy(buff, skb);
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
......@@ -2393,6 +2395,7 @@ static int tcp_mtu_probe(struct sock *sk)
skb = tcp_send_head(sk);
skb_copy_decrypted(nskb, skb);
mptcp_skb_ext_copy(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
......
......@@ -18,6 +18,7 @@ struct mptcp_pernet {
struct ctl_table_header *ctl_table_hdr;
int mptcp_enabled;
unsigned int add_addr_timeout;
};
static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
......@@ -30,6 +31,11 @@ int mptcp_is_enabled(struct net *net)
return mptcp_get_pernet(net)->mptcp_enabled;
}
unsigned int mptcp_get_add_addr_timeout(struct net *net)
{
return mptcp_get_pernet(net)->add_addr_timeout;
}
static struct ctl_table mptcp_sysctl_table[] = {
{
.procname = "enabled",
......@@ -40,12 +46,19 @@ static struct ctl_table mptcp_sysctl_table[] = {
*/
.proc_handler = proc_dointvec,
},
{
.procname = "add_addr_timeout",
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{}
};
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX;
}
static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
......@@ -61,6 +74,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
}
table[0].data = &pernet->mptcp_enabled;
table[1].data = &pernet->add_addr_timeout;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
......
......@@ -206,6 +206,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
struct mptcp_sock *msk = entry->sock;
struct sock *sk = (struct sock *)msk;
struct net *net = sock_net(sk);
pr_debug("msk=%p", msk);
......@@ -232,7 +233,8 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
}
if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX);
sk_reset_timer(sk, timer,
jiffies + mptcp_get_add_addr_timeout(net));
spin_unlock_bh(&msk->pm.lock);
......@@ -264,6 +266,7 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
{
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
struct net *net = sock_net(sk);
if (lookup_anno_list_by_saddr(msk, &entry->addr))
return false;
......@@ -279,7 +282,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
add_entry->retrans_times = 0;
timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
sk_reset_timer(sk, &add_entry->add_timer, jiffies + TCP_RTO_MAX);
sk_reset_timer(sk, &add_entry->add_timer,
jiffies + mptcp_get_add_addr_timeout(net));
return true;
}
......
......@@ -466,6 +466,18 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct tcp_sock *tp;
u32 old_copied_seq;
bool done = false;
int sk_rbuf;
sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
if (unlikely(ssk_rbuf > sk_rbuf)) {
WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
sk_rbuf = ssk_rbuf;
}
}
pr_debug("msk=%p ssk=%p", msk, ssk);
tp = tcp_sk(ssk);
......@@ -528,7 +540,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
WRITE_ONCE(tp->copied_seq, seq);
more_data_avail = mptcp_subflow_data_available(ssk);
if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
done = true;
break;
}
......@@ -622,6 +634,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(sk);
int sk_rbuf, ssk_rbuf;
bool wake;
/* move_skbs_to_msk below can legitly clear the data_avail flag,
......@@ -632,12 +645,16 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
if (wake)
set_bit(MPTCP_DATA_READY, &msk->flags);
if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
move_skbs_to_msk(msk, ssk))
ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
if (unlikely(ssk_rbuf > sk_rbuf))
sk_rbuf = ssk_rbuf;
/* over limit? can't append more skbs to msk */
if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
goto wake;
/* don't schedule if mptcp sk is (still) over limit */
if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
if (move_skbs_to_msk(msk, ssk))
goto wake;
/* mptcp socket is owned, release_cb should retry */
......@@ -754,8 +771,11 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq,
if (!tcp_skb_can_collapse_to(skb))
return false;
/* can collapse only if MPTCP level sequence is in order */
return mpext && mpext->data_seq + mpext->data_len == write_seq;
/* can collapse only if MPTCP level sequence is in order and this
* mapping has not been xmitted yet
*/
return mpext && mpext->data_seq + mpext->data_len == write_seq &&
!mpext->frozen;
}
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
......@@ -833,19 +853,25 @@ static void mptcp_clean_una(struct sock *sk)
}
out:
if (cleaned) {
if (cleaned)
sk_mem_reclaim_partial(sk);
}
/* Only wake up writers if a subflow is ready */
if (mptcp_is_writeable(msk)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
smp_mb__after_atomic();
static void mptcp_clean_una_wakeup(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
/* set SEND_SPACE before sk_stream_write_space clears
* NOSPACE
*/
sk_stream_write_space(sk);
}
mptcp_clean_una(sk);
/* Only wake up writers if a subflow is ready */
if (mptcp_is_writeable(msk)) {
set_bit(MPTCP_SEND_SPACE, &msk->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears
* NOSPACE
*/
sk_stream_write_space(sk);
}
}
......@@ -1476,13 +1502,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
__mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
bool slowpath;
if (!ssk)
break;
lock_sock(ssk);
slowpath = lock_sock_fast(ssk);
done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
release_sock(ssk);
unlock_sock_fast(ssk, slowpath);
} while (!done);
if (mptcp_ofo_queue(msk) || moved > 0) {
......@@ -1748,7 +1775,7 @@ static void mptcp_worker(struct work_struct *work)
long timeo = 0;
lock_sock(sk);
mptcp_clean_una(sk);
mptcp_clean_una_wakeup(sk);
mptcp_check_data_fin_ack(sk);
__mptcp_flush_join_list(msk);
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
......
......@@ -362,6 +362,7 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
}
int mptcp_is_enabled(struct net *net);
unsigned int mptcp_get_add_addr_timeout(struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool mptcp_subflow_data_available(struct sock *sk);
......
......@@ -5,3 +5,13 @@ CONFIG_INET_DIAG=m
CONFIG_INET_MPTCP_DIAG=m
CONFIG_VETH=y
CONFIG_NET_SCH_NETEM=m
CONFIG_NETFILTER=y
CONFIG_NETFILTER_ADVANCED=y
CONFIG_NETFILTER_NETLINK=m
CONFIG_NF_TABLES=m
CONFIG_NFT_COUNTER=m
CONFIG_NFT_COMPAT=m
CONFIG_NETFILTER_XTABLES=m
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NF_TABLES_IPV4=y
CONFIG_NF_TABLES_IPV6=y
......@@ -13,6 +13,24 @@ capture=0
TEST_COUNT=0
# generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) ||
# (ip6 && (ip6[74] & 0xf0) == 0x30)'"
CBPF_MPTCP_SUBOPTION_ADD_ADDR="14,
48 0 0 0,
84 0 0 240,
21 0 3 64,
48 0 0 54,
84 0 0 240,
21 6 7 48,
48 0 0 0,
84 0 0 240,
21 0 4 96,
48 0 0 74,
84 0 0 240,
21 0 1 48,
6 0 0 65535,
6 0 0 0"
init()
{
capout=$(mktemp)
......@@ -82,6 +100,26 @@ reset_with_cookies()
done
}
reset_with_add_addr_timeout()
{
local ip="${1:-4}"
local tables
tables="iptables"
if [ $ip -eq 6 ]; then
tables="ip6tables"
fi
reset
ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1
ip netns exec $ns2 $tables -A OUTPUT -p tcp \
-m tcp --tcp-option 30 \
-m bpf --bytecode \
"$CBPF_MPTCP_SUBOPTION_ADD_ADDR" \
-j DROP
}
for arg in "$@"; do
if [ "$arg" = "-c" ]; then
capture=1
......@@ -94,6 +132,17 @@ if [ $? -ne 0 ];then
exit $ksft_skip
fi
iptables -V > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run all tests without iptables tool"
exit $ksft_skip
fi
ip6tables -V > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run all tests without ip6tables tool"
exit $ksft_skip
fi
check_transfer()
{
......@@ -135,6 +184,7 @@ do_transfer()
connect_addr="$5"
rm_nr_ns1="$6"
rm_nr_ns2="$7"
speed="$8"
port=$((10000+$TEST_COUNT))
TEST_COUNT=$((TEST_COUNT+1))
......@@ -159,7 +209,7 @@ do_transfer()
sleep 1
fi
if [[ $rm_nr_ns1 -eq 0 && $rm_nr_ns2 -eq 0 ]]; then
if [ $speed = "fast" ]; then
mptcp_connect="./mptcp_connect -j"
else
mptcp_connect="./mptcp_connect -r"
......@@ -250,26 +300,13 @@ run_tests()
listener_ns="$1"
connector_ns="$2"
connect_addr="$3"
rm_nr_ns1="${4:-0}"
rm_nr_ns2="${5:-0}"
speed="${6:-fast}"
lret=0
do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} 0 0
lret=$?
if [ $lret -ne 0 ]; then
ret=$lret
return
fi
}
run_remove_tests()
{
listener_ns="$1"
connector_ns="$2"
connect_addr="$3"
rm_nr_ns1="$4"
rm_nr_ns2="$5"
lret=0
do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${rm_nr_ns1} ${rm_nr_ns2}
do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \
${rm_nr_ns1} ${rm_nr_ns2} ${speed}
lret=$?
if [ $lret -ne 0 ]; then
ret=$lret
......@@ -491,12 +528,21 @@ run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "multiple subflows and signal" 3 3 3
chk_add_nr 1 1
# add_addr timeout
reset_with_add_addr_timeout
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 1 1
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
run_tests $ns1 $ns2 10.0.1.1 0 0 slow
chk_join_nr "signal address, ADD_ADDR timeout" 1 1 1
chk_add_nr 4 0
# single subflow, remove
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_remove_tests $ns1 $ns2 10.0.1.1 0 1
run_tests $ns1 $ns2 10.0.1.1 0 1 slow
chk_join_nr "remove single subflow" 1 1 1
chk_rm_nr 1 1
......@@ -506,7 +552,7 @@ ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_remove_tests $ns1 $ns2 10.0.1.1 0 2
run_tests $ns1 $ns2 10.0.1.1 0 2 slow
chk_join_nr "remove multiple subflows" 2 2 2
chk_rm_nr 2 2
......@@ -515,7 +561,7 @@ reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 1
run_remove_tests $ns1 $ns2 10.0.1.1 1 0
run_tests $ns1 $ns2 10.0.1.1 1 0 slow
chk_join_nr "remove single address" 1 1 1
chk_add_nr 1 1
chk_rm_nr 0 0
......@@ -526,7 +572,7 @@ ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_remove_tests $ns1 $ns2 10.0.1.1 1 1
run_tests $ns1 $ns2 10.0.1.1 1 1 slow
chk_join_nr "remove subflow and signal" 2 2 2
chk_add_nr 1 1
chk_rm_nr 1 1
......@@ -538,7 +584,7 @@ ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 3
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
run_remove_tests $ns1 $ns2 10.0.1.1 1 2
run_tests $ns1 $ns2 10.0.1.1 1 2 slow
chk_join_nr "remove subflows and signal" 3 3 3
chk_add_nr 1 1
chk_rm_nr 2 2
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment