Commit 23212a70 authored by David S. Miller's avatar David S. Miller

Merge branch 'mptcp-add-receive-buffer-auto-tuning'

Florian Westphal says:

====================
mptcp: add receive buffer auto-tuning

First patch extends the test script to allow for reproducible results.
Second patch adds receive auto-tuning.  It's based on what TCP is doing,
only difference is that we use the largest RTT of any of the subflows
and that we will update all subflows with the new value.

Else, we get spurious packet drops because the mptcp work queue might
not be able to move packets from subflow socket to master socket
fast enough.  Without the adjustment, TCP may drop the packets because
the subflow socket is over its rcvbuffer limit.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents b97e9d9d a6b118fe
......@@ -179,13 +179,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
return false;
}
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
if (rcvbuf > sk->sk_rcvbuf)
sk->sk_rcvbuf = rcvbuf;
}
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
......@@ -916,6 +909,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
return copied;
}
/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
 *
 * Grows sk_rcvbuf of the mptcp socket based on the amount of data the
 * application consumed (@copied) during roughly one rtt, then propagates
 * the new value to every subflow so their advertised windows keep up.
 * Caller must hold the msk socket lock.
 */
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 time, advmss = 1;
	u64 rtt_us, mstamp;

	sock_owned_by_me(sk);

	if (copied <= 0)
		return;

	/* accumulate bytes handed to userspace in this measurement window */
	msk->rcvq_space.copied += copied;

	/* elapsed time (usec) since the current window started */
	mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
	time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);

	/* rtt_us appears to be kept left-shifted by 3, as in tcp's
	 * rcv_rtt_est (hence the >> 3 below) — sampled from
	 * tp->rcv_rtt_est.rtt_us in the loop further down.
	 * Nothing to do until at least one rtt has passed.
	 */
	rtt_us = msk->rcvq_space.rtt_us;
	if (rtt_us && time < (rtt_us >> 3))
		return;

	/* take the worst-case (largest) rtt and mss over all subflows */
	rtt_us = 0;
	mptcp_for_each_subflow(msk, subflow) {
		const struct tcp_sock *tp;
		u64 sf_rtt_us;
		u32 sf_advmss;

		tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));

		sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
		sf_advmss = READ_ONCE(tp->advmss);

		rtt_us = max(sf_rtt_us, rtt_us);
		advmss = max(sf_advmss, advmss);
	}

	msk->rcvq_space.rtt_us = rtt_us;
	/* re-check against the freshly aggregated rtt; bail if no rtt
	 * estimate exists yet
	 */
	if (time < (rtt_us >> 3) || rtt_us == 0)
		return;

	/* demand did not grow versus the previous window: keep rcvbuf as-is */
	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
		goto new_measure;

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		/* window sizing mirrors tcp_rcv_space_adjust: start from
		 * twice the copied amount plus slack ...
		 */
		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

		/* ... and add twice the relative growth versus the last
		 * window (do_div divides rcvwin growth in place)
		 */
		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

		do_div(grow, msk->rcvq_space.space);
		rcvwin += (grow << 1);

		/* per-skb truesize needed to hold one advmss of payload */
		rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < advmss)
			rcvmem += 128;

		do_div(rcvwin, advmss);
		/* clamp to the tcp_rmem[2] sysctl maximum */
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

		if (rcvbuf > sk->sk_rcvbuf) {
			u32 window_clamp;

			window_clamp = tcp_win_from_space(sk, rcvbuf);
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make subflows follow along.  If we do not do this, we
			 * get drops at subflow level if skbs can't be moved to
			 * the mptcp rx queue fast enough (announced rcv_win can
			 * exceed ssk->sk_rcvbuf).
			 */
			mptcp_for_each_subflow(msk, subflow) {
				struct sock *ssk;

				ssk = mptcp_subflow_tcp_sock(subflow);
				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
				tcp_sk(ssk)->window_clamp = window_clamp;
			}
		}
	}

	msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
	/* start a fresh measurement window */
	msk->rcvq_space.copied = 0;
	msk->rcvq_space.time = mstamp;
}
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
unsigned int moved = 0;
......@@ -1028,6 +1115,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
set_bit(MPTCP_DATA_READY, &msk->flags);
}
out_err:
mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
return copied;
}
......@@ -1241,6 +1330,7 @@ static int mptcp_init_sock(struct sock *sk)
return ret;
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
return 0;
......@@ -1423,6 +1513,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
return nsk;
}
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
const struct tcp_sock *tp = tcp_sk(ssk);
msk->rcvq_space.copied = 0;
msk->rcvq_space.rtt_us = 0;
msk->rcvq_space.time = tp->tcp_mstamp;
/* initial rcv_space offering made to peer */
msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
TCP_INIT_CWND * tp->advmss);
if (msk->rcvq_space.space == 0)
msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
}
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
bool kern)
{
......@@ -1471,6 +1577,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
list_add(&subflow->node, &msk->conn_list);
inet_sk_state_store(newsk, TCP_ESTABLISHED);
mptcp_rcv_space_init(msk, ssk);
bh_unlock_sock(new_mptcp_sock);
__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
......@@ -1631,6 +1738,8 @@ void mptcp_finish_connect(struct sock *ssk)
atomic64_set(&msk->snd_una, msk->write_seq);
mptcp_pm_new_connection(msk, 0);
mptcp_rcv_space_init(msk, ssk);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
......
......@@ -209,6 +209,12 @@ struct mptcp_sock {
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
struct mptcp_pm_data pm;
struct {
u32 space; /* bytes copied in last measurement window */
u32 copied; /* bytes copied in this measurement window */
u64 time; /* start time of measurement window */
u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space;
};
#define mptcp_for_each_subflow(__msk, __subflow) \
......@@ -369,6 +375,7 @@ void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
void mptcp_data_acked(struct sock *sk);
......
......@@ -225,8 +225,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
pr_fallback(mptcp_sk(subflow->conn));
}
if (mptcp_check_fallback(sk))
if (mptcp_check_fallback(sk)) {
mptcp_rcv_space_init(mptcp_sk(parent), sk);
return;
}
if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
......@@ -1118,6 +1120,7 @@ static void subflow_state_change(struct sock *sk)
if (subflow_simultaneous_connect(sk)) {
mptcp_do_fallback(sk);
mptcp_rcv_space_init(mptcp_sk(parent), sk);
pr_fallback(mptcp_sk(parent));
subflow->conn_finished = 1;
if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
......
......@@ -3,7 +3,7 @@
time_start=$(date +%s)
optstring="S:R:d:e:l:r:h4cm:"
optstring="S:R:d:e:l:r:h4cm:f:t"
ret=0
sin=""
sout=""
......@@ -21,6 +21,8 @@ testmode=""
sndbuf=0
rcvbuf=0
options_log=true
do_tcp=0
filesize=0
if [ $tc_loss -eq 100 ];then
tc_loss=1%
......@@ -40,9 +42,11 @@ usage() {
echo -e "\t-e: ethtool features to disable, e.g.: \"-e tso -e gso\" (default: randomly disable any of tso/gso/gro)"
echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)"
echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
echo -e "\t-f: size of file to transfer in bytes (default random)"
echo -e "\t-S: set sndbuf value (default: use kernel default)"
echo -e "\t-R: set rcvbuf value (default: use kernel default)"
echo -e "\t-m: test mode (poll, sendfile; default: poll)"
echo -e "\t-t: also run tests with TCP (use twice to non-fallback tcp)"
}
while getopts "$optstring" option;do
......@@ -94,6 +98,12 @@ while getopts "$optstring" option;do
"m")
testmode="$OPTARG"
;;
"f")
filesize="$OPTARG"
;;
"t")
do_tcp=$((do_tcp+1))
;;
"?")
usage $0
exit 1
......@@ -449,20 +459,25 @@ make_file()
{
local name=$1
local who=$2
local SIZE=$filesize
local ksize
local rem
local SIZE TSIZE
SIZE=$((RANDOM % (1024 * 8)))
TSIZE=$((SIZE * 1024))
if [ $SIZE -eq 0 ]; then
local MAXSIZE=$((1024 * 1024 * 8))
local MINSIZE=$((1024 * 256))
dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null
SIZE=$(((RANDOM * RANDOM + MINSIZE) % MAXSIZE))
fi
SIZE=$((RANDOM % 1024))
SIZE=$((SIZE + 128))
TSIZE=$((TSIZE + SIZE))
dd if=/dev/urandom conv=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null
ksize=$((SIZE / 1024))
rem=$((SIZE - (ksize * 1024)))
dd if=/dev/urandom of="$name" bs=1024 count=$ksize 2> /dev/null
dd if=/dev/urandom conv=notrunc of="$name" bs=1 count=$rem 2> /dev/null
echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name"
echo "Created $name (size $TSIZE) containing data sent by $who"
echo "Created $name (size $(du -b "$name")) containing data sent by $who"
}
run_tests_lo()
......@@ -497,9 +512,11 @@ run_tests_lo()
return 1
fi
# don't bother testing fallback tcp except for loopback case.
if [ ${listener_ns} != ${connector_ns} ]; then
return 0
if [ $do_tcp -eq 0 ]; then
# don't bother testing fallback tcp except for loopback case.
if [ ${listener_ns} != ${connector_ns} ]; then
return 0
fi
fi
do_transfer ${listener_ns} ${connector_ns} MPTCP TCP ${connect_addr} ${local_addr}
......@@ -516,6 +533,15 @@ run_tests_lo()
return 1
fi
if [ $do_tcp -gt 1 ] ;then
do_transfer ${listener_ns} ${connector_ns} TCP TCP ${connect_addr} ${local_addr}
lret=$?
if [ $lret -ne 0 ]; then
ret=$lret
return 1
fi
fi
return 0
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment