Commit 7e1c5d7b authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'mptcp-miscellaneous-changes-for-5-17'

Mat Martineau says:

====================
mptcp: Miscellaneous changes for 5.17

These are three unrelated patches that we've been testing in the MPTCP
tree.

Patch 1 modifies the packet scheduler that picks which TCP subflow is
used for each chunk of outgoing data. The updated scheduler improves
throughput on multiple-subflow connections.

Patch 2 updates a selftest to verify recent TCP_ULP sockopt changes
on MPTCP fallback sockets.

Patch 3 cleans up some unnecessary comparisons with an 8-bit value.
====================

Link: https://lore.kernel.org/r/20211217233702.299461-1-mathew.j.martineau@linux.intel.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents ab9d0e21 59060a47
...@@ -38,7 +38,8 @@ struct mptcp_pm_add_entry { ...@@ -38,7 +38,8 @@ struct mptcp_pm_add_entry {
u8 retrans_times; u8 retrans_times;
}; };
#define MAX_ADDR_ID 255 /* max value of mptcp_addr_info.id */
#define MAX_ADDR_ID U8_MAX
#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG) #define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG)
struct pm_nl_pernet { struct pm_nl_pernet {
...@@ -825,14 +826,13 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, ...@@ -825,14 +826,13 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
entry->addr.id = find_next_zero_bit(pernet->id_bitmap, entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
MAX_ADDR_ID + 1, MAX_ADDR_ID + 1,
pernet->next_id); pernet->next_id);
if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) && if (!entry->addr.id && pernet->next_id != 1) {
pernet->next_id != 1) {
pernet->next_id = 1; pernet->next_id = 1;
goto find_next; goto find_next;
} }
} }
if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID) if (!entry->addr.id)
goto out; goto out;
__set_bit(entry->addr.id, pernet->id_bitmap); __set_bit(entry->addr.id, pernet->id_bitmap);
......
...@@ -1372,7 +1372,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -1372,7 +1372,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct subflow_send_info { struct subflow_send_info {
struct sock *ssk; struct sock *ssk;
u64 ratio; u64 linger_time;
}; };
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
...@@ -1397,20 +1397,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) ...@@ -1397,20 +1397,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
return __mptcp_subflow_active(subflow); return __mptcp_subflow_active(subflow);
} }
#define SSK_MODE_ACTIVE 0
#define SSK_MODE_BACKUP 1
#define SSK_MODE_MAX 2
/* implement the mptcp packet scheduler; /* implement the mptcp packet scheduler;
* returns the subflow that will transmit the next DSS * returns the subflow that will transmit the next DSS
* additionally updates the rtx timeout * additionally updates the rtx timeout
*/ */
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{ {
struct subflow_send_info send_info[2]; struct subflow_send_info send_info[SSK_MODE_MAX];
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
u32 pace, burst, wmem;
int i, nr_active = 0; int i, nr_active = 0;
struct sock *ssk; struct sock *ssk;
u64 linger_time;
long tout = 0; long tout = 0;
u64 ratio;
u32 pace;
sock_owned_by_me(sk); sock_owned_by_me(sk);
...@@ -1429,10 +1433,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) ...@@ -1429,10 +1433,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
} }
/* pick the subflow with the lower wmem/wspace ratio */ /* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < 2; ++i) { for (i = 0; i < SSK_MODE_MAX; ++i) {
send_info[i].ssk = NULL; send_info[i].ssk = NULL;
send_info[i].ratio = -1; send_info[i].linger_time = -1;
} }
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
trace_mptcp_subflow_get_send(subflow); trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow); ssk = mptcp_subflow_tcp_sock(subflow);
...@@ -1441,34 +1446,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) ...@@ -1441,34 +1446,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
tout = max(tout, mptcp_timeout_from_subflow(subflow)); tout = max(tout, mptcp_timeout_from_subflow(subflow));
nr_active += !subflow->backup; nr_active += !subflow->backup;
if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) pace = subflow->avg_pacing_rate;
continue; if (unlikely(!pace)) {
/* init pacing rate from socket */
pace = READ_ONCE(ssk->sk_pacing_rate); subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
pace = subflow->avg_pacing_rate;
if (!pace) if (!pace)
continue; continue;
}
ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
pace); if (linger_time < send_info[subflow->backup].linger_time) {
if (ratio < send_info[subflow->backup].ratio) {
send_info[subflow->backup].ssk = ssk; send_info[subflow->backup].ssk = ssk;
send_info[subflow->backup].ratio = ratio; send_info[subflow->backup].linger_time = linger_time;
} }
} }
__mptcp_set_timeout(sk, tout); __mptcp_set_timeout(sk, tout);
/* pick the best backup if no other subflow is active */ /* pick the best backup if no other subflow is active */
if (!nr_active) if (!nr_active)
send_info[0].ssk = send_info[1].ssk; send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
if (send_info[0].ssk) { /* According to the blest algorithm, to avoid HoL blocking for the
msk->last_snd = send_info[0].ssk; * faster flow, we need to:
msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, * - estimate the faster flow linger time
tcp_sk(msk->last_snd)->snd_wnd); * - use the above to estimate the amount of byte transferred
return msk->last_snd; * by the faster flow
} * - check that the amount of queued data is greter than the above,
* otherwise do not use the picked, slower, subflow
* We select the subflow with the shorter estimated time to flush
* the queued mem, which basically ensure the above. We just need
* to check that subflow has a non empty cwin.
*/
ssk = send_info[SSK_MODE_ACTIVE].ssk;
if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
return NULL; return NULL;
burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
wmem = READ_ONCE(ssk->sk_wmem_queued);
subflow = mptcp_subflow_ctx(ssk);
subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
READ_ONCE(ssk->sk_pacing_rate) * burst,
burst + wmem);
msk->last_snd = ssk;
msk->snd_burst = burst;
return ssk;
} }
static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
......
...@@ -395,6 +395,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); ...@@ -395,6 +395,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
/* MPTCP subflow context */ /* MPTCP subflow context */
struct mptcp_subflow_context { struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */ struct list_head node;/* conn_list of subflows */
unsigned long avg_pacing_rate; /* protected by msk socket lock */
u64 local_key; u64 local_key;
u64 remote_key; u64 remote_key;
u64 idsn; u64 idsn;
......
...@@ -59,7 +59,6 @@ static enum cfg_peek cfg_peek = CFG_NONE_PEEK; ...@@ -59,7 +59,6 @@ static enum cfg_peek cfg_peek = CFG_NONE_PEEK;
static const char *cfg_host; static const char *cfg_host;
static const char *cfg_port = "12000"; static const char *cfg_port = "12000";
static int cfg_sock_proto = IPPROTO_MPTCP; static int cfg_sock_proto = IPPROTO_MPTCP;
static bool tcpulp_audit;
static int pf = AF_INET; static int pf = AF_INET;
static int cfg_sndbuf; static int cfg_sndbuf;
static int cfg_rcvbuf; static int cfg_rcvbuf;
...@@ -103,7 +102,6 @@ static void die_usage(void) ...@@ -103,7 +102,6 @@ static void die_usage(void)
fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n"); fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n");
fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n");
fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-M mark -- set socket packet mark\n");
fprintf(stderr, "\t-u -- check mptcp ulp\n");
fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n"); fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n"); fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
fprintf(stderr, "\t-o option -- test sockopt <option>\n"); fprintf(stderr, "\t-o option -- test sockopt <option>\n");
...@@ -215,6 +213,42 @@ static void set_transparent(int fd, int pf) ...@@ -215,6 +213,42 @@ static void set_transparent(int fd, int pf)
} }
} }
static int do_ulp_so(int sock, const char *name)
{
return setsockopt(sock, IPPROTO_TCP, TCP_ULP, name, strlen(name));
}
#define X(m) xerror("%s:%u: %s: failed for proto %d at line %u", __FILE__, __LINE__, (m), proto, line)
static void sock_test_tcpulp(int sock, int proto, unsigned int line)
{
socklen_t buflen = 8;
char buf[8] = "";
int ret = getsockopt(sock, IPPROTO_TCP, TCP_ULP, buf, &buflen);
if (ret != 0)
X("getsockopt");
if (buflen > 0) {
if (strcmp(buf, "mptcp") != 0)
xerror("unexpected ULP '%s' for proto %d at line %u", buf, proto, line);
ret = do_ulp_so(sock, "tls");
if (ret == 0)
X("setsockopt");
} else if (proto == IPPROTO_MPTCP) {
ret = do_ulp_so(sock, "tls");
if (ret != -1)
X("setsockopt");
}
ret = do_ulp_so(sock, "mptcp");
if (ret != -1)
X("setsockopt");
#undef X
}
#define SOCK_TEST_TCPULP(s, p) sock_test_tcpulp((s), (p), __LINE__)
static int sock_listen_mptcp(const char * const listenaddr, static int sock_listen_mptcp(const char * const listenaddr,
const char * const port) const char * const port)
{ {
...@@ -238,6 +272,8 @@ static int sock_listen_mptcp(const char * const listenaddr, ...@@ -238,6 +272,8 @@ static int sock_listen_mptcp(const char * const listenaddr,
if (sock < 0) if (sock < 0)
continue; continue;
SOCK_TEST_TCPULP(sock, cfg_sock_proto);
if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one, if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one,
sizeof(one))) sizeof(one)))
perror("setsockopt"); perror("setsockopt");
...@@ -260,50 +296,17 @@ static int sock_listen_mptcp(const char * const listenaddr, ...@@ -260,50 +296,17 @@ static int sock_listen_mptcp(const char * const listenaddr,
return sock; return sock;
} }
SOCK_TEST_TCPULP(sock, cfg_sock_proto);
if (listen(sock, 20)) { if (listen(sock, 20)) {
perror("listen"); perror("listen");
close(sock); close(sock);
return -1; return -1;
} }
return sock; SOCK_TEST_TCPULP(sock, cfg_sock_proto);
}
static bool sock_test_tcpulp(const char * const remoteaddr,
const char * const port)
{
struct addrinfo hints = {
.ai_protocol = IPPROTO_TCP,
.ai_socktype = SOCK_STREAM,
};
struct addrinfo *a, *addr;
int sock = -1, ret = 0;
bool test_pass = false;
hints.ai_family = AF_INET; return sock;
xgetaddrinfo(remoteaddr, port, &hints, &addr);
for (a = addr; a; a = a->ai_next) {
sock = socket(a->ai_family, a->ai_socktype, IPPROTO_TCP);
if (sock < 0) {
perror("socket");
continue;
}
ret = setsockopt(sock, IPPROTO_TCP, TCP_ULP, "mptcp",
sizeof("mptcp"));
if (ret == -1 && errno == EOPNOTSUPP)
test_pass = true;
close(sock);
if (test_pass)
break;
if (!ret)
fprintf(stderr,
"setsockopt(TCP_ULP) returned 0\n");
else
perror("setsockopt(TCP_ULP)");
}
return test_pass;
} }
static int sock_connect_mptcp(const char * const remoteaddr, static int sock_connect_mptcp(const char * const remoteaddr,
...@@ -326,6 +329,8 @@ static int sock_connect_mptcp(const char * const remoteaddr, ...@@ -326,6 +329,8 @@ static int sock_connect_mptcp(const char * const remoteaddr,
continue; continue;
} }
SOCK_TEST_TCPULP(sock, proto);
if (cfg_mark) if (cfg_mark)
set_mark(sock, cfg_mark); set_mark(sock, cfg_mark);
...@@ -338,6 +343,8 @@ static int sock_connect_mptcp(const char * const remoteaddr, ...@@ -338,6 +343,8 @@ static int sock_connect_mptcp(const char * const remoteaddr,
} }
freeaddrinfo(addr); freeaddrinfo(addr);
if (sock != -1)
SOCK_TEST_TCPULP(sock, proto);
return sock; return sock;
} }
...@@ -954,6 +961,8 @@ int main_loop_s(int listensock) ...@@ -954,6 +961,8 @@ int main_loop_s(int listensock)
check_sockaddr(pf, &ss, salen); check_sockaddr(pf, &ss, salen);
check_getpeername(remotesock, &ss, salen); check_getpeername(remotesock, &ss, salen);
SOCK_TEST_TCPULP(remotesock, 0);
return copyfd_io(0, remotesock, 1); return copyfd_io(0, remotesock, 1);
} }
...@@ -1059,6 +1068,8 @@ int main_loop(void) ...@@ -1059,6 +1068,8 @@ int main_loop(void)
check_getpeername_connect(fd); check_getpeername_connect(fd);
SOCK_TEST_TCPULP(fd, cfg_sock_proto);
if (cfg_rcvbuf) if (cfg_rcvbuf)
set_rcvbuf(fd, cfg_rcvbuf); set_rcvbuf(fd, cfg_rcvbuf);
if (cfg_sndbuf) if (cfg_sndbuf)
...@@ -1151,7 +1162,7 @@ static void parse_opts(int argc, char **argv) ...@@ -1151,7 +1162,7 @@ static void parse_opts(int argc, char **argv)
{ {
int c; int c;
while ((c = getopt(argc, argv, "6jr:lp:s:hut:T:m:S:R:w:M:P:c:o:")) != -1) { while ((c = getopt(argc, argv, "6jr:lp:s:ht:T:m:S:R:w:M:P:c:o:")) != -1) {
switch (c) { switch (c) {
case 'j': case 'j':
cfg_join = true; cfg_join = true;
...@@ -1177,9 +1188,6 @@ static void parse_opts(int argc, char **argv) ...@@ -1177,9 +1188,6 @@ static void parse_opts(int argc, char **argv)
case 'h': case 'h':
die_usage(); die_usage();
break; break;
case 'u':
tcpulp_audit = true;
break;
case '6': case '6':
pf = AF_INET6; pf = AF_INET6;
break; break;
...@@ -1233,9 +1241,6 @@ int main(int argc, char *argv[]) ...@@ -1233,9 +1241,6 @@ int main(int argc, char *argv[])
signal(SIGUSR1, handle_signal); signal(SIGUSR1, handle_signal);
parse_opts(argc, argv); parse_opts(argc, argv);
if (tcpulp_audit)
return sock_test_tcpulp(cfg_host, cfg_port) ? 0 : 1;
if (listen_mode) { if (listen_mode) {
int fd = sock_listen_mptcp(cfg_host, cfg_port); int fd = sock_listen_mptcp(cfg_host, cfg_port);
......
...@@ -296,24 +296,6 @@ check_mptcp_disabled() ...@@ -296,24 +296,6 @@ check_mptcp_disabled()
return 0 return 0
} }
check_mptcp_ulp_setsockopt()
{
local t retval
t="ns_ulp-$sech-$(mktemp -u XXXXXX)"
ip netns add ${t} || exit $ksft_skip
if ! ip netns exec ${t} ./mptcp_connect -u -p 10000 -s TCP 127.0.0.1 2>&1; then
printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) allowed\t[ FAIL ]\n"
retval=1
ret=$retval
else
printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) blocked\t[ OK ]\n"
retval=0
fi
ip netns del ${t}
return $retval
}
# $1: IP address # $1: IP address
is_v6() is_v6()
{ {
...@@ -780,8 +762,6 @@ make_file "$sin" "server" ...@@ -780,8 +762,6 @@ make_file "$sin" "server"
check_mptcp_disabled check_mptcp_disabled
check_mptcp_ulp_setsockopt
stop_if_error "The kernel configuration is not valid for MPTCP" stop_if_error "The kernel configuration is not valid for MPTCP"
echo "INFO: validating network environment with pings" echo "INFO: validating network environment with pings"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment