Commit afb0c192 authored by David S. Miller's avatar David S. Miller

Merge branch 'mptcp-remove-msk-subflow'

Matthieu Baerts says:

====================
mptcp: get rid of msk->subflow

The MPTCP protocol maintains an additional struct socket per connection,
mainly to be able to easily use tcp-level struct socket operations.

This leads to several side effects, beyond the quite unfortunate /
confusing 'subflow' field name:

- active and passive sockets behaviour is inconsistent: only active ones
  have a not NULL msk->subflow, leading to different error handling and
  different error code returned to the user-space in several places.

- active sockets uses an unneeded, larger amount of memory

- passive sockets can't successfully go through accept(), disconnect(),
  accept() sequence, see [1] for more details.

The 13 first patches of this series are from Paolo and address all the
above, finally getting rid of the blamed field:

- The first patch is a minor clean-up.

- In the next 11 patches, msk->subflow usage is systematically removed
  from the MPTCP protocol, replacing it with direct msk->first usage,
  eventually introducing new core helpers when needed.

- The 13th patch finally disposes the field, and it's the only patch in
  the series intended to produce functional changes.

The last and 14th patch is from Kuniyuki and it is not linked to the
previous ones: it is a small clean-up to get rid of an unnecessary check
in mptcp_init_sock().

[1] https://github.com/multipath-tcp/mptcp_net-next/issues/290
====================
Signed-off-by: default avatarMatthieu Baerts <matthieu.baerts@tessares.net>
parents f614a29d e2636917
...@@ -40,8 +40,10 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ...@@ -40,8 +40,10 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags); int flags);
int inet_shutdown(struct socket *sock, int how); int inet_shutdown(struct socket *sock, int how);
int inet_listen(struct socket *sock, int backlog); int inet_listen(struct socket *sock, int backlog);
int __inet_listen_sk(struct sock *sk, int backlog);
void inet_sock_destruct(struct sock *sk); void inet_sock_destruct(struct sock *sk);
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
/* Don't allocate port at this moment, defer to connect. */ /* Don't allocate port at this moment, defer to connect. */
#define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0)
/* Grab and release socket lock. */ /* Grab and release socket lock. */
......
...@@ -1216,6 +1216,7 @@ void inet6_cleanup_sock(struct sock *sk); ...@@ -1216,6 +1216,7 @@ void inet6_cleanup_sock(struct sock *sk);
void inet6_sock_destruct(struct sock *sk); void inet6_sock_destruct(struct sock *sk);
int inet6_release(struct socket *sock); int inet6_release(struct socket *sock);
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
int peer); int peer);
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
......
...@@ -187,24 +187,13 @@ static int inet_autobind(struct sock *sk) ...@@ -187,24 +187,13 @@ static int inet_autobind(struct sock *sk)
return 0; return 0;
} }
/* int __inet_listen_sk(struct sock *sk, int backlog)
* Move a socket into listening state.
*/
int inet_listen(struct socket *sock, int backlog)
{ {
struct sock *sk = sock->sk; unsigned char old_state = sk->sk_state;
unsigned char old_state;
int err, tcp_fastopen; int err, tcp_fastopen;
lock_sock(sk);
err = -EINVAL;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
goto out;
old_state = sk->sk_state;
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out; return -EINVAL;
WRITE_ONCE(sk->sk_max_ack_backlog, backlog); WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
/* Really, if the socket is already in listen state /* Really, if the socket is already in listen state
...@@ -227,10 +216,27 @@ int inet_listen(struct socket *sock, int backlog) ...@@ -227,10 +216,27 @@ int inet_listen(struct socket *sock, int backlog)
err = inet_csk_listen_start(sk); err = inet_csk_listen_start(sk);
if (err) if (err)
goto out; return err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
} }
err = 0; return 0;
}
/*
* Move a socket into listening state.
*/
int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
int err = -EINVAL;
lock_sock(sk);
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
goto out;
err = __inet_listen_sk(sk, backlog);
out: out:
release_sock(sk); release_sock(sk);
...@@ -431,9 +437,8 @@ int inet_release(struct socket *sock) ...@@ -431,9 +437,8 @@ int inet_release(struct socket *sock)
} }
EXPORT_SYMBOL(inet_release); EXPORT_SYMBOL(inet_release);
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{ {
struct sock *sk = sock->sk;
u32 flags = BIND_WITH_LOCK; u32 flags = BIND_WITH_LOCK;
int err; int err;
...@@ -454,6 +459,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ...@@ -454,6 +459,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return __inet_bind(sk, uaddr, addr_len, flags); return __inet_bind(sk, uaddr, addr_len, flags);
} }
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
return inet_bind_sk(sock->sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_bind); EXPORT_SYMBOL(inet_bind);
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
......
...@@ -435,10 +435,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, ...@@ -435,10 +435,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
goto out; goto out;
} }
/* bind for INET6 API */ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{ {
struct sock *sk = sock->sk;
u32 flags = BIND_WITH_LOCK; u32 flags = BIND_WITH_LOCK;
const struct proto *prot; const struct proto *prot;
int err = 0; int err = 0;
...@@ -462,6 +460,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ...@@ -462,6 +460,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return __inet6_bind(sk, uaddr, addr_len, flags); return __inet6_bind(sk, uaddr, addr_len, flags);
} }
/* bind for INET6 API */
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
return inet6_bind_sk(sock->sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet6_bind); EXPORT_SYMBOL(inet6_bind);
int inet6_release(struct socket *sock) int inet6_release(struct socket *sock)
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <linux/inet.h> #include <linux/inet.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <net/tcp.h> #include <net/tcp.h>
#include <net/inet_common.h>
#include <net/netns/generic.h> #include <net/netns/generic.h>
#include <net/mptcp.h> #include <net/mptcp.h>
#include <net/genetlink.h> #include <net/genetlink.h>
...@@ -1005,8 +1006,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, ...@@ -1005,8 +1006,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
bool is_ipv6 = sk->sk_family == AF_INET6; bool is_ipv6 = sk->sk_family == AF_INET6;
int addrlen = sizeof(struct sockaddr_in); int addrlen = sizeof(struct sockaddr_in);
struct sockaddr_storage addr; struct sockaddr_storage addr;
struct socket *ssock; struct sock *newsk, *ssk;
struct sock *newsk;
int backlog = 1024; int backlog = 1024;
int err; int err;
...@@ -1032,28 +1032,32 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, ...@@ -1032,28 +1032,32 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
&mptcp_keys[is_ipv6]); &mptcp_keys[is_ipv6]);
lock_sock(newsk); lock_sock(newsk);
ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); ssk = __mptcp_nmpc_sk(mptcp_sk(newsk));
release_sock(newsk); release_sock(newsk);
if (IS_ERR(ssock)) if (IS_ERR(ssk))
return PTR_ERR(ssock); return PTR_ERR(ssk);
mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
#if IS_ENABLED(CONFIG_MPTCP_IPV6) #if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (entry->addr.family == AF_INET6) if (entry->addr.family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6); addrlen = sizeof(struct sockaddr_in6);
#endif #endif
err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen); if (ssk->sk_family == AF_INET)
err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (ssk->sk_family == AF_INET6)
err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
#endif
if (err) if (err)
return err; return err;
inet_sk_state_store(newsk, TCP_LISTEN); inet_sk_state_store(newsk, TCP_LISTEN);
err = kernel_listen(ssock, backlog); lock_sock(ssk);
if (err) err = __inet_listen_sk(ssk, backlog);
return err; if (!err)
mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); release_sock(ssk);
return err;
return 0;
} }
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
......
This diff is collapsed.
...@@ -299,7 +299,8 @@ struct mptcp_sock { ...@@ -299,7 +299,8 @@ struct mptcp_sock {
cork:1, cork:1,
nodelay:1, nodelay:1,
fastopening:1, fastopening:1,
in_accept_queue:1; in_accept_queue:1,
free_first:1;
struct work_struct work; struct work_struct work;
struct sk_buff *ooo_last_skb; struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue; struct rb_root out_of_order_queue;
...@@ -308,12 +309,10 @@ struct mptcp_sock { ...@@ -308,12 +309,10 @@ struct mptcp_sock {
struct list_head rtx_queue; struct list_head rtx_queue;
struct mptcp_data_frag *first_pending; struct mptcp_data_frag *first_pending;
struct list_head join_list; struct list_head join_list;
struct socket *subflow; /* outgoing connect/listener/!mp_capable struct sock *first; /* The mptcp ops can safely dereference, using suitable
* The mptcp ops can safely dereference, using suitable * ONCE annotation, the subflow outside the socket
* ONCE annotation, the subflow outside the socket * lock as such sock is freed after close().
* lock as such sock is freed after close(). */
*/
struct sock *first;
struct mptcp_pm_data pm; struct mptcp_pm_data pm;
struct { struct {
u32 space; /* bytes copied in last measurement window */ u32 space; /* bytes copied in last measurement window */
...@@ -640,7 +639,7 @@ void __mptcp_subflow_send_ack(struct sock *ssk); ...@@ -640,7 +639,7 @@ void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk);
void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent); void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk); struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout); bool __mptcp_close(struct sock *sk, long timeout);
void mptcp_cancel_work(struct sock *sk); void mptcp_cancel_work(struct sock *sk);
void __mptcp_unaccepted_force_close(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk);
......
...@@ -292,7 +292,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, ...@@ -292,7 +292,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
sockptr_t optval, unsigned int optlen) sockptr_t optval, unsigned int optlen)
{ {
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
struct socket *ssock; struct sock *ssk;
int ret; int ret;
switch (optname) { switch (optname) {
...@@ -301,22 +301,22 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, ...@@ -301,22 +301,22 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
case SO_BINDTODEVICE: case SO_BINDTODEVICE:
case SO_BINDTOIFINDEX: case SO_BINDTOIFINDEX:
lock_sock(sk); lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk); ssk = __mptcp_nmpc_sk(msk);
if (IS_ERR(ssock)) { if (IS_ERR(ssk)) {
release_sock(sk); release_sock(sk);
return PTR_ERR(ssock); return PTR_ERR(ssk);
} }
ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen);
if (ret == 0) { if (ret == 0) {
if (optname == SO_REUSEPORT) if (optname == SO_REUSEPORT)
sk->sk_reuseport = ssock->sk->sk_reuseport; sk->sk_reuseport = ssk->sk_reuseport;
else if (optname == SO_REUSEADDR) else if (optname == SO_REUSEADDR)
sk->sk_reuse = ssock->sk->sk_reuse; sk->sk_reuse = ssk->sk_reuse;
else if (optname == SO_BINDTODEVICE) else if (optname == SO_BINDTODEVICE)
sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
else if (optname == SO_BINDTOIFINDEX) else if (optname == SO_BINDTOIFINDEX)
sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
} }
release_sock(sk); release_sock(sk);
return ret; return ret;
...@@ -390,20 +390,20 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, ...@@ -390,20 +390,20 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
{ {
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
int ret = -EOPNOTSUPP; int ret = -EOPNOTSUPP;
struct socket *ssock; struct sock *ssk;
switch (optname) { switch (optname) {
case IPV6_V6ONLY: case IPV6_V6ONLY:
case IPV6_TRANSPARENT: case IPV6_TRANSPARENT:
case IPV6_FREEBIND: case IPV6_FREEBIND:
lock_sock(sk); lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk); ssk = __mptcp_nmpc_sk(msk);
if (IS_ERR(ssock)) { if (IS_ERR(ssk)) {
release_sock(sk); release_sock(sk);
return PTR_ERR(ssock); return PTR_ERR(ssk);
} }
ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen);
if (ret != 0) { if (ret != 0) {
release_sock(sk); release_sock(sk);
return ret; return ret;
...@@ -413,13 +413,13 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, ...@@ -413,13 +413,13 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
switch (optname) { switch (optname) {
case IPV6_V6ONLY: case IPV6_V6ONLY:
sk->sk_ipv6only = ssock->sk->sk_ipv6only; sk->sk_ipv6only = ssk->sk_ipv6only;
break; break;
case IPV6_TRANSPARENT: case IPV6_TRANSPARENT:
inet_sk(sk)->transparent = inet_sk(ssock->sk)->transparent; inet_sk(sk)->transparent = inet_sk(ssk)->transparent;
break; break;
case IPV6_FREEBIND: case IPV6_FREEBIND:
inet_sk(sk)->freebind = inet_sk(ssock->sk)->freebind; inet_sk(sk)->freebind = inet_sk(ssk)->freebind;
break; break;
} }
...@@ -685,7 +685,7 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o ...@@ -685,7 +685,7 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o
{ {
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
struct inet_sock *issk; struct inet_sock *issk;
struct socket *ssock; struct sock *ssk;
int err; int err;
err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
...@@ -694,13 +694,13 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o ...@@ -694,13 +694,13 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o
lock_sock(sk); lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk); ssk = __mptcp_nmpc_sk(msk);
if (IS_ERR(ssock)) { if (IS_ERR(ssk)) {
release_sock(sk); release_sock(sk);
return PTR_ERR(ssock); return PTR_ERR(ssk);
} }
issk = inet_sk(ssock->sk); issk = inet_sk(ssk);
switch (optname) { switch (optname) {
case IP_FREEBIND: case IP_FREEBIND:
...@@ -763,18 +763,18 @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int ...@@ -763,18 +763,18 @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
sockptr_t optval, unsigned int optlen) sockptr_t optval, unsigned int optlen)
{ {
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
struct socket *sock; struct sock *ssk;
int ret; int ret;
/* Limit to first subflow, before the connection establishment */ /* Limit to first subflow, before the connection establishment */
lock_sock(sk); lock_sock(sk);
sock = __mptcp_nmpc_socket(msk); ssk = __mptcp_nmpc_sk(msk);
if (IS_ERR(sock)) { if (IS_ERR(ssk)) {
ret = PTR_ERR(sock); ret = PTR_ERR(ssk);
goto unlock; goto unlock;
} }
ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
unlock: unlock:
release_sock(sk); release_sock(sk);
...@@ -864,9 +864,8 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int ...@@ -864,9 +864,8 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
char __user *optval, int __user *optlen) char __user *optval, int __user *optlen)
{ {
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
struct socket *ssock;
int ret;
struct sock *ssk; struct sock *ssk;
int ret;
lock_sock(sk); lock_sock(sk);
ssk = msk->first; ssk = msk->first;
...@@ -875,13 +874,13 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int ...@@ -875,13 +874,13 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
goto out; goto out;
} }
ssock = __mptcp_nmpc_socket(msk); ssk = __mptcp_nmpc_sk(msk);
if (IS_ERR(ssock)) { if (IS_ERR(ssk)) {
ret = PTR_ERR(ssock); ret = PTR_ERR(ssk);
goto out; goto out;
} }
ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen); ret = tcp_getsockopt(ssk, level, optname, optval, optlen);
out: out:
release_sock(sk); release_sock(sk);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment