Commit 1a2e10a2 authored by David S. Miller

Merge branch 'Rework-ip_ra_chain-protection'

Kirill Tkhai says:

====================
Rework ip_ra_chain protection

Commit 1215e51e "ipv4: fix a deadlock in ip_ra_control"
made rtnl_lock() be used in raw_close(). This function is called
on every RAW socket destruction, so the rtnl_mutex is taken
every time. This scales very poorly. I observe cleanup_net()
spending a lot of time in rtnl_lock(), and raw_close() is one
of the biggest rtnl users (since we have percpu net->ipv4.icmp_sk).

This patchset reworks the locking: reverts the problem commit
and its descendant, and introduces rtnl-independent locking.
This may have a continuation, and someone may work on killing
rtnl_lock() in mrtsock_destruct() in the future.

v3: Change patches order: [2/5] and [3/5].
v2: Fix sparse warning [4/5], as reported by kbuild test robot.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents f2d254fa d9ff3049
...@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb) ...@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb)
return 0; return 0;
} }
/* Special input handler for packets caught by router alert option.
They are selected only by protocol field, and then processed likely
local ones; but only if someone wants them! Otherwise, router
not running rsvpd will kill RSVP.
It is user level problem, what it will make with them.
I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
but receiver should be enough clever f.e. to forward mtrace requests,
sent to multicast group to reach destination designated router.
*/
struct ip_ra_chain { struct ip_ra_chain {
struct ip_ra_chain __rcu *next; struct ip_ra_chain __rcu *next;
struct sock *sk; struct sock *sk;
...@@ -101,8 +112,6 @@ struct ip_ra_chain { ...@@ -101,8 +112,6 @@ struct ip_ra_chain {
struct rcu_head rcu; struct rcu_head rcu;
}; };
extern struct ip_ra_chain __rcu *ip_ra_chain;
/* IP flags. */ /* IP flags. */
#define IP_CE 0x8000 /* Flag: "Congestion" */ #define IP_CE 0x8000 /* Flag: "Congestion" */
#define IP_DF 0x4000 /* Flag: "Don't Fragment" */ #define IP_DF 0x4000 /* Flag: "Don't Fragment" */
......
...@@ -49,6 +49,8 @@ struct netns_ipv4 { ...@@ -49,6 +49,8 @@ struct netns_ipv4 {
#endif #endif
struct ipv4_devconf *devconf_all; struct ipv4_devconf *devconf_all;
struct ipv4_devconf *devconf_dflt; struct ipv4_devconf *devconf_dflt;
struct ip_ra_chain __rcu *ra_chain;
struct mutex ra_mutex;
#ifdef CONFIG_IP_MULTIPLE_TABLES #ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops; struct fib_rules_ops *rules_ops;
bool fib_has_custom_rules; bool fib_has_custom_rules;
......
...@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) ...@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
net->user_ns = user_ns; net->user_ns = user_ns;
idr_init(&net->netns_ids); idr_init(&net->netns_ids);
spin_lock_init(&net->nsid_lock); spin_lock_init(&net->nsid_lock);
mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) { list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net); error = ops_init(ops, net);
......
...@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) ...@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
struct net_device *dev = skb->dev; struct net_device *dev = skb->dev;
struct net *net = dev_net(dev); struct net *net = dev_net(dev);
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk; struct sock *sk = ra->sk;
/* If socket is bound to an interface, only report /* If socket is bound to an interface, only report
...@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) ...@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
*/ */
if (sk && inet_sk(sk)->inet_num == protocol && if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if || (!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) && sk->sk_bound_dev_if == dev->ifindex)) {
net_eq(sock_net(sk), net)) {
if (ip_is_fragment(ip_hdr(skb))) { if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
return true; return true;
......
...@@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, ...@@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
return 0; return 0;
} }
/* Special input handler for packets caught by router alert option.
They are selected only by protocol field, and then processed likely
local ones; but only if someone wants them! Otherwise, router
not running rsvpd will kill RSVP.
It is user level problem, what it will make with them.
I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
but receiver should be enough clever f.e. to forward mtrace requests,
sent to multicast group to reach destination designated router.
*/
struct ip_ra_chain __rcu *ip_ra_chain;
static void ip_ra_destroy_rcu(struct rcu_head *head) static void ip_ra_destroy_rcu(struct rcu_head *head)
{ {
struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
...@@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on, ...@@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on,
{ {
struct ip_ra_chain *ra, *new_ra; struct ip_ra_chain *ra, *new_ra;
struct ip_ra_chain __rcu **rap; struct ip_ra_chain __rcu **rap;
struct net *net = sock_net(sk);
if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
return -EINVAL; return -EINVAL;
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
for (rap = &ip_ra_chain; mutex_lock(&net->ipv4.ra_mutex);
(ra = rtnl_dereference(*rap)) != NULL; for (rap = &net->ipv4.ra_chain;
(ra = rcu_dereference_protected(*rap,
lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
rap = &ra->next) { rap = &ra->next) {
if (ra->sk == sk) { if (ra->sk == sk) {
if (on) { if (on) {
mutex_unlock(&net->ipv4.ra_mutex);
kfree(new_ra); kfree(new_ra);
return -EADDRINUSE; return -EADDRINUSE;
} }
/* dont let ip_call_ra_chain() use sk again */ /* dont let ip_call_ra_chain() use sk again */
ra->sk = NULL; ra->sk = NULL;
RCU_INIT_POINTER(*rap, ra->next); RCU_INIT_POINTER(*rap, ra->next);
mutex_unlock(&net->ipv4.ra_mutex);
if (ra->destructor) if (ra->destructor)
ra->destructor(sk); ra->destructor(sk);
...@@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on, ...@@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
return 0; return 0;
} }
} }
if (!new_ra) if (!new_ra) {
mutex_unlock(&net->ipv4.ra_mutex);
return -ENOBUFS; return -ENOBUFS;
}
new_ra->sk = sk; new_ra->sk = sk;
new_ra->destructor = destructor; new_ra->destructor = destructor;
RCU_INIT_POINTER(new_ra->next, ra); RCU_INIT_POINTER(new_ra->next, ra);
rcu_assign_pointer(*rap, new_ra); rcu_assign_pointer(*rap, new_ra);
sock_hold(sk); sock_hold(sk);
mutex_unlock(&net->ipv4.ra_mutex);
return 0; return 0;
} }
...@@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname) ...@@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname)
case MCAST_LEAVE_GROUP: case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_UNBLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE:
case IP_ROUTER_ALERT:
return true; return true;
} }
return false; return false;
...@@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, ...@@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
/* If optlen==0, it is equivalent to val == 0 */ /* If optlen==0, it is equivalent to val == 0 */
if (optname == IP_ROUTER_ALERT)
return ip_ra_control(sk, val ? 1 : 0, NULL);
if (ip_mroute_opt(optname)) if (ip_mroute_opt(optname))
return ip_mroute_setsockopt(sk, optname, optval, optlen); return ip_mroute_setsockopt(sk, optname, optval, optlen);
...@@ -1149,9 +1144,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, ...@@ -1149,9 +1144,6 @@ static int do_ip_setsockopt(struct sock *sk, int level,
goto e_inval; goto e_inval;
inet->mc_all = val; inet->mc_all = val;
break; break;
case IP_ROUTER_ALERT:
err = ip_ra_control(sk, val ? 1 : 0, NULL);
break;
case IP_FREEBIND: case IP_FREEBIND:
if (optlen < 1) if (optlen < 1)
......
...@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk) ...@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk)
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
struct mr_table *mrt; struct mr_table *mrt;
ASSERT_RTNL(); rtnl_lock();
ipmr_for_each_table(mrt, net) { ipmr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) { if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
...@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk) ...@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk)
mroute_clean_tables(mrt, false); mroute_clean_tables(mrt, false);
} }
} }
rtnl_unlock();
} }
/* Socket options and virtual interface manipulation. The whole /* Socket options and virtual interface manipulation. The whole
...@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ...@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
if (sk != rcu_access_pointer(mrt->mroute_sk)) { if (sk != rcu_access_pointer(mrt->mroute_sk)) {
ret = -EACCES; ret = -EACCES;
} else { } else {
/* We need to unlock here because mrtsock_destruct takes
* care of rtnl itself and we can't change that due to
* the IP_ROUTER_ALERT setsockopt which runs without it.
*/
rtnl_unlock();
ret = ip_ra_control(sk, 0, NULL); ret = ip_ra_control(sk, 0, NULL);
goto out_unlock; goto out;
} }
break; break;
case MRT_ADD_VIF: case MRT_ADD_VIF:
...@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ...@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
} }
out_unlock: out_unlock:
rtnl_unlock(); rtnl_unlock();
out:
return ret; return ret;
} }
......
...@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout) ...@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout)
/* /*
* Raw sockets may have direct kernel references. Kill them. * Raw sockets may have direct kernel references. Kill them.
*/ */
rtnl_lock();
ip_ra_control(sk, 0, NULL); ip_ra_control(sk, 0, NULL);
rtnl_unlock();
sk_common_release(sk); sk_common_release(sk);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment