Commit 4caaf758 authored by Jakub Kicinski

Merge branch 'net-speedup-netns-dismantles'

Eric Dumazet says:

====================
net: speedup netns dismantles

From: Eric Dumazet <edumazet@google.com>

In this series, I made network namespace deletions more scalable,
by 4x on the little benchmark described in this cover letter.

- Remove the bottleneck on ipv6 addrconf, by replacing a global
  hash table with a per-netns one.

- Rework many (struct pernet_operations)->exit() handlers into
  exit_batch() ones. This removes many rtnl acquisitions,
  and gives cleanup_net() a kind of priority over rtnl
  ownership.

Tested on a host with 24 cpus (48 HT)

Test script:

for nr in {1..10}
do
  (for i in {1..10000}; do unshare -n /bin/bash -c "ifconfig lo up"; done) &
done
wait

for i in {1..10}
do
  sleep 1
  echo 3 >/proc/sys/vm/drop_caches
  grep net_namespace /proc/slabinfo
done

Before: We can see the host struggles to clean up the netns, even after there are no new creations.
Memory cost is high, because each netns consumes a good amount of memory.

time ./unshare10.sh
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      82634  82634   3968    1    1 : tunables   24   12    8 : slabdata  82634  82634      0
net_namespace      37214  37792   3968    1    1 : tunables   24   12    8 : slabdata  37214  37792    192

real	6m57.766s
user	3m37.277s
sys	40m4.826s

After: We can see the script completes much faster,
the kernel thread doing the cleanup_net() keeps up just fine.
Memory cost is not too big.

time ./unshare10.sh
net_namespace       9945   9945   4096    1    1 : tunables   24   12    8 : slabdata   9945   9945      0
net_namespace       4087   4665   4096    1    1 : tunables   24   12    8 : slabdata   4087   4665    192
net_namespace       4082   4607   4096    1    1 : tunables   24   12    8 : slabdata   4082   4607    192
net_namespace        234    761   4096    1    1 : tunables   24   12    8 : slabdata    234    761    192
net_namespace        224    751   4096    1    1 : tunables   24   12    8 : slabdata    224    751    192
net_namespace        218    745   4096    1    1 : tunables   24   12    8 : slabdata    218    745    192
net_namespace        193    667   4096    1    1 : tunables   24   12    8 : slabdata    193    667    172
net_namespace        167    609   4096    1    1 : tunables   24   12    8 : slabdata    167    609    152
net_namespace        167    609   4096    1    1 : tunables   24   12    8 : slabdata    167    609    152
net_namespace        157    609   4096    1    1 : tunables   24   12    8 : slabdata    157    609    152

real    1m43.876s
user    3m39.728s
sys 7m36.342s
====================

Link: https://lore.kernel.org/r/20220208045038.2635826-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents b2309a71 ee403248
......@@ -6048,27 +6048,38 @@ static int __net_init bond_net_init(struct net *net)
return 0;
}
static void __net_exit bond_net_exit(struct net *net)
static void __net_exit bond_net_exit_batch(struct list_head *net_list)
{
struct bond_net *bn = net_generic(net, bond_net_id);
struct bonding *bond, *tmp_bond;
struct bond_net *bn;
struct net *net;
LIST_HEAD(list);
bond_destroy_sysfs(bn);
list_for_each_entry(net, net_list, exit_list) {
bn = net_generic(net, bond_net_id);
bond_destroy_sysfs(bn);
}
/* Kill off any bonds created after unregistering bond rtnl ops */
rtnl_lock();
list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list)
unregister_netdevice_queue(bond->dev, &list);
list_for_each_entry(net, net_list, exit_list) {
struct bonding *bond, *tmp_bond;
bn = net_generic(net, bond_net_id);
list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list)
unregister_netdevice_queue(bond->dev, &list);
}
unregister_netdevice_many(&list);
rtnl_unlock();
bond_destroy_proc_dir(bn);
list_for_each_entry(net, net_list, exit_list) {
bn = net_generic(net, bond_net_id);
bond_destroy_proc_dir(bn);
}
}
static struct pernet_operations bond_net_ops = {
.init = bond_net_init,
.exit = bond_net_exit,
.exit_batch = bond_net_exit_batch,
.id = &bond_net_id,
.size = sizeof(struct bond_net),
};
......
......@@ -307,7 +307,6 @@ void __net_init bond_create_proc_dir(struct bond_net *bn)
}
/* Destroy the bonding directory under /proc/net, if empty.
* Caller must hold rtnl_lock.
*/
void __net_exit bond_destroy_proc_dir(struct bond_net *bn)
{
......
......@@ -92,6 +92,11 @@ struct netns_ipv6 {
struct sock *tcp_sk;
struct sock *igmp_sk;
struct sock *mc_autojoin_sk;
struct hlist_head *inet6_addr_lst;
spinlock_t addrconf_hash_lock;
struct delayed_work addr_chk_work;
#ifdef CONFIG_IPV6_MROUTE
#ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
struct mr_table *mrt6;
......
......@@ -1239,16 +1239,19 @@ static int __net_init cangw_pernet_init(struct net *net)
return 0;
}
static void __net_exit cangw_pernet_exit(struct net *net)
static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list)
{
struct net *net;
rtnl_lock();
cgw_remove_all_jobs(net);
list_for_each_entry(net, net_list, exit_list)
cgw_remove_all_jobs(net);
rtnl_unlock();
}
static struct pernet_operations cangw_pernet_ops = {
.init = cangw_pernet_init,
.exit = cangw_pernet_exit,
.exit_batch = cangw_pernet_exit_batch,
};
static __init int cgw_module_init(void)
......
......@@ -10850,14 +10850,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
.exit = netdev_exit,
};
static void __net_exit default_device_exit(struct net *net)
static void __net_exit default_device_exit_net(struct net *net)
{
struct net_device *dev, *aux;
/*
* Push all migratable network devices back to the
* initial network namespace
*/
rtnl_lock();
ASSERT_RTNL();
for_each_netdev_safe(net, dev, aux) {
int err;
char fb_name[IFNAMSIZ];
......@@ -10881,22 +10881,22 @@ static void __net_exit default_device_exit(struct net *net)
BUG();
}
}
rtnl_unlock();
}
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
/* Return with the rtnl_lock held when there are no network
/* Return (with the rtnl_lock held) when there are no network
* devices unregistering in any network namespace in net_list.
*/
struct net *net;
bool unregistering;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
bool unregistering;
struct net *net;
ASSERT_RTNL();
add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
unregistering = false;
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
if (net->dev_unreg_count > 0) {
unregistering = true;
......@@ -10908,6 +10908,7 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
__rtnl_unlock();
wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
rtnl_lock();
}
remove_wait_queue(&netdev_unregistering_wq, &wait);
}
......@@ -10923,6 +10924,11 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
struct net *net;
LIST_HEAD(dev_kill_list);
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
default_device_exit_net(net);
cond_resched();
}
/* To prevent network device cleanup code from dereferencing
* loopback devices or network devices that have been freed
* wait here for all pending unregistrations to complete,
......@@ -10935,6 +10941,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
* default_device_exit_batch.
*/
rtnl_lock_unregistering(net_list);
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
......@@ -10948,7 +10955,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
static struct pernet_operations __net_initdata default_device_ops = {
.exit = default_device_exit,
.exit_batch = default_device_exit_batch,
};
......
......@@ -1556,7 +1556,7 @@ static void ip_fib_net_exit(struct net *net)
{
int i;
rtnl_lock();
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTIPLE_TABLES
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
......@@ -1581,7 +1581,7 @@ static void ip_fib_net_exit(struct net *net)
#ifdef CONFIG_IP_MULTIPLE_TABLES
fib4_rules_exit(net);
#endif
rtnl_unlock();
kfree(net->ipv4.fib_table_hash);
fib4_notifier_exit(net);
}
......@@ -1608,7 +1608,9 @@ static int __net_init fib_net_init(struct net *net)
out_proc:
nl_fib_lookup_exit(net);
out_nlfl:
rtnl_lock();
ip_fib_net_exit(net);
rtnl_unlock();
goto out;
}
......@@ -1616,12 +1618,23 @@ static void __net_exit fib_net_exit(struct net *net)
{
fib_proc_exit(net);
nl_fib_lookup_exit(net);
ip_fib_net_exit(net);
}
/*
 * Batched netns teardown for the IPv4 FIB.
 *
 * Takes the rtnl lock once and calls ip_fib_net_exit() for every
 * struct net linked on @net_list through its exit_list member, instead
 * of acquiring rtnl separately for each namespace.
 */
static void __net_exit fib_net_exit_batch(struct list_head *net_list)
{
	struct net *net;

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		ip_fib_net_exit(net);
	}
	rtnl_unlock();
}
/* Per-netns hooks for the IPv4 FIB: init, per-netns exit, batched exit. */
static struct pernet_operations fib_net_ops = {
	.init		= fib_net_init,
	.exit		= fib_net_exit,
	.exit_batch	= fib_net_exit_batch,
};
void __init ip_fib_init(void)
......
......@@ -266,13 +266,12 @@ static void __net_exit ipmr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
rtnl_lock();
ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
list_del(&mrt->list);
ipmr_free_table(mrt);
}
fib_rules_unregister(net->ipv4.mr_rules_ops);
rtnl_unlock();
}
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
......@@ -328,10 +327,9 @@ static int __net_init ipmr_rules_init(struct net *net)
static void __net_exit ipmr_rules_exit(struct net *net)
{
rtnl_lock();
ASSERT_RTNL();
ipmr_free_table(net->ipv4.mrt);
net->ipv4.mrt = NULL;
rtnl_unlock();
}
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
......@@ -3075,7 +3073,9 @@ static int __net_init ipmr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
rtnl_lock();
ipmr_rules_exit(net);
rtnl_unlock();
#endif
ipmr_rules_fail:
ipmr_notifier_exit(net);
......@@ -3090,12 +3090,22 @@ static void __net_exit ipmr_net_exit(struct net *net)
remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
ipmr_notifier_exit(net);
ipmr_rules_exit(net);
}
/*
 * Batched netns teardown for IPv4 multicast routing rules.
 *
 * Holds the rtnl lock across the whole walk and calls ipmr_rules_exit()
 * for each struct net linked on @net_list through its exit_list member.
 */
static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
{
	struct net *net;

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		ipmr_rules_exit(net);
	}
	rtnl_unlock();
}
/* Per-netns hooks for IPv4 multicast routing: init, per-netns exit, batched exit. */
static struct pernet_operations ipmr_net_ops = {
	.init		= ipmr_net_init,
	.exit		= ipmr_net_exit,
	.exit_batch	= ipmr_net_exit_batch,
};
int __init ip_mr_init(void)
......
......@@ -3733,12 +3733,16 @@ void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);
static void __net_exit nexthop_net_exit(struct net *net)
static void __net_exit nexthop_net_exit_batch(struct list_head *net_list)
{
struct net *net;
rtnl_lock();
flush_all_nexthops(net);
list_for_each_entry(net, net_list, exit_list) {
flush_all_nexthops(net);
kfree(net->nexthop.devhash);
}
rtnl_unlock();
kfree(net->nexthop.devhash);
}
static int __net_init nexthop_net_init(struct net *net)
......@@ -3756,7 +3760,7 @@ static int __net_init nexthop_net_init(struct net *net)
static struct pernet_operations nexthop_net_ops = {
.init = nexthop_net_init,
.exit = nexthop_net_exit,
.exit_batch = nexthop_net_exit_batch,
};
static int __init nexthop_init(void)
......
This diff is collapsed.
......@@ -493,16 +493,21 @@ static int __net_init fib6_rules_net_init(struct net *net)
goto out;
}
static void __net_exit fib6_rules_net_exit(struct net *net)
static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
{
struct net *net;
rtnl_lock();
fib_rules_unregister(net->ipv6.fib6_rules_ops);
list_for_each_entry(net, net_list, exit_list) {
fib_rules_unregister(net->ipv6.fib6_rules_ops);
cond_resched();
}
rtnl_unlock();
}
static struct pernet_operations fib6_rules_net_ops = {
.init = fib6_rules_net_init,
.exit = fib6_rules_net_exit,
.exit_batch = fib6_rules_net_exit_batch,
};
int __init fib6_rules_init(void)
......
......@@ -253,13 +253,12 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
rtnl_lock();
ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
list_del(&mrt->list);
ip6mr_free_table(mrt);
}
fib_rules_unregister(net->ipv6.mr6_rules_ops);
rtnl_unlock();
}
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
......@@ -316,10 +315,9 @@ static int __net_init ip6mr_rules_init(struct net *net)
static void __net_exit ip6mr_rules_exit(struct net *net)
{
rtnl_lock();
ASSERT_RTNL();
ip6mr_free_table(net->ipv6.mrt6);
net->ipv6.mrt6 = NULL;
rtnl_unlock();
}
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
......@@ -1323,7 +1321,9 @@ static int __net_init ip6mr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip6_mr_vif", net->proc_net);
proc_vif_fail:
rtnl_lock();
ip6mr_rules_exit(net);
rtnl_unlock();
#endif
ip6mr_rules_fail:
ip6mr_notifier_exit(net);
......@@ -1336,13 +1336,23 @@ static void __net_exit ip6mr_net_exit(struct net *net)
remove_proc_entry("ip6_mr_cache", net->proc_net);
remove_proc_entry("ip6_mr_vif", net->proc_net);
#endif
ip6mr_rules_exit(net);
ip6mr_notifier_exit(net);
}
/*
 * Batched netns teardown for IPv6 multicast routing rules.
 *
 * Holds the rtnl lock across the whole walk and calls ip6mr_rules_exit()
 * for each struct net linked on @net_list through its exit_list member.
 */
static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list)
{
	struct net *net;

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		ip6mr_rules_exit(net);
	}
	rtnl_unlock();
}
/* Per-netns hooks for IPv6 multicast routing: init, per-netns exit, batched exit. */
static struct pernet_operations ip6mr_net_ops = {
	.init		= ip6mr_net_init,
	.exit		= ip6mr_net_exit,
	.exit_batch	= ip6mr_net_exit_batch,
};
int __init ip6_mr_init(void)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment