Commit 66f5d6ce, authored by Wei Wang, committed by David S. Miller

ipv6: replace rwlock with rcu and spinlock in fib6_table

With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sections protected by the write lock will now be
protected by the per-table spin lock.
All previous sections protected by the read lock will now be protected by
rcu_read_lock().

A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.

2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.

3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. This is because
fib6_walk_continue() makes modifications to the walker structure, and so
do fib6_repair_tree() and fib6_del_route(). In order to synchronize
properly between them, fib6_walk_continue() has to run with the lock held.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.

4. When fib6_del_route() removes a route from the tree, we no longer
set rt->dst.rt6_next to NULL, so that concurrent readers are still able
to traverse the rest of the list under rcu. However, rt->dst.rt6_next is
only valid within this same rcu period. No one should access it later.

5. All atomic_inc(rt->rt6i_ref) operations are changed to be
performed before we publish the route (either by linking it to fn->leaf
or by inserting it into the list pointed to by fn->leaf), just to be safe,
because as soon as we publish the route, a reader thread may access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 17ecf590
...@@ -101,7 +101,7 @@ struct dst_entry { ...@@ -101,7 +101,7 @@ struct dst_entry {
union { union {
struct dst_entry *next; struct dst_entry *next;
struct rtable __rcu *rt_next; struct rtable __rcu *rt_next;
struct rt6_info *rt6_next; struct rt6_info __rcu *rt6_next;
struct dn_route __rcu *dn_next; struct dn_route __rcu *dn_next;
}; };
}; };
......
...@@ -68,18 +68,18 @@ struct fib6_config { ...@@ -68,18 +68,18 @@ struct fib6_config {
}; };
struct fib6_node { struct fib6_node {
struct fib6_node *parent; struct fib6_node __rcu *parent;
struct fib6_node *left; struct fib6_node __rcu *left;
struct fib6_node *right; struct fib6_node __rcu *right;
#ifdef CONFIG_IPV6_SUBTREES #ifdef CONFIG_IPV6_SUBTREES
struct fib6_node *subtree; struct fib6_node __rcu *subtree;
#endif #endif
struct rt6_info *leaf; struct rt6_info __rcu *leaf;
__u16 fn_bit; /* bit key */ __u16 fn_bit; /* bit key */
__u16 fn_flags; __u16 fn_flags;
int fn_sernum; int fn_sernum;
struct rt6_info *rr_ptr; struct rt6_info __rcu *rr_ptr;
struct rcu_head rcu; struct rcu_head rcu;
}; };
...@@ -91,7 +91,7 @@ struct fib6_gc_args { ...@@ -91,7 +91,7 @@ struct fib6_gc_args {
#ifndef CONFIG_IPV6_SUBTREES #ifndef CONFIG_IPV6_SUBTREES
#define FIB6_SUBTREE(fn) NULL #define FIB6_SUBTREE(fn) NULL
#else #else
#define FIB6_SUBTREE(fn) ((fn)->subtree) #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1))
#endif #endif
struct mx6_config { struct mx6_config {
...@@ -174,6 +174,14 @@ struct rt6_info { ...@@ -174,6 +174,14 @@ struct rt6_info {
unused:7; unused:7;
}; };
#define for_each_fib6_node_rt_rcu(fn) \
for (rt = rcu_dereference((fn)->leaf); rt; \
rt = rcu_dereference(rt->dst.rt6_next))
#define for_each_fib6_walker_rt(w) \
for (rt = (w)->leaf; rt; \
rt = rcu_dereference_protected(rt->dst.rt6_next, 1))
static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
{ {
return ((struct rt6_info *)dst)->rt6i_idev; return ((struct rt6_info *)dst)->rt6i_idev;
...@@ -310,7 +318,7 @@ struct rt6_statistics { ...@@ -310,7 +318,7 @@ struct rt6_statistics {
struct fib6_table { struct fib6_table {
struct hlist_node tb6_hlist; struct hlist_node tb6_hlist;
u32 tb6_id; u32 tb6_id;
rwlock_t tb6_lock; spinlock_t tb6_lock;
struct fib6_node tb6_root; struct fib6_node tb6_root;
struct inet_peer_base tb6_peers; struct inet_peer_base tb6_peers;
unsigned int flags; unsigned int flags;
......
...@@ -2321,12 +2321,12 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, ...@@ -2321,12 +2321,12 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
if (!table) if (!table)
return NULL; return NULL;
read_lock_bh(&table->tb6_lock); rcu_read_lock();
fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true);
if (!fn) if (!fn)
goto out; goto out;
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { for_each_fib6_node_rt_rcu(fn) {
if (rt->dst.dev->ifindex != dev->ifindex) if (rt->dst.dev->ifindex != dev->ifindex)
continue; continue;
if ((rt->rt6i_flags & flags) != flags) if ((rt->rt6i_flags & flags) != flags)
...@@ -2338,7 +2338,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, ...@@ -2338,7 +2338,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
break; break;
} }
out: out:
read_unlock_bh(&table->tb6_lock); rcu_read_unlock();
return rt; return rt;
} }
...@@ -5898,10 +5898,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) ...@@ -5898,10 +5898,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
spin_lock(&ifa->lock); spin_lock(&ifa->lock);
if (ifa->rt) { if (ifa->rt) {
struct rt6_info *rt = ifa->rt; struct rt6_info *rt = ifa->rt;
struct fib6_table *table = rt->rt6i_table;
int cpu; int cpu;
read_lock(&table->tb6_lock); rcu_read_lock();
addrconf_set_nopolicy(ifa->rt, val); addrconf_set_nopolicy(ifa->rt, val);
if (rt->rt6i_pcpu) { if (rt->rt6i_pcpu) {
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
...@@ -5911,7 +5910,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) ...@@ -5911,7 +5910,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
addrconf_set_nopolicy(*rtp, val); addrconf_set_nopolicy(*rtp, val);
} }
} }
read_unlock(&table->tb6_lock); rcu_read_unlock();
} }
spin_unlock(&ifa->lock); spin_unlock(&ifa->lock);
} }
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment