Commit f34436a4 authored by David Ahern, committed by David S. Miller

net/ipv6: Simplify route replace and appending into multipath route

Bring consistency to ipv6 route replace and append semantics.

Remove rt6_qualify_for_ecmp, which is just guesswork. It fails in 2 cases:
1. Cannot replace a route with a reject route. Existing code appends
   a new route instead of replacing the existing one.

2. Cannot have a multipath route where a leg uses a dev-only nexthop.

Existing use cases affected by this change:
1. adding a route with existing prefix and metric using NLM_F_CREATE
   without NLM_F_APPEND or NLM_F_EXCL (ie., what iproute2 calls
   'prepend'). Existing code auto-determines that the new nexthop can
   be appended to an existing route to create a multipath route. This
   change breaks that by requiring the APPEND flag for the new route
   to be added to an existing one. Instead the prepend just adds another
   route entry.

2. route replace. Existing code replaces the first matching multipath route
   if the new route is multipath capable and falls back to the first matching
   non-ECMP route (reject or dev only route) in case one isn't available.
   New behavior replaces the first matching route. (Thanks to Ido for spotting
   this one)

Note: Newer iproute2 is needed to display multipath routes with a dev-only
      nexthop. This is due to a bug in iproute2 and parsing nexthops.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 5a15a1b0
...@@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) ...@@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
} }
/* Helper removed by this commit.
 *
 * Returns true when, among the RTF_GATEWAY, RTF_ADDRCONF and RTF_DYNAMIC
 * bits of f6i->fib6_flags, RTF_GATEWAY is the only one set. Callers used
 * this as the test for whether a route may take part in a multipath
 * (ECMP) route; the commit message describes it as guesswork that
 * mishandles reject routes and dev-only nexthops.
 */
static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
{
return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
RTF_GATEWAY;
}
void ip6_route_input(struct sk_buff *skb); void ip6_route_input(struct sk_buff *skb);
struct dst_entry *ip6_route_input_lookup(struct net *net, struct dst_entry *ip6_route_input_lookup(struct net *net,
struct net_device *dev, struct net_device *dev,
......
...@@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, ...@@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
{ {
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&rt->fib6_table->tb6_lock)); lockdep_is_held(&rt->fib6_table->tb6_lock));
struct fib6_info *iter = NULL; struct fib6_info *iter = NULL, *match = NULL;
struct fib6_info __rcu **ins; struct fib6_info __rcu **ins;
struct fib6_info __rcu **fallback_ins = NULL;
int replace = (info->nlh && int replace = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_REPLACE)); (info->nlh->nlmsg_flags & NLM_F_REPLACE));
int append = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_APPEND));
int add = (!info->nlh || int add = (!info->nlh ||
(info->nlh->nlmsg_flags & NLM_F_CREATE)); (info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0; int found = 0;
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
u16 nlflags = NLM_F_EXCL; u16 nlflags = NLM_F_EXCL;
int err; int err;
if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) if (append)
nlflags |= NLM_F_APPEND; nlflags |= NLM_F_APPEND;
ins = &fn->leaf; ins = &fn->leaf;
...@@ -968,14 +968,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, ...@@ -968,14 +968,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
nlflags &= ~NLM_F_EXCL; nlflags &= ~NLM_F_EXCL;
if (replace) { if (replace) {
if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
found++; found++;
break; break;
} }
if (rt_can_ecmp)
fallback_ins = fallback_ins ?: ins;
goto next_iter;
}
if (rt6_duplicate_nexthop(iter, rt)) { if (rt6_duplicate_nexthop(iter, rt)) {
if (rt->fib6_nsiblings) if (rt->fib6_nsiblings)
...@@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, ...@@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
return -EEXIST; return -EEXIST;
} }
/* If we have the same destination and the same metric,
* but not the same gateway, then the route we try to /* first route that matches */
* add is sibling to this route, increment our counter if (!match)
* of siblings, and later we will add our route to the match = iter;
* list.
* Only static routes (which don't have flag
* RTF_EXPIRES) are used for ECMPv6.
*
* To avoid long list, we only had siblings if the
* route have a gateway.
*/
if (rt_can_ecmp &&
rt6_qualify_for_ecmp(iter))
rt->fib6_nsiblings++;
} }
if (iter->fib6_metric > rt->fib6_metric) if (iter->fib6_metric > rt->fib6_metric)
break; break;
next_iter:
ins = &iter->fib6_next; ins = &iter->fib6_next;
} }
if (fallback_ins && !found) {
/* No ECMP-able route found, replace first non-ECMP one */
ins = fallback_ins;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
found++;
}
/* Reset round-robin state, if necessary */ /* Reset round-robin state, if necessary */
if (ins == &fn->leaf) if (ins == &fn->leaf)
fn->rr_ptr = NULL; fn->rr_ptr = NULL;
/* Link this route to others same route. */ /* Link this route to others same route. */
if (rt->fib6_nsiblings) { if (append && match) {
unsigned int fib6_nsiblings;
struct fib6_info *sibling, *temp_sibling; struct fib6_info *sibling, *temp_sibling;
/* Find the first route that have the same metric */ if (rt->fib6_flags & RTF_REJECT) {
sibling = leaf; NL_SET_ERR_MSG(extack,
while (sibling) { "Can not append a REJECT route");
if (sibling->fib6_metric == rt->fib6_metric && return -EINVAL;
rt6_qualify_for_ecmp(sibling)) { } else if (match->fib6_flags & RTF_REJECT) {
list_add_tail(&rt->fib6_siblings, NL_SET_ERR_MSG(extack,
&sibling->fib6_siblings); "Can not append to a REJECT route");
break; return -EINVAL;
}
sibling = rcu_dereference_protected(sibling->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
} }
rt->fib6_nsiblings = match->fib6_nsiblings;
list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
match->fib6_nsiblings++;
/* For each sibling in the list, increment the counter of /* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings * siblings. BUG() if counters does not match, list of siblings
* is broken! * is broken!
*/ */
fib6_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling, list_for_each_entry_safe(sibling, temp_sibling,
&rt->fib6_siblings, fib6_siblings) { &match->fib6_siblings, fib6_siblings) {
sibling->fib6_nsiblings++; sibling->fib6_nsiblings++;
BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
fib6_nsiblings++;
} }
BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
rt6_multipath_rebalance(temp_sibling); rt6_multipath_rebalance(match);
} }
/* /*
* insert node * insert node
*/ */
if (!replace) { if (!replace) {
enum fib_event_type event;
if (!add) if (!add)
pr_warn("NLM_F_CREATE should be set when creating new route\n"); pr_warn("NLM_F_CREATE should be set when creating new route\n");
add: add:
nlflags |= NLM_F_CREATE; nlflags |= NLM_F_CREATE;
err = call_fib6_entry_notifiers(info->nl_net, event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
FIB_EVENT_ENTRY_ADD, err = call_fib6_entry_notifiers(info->nl_net, event, rt,
rt, extack); extack);
if (err) if (err)
return err; return err;
...@@ -1086,7 +1062,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, ...@@ -1086,7 +1062,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
} }
} else { } else {
int nsiblings; struct fib6_info *tmp;
if (!found) { if (!found) {
if (add) if (add)
...@@ -1101,48 +1077,57 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, ...@@ -1101,48 +1077,57 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
if (err) if (err)
return err; return err;
/* if route being replaced has siblings, set tmp to
* last one, otherwise tmp is current route. this is
* used to set fib6_next for new route
*/
if (iter->fib6_nsiblings)
tmp = list_last_entry(&iter->fib6_siblings,
struct fib6_info,
fib6_siblings);
else
tmp = iter;
/* insert new route */
atomic_inc(&rt->fib6_ref); atomic_inc(&rt->fib6_ref);
rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(rt->fib6_node, fn);
rt->fib6_next = iter->fib6_next; rt->fib6_next = tmp->fib6_next;
rcu_assign_pointer(*ins, rt); rcu_assign_pointer(*ins, rt);
if (!info->skip_notify) if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) { if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++; info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO; fn->fn_flags |= RTN_RTINFO;
} }
nsiblings = iter->fib6_nsiblings;
iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
fib6_info_release(iter);
if (nsiblings) { /* delete old route */
rt = iter;
if (rt->fib6_nsiblings) {
struct fib6_info *tmp;
/* Replacing an ECMP route, remove all siblings */ /* Replacing an ECMP route, remove all siblings */
ins = &rt->fib6_next; list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
iter = rcu_dereference_protected(*ins, fib6_siblings) {
lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
if (iter->fib6_metric > rt->fib6_metric)
break;
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->fib6_next;
iter->fib6_node = NULL; iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net); fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter) if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL; fn->rr_ptr = NULL;
fib6_info_release(iter); fib6_info_release(iter);
nsiblings--;
rt->fib6_nsiblings--;
info->nl_net->ipv6.rt6_stats->fib_rt_entries--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
} else {
ins = &iter->fib6_next;
} }
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
}
WARN_ON(nsiblings != 0);
} }
WARN_ON(rt->fib6_nsiblings != 0);
rt->fib6_node = NULL;
fib6_purge_rt(rt, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
fib6_info_release(rt);
} }
return 0; return 0;
......
...@@ -3791,7 +3791,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) ...@@ -3791,7 +3791,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
lockdep_is_held(&rt->fib6_table->tb6_lock)); lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) { while (iter) {
if (iter->fib6_metric == rt->fib6_metric && if (iter->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(iter)) iter->fib6_nsiblings)
return iter; return iter;
iter = rcu_dereference_protected(iter->fib6_next, iter = rcu_dereference_protected(iter->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock)); lockdep_is_held(&rt->fib6_table->tb6_lock));
...@@ -4381,6 +4381,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, ...@@ -4381,6 +4381,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
*/ */
cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
NLM_F_REPLACE); NLM_F_REPLACE);
cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
nhn++; nhn++;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment