Commit 4001f1f0 authored by David S. Miller

Merge branch 'Support-for-fdb-ECMP-nexthop-groups'

Roopa Prabhu says:

====================
Support for fdb ECMP nexthop groups

This series introduces ecmp nexthops and nexthop groups
for mac fdb entries. In subsequent patches this is used
by the vxlan driver fdb entries. The use case is
E-VPN multihoming [1,2,3] which requires bridged vxlan traffic
to be load balanced to remote switches (vteps) belonging to
the same multi-homed ethernet segment (This is analogous to
a multi-homed LAG but over vxlan).

Changes include new nexthop flag NHA_FDB for nexthops
referenced by fdb entries. These nexthops only have ip.
The patches make sure that routes don't reference such nexthops.

example:
$ip nexthop add id 12 via 172.16.1.2 fdb
$ip nexthop add id 13 via 172.16.1.3 fdb
$ip nexthop add id 102 group 12/13 fdb

$bridge fdb add 02:02:00:00:00:13 dev vxlan1000 nhid 102 self

[1] E-VPN https://tools.ietf.org/html/rfc7432
[2] E-VPN VxLAN: https://tools.ietf.org/html/rfc8365
[3] LPC talk with mention of nexthop groups for L2 ecmp
http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf

v4 -
    - fix error path free_skb in vxlan_xmit_nh
    - fix atomic notifier initialization issue
      (Reported-by: kernel test robot <rong.a.chen@intel.com>)
      The reported error was easy to locate and fix, but I was not
      able to re-test with the robot reproducer script due to some
      other issues with running the script on my test system.

v3 - fix wording in selftest print as pointed out by davidA

v2 -
	- dropped nikolays fixes for nexthop multipath null pointer deref
	  (he will send those separately)
	- added negative tests for route add with fdb nexthop + a few more
	- Fixes for a few  fdb replace conditions found during more testing
	- Moved to rcu_dereference_rtnl in vxlan_fdb_info and consolidate rcu
	  dereferences
	- Fixes to build failures Reported-by: kbuild test robot <lkp@intel.com>
	- DavidA, I am going to send a separate patch for the neighbor code validation
	  for NDA_NH_ID if that's ok.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 7b1b843a 0534c548
This diff is collapsed.
...@@ -65,6 +65,7 @@ struct fib6_config { ...@@ -65,6 +65,7 @@ struct fib6_config {
struct nl_info fc_nlinfo; struct nl_info fc_nlinfo;
struct nlattr *fc_encap; struct nlattr *fc_encap;
u16 fc_encap_type; u16 fc_encap_type;
bool fc_is_fdb;
}; };
struct fib6_node { struct fib6_node {
......
...@@ -14,5 +14,6 @@ struct netns_nexthop { ...@@ -14,5 +14,6 @@ struct netns_nexthop {
unsigned int seq; /* protected by rtnl_mutex */ unsigned int seq; /* protected by rtnl_mutex */
u32 last_id_allocated; u32 last_id_allocated;
struct atomic_notifier_head notifier_chain;
}; };
#endif #endif
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#define __LINUX_NEXTHOP_H #define __LINUX_NEXTHOP_H
#include <linux/netdevice.h> #include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/route.h> #include <linux/route.h>
#include <linux/types.h> #include <linux/types.h>
#include <net/ip_fib.h> #include <net/ip_fib.h>
...@@ -26,6 +27,7 @@ struct nh_config { ...@@ -26,6 +27,7 @@ struct nh_config {
u8 nh_family; u8 nh_family;
u8 nh_protocol; u8 nh_protocol;
u8 nh_blackhole; u8 nh_blackhole;
u8 nh_fdb;
u32 nh_flags; u32 nh_flags;
int nh_ifindex; int nh_ifindex;
...@@ -52,6 +54,7 @@ struct nh_info { ...@@ -52,6 +54,7 @@ struct nh_info {
u8 family; u8 family;
bool reject_nh; bool reject_nh;
bool fdb_nh;
union { union {
struct fib_nh_common fib_nhc; struct fib_nh_common fib_nhc;
...@@ -80,6 +83,7 @@ struct nexthop { ...@@ -80,6 +83,7 @@ struct nexthop {
struct rb_node rb_node; /* entry on netns rbtree */ struct rb_node rb_node; /* entry on netns rbtree */
struct list_head fi_list; /* v4 entries using nh */ struct list_head fi_list; /* v4 entries using nh */
struct list_head f6i_list; /* v6 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */
struct list_head fdb_list; /* fdb entries using this nh */
struct list_head grp_list; /* nh group entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */
struct net *net; struct net *net;
...@@ -88,6 +92,7 @@ struct nexthop { ...@@ -88,6 +92,7 @@ struct nexthop {
u8 protocol; /* app managing this nh */ u8 protocol; /* app managing this nh */
u8 nh_flags; u8 nh_flags;
bool is_group; bool is_group;
bool is_fdb_nh;
refcount_t refcnt; refcount_t refcnt;
struct rcu_head rcu; struct rcu_head rcu;
...@@ -98,6 +103,17 @@ struct nexthop { ...@@ -98,6 +103,17 @@ struct nexthop {
}; };
}; };
/* Event types handed to call_nexthop_notifier() as its event_type argument.
 * NOTE(review): presumably ADD/DEL correspond to a nexthop being created or
 * removed - confirm against the callers in the nexthop core.
 */
enum nexthop_event_type {
NEXTHOP_EVENT_ADD,
NEXTHOP_EVENT_DEL
};
/* Nexthop notifier API.
 *
 * call_nexthop_notifier() takes a notifier block, a struct net, an event type
 * and the nexthop the event is about; register/unregister take the struct net
 * and the notifier block to add or remove.
 * NOTE(review): return value semantics are not visible here - presumably 0 on
 * success and a negative errno otherwise; confirm against the definitions.
 */
int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
enum nexthop_event_type event_type,
struct nexthop *nh);
int register_nexthop_notifier(struct net *net, struct notifier_block *nb);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
/* caller is holding rcu or rtnl; no reference taken to nexthop */ /* caller is holding rcu or rtnl; no reference taken to nexthop */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id); struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
void nexthop_free_rcu(struct rcu_head *head); void nexthop_free_rcu(struct rcu_head *head);
...@@ -304,4 +320,32 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) ...@@ -304,4 +320,32 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
int nexthop_for_each_fib6_nh(struct nexthop *nh, int nexthop_for_each_fib6_nh(struct nexthop *nh,
int (*cb)(struct fib6_nh *nh, void *arg), int (*cb)(struct fib6_nh *nh, void *arg),
void *arg); void *arg);
static inline int nexthop_get_family(struct nexthop *nh)
{
struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
return nhi->family;
}
/* Return a pointer to the fib_nh_common embedded in the nh_info of @nh.
 * The nh_info pointer is read with rcu_dereference_rtnl().
 */
static inline struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh)
{
	struct nh_info *info;

	info = rcu_dereference_rtnl(nh->nh_info);

	return &info->fib_nhc;
}
/* Pick one path of @nh with nexthop_select_path(@nh, @hash) and return the
 * fib_nh_common of the selected nexthop, or NULL when no path was selected.
 * The selected nexthop's nh_info is read with rcu_dereference().
 */
static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh,
							    int hash)
{
	struct nexthop *selected = nexthop_select_path(nh, hash);
	struct nh_info *info;

	if (unlikely(!selected))
		return NULL;

	info = rcu_dereference(selected->nh_info);

	return &info->fib_nhc;
}
#endif #endif
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <net/dst_metadata.h> #include <net/dst_metadata.h>
#include <net/rtnetlink.h> #include <net/rtnetlink.h>
#include <net/switchdev.h> #include <net/switchdev.h>
#include <net/nexthop.h>
#define IANA_VXLAN_UDP_PORT 4789 #define IANA_VXLAN_UDP_PORT 4789
...@@ -487,4 +488,28 @@ static inline void vxlan_flag_attr_error(int attrtype, ...@@ -487,4 +488,28 @@ static inline void vxlan_flag_attr_error(int attrtype,
#undef VXLAN_FLAG #undef VXLAN_FLAG
} }
/* Select one path of the fdb nexthop @nh using @hash and copy the gateway
 * address of that path into @rdst->remote_ip.
 *
 * Returns true when @rdst->remote_ip was filled in. Returns false when no
 * path could be selected, or when the selected path's gateway family is
 * neither AF_INET nor AF_INET6; @rdst is left untouched in both cases.
 */
static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh,
					    int hash,
					    struct vxlan_rdst *rdst)
{
	struct fib_nh_common *nhc;

	nhc = nexthop_path_fdb_result(nh, hash);
	if (unlikely(!nhc))
		return false;

	switch (nhc->nhc_gw_family) {
	case AF_INET:
		rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4;
		rdst->remote_ip.sa.sa_family = AF_INET;
		break;
	case AF_INET6:
		rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6;
		rdst->remote_ip.sa.sa_family = AF_INET6;
		break;
	default:
		/* Never report success without having written a remote
		 * address: the caller would otherwise use whatever happened
		 * to be in @rdst.
		 */
		return false;
	}

	return true;
}
#endif #endif
...@@ -29,6 +29,7 @@ enum { ...@@ -29,6 +29,7 @@ enum {
NDA_LINK_NETNSID, NDA_LINK_NETNSID,
NDA_SRC_VNI, NDA_SRC_VNI,
NDA_PROTOCOL, /* Originator of entry */ NDA_PROTOCOL, /* Originator of entry */
NDA_NH_ID,
__NDA_MAX __NDA_MAX
}; };
......
...@@ -49,6 +49,9 @@ enum { ...@@ -49,6 +49,9 @@ enum {
NHA_GROUPS, /* flag; only return nexthop groups in dump */ NHA_GROUPS, /* flag; only return nexthop groups in dump */
NHA_MASTER, /* u32; only return nexthops with given master dev */ NHA_MASTER, /* u32; only return nexthops with given master dev */
NHA_FDB, /* flag; nexthop belongs to a bridge fdb */
/* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */
__NHA_MAX, __NHA_MAX,
}; };
......
...@@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family) ...@@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family)
} }
const struct nla_policy nda_policy[NDA_MAX+1] = { const struct nla_policy nda_policy[NDA_MAX+1] = {
[NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID },
[NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) },
...@@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { ...@@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = {
[NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 },
[NDA_MASTER] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 },
[NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_PROTOCOL] = { .type = NLA_U8 },
[NDA_NH_ID] = { .type = NLA_U32 },
}; };
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
......
This diff is collapsed.
...@@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, ...@@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
#ifdef CONFIG_IPV6_ROUTER_PREF #ifdef CONFIG_IPV6_ROUTER_PREF
fib6_nh->last_probe = jiffies; fib6_nh->last_probe = jiffies;
#endif #endif
if (cfg->fc_is_fdb) {
fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
fib6_nh->fib_nh_gw_family = AF_INET6;
return 0;
}
err = -ENODEV; err = -ENODEV;
if (cfg->fc_ifindex) { if (cfg->fc_ifindex) {
......
...@@ -19,8 +19,8 @@ ret=0 ...@@ -19,8 +19,8 @@ ret=0
ksft_skip=4 ksft_skip=4
# all tests in this script. Can be overridden with -t option # all tests in this script. Can be overridden with -t option
IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode" IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode ipv4_fdb_grp_fcnal"
IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode" IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode ipv6_fdb_grp_fcnal"
ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}"
TESTS="${ALL_TESTS}" TESTS="${ALL_TESTS}"
...@@ -146,6 +146,7 @@ setup() ...@@ -146,6 +146,7 @@ setup()
create_ns remote create_ns remote
IP="ip -netns me" IP="ip -netns me"
BRIDGE="bridge -netns me"
set -e set -e
$IP li add veth1 type veth peer name veth2 $IP li add veth1 type veth peer name veth2
$IP li set veth1 up $IP li set veth1 up
...@@ -280,6 +281,161 @@ stop_ip_monitor() ...@@ -280,6 +281,161 @@ stop_ip_monitor()
return $rc return $rc
} }
# Return $ksft_skip (after printing a SKIP message) when the help output of
# "$IP nexthop" does not mention "fdb"; otherwise return success.
check_nexthop_fdb_support()
{
	if ! $IP nexthop help 2>&1 | grep -q fdb; then
		echo "SKIP: iproute2 too old, missing fdb nexthop support"
		return $ksft_skip
	fi
}
# Functional tests for IPv6 fdb nexthops and fdb nexthop groups: group
# creation/lookup, rejection of invalid attribute combinations, use of an fdb
# nexthop group from a vxlan fdb entry, and rejection of routes that point at
# fdb nexthops. Skipped when iproute2 lacks fdb nexthop support.
ipv6_fdb_grp_fcnal()
{
	echo
	echo "IPv6 fdb groups functional"
	echo "--------------------------"

	check_nexthop_fdb_support
	if [ $? -eq $ksft_skip ]; then
		return $ksft_skip
	fi

	# create group with multiple nexthops
	run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb"
	run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb"
	run_cmd "$IP nexthop add id 102 group 61/62 fdb"
	check_nexthop "id 102" "id 102 group 61/62 fdb"
	log_test $? 0 "Fdb Nexthop group with multiple nexthops"

	# get nexthop group
	run_cmd "$IP nexthop get id 102"
	check_nexthop "id 102" "id 102 group 61/62 fdb"
	log_test $? 0 "Get Fdb nexthop group by id"

	# fdb nexthop group can only contain fdb nexthops.
	# Non-fdb nexthops need an egress device; without one ids 63/64 are
	# never created and the group add would fail only because its members
	# do not exist.
	run_cmd "$IP nexthop add id 63 via 2001:db8:91::4 dev veth1"
	run_cmd "$IP nexthop add id 64 via 2001:db8:91::5 dev veth1"
	run_cmd "$IP nexthop add id 103 group 63/64 fdb"
	log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"

	# Non fdb nexthop group can not contain fdb nexthops
	run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb"
	run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb"
	run_cmd "$IP nexthop add id 104 group 65/66"
	log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"

	# fdb nexthop cannot have blackhole
	run_cmd "$IP nexthop add id 67 blackhole fdb"
	log_test $? 2 "Fdb Nexthop with blackhole"

	# fdb nexthop with oif
	run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb"
	log_test $? 2 "Fdb Nexthop with oif"

	# fdb nexthop with onlink
	run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb"
	log_test $? 2 "Fdb Nexthop with onlink"

	# fdb nexthop with encap
	run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb"
	log_test $? 2 "Fdb Nexthop with encap"

	run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
	run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
	log_test $? 0 "Fdb mac add with nexthop group"

	# fdb entries can only reference nexthop groups and not nexthops
	run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self"
	log_test $? 255 "Fdb mac add with nexthop"

	run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66"
	log_test $? 2 "Route add with fdb nexthop"

	# 102 is the fdb group that actually exists; group 103 above is
	# expected to have been rejected.
	run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 102"
	log_test $? 2 "Route add with fdb nexthop group"

	run_cmd "$IP nexthop del id 102"
	log_test $? 0 "Fdb nexthop delete"

	$IP link del dev vx10
}
# Functional tests for IPv4 fdb nexthops and fdb nexthop groups: group
# creation/lookup, rejection of invalid attribute combinations, use of an fdb
# nexthop group from a vxlan fdb entry, and rejection of routes that point at
# fdb nexthops. Skipped when iproute2 lacks fdb nexthop support.
ipv4_fdb_grp_fcnal()
{
	echo
	echo "IPv4 fdb groups functional"
	echo "--------------------------"

	check_nexthop_fdb_support
	if [ $? -eq $ksft_skip ]; then
		return $ksft_skip
	fi

	# create group with multiple nexthops
	run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb"
	run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb"
	run_cmd "$IP nexthop add id 102 group 12/13 fdb"
	check_nexthop "id 102" "id 102 group 12/13 fdb"
	log_test $? 0 "Fdb Nexthop group with multiple nexthops"

	# get nexthop group
	run_cmd "$IP nexthop get id 102"
	check_nexthop "id 102" "id 102 group 12/13 fdb"
	log_test $? 0 "Get Fdb nexthop group by id"

	# fdb nexthop group can only contain fdb nexthops.
	# Non-fdb nexthops need an egress device; without one ids 14/15 are
	# never created and the group add would fail only because its members
	# do not exist.
	run_cmd "$IP nexthop add id 14 via 172.16.1.2 dev veth1"
	run_cmd "$IP nexthop add id 15 via 172.16.1.3 dev veth1"
	run_cmd "$IP nexthop add id 103 group 14/15 fdb"
	log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"

	# Non fdb nexthop group can not contain fdb nexthops: build the group
	# from the fdb nexthops created here, not from the non-fdb ids 14/15.
	run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb"
	run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb"
	run_cmd "$IP nexthop add id 104 group 16/17"
	log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"

	# fdb nexthop cannot have blackhole
	run_cmd "$IP nexthop add id 18 blackhole fdb"
	log_test $? 2 "Fdb Nexthop with blackhole"

	# The checks below use ids that are not already taken, so a failure
	# can only come from the rejected attribute combination.

	# fdb nexthop with oif
	run_cmd "$IP nexthop add id 19 via 172.16.1.2 dev veth1 fdb"
	log_test $? 2 "Fdb Nexthop with oif"

	# fdb nexthop with onlink
	run_cmd "$IP nexthop add id 19 via 172.16.1.2 onlink fdb"
	log_test $? 2 "Fdb Nexthop with onlink"

	# fdb nexthop with encap
	run_cmd "$IP nexthop add id 20 encap mpls 101 via 172.16.1.2 dev veth1 fdb"
	log_test $? 2 "Fdb Nexthop with encap"

	run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
	run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
	log_test $? 0 "Fdb mac add with nexthop group"

	# fdb entries can only reference nexthop groups and not nexthops
	run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self"
	log_test $? 255 "Fdb mac add with nexthop"

	# 16 is an fdb nexthop (15 is not)
	run_cmd "$IP ro add 172.16.0.0/22 nhid 16"
	log_test $? 2 "Route add with fdb nexthop"

	# 102 is the fdb group that actually exists; group 103 above is
	# expected to have been rejected.
	run_cmd "$IP ro add 172.16.0.0/22 nhid 102"
	log_test $? 2 "Route add with fdb nexthop group"

	run_cmd "$IP nexthop del id 102"
	log_test $? 0 "Fdb nexthop delete"

	$IP link del dev vx10
}
################################################################################ ################################################################################
# basic operations (add, delete, replace) on nexthops and nexthop groups # basic operations (add, delete, replace) on nexthops and nexthop groups
# #
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment