Commit 3e455b7d authored by David S. Miller

Merge branch 'nexthop-API-sysctl'

Roopa Prabhu says:

====================
New sysctl to turn off nexthop API compat mode

Currently route nexthop API maintains user space compatibility
with old route API by default. Dumps and netlink notifications
support both new and old API format. In systems which have
moved to the new API, this compatibility mode cancels some
of the performance benefits provided by the new nexthop API.

This patch adds new sysctl nexthop_compat_mode which is on
by default but provides the ability to turn off compatibility
mode allowing systems to run entirely with the new routing
API if they wish to. Old route API behaviour and support is
not modified by this sysctl.

v4:
	- Use davids note for Documenting the sysctl
	- test with latest iproute2 and adjust 'pref'

v3:
	- Document new sysctl
	- move sysctl to use proc_dointvec_minmax with 0 and 1 values
	- selftest: remove pref medium in ipv6 test

v2:
       - Incorporate David Ahern's pointers on covering dumps and
         nexthop deletes. Also use one ipv4 sysctl to cover
         both ipv4 and ipv6 (I see it is done that way for many
         others)
       - Added a selftest to cover dump and notifications for nexthop
	 api compat mode
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 2ac757e4 4dddb5be
......@@ -1560,6 +1560,18 @@ skip_notify_on_dev_down - BOOLEAN
on userspace caches to track link events and evict routes.
Default: false (generate message)
nexthop_compat_mode - BOOLEAN
New nexthop API provides a means for managing nexthops independent of
prefixes. Backwards compatibility with old route format is enabled by
default which means route dumps and notifications contain the new
nexthop attribute but also the full, expanded nexthop definition.
Further, updates or deletes of a nexthop configuration generate route
notifications for each fib entry using the nexthop. Once a system
understands the new API, this sysctl can be disabled to achieve full
performance benefits of the new API by disabling the nexthop expansion
and extraneous notifications.
Default: true (backward compat mode)
IPv6 Fragmentation:
ip6frag_high_thresh - INTEGER
......
......@@ -123,7 +123,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg);
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
struct netlink_ext_ack *extack);
int ip6_ins_rt(struct net *net, struct fib6_info *f6i);
int ip6_del_rt(struct net *net, struct fib6_info *f6i);
int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify);
void rt6_flush_exceptions(struct fib6_info *f6i);
void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
......
......@@ -48,7 +48,7 @@ struct ipv6_stub {
struct netlink_ext_ack *extack);
void (*fib6_nh_release)(struct fib6_nh *fib6_nh);
void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt);
int (*ip6_del_rt)(struct net *net, struct fib6_info *rt);
int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify);
void (*fib6_rt_update)(struct net *net, struct fib6_info *rt,
struct nl_info *info);
......
......@@ -111,6 +111,8 @@ struct netns_ipv4 {
int sysctl_tcp_early_demux;
int sysctl_udp_early_demux;
int sysctl_nexthop_compat_mode;
int sysctl_fwmark_reflect;
int sysctl_tcp_fwmark_accept;
#ifdef CONFIG_NET_L3_MASTER_DEV
......
......@@ -1835,6 +1835,7 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_ip_early_demux = 1;
net->ipv4.sysctl_udp_early_demux = 1;
net->ipv4.sysctl_tcp_early_demux = 1;
net->ipv4.sysctl_nexthop_compat_mode = 1;
#ifdef CONFIG_SYSCTL
net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
#endif
......
......@@ -1780,6 +1780,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
if (nexthop_is_blackhole(fi->nh))
rtm->rtm_type = RTN_BLACKHOLE;
if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode)
goto offload;
}
if (nhs == 1) {
......@@ -1805,6 +1807,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
}
offload:
if (fri->offload)
rtm->rtm_flags |= RTM_F_OFFLOAD;
if (fri->trap)
......
......@@ -784,7 +784,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
/* __ip6_del_rt does a release, so do a hold here */
fib6_info_hold(f6i);
ipv6_stub->ip6_del_rt(net, f6i);
ipv6_stub->ip6_del_rt(net, f6i,
!net->ipv4.sysctl_nexthop_compat_mode);
}
}
......@@ -1041,7 +1042,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh,
if (!rc) {
nh_base_seq_inc(net);
nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
if (replace_notify)
if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
}
......
......@@ -710,6 +710,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_tcp_early_demux
},
{
.procname = "nexthop_compat_mode",
.data = &init_net.ipv4.sysctl_nexthop_compat_mode,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "ip_default_ttl",
.data = &init_net.ipv4.sysctl_ip_default_ttl,
......
......@@ -1238,7 +1238,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
ifp->idev->dev, 0, RTF_DEFAULT, true);
if (f6i) {
if (del_rt)
ip6_del_rt(dev_net(ifp->idev->dev), f6i);
ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
else {
if (!(f6i->fib6_flags & RTF_EXPIRES))
fib6_set_expires(f6i, expires);
......@@ -2718,7 +2718,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
if (rt) {
/* Autoconf prefix route */
if (valid_lft == 0) {
ip6_del_rt(net, rt);
ip6_del_rt(net, rt, false);
rt = NULL;
} else if (addrconf_finite_timeout(rt_expires)) {
/* not infinity */
......@@ -3813,7 +3813,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
spin_unlock_bh(&ifa->lock);
if (rt)
ip6_del_rt(net, rt);
ip6_del_rt(net, rt, false);
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
......@@ -4652,7 +4652,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;
if (f6i->fib6_metric != prio) {
/* delete old one */
ip6_del_rt(dev_net(ifp->idev->dev), f6i);
ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
/* add new one */
addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
......@@ -6073,10 +6073,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
ifp->idev->dev, 0, 0,
false);
if (rt)
ip6_del_rt(net, rt);
ip6_del_rt(net, rt, false);
}
if (ifp->rt) {
ip6_del_rt(net, ifp->rt);
ip6_del_rt(net, ifp->rt, false);
ifp->rt = NULL;
}
rt_genid_bump_ipv6(net);
......
......@@ -185,7 +185,8 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
return -EAFNOSUPPORT;
}
static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt)
static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt,
bool skip_notify)
{
return -EAFNOSUPPORT;
}
......
......@@ -364,7 +364,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
ipv6_del_acaddr_hash(aca);
addrconf_leave_solict(idev, &aca->aca_addr);
ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
aca_put(aca);
return 0;
......@@ -393,7 +393,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
addrconf_leave_solict(idev, &aca->aca_addr);
ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
aca_put(aca);
......
......@@ -1302,7 +1302,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
}
}
if (rt && lifetime == 0) {
ip6_del_rt(net, rt);
ip6_del_rt(net, rt, false);
rt = NULL;
}
......
......@@ -984,7 +984,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
gwaddr, dev);
if (rt && !lifetime) {
ip6_del_rt(net, rt);
ip6_del_rt(net, rt, false);
rt = NULL;
}
......@@ -3729,9 +3729,12 @@ static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
return err;
}
int ip6_del_rt(struct net *net, struct fib6_info *rt)
int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
{
struct nl_info info = { .nl_net = net };
struct nl_info info = {
.nl_net = net,
.skip_notify = skip_notify
};
return __ip6_del_rt(rt, &info);
}
......@@ -4252,7 +4255,7 @@ static void __rt6_purge_dflt_routers(struct net *net,
(!idev || idev->cnf.accept_ra != 2) &&
fib6_info_hold_safe(rt)) {
rcu_read_unlock();
ip6_del_rt(net, rt);
ip6_del_rt(net, rt, false);
goto restart;
}
}
......@@ -5554,7 +5557,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (nexthop_is_blackhole(rt->nh))
rtm->rtm_type = RTN_BLACKHOLE;
if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
if (net->ipv4.sysctl_nexthop_compat_mode &&
rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
goto nla_put_failure;
rtm->rtm_flags |= nh_flags;
......
......@@ -19,8 +19,8 @@ ret=0
ksft_skip=4
# all tests in this script. Can be overridden with -t option
IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime"
IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime"
IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode"
IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode"
ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}"
TESTS="${ALL_TESTS}"
......@@ -253,6 +253,33 @@ check_route6()
check_output "${out}" "${expected}"
}
# Start "$IP monitor <type>" in the background with its output captured in a
# scratch file.
#
# $1: monitor object type passed to "ip monitor" (callers here use "route")
#
# Prints "<pid> <tmpfile>" on stdout; callers hand both words to
# stop_ip_monitor.
start_ip_monitor()
{
	local mtype=$1

	# scratch file that collects everything the monitor prints
	tmpfile=$(mktemp /var/run/nexthoptestXXX)

	# run the monitor in the background; the subshell echoes its PID
	mpid=$( ($IP monitor $mtype > $tmpfile & echo $!) 2>/dev/null )

	# presumably gives the monitor time to start listening before the
	# caller triggers events
	sleep 0.2

	printf '%s %s\n' "$mpid" "$tmpfile"
}
# Stop a monitor started by start_ip_monitor and check how much it logged.
#
# $1: PID of the background monitor
# $2: file holding the monitor's captured output
# $3: expected number of output lines
#
# Returns 0 when the captured line count equals $3, non-zero otherwise.
# The capture file is removed in either case.
stop_ip_monitor()
{
	local mpid=$1
	local tmpfile=$2
	local el=$3

	# terminate the monitor before counting what it captured
	kill $mpid

	lines=$(wc -l $tmpfile | cut "-d " -f1)
	[ $lines -eq $el ]
	rc=$?

	rm -rf $tmpfile

	return $rc
}
################################################################################
# basic operations (add, delete, replace) on nexthops and nexthop groups
#
......@@ -883,6 +910,173 @@ ipv4_fcnal_runtime()
log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check"
}
# Check that the nexthop compat mode sysctl exists in the "me" namespace and
# that it currently reads 1.
#
# $1: prefix used in the logged test description (e.g. "IPv4" / "IPv6")
#
# Returns $ksft_skip when the sysctl cannot be read; otherwise returns the
# status of the final check_output call.
sysctl_nexthop_compat_mode_check()
{
	local sysctlname="net.ipv4.nexthop_compat_mode"
	local lprefix=$1

	IPE="ip netns exec me"

	# Probe for the sysctl. stdout must be redirected first and stderr
	# duplicated onto it afterwards; the reverse order leaves stderr on
	# the terminal and leaks sysctl's error message when the knob is
	# missing.
	$IPE sysctl -q $sysctlname >/dev/null 2>&1
	if [ $? -ne 0 ]; then
		echo "SKIP: kernel lacks nexthop compat mode sysctl control"
		return $ksft_skip
	fi

	out=$($IPE sysctl $sysctlname 2>/dev/null)
	log_test $? 0 "$lprefix default nexthop compat mode check"
	check_output "${out}" "$sysctlname = 1"
}
# Write a new value to the nexthop compat mode sysctl in the "me" namespace
# and check what sysctl reports back.
#
# $1: value to write
# $2: prefix used in the logged test description
#
# Returns the status of the final check_output call.
sysctl_nexthop_compat_mode_set()
{
	local sysctlname="net.ipv4.nexthop_compat_mode"
	local mode=$1
	local lprefix=$2
	local status

	IPE="ip netns exec me"

	out=$($IPE sysctl -w $sysctlname=$mode)
	status=$?

	log_test $status 0 "$lprefix set compat mode - $mode"
	# "sysctl -w" is expected to echo the "name = value" pair it applied
	check_output "${out}" "$sysctlname = $mode"
}
# IPv6 test of the nexthop API compat mode sysctl.
#
# With compat mode on, checks the route-add notification, the route dump and
# the notification produced by a nexthop group change. Then switches compat
# mode off and repeats the checks, adding a nexthop-delete check, and finally
# switches compat mode back on.
#
# Returns $ksft_skip when sysctl_nexthop_compat_mode_check reports a skip.
# Individual results are reported through log_test.
ipv6_compat_mode()
{
# NOTE(review): rc is declared but never used in this function.
local rc
echo
echo "IPv6 nexthop api compat mode test"
echo "--------------------------------"
# bail out early if the compat mode check reports a skip
sysctl_nexthop_compat_mode_check "IPv6"
if [ $? -eq $ksft_skip ]; then
return $ksft_skip
fi
# two nexthops and a group (id 122) containing both
run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
run_cmd "$IP nexthop add id 122 group 62/63"
# $ipmout holds "<pid> <tmpfile>" as printed by start_ip_monitor
ipmout=$(start_ip_monitor route)
run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122"
# route add notification should contain expanded nexthops
# (the monitor output is expected to be 3 lines long)
stop_ip_monitor $ipmout 3
log_test $? 0 "IPv6 compat mode on - route add notification"
# route dump should contain expanded nexthops
check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1"
log_test $? 0 "IPv6 compat mode on - route dump"
# change in nexthop group should generate route notification
run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
ipmout=$(start_ip_monitor route)
run_cmd "$IP nexthop replace id 122 group 62/64"
stop_ip_monitor $ipmout 3
log_test $? 0 "IPv6 compat mode on - nexthop change"
# set compat mode off
sysctl_nexthop_compat_mode_set 0 "IPv6"
# remove the route so it can be added again with compat mode off
run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122"
# NOTE(review): ids 62, 63 and 122 were added above and have not been
# deleted, so these adds presumably fail as duplicates -- confirm intent
run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
run_cmd "$IP nexthop add id 122 group 62/63"
ipmout=$(start_ip_monitor route)
run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122"
# route add notification should not contain expanded nexthops
# (the monitor output is expected to be a single line)
stop_ip_monitor $ipmout 1
log_test $? 0 "IPv6 compat mode off - route add notification"
# route dump should not contain expanded nexthops
check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium"
log_test $? 0 "IPv6 compat mode off - route dump"
# change in nexthop group should not generate route notification
run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
ipmout=$(start_ip_monitor route)
run_cmd "$IP nexthop replace id 122 group 62/64"
stop_ip_monitor $ipmout 0
log_test $? 0 "IPv6 compat mode off - nexthop change"
# nexthop delete should not generate route notification
ipmout=$(start_ip_monitor route)
run_cmd "$IP nexthop del id 122"
stop_ip_monitor $ipmout 0
log_test $? 0 "IPv6 compat mode off - nexthop delete"
# set compat mode back on
sysctl_nexthop_compat_mode_set 1 "IPv6"
}
# IPv4 test of the nexthop API compat mode sysctl.
#
# With compat mode on, checks the route-add notification, the route dump and
# the notification produced by a nexthop group change. Then switches compat
# mode off and repeats the checks, adding a nexthop-delete check, and finally
# switches compat mode back on.
#
# Returns $ksft_skip when sysctl_nexthop_compat_mode_check reports a skip.
# Individual results are reported through log_test.
ipv4_compat_mode()
{
# NOTE(review): rc is declared but never used in this function.
local rc
echo
echo "IPv4 nexthop api compat mode"
echo "----------------------------"
# bail out early if the compat mode check reports a skip
sysctl_nexthop_compat_mode_check "IPv4"
if [ $? -eq $ksft_skip ]; then
return $ksft_skip
fi
# two nexthops (both via the same gateway) and a group (id 122) of both
run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1"
run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1"
run_cmd "$IP nexthop add id 122 group 21/22"
# $ipmout holds "<pid> <tmpfile>" as printed by start_ip_monitor
ipmout=$(start_ip_monitor route)
run_cmd "$IP ro add 172.16.101.1/32 nhid 122"
stop_ip_monitor $ipmout 3
# route add notification should contain expanded nexthops
# (the monitor output is expected to be 3 lines long)
log_test $? 0 "IPv4 compat mode on - route add notification"
# route dump should contain expanded nexthops
check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
log_test $? 0 "IPv4 compat mode on - route dump"
# change in nexthop group should generate route notification
run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1"
ipmout=$(start_ip_monitor route)
run_cmd "$IP nexthop replace id 122 group 21/23"
stop_ip_monitor $ipmout 3
log_test $? 0 "IPv4 compat mode on - nexthop change"
# set compat mode off
sysctl_nexthop_compat_mode_set 0 "IPv4"
# cleanup
run_cmd "$IP ro del 172.16.101.1/32 nhid 122"
ipmout=$(start_ip_monitor route)
run_cmd "$IP ro add 172.16.101.1/32 nhid 122"
stop_ip_monitor $ipmout 1
# route add notification should not contain expanded nexthops
# (the monitor output is expected to be a single line)
log_test $? 0 "IPv4 compat mode off - route add notification"
# route dump should not contain expanded nexthops
check_route "172.16.101.1" "172.16.101.1 nhid 122"
log_test $? 0 "IPv4 compat mode off - route dump"
# change in nexthop group should not generate route notification
ipmout=$(start_ip_monitor route)
run_cmd "$IP nexthop replace id 122 group 21/22"
stop_ip_monitor $ipmout 0
log_test $? 0 "IPv4 compat mode off - nexthop change"
# nexthop delete should not generate route notification
ipmout=$(start_ip_monitor route)
run_cmd "$IP nexthop del id 122"
stop_ip_monitor $ipmout 0
log_test $? 0 "IPv4 compat mode off - nexthop delete"
# set compat mode back on
sysctl_nexthop_compat_mode_set 1 "IPv4"
}
basic()
{
echo
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment