Commit 7d30a7f6 authored by David S. Miller's avatar David S. Miller

Merge branch 'ipv6-avoid-taking-refcnt-on-dst-during-route-lookup'

Wei Wang says:

====================
ipv6: avoid taking refcnt on dst during route lookup

Ipv6 route lookup code always grabs refcnt on the dst for the caller.
But for certain cases, grabbing refcnt is not always necessary if the
call path is rcu protected and the caller does not cache the dst.
Another issue in the route lookup logic is:
When there are multiple custom rules, we have to do the lookup into
each table associated to each rule individually. And when we can't
find the route in one table, we grab and release refcnt on
net->ipv6.ip6_null_entry before going to the next table.
This operation is completely redundant, and causes false issue because
net->ipv6.ip6_null_entry is a shared object.

This patch set introduces a new flag RT6_LOOKUP_F_DST_NOREF for route
lookup callers to set, to avoid any manipulation on the dst refcnt. And
it converts the major input and output path to use it.

The performance gain is noticable.
I ran synflood tests between 2 hosts under the same switch. Both hosts
have 20G mlx NIC, and 8 tx/rx queues.
Sender sends pure SYN flood with random src IPs and ports using trafgen.
Receiver has a simple TCP listener on the target port.
Both hosts have multiple custom rules:
- For incoming packets, only local table is traversed.
- For outgoing packets, 3 tables are traversed to find the route.
The packet processing rate on the receiver is as follows:
- Before the fix: 3.78Mpps
- After the fix:  5.50Mpps

v2->v3:
- Handled fib6_rule_lookup() when CONFIG_IPV6_MULTIPLE_TABLES is not
  configured in patch 03 (suggested by David Ahern)
- Removed the renaming of l3mdev_link_scope_lookup() in patch 05
  (suggested by David Ahern)
- Moved definition of ip6_route_output_flags() from an inline function
  in /net/ipv6/route.c to net/ipv6/route.c in order to address kbuild
  error in patch 05

v1->v2:
- Added a helper ip6_rt_put_flags() in patch 3 suggested by David Miller
====================
Reviewed-by: default avatarDavid Ahern <dsahern@gmail.com>
Reviewed-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 8c25c0cb 7d9e5f42
......@@ -1072,12 +1072,14 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
#if IS_ENABLED(CONFIG_IPV6)
/* send to link-local or multicast address via interface enslaved to
* VRF device. Force lookup to VRF table without changing flow struct
* Note: Caller to this function must hold rcu_read_lock() and no refcnt
* is taken on the dst by this function.
*/
static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
struct flowi6 *fl6)
{
struct net *net = dev_net(dev);
int flags = RT6_LOOKUP_F_IFACE;
int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF;
struct dst_entry *dst = NULL;
struct rt6_info *rt;
......@@ -1087,7 +1089,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
*/
if (fl6->flowi6_oif == dev->ifindex) {
dst = &net->ipv6.ip6_null_entry->dst;
dst_hold(dst);
return dst;
}
......
......@@ -36,6 +36,7 @@ struct route_info {
#define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010
#define RT6_LOOKUP_F_SRCPREF_COA 0x00000020
#define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040
#define RT6_LOOKUP_F_DST_NOREF 0x00000080
/* We do not (yet ?) support IPv6 jumbograms (RFC 2675)
* Unlike IPv4, hdr->seg_len doesn't include the IPv6 header
......@@ -83,6 +84,10 @@ struct dst_entry *ip6_route_input_lookup(struct net *net,
struct flowi6 *fl6,
const struct sk_buff *skb, int flags);
struct dst_entry *ip6_route_output_flags_noref(struct net *net,
const struct sock *sk,
struct flowi6 *fl6, int flags);
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
struct flowi6 *fl6, int flags);
......@@ -93,6 +98,16 @@ static inline struct dst_entry *ip6_route_output(struct net *net,
return ip6_route_output_flags(net, sk, fl6, 0);
}
/* Only conditionally release dst if flags indicates
* !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list.
*/
static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
{
if (!(flags & RT6_LOOKUP_F_DST_NOREF) ||
!list_empty(&rt->rt6i_uncached))
ip6_rt_put(rt);
}
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb, int flags);
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
......
......@@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
return &rt->dst;
ip6_rt_put(rt);
ip6_rt_put_flags(rt, flags);
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error != -EAGAIN)
return &rt->dst;
ip6_rt_put(rt);
ip6_rt_put_flags(rt, flags);
}
dst_hold(&net->ipv6.ip6_null_entry->dst);
if (!(flags & RT6_LOOKUP_F_DST_NOREF))
dst_hold(&net->ipv6.ip6_null_entry->dst);
return &net->ipv6.ip6_null_entry->dst;
}
......@@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
goto out;
}
again:
ip6_rt_put(rt);
ip6_rt_put_flags(rt, flags);
err = -EAGAIN;
rt = NULL;
goto out;
discard_pkt:
dst_hold(&rt->dst);
if (!(flags & RT6_LOOKUP_F_DST_NOREF))
dst_hold(&rt->dst);
out:
res->rt6 = rt;
return err;
......
......@@ -316,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error == -EAGAIN) {
ip6_rt_put(rt);
ip6_rt_put_flags(rt, flags);
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
if (!(flags | RT6_LOOKUP_F_DST_NOREF))
dst_hold(&rt->dst);
}
return &rt->dst;
......
......@@ -1391,9 +1391,6 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
if (pcpu_rt)
ip6_hold_safe(NULL, &pcpu_rt);
return pcpu_rt;
}
......@@ -1403,12 +1400,9 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net,
struct rt6_info *pcpu_rt, *prev, **p;
pcpu_rt = ip6_rt_pcpu_alloc(res);
if (!pcpu_rt) {
dst_hold(&net->ipv6.ip6_null_entry->dst);
return net->ipv6.ip6_null_entry;
}
if (!pcpu_rt)
return NULL;
dst_hold(&pcpu_rt->dst);
p = this_cpu_ptr(res->nh->rt6i_pcpu);
prev = cmpxchg(p, NULL, pcpu_rt);
BUG_ON(prev);
......@@ -2189,9 +2183,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
const struct sk_buff *skb, int flags)
{
struct fib6_result res = {};
struct rt6_info *rt;
struct rt6_info *rt = NULL;
int strict = 0;
WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
!rcu_read_lock_held());
strict |= flags & RT6_LOOKUP_F_IFACE;
strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
if (net->ipv6.devconf_all->forwarding == 0)
......@@ -2200,23 +2197,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rcu_read_lock();
fib6_table_lookup(net, table, oif, fl6, &res, strict);
if (res.f6i == net->ipv6.fib6_null_entry) {
rt = net->ipv6.ip6_null_entry;
rcu_read_unlock();
dst_hold(&rt->dst);
return rt;
}
if (res.f6i == net->ipv6.fib6_null_entry)
goto out;
fib6_select_path(net, &res, fl6, oif, false, skb, strict);
/*Search through exception table */
rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
if (ip6_hold_safe(net, &rt))
dst_use_noref(&rt->dst, jiffies);
rcu_read_unlock();
return rt;
goto out;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!res.nh->fib_nh_gw_family)) {
/* Create a RTF_CACHE clone which will not be
......@@ -2224,40 +2213,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
struct rt6_info *uncached_rt;
uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
rcu_read_unlock();
if (uncached_rt) {
/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
* No need for another dst_hold()
if (rt) {
/* 1 refcnt is taken during ip6_rt_cache_alloc().
* As rt6_uncached_list_add() does not consume refcnt,
* this refcnt is always returned to the caller even
* if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
rt6_uncached_list_add(uncached_rt);
rt6_uncached_list_add(rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
} else {
uncached_rt = net->ipv6.ip6_null_entry;
dst_hold(&uncached_rt->dst);
}
rcu_read_unlock();
return uncached_rt;
return rt;
}
} else {
/* Get a percpu copy */
struct rt6_info *pcpu_rt;
local_bh_disable();
pcpu_rt = rt6_get_pcpu_route(&res);
rt = rt6_get_pcpu_route(&res);
if (!pcpu_rt)
pcpu_rt = rt6_make_pcpu_route(net, &res);
if (!rt)
rt = rt6_make_pcpu_route(net, &res);
local_bh_enable();
rcu_read_unlock();
return pcpu_rt;
}
out:
if (!rt)
rt = net->ipv6.ip6_null_entry;
if (!(flags & RT6_LOOKUP_F_DST_NOREF))
ip6_hold_safe(net, &rt);
rcu_read_unlock();
return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
......@@ -2388,11 +2375,12 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
return mhash >> 1;
}
/* Called with rcu held */
void ip6_route_input(struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
int flags = RT6_LOOKUP_F_HAS_SADDR;
int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
.flowi6_iif = skb->dev->ifindex,
......@@ -2414,8 +2402,8 @@ void ip6_route_input(struct sk_buff *skb)
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
skb_dst_drop(skb);
skb_dst_set(skb,
ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
&fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
......@@ -2427,8 +2415,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net,
return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
struct flowi6 *fl6, int flags)
struct dst_entry *ip6_route_output_flags_noref(struct net *net,
const struct sock *sk,
struct flowi6 *fl6, int flags)
{
bool any_src;
......@@ -2436,6 +2425,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
/* This function does not take refcnt on the dst */
dst = l3mdev_link_scope_lookup(net, fl6);
if (dst)
return dst;
......@@ -2443,6 +2433,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
fl6->flowi6_iif = LOOPBACK_IFINDEX;
flags |= RT6_LOOKUP_F_DST_NOREF;
any_src = ipv6_addr_any(&fl6->saddr);
if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
(fl6->flowi6_oif && any_src))
......@@ -2455,6 +2446,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
struct dst_entry *ip6_route_output_flags(struct net *net,
const struct sock *sk,
struct flowi6 *fl6,
int flags)
{
struct dst_entry *dst;
struct rt6_info *rt6;
rcu_read_lock();
dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
rt6 = (struct rt6_info *)dst;
/* For dst cached in uncached_list, refcnt is already taken. */
if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
dst = &net->ipv6.ip6_null_entry->dst;
dst_hold(dst);
}
rcu_read_unlock();
return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
......@@ -6029,6 +6042,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
ip6_template_metrics, true);
INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.fib6_has_custom_rules = false;
......@@ -6040,6 +6054,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
ip6_template_metrics, true);
INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
......@@ -6049,6 +6064,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
#endif
net->ipv6.sysctl.flush_delay = 0;
......
......@@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
* local and multicast addresses
* @net: network namespace for device index lookup
* @fl6: IPv6 flow struct for lookup
* This function does not hold refcnt on the returned dst.
* Caller must hold rcu_read_lock().
*/
struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
......@@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
struct dst_entry *dst = NULL;
struct net_device *dev;
WARN_ON_ONCE(!rcu_read_lock_held());
if (fl6->flowi6_oif) {
rcu_read_lock();
dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
if (dev && netif_is_l3_slave(dev))
dev = netdev_master_upper_dev_get_rcu(dev);
......@@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
if (dev && netif_is_l3_master(dev) &&
dev->l3mdev_ops->l3mdev_link_scope_lookup)
dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6);
rcu_read_unlock();
}
return dst;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment