Commit a023f92b authored by Kirill Smelkov's avatar Kirill Smelkov

X net/ipv6: Catching unreachable routes in cache ...

For now could get under helloweb loaded from several places via wrk e.g.

[ 2223.320576] +cU 2001067c1254000e0010000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000001 00000000 01200200       lo
[ 2223.320601]  . U 2001067c125400000000000000000000 30 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00200200       lo

That means that

    unreachable 2001:67c:1254:e:10::1/128 cached

was cloned from

    unreachable 2001:67c:1254::/48

in time when

    2001:67c:1254:e:10::/80

was absent.

But that unreachable route in cache was removed pretty fast, when route
for 2001:67c:1254:e:10::/80 appeared again.

Still watching...
parent a11efd7b
......@@ -188,9 +188,9 @@ static inline void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
struct dst_entry *new = (struct dst_entry *) from;
rt->rt6i_flags &= ~RTF_EXPIRES;
rt->rt6i_flags &= ~RTF_EXPIRES; // XXX cache always not expires ?
dst_hold(new);
rt->dst.from = new;
rt->dst.from = new; // XXX try to track rt->dst.from in monitor dump - to know from which route this was cloned
}
static inline void ip6_rt_put(struct rt6_info *rt)
......
......@@ -86,6 +86,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
struct fib6_rule *r = (struct fib6_rule *)rule;
/*
* XXX is it relevant?
* If we need to find a source address for this traffic,
* we check the result if it meets requirement of the rule.
*/
......
......@@ -81,7 +81,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w);
* A routing update causes an increase of the serial number on the
* affected subtree. This allows for cached routes to be asynchronously
* tested when modifications are made to the destination cache as a
* result of redirects, path MTU changes, etc.
* result of redirects, path MTU changes, etc. XXX NOTE!
*/
static __u32 rt_sernum;
......@@ -460,6 +460,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
* Exact match ?
*/
// NOTE relookup entry, when entry is already "same" _daddr_
// (i.e. it could be unreachable in fib, but tryung to add as reachable -> BUG)
// XXX ^^^ wrong - here we only locate fib6_node place - content is set by caller
if (plen == fn->fn_bit) {
/* clean up an intermediate node */
if (!(fn->fn_flags & RTN_RTINFO)) {
......
......@@ -903,6 +903,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
return rt;
}
// NOTE here
static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
const struct in6_addr *daddr)
{
......@@ -946,7 +947,7 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
else if (!(rt->dst.flags & DST_HOST))
nrt = rt6_alloc_clone(rt, &fl6->daddr);
nrt = rt6_alloc_clone(rt, &fl6->daddr); // <-- NOTE here
else
goto out2;
......@@ -955,7 +956,7 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
dst_hold(&rt->dst);
if (nrt) {
err = ip6_ins_rt(nrt);
err = ip6_ins_rt(nrt); // <-- NOTE here cloned route inserted back
if (!err)
goto out2;
}
......@@ -1160,6 +1161,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
}
}
// NOTE
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
int oif, u32 mark)
{
......@@ -2781,6 +2783,55 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh)
return err;
}
static void rt6_print(const struct rt6_info *rt)
{
// TODO?
// dst CHECK
// table (rt6i_table->tb6_id)
// prefsrc rt->rt6i_prefsrc
// protocol rt->rt6i_protocol
static const struct in6_addr in6_zero = {};
char rttype, rtcached;
rtcached = rt->rt6i_flags & RTF_CACHE ? 'c' : ' ';
// see rt6_fill_node()
rttype = '?';
if (rt->rt6i_flags & RTF_REJECT) {
switch(rt->dst.error) {
case -EINVAL:
rttype = 'B'; // rtm->rtm_type = RTN_BLACKHOLE;
break;
case -EACCES:
rttype = 'P'; // rtm->rtm_type = RTN_PROHIBIT;
break;
case -EAGAIN:
rttype = 'T'; // rtm->rtm_type = RTN_THROW;
break;
default:
rttype = 'U'; // rtm->rtm_type = RTN_UNREACHABLE;
break;
}
}
else if (rt->rt6i_flags & RTF_LOCAL)
rttype = 'l'; // rtm->rtm_type = RTN_LOCAL;
else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
rttype = 'l'; // rtm->rtm_type = RTN_LOCAL;
else
rttype = ' '; // rtm->rtm_type = RTN_UNICAST;
printk("%c%c %pi6 %02x %pi6 %02x %pi6 %08x %08x %08x %08x %8s",
rtcached, rttype,
&rt->rt6i_dst.addr, rt->rt6i_dst.plen,
&rt->rt6i_src.addr, rt->rt6i_src.plen,
(rt->rt6i_flags & RTF_GATEWAY ? &rt->rt6i_gateway : &in6_zero),
rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
rt->dst.__use, rt->rt6i_flags,
rt->dst.dev ? rt->dst.dev->name : ""
);
}
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
{
struct sk_buff *skb;
......@@ -2788,6 +2839,33 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
u32 seq;
int err;
if (event == RTM_NEWROUTE || event == RTM_DELROUTE) {
struct rt6_info *rt_from;
char adddel = (event == RTM_NEWROUTE ? '+' : '-');
if (!(rt->rt6i_flags & RTF_CACHE)) goto rtnotrace; // only cached
if (!(rt->rt6i_flags & RTF_REJECT)) goto rtnotrace; // only unreachable
// show route
printk(KERN_ERR "%c", adddel);
rt6_print(rt);
printk("\n");
// show route this route is cached from
rt_from = rt;
while (rt_from && rt_from->rt6i_flags & RTF_CACHE) {
rt_from = (struct rt6_info *)rt_from->dst.from;
printk(KERN_ERR " .");
if (rt_from)
rt6_print(rt_from);
else
printk("nil");
printk("\n");
}
}
rtnotrace:
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
......
  • /cc @jm

  • Great. This will surely be more useful than from userland.

    in time when 2001:67c:1254:e:10::/80 was absent.

    It should have never been absent. Well, maybe in this case, there was a down hop, but we first care about issues that happen when babeld just switches routes from one hop to another.

  • @jm, thanks for feedback.

    It should have never been absent. Well, maybe in this case, there was a down hop, but we first care about issues that happen when babeld just switches routes from one hop to another.

    Current state, judging by babeld source code, is babel first removes route and then adds updated one:

    https://lab.nexedi.com/nexedi/babeld/blob/v1.6.2-nxd1/kernel_netlink.c#L982

    however, even if we change this to replacing 1 route at a time atomically, with e.g. NLM_F_REPLACE flag we cannot avoid having a route being non-missing in all cases. For example a route can be removed by babel, because it thinks the route is lost, and reintroduced later after some time.

    So my idea is, since we cannot do atomic routes update all the time in principle, we should not even try.

    The problem it seems is the kernel somehow does not remove cache route which was cloned from upper-level unreachable route, or something like that. I have not looked into details yet, but to me it is, let's say 90% clear, this is just a mistake in cache invalidation.

    I've started to collect the traces for my helloweb@mini service being tortured here:

    https://lab.nexedi.com/kirr/mini-helloweb

    dmesg already have something interesting, and full ip monitor (with rtcache info) is there too.

    I have not looked into details yet. I gave you access so you can see it too.

    Thanks again for feedback,
    Kirill

  • /cc @Nicolas

  • For example a route can be removed by babel, because it thinks the route is lost, and reintroduced later after some time.

    I know that. It's normal to get ENETUNREACH errors in this case. In all other cases, it's not acceptable. Having 503 errors just because babeld switches route is not. Really, NLM_F_REPLACE must be used when possible.

  • NLM_F_REPLACE won't help: If babel for some reason removes a route, and a cached route gets created from other unreachable route, and persists in cache (since it is used all the time by clients e.g. trying to reconnect) and is not removed from cache when proper route comes up - we still have the problem.

    We have this problem now, just occuring a bit more "frequently" and we won't completely kill it with NLM_F_REPLACE.

    The bug has to be dealt with in kernel - by properly invalidating RT cache nodes upon route additions, or by not using RT cache at all (which more or less started to happen with Facebook patches).

  • Ok, it's true there are 2 issues:

    • 1 in the kernel with invalid route in the cache
    • 1 in babel that does not use NLM_F_REPLACE

    Both must be fixed. I'm sure the 2nd one would fix more failures in our network because we first care about services that are up all the time, with very rare issues in the underlying network issues (between frontends and backends), and for which we sets up direct tunnels in re6stnet conf.

    Edited by Julien Muchembled
  • ok, this your statement I more or less agree:

    1. bug in the kernel is there
    2. babel can be optimised to use NLM_F_REPLACE for modifying routes

    though 2 would be nice to do, it is still an optimisation, and the kernel must be working correctly without it. I'm saying this because we have default route which should backup when another one is missing.

    In fact for more reliably catching the kernel issue, I think I will add a sleep(1) to babel code in between DELROUTE and ADDROUTE, so the probability to hit the issue is higher.

    P.S.

    FYI, our default route is source-specific, e.g.

    default from 2001:67c:1254::/48 via fe80::44ce:aeff:fe61:75de dev re6stnet3  proto babel  metric 1024  pref medium

    and there is a bug in kernels < 4.1 when default source-specific route is not used on connect & friends, so eventhough there is a default route it is not used and clients get unreachable result. Details:

    https://git.kernel.org/linus/e16e888b

  • we have default route which should backup when another one is missing

    we don't, otherwise we'd never have such unreachable routes in cache

    It's not just an optimization. We can still get occasional 503 if we only fix the kernel. And it does happen often because that was almost always the source cause of issues between frontends and lab.

    In fact for more reliably catching the kernel issue, I think I will add a sleep(1) to babel code in between DELROUTE and ADDROUTE, so the probability to hit the issue is higher.

    Good idea.

    https://git.kernel.org/linus/e16e888b

    Great. I saw this recently. But that only affects re6st nodes without --default option, to connect outside our network.

    Edited by Julien Muchembled
  • we don't, otherwise we'd never have such unreachable routes in cache

    We do have them:

    [2016-05-20T22:11:54.576731] [ROUTE]+ default from 2001:67c:1254::/48 via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet8  proto babel  metric 1024 

    and from ip monitor logs I can prove the default route is all there setup well (e.g. as above) when sticky unreachable cache entry is born, like e.g.

    [2016-05-20T22:11:54.576731] [ROUTE]+ default from 2001:67c:1254::/48 via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet8  proto babel  metric 1024 
    
    ... a lot, but default is not changed
    
    [2016-05-20T22:32:32.532745] [RTCAH]- 2001:67c:1254:e:10::1 via fe80::f485:c1ff:feb4:f32a dev re6stnet-tcp  src 2001:67c:1254:e:63::1  metric 0 \    cache  users 3 used 2372
    [2016-05-20T22:32:32.532755] [ROUTE]- 2001:67c:1254:e:10::/80 via fe80::f485:c1ff:feb4:f32a dev re6stnet-tcp  proto babel  src 2001:67c:1254:e:63::1  metric 1024 
    [2016-05-20T22:32:32.532757] [ROUTE]+ 2001:67c:1254:e:10::/80 via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet8  proto babel  src 2001:67c:1254:e:63::1  metric 1024 
    
    # sticky unreachable in cache for several minutes
    [2016-05-20T22:32:32.532758] [RTCAH]+ unreachable 2001:67c:1254:e:10::1 dev lo  metric 0 \    cache  error -101 users 1
    
    ... a lot, but default is not changed
    
    [2016-05-20T22:35:27.441827] [RTCAH]- unreachable 2001:67c:1254:e:10::1 dev lo  metric 0 \    cache  error -101 used 281 age 2sec
    [2016-05-20T22:35:27.441834] [ROUTE]- 2001:67c:1254:e:10::/80 via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet8  proto babel  src 2001:67c:1254:e:63::1  metric 1024 
    [2016-05-20T22:35:27.441835] [ROUTE]+ 2001:67c:1254:e:10::/80 via fe80::f485:c1ff:feb4:f32a dev re6stnet-tcp  proto babel  src 2001:67c:1254:e:63::1  metric 1024 
    [2016-05-20T22:35:28.613731] [RTCAH]+ 2001:67c:1254:e:10::1 via fe80::f485:c1ff:feb4:f32a dev re6stnet-tcp  src 2001:67c:1254:e:63::1  metric 0 \    cache  users 1
    

    also: if there is no default route and route lookup just fails to find appropriate FIB node (compared to e.g. finding unreachable FIB node) no cache entry is created. A cloned route (= cached in other speak) always has rt->dst.from set non NULL.

    So unreachable route cache entries are always created from some existing FIB nodes, not e.g. due to absent default route.

    That's also why in tracing I added showing of from which actual route a cloned route was created. I have not looked into details yet, but it is not from default, as the patch message shows.

    So it's not just an optimization. We can still get occasional 503 if we only fix the kernel. And it does happen often because that was almost always the source cause of issues between frontends and lab.

    We get 503 only if a backend or frontend has sticky unreachable route in their route cache, despite they have good reachable non-cached route to each other. All the outages so far was of this kind. So from this point of view, if we get unreachable entry in cache for some small subsecond time and then it is correctly adjusted to reachable entry, because on route modification babel adds new route to the destination, it should all just work. The worst what can happen is for that small amount of time for one or two connection it gets "packet lost" with small probability, and on next resend try it is all up again rigth after the (microsecond time) change. So it should not be 503 because of how TCP works (imho, 90% sure). I agree though via using atomic route change we can remove this potential "packet lost for small time" issue, but this problem is several orders of magnitude less important compared to sticky unreachable cache entry case.

    https://git.kernel.org/linus/e16e888b

    Great. I saw this recently. But that only affects re6st nodes without --default option, to connect outside our network.

    On lab backend we have default in re6stnet.conf. Then

    $ ip -6 r |grep default
    default from 2001:67c:1254::/48 via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet8  proto 42  metric 1024
    
    $ host www.google.ru
    www.google.ru has address 216.58.209.3
    www.google.ru has IPv6 address 2a00:1450:4007:80c::2003
    
    $ telnet 2a00:1450:4007:80c::2003 80
    Trying 2a00:1450:4007:80c::2003...
    telnet: Unable to connect to remote host: Network is unreachable
    
    # NOTE unreachable returned is *not* cached
    $ ip -6 r get 2a00:1450:4007:80c::2003
    unreachable 2a00:1450:4007:80c::2003 from :: dev lo  table unspec  proto kernel  src 2001:67c:1254:e:63::1  metric 4294967295  error -101
    
    $ ip -6 -o r get 2a00:1450:4007:80c::2003 from 2001:67c:1254::1
    2a00:1450:4007:80c::2003 from 2001:67c:1254::1 via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet8  src 2001:67c:1254:e:63::1  metric 0 \    cache 

    With the patch it would telnet ok. So to me it is for those who have --default also.

    Also, if we have some address on our network for which there is no route, there is

    $ ip -6 r |grep unreachable
    unreachable 2001:67c:1254:e:63::/80 dev lo  proto kernel  metric 256  error -101
    unreachable 2001:67c:1254::/48 dev lo  metric 1024  error -101   <-- HERE

    so yes, default is not taken here, but unreachable 2001:67c:1254::/48 is. Actually in a lot of cases this is the "normal" route unreachable cached entries are cloned from (as they should).

    The open question so far is why in some of the cases they remain sticky there in cache live even after corresponding reachable route is added back.

  • What I meant is that we don't fallback on the default route. During a route switch (for a destination inside our network), ie between DEL & ADD, it always fallbacks on the unreachable /48 route, which masks the default.

    On lab backend we have default in re6stnet.conf.

    Are you sure ? Your default route would be different if it was (with a preferred source address, instead of being source-specific).

    root@COMP-2468:~# grep 'default\|table 0' /etc/re6stnet/re6stnet.conf
    table 0
    root@COMP-2468:~# ip -6 r show default
    default via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet2  proto 42  src 2001:67c:1254:e:3e::1  metric 1024 
    jm@ns309975:~$ grep 'default\|table 0' /etc/re6stnet/re6stnet.conf
    jm@ns309975:~$ ip -6 r show default
    default from 2001:67c:1254::/48 via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet2  proto 42  metric 1024 
    default via 2001:41d0:2:8eff:ff:ff:ff:ff dev eth0  metric 1024 
  • What I meant is that we don't fallback on the default route. During a route switch (for a destination inside our network), ie between DEL & ADD, it always fallbacks on the unreachable /48 route, which masks the default.

    Yes, I meant that too.

    Are you sure ? Your default route would be different if it was (with a preferred source address, instead of being source-specific).

    Yes, I'm sure:

    $ hostname
    ns365282.ip-188-165-221.eu
    $ grep default /etc/re6stnet/re6stnet.conf 
    default
    $ ip -6 -o -s -s -s -d r |grep default
    unicast default from 2001:67c:1254::/48 via fe80::a8b6:97ff:fe99:8ca8 dev re6stnet8  proto 42  scope global  metric 1024 
  • Are you sure ? Your default route would be different if it was (with a preferred source address, instead of being source-specific).

    Yes, I'm sure:

    Though please note: I do not have root on the backend. Maybe the machine admin edited re6stnet.conf adding default, but did not restarted re6stnet service.

    @rafael, is it maybe the case?

  • @rafael, on lab backend babeld was started Feb24. The last time re6stnet conf was edited is later:

    $ ll /etc/re6stnet/re6stnet.conf 
    -rw-r--r-- 1 root root 292 Apr 22 11:00 /etc/re6stnet/re6stnet.conf

    do you recall about what was the changes? Is there maybe etckeeper installed we can see the diffs? After disussing @jm says babeld command line does not match current re6stnet config.

  • So it should not be 503 because of how TCP works (imho, 90% sure).

    I tested and any packet other than SYN is only lost. There remains the case of establishing a connection, which always fails immediately if there's no route. Even an unreachable route on a intermediate node rejects SYN packets with 'destination unreachable' (icmp6).

    Edited by Julien Muchembled
  • ok, thanks for clarifying this. So we will have 503 for connections trying to be established in the exactly that microsecond time frame, but again good after trying to reconnect.

    From this point of view it is good to fix this. Also using one replace-route netlink call uses less system resources compared to del and then add route.

    I think regarding this we can already file an issue or a patch to babeld.

Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment