Commit 03d51c4f authored by David S. Miller

Merge branch 'Simplify-IPv4-route-offload-API'

Ido Schimmel says:

====================
Simplify IPv4 route offload API

Motivation
==========

The aim of this patch set is to simplify the IPv4 route offload API by
making the stack a bit smarter about the notifications it is generating.
This allows driver authors to focus on programming the underlying device
instead of having to duplicate the IPv4 route insertion logic in their
driver, which is error-prone.

This is the first patch set out of a series of four. Subsequent patch
sets will simplify the IPv6 API, add offload/trap indication to routes
and add tests for all the code paths (including error paths). They are
available here [1].

Details
=======

Today, whenever an IPv4 route is added or deleted, a notification is sent
in the FIB notification chain and it is up to offload drivers to decide
if the route should be programmed to the hardware or not. This is not an
easy task because, in hardware, routes are keyed by {prefix, prefix length,
table id}, whereas the kernel can store multiple such routes that only
differ in metric / TOS / nexthop info.

This series makes sure that only routes that are actually used in the
data path are notified to offload drivers. This greatly simplifies the
work these drivers need to do, as they are now only concerned with
programming the hardware and do not need to replicate the IPv4 route
insertion logic and store multiple identical routes.

The route that is notified is the first FIB alias in the FIB node with
the given {prefix, prefix length, table ID}. In case the route is
deleted and there is another route with the same key, a replace
notification is emitted. Otherwise, a delete notification is emitted.

The above means that in the case of multiple routes with the same key,
but different TOS, only the route with the highest TOS is notified.
While the kernel can route a packet based on its TOS, this is not
supported by any hardware devices I am familiar with. Moreover, this is
not supported by IPv6 nor by BIRD/FRR from what I could see. Offload
drivers should therefore use the presence of a non-zero TOS as an
indication to trap packets matching the route and let the kernel route
them instead. mlxsw has been doing it for the past two years.

Testing
=======

To ensure there is no degradation in route insertion rates, I averaged
the insertion rate of 512k routes (/24 and /32) over 50 runs. Did not
observe any degradation.

Functional tests are available here [1]. They rely on route trap
indication, which is only added in the last patch set.

In addition, I have been running syzkaller for the past week with all
four patch sets and debug options enabled. Did not observe any problems.

Patch set overview
==================

Patches #1-#8 gradually introduce the new FIB notifications
Patch #9 converts mlxsw to use the new notifications
Patch #10 converts the remaining listeners and removes the old
notifications

v2:
* Extend fib_find_alias() with another argument instead of introducing a
  new function (David Ahern)

RFC: https://patchwork.ozlabs.org/cover/1170530/

[1] https://github.com/idosch/linux/tree/fib-notifier
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 366c7bb0 446f7391
......@@ -200,8 +200,6 @@ static void mlx5_lag_fib_update(struct work_struct *work)
rtnl_lock();
switch (fib_work->event) {
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
mlx5_lag_fib_route_event(ldev, fib_work->event,
fib_work->fen_info.fi);
......@@ -259,8 +257,6 @@ static int mlx5_lag_fib_event(struct notifier_block *nb,
switch (event) {
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
fen_info = container_of(info, struct fib_entry_notifier_info,
info);
......
......@@ -3845,7 +3845,7 @@ static void mlxsw_sp_nexthop4_event(struct mlxsw_sp *mlxsw_sp,
key.fib_nh = fib_nh;
nh = mlxsw_sp_nexthop_lookup(mlxsw_sp, key);
if (WARN_ON_ONCE(!nh))
if (!nh)
return;
switch (event) {
......@@ -4780,95 +4780,6 @@ static void mlxsw_sp_fib_node_put(struct mlxsw_sp *mlxsw_sp,
mlxsw_sp_vr_put(mlxsw_sp, vr);
}
/* Find the entry in @fib_node's entry list relative to which @new4_entry
 * should be placed.
 *
 * Entries whose table ID is larger than that of @new4_entry are skipped, and
 * the walk stops at the first entry whose table ID is smaller. Among entries
 * of the same table, those with a larger TOS are skipped; the first remaining
 * entry that has a smaller TOS, or a priority value greater than or equal to
 * that of @new4_entry, is returned.
 *
 * NOTE(review): stopping at the first smaller table ID presumably relies on
 * the list being kept ordered by table ID - confirm against the callers that
 * insert into the list.
 *
 * Return: the matching entry, or NULL if no entry qualifies.
 */
static struct mlxsw_sp_fib4_entry *
mlxsw_sp_fib4_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
			      const struct mlxsw_sp_fib4_entry *new4_entry)
{
	struct mlxsw_sp_fib4_entry *iter;

	list_for_each_entry(iter, &fib_node->entry_list, common.list) {
		/* Past the last entry that could belong to the same table. */
		if (iter->tb_id < new4_entry->tb_id)
			break;

		/* Not yet at a candidate: other table or larger TOS. */
		if (iter->tb_id > new4_entry->tb_id ||
		    iter->tos > new4_entry->tos)
			continue;

		/* Same table, TOS not larger than the new entry's. */
		if (iter->tos < new4_entry->tos ||
		    iter->prio >= new4_entry->prio)
			return iter;
	}

	return NULL;
}
/* Append @new4_entry after the run of entries that share its key.
 *
 * Starting at @fib4_entry, advance over every entry whose table ID, TOS and
 * priority all equal those of @new4_entry, then link @new4_entry immediately
 * before the position where the walk stopped (the first entry that differs,
 * or the list head if the walk reached the end of the list).
 *
 * Return: 0 on success, -EINVAL (with a warning) if @fib4_entry is NULL.
 */
static int
mlxsw_sp_fib4_node_list_append(struct mlxsw_sp_fib4_entry *fib4_entry,
			       struct mlxsw_sp_fib4_entry *new4_entry)
{
	struct list_head *new_link = &new4_entry->common.list;
	struct mlxsw_sp_fib4_entry *pos = fib4_entry;
	struct mlxsw_sp_fib_node *node;

	if (WARN_ON(!pos))
		return -EINVAL;

	node = pos->common.fib_node;
	list_for_each_entry_from(pos, &node->entry_list, common.list) {
		bool same_key = pos->tb_id == new4_entry->tb_id &&
				pos->tos == new4_entry->tos &&
				pos->prio == new4_entry->prio;

		if (!same_key)
			break;
	}

	/* list_add_tail() links the new node right before the given node. */
	list_add_tail(new_link, &pos->common.list);

	return 0;
}
/* Link @new4_entry into the entry list of its FIB node.
 *
 * The reference entry is looked up with mlxsw_sp_fib4_node_entry_find().
 * - @append: the work is delegated to mlxsw_sp_fib4_node_list_append().
 * - @replace: a reference entry must exist; otherwise a warning is emitted
 *   and -EINVAL is returned.
 * - Reference entry found: the new entry is linked right before it, so that
 *   a replaced entry ends up directly after the new one and can be removed
 *   later.
 * - No reference entry: the new entry is linked after the last leading entry
 *   whose table ID is not smaller than the new entry's, or at the front of
 *   the list if there is no such entry.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int
mlxsw_sp_fib4_node_list_insert(struct mlxsw_sp_fib4_entry *new4_entry,
			       bool replace, bool append)
{
	struct mlxsw_sp_fib_node *fib_node = new4_entry->common.fib_node;
	struct list_head *new_link = &new4_entry->common.list;
	struct mlxsw_sp_fib4_entry *match, *iter;
	struct list_head *prev;

	match = mlxsw_sp_fib4_node_entry_find(fib_node, new4_entry);

	if (append)
		return mlxsw_sp_fib4_node_list_append(match, new4_entry);

	if (replace && WARN_ON(!match))
		return -EINVAL;

	if (match) {
		/* New entry goes in front of the one it precedes/replaces. */
		list_add_tail(new_link, &match->common.list);
		return 0;
	}

	/* Default to the list head, i.e. insertion at the front. */
	prev = &fib_node->entry_list;
	list_for_each_entry(iter, &fib_node->entry_list, common.list) {
		if (new4_entry->tb_id > iter->tb_id)
			break;
		prev = &iter->common.list;
	}
	list_add(new_link, prev);

	return 0;
}
/* Unlink @fib4_entry from the entry list it is currently linked into. */
static void
mlxsw_sp_fib4_node_list_remove(struct mlxsw_sp_fib4_entry *fib4_entry)
{
	struct list_head *link = &fib4_entry->common.list;

	list_del(link);
}
static int mlxsw_sp_fib_node_entry_add(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_fib_entry *fib_entry)
{
......@@ -4912,14 +4823,12 @@ static void mlxsw_sp_fib_node_entry_del(struct mlxsw_sp *mlxsw_sp,
}
static int mlxsw_sp_fib4_node_entry_link(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_fib4_entry *fib4_entry,
bool replace, bool append)
struct mlxsw_sp_fib4_entry *fib4_entry)
{
struct mlxsw_sp_fib_node *fib_node = fib4_entry->common.fib_node;
int err;
err = mlxsw_sp_fib4_node_list_insert(fib4_entry, replace, append);
if (err)
return err;
list_add(&fib4_entry->common.list, &fib_node->entry_list);
err = mlxsw_sp_fib_node_entry_add(mlxsw_sp, &fib4_entry->common);
if (err)
......@@ -4928,7 +4837,7 @@ static int mlxsw_sp_fib4_node_entry_link(struct mlxsw_sp *mlxsw_sp,
return 0;
err_fib_node_entry_add:
mlxsw_sp_fib4_node_list_remove(fib4_entry);
list_del(&fib4_entry->common.list);
return err;
}
......@@ -4937,20 +4846,19 @@ mlxsw_sp_fib4_node_entry_unlink(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_fib4_entry *fib4_entry)
{
mlxsw_sp_fib_node_entry_del(mlxsw_sp, &fib4_entry->common);
mlxsw_sp_fib4_node_list_remove(fib4_entry);
list_del(&fib4_entry->common.list);
if (fib4_entry->common.type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP)
mlxsw_sp_fib_entry_decap_fini(mlxsw_sp, &fib4_entry->common);
}
static void mlxsw_sp_fib4_entry_replace(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_fib4_entry *fib4_entry,
bool replace)
struct mlxsw_sp_fib4_entry *fib4_entry)
{
struct mlxsw_sp_fib_node *fib_node = fib4_entry->common.fib_node;
struct mlxsw_sp_fib4_entry *replaced;
if (!replace)
if (list_is_singular(&fib_node->entry_list))
return;
/* We inserted the new entry before replaced one */
......@@ -4962,9 +4870,8 @@ static void mlxsw_sp_fib4_entry_replace(struct mlxsw_sp *mlxsw_sp,
}
static int
mlxsw_sp_router_fib4_add(struct mlxsw_sp *mlxsw_sp,
const struct fib_entry_notifier_info *fen_info,
bool replace, bool append)
mlxsw_sp_router_fib4_replace(struct mlxsw_sp *mlxsw_sp,
const struct fib_entry_notifier_info *fen_info)
{
struct mlxsw_sp_fib4_entry *fib4_entry;
struct mlxsw_sp_fib_node *fib_node;
......@@ -4989,14 +4896,13 @@ mlxsw_sp_router_fib4_add(struct mlxsw_sp *mlxsw_sp,
goto err_fib4_entry_create;
}
err = mlxsw_sp_fib4_node_entry_link(mlxsw_sp, fib4_entry, replace,
append);
err = mlxsw_sp_fib4_node_entry_link(mlxsw_sp, fib4_entry);
if (err) {
dev_warn(mlxsw_sp->bus_info->dev, "Failed to link FIB entry to node\n");
goto err_fib4_node_entry_link;
}
mlxsw_sp_fib4_entry_replace(mlxsw_sp, fib4_entry, replace);
mlxsw_sp_fib4_entry_replace(mlxsw_sp, fib4_entry);
return 0;
......@@ -6094,7 +6000,6 @@ static void mlxsw_sp_router_fib4_event_work(struct work_struct *work)
struct mlxsw_sp_fib_event_work *fib_work =
container_of(work, struct mlxsw_sp_fib_event_work, work);
struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
bool replace, append;
int err;
/* Protect internal structures from changes */
......@@ -6102,13 +6007,9 @@ static void mlxsw_sp_router_fib4_event_work(struct work_struct *work)
mlxsw_sp_span_respin(mlxsw_sp);
switch (fib_work->event) {
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD:
replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
append = fib_work->event == FIB_EVENT_ENTRY_APPEND;
err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info,
replace, append);
case FIB_EVENT_ENTRY_REPLACE:
err = mlxsw_sp_router_fib4_replace(mlxsw_sp,
&fib_work->fen_info);
if (err)
mlxsw_sp_router_fib_abort(mlxsw_sp);
fib_info_put(fib_work->fen_info.fi);
......@@ -6211,8 +6112,6 @@ static void mlxsw_sp_router_fib4_event(struct mlxsw_sp_fib_event_work *fib_work,
switch (fib_work->event) {
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
fen_info = container_of(info, struct fib_entry_notifier_info,
info);
......@@ -6343,9 +6242,8 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
err = mlxsw_sp_router_fib_rule_event(event, info,
router->mlxsw_sp);
return notifier_from_errno(err);
case FIB_EVENT_ENTRY_ADD:
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_REPLACE:
if (router->aborted) {
NL_SET_ERR_MSG_MOD(info->extack, "FIB offload was aborted. Not configuring route");
return notifier_from_errno(-EINVAL);
......
......@@ -2159,7 +2159,7 @@ static void rocker_router_fib_event_work(struct work_struct *work)
/* Protect internal structures from changes */
rtnl_lock();
switch (fib_work->event) {
case FIB_EVENT_ENTRY_ADD:
case FIB_EVENT_ENTRY_REPLACE:
err = rocker_world_fib4_add(rocker, &fib_work->fen_info);
if (err)
rocker_world_fib4_abort(rocker);
......@@ -2201,7 +2201,7 @@ static int rocker_router_fib_event(struct notifier_block *nb,
fib_work->event = event;
switch (event) {
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_DEL:
if (info->family == AF_INET) {
struct fib_entry_notifier_info *fen_info = ptr;
......
......@@ -177,10 +177,10 @@ static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event,
event == FIB_EVENT_RULE_ADD);
break;
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
err = nsim_fib_event(data, info,
event == FIB_EVENT_ENTRY_ADD);
err = nsim_fib_event(data, info, event != FIB_EVENT_ENTRY_DEL);
break;
}
......
......@@ -980,9 +980,12 @@ static struct key_vector *fib_find_node(struct trie *t,
/* Return the first fib alias matching TOS with
* priority less than or equal to PRIO.
* If 'find_first' is set, return the first matching
* fib alias, regardless of TOS and priority.
*/
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
u8 tos, u32 prio, u32 tb_id)
u8 tos, u32 prio, u32 tb_id,
bool find_first)
{
struct fib_alias *fa;
......@@ -998,6 +1001,8 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
continue;
if (fa->tb_id != tb_id)
break;
if (find_first)
return fa;
if (fa->fa_tos > tos)
continue;
if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
......@@ -1063,9 +1068,6 @@ static int fib_insert_node(struct trie *t, struct key_vector *tp,
return -ENOMEM;
}
/* fib notifier for ADD is sent before calling fib_insert_alias with
* the expectation that the only possible failure ENOMEM
*/
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *new,
struct fib_alias *fa, t_key key)
......@@ -1118,11 +1120,13 @@ static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
return true;
}
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *old);
/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
{
enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
......@@ -1149,7 +1153,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
l = fib_find_node(t, &tp, key);
fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
tb->tb_id) : NULL;
tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
......@@ -1217,12 +1221,17 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
err = call_fib_entry_notifiers(net,
FIB_EVENT_ENTRY_REPLACE,
key, plen, new_fa,
extack);
if (err)
goto out_free_new_fa;
if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
tb->tb_id, true) == fa) {
enum fib_event_type fib_event;
fib_event = FIB_EVENT_ENTRY_REPLACE;
err = call_fib_entry_notifiers(net, fib_event,
key, plen,
new_fa, extack);
if (err)
goto out_free_new_fa;
}
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
......@@ -1244,12 +1253,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (fa_match)
goto out;
if (cfg->fc_nlflags & NLM_F_APPEND) {
event = FIB_EVENT_ENTRY_APPEND;
if (cfg->fc_nlflags & NLM_F_APPEND)
nlflags |= NLM_F_APPEND;
} else {
else
fa = fa_first;
}
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
......@@ -1269,14 +1276,26 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
if (err)
goto out_free_new_fa;
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
goto out_fib_notif;
goto out_free_new_fa;
/* The alias was already inserted, so the node must exist. */
l = l ? l : fib_find_node(t, &tp, key);
if (WARN_ON_ONCE(!l))
goto out_free_new_fa;
if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
new_fa) {
enum fib_event_type fib_event;
fib_event = FIB_EVENT_ENTRY_REPLACE;
err = call_fib_entry_notifiers(net, fib_event, key, plen,
new_fa, extack);
if (err)
goto out_remove_new_fa;
}
if (!plen)
tb->tb_num_default++;
......@@ -1287,14 +1306,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
succeeded:
return 0;
out_fib_notif:
/* notifier was sent that entry would be added to trie, but
* the add failed and need to recover. Only failure for
* fib_insert_alias is ENOMEM.
*/
NL_SET_ERR_MSG(extack, "Failed to insert route into trie");
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key,
plen, new_fa, NULL);
out_remove_new_fa:
fib_remove_alias(t, tp, l, new_fa);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
......@@ -1545,6 +1558,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
node_pull_suffix(tp, fa->fa_slen);
}
/* Emit the FIB notification that corresponds to deleting @fa_to_delete.
 *
 * Nothing is sent unless @fa_to_delete is the alias that fib_find_alias()
 * returns in 'find_first' mode for its suffix length and table ID within
 * @fah. When it is, the alias that follows it in the list is examined:
 * if that alias has the same suffix length and table ID, a
 * FIB_EVENT_ENTRY_REPLACE notification is sent for it; otherwise a
 * FIB_EVENT_ENTRY_DEL notification is sent for @fa_to_delete itself.
 *
 * The return value of call_fib_entry_notifiers() is not checked.
 */
static void fib_notify_alias_delete(struct net *net, u32 key,
				    struct hlist_head *fah,
				    struct fib_alias *fa_to_delete,
				    struct netlink_ext_ack *extack)
{
	/* Assume a plain delete until a successor with the same key is found. */
	enum fib_event_type fib_event = FIB_EVENT_ENTRY_DEL;
	struct fib_alias *notify_fa = fa_to_delete;
	u32 tb_id = fa_to_delete->tb_id;
	u8 slen = fa_to_delete->fa_slen;
	struct fib_alias *successor;

	/* Only the first alias for this key is of interest to listeners. */
	if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
		return;

	/* If the next alias shares the key, it takes over: notify a replace. */
	successor = hlist_entry_safe(fa_to_delete->fa_list.next,
				     struct fib_alias, fa_list);
	if (successor && successor->fa_slen == slen &&
	    successor->tb_id == tb_id) {
		fib_event = FIB_EVENT_ENTRY_REPLACE;
		notify_fa = successor;
	}

	call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
				 notify_fa, extack);
}
/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
......@@ -1566,7 +1609,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!l)
return -ESRCH;
fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
......@@ -1598,8 +1641,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!fa_to_delete)
return -ESRCH;
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
fa_to_delete, extack);
fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
......@@ -1923,10 +1965,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
continue;
}
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
n->key,
KEYLENGTH - fa->fa_slen, fa,
NULL);
fib_notify_alias_delete(net, n->key, &n->leaf, fa,
NULL);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
......@@ -2022,6 +2062,7 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
struct netlink_ext_ack *extack)
{
struct fib_alias *fa;
int last_slen = -1;
int err;
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
......@@ -2036,8 +2077,12 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
if (tb->tb_id != fa->tb_id)
continue;
err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key,
KEYLENGTH - fa->fa_slen,
if (fa->fa_slen == last_slen)
continue;
last_slen = fa->fa_slen;
err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
l->key, KEYLENGTH - fa->fa_slen,
fa, extack);
if (err)
return err;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment