Commit 69248719 authored by David S. Miller

Merge branch 'fib-notifier-event-replay'

Jiri Pirko says:

====================
ipv4: fib: Replay events when registering FIB notifier

Ido says:

In kernel 4.9 the switchdev-specific FIB offload mechanism was replaced
by a new FIB notification chain to which modules could register in order
to be notified about the addition and deletion of FIB entries. The
motivation for this change was that switchdev drivers need to be able to
reflect the entire FIB table and not only FIBs configured on top of the
port netdevs themselves, which is useful, for example, for in-band management.

The fundamental problem with this approach is that upon registration
listeners lose all the information previously sent in the chain and
thus have an incomplete view of the FIB tables, which can result in
packet loss. This patchset fixes that by dumping the FIB tables and
replaying notifications previously sent in the chain for the registered
notification block.

The entire dump process is done under RCU and thus the FIB notification
chain is converted to be atomic. The listeners are modified accordingly.
This is done in the first eight patches.
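
Since the chain is now atomic, the notifier callback runs under
rcu_read_lock() and may not sleep, so listeners copy the notification
data, take a reference on the fib_info and defer the actual processing
to an ordered workqueue. Below is a minimal sketch of that pattern (the
my_drv names and the my_drv_owq workqueue are hypothetical placeholders;
the real mlxsw and rocker conversions appear in the diff further down):

#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <net/ip_fib.h>

static struct workqueue_struct *my_drv_owq;	/* ordered, allocated at probe time */

struct my_drv_fib_work {
	struct work_struct work;
	struct fib_entry_notifier_info fen_info;
	unsigned long event;
};

/* Runs from the ordered workqueue in process context. */
static void my_drv_fib_work_fn(struct work_struct *work)
{
	struct my_drv_fib_work *fib_work =
		container_of(work, struct my_drv_fib_work, work);

	rtnl_lock();
	/* ... program or remove the route in the device here ... */
	rtnl_unlock();
	fib_info_put(fib_work->fen_info.fi);
	kfree(fib_work);
}

/* Notifier callback; atomic context, must not sleep. */
static int my_drv_fib_event(struct notifier_block *nb,
			    unsigned long event, void *ptr)
{
	struct my_drv_fib_work *fib_work;

	/* For brevity, handle route add/del only; rule events carry just
	 * a struct fib_notifier_info.
	 */
	if (event != FIB_EVENT_ENTRY_ADD && event != FIB_EVENT_ENTRY_DEL)
		return NOTIFY_DONE;

	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
	if (!fib_work)
		return NOTIFY_BAD;

	INIT_WORK(&fib_work->work, my_drv_fib_work_fn);
	fib_work->event = event;
	memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
	/* Keep the fib_info alive until the work item has run. */
	fib_info_hold(fib_work->fen_info.fi);

	queue_work(my_drv_owq, &fib_work->work);
	return NOTIFY_DONE;
}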

The ninth patch adds a change sequence counter to ensure the integrity
of the FIB dump. The last patch adds the dump itself to the FIB chain
registration function and modifies existing listeners to pass a callback
to be executed in case the dump was inconsistent.
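
Continuing the sketch above, registration on the listener side becomes a
single call that also performs the dump; if the routing tables changed
underneath the dump, the supplied callback is invoked so the listener can
flush its state before the dump is retried, and register_fib_notifier()
returns -EBUSY once the fixed retry limit is exhausted:

static void my_drv_fib_dump_flush(struct notifier_block *nb)
{
	/* The dump was inconsistent and will be retried: drop everything
	 * learned so far so that the replayed notifications start from a
	 * clean slate.
	 */
	flush_workqueue(my_drv_owq);
	/* ... flush the device's FIB tables here ... */
}

static struct notifier_block my_drv_fib_nb = {
	.notifier_call = my_drv_fib_event,
};

static int my_drv_router_init(void)
{
	/* Also replays FIB_EVENT_RULE_ADD and FIB_EVENT_ENTRY_ADD for all
	 * rules and routes that already exist.
	 */
	return register_fib_notifier(&my_drv_fib_nb, my_drv_fib_dump_flush);
}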

---
v3->v4:
- Register the notification block after the dump and protect it using
  the change sequence counter (Hannes Frederic Sowa).
- Since we now integrate the dump into the registration function, drop
  the sysctl to set the maximum number of retries and instead set it to a
  fixed number. Let's see if it's really a problem before adding something
  we can never remove.
- For the same reason, dump FIB tables for all net namespaces.
- Add a comment regarding guarantees provided by mutex semantics.

v2->v3:
- Add sysctl to set the number of FIB dump retries (Hannes Frederic Sowa).
- Read the sequence counter under RTNL to ensure synchronization
  between the dump process and other processes changing the routing
  tables (Hannes Frederic Sowa).
- Pass a callback to the dump function to be executed prior to a retry.
- Limit the dump to a single net namespace.

v1->v2:
- Add a sequence counter to ensure the integrity of the FIB dump
  (David S. Miller, Hannes Frederic Sowa).
- Protect notifications from re-ordering in listeners by using an
  ordered workqueue (Hannes Frederic Sowa).
- Introduce fib_info_hold() (Jiri Pirko).
- Relieve rocker from the need to invoke the FIB dump by registering
  to the FIB notification chain prior to ports creation.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 548ed722 c3852ef7
......@@ -77,6 +77,7 @@ static const char mlxsw_core_driver_name[] = "mlxsw_core";
static struct dentry *mlxsw_core_dbg_root;
static struct workqueue_struct *mlxsw_wq;
static struct workqueue_struct *mlxsw_owq;
struct mlxsw_core_pcpu_stats {
u64 trap_rx_packets[MLXSW_TRAP_ID_MAX];
......@@ -1900,6 +1901,18 @@ int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay)
}
EXPORT_SYMBOL(mlxsw_core_schedule_dw);
int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay)
{
return queue_delayed_work(mlxsw_owq, dwork, delay);
}
EXPORT_SYMBOL(mlxsw_core_schedule_odw);
void mlxsw_core_flush_owq(void)
{
flush_workqueue(mlxsw_owq);
}
EXPORT_SYMBOL(mlxsw_core_flush_owq);
static int __init mlxsw_core_module_init(void)
{
int err;
......@@ -1907,6 +1920,12 @@ static int __init mlxsw_core_module_init(void)
mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_MEM_RECLAIM, 0);
if (!mlxsw_wq)
return -ENOMEM;
mlxsw_owq = alloc_ordered_workqueue("%s_ordered", WQ_MEM_RECLAIM,
mlxsw_core_driver_name);
if (!mlxsw_owq) {
err = -ENOMEM;
goto err_alloc_ordered_workqueue;
}
mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL);
if (!mlxsw_core_dbg_root) {
err = -ENOMEM;
......@@ -1915,6 +1934,8 @@ static int __init mlxsw_core_module_init(void)
return 0;
err_debugfs_create_dir:
destroy_workqueue(mlxsw_owq);
err_alloc_ordered_workqueue:
destroy_workqueue(mlxsw_wq);
return err;
}
......@@ -1922,6 +1943,7 @@ static int __init mlxsw_core_module_init(void)
static void __exit mlxsw_core_module_exit(void)
{
debugfs_remove_recursive(mlxsw_core_dbg_root);
destroy_workqueue(mlxsw_owq);
destroy_workqueue(mlxsw_wq);
}
......
......@@ -207,6 +207,8 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
u8 local_port);
int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay);
void mlxsw_core_flush_owq(void);
#define MLXSW_CONFIG_PROFILE_SWID_COUNT 8
......
......@@ -593,6 +593,14 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp);
static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
{
/* At this stage we're guaranteed not to have new incoming
* FIB notifications and the work queue is free from FIBs
* sitting on top of mlxsw netdevs. However, we can still
* have other FIBs queued. Flush the queue before flushing
* the device's tables. No need for locks, as we're the only
* writer.
*/
mlxsw_core_flush_owq();
mlxsw_sp_router_fib_flush(mlxsw_sp);
kfree(mlxsw_sp->router.vrs);
}
......@@ -1948,33 +1956,89 @@ static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
kfree(mlxsw_sp->rifs);
}
static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
unsigned long event, void *ptr)
struct mlxsw_sp_fib_event_work {
struct delayed_work dw;
struct fib_entry_notifier_info fen_info;
struct mlxsw_sp *mlxsw_sp;
unsigned long event;
};
static void mlxsw_sp_router_fib_event_work(struct work_struct *work)
{
struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
struct fib_entry_notifier_info *fen_info = ptr;
struct mlxsw_sp_fib_event_work *fib_work =
container_of(work, struct mlxsw_sp_fib_event_work, dw.work);
struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
int err;
if (!net_eq(fen_info->info.net, &init_net))
return NOTIFY_DONE;
switch (event) {
/* Protect internal structures from changes */
rtnl_lock();
switch (fib_work->event) {
case FIB_EVENT_ENTRY_ADD:
err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info);
err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info);
if (err)
mlxsw_sp_router_fib4_abort(mlxsw_sp);
fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_ENTRY_DEL:
mlxsw_sp_router_fib4_del(mlxsw_sp, fen_info);
mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info);
fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_RULE_ADD: /* fall through */
case FIB_EVENT_RULE_DEL:
mlxsw_sp_router_fib4_abort(mlxsw_sp);
break;
}
rtnl_unlock();
kfree(fib_work);
}
/* Called with rcu_read_lock() */
static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
struct mlxsw_sp_fib_event_work *fib_work;
struct fib_notifier_info *info = ptr;
if (!net_eq(info->net, &init_net))
return NOTIFY_DONE;
fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
if (WARN_ON(!fib_work))
return NOTIFY_BAD;
INIT_DELAYED_WORK(&fib_work->dw, mlxsw_sp_router_fib_event_work);
fib_work->mlxsw_sp = mlxsw_sp;
fib_work->event = event;
switch (event) {
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
/* Take reference on fib_info to prevent it from being
* freed while work is queued. Release it afterwards.
*/
fib_info_hold(fib_work->fen_info.fi);
break;
}
mlxsw_core_schedule_odw(&fib_work->dw, 0);
return NOTIFY_DONE;
}
static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
{
struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
/* Flush pending FIB notifications and then flush the device's
* table before requesting another dump. The FIB notification
* block is unregistered, so no need to take RTNL.
*/
mlxsw_core_flush_owq();
mlxsw_sp_router_fib_flush(mlxsw_sp);
}
int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
{
int err;
......@@ -1995,9 +2059,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
goto err_neigh_init;
mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
register_fib_notifier(&mlxsw_sp->fib_nb);
err = register_fib_notifier(&mlxsw_sp->fib_nb,
mlxsw_sp_router_fib_dump_flush);
if (err)
goto err_register_fib_notifier;
return 0;
err_register_fib_notifier:
mlxsw_sp_neigh_fini(mlxsw_sp);
err_neigh_init:
mlxsw_sp_vrs_fini(mlxsw_sp);
err_vrs_init:
......
......@@ -72,6 +72,7 @@ struct rocker {
struct rocker_dma_ring_info event_ring;
struct notifier_block fib_nb;
struct rocker_world_ops *wops;
struct workqueue_struct *rocker_owq;
void *wpriv;
};
......
......@@ -28,6 +28,7 @@
#include <linux/if_bridge.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
#include <linux/workqueue.h>
#include <net/switchdev.h>
#include <net/rtnetlink.h>
#include <net/netevent.h>
......@@ -2165,28 +2166,70 @@ static const struct switchdev_ops rocker_port_switchdev_ops = {
.switchdev_port_obj_dump = rocker_port_obj_dump,
};
static int rocker_router_fib_event(struct notifier_block *nb,
unsigned long event, void *ptr)
struct rocker_fib_event_work {
struct work_struct work;
struct fib_entry_notifier_info fen_info;
struct rocker *rocker;
unsigned long event;
};
static void rocker_router_fib_event_work(struct work_struct *work)
{
struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
struct fib_entry_notifier_info *fen_info = ptr;
struct rocker_fib_event_work *fib_work =
container_of(work, struct rocker_fib_event_work, work);
struct rocker *rocker = fib_work->rocker;
int err;
switch (event) {
/* Protect internal structures from changes */
rtnl_lock();
switch (fib_work->event) {
case FIB_EVENT_ENTRY_ADD:
err = rocker_world_fib4_add(rocker, fen_info);
err = rocker_world_fib4_add(rocker, &fib_work->fen_info);
if (err)
rocker_world_fib4_abort(rocker);
else
fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_ENTRY_DEL:
rocker_world_fib4_del(rocker, fen_info);
rocker_world_fib4_del(rocker, &fib_work->fen_info);
fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_RULE_ADD: /* fall through */
case FIB_EVENT_RULE_DEL:
rocker_world_fib4_abort(rocker);
break;
}
rtnl_unlock();
kfree(fib_work);
}
/* Called with rcu_read_lock() */
static int rocker_router_fib_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
struct rocker_fib_event_work *fib_work;
fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
if (WARN_ON(!fib_work))
return NOTIFY_BAD;
INIT_WORK(&fib_work->work, rocker_router_fib_event_work);
fib_work->rocker = rocker;
fib_work->event = event;
switch (event) {
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
/* Take reference on fib_info to prevent it from being
* freed while work is queued. Release it afterwards.
*/
fib_info_hold(fib_work->fen_info.fi);
break;
}
queue_work(rocker->rocker_owq, &fib_work->work);
return NOTIFY_DONE;
}
......@@ -2754,6 +2797,21 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_request_event_irq;
}
rocker->rocker_owq = alloc_ordered_workqueue(rocker_driver_name,
WQ_MEM_RECLAIM);
if (!rocker->rocker_owq) {
err = -ENOMEM;
goto err_alloc_ordered_workqueue;
}
/* Only FIBs pointing to our own netdevs are programmed into
* the device, so no need to pass a callback.
*/
rocker->fib_nb.notifier_call = rocker_router_fib_event;
err = register_fib_notifier(&rocker->fib_nb, NULL);
if (err)
goto err_register_fib_notifier;
rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
err = rocker_probe_ports(rocker);
......@@ -2762,15 +2820,16 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_probe_ports;
}
rocker->fib_nb.notifier_call = rocker_router_fib_event;
register_fib_notifier(&rocker->fib_nb);
dev_info(&pdev->dev, "Rocker switch with id %*phN\n",
(int)sizeof(rocker->hw.id), &rocker->hw.id);
return 0;
err_probe_ports:
unregister_fib_notifier(&rocker->fib_nb);
err_register_fib_notifier:
destroy_workqueue(rocker->rocker_owq);
err_alloc_ordered_workqueue:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
err_request_event_irq:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
......@@ -2796,9 +2855,10 @@ static void rocker_remove(struct pci_dev *pdev)
{
struct rocker *rocker = pci_get_drvdata(pdev);
rocker_remove_ports(rocker);
unregister_fib_notifier(&rocker->fib_nb);
rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
rocker_remove_ports(rocker);
destroy_workqueue(rocker->rocker_owq);
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
rocker_dma_rings_fini(rocker);
......
......@@ -2516,6 +2516,7 @@ static void ofdpa_fini(struct rocker *rocker)
int bkt;
del_timer_sync(&ofdpa->fdb_cleanup_timer);
flush_workqueue(rocker->rocker_owq);
spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags);
hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry)
......
......@@ -221,7 +221,8 @@ enum fib_event_type {
FIB_EVENT_RULE_DEL,
};
int register_fib_notifier(struct notifier_block *nb);
int register_fib_notifier(struct notifier_block *nb,
void (*cb)(struct notifier_block *nb));
int unregister_fib_notifier(struct notifier_block *nb);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info);
......@@ -397,6 +398,11 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
void free_fib_info(struct fib_info *fi);
static inline void fib_info_hold(struct fib_info *fi)
{
atomic_inc(&fi->fib_clntref);
}
static inline void fib_info_put(struct fib_info *fi)
{
if (atomic_dec_and_test(&fi->fib_clntref))
......
......@@ -135,6 +135,9 @@ struct netns_ipv4 {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int sysctl_fib_multipath_use_neigh;
#endif
unsigned int fib_seq; /* protected by rtnl_mutex */
atomic_t rt_genid;
};
#endif
......@@ -1219,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
net->ipv4.fib_seq = 0;
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
......
......@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
#endif
call_rcu(&fi->rcu, free_fib_info_rcu);
}
EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
......
......@@ -84,25 +84,114 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
static BLOCKING_NOTIFIER_HEAD(fib_chain);
static unsigned int fib_seq_sum(void)
{
unsigned int fib_seq = 0;
struct net *net;
rtnl_lock();
for_each_net(net)
fib_seq += net->ipv4.fib_seq;
rtnl_unlock();
return fib_seq;
}
static ATOMIC_NOTIFIER_HEAD(fib_chain);
int register_fib_notifier(struct notifier_block *nb)
static int call_fib_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
return blocking_notifier_chain_register(&fib_chain, nb);
info->net = net;
return nb->notifier_call(nb, event_type, info);
}
static void fib_rules_notify(struct net *net, struct notifier_block *nb,
enum fib_event_type event_type)
{
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_notifier_info info;
if (net->ipv4.fib_has_custom_rules)
call_fib_notifier(nb, net, event_type, &info);
#endif
}
static void fib_notify(struct net *net, struct notifier_block *nb,
enum fib_event_type event_type);
static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_info *fi,
u8 tos, u8 type, u32 tb_id, u32 nlflags)
{
struct fib_entry_notifier_info info = {
.dst = dst,
.dst_len = dst_len,
.fi = fi,
.tos = tos,
.type = type,
.tb_id = tb_id,
.nlflags = nlflags,
};
return call_fib_notifier(nb, net, event_type, &info.info);
}
static bool fib_dump_is_consistent(struct notifier_block *nb,
void (*cb)(struct notifier_block *nb),
unsigned int fib_seq)
{
atomic_notifier_chain_register(&fib_chain, nb);
if (fib_seq == fib_seq_sum())
return true;
atomic_notifier_chain_unregister(&fib_chain, nb);
if (cb)
cb(nb);
return false;
}
#define FIB_DUMP_MAX_RETRIES 5
int register_fib_notifier(struct notifier_block *nb,
void (*cb)(struct notifier_block *nb))
{
int retries = 0;
do {
unsigned int fib_seq = fib_seq_sum();
struct net *net;
/* Mutex semantics guarantee that every change done to
* FIB tries before we read the change sequence counter
* is now visible to us.
*/
rcu_read_lock();
for_each_net_rcu(net) {
fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
}
rcu_read_unlock();
if (fib_dump_is_consistent(nb, cb, fib_seq))
return 0;
} while (++retries < FIB_DUMP_MAX_RETRIES);
return -EBUSY;
}
EXPORT_SYMBOL(register_fib_notifier);
int unregister_fib_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&fib_chain, nb);
return atomic_notifier_chain_unregister(&fib_chain, nb);
}
EXPORT_SYMBOL(unregister_fib_notifier);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info)
{
net->ipv4.fib_seq++;
info->net = net;
return blocking_notifier_call_chain(&fib_chain, event_type, info);
return atomic_notifier_call_chain(&fib_chain, event_type, info);
}
static int call_fib_entry_notifiers(struct net *net,
......@@ -1901,6 +1990,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
return found;
}
static void fib_leaf_notify(struct net *net, struct key_vector *l,
struct fib_table *tb, struct notifier_block *nb,
enum fib_event_type event_type)
{
struct fib_alias *fa;
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
if (!fi)
continue;
/* local and main table can share the same trie,
* so don't notify twice for the same entry.
*/
if (tb->tb_id != fa->tb_id)
continue;
call_fib_entry_notifier(nb, net, event_type, l->key,
KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
fa->fa_type, fa->tb_id, 0);
}
}
static void fib_table_notify(struct net *net, struct fib_table *tb,
struct notifier_block *nb,
enum fib_event_type event_type)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
t_key key = 0;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
fib_leaf_notify(net, l, tb, nb, event_type);
key = l->key + 1;
/* stop in case of wrap around */
if (key < l->key)
break;
}
}
static void fib_notify(struct net *net, struct notifier_block *nb,
enum fib_event_type event_type)
{
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist)
fib_table_notify(net, tb, nb, event_type);
}
}
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);
......