Commit c3852ef7f2f8f75a9f85a864bec1f6f5a3068eea

Authored by Ido Schimmel
Committed by David S. Miller
1 parent cacaad11f4

ipv4: fib: Replay events when registering FIB notifier

Commit b90eb7549499 ("fib: introduce FIB notification infrastructure")
introduced a new notification chain to notify listeners (e.g., switchdev
drivers) about the addition and deletion of routes.

However, upon registration to the chain the FIB tables can already be
populated, which means potential listeners will have an incomplete view
of the tables.

Solve that by dumping the FIB tables and replaying the events to the
passed notification block. The dump itself is done using RCU in order
not to starve consumers that need RTNL to make progress.

The integrity of the dump is ensured by reading the FIB change sequence
counter before and after the dump under RTNL. This allows us to avoid
the problematic situation in which the dumping process sends an ENTRY_ADD
notification following an ENTRY_DEL generated by another process holding
RTNL.

Callers of the registration function may pass a callback that is
executed in case the dump was inconsistent with the current FIB tables.

The number of retries until a consistent dump is achieved is set to a
fixed number to prevent callers from looping for long periods of time.
In case the current limit proves to be problematic in the future, it can
easily be made configurable using a sysctl.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 4 changed files with 174 additions and 5 deletions Side-by-side Diff

drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
... ... @@ -2027,6 +2027,18 @@
2027 2027 return NOTIFY_DONE;
2028 2028 }
2029 2029  
  2030 +static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
  2031 +{
  2032 + struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
  2033 +
  2034 + /* Flush pending FIB notifications and then flush the device's
  2035 + * table before requesting another dump. The FIB notification
  2036 + * block is unregistered, so no need to take RTNL.
  2037 + */
  2038 + mlxsw_core_flush_owq();
  2039 + mlxsw_sp_router_fib_flush(mlxsw_sp);
  2040 +}
  2041 +
2030 2042 int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
2031 2043 {
2032 2044 int err;
2033 2045  
... ... @@ -2047,9 +2059,15 @@
2047 2059 goto err_neigh_init;
2048 2060  
2049 2061 mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
2050   - register_fib_notifier(&mlxsw_sp->fib_nb);
  2062 + err = register_fib_notifier(&mlxsw_sp->fib_nb,
  2063 + mlxsw_sp_router_fib_dump_flush);
  2064 + if (err)
  2065 + goto err_register_fib_notifier;
  2066 +
2051 2067 return 0;
2052 2068  
  2069 +err_register_fib_notifier:
  2070 + mlxsw_sp_neigh_fini(mlxsw_sp);
2053 2071 err_neigh_init:
2054 2072 mlxsw_sp_vrs_fini(mlxsw_sp);
2055 2073 err_vrs_init:
drivers/net/ethernet/rocker/rocker_main.c
... ... @@ -2804,8 +2804,13 @@
2804 2804 goto err_alloc_ordered_workqueue;
2805 2805 }
2806 2806  
  2807 + /* Only FIBs pointing to our own netdevs are programmed into
  2808 + * the device, so no need to pass a callback.
  2809 + */
2807 2810 rocker->fib_nb.notifier_call = rocker_router_fib_event;
2808   - register_fib_notifier(&rocker->fib_nb);
  2811 + err = register_fib_notifier(&rocker->fib_nb, NULL);
  2812 + if (err)
  2813 + goto err_register_fib_notifier;
2809 2814  
2810 2815 rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
2811 2816  
... ... @@ -2822,6 +2827,7 @@
2822 2827  
2823 2828 err_probe_ports:
2824 2829 unregister_fib_notifier(&rocker->fib_nb);
  2830 +err_register_fib_notifier:
2825 2831 destroy_workqueue(rocker->rocker_owq);
2826 2832 err_alloc_ordered_workqueue:
2827 2833 free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
include/net/ip_fib.h
... ... @@ -221,7 +221,8 @@
221 221 FIB_EVENT_RULE_DEL,
222 222 };
223 223  
224   -int register_fib_notifier(struct notifier_block *nb);
  224 +int register_fib_notifier(struct notifier_block *nb,
  225 + void (*cb)(struct notifier_block *nb));
225 226 int unregister_fib_notifier(struct notifier_block *nb);
226 227 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
227 228 struct fib_notifier_info *info);
... ... @@ -84,12 +84,100 @@
84 84 #include <trace/events/fib.h>
85 85 #include "fib_lookup.h"
86 86  
  87 +static unsigned int fib_seq_sum(void)
  88 +{
  89 + unsigned int fib_seq = 0;
  90 + struct net *net;
  91 +
  92 + rtnl_lock();
  93 + for_each_net(net)
  94 + fib_seq += net->ipv4.fib_seq;
  95 + rtnl_unlock();
  96 +
  97 + return fib_seq;
  98 +}
  99 +
87 100 static ATOMIC_NOTIFIER_HEAD(fib_chain);
88 101  
89   -int register_fib_notifier(struct notifier_block *nb)
  102 +static int call_fib_notifier(struct notifier_block *nb, struct net *net,
  103 + enum fib_event_type event_type,
  104 + struct fib_notifier_info *info)
90 105 {
91   - return atomic_notifier_chain_register(&fib_chain, nb);
  106 + info->net = net;
  107 + return nb->notifier_call(nb, event_type, info);
92 108 }
  109 +
  110 +static void fib_rules_notify(struct net *net, struct notifier_block *nb,
  111 + enum fib_event_type event_type)
  112 +{
  113 +#ifdef CONFIG_IP_MULTIPLE_TABLES
  114 + struct fib_notifier_info info;
  115 +
  116 + if (net->ipv4.fib_has_custom_rules)
  117 + call_fib_notifier(nb, net, event_type, &info);
  118 +#endif
  119 +}
  120 +
  121 +static void fib_notify(struct net *net, struct notifier_block *nb,
  122 + enum fib_event_type event_type);
  123 +
  124 +static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
  125 + enum fib_event_type event_type, u32 dst,
  126 + int dst_len, struct fib_info *fi,
  127 + u8 tos, u8 type, u32 tb_id, u32 nlflags)
  128 +{
  129 + struct fib_entry_notifier_info info = {
  130 + .dst = dst,
  131 + .dst_len = dst_len,
  132 + .fi = fi,
  133 + .tos = tos,
  134 + .type = type,
  135 + .tb_id = tb_id,
  136 + .nlflags = nlflags,
  137 + };
  138 + return call_fib_notifier(nb, net, event_type, &info.info);
  139 +}
  140 +
  141 +static bool fib_dump_is_consistent(struct notifier_block *nb,
  142 + void (*cb)(struct notifier_block *nb),
  143 + unsigned int fib_seq)
  144 +{
  145 + atomic_notifier_chain_register(&fib_chain, nb);
  146 + if (fib_seq == fib_seq_sum())
  147 + return true;
  148 + atomic_notifier_chain_unregister(&fib_chain, nb);
  149 + if (cb)
  150 + cb(nb);
  151 + return false;
  152 +}
  153 +
  154 +#define FIB_DUMP_MAX_RETRIES 5
  155 +int register_fib_notifier(struct notifier_block *nb,
  156 + void (*cb)(struct notifier_block *nb))
  157 +{
  158 + int retries = 0;
  159 +
  160 + do {
  161 + unsigned int fib_seq = fib_seq_sum();
  162 + struct net *net;
  163 +
  164 + /* Mutex semantics guarantee that every change done to
  165 + * FIB tries before we read the change sequence counter
  166 + * is now visible to us.
  167 + */
  168 + rcu_read_lock();
  169 + for_each_net_rcu(net) {
  170 + fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
  171 + fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
  172 + }
  173 + rcu_read_unlock();
  174 +
  175 + if (fib_dump_is_consistent(nb, cb, fib_seq))
  176 + return 0;
  177 + } while (++retries < FIB_DUMP_MAX_RETRIES);
  178 +
  179 + return -EBUSY;
  180 +}
93 181 EXPORT_SYMBOL(register_fib_notifier);
94 182  
95 183 int unregister_fib_notifier(struct notifier_block *nb)
... ... @@ -1900,6 +1988,62 @@
1900 1988  
1901 1989 pr_debug("trie_flush found=%d\n", found);
1902 1990 return found;
  1991 +}
  1992 +
  1993 +static void fib_leaf_notify(struct net *net, struct key_vector *l,
  1994 + struct fib_table *tb, struct notifier_block *nb,
  1995 + enum fib_event_type event_type)
  1996 +{
  1997 + struct fib_alias *fa;
  1998 +
  1999 + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
  2000 + struct fib_info *fi = fa->fa_info;
  2001 +
  2002 + if (!fi)
  2003 + continue;
  2004 +
  2005 + /* local and main table can share the same trie,
  2006 + * so don't notify twice for the same entry.
  2007 + */
  2008 + if (tb->tb_id != fa->tb_id)
  2009 + continue;
  2010 +
  2011 + call_fib_entry_notifier(nb, net, event_type, l->key,
  2012 + KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
  2013 + fa->fa_type, fa->tb_id, 0);
  2014 + }
  2015 +}
  2016 +
  2017 +static void fib_table_notify(struct net *net, struct fib_table *tb,
  2018 + struct notifier_block *nb,
  2019 + enum fib_event_type event_type)
  2020 +{
  2021 + struct trie *t = (struct trie *)tb->tb_data;
  2022 + struct key_vector *l, *tp = t->kv;
  2023 + t_key key = 0;
  2024 +
  2025 + while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
  2026 + fib_leaf_notify(net, l, tb, nb, event_type);
  2027 +
  2028 + key = l->key + 1;
  2029 + /* stop in case of wrap around */
  2030 + if (key < l->key)
  2031 + break;
  2032 + }
  2033 +}
  2034 +
  2035 +static void fib_notify(struct net *net, struct notifier_block *nb,
  2036 + enum fib_event_type event_type)
  2037 +{
  2038 + unsigned int h;
  2039 +
  2040 + for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
  2041 + struct hlist_head *head = &net->ipv4.fib_table_hash[h];
  2042 + struct fib_table *tb;
  2043 +
  2044 + hlist_for_each_entry_rcu(tb, head, tb_hlist)
  2045 + fib_table_notify(net, tb, nb, event_type);
  2046 + }
1903 2047 }
1904 2048  
1905 2049 static void __trie_free_rcu(struct rcu_head *head)