Commit 77162022ab26a1f99d3af30c03760a76f86e193d

Authored by John Fastabend
Committed by David S. Miller
1 parent 136cd14e1e

net: add generic PF_BRIDGE:RTM_ FDB hooks

This adds two new flags, NTF_MASTER and NTF_SELF, that can
now be used to specify where PF_BRIDGE netlink commands should
be sent. NTF_MASTER sends the commands to the 'dev->master'
device for parsing. Typically this will be the Linux net/bridge
or an open-vswitch device. If no flags are set, the command
is also handled by the master device, so that current user
space tools continue to work as expected.

The NTF_SELF flag will push the PF_BRIDGE commands to the
device itself. In the basic example below, the commands are then
parsed and programmed into the embedded bridge.

Note that if both the NTF_SELF and NTF_MASTER bits are set, the
command will be sent to both 'dev->master' and 'dev'. This allows
user space to easily keep the embedded bridge and software bridge
in sync.

There is a slight complication when both flags are set and an
error occurs. To resolve this, the rtnl handler clears the
corresponding NTF_ flag in the netlink ack for each operation
that completed successfully. The add/del handlers abort as soon
as any error occurs.

To support this, new net device ops were added to call into
the device, and the existing bridging code was refactored
to use them. There should be no required changes in user space
to support the current bridge behavior.

A basic setup with a SR-IOV enabled NIC looks like this,

          veth0  veth2
            |      |
          ------------
          |  bridge0 |   <---- software bridging
          ------------
               /
               /
  ethx.y      ethx
    VF         PF
     \         \          <---- propagate FDB entries to HW
     \         \
  --------------------
  |  Embedded Bridge |    <---- hardware offloaded switching
  --------------------

In this case the embedded bridge must be managed to allow 'veth0'
to communicate with 'ethx.y' correctly. At present, drivers managing
the embedded bridge either send frames onto the network, where they
are then dropped by the switch, or the embedded bridge floods
these frames. With this patch we have a mechanism to manage the
embedded bridge correctly from user space. This example is specific
to SR-IOV, but replacing the VF with another PF or dropping this
into the DSA framework generates similar management issues.

Example session using the 'br'[1] tool to add, dump and then
delete a MAC address, with a new "embedded" option and an enabled
ixgbe driver:

# br fdb add 22:35:19:ac:60:59 dev eth3
# br fdb
port    mac addr                flags
veth0   22:35:19:ac:60:58       static
veth0   9a:5f:81:f7:f6:ec       local
eth3    00:1b:21:55:23:59       local
eth3    22:35:19:ac:60:59       static
veth0   22:35:19:ac:60:57       static
#br fdb add 22:35:19:ac:60:59 embedded dev eth3
#br fdb
port    mac addr                flags
veth0   22:35:19:ac:60:58       static
veth0   9a:5f:81:f7:f6:ec       local
eth3    00:1b:21:55:23:59       local
eth3    22:35:19:ac:60:59       static
veth0   22:35:19:ac:60:57       static
eth3    22:35:19:ac:60:59       local embedded
#br fdb del 22:35:19:ac:60:59 embedded dev eth3

I only added a couple of lines to 'br' to set the flags correctly. In
my opinion, the merit of this patch is that embedded and SW bridges
can now both be modeled correctly in user space using very nearly
the same message passing.

[1] 'br' tool was published as an RFC here and will be renamed 'bridge'
    http://patchwork.ozlabs.org/patch/117664/

Thanks to Jamal Hadi Salim, Stephen Hemminger and Ben Hutchings for
valuable feedback, suggestions, and review.

v2: fixed api descriptions and error case with both NTF_SELF and
    NTF_MASTER set plus updated patch description.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 8 changed files with 228 additions and 112 deletions Side-by-side Diff

include/linux/neighbour.h
... ... @@ -33,6 +33,9 @@
33 33 #define NTF_PROXY 0x08 /* == ATF_PUBL */
34 34 #define NTF_ROUTER 0x80
35 35  
  36 +#define NTF_SELF 0x02
  37 +#define NTF_MASTER 0x04
  38 +
36 39 /*
37 40 * Neighbor Cache Entry States.
38 41 */
include/linux/netdevice.h
... ... @@ -54,6 +54,7 @@
54 54 #include <net/netprio_cgroup.h>
55 55  
56 56 #include <linux/netdev_features.h>
  57 +#include <linux/neighbour.h>
57 58  
58 59 struct netpoll_info;
59 60 struct device;
... ... @@ -905,6 +906,16 @@
905 906 * feature set might be less than what was returned by ndo_fix_features()).
906 907 * Must return >0 or -errno if it changed dev->features itself.
907 908 *
  909 + * int (*ndo_fdb_add)(struct ndmsg *ndm, struct net_device *dev,
  910 + * unsigned char *addr, u16 flags)
  911 + * Adds an FDB entry to dev for addr.
  912 + * int (*ndo_fdb_del)(struct ndmsg *ndm, struct net_device *dev,
  913 + * unsigned char *addr)
  914 + * Deletes the FDB entry from dev coresponding to addr.
  915 + * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
  916 + * struct net_device *dev, int idx)
  917 + * Used to add FDB entries to dump requests. Implementers should add
  918 + * entries to skb and update idx with the number of entries.
908 919 */
909 920 struct net_device_ops {
910 921 int (*ndo_init)(struct net_device *dev);
... ... @@ -1002,6 +1013,18 @@
1002 1013 netdev_features_t features);
1003 1014 int (*ndo_neigh_construct)(struct neighbour *n);
1004 1015 void (*ndo_neigh_destroy)(struct neighbour *n);
  1016 +
  1017 + int (*ndo_fdb_add)(struct ndmsg *ndm,
  1018 + struct net_device *dev,
  1019 + unsigned char *addr,
  1020 + u16 flags);
  1021 + int (*ndo_fdb_del)(struct ndmsg *ndm,
  1022 + struct net_device *dev,
  1023 + unsigned char *addr);
  1024 + int (*ndo_fdb_dump)(struct sk_buff *skb,
  1025 + struct netlink_callback *cb,
  1026 + struct net_device *dev,
  1027 + int idx);
1005 1028 };
1006 1029  
1007 1030 /*
include/linux/rtnetlink.h
... ... @@ -801,6 +801,10 @@
801 801 return table;
802 802 }
803 803  
  804 +extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
  805 + struct netlink_callback *cb,
  806 + struct net_device *dev,
  807 + int idx);
804 808 #endif /* __KERNEL__ */
805 809  
806 810  
net/bridge/br_device.c
... ... @@ -317,6 +317,9 @@
317 317 .ndo_add_slave = br_add_slave,
318 318 .ndo_del_slave = br_del_slave,
319 319 .ndo_fix_features = br_fix_features,
  320 + .ndo_fdb_add = br_fdb_add,
  321 + .ndo_fdb_del = br_fdb_delete,
  322 + .ndo_fdb_dump = br_fdb_dump,
320 323 };
321 324  
322 325 static void br_dev_free(struct net_device *dev)
... ... @@ -535,44 +535,38 @@
535 535 }
536 536  
537 537 /* Dump information about entries, in response to GETNEIGH */
538   -int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
  538 +int br_fdb_dump(struct sk_buff *skb,
  539 + struct netlink_callback *cb,
  540 + struct net_device *dev,
  541 + int idx)
539 542 {
540   - struct net *net = sock_net(skb->sk);
541   - struct net_device *dev;
542   - int idx = 0;
  543 + struct net_bridge *br = netdev_priv(dev);
  544 + int i;
543 545  
544   - rcu_read_lock();
545   - for_each_netdev_rcu(net, dev) {
546   - struct net_bridge *br = netdev_priv(dev);
547   - int i;
  546 + if (!(dev->priv_flags & IFF_EBRIDGE))
  547 + goto out;
548 548  
549   - if (!(dev->priv_flags & IFF_EBRIDGE))
550   - continue;
  549 + for (i = 0; i < BR_HASH_SIZE; i++) {
  550 + struct hlist_node *h;
  551 + struct net_bridge_fdb_entry *f;
551 552  
552   - for (i = 0; i < BR_HASH_SIZE; i++) {
553   - struct hlist_node *h;
554   - struct net_bridge_fdb_entry *f;
  553 + hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) {
  554 + if (idx < cb->args[0])
  555 + goto skip;
555 556  
556   - hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) {
557   - if (idx < cb->args[0])
558   - goto skip;
559   -
560   - if (fdb_fill_info(skb, br, f,
561   - NETLINK_CB(cb->skb).pid,
562   - cb->nlh->nlmsg_seq,
563   - RTM_NEWNEIGH,
564   - NLM_F_MULTI) < 0)
565   - break;
  557 + if (fdb_fill_info(skb, br, f,
  558 + NETLINK_CB(cb->skb).pid,
  559 + cb->nlh->nlmsg_seq,
  560 + RTM_NEWNEIGH,
  561 + NLM_F_MULTI) < 0)
  562 + break;
566 563 skip:
567   - ++idx;
568   - }
  564 + ++idx;
569 565 }
570 566 }
571   - rcu_read_unlock();
572 567  
573   - cb->args[0] = idx;
574   -
575   - return skb->len;
  568 +out:
  569 + return idx;
576 570 }
577 571  
578 572 /* Update (create or replace) forwarding database entry */
579 573  
580 574  
581 575  
... ... @@ -614,44 +608,12 @@
614 608 }
615 609  
616 610 /* Add new permanent fdb entry with RTM_NEWNEIGH */
617   -int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
  611 +int br_fdb_add(struct ndmsg *ndm, struct net_device *dev,
  612 + unsigned char *addr, u16 nlh_flags)
618 613 {
619   - struct net *net = sock_net(skb->sk);
620   - struct ndmsg *ndm;
621   - struct nlattr *tb[NDA_MAX+1];
622   - struct net_device *dev;
623 614 struct net_bridge_port *p;
624   - const __u8 *addr;
625   - int err;
  615 + int err = 0;
626 616  
627   - ASSERT_RTNL();
628   - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
629   - if (err < 0)
630   - return err;
631   -
632   - ndm = nlmsg_data(nlh);
633   - if (ndm->ndm_ifindex == 0) {
634   - pr_info("bridge: RTM_NEWNEIGH with invalid ifindex\n");
635   - return -EINVAL;
636   - }
637   -
638   - dev = __dev_get_by_index(net, ndm->ndm_ifindex);
639   - if (dev == NULL) {
640   - pr_info("bridge: RTM_NEWNEIGH with unknown ifindex\n");
641   - return -ENODEV;
642   - }
643   -
644   - if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
645   - pr_info("bridge: RTM_NEWNEIGH with invalid address\n");
646   - return -EINVAL;
647   - }
648   -
649   - addr = nla_data(tb[NDA_LLADDR]);
650   - if (!is_valid_ether_addr(addr)) {
651   - pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n");
652   - return -EINVAL;
653   - }
654   -
655 617 if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) {
656 618 pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
657 619 return -EINVAL;
658 620  
... ... @@ -670,14 +632,14 @@
670 632 rcu_read_unlock();
671 633 } else {
672 634 spin_lock_bh(&p->br->hash_lock);
673   - err = fdb_add_entry(p, addr, ndm->ndm_state, nlh->nlmsg_flags);
  635 + err = fdb_add_entry(p, addr, ndm->ndm_state, nlh_flags);
674 636 spin_unlock_bh(&p->br->hash_lock);
675 637 }
676 638  
677 639 return err;
678 640 }
679 641  
680   -static int fdb_delete_by_addr(struct net_bridge_port *p, const u8 *addr)
  642 +static int fdb_delete_by_addr(struct net_bridge_port *p, u8 *addr)
681 643 {
682 644 struct net_bridge *br = p->br;
683 645 struct hlist_head *head = &br->hash[br_mac_hash(addr)];
684 646  
685 647  
686 648  
... ... @@ -692,39 +654,11 @@
692 654 }
693 655  
694 656 /* Remove neighbor entry with RTM_DELNEIGH */
695   -int br_fdb_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
  657 +int br_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
  658 + unsigned char *addr)
696 659 {
697   - struct net *net = sock_net(skb->sk);
698   - struct ndmsg *ndm;
699 660 struct net_bridge_port *p;
700   - struct nlattr *llattr;
701   - const __u8 *addr;
702   - struct net_device *dev;
703 661 int err;
704   -
705   - ASSERT_RTNL();
706   - if (nlmsg_len(nlh) < sizeof(*ndm))
707   - return -EINVAL;
708   -
709   - ndm = nlmsg_data(nlh);
710   - if (ndm->ndm_ifindex == 0) {
711   - pr_info("bridge: RTM_DELNEIGH with invalid ifindex\n");
712   - return -EINVAL;
713   - }
714   -
715   - dev = __dev_get_by_index(net, ndm->ndm_ifindex);
716   - if (dev == NULL) {
717   - pr_info("bridge: RTM_DELNEIGH with unknown ifindex\n");
718   - return -ENODEV;
719   - }
720   -
721   - llattr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_LLADDR);
722   - if (llattr == NULL || nla_len(llattr) != ETH_ALEN) {
723   - pr_info("bridge: RTM_DELNEIGH with invalid address\n");
724   - return -EINVAL;
725   - }
726   -
727   - addr = nla_data(llattr);
728 662  
729 663 p = br_port_get_rtnl(dev);
730 664 if (p == NULL) {
net/bridge/br_netlink.c
... ... @@ -232,18 +232,6 @@
232 232 br_rtm_setlink, NULL, NULL);
233 233 if (err)
234 234 goto err3;
235   - err = __rtnl_register(PF_BRIDGE, RTM_NEWNEIGH,
236   - br_fdb_add, NULL, NULL);
237   - if (err)
238   - goto err3;
239   - err = __rtnl_register(PF_BRIDGE, RTM_DELNEIGH,
240   - br_fdb_delete, NULL, NULL);
241   - if (err)
242   - goto err3;
243   - err = __rtnl_register(PF_BRIDGE, RTM_GETNEIGH,
244   - NULL, br_fdb_dump, NULL);
245   - if (err)
246   - goto err3;
247 235  
248 236 return 0;
249 237  
net/bridge/br_private.h
... ... @@ -360,9 +360,18 @@
360 360 extern void br_fdb_update(struct net_bridge *br,
361 361 struct net_bridge_port *source,
362 362 const unsigned char *addr);
363   -extern int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb);
364   -extern int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg);
365   -extern int br_fdb_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg);
  363 +
  364 +extern int br_fdb_delete(struct ndmsg *ndm,
  365 + struct net_device *dev,
  366 + unsigned char *addr);
  367 +extern int br_fdb_add(struct ndmsg *nlh,
  368 + struct net_device *dev,
  369 + unsigned char *addr,
  370 + u16 nlh_flags);
  371 +extern int br_fdb_dump(struct sk_buff *skb,
  372 + struct netlink_callback *cb,
  373 + struct net_device *dev,
  374 + int idx);
366 375  
367 376 /* br_forward.c */
368 377 extern void br_deliver(const struct net_bridge_port *to,
net/core/rtnetlink.c
... ... @@ -35,7 +35,9 @@
35 35 #include <linux/security.h>
36 36 #include <linux/mutex.h>
37 37 #include <linux/if_addr.h>
  38 +#include <linux/if_bridge.h>
38 39 #include <linux/pci.h>
  40 +#include <linux/etherdevice.h>
39 41  
40 42 #include <asm/uaccess.h>
41 43  
... ... @@ -1978,6 +1980,152 @@
1978 1980 rtnl_set_sk_err(net, RTNLGRP_LINK, err);
1979 1981 }
1980 1982  
  1983 +static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
  1984 +{
  1985 + struct net *net = sock_net(skb->sk);
  1986 + struct net_device *master = NULL;
  1987 + struct ndmsg *ndm;
  1988 + struct nlattr *tb[NDA_MAX+1];
  1989 + struct net_device *dev;
  1990 + u8 *addr;
  1991 + int err;
  1992 +
  1993 + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
  1994 + if (err < 0)
  1995 + return err;
  1996 +
  1997 + ndm = nlmsg_data(nlh);
  1998 + if (ndm->ndm_ifindex == 0) {
  1999 + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n");
  2000 + return -EINVAL;
  2001 + }
  2002 +
  2003 + dev = __dev_get_by_index(net, ndm->ndm_ifindex);
  2004 + if (dev == NULL) {
  2005 + pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n");
  2006 + return -ENODEV;
  2007 + }
  2008 +
  2009 + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
  2010 + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n");
  2011 + return -EINVAL;
  2012 + }
  2013 +
  2014 + addr = nla_data(tb[NDA_LLADDR]);
  2015 + if (!is_valid_ether_addr(addr)) {
  2016 + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ether address\n");
  2017 + return -EINVAL;
  2018 + }
  2019 +
  2020 + err = -EOPNOTSUPP;
  2021 +
  2022 + /* Support fdb on master device the net/bridge default case */
  2023 + if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
  2024 + (dev->priv_flags & IFF_BRIDGE_PORT)) {
  2025 + master = dev->master;
  2026 + err = master->netdev_ops->ndo_fdb_add(ndm, dev, addr,
  2027 + nlh->nlmsg_flags);
  2028 + if (err)
  2029 + goto out;
  2030 + else
  2031 + ndm->ndm_flags &= ~NTF_MASTER;
  2032 + }
  2033 +
  2034 + /* Embedded bridge, macvlan, and any other device support */
  2035 + if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) {
  2036 + err = dev->netdev_ops->ndo_fdb_add(ndm, dev, addr,
  2037 + nlh->nlmsg_flags);
  2038 +
  2039 + if (!err)
  2040 + ndm->ndm_flags &= ~NTF_SELF;
  2041 + }
  2042 +out:
  2043 + return err;
  2044 +}
  2045 +
  2046 +static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
  2047 +{
  2048 + struct net *net = sock_net(skb->sk);
  2049 + struct ndmsg *ndm;
  2050 + struct nlattr *llattr;
  2051 + struct net_device *dev;
  2052 + int err = -EINVAL;
  2053 + __u8 *addr;
  2054 +
  2055 + if (nlmsg_len(nlh) < sizeof(*ndm))
  2056 + return -EINVAL;
  2057 +
  2058 + ndm = nlmsg_data(nlh);
  2059 + if (ndm->ndm_ifindex == 0) {
  2060 + pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n");
  2061 + return -EINVAL;
  2062 + }
  2063 +
  2064 + dev = __dev_get_by_index(net, ndm->ndm_ifindex);
  2065 + if (dev == NULL) {
  2066 + pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n");
  2067 + return -ENODEV;
  2068 + }
  2069 +
  2070 + llattr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_LLADDR);
  2071 + if (llattr == NULL || nla_len(llattr) != ETH_ALEN) {
  2072 + pr_info("PF_BRIGDE: RTM_DELNEIGH with invalid address\n");
  2073 + return -EINVAL;
  2074 + }
  2075 +
  2076 + addr = nla_data(llattr);
  2077 + err = -EOPNOTSUPP;
  2078 +
  2079 + /* Support fdb on master device the net/bridge default case */
  2080 + if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
  2081 + (dev->priv_flags & IFF_BRIDGE_PORT)) {
  2082 + struct net_device *master = dev->master;
  2083 +
  2084 + if (master->netdev_ops->ndo_fdb_del)
  2085 + err = master->netdev_ops->ndo_fdb_del(ndm, dev, addr);
  2086 +
  2087 + if (err)
  2088 + goto out;
  2089 + else
  2090 + ndm->ndm_flags &= ~NTF_MASTER;
  2091 + }
  2092 +
  2093 + /* Embedded bridge, macvlan, and any other device support */
  2094 + if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_del) {
  2095 + err = dev->netdev_ops->ndo_fdb_del(ndm, dev, addr);
  2096 +
  2097 + if (!err)
  2098 + ndm->ndm_flags &= ~NTF_SELF;
  2099 + }
  2100 +out:
  2101 + return err;
  2102 +}
  2103 +
  2104 +static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
  2105 +{
  2106 + int idx = 0;
  2107 + struct net *net = sock_net(skb->sk);
  2108 + struct net_device *dev;
  2109 +
  2110 + rcu_read_lock();
  2111 + for_each_netdev_rcu(net, dev) {
  2112 + if (dev->priv_flags & IFF_BRIDGE_PORT) {
  2113 + struct net_device *master = dev->master;
  2114 + const struct net_device_ops *ops = master->netdev_ops;
  2115 +
  2116 + if (ops->ndo_fdb_dump)
  2117 + idx = ops->ndo_fdb_dump(skb, cb, dev, idx);
  2118 + }
  2119 +
  2120 + if (dev->netdev_ops->ndo_fdb_dump)
  2121 + idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx);
  2122 + }
  2123 + rcu_read_unlock();
  2124 +
  2125 + cb->args[0] = idx;
  2126 + return skb->len;
  2127 +}
  2128 +
1981 2129 /* Protected by RTNL sempahore. */
1982 2130 static struct rtattr **rta_buf;
1983 2131 static int rtattr_max;
... ... @@ -2150,5 +2298,9 @@
2150 2298  
2151 2299 rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
2152 2300 rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
  2301 +
  2302 + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
  2303 + rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
  2304 + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
2153 2305 }