Commit f271b2cc78f09c93ccd00a2056d3237134bf994c

Authored by Max Krasnyansky
Committed by David S. Miller
1 parent 89146504cb

tun: Fix/rewrite packet filtering logic

Please see the following thread to get some context on this
	http://marc.info/?l=linux-netdev&m=121564433018903&w=2

Basically the issue is that the current multicast filtering code in
the TUN/TAP driver is seriously broken.
The original patch went in without proper review and ACK. It was broken and
confusing to start with, and subsequent patches broke it completely.
To give you an idea of what's broken, here are some of the issues:

- Very confusing comments throughout the code imply that the
character device is a network interface in its own right, and that packets
are passed between the two NICs, which is completely wrong.

- The wrong set of ioctls is used for setting up filters. They look like
shortcuts for manipulating the state of the tun/tap network interface, but
in reality they manipulate the state of the TX filter.

- The ioctls that were originally used for setting the address of the TX filter
got "fixed" and now set the address of the network interface itself, which
made the filter totally useless.

- Filtering is done too late. Instead of filtering early on, to avoid
unnecessary wakeups, filtering is done in the read() call.

The list goes on and on :)

So the patch cleans all that up. It introduces a simple and clean interface for
setting up TX filters (TUNSETTXFILTER + tun_filter spec) and does the filtering
before enqueuing the packets.

TX filtering is useful in scenarios where the TAP device is part of a bridge, in
which case it gets all broadcast, multicast and potentially other packets while
the bridge is learning. So, for example, an Ethernet tunnelling app may want to
set up TX filters to avoid tunnelling multicast traffic. QEMU and other
hypervisors can push the RX filtering that is currently done in the guest into
the host context, thereby saving wakeups and unnecessary data transfer.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 2 changed files with 174 additions and 166 deletions Side-by-side Diff

... ... @@ -18,15 +18,11 @@
18 18 /*
19 19 * Changes:
20 20 *
21   - * Brian Braunstein <linuxkernel@bristyle.com> 2007/03/23
22   - * Fixed hw address handling. Now net_device.dev_addr is kept consistent
23   - * with tun.dev_addr when the address is set by this module.
24   - *
25 21 * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
26 22 * Add TUNSETLINK ioctl to set the link encapsulation
27 23 *
28 24 * Mark Smith <markzzzsmith@yahoo.com.au>
29   - * Use random_ether_addr() for tap MAC address.
  25 + * Use random_ether_addr() for tap MAC address.
30 26 *
31 27 * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20
32 28 * Fixes in packet dropping, queue length setting and queue wakeup.
33 29  
... ... @@ -83,9 +79,16 @@
83 79 #define DBG1( a... )
84 80 #endif
85 81  
  82 +#define FLT_EXACT_COUNT 8
  83 +struct tap_filter {
  84 + unsigned int count; /* Number of addrs. Zero means disabled */
  85 + u32 mask[2]; /* Mask of the hashed addrs */
  86 + unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
  87 +};
  88 +
86 89 struct tun_struct {
87 90 struct list_head list;
88   - unsigned long flags;
  91 + unsigned int flags;
89 92 int attached;
90 93 uid_t owner;
91 94 gid_t group;
92 95  
93 96  
94 97  
... ... @@ -94,19 +97,119 @@
94 97 struct sk_buff_head readq;
95 98  
96 99 struct net_device *dev;
  100 + struct fasync_struct *fasync;
97 101  
98   - struct fasync_struct *fasync;
  102 + struct tap_filter txflt;
99 103  
100   - unsigned long if_flags;
101   - u8 dev_addr[ETH_ALEN];
102   - u32 chr_filter[2];
103   - u32 net_filter[2];
104   -
105 104 #ifdef TUN_DEBUG
106 105 int debug;
107 106 #endif
108 107 };
109 108  
  109 +/* TAP filterting */
  110 +static void addr_hash_set(u32 *mask, const u8 *addr)
  111 +{
  112 + int n = ether_crc(ETH_ALEN, addr) >> 26;
  113 + mask[n >> 5] |= (1 << (n & 31));
  114 +}
  115 +
  116 +static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
  117 +{
  118 + int n = ether_crc(ETH_ALEN, addr) >> 26;
  119 + return mask[n >> 5] & (1 << (n & 31));
  120 +}
  121 +
  122 +static int update_filter(struct tap_filter *filter, void __user *arg)
  123 +{
  124 + struct { u8 u[ETH_ALEN]; } *addr;
  125 + struct tun_filter uf;
  126 + int err, alen, n, nexact;
  127 +
  128 + if (copy_from_user(&uf, arg, sizeof(uf)))
  129 + return -EFAULT;
  130 +
  131 + if (!uf.count) {
  132 + /* Disabled */
  133 + filter->count = 0;
  134 + return 0;
  135 + }
  136 +
  137 + alen = ETH_ALEN * uf.count;
  138 + addr = kmalloc(alen, GFP_KERNEL);
  139 + if (!addr)
  140 + return -ENOMEM;
  141 +
  142 + if (copy_from_user(addr, arg + sizeof(uf), alen)) {
  143 + err = -EFAULT;
  144 + goto done;
  145 + }
  146 +
  147 + /* The filter is updated without holding any locks. Which is
  148 + * perfectly safe. We disable it first and in the worst
  149 + * case we'll accept a few undesired packets. */
  150 + filter->count = 0;
  151 + wmb();
  152 +
  153 + /* Use first set of addresses as an exact filter */
  154 + for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
  155 + memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
  156 +
  157 + nexact = n;
  158 +
  159 + /* The rest is hashed */
  160 + memset(filter->mask, 0, sizeof(filter->mask));
  161 + for (; n < uf.count; n++)
  162 + addr_hash_set(filter->mask, addr[n].u);
  163 +
  164 + /* For ALLMULTI just set the mask to all ones.
  165 + * This overrides the mask populated above. */
  166 + if ((uf.flags & TUN_FLT_ALLMULTI))
  167 + memset(filter->mask, ~0, sizeof(filter->mask));
  168 +
  169 + /* Now enable the filter */
  170 + wmb();
  171 + filter->count = nexact;
  172 +
  173 + /* Return the number of exact filters */
  174 + err = nexact;
  175 +
  176 +done:
  177 + kfree(addr);
  178 + return err;
  179 +}
  180 +
  181 +/* Returns: 0 - drop, !=0 - accept */
  182 +static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
  183 +{
  184 + /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
  185 + * at this point. */
  186 + struct ethhdr *eh = (struct ethhdr *) skb->data;
  187 + int i;
  188 +
  189 + /* Exact match */
  190 + for (i = 0; i < filter->count; i++)
  191 + if (!compare_ether_addr(eh->h_dest, filter->addr[i]))
  192 + return 1;
  193 +
  194 + /* Inexact match (multicast only) */
  195 + if (is_multicast_ether_addr(eh->h_dest))
  196 + return addr_hash_test(filter->mask, eh->h_dest);
  197 +
  198 + return 0;
  199 +}
  200 +
  201 +/*
  202 + * Checks whether the packet is accepted or not.
  203 + * Returns: 0 - drop, !=0 - accept
  204 + */
  205 +static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
  206 +{
  207 + if (!filter->count)
  208 + return 1;
  209 +
  210 + return run_filter(filter, skb);
  211 +}
  212 +
110 213 /* Network device part of the driver */
111 214  
112 215 static unsigned int tun_net_id;
... ... @@ -141,7 +244,12 @@
141 244 if (!tun->attached)
142 245 goto drop;
143 246  
144   - /* Packet dropping */
  247 + /* Drop if the filter does not like it.
  248 + * This is a noop if the filter is disabled.
  249 + * Filter can be enabled only for the TAP devices. */
  250 + if (!check_filter(&tun->txflt, skb))
  251 + goto drop;
  252 +
145 253 if (skb_queue_len(&tun->readq) >= dev->tx_queue_len) {
146 254 if (!(tun->flags & TUN_ONE_QUEUE)) {
147 255 /* Normal queueing mode. */
... ... @@ -158,7 +266,7 @@
158 266 }
159 267 }
160 268  
161   - /* Queue packet */
  269 + /* Enqueue packet */
162 270 skb_queue_tail(&tun->readq, skb);
163 271 dev->trans_start = jiffies;
164 272  
165 273  
166 274  
... ... @@ -174,43 +282,16 @@
174 282 return 0;
175 283 }
176 284  
177   -/** Add the specified Ethernet address to this multicast filter. */
178   -static void
179   -add_multi(u32* filter, const u8* addr)
  285 +static void tun_net_mclist(struct net_device *dev)
180 286 {
181   - int bit_nr = ether_crc(ETH_ALEN, addr) >> 26;
182   - filter[bit_nr >> 5] |= 1 << (bit_nr & 31);
  287 + /*
  288 + * This callback is supposed to deal with mc filter in
  289 + * _rx_ path and has nothing to do with the _tx_ path.
  290 + * In rx path we always accept everything userspace gives us.
  291 + */
  292 + return;
183 293 }
184 294  
185   -/** Remove the specified Ethernet addres from this multicast filter. */
186   -static void
187   -del_multi(u32* filter, const u8* addr)
188   -{
189   - int bit_nr = ether_crc(ETH_ALEN, addr) >> 26;
190   - filter[bit_nr >> 5] &= ~(1 << (bit_nr & 31));
191   -}
192   -
193   -/** Update the list of multicast groups to which the network device belongs.
194   - * This list is used to filter packets being sent from the character device to
195   - * the network device. */
196   -static void
197   -tun_net_mclist(struct net_device *dev)
198   -{
199   - struct tun_struct *tun = netdev_priv(dev);
200   - const struct dev_mc_list *mclist;
201   - int i;
202   - DECLARE_MAC_BUF(mac);
203   - DBG(KERN_DEBUG "%s: tun_net_mclist: mc_count %d\n",
204   - dev->name, dev->mc_count);
205   - memset(tun->chr_filter, 0, sizeof tun->chr_filter);
206   - for (i = 0, mclist = dev->mc_list; i < dev->mc_count && mclist != NULL;
207   - i++, mclist = mclist->next) {
208   - add_multi(tun->net_filter, mclist->dmi_addr);
209   - DBG(KERN_DEBUG "%s: tun_net_mclist: %s\n",
210   - dev->name, print_mac(mac, mclist->dmi_addr));
211   - }
212   -}
213   -
214 295 #define MIN_MTU 68
215 296 #define MAX_MTU 65535
216 297  
217 298  
218 299  
... ... @@ -244,14 +325,12 @@
244 325  
245 326 case TUN_TAP_DEV:
246 327 /* Ethernet TAP Device */
  328 + ether_setup(dev);
  329 + dev->change_mtu = tun_net_change_mtu;
247 330 dev->set_multicast_list = tun_net_mclist;
248 331  
249   - ether_setup(dev);
250   - dev->change_mtu = tun_net_change_mtu;
  332 + random_ether_addr(dev->dev_addr);
251 333  
252   - /* random address already created for us by tun_set_iff, use it */
253   - memcpy(dev->dev_addr, tun->dev_addr, min(sizeof(tun->dev_addr), sizeof(dev->dev_addr)) );
254   -
255 334 dev->tx_queue_len = TUN_READQ_SIZE; /* We prefer our own queue length */
256 335 break;
257 336 }
... ... @@ -486,7 +565,6 @@
486 565 DECLARE_WAITQUEUE(wait, current);
487 566 struct sk_buff *skb;
488 567 ssize_t len, ret = 0;
489   - DECLARE_MAC_BUF(mac);
490 568  
491 569 if (!tun)
492 570 return -EBADFD;
... ... @@ -499,10 +577,6 @@
499 577  
500 578 add_wait_queue(&tun->read_wait, &wait);
501 579 while (len) {
502   - const u8 ones[ ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
503   - u8 addr[ ETH_ALEN];
504   - int bit_nr;
505   -
506 580 current->state = TASK_INTERRUPTIBLE;
507 581  
508 582 /* Read frames from the queue */
... ... @@ -522,36 +596,9 @@
522 596 }
523 597 netif_wake_queue(tun->dev);
524 598  
525   - /** Decide whether to accept this packet. This code is designed to
526   - * behave identically to an Ethernet interface. Accept the packet if
527   - * - we are promiscuous.
528   - * - the packet is addressed to us.
529   - * - the packet is broadcast.
530   - * - the packet is multicast and
531   - * - we are multicast promiscous.
532   - * - we belong to the multicast group.
533   - */
534   - skb_copy_from_linear_data(skb, addr, min_t(size_t, sizeof addr,
535   - skb->len));
536   - bit_nr = ether_crc(sizeof addr, addr) >> 26;
537   - if ((tun->if_flags & IFF_PROMISC) ||
538   - memcmp(addr, tun->dev_addr, sizeof addr) == 0 ||
539   - memcmp(addr, ones, sizeof addr) == 0 ||
540   - (((addr[0] == 1 && addr[1] == 0 && addr[2] == 0x5e) ||
541   - (addr[0] == 0x33 && addr[1] == 0x33)) &&
542   - ((tun->if_flags & IFF_ALLMULTI) ||
543   - (tun->chr_filter[bit_nr >> 5] & (1 << (bit_nr & 31)))))) {
544   - DBG(KERN_DEBUG "%s: tun_chr_readv: accepted: %s\n",
545   - tun->dev->name, print_mac(mac, addr));
546   - ret = tun_put_user(tun, skb, (struct iovec *) iv, len);
547   - kfree_skb(skb);
548   - break;
549   - } else {
550   - DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %s\n",
551   - tun->dev->name, print_mac(mac, addr));
552   - kfree_skb(skb);
553   - continue;
554   - }
  599 + ret = tun_put_user(tun, skb, (struct iovec *) iv, len);
  600 + kfree_skb(skb);
  601 + break;
555 602 }
556 603  
557 604 current->state = TASK_RUNNING;
... ... @@ -647,12 +694,7 @@
647 694 tun = netdev_priv(dev);
648 695 tun->dev = dev;
649 696 tun->flags = flags;
650   - /* Be promiscuous by default to maintain previous behaviour. */
651   - tun->if_flags = IFF_PROMISC;
652   - /* Generate random Ethernet address. */
653   - *(__be16 *)tun->dev_addr = htons(0x00FF);
654   - get_random_bytes(tun->dev_addr + sizeof(u16), 4);
655   - memset(tun->chr_filter, 0, sizeof tun->chr_filter);
  697 + tun->txflt.count = 0;
656 698  
657 699 tun_net_init(dev);
658 700  
... ... @@ -751,6 +793,7 @@
751 793 struct tun_struct *tun = file->private_data;
752 794 void __user* argp = (void __user*)arg;
753 795 struct ifreq ifr;
  796 + int ret;
754 797 DECLARE_MAC_BUF(mac);
755 798  
756 799 if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
... ... @@ -826,9 +869,6 @@
826 869 break;
827 870  
828 871 case TUNSETLINK:
829   - {
830   - int ret;
831   -
832 872 /* Only allow setting the type when the interface is down */
833 873 rtnl_lock();
834 874 if (tun->dev->flags & IFF_UP) {
835 875  
836 876  
837 877  
838 878  
839 879  
840 880  
841 881  
842 882  
... ... @@ -842,94 +882,44 @@
842 882 }
843 883 rtnl_unlock();
844 884 return ret;
845   - }
846 885  
847 886 #ifdef TUN_DEBUG
848 887 case TUNSETDEBUG:
849 888 tun->debug = arg;
850 889 break;
851 890 #endif
852   -
853 891 case TUNSETOFFLOAD:
854   - {
855   - int ret;
856 892 rtnl_lock();
857 893 ret = set_offload(tun->dev, arg);
858 894 rtnl_unlock();
859 895 return ret;
860   - }
861 896  
862   - case SIOCGIFFLAGS:
863   - ifr.ifr_flags = tun->if_flags;
864   - if (copy_to_user( argp, &ifr, sizeof ifr))
865   - return -EFAULT;
866   - return 0;
  897 + case TUNSETTXFILTER:
  898 + /* Can be set only for TAPs */
  899 + if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
  900 + return -EINVAL;
  901 + rtnl_lock();
  902 + ret = update_filter(&tun->txflt, (void *) __user arg);
  903 + rtnl_unlock();
  904 + return ret;
867 905  
868   - case SIOCSIFFLAGS:
869   - /** Set the character device's interface flags. Currently only
870   - * IFF_PROMISC and IFF_ALLMULTI are used. */
871   - tun->if_flags = ifr.ifr_flags;
872   - DBG(KERN_INFO "%s: interface flags 0x%lx\n",
873   - tun->dev->name, tun->if_flags);
874   - return 0;
875   -
876 906 case SIOCGIFHWADDR:
877   - /* Note: the actual net device's address may be different */
878   - memcpy(ifr.ifr_hwaddr.sa_data, tun->dev_addr,
879   - min(sizeof ifr.ifr_hwaddr.sa_data, sizeof tun->dev_addr));
880   - if (copy_to_user( argp, &ifr, sizeof ifr))
  907 + /* Get hw addres */
  908 + memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
  909 + ifr.ifr_hwaddr.sa_family = tun->dev->type;
  910 + if (copy_to_user(argp, &ifr, sizeof ifr))
881 911 return -EFAULT;
882 912 return 0;
883 913  
884 914 case SIOCSIFHWADDR:
885   - {
886   - /* try to set the actual net device's hw address */
887   - int ret;
  915 + /* Set hw address */
  916 + DBG(KERN_DEBUG "%s: set hw address: %s\n",
  917 + tun->dev->name, print_mac(mac, ifr.ifr_hwaddr.sa_data));
888 918  
889 919 rtnl_lock();
890 920 ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
891 921 rtnl_unlock();
892   -
893   - if (ret == 0) {
894   - /** Set the character device's hardware address. This is used when
895   - * filtering packets being sent from the network device to the character
896   - * device. */
897   - memcpy(tun->dev_addr, ifr.ifr_hwaddr.sa_data,
898   - min(sizeof ifr.ifr_hwaddr.sa_data, sizeof tun->dev_addr));
899   - DBG(KERN_DEBUG "%s: set hardware address: %x:%x:%x:%x:%x:%x\n",
900   - tun->dev->name,
901   - tun->dev_addr[0], tun->dev_addr[1], tun->dev_addr[2],
902   - tun->dev_addr[3], tun->dev_addr[4], tun->dev_addr[5]);
903   - }
904   -
905   - return ret;
906   - }
907   -
908   - case SIOCADDMULTI:
909   - /** Add the specified group to the character device's multicast filter
910   - * list. */
911   - rtnl_lock();
912   - netif_tx_lock_bh(tun->dev);
913   - add_multi(tun->chr_filter, ifr.ifr_hwaddr.sa_data);
914   - netif_tx_unlock_bh(tun->dev);
915   - rtnl_unlock();
916   -
917   - DBG(KERN_DEBUG "%s: add multi: %s\n",
918   - tun->dev->name, print_mac(mac, ifr.ifr_hwaddr.sa_data));
919   - return 0;
920   -
921   - case SIOCDELMULTI:
922   - /** Remove the specified group from the character device's multicast
923   - * filter list. */
924   - rtnl_lock();
925   - netif_tx_lock_bh(tun->dev);
926   - del_multi(tun->chr_filter, ifr.ifr_hwaddr.sa_data);
927   - netif_tx_unlock_bh(tun->dev);
928   - rtnl_unlock();
929   -
930   - DBG(KERN_DEBUG "%s: del multi: %s\n",
931   - tun->dev->name, print_mac(mac, ifr.ifr_hwaddr.sa_data));
932   - return 0;
  922 + return ret;
933 923  
934 924 default:
935 925 return -EINVAL;
include/linux/if_tun.h
... ... @@ -17,6 +17,7 @@
17 17 #define __IF_TUN_H
18 18  
19 19 #include <linux/types.h>
  20 +#include <linux/if_ether.h>
20 21  
21 22 /* Read queue size */
22 23 #define TUN_READQ_SIZE 500
... ... @@ -42,7 +43,8 @@
42 43 #define TUNSETLINK _IOW('T', 205, int)
43 44 #define TUNSETGROUP _IOW('T', 206, int)
44 45 #define TUNGETFEATURES _IOR('T', 207, unsigned int)
45   -#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
  46 +#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
  47 +#define TUNSETTXFILTER _IOW('T', 209, unsigned int)
46 48  
47 49 /* TUNSETIFF ifr flags */
48 50 #define IFF_TUN 0x0001
49 51  
50 52  
... ... @@ -57,11 +59,27 @@
57 59 #define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
58 60 #define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
59 61  
  62 +/* Protocol info prepended to the packets (when IFF_NO_PI is not set) */
  63 +#define TUN_PKT_STRIP 0x0001
60 64 struct tun_pi {
61   - unsigned short flags;
  65 + __u16 flags;
62 66 __be16 proto;
63 67 };
64   -#define TUN_PKT_STRIP 0x0001
  68 +
  69 +/*
  70 + * Filter spec (used for SETXXFILTER ioctls)
  71 + * This stuff is applicable only to the TAP (Ethernet) devices.
  72 + * If the count is zero the filter is disabled and the driver accepts
  73 + * all packets (promisc mode).
  74 + * If the filter is enabled in order to accept broadcast packets
  75 + * broadcast addr must be explicitly included in the addr list.
  76 + */
  77 +#define TUN_FLT_ALLMULTI 0x0001 /* Accept all multicast packets */
  78 +struct tun_filter {
  79 + __u16 flags; /* TUN_FLT_ flags see above */
  80 + __u16 count; /* Number of addresses */
  81 + __u8 addr[0][ETH_ALEN];
  82 +};
65 83  
66 84 #endif /* __IF_TUN_H */