Commit 33dccbb050bbe35b88ca8cf1228dcf3e4d4b3554

Authored by Herbert Xu
Committed by David S. Miller
1 parent 4cc7f68d65

tun: Limit amount of queued packets per device

Unlike a normal socket path, the tuntap device send path does
not have any accounting.  This means that the user-space sender
may be able to pin down arbitrary amounts of kernel memory by
continuing to send data to an end-point that is congested.

Even when this isn't an issue because of limited queueing at
most end points, this can also be a problem because its only
response to congestion is packet loss.  That is, when those
local queues at the end-point fill up, the tuntap device will
start wasting system time because it will continue to send
data there which simply gets dropped straight away.

Of course one could argue that everybody should do congestion
control end-to-end, unfortunately there are people in this world
still hooked on UDP, and they don't appear to be going
anywhere fast.  In fact, we've always helped them by performing
accounting in our UDP code, the sole purpose of which is to
provide congestion feedback other than through packet loss.

This patch attempts to apply the same bandaid to the tuntap device.
It creates a pseudo-socket object which is used to account our
packets just as a normal socket does for UDP.  Of course things
are a little complex because we're actually reinjecting traffic
back into the stack rather than out of the stack.

The stack complexities however should have been resolved by preceding
patches.  So this one can simply start using skb_set_owner_w.

For now the accounting is essentially disabled by default for
backwards compatibility.  In particular, we set the cap to INT_MAX.
This is so that existing applications don't get confused by the
sudden arrival of EAGAIN errors.

In future we may wish (or be forced to) do this by default.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 3 changed files with 118 additions and 53 deletions Side-by-side Diff

... ... @@ -64,6 +64,7 @@
64 64 #include <net/net_namespace.h>
65 65 #include <net/netns/generic.h>
66 66 #include <net/rtnetlink.h>
  67 +#include <net/sock.h>
67 68  
68 69 #include <asm/system.h>
69 70 #include <asm/uaccess.h>
... ... @@ -95,6 +96,8 @@
95 96 wait_queue_head_t read_wait;
96 97 };
97 98  
  99 +struct tun_sock;
  100 +
98 101 struct tun_struct {
99 102 struct tun_file *tfile;
100 103 unsigned int flags;
101 104  
... ... @@ -107,12 +110,24 @@
107 110 struct fasync_struct *fasync;
108 111  
109 112 struct tap_filter txflt;
  113 + struct sock *sk;
  114 + struct socket socket;
110 115  
111 116 #ifdef TUN_DEBUG
112 117 int debug;
113 118 #endif
114 119 };
115 120  
  121 +struct tun_sock {
  122 + struct sock sk;
  123 + struct tun_struct *tun;
  124 +};
  125 +
  126 +static inline struct tun_sock *tun_sk(struct sock *sk)
  127 +{
  128 + return container_of(sk, struct tun_sock, sk);
  129 +}
  130 +
116 131 static int tun_attach(struct tun_struct *tun, struct file *file)
117 132 {
118 133 struct tun_file *tfile = file->private_data;
... ... @@ -461,7 +476,8 @@
461 476 {
462 477 struct tun_file *tfile = file->private_data;
463 478 struct tun_struct *tun = __tun_get(tfile);
464   - unsigned int mask = POLLOUT | POLLWRNORM;
  479 + struct sock *sk = tun->sk;
  480 + unsigned int mask = 0;
465 481  
466 482 if (!tun)
467 483 return POLLERR;
... ... @@ -473,6 +489,11 @@
473 489 if (!skb_queue_empty(&tun->readq))
474 490 mask |= POLLIN | POLLRDNORM;
475 491  
  492 + if (sock_writeable(sk) ||
  493 + (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
  494 + sock_writeable(sk)))
  495 + mask |= POLLOUT | POLLWRNORM;
  496 +
476 497 if (tun->dev->reg_state != NETREG_REGISTERED)
477 498 mask = POLLERR;
478 499  
479 500  
480 501  
481 502  
482 503  
483 504  
484 505  
485 506  
486 507  
487 508  
... ... @@ -482,66 +503,35 @@
482 503  
483 504 /* prepad is the amount to reserve at front. len is length after that.
484 505 * linear is a hint as to how much to copy (usually headers). */
485   -static struct sk_buff *tun_alloc_skb(size_t prepad, size_t len, size_t linear,
486   - gfp_t gfp)
  506 +static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
  507 + size_t prepad, size_t len,
  508 + size_t linear, int noblock)
487 509 {
  510 + struct sock *sk = tun->sk;
488 511 struct sk_buff *skb;
489   - unsigned int i;
  512 + int err;
490 513  
491   - skb = alloc_skb(prepad + len, gfp|__GFP_NOWARN);
492   - if (skb) {
493   - skb_reserve(skb, prepad);
494   - skb_put(skb, len);
495   - return skb;
496   - }
497   -
498 514 /* Under a page? Don't bother with paged skb. */
499 515 if (prepad + len < PAGE_SIZE)
500   - return NULL;
  516 + linear = len;
501 517  
502   - /* Start with a normal skb, and add pages. */
503   - skb = alloc_skb(prepad + linear, gfp);
  518 + skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
  519 + &err);
504 520 if (!skb)
505   - return NULL;
  521 + return ERR_PTR(err);
506 522  
507 523 skb_reserve(skb, prepad);
508 524 skb_put(skb, linear);
  525 + skb->data_len = len - linear;
  526 + skb->len += len - linear;
509 527  
510   - len -= linear;
511   -
512   - for (i = 0; i < MAX_SKB_FRAGS; i++) {
513   - skb_frag_t *f = &skb_shinfo(skb)->frags[i];
514   -
515   - f->page = alloc_page(gfp|__GFP_ZERO);
516   - if (!f->page)
517   - break;
518   -
519   - f->page_offset = 0;
520   - f->size = PAGE_SIZE;
521   -
522   - skb->data_len += PAGE_SIZE;
523   - skb->len += PAGE_SIZE;
524   - skb->truesize += PAGE_SIZE;
525   - skb_shinfo(skb)->nr_frags++;
526   -
527   - if (len < PAGE_SIZE) {
528   - len = 0;
529   - break;
530   - }
531   - len -= PAGE_SIZE;
532   - }
533   -
534   - /* Too large, or alloc fail? */
535   - if (unlikely(len)) {
536   - kfree_skb(skb);
537   - skb = NULL;
538   - }
539   -
540 528 return skb;
541 529 }
542 530  
543 531 /* Get packet from user space buffer */
544   -static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
  532 +static __inline__ ssize_t tun_get_user(struct tun_struct *tun,
  533 + struct iovec *iv, size_t count,
  534 + int noblock)
545 535 {
546 536 struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
547 537 struct sk_buff *skb;
... ... @@ -573,9 +563,11 @@
573 563 return -EINVAL;
574 564 }
575 565  
576   - if (!(skb = tun_alloc_skb(align, len, gso.hdr_len, GFP_KERNEL))) {
577   - tun->dev->stats.rx_dropped++;
578   - return -ENOMEM;
  566 + skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
  567 + if (IS_ERR(skb)) {
  568 + if (PTR_ERR(skb) != -EAGAIN)
  569 + tun->dev->stats.rx_dropped++;
  570 + return PTR_ERR(skb);
579 571 }
580 572  
581 573 if (skb_copy_datagram_from_iovec(skb, 0, iv, len)) {
... ... @@ -661,7 +653,8 @@
661 653 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
662 654 unsigned long count, loff_t pos)
663 655 {
664   - struct tun_struct *tun = tun_get(iocb->ki_filp);
  656 + struct file *file = iocb->ki_filp;
  657 + struct tun_struct *tun = file->private_data;
665 658 ssize_t result;
666 659  
667 660 if (!tun)
... ... @@ -669,7 +662,8 @@
669 662  
670 663 DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
671 664  
672   - result = tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
  665 + result = tun_get_user(tun, (struct iovec *)iv, iov_length(iv, count),
  666 + file->f_flags & O_NONBLOCK);
673 667  
674 668 tun_put(tun);
675 669 return result;
676 670  
677 671  
678 672  
... ... @@ -828,11 +822,40 @@
828 822 .validate = tun_validate,
829 823 };
830 824  
  825 +static void tun_sock_write_space(struct sock *sk)
  826 +{
  827 + struct tun_struct *tun;
831 828  
  829 + if (!sock_writeable(sk))
  830 + return;
  831 +
  832 + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
  833 + wake_up_interruptible_sync(sk->sk_sleep);
  834 +
  835 + if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
  836 + return;
  837 +
  838 + tun = container_of(sk, struct tun_sock, sk)->tun;
  839 + kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
  840 +}
  841 +
  842 +static void tun_sock_destruct(struct sock *sk)
  843 +{
  844 + dev_put(container_of(sk, struct tun_sock, sk)->tun->dev);
  845 +}
  846 +
  847 +static struct proto tun_proto = {
  848 + .name = "tun",
  849 + .owner = THIS_MODULE,
  850 + .obj_size = sizeof(struct tun_sock),
  851 +};
  852 +
832 853 static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
833 854 {
  855 + struct sock *sk;
834 856 struct tun_struct *tun;
835 857 struct net_device *dev;
  858 + struct tun_file *tfile = file->private_data;
836 859 int err;
837 860  
838 861 dev = __dev_get_by_name(net, ifr->ifr_name);
839 862  
840 863  
... ... @@ -885,14 +908,31 @@
885 908 tun->flags = flags;
886 909 tun->txflt.count = 0;
887 910  
  911 + err = -ENOMEM;
  912 + sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
  913 + if (!sk)
  914 + goto err_free_dev;
  915 +
  916 + /* This ref count is for tun->sk. */
  917 + dev_hold(dev);
  918 + sock_init_data(&tun->socket, sk);
  919 + sk->sk_write_space = tun_sock_write_space;
  920 + sk->sk_destruct = tun_sock_destruct;
  921 + sk->sk_sndbuf = INT_MAX;
  922 + sk->sk_sleep = &tfile->read_wait;
  923 +
  924 + tun->sk = sk;
  925 + container_of(sk, struct tun_sock, sk)->tun = tun;
  926 +
888 927 tun_net_init(dev);
889 928  
890 929 if (strchr(dev->name, '%')) {
891 930 err = dev_alloc_name(dev, dev->name);
892 931 if (err < 0)
893   - goto err_free_dev;
  932 + goto err_free_sk;
894 933 }
895 934  
  935 + err = -EINVAL;
896 936 err = register_netdevice(tun->dev);
897 937 if (err < 0)
898 938 goto err_free_dev;
... ... @@ -928,6 +968,8 @@
928 968 strcpy(ifr->ifr_name, tun->dev->name);
929 969 return 0;
930 970  
  971 + err_free_sk:
  972 + sock_put(sk);
931 973 err_free_dev:
932 974 free_netdev(dev);
933 975 failed:
... ... @@ -1012,6 +1054,7 @@
1012 1054 struct tun_struct *tun;
1013 1055 void __user* argp = (void __user*)arg;
1014 1056 struct ifreq ifr;
  1057 + int sndbuf;
1015 1058 int ret;
1016 1059  
1017 1060 if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
... ... @@ -1151,6 +1194,22 @@
1151 1194 ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
1152 1195 rtnl_unlock();
1153 1196 break;
  1197 +
  1198 + case TUNGETSNDBUF:
  1199 + sndbuf = tun->sk->sk_sndbuf;
  1200 + if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
  1201 + ret = -EFAULT;
  1202 + break;
  1203 +
  1204 + case TUNSETSNDBUF:
  1205 + if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
  1206 + ret = -EFAULT;
  1207 + break;
  1208 + }
  1209 +
  1210 + tun->sk->sk_sndbuf = sndbuf;
  1211 + break;
  1212 +
1154 1213 default:
1155 1214 ret = -EINVAL;
1156 1215 break;
1157 1216  
... ... @@ -1218,8 +1277,10 @@
1218 1277 __tun_detach(tun);
1219 1278  
1220 1279 /* If desireable, unregister the netdevice. */
1221   - if (!(tun->flags & TUN_PERSIST))
  1280 + if (!(tun->flags & TUN_PERSIST)) {
  1281 + sock_put(tun->sk);
1222 1282 unregister_netdevice(tun->dev);
  1283 + }
1223 1284  
1224 1285 rtnl_unlock();
1225 1286 }
... ... @@ -1988,6 +1988,8 @@
1988 1988 COMPATIBLE_IOCTL(TUNGETFEATURES)
1989 1989 COMPATIBLE_IOCTL(TUNSETOFFLOAD)
1990 1990 COMPATIBLE_IOCTL(TUNSETTXFILTER)
  1991 +COMPATIBLE_IOCTL(TUNGETSNDBUF)
  1992 +COMPATIBLE_IOCTL(TUNSETSNDBUF)
1991 1993 /* Big V */
1992 1994 COMPATIBLE_IOCTL(VT_SETMODE)
1993 1995 COMPATIBLE_IOCTL(VT_GETMODE)
include/linux/if_tun.h
... ... @@ -46,6 +46,8 @@
46 46 #define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
47 47 #define TUNSETTXFILTER _IOW('T', 209, unsigned int)
48 48 #define TUNGETIFF _IOR('T', 210, unsigned int)
  49 +#define TUNGETSNDBUF _IOR('T', 211, int)
  50 +#define TUNSETSNDBUF _IOW('T', 212, int)
49 51  
50 52 /* TUNSETIFF ifr flags */
51 53 #define IFF_TUN 0x0001