Commit 14deae41566b5cdd992c01d0069518ced5227c83

Authored by David S. Miller
1 parent eb4dea5853

ipv6: Fix sporadic sendmsg -EINVAL when sending to multicast groups.

Thanks to excellent diagnosis by Eduard Guzovsky.

The core problem is that on a network with lots of active
multicast traffic, the neighbour cache can fill up.  If
we try to allocate a new route and thus neighbour cache
entry, the bog-standard GC attempt the neighbour layer does
is ineffective because route entries hold a reference
to the existing neighbour entries and GC can only liberate
entries with no references.

IPV4 already has a way to handle this, by doing a route cache
GC in such situations (when neigh attach returns -ENOBUFS).

So simply mimic this on the ipv6 side.

Tested-by: Eduard Guzovsky <eguzovsky@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 2 changed files with 49 additions and 7 deletions Side-by-side Diff

... ... @@ -155,9 +155,9 @@
155 155 {
156 156  
157 157 if (dev)
158   - return __neigh_lookup(&nd_tbl, addr, dev, 1);
  158 + return __neigh_lookup_errno(&nd_tbl, addr, dev);
159 159  
160   - return NULL;
  160 + return ERR_PTR(-ENODEV);
161 161 }
162 162  
163 163  
... ... @@ -627,6 +627,9 @@
627 627 rt = ip6_rt_copy(ort);
628 628  
629 629 if (rt) {
  630 + struct neighbour *neigh;
  631 + int attempts = !in_softirq();
  632 +
630 633 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
631 634 if (rt->rt6i_dst.plen != 128 &&
632 635 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
633 636  
... ... @@ -646,8 +649,36 @@
646 649 }
647 650 #endif
648 651  
649   - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
  652 + retry:
  653 + neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
  654 + if (IS_ERR(neigh)) {
  655 + struct net *net = dev_net(rt->rt6i_dev);
  656 + int saved_rt_min_interval =
  657 + net->ipv6.sysctl.ip6_rt_gc_min_interval;
  658 + int saved_rt_elasticity =
  659 + net->ipv6.sysctl.ip6_rt_gc_elasticity;
650 660  
  661 + if (attempts-- > 0) {
  662 + net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
  663 + net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
  664 +
  665 + ip6_dst_gc(net->ipv6.ip6_dst_ops);
  666 +
  667 + net->ipv6.sysctl.ip6_rt_gc_elasticity =
  668 + saved_rt_elasticity;
  669 + net->ipv6.sysctl.ip6_rt_gc_min_interval =
  670 + saved_rt_min_interval;
  671 + goto retry;
  672 + }
  673 +
  674 + if (net_ratelimit())
  675 + printk(KERN_WARNING
  676 + "Neighbour table overflow.\n");
  677 + dst_free(&rt->u.dst);
  678 + return NULL;
  679 + }
  680 + rt->rt6i_nexthop = neigh;
  681 +
651 682 }
652 683  
653 684 return rt;
654 685  
... ... @@ -945,8 +976,11 @@
945 976 dev_hold(dev);
946 977 if (neigh)
947 978 neigh_hold(neigh);
948   - else
  979 + else {
949 980 neigh = ndisc_get_neigh(dev, addr);
  981 + if (IS_ERR(neigh))
  982 + neigh = NULL;
  983 + }
950 984  
951 985 rt->rt6i_dev = dev;
952 986 rt->rt6i_idev = idev;
... ... @@ -1887,6 +1921,7 @@
1887 1921 {
1888 1922 struct net *net = dev_net(idev->dev);
1889 1923 struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
  1924 + struct neighbour *neigh;
1890 1925  
1891 1926 if (rt == NULL)
1892 1927 return ERR_PTR(-ENOMEM);
1893 1928  
1894 1929  
... ... @@ -1909,11 +1944,18 @@
1909 1944 rt->rt6i_flags |= RTF_ANYCAST;
1910 1945 else
1911 1946 rt->rt6i_flags |= RTF_LOCAL;
1912   - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1913   - if (rt->rt6i_nexthop == NULL) {
  1947 + neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
  1948 + if (IS_ERR(neigh)) {
1914 1949 dst_free(&rt->u.dst);
1915   - return ERR_PTR(-ENOMEM);
  1950 +
  1951 + /* We are casting this because that is the return
  1952 + * value type. But an errno encoded pointer is the
  1953 + * same regardless of the underlying pointer type,
  1954 + * and that's what we are returning. So this is OK.
  1955 + */
  1956 + return (struct rt6_info *) neigh;
1916 1957 }
  1958 + rt->rt6i_nexthop = neigh;
1917 1959  
1918 1960 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1919 1961 rt->rt6i_dst.plen = 128;