Commit 941c8726e4e737e74d418ccec3d8e7b946a65541

Authored by Linus Torvalds

Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband

Pull final RDMA changes from Roland Dreier:
 - Fix IPoIB to stop using unsafe linkage between networking neighbour
   layer and private path database.
 - Small fixes for bugs found by Fengguang Wu's automated builds.

* tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband:
  IPoIB: Use a private hash table for path lookup in xmit path
  IB/qib: Fix size of cc_supported_table_entries
  RDMA/ucma: Convert open-coded equivalent to memdup_user()
  RDMA/ocrdma: Fix check of GSI CQs
  RDMA/cma: Use PTR_RET rather than if (IS_ERR(...)) + PTR_ERR

Showing 8 changed files Side-by-side Diff

drivers/infiniband/core/cma.c
... ... @@ -3064,10 +3064,7 @@
3064 3064 id_priv->id.port_num, &rec,
3065 3065 comp_mask, GFP_KERNEL,
3066 3066 cma_ib_mc_handler, mc);
3067   - if (IS_ERR(mc->multicast.ib))
3068   - return PTR_ERR(mc->multicast.ib);
3069   -
3070   - return 0;
  3067 + return PTR_RET(mc->multicast.ib);
3071 3068 }
3072 3069  
3073 3070 static void iboe_mcast_work_handler(struct work_struct *work)
drivers/infiniband/core/ucma.c
... ... @@ -1002,23 +1002,18 @@
1002 1002 if (IS_ERR(ctx))
1003 1003 return PTR_ERR(ctx);
1004 1004  
1005   - optval = kmalloc(cmd.optlen, GFP_KERNEL);
1006   - if (!optval) {
1007   - ret = -ENOMEM;
1008   - goto out1;
  1005 + optval = memdup_user((void __user *) (unsigned long) cmd.optval,
  1006 + cmd.optlen);
  1007 + if (IS_ERR(optval)) {
  1008 + ret = PTR_ERR(optval);
  1009 + goto out;
1009 1010 }
1010 1011  
1011   - if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval,
1012   - cmd.optlen)) {
1013   - ret = -EFAULT;
1014   - goto out2;
1015   - }
1016   -
1017 1012 ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
1018 1013 cmd.optlen);
1019   -out2:
1020 1014 kfree(optval);
1021   -out1:
  1015 +
  1016 +out:
1022 1017 ucma_put_ctx(ctx);
1023 1018 return ret;
1024 1019 }
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
... ... @@ -893,7 +893,9 @@
893 893 /* verify consumer QPs are not trying to use GSI QP's CQ */
894 894 if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) {
895 895 if ((dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq)) ||
896   - (dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq))) {
  896 + (dev->gsi_sqcq == get_ocrdma_cq(attrs->recv_cq)) ||
  897 + (dev->gsi_rqcq == get_ocrdma_cq(attrs->send_cq)) ||
  898 + (dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) {
897 899 ocrdma_err("%s(%d) Consumer QP cannot use GSI CQs.\n",
898 900 __func__, dev->id);
899 901 return -EINVAL;
drivers/infiniband/hw/qib/qib.h
... ... @@ -656,6 +656,11 @@
656 656 /* 16 congestion entries with each entry corresponding to a SL */
657 657 struct ib_cc_congestion_entry_shadow *congestion_entries;
658 658  
  659 + /* Maximum number of congestion control entries that the agent expects
  660 + * the manager to send.
  661 + */
  662 + u16 cc_supported_table_entries;
  663 +
659 664 /* Total number of congestion control table entries */
660 665 u16 total_cct_entry;
661 666  
... ... @@ -667,11 +672,6 @@
667 672  
668 673 /* CA's max number of 64 entry units in the congestion control table */
669 674 u8 cc_max_table_entries;
670   -
671   - /* Maximum number of congestion control entries that the agent expects
672   - * the manager to send.
673   - */
674   - u8 cc_supported_table_entries;
675 675 };
676 676  
677 677 /* Observers. Not to be taken lightly, possibly not to ship. */
drivers/infiniband/ulp/ipoib/ipoib.h
... ... @@ -92,6 +92,8 @@
92 92 IPOIB_STOP_REAPER = 7,
93 93 IPOIB_FLAG_ADMIN_CM = 9,
94 94 IPOIB_FLAG_UMCAST = 10,
  95 + IPOIB_STOP_NEIGH_GC = 11,
  96 + IPOIB_NEIGH_TBL_FLUSH = 12,
95 97  
96 98 IPOIB_MAX_BACKOFF_SECONDS = 16,
97 99  
... ... @@ -260,6 +262,20 @@
260 262 u16 max_coalesced_frames;
261 263 };
262 264  
  265 +struct ipoib_neigh_hash {
  266 + struct ipoib_neigh __rcu **buckets;
  267 + struct rcu_head rcu;
  268 + u32 mask;
  269 + u32 size;
  270 +};
  271 +
  272 +struct ipoib_neigh_table {
  273 + struct ipoib_neigh_hash __rcu *htbl;
  274 + rwlock_t rwlock;
  275 + atomic_t entries;
  276 + struct completion flushed;
  277 +};
  278 +
263 279 /*
264 280 * Device private locking: network stack tx_lock protects members used
265 281 * in TX fast path, lock protects everything else. lock nests inside
... ... @@ -279,6 +295,8 @@
279 295 struct rb_root path_tree;
280 296 struct list_head path_list;
281 297  
  298 + struct ipoib_neigh_table ntbl;
  299 +
282 300 struct ipoib_mcast *broadcast;
283 301 struct list_head multicast_list;
284 302 struct rb_root multicast_tree;
... ... @@ -291,7 +309,7 @@
291 309 struct work_struct flush_heavy;
292 310 struct work_struct restart_task;
293 311 struct delayed_work ah_reap_task;
294   -
  312 + struct delayed_work neigh_reap_task;
295 313 struct ib_device *ca;
296 314 u8 port;
297 315 u16 pkey;
298 316  
299 317  
... ... @@ -377,13 +395,16 @@
377 395 #ifdef CONFIG_INFINIBAND_IPOIB_CM
378 396 struct ipoib_cm_tx *cm;
379 397 #endif
380   - union ib_gid dgid;
  398 + u8 daddr[INFINIBAND_ALEN];
381 399 struct sk_buff_head queue;
382 400  
383   - struct neighbour *neighbour;
384 401 struct net_device *dev;
385 402  
386 403 struct list_head list;
  404 + struct ipoib_neigh __rcu *hnext;
  405 + struct rcu_head rcu;
  406 + atomic_t refcnt;
  407 + unsigned long alive;
387 408 };
388 409  
389 410 #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN)
390 411  
391 412  
392 413  
... ... @@ -394,21 +415,17 @@
394 415 return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
395 416 }
396 417  
397   -/*
398   - * We stash a pointer to our private neighbour information after our
399   - * hardware address in neigh->ha. The ALIGN() expression here makes
400   - * sure that this pointer is stored aligned so that an unaligned
401   - * load is not needed to dereference it.
402   - */
403   -static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh)
  418 +void ipoib_neigh_dtor(struct ipoib_neigh *neigh);
  419 +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)
404 420 {
405   - return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) +
406   - INFINIBAND_ALEN, sizeof(void *));
  421 + if (atomic_dec_and_test(&neigh->refcnt))
  422 + ipoib_neigh_dtor(neigh);
407 423 }
408   -
409   -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh,
  424 +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr);
  425 +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
410 426 struct net_device *dev);
411   -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
  427 +void ipoib_neigh_free(struct ipoib_neigh *neigh);
  428 +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid);
412 429  
413 430 extern struct workqueue_struct *ipoib_workqueue;
414 431  
... ... @@ -425,7 +442,6 @@
425 442 {
426 443 kref_put(&ah->ref, ipoib_free_ah);
427 444 }
428   -
429 445 int ipoib_open(struct net_device *dev);
430 446 int ipoib_add_pkey_attr(struct net_device *dev);
431 447 int ipoib_add_umcast_attr(struct net_device *dev);
... ... @@ -455,7 +471,7 @@
455 471  
456 472 void ipoib_mcast_join_task(struct work_struct *work);
457 473 void ipoib_mcast_carrier_on_task(struct work_struct *work);
458   -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb);
  474 +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
459 475  
460 476 void ipoib_mcast_restart_task(struct work_struct *work);
461 477 int ipoib_mcast_start_thread(struct net_device *dev);
462 478  
... ... @@ -517,10 +533,10 @@
517 533 test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
518 534 }
519 535  
520   -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
  536 +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
521 537 {
522 538 struct ipoib_dev_priv *priv = netdev_priv(dev);
523   - return IPOIB_CM_SUPPORTED(n->ha) &&
  539 + return IPOIB_CM_SUPPORTED(hwaddr) &&
524 540 test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
525 541 }
526 542  
... ... @@ -575,7 +591,7 @@
575 591 {
576 592 return 0;
577 593 }
578   -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
  594 +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
579 595  
580 596 {
581 597 return 0;
drivers/infiniband/ulp/ipoib/ipoib_cm.c
... ... @@ -811,9 +811,7 @@
811 811 if (neigh) {
812 812 neigh->cm = NULL;
813 813 list_del(&neigh->list);
814   - if (neigh->ah)
815   - ipoib_put_ah(neigh->ah);
816   - ipoib_neigh_free(dev, neigh);
  814 + ipoib_neigh_free(neigh);
817 815  
818 816 tx->neigh = NULL;
819 817 }
... ... @@ -1230,9 +1228,7 @@
1230 1228 if (neigh) {
1231 1229 neigh->cm = NULL;
1232 1230 list_del(&neigh->list);
1233   - if (neigh->ah)
1234   - ipoib_put_ah(neigh->ah);
1235   - ipoib_neigh_free(dev, neigh);
  1231 + ipoib_neigh_free(neigh);
1236 1232  
1237 1233 tx->neigh = NULL;
1238 1234 }
... ... @@ -1279,7 +1275,7 @@
1279 1275 list_move(&tx->list, &priv->cm.reap_list);
1280 1276 queue_work(ipoib_workqueue, &priv->cm.reap_task);
1281 1277 ipoib_dbg(priv, "Reap connection for gid %pI6\n",
1282   - tx->neigh->dgid.raw);
  1278 + tx->neigh->daddr + 4);
1283 1279 tx->neigh = NULL;
1284 1280 }
1285 1281 }
... ... @@ -1304,7 +1300,7 @@
1304 1300 p = list_entry(priv->cm.start_list.next, typeof(*p), list);
1305 1301 list_del_init(&p->list);
1306 1302 neigh = p->neigh;
1307   - qpn = IPOIB_QPN(neigh->neighbour->ha);
  1303 + qpn = IPOIB_QPN(neigh->daddr);
1308 1304 memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
1309 1305  
1310 1306 spin_unlock_irqrestore(&priv->lock, flags);
... ... @@ -1320,9 +1316,7 @@
1320 1316 if (neigh) {
1321 1317 neigh->cm = NULL;
1322 1318 list_del(&neigh->list);
1323   - if (neigh->ah)
1324   - ipoib_put_ah(neigh->ah);
1325   - ipoib_neigh_free(dev, neigh);
  1319 + ipoib_neigh_free(neigh);
1326 1320 }
1327 1321 list_del(&p->list);
1328 1322 kfree(p);
drivers/infiniband/ulp/ipoib/ipoib_main.c
... ... @@ -46,7 +46,8 @@
46 46 #include <linux/ip.h>
47 47 #include <linux/in.h>
48 48  
49   -#include <net/dst.h>
  49 +#include <linux/jhash.h>
  50 +#include <net/arp.h>
50 51  
51 52 MODULE_AUTHOR("Roland Dreier");
52 53 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
... ... @@ -84,6 +85,7 @@
84 85  
85 86 static void ipoib_add_one(struct ib_device *device);
86 87 static void ipoib_remove_one(struct ib_device *device);
  88 +static void ipoib_neigh_reclaim(struct rcu_head *rp);
87 89  
88 90 static struct ib_client ipoib_client = {
89 91 .name = "ipoib",
90 92  
91 93  
92 94  
93 95  
... ... @@ -264,31 +266,16 @@
264 266  
265 267 static void path_free(struct net_device *dev, struct ipoib_path *path)
266 268 {
267   - struct ipoib_dev_priv *priv = netdev_priv(dev);
268   - struct ipoib_neigh *neigh, *tn;
269 269 struct sk_buff *skb;
270   - unsigned long flags;
271 270  
272 271 while ((skb = __skb_dequeue(&path->queue)))
273 272 dev_kfree_skb_irq(skb);
274 273  
275   - spin_lock_irqsave(&priv->lock, flags);
  274 + ipoib_dbg(netdev_priv(dev), "path_free\n");
276 275  
277   - list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
278   - /*
279   - * It's safe to call ipoib_put_ah() inside priv->lock
280   - * here, because we know that path->ah will always
281   - * hold one more reference, so ipoib_put_ah() will
282   - * never do more than decrement the ref count.
283   - */
284   - if (neigh->ah)
285   - ipoib_put_ah(neigh->ah);
  276 + /* remove all neigh connected to this path */
  277 + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
286 278  
287   - ipoib_neigh_free(dev, neigh);
288   - }
289   -
290   - spin_unlock_irqrestore(&priv->lock, flags);
291   -
292 279 if (path->ah)
293 280 ipoib_put_ah(path->ah);
294 281  
295 282  
296 283  
... ... @@ -458,19 +445,15 @@
458 445 }
459 446 kref_get(&path->ah->ref);
460 447 neigh->ah = path->ah;
461   - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
462   - sizeof(union ib_gid));
463 448  
464   - if (ipoib_cm_enabled(dev, neigh->neighbour)) {
  449 + if (ipoib_cm_enabled(dev, neigh->daddr)) {
465 450 if (!ipoib_cm_get(neigh))
466 451 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
467 452 path,
468 453 neigh));
469 454 if (!ipoib_cm_get(neigh)) {
470 455 list_del(&neigh->list);
471   - if (neigh->ah)
472   - ipoib_put_ah(neigh->ah);
473   - ipoib_neigh_free(dev, neigh);
  456 + ipoib_neigh_free(neigh);
474 457 continue;
475 458 }
476 459 }
477 460  
... ... @@ -555,15 +538,15 @@
555 538 return 0;
556 539 }
557 540  
558   -/* called with rcu_read_lock */
559   -static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_device *dev)
  541 +static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
  542 + struct net_device *dev)
560 543 {
561 544 struct ipoib_dev_priv *priv = netdev_priv(dev);
562 545 struct ipoib_path *path;
563 546 struct ipoib_neigh *neigh;
564 547 unsigned long flags;
565 548  
566   - neigh = ipoib_neigh_alloc(n, skb->dev);
  549 + neigh = ipoib_neigh_alloc(daddr, dev);
567 550 if (!neigh) {
568 551 ++dev->stats.tx_dropped;
569 552 dev_kfree_skb_any(skb);
570 553  
... ... @@ -572,9 +555,9 @@
572 555  
573 556 spin_lock_irqsave(&priv->lock, flags);
574 557  
575   - path = __path_find(dev, n->ha + 4);
  558 + path = __path_find(dev, daddr + 4);
576 559 if (!path) {
577   - path = path_rec_create(dev, n->ha + 4);
  560 + path = path_rec_create(dev, daddr + 4);
578 561 if (!path)
579 562 goto err_path;
580 563  
581 564  
582 565  
... ... @@ -586,17 +569,13 @@
586 569 if (path->ah) {
587 570 kref_get(&path->ah->ref);
588 571 neigh->ah = path->ah;
589   - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
590   - sizeof(union ib_gid));
591 572  
592   - if (ipoib_cm_enabled(dev, neigh->neighbour)) {
  573 + if (ipoib_cm_enabled(dev, neigh->daddr)) {
593 574 if (!ipoib_cm_get(neigh))
594 575 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
595 576 if (!ipoib_cm_get(neigh)) {
596 577 list_del(&neigh->list);
597   - if (neigh->ah)
598   - ipoib_put_ah(neigh->ah);
599   - ipoib_neigh_free(dev, neigh);
  578 + ipoib_neigh_free(neigh);
600 579 goto err_drop;
601 580 }
602 581 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
... ... @@ -608,7 +587,8 @@
608 587 }
609 588 } else {
610 589 spin_unlock_irqrestore(&priv->lock, flags);
611   - ipoib_send(dev, skb, path->ah, IPOIB_QPN(n->ha));
  590 + ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
  591 + ipoib_neigh_put(neigh);
612 592 return;
613 593 }
614 594 } else {
615 595  
616 596  
617 597  
... ... @@ -621,37 +601,22 @@
621 601 }
622 602  
623 603 spin_unlock_irqrestore(&priv->lock, flags);
  604 + ipoib_neigh_put(neigh);
624 605 return;
625 606  
626 607 err_list:
627 608 list_del(&neigh->list);
628 609  
629 610 err_path:
630   - ipoib_neigh_free(dev, neigh);
  611 + ipoib_neigh_free(neigh);
631 612 err_drop:
632 613 ++dev->stats.tx_dropped;
633 614 dev_kfree_skb_any(skb);
634 615  
635 616 spin_unlock_irqrestore(&priv->lock, flags);
  617 + ipoib_neigh_put(neigh);
636 618 }
637 619  
638   -/* called with rcu_read_lock */
639   -static void ipoib_path_lookup(struct sk_buff *skb, struct neighbour *n, struct net_device *dev)
640   -{
641   - struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
642   -
643   - /* Look up path record for unicasts */
644   - if (n->ha[4] != 0xff) {
645   - neigh_add_path(skb, n, dev);
646   - return;
647   - }
648   -
649   - /* Add in the P_Key for multicasts */
650   - n->ha[8] = (priv->pkey >> 8) & 0xff;
651   - n->ha[9] = priv->pkey & 0xff;
652   - ipoib_mcast_send(dev, n->ha + 4, skb);
653   -}
654   -
655 620 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
656 621 struct ipoib_cb *cb)
657 622 {
658 623  
659 624  
660 625  
661 626  
662 627  
663 628  
664 629  
665 630  
666 631  
667 632  
668 633  
669 634  
... ... @@ -710,96 +675,80 @@
710 675 {
711 676 struct ipoib_dev_priv *priv = netdev_priv(dev);
712 677 struct ipoib_neigh *neigh;
713   - struct neighbour *n = NULL;
  678 + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
  679 + struct ipoib_header *header;
714 680 unsigned long flags;
715 681  
716   - rcu_read_lock();
717   - if (likely(skb_dst(skb))) {
718   - n = dst_neigh_lookup_skb(skb_dst(skb), skb);
719   - if (!n) {
  682 + header = (struct ipoib_header *) skb->data;
  683 +
  684 + if (unlikely(cb->hwaddr[4] == 0xff)) {
  685 + /* multicast, arrange "if" according to probability */
  686 + if ((header->proto != htons(ETH_P_IP)) &&
  687 + (header->proto != htons(ETH_P_IPV6)) &&
  688 + (header->proto != htons(ETH_P_ARP)) &&
  689 + (header->proto != htons(ETH_P_RARP))) {
  690 + /* ethertype not supported by IPoIB */
720 691 ++dev->stats.tx_dropped;
721 692 dev_kfree_skb_any(skb);
722   - goto unlock;
  693 + return NETDEV_TX_OK;
723 694 }
  695 + /* Add in the P_Key for multicast */
  696 + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
  697 + cb->hwaddr[9] = priv->pkey & 0xff;
  698 +
  699 + neigh = ipoib_neigh_get(dev, cb->hwaddr);
  700 + if (likely(neigh))
  701 + goto send_using_neigh;
  702 + ipoib_mcast_send(dev, cb->hwaddr, skb);
  703 + return NETDEV_TX_OK;
724 704 }
725   - if (likely(n)) {
726   - if (unlikely(!*to_ipoib_neigh(n))) {
727   - ipoib_path_lookup(skb, n, dev);
728   - goto unlock;
729   - }
730 705  
731   - neigh = *to_ipoib_neigh(n);
732   -
733   - if (unlikely((memcmp(&neigh->dgid.raw,
734   - n->ha + 4,
735   - sizeof(union ib_gid))) ||
736   - (neigh->dev != dev))) {
737   - spin_lock_irqsave(&priv->lock, flags);
738   - /*
739   - * It's safe to call ipoib_put_ah() inside
740   - * priv->lock here, because we know that
741   - * path->ah will always hold one more reference,
742   - * so ipoib_put_ah() will never do more than
743   - * decrement the ref count.
744   - */
745   - if (neigh->ah)
746   - ipoib_put_ah(neigh->ah);
747   - list_del(&neigh->list);
748   - ipoib_neigh_free(dev, neigh);
749   - spin_unlock_irqrestore(&priv->lock, flags);
750   - ipoib_path_lookup(skb, n, dev);
751   - goto unlock;
  706 + /* unicast, arrange "switch" according to probability */
  707 + switch (header->proto) {
  708 + case htons(ETH_P_IP):
  709 + case htons(ETH_P_IPV6):
  710 + neigh = ipoib_neigh_get(dev, cb->hwaddr);
  711 + if (unlikely(!neigh)) {
  712 + neigh_add_path(skb, cb->hwaddr, dev);
  713 + return NETDEV_TX_OK;
752 714 }
  715 + break;
  716 + case htons(ETH_P_ARP):
  717 + case htons(ETH_P_RARP):
  718 + /* for unicast ARP and RARP should always perform path find */
  719 + unicast_arp_send(skb, dev, cb);
  720 + return NETDEV_TX_OK;
  721 + default:
  722 + /* ethertype not supported by IPoIB */
  723 + ++dev->stats.tx_dropped;
  724 + dev_kfree_skb_any(skb);
  725 + return NETDEV_TX_OK;
  726 + }
753 727  
754   - if (ipoib_cm_get(neigh)) {
755   - if (ipoib_cm_up(neigh)) {
756   - ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
757   - goto unlock;
758   - }
759   - } else if (neigh->ah) {
760   - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(n->ha));
761   - goto unlock;
  728 +send_using_neigh:
  729 + /* note we now hold a ref to neigh */
  730 + if (ipoib_cm_get(neigh)) {
  731 + if (ipoib_cm_up(neigh)) {
  732 + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
  733 + goto unref;
762 734 }
  735 + } else if (neigh->ah) {
  736 + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
  737 + goto unref;
  738 + }
763 739  
764   - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
765   - spin_lock_irqsave(&priv->lock, flags);
766   - __skb_queue_tail(&neigh->queue, skb);
767   - spin_unlock_irqrestore(&priv->lock, flags);
768   - } else {
769   - ++dev->stats.tx_dropped;
770   - dev_kfree_skb_any(skb);
771   - }
  740 + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
  741 + spin_lock_irqsave(&priv->lock, flags);
  742 + __skb_queue_tail(&neigh->queue, skb);
  743 + spin_unlock_irqrestore(&priv->lock, flags);
772 744 } else {
773   - struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
  745 + ++dev->stats.tx_dropped;
  746 + dev_kfree_skb_any(skb);
  747 + }
774 748  
775   - if (cb->hwaddr[4] == 0xff) {
776   - /* Add in the P_Key for multicast*/
777   - cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
778   - cb->hwaddr[9] = priv->pkey & 0xff;
  749 +unref:
  750 + ipoib_neigh_put(neigh);
779 751  
780   - ipoib_mcast_send(dev, cb->hwaddr + 4, skb);
781   - } else {
782   - /* unicast GID -- should be ARP or RARP reply */
783   -
784   - if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
785   - (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
786   - ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n",
787   - skb_dst(skb) ? "neigh" : "dst",
788   - be16_to_cpup((__be16 *) skb->data),
789   - IPOIB_QPN(cb->hwaddr),
790   - cb->hwaddr + 4);
791   - dev_kfree_skb_any(skb);
792   - ++dev->stats.tx_dropped;
793   - goto unlock;
794   - }
795   -
796   - unicast_arp_send(skb, dev, cb);
797   - }
798   - }
799   -unlock:
800   - if (n)
801   - neigh_release(n);
802   - rcu_read_unlock();
803 752 return NETDEV_TX_OK;
804 753 }
805 754  
... ... @@ -821,6 +770,7 @@
821 770 const void *daddr, const void *saddr, unsigned len)
822 771 {
823 772 struct ipoib_header *header;
  773 + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
824 774  
825 775 header = (struct ipoib_header *) skb_push(skb, sizeof *header);
826 776  
827 777  
... ... @@ -828,14 +778,11 @@
828 778 header->reserved = 0;
829 779  
830 780 /*
831   - * If we don't have a dst_entry structure, stuff the
  781 + * we don't rely on dst_entry structure, always stuff the
832 782 * destination address into skb->cb so we can figure out where
833 783 * to send the packet later.
834 784 */
835   - if (!skb_dst(skb)) {
836   - struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
837   - memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
838   - }
  785 + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
839 786  
840 787 return 0;
841 788 }
842 789  
843 790  
844 791  
845 792  
846 793  
847 794  
848 795  
849 796  
850 797  
851 798  
852 799  
853 800  
854 801  
855 802  
856 803  
857 804  
858 805  
859 806  
860 807  
861 808  
862 809  
863 810  
864 811  
865 812  
866 813  
... ... @@ -852,86 +799,438 @@
852 799 queue_work(ipoib_workqueue, &priv->restart_task);
853 800 }
854 801  
855   -static void ipoib_neigh_cleanup(struct neighbour *n)
  802 +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
856 803 {
857   - struct ipoib_neigh *neigh;
858   - struct ipoib_dev_priv *priv = netdev_priv(n->dev);
  804 + /*
  805 + * Use only the address parts that contribute to spreading.
  806 + * The subnet prefix is not used as one cannot connect to
  807 + * same remote port (GUID) using the same remote QPN via two
  808 + * different subnets.
  809 + */
  810 + /* qpn octets[1:4) & port GUID octets[12:20) */
  811 + u32 *daddr_32 = (u32 *) daddr;
  812 + u32 hv;
  813 +
  814 + hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0);
  815 + return hv & htbl->mask;
  816 +}
  817 +
  818 +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
  819 +{
  820 + struct ipoib_dev_priv *priv = netdev_priv(dev);
  821 + struct ipoib_neigh_table *ntbl = &priv->ntbl;
  822 + struct ipoib_neigh_hash *htbl;
  823 + struct ipoib_neigh *neigh = NULL;
  824 + u32 hash_val;
  825 +
  826 + rcu_read_lock_bh();
  827 +
  828 + htbl = rcu_dereference_bh(ntbl->htbl);
  829 +
  830 + if (!htbl)
  831 + goto out_unlock;
  832 +
  833 + hash_val = ipoib_addr_hash(htbl, daddr);
  834 + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
  835 + neigh != NULL;
  836 + neigh = rcu_dereference_bh(neigh->hnext)) {
  837 + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
  838 + /* found, take one ref on behalf of the caller */
  839 + if (!atomic_inc_not_zero(&neigh->refcnt)) {
  840 + /* deleted */
  841 + neigh = NULL;
  842 + goto out_unlock;
  843 + }
  844 + neigh->alive = jiffies;
  845 + goto out_unlock;
  846 + }
  847 + }
  848 +
  849 +out_unlock:
  850 + rcu_read_unlock_bh();
  851 + return neigh;
  852 +}
  853 +
  854 +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
  855 +{
  856 + struct ipoib_neigh_table *ntbl = &priv->ntbl;
  857 + struct ipoib_neigh_hash *htbl;
  858 + unsigned long neigh_obsolete;
  859 + unsigned long dt;
859 860 unsigned long flags;
860   - struct ipoib_ah *ah = NULL;
  861 + int i;
861 862  
862   - neigh = *to_ipoib_neigh(n);
863   - if (neigh)
864   - priv = netdev_priv(neigh->dev);
865   - else
  863 + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
866 864 return;
867   - ipoib_dbg(priv,
868   - "neigh_cleanup for %06x %pI6\n",
869   - IPOIB_QPN(n->ha),
870   - n->ha + 4);
871 865  
872   - spin_lock_irqsave(&priv->lock, flags);
  866 + write_lock_bh(&ntbl->rwlock);
873 867  
874   - if (neigh->ah)
875   - ah = neigh->ah;
876   - list_del(&neigh->list);
877   - ipoib_neigh_free(n->dev, neigh);
  868 + htbl = rcu_dereference_protected(ntbl->htbl,
  869 + lockdep_is_held(&ntbl->rwlock));
878 870  
879   - spin_unlock_irqrestore(&priv->lock, flags);
  871 + if (!htbl)
  872 + goto out_unlock;
880 873  
881   - if (ah)
882   - ipoib_put_ah(ah);
  874 + /* neigh is obsolete if it was idle for two GC periods */
  875 + dt = 2 * arp_tbl.gc_interval;
  876 + neigh_obsolete = jiffies - dt;
  877 + /* handle possible race condition */
  878 + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
  879 + goto out_unlock;
  880 +
  881 + for (i = 0; i < htbl->size; i++) {
  882 + struct ipoib_neigh *neigh;
  883 + struct ipoib_neigh __rcu **np = &htbl->buckets[i];
  884 +
  885 + while ((neigh = rcu_dereference_protected(*np,
  886 + lockdep_is_held(&ntbl->rwlock))) != NULL) {
  887 + /* was the neigh idle for two GC periods */
  888 + if (time_after(neigh_obsolete, neigh->alive)) {
  889 + rcu_assign_pointer(*np,
  890 + rcu_dereference_protected(neigh->hnext,
  891 + lockdep_is_held(&ntbl->rwlock)));
  892 + /* remove from path/mc list */
  893 + spin_lock_irqsave(&priv->lock, flags);
  894 + list_del(&neigh->list);
  895 + spin_unlock_irqrestore(&priv->lock, flags);
  896 + call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
  897 + } else {
  898 + np = &neigh->hnext;
  899 + }
  900 +
  901 + }
  902 + }
  903 +
  904 +out_unlock:
  905 + write_unlock_bh(&ntbl->rwlock);
883 906 }
884 907  
885   -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,
  908 +static void ipoib_reap_neigh(struct work_struct *work)
  909 +{
  910 + struct ipoib_dev_priv *priv =
  911 + container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
  912 +
  913 + __ipoib_reap_neigh(priv);
  914 +
  915 + if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
  916 + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
  917 + arp_tbl.gc_interval);
  918 +}
  919 +
  920 +
  921 +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
886 922 struct net_device *dev)
887 923 {
888 924 struct ipoib_neigh *neigh;
889 925  
890   - neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
  926 + neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
891 927 if (!neigh)
892 928 return NULL;
893 929  
894   - neigh->neighbour = neighbour;
895 930 neigh->dev = dev;
896   - memset(&neigh->dgid.raw, 0, sizeof (union ib_gid));
897   - *to_ipoib_neigh(neighbour) = neigh;
  931 + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
898 932 skb_queue_head_init(&neigh->queue);
  933 + INIT_LIST_HEAD(&neigh->list);
899 934 ipoib_cm_set(neigh, NULL);
  935 + /* one ref on behalf of the caller */
  936 + atomic_set(&neigh->refcnt, 1);
900 937  
901 938 return neigh;
902 939 }
903 940  
904   -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
  941 +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
  942 + struct net_device *dev)
905 943 {
  944 + struct ipoib_dev_priv *priv = netdev_priv(dev);
  945 + struct ipoib_neigh_table *ntbl = &priv->ntbl;
  946 + struct ipoib_neigh_hash *htbl;
  947 + struct ipoib_neigh *neigh;
  948 + u32 hash_val;
  949 +
  950 + write_lock_bh(&ntbl->rwlock);
  951 +
  952 + htbl = rcu_dereference_protected(ntbl->htbl,
  953 + lockdep_is_held(&ntbl->rwlock));
  954 + if (!htbl) {
  955 + neigh = NULL;
  956 + goto out_unlock;
  957 + }
  958 +
  959 + /* need to add a new neigh, but maybe some other thread succeeded?
  960 + * recalc hash, maybe hash resize took place so we do a search
  961 + */
  962 + hash_val = ipoib_addr_hash(htbl, daddr);
  963 + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
  964 + lockdep_is_held(&ntbl->rwlock));
  965 + neigh != NULL;
  966 + neigh = rcu_dereference_protected(neigh->hnext,
  967 + lockdep_is_held(&ntbl->rwlock))) {
  968 + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
  969 + /* found, take one ref on behalf of the caller */
  970 + if (!atomic_inc_not_zero(&neigh->refcnt)) {
  971 + /* deleted */
  972 + neigh = NULL;
  973 + break;
  974 + }
  975 + neigh->alive = jiffies;
  976 + goto out_unlock;
  977 + }
  978 + }
  979 +
  980 + neigh = ipoib_neigh_ctor(daddr, dev);
  981 + if (!neigh)
  982 + goto out_unlock;
  983 +
  984 + /* one ref on behalf of the hash table */
  985 + atomic_inc(&neigh->refcnt);
  986 + neigh->alive = jiffies;
  987 + /* put in hash */
  988 + rcu_assign_pointer(neigh->hnext,
  989 + rcu_dereference_protected(htbl->buckets[hash_val],
  990 + lockdep_is_held(&ntbl->rwlock)));
  991 + rcu_assign_pointer(htbl->buckets[hash_val], neigh);
  992 + atomic_inc(&ntbl->entries);
  993 +
  994 +out_unlock:
  995 + write_unlock_bh(&ntbl->rwlock);
  996 +
  997 + return neigh;
  998 +}
  999 +
  1000 +void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
  1001 +{
  1002 + /* neigh reference count was dropped to zero */
  1003 + struct net_device *dev = neigh->dev;
  1004 + struct ipoib_dev_priv *priv = netdev_priv(dev);
906 1005 struct sk_buff *skb;
907   - *to_ipoib_neigh(neigh->neighbour) = NULL;
  1006 + if (neigh->ah)
  1007 + ipoib_put_ah(neigh->ah);
908 1008 while ((skb = __skb_dequeue(&neigh->queue))) {
909 1009 ++dev->stats.tx_dropped;
910 1010 dev_kfree_skb_any(skb);
911 1011 }
912 1012 if (ipoib_cm_get(neigh))
913 1013 ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
  1014 + ipoib_dbg(netdev_priv(dev),
  1015 + "neigh free for %06x %pI6\n",
  1016 + IPOIB_QPN(neigh->daddr),
  1017 + neigh->daddr + 4);
914 1018 kfree(neigh);
  1019 + if (atomic_dec_and_test(&priv->ntbl.entries)) {
  1020 + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
  1021 + complete(&priv->ntbl.flushed);
  1022 + }
915 1023 }
916 1024  
917   -static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
  1025 +static void ipoib_neigh_reclaim(struct rcu_head *rp)
918 1026 {
919   - parms->neigh_cleanup = ipoib_neigh_cleanup;
  1027 + /* Called as a result of removal from hash table */
  1028 + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
  1029 + /* note TX context may hold another ref */
  1030 + ipoib_neigh_put(neigh);
  1031 +}
920 1032  
  1033 +void ipoib_neigh_free(struct ipoib_neigh *neigh)
  1034 +{
  1035 + struct net_device *dev = neigh->dev;
  1036 + struct ipoib_dev_priv *priv = netdev_priv(dev);
  1037 + struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1038 + struct ipoib_neigh_hash *htbl;
  1039 + struct ipoib_neigh __rcu **np;
  1040 + struct ipoib_neigh *n;
  1041 + u32 hash_val;
  1042 +
  1043 + write_lock_bh(&ntbl->rwlock);
  1044 +
  1045 + htbl = rcu_dereference_protected(ntbl->htbl,
  1046 + lockdep_is_held(&ntbl->rwlock));
  1047 + if (!htbl)
  1048 + goto out_unlock;
  1049 +
  1050 + hash_val = ipoib_addr_hash(htbl, neigh->daddr);
  1051 + np = &htbl->buckets[hash_val];
  1052 + for (n = rcu_dereference_protected(*np,
  1053 + lockdep_is_held(&ntbl->rwlock));
  1054 + n != NULL;
  1055 + n = rcu_dereference_protected(neigh->hnext,
  1056 + lockdep_is_held(&ntbl->rwlock))) {
  1057 + if (n == neigh) {
  1058 + /* found */
  1059 + rcu_assign_pointer(*np,
  1060 + rcu_dereference_protected(neigh->hnext,
  1061 + lockdep_is_held(&ntbl->rwlock)));
  1062 + call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
  1063 + goto out_unlock;
  1064 + } else {
  1065 + np = &n->hnext;
  1066 + }
  1067 + }
  1068 +
  1069 +out_unlock:
  1070 + write_unlock_bh(&ntbl->rwlock);
  1071 +
  1072 +}
  1073 +
  1074 +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
  1075 +{
  1076 + struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1077 + struct ipoib_neigh_hash *htbl;
  1078 + struct ipoib_neigh **buckets;
  1079 + u32 size;
  1080 +
  1081 + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
  1082 + ntbl->htbl = NULL;
  1083 + rwlock_init(&ntbl->rwlock);
  1084 + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
  1085 + if (!htbl)
  1086 + return -ENOMEM;
  1087 + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
  1088 + size = roundup_pow_of_two(arp_tbl.gc_thresh3);
  1089 + buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
  1090 + if (!buckets) {
  1091 + kfree(htbl);
  1092 + return -ENOMEM;
  1093 + }
  1094 + htbl->size = size;
  1095 + htbl->mask = (size - 1);
  1096 + htbl->buckets = buckets;
  1097 + ntbl->htbl = htbl;
  1098 + atomic_set(&ntbl->entries, 0);
  1099 +
  1100 + /* start garbage collection */
  1101 + clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
  1102 + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
  1103 + arp_tbl.gc_interval);
  1104 +
921 1105 return 0;
922 1106 }
923 1107  
  1108 +static void neigh_hash_free_rcu(struct rcu_head *head)
  1109 +{
  1110 + struct ipoib_neigh_hash *htbl = container_of(head,
  1111 + struct ipoib_neigh_hash,
  1112 + rcu);
  1113 + struct ipoib_neigh __rcu **buckets = htbl->buckets;
  1114 +
  1115 + kfree(buckets);
  1116 + kfree(htbl);
  1117 +}
  1118 +
  1119 +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
  1120 +{
  1121 + struct ipoib_dev_priv *priv = netdev_priv(dev);
  1122 + struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1123 + struct ipoib_neigh_hash *htbl;
  1124 + unsigned long flags;
  1125 + int i;
  1126 +
  1127 + /* remove all neigh connected to a given path or mcast */
  1128 + write_lock_bh(&ntbl->rwlock);
  1129 +
  1130 + htbl = rcu_dereference_protected(ntbl->htbl,
  1131 + lockdep_is_held(&ntbl->rwlock));
  1132 +
  1133 + if (!htbl)
  1134 + goto out_unlock;
  1135 +
  1136 + for (i = 0; i < htbl->size; i++) {
  1137 + struct ipoib_neigh *neigh;
  1138 + struct ipoib_neigh __rcu **np = &htbl->buckets[i];
  1139 +
  1140 + while ((neigh = rcu_dereference_protected(*np,
  1141 + lockdep_is_held(&ntbl->rwlock))) != NULL) {
  1142 + /* delete neighs belong to this parent */
  1143 + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
  1144 + rcu_assign_pointer(*np,
  1145 + rcu_dereference_protected(neigh->hnext,
  1146 + lockdep_is_held(&ntbl->rwlock)));
  1147 + /* remove from parent list */
  1148 + spin_lock_irqsave(&priv->lock, flags);
  1149 + list_del(&neigh->list);
  1150 + spin_unlock_irqrestore(&priv->lock, flags);
  1151 + call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
  1152 + } else {
  1153 + np = &neigh->hnext;
  1154 + }
  1155 +
  1156 + }
  1157 + }
  1158 +out_unlock:
  1159 + write_unlock_bh(&ntbl->rwlock);
  1160 +}
  1161 +
  1162 +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
  1163 +{
  1164 + struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1165 + struct ipoib_neigh_hash *htbl;
  1166 + unsigned long flags;
  1167 + int i;
  1168 +
  1169 + write_lock_bh(&ntbl->rwlock);
  1170 +
  1171 + htbl = rcu_dereference_protected(ntbl->htbl,
  1172 + lockdep_is_held(&ntbl->rwlock));
  1173 + if (!htbl)
  1174 + goto out_unlock;
  1175 +
  1176 + for (i = 0; i < htbl->size; i++) {
  1177 + struct ipoib_neigh *neigh;
  1178 + struct ipoib_neigh __rcu **np = &htbl->buckets[i];
  1179 +
  1180 + while ((neigh = rcu_dereference_protected(*np,
  1181 + lockdep_is_held(&ntbl->rwlock))) != NULL) {
  1182 + rcu_assign_pointer(*np,
  1183 + rcu_dereference_protected(neigh->hnext,
  1184 + lockdep_is_held(&ntbl->rwlock)));
  1185 + /* remove from path/mc list */
  1186 + spin_lock_irqsave(&priv->lock, flags);
  1187 + list_del(&neigh->list);
  1188 + spin_unlock_irqrestore(&priv->lock, flags);
  1189 + call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
  1190 + }
  1191 + }
  1192 +
  1193 + rcu_assign_pointer(ntbl->htbl, NULL);
  1194 + call_rcu(&htbl->rcu, neigh_hash_free_rcu);
  1195 +
  1196 +out_unlock:
  1197 + write_unlock_bh(&ntbl->rwlock);
  1198 +}
  1199 +
  1200 +static void ipoib_neigh_hash_uninit(struct net_device *dev)
  1201 +{
  1202 + struct ipoib_dev_priv *priv = netdev_priv(dev);
  1203 + int stopped;
  1204 +
  1205 + ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
  1206 + init_completion(&priv->ntbl.flushed);
  1207 + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
  1208 +
  1209 + /* Stop GC if called at init fail need to cancel work */
  1210 + stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
  1211 + if (!stopped)
  1212 + cancel_delayed_work(&priv->neigh_reap_task);
  1213 +
  1214 + if (atomic_read(&priv->ntbl.entries)) {
  1215 + ipoib_flush_neighs(priv);
  1216 + wait_for_completion(&priv->ntbl.flushed);
  1217 + }
  1218 +}
  1219 +
  1220 +
924 1221 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
925 1222 {
926 1223 struct ipoib_dev_priv *priv = netdev_priv(dev);
927 1224  
  1225 + if (ipoib_neigh_hash_init(priv) < 0)
  1226 + goto out;
928 1227 /* Allocate RX/TX "rings" to hold queued skbs */
929 1228 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
930 1229 GFP_KERNEL);
931 1230 if (!priv->rx_ring) {
932 1231 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
933 1232 ca->name, ipoib_recvq_size);
934   - goto out;
  1233 + goto out_neigh_hash_cleanup;
935 1234 }
936 1235  
937 1236 priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
... ... @@ -954,6 +1253,8 @@
954 1253 out_rx_ring_cleanup:
955 1254 kfree(priv->rx_ring);
956 1255  
  1256 +out_neigh_hash_cleanup:
  1257 + ipoib_neigh_hash_uninit(dev);
957 1258 out:
958 1259 return -ENOMEM;
959 1260 }
... ... @@ -966,6 +1267,9 @@
966 1267  
967 1268 /* Delete any child interfaces first */
968 1269 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
  1270 + /* Stop GC on child */
  1271 + set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
  1272 + cancel_delayed_work(&cpriv->neigh_reap_task);
969 1273 unregister_netdev(cpriv->dev);
970 1274 ipoib_dev_cleanup(cpriv->dev);
971 1275 free_netdev(cpriv->dev);
... ... @@ -978,6 +1282,8 @@
978 1282  
979 1283 priv->rx_ring = NULL;
980 1284 priv->tx_ring = NULL;
  1285 +
  1286 + ipoib_neigh_hash_uninit(dev);
981 1287 }
982 1288  
983 1289 static const struct header_ops ipoib_header_ops = {
... ... @@ -992,7 +1298,6 @@
992 1298 .ndo_start_xmit = ipoib_start_xmit,
993 1299 .ndo_tx_timeout = ipoib_timeout,
994 1300 .ndo_set_rx_mode = ipoib_set_mcast_list,
995   - .ndo_neigh_setup = ipoib_neigh_setup_dev,
996 1301 };
997 1302  
998 1303 static void ipoib_setup(struct net_device *dev)
... ... @@ -1041,6 +1346,7 @@
1041 1346 INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
1042 1347 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
1043 1348 INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
  1349 + INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
1044 1350 }
1045 1351  
1046 1352 struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
... ... @@ -1281,6 +1587,9 @@
1281 1587  
1282 1588 register_failed:
1283 1589 ib_unregister_event_handler(&priv->event_handler);
  1590 + /* Stop GC if started before flush */
  1591 + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
  1592 + cancel_delayed_work(&priv->neigh_reap_task);
1284 1593 flush_workqueue(ipoib_workqueue);
1285 1594  
1286 1595 event_failed:
... ... @@ -1347,6 +1656,9 @@
1347 1656 dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
1348 1657 rtnl_unlock();
1349 1658  
  1659 + /* Stop GC */
  1660 + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
  1661 + cancel_delayed_work(&priv->neigh_reap_task);
1350 1662 flush_workqueue(ipoib_workqueue);
1351 1663  
1352 1664 unregister_netdev(priv->dev);
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
... ... @@ -69,29 +69,14 @@
69 69 static void ipoib_mcast_free(struct ipoib_mcast *mcast)
70 70 {
71 71 struct net_device *dev = mcast->dev;
72   - struct ipoib_dev_priv *priv = netdev_priv(dev);
73   - struct ipoib_neigh *neigh, *tmp;
74 72 int tx_dropped = 0;
75 73  
76 74 ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n",
77 75 mcast->mcmember.mgid.raw);
78 76  
79   - spin_lock_irq(&priv->lock);
  77 + /* remove all neigh connected to this mcast */
  78 + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw);
80 79  
81   - list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
82   - /*
83   - * It's safe to call ipoib_put_ah() inside priv->lock
84   - * here, because we know that mcast->ah will always
85   - * hold one more reference, so ipoib_put_ah() will
86   - * never do more than decrement the ref count.
87   - */
88   - if (neigh->ah)
89   - ipoib_put_ah(neigh->ah);
90   - ipoib_neigh_free(dev, neigh);
91   - }
92   -
93   - spin_unlock_irq(&priv->lock);
94   -
95 80 if (mcast->ah)
96 81 ipoib_put_ah(mcast->ah);
97 82  
98 83  
99 84  
100 85  
101 86  
... ... @@ -655,18 +640,13 @@
655 640 return 0;
656 641 }
657 642  
658   -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
  643 +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
659 644 {
660 645 struct ipoib_dev_priv *priv = netdev_priv(dev);
661   - struct dst_entry *dst = skb_dst(skb);
662 646 struct ipoib_mcast *mcast;
663   - struct neighbour *n;
664 647 unsigned long flags;
  648 + void *mgid = daddr + 4;
665 649  
666   - n = NULL;
667   - if (dst)
668   - n = dst_neigh_lookup_skb(dst, skb);
669   -
670 650 spin_lock_irqsave(&priv->lock, flags);
671 651  
672 652 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) ||
673 653  
674 654  
675 655  
676 656  
... ... @@ -721,28 +701,29 @@
721 701  
722 702 out:
723 703 if (mcast && mcast->ah) {
724   - if (n) {
725   - if (!*to_ipoib_neigh(n)) {
726   - struct ipoib_neigh *neigh;
  704 + struct ipoib_neigh *neigh;
727 705  
728   - neigh = ipoib_neigh_alloc(n, skb->dev);
729   - if (neigh) {
730   - kref_get(&mcast->ah->ref);
731   - neigh->ah = mcast->ah;
732   - list_add_tail(&neigh->list,
733   - &mcast->neigh_list);
734   - }
  706 + spin_unlock_irqrestore(&priv->lock, flags);
  707 + neigh = ipoib_neigh_get(dev, daddr);
  708 + spin_lock_irqsave(&priv->lock, flags);
  709 + if (!neigh) {
  710 + spin_unlock_irqrestore(&priv->lock, flags);
  711 + neigh = ipoib_neigh_alloc(daddr, dev);
  712 + spin_lock_irqsave(&priv->lock, flags);
  713 + if (neigh) {
  714 + kref_get(&mcast->ah->ref);
  715 + neigh->ah = mcast->ah;
  716 + list_add_tail(&neigh->list, &mcast->neigh_list);
735 717 }
736   - neigh_release(n);
737 718 }
738 719 spin_unlock_irqrestore(&priv->lock, flags);
739 720 ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
  721 + if (neigh)
  722 + ipoib_neigh_put(neigh);
740 723 return;
741 724 }
742 725  
743 726 unlock:
744   - if (n)
745   - neigh_release(n);
746 727 spin_unlock_irqrestore(&priv->lock, flags);
747 728 }
748 729