Commit 941c8726e4e737e74d418ccec3d8e7b946a65541
Exists in
master
and in
20 other branches
Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband
Pull final RDMA changes from Roland Dreier:

 - Fix IPoIB to stop using unsafe linkage between networking neighbour
   layer and private path database.

 - Small fixes for bugs found by Fengguang Wu's automated builds.

* tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband:
  IPoIB: Use a private hash table for path lookup in xmit path
  IB/qib: Fix size of cc_supported_table_entries
  RDMA/ucma: Convert open-coded equivalent to memdup_user()
  RDMA/ocrdma: Fix check of GSI CQs
  RDMA/cma: Use PTR_RET rather than if (IS_ERR(...)) + PTR_ERR
Showing 8 changed files Side-by-side Diff
- drivers/infiniband/core/cma.c
- drivers/infiniband/core/ucma.c
- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
- drivers/infiniband/hw/qib/qib.h
- drivers/infiniband/ulp/ipoib/ipoib.h
- drivers/infiniband/ulp/ipoib/ipoib_cm.c
- drivers/infiniband/ulp/ipoib/ipoib_main.c
- drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/infiniband/core/cma.c
... | ... | @@ -3064,10 +3064,7 @@ |
3064 | 3064 | id_priv->id.port_num, &rec, |
3065 | 3065 | comp_mask, GFP_KERNEL, |
3066 | 3066 | cma_ib_mc_handler, mc); |
3067 | - if (IS_ERR(mc->multicast.ib)) | |
3068 | - return PTR_ERR(mc->multicast.ib); | |
3069 | - | |
3070 | - return 0; | |
3067 | + return PTR_RET(mc->multicast.ib); | |
3071 | 3068 | } |
3072 | 3069 | |
3073 | 3070 | static void iboe_mcast_work_handler(struct work_struct *work) |
drivers/infiniband/core/ucma.c
... | ... | @@ -1002,23 +1002,18 @@ |
1002 | 1002 | if (IS_ERR(ctx)) |
1003 | 1003 | return PTR_ERR(ctx); |
1004 | 1004 | |
1005 | - optval = kmalloc(cmd.optlen, GFP_KERNEL); | |
1006 | - if (!optval) { | |
1007 | - ret = -ENOMEM; | |
1008 | - goto out1; | |
1005 | + optval = memdup_user((void __user *) (unsigned long) cmd.optval, | |
1006 | + cmd.optlen); | |
1007 | + if (IS_ERR(optval)) { | |
1008 | + ret = PTR_ERR(optval); | |
1009 | + goto out; | |
1009 | 1010 | } |
1010 | 1011 | |
1011 | - if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval, | |
1012 | - cmd.optlen)) { | |
1013 | - ret = -EFAULT; | |
1014 | - goto out2; | |
1015 | - } | |
1016 | - | |
1017 | 1012 | ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, |
1018 | 1013 | cmd.optlen); |
1019 | -out2: | |
1020 | 1014 | kfree(optval); |
1021 | -out1: | |
1015 | + | |
1016 | +out: | |
1022 | 1017 | ucma_put_ctx(ctx); |
1023 | 1018 | return ret; |
1024 | 1019 | } |
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
... | ... | @@ -893,7 +893,9 @@ |
893 | 893 | /* verify consumer QPs are not trying to use GSI QP's CQ */ |
894 | 894 | if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) { |
895 | 895 | if ((dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq)) || |
896 | - (dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq))) { | |
896 | + (dev->gsi_sqcq == get_ocrdma_cq(attrs->recv_cq)) || | |
897 | + (dev->gsi_rqcq == get_ocrdma_cq(attrs->send_cq)) || | |
898 | + (dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) { | |
897 | 899 | ocrdma_err("%s(%d) Consumer QP cannot use GSI CQs.\n", |
898 | 900 | __func__, dev->id); |
899 | 901 | return -EINVAL; |
drivers/infiniband/hw/qib/qib.h
... | ... | @@ -656,6 +656,11 @@ |
656 | 656 | /* 16 congestion entries with each entry corresponding to a SL */ |
657 | 657 | struct ib_cc_congestion_entry_shadow *congestion_entries; |
658 | 658 | |
659 | + /* Maximum number of congestion control entries that the agent expects | |
660 | + * the manager to send. | |
661 | + */ | |
662 | + u16 cc_supported_table_entries; | |
663 | + | |
659 | 664 | /* Total number of congestion control table entries */ |
660 | 665 | u16 total_cct_entry; |
661 | 666 | |
... | ... | @@ -667,11 +672,6 @@ |
667 | 672 | |
668 | 673 | /* CA's max number of 64 entry units in the congestion control table */ |
669 | 674 | u8 cc_max_table_entries; |
670 | - | |
671 | - /* Maximum number of congestion control entries that the agent expects | |
672 | - * the manager to send. | |
673 | - */ | |
674 | - u8 cc_supported_table_entries; | |
675 | 675 | }; |
676 | 676 | |
677 | 677 | /* Observers. Not to be taken lightly, possibly not to ship. */ |
drivers/infiniband/ulp/ipoib/ipoib.h
... | ... | @@ -92,6 +92,8 @@ |
92 | 92 | IPOIB_STOP_REAPER = 7, |
93 | 93 | IPOIB_FLAG_ADMIN_CM = 9, |
94 | 94 | IPOIB_FLAG_UMCAST = 10, |
95 | + IPOIB_STOP_NEIGH_GC = 11, | |
96 | + IPOIB_NEIGH_TBL_FLUSH = 12, | |
95 | 97 | |
96 | 98 | IPOIB_MAX_BACKOFF_SECONDS = 16, |
97 | 99 | |
... | ... | @@ -260,6 +262,20 @@ |
260 | 262 | u16 max_coalesced_frames; |
261 | 263 | }; |
262 | 264 | |
265 | +struct ipoib_neigh_hash { | |
266 | + struct ipoib_neigh __rcu **buckets; | |
267 | + struct rcu_head rcu; | |
268 | + u32 mask; | |
269 | + u32 size; | |
270 | +}; | |
271 | + | |
272 | +struct ipoib_neigh_table { | |
273 | + struct ipoib_neigh_hash __rcu *htbl; | |
274 | + rwlock_t rwlock; | |
275 | + atomic_t entries; | |
276 | + struct completion flushed; | |
277 | +}; | |
278 | + | |
263 | 279 | /* |
264 | 280 | * Device private locking: network stack tx_lock protects members used |
265 | 281 | * in TX fast path, lock protects everything else. lock nests inside |
... | ... | @@ -279,6 +295,8 @@ |
279 | 295 | struct rb_root path_tree; |
280 | 296 | struct list_head path_list; |
281 | 297 | |
298 | + struct ipoib_neigh_table ntbl; | |
299 | + | |
282 | 300 | struct ipoib_mcast *broadcast; |
283 | 301 | struct list_head multicast_list; |
284 | 302 | struct rb_root multicast_tree; |
... | ... | @@ -291,7 +309,7 @@ |
291 | 309 | struct work_struct flush_heavy; |
292 | 310 | struct work_struct restart_task; |
293 | 311 | struct delayed_work ah_reap_task; |
294 | - | |
312 | + struct delayed_work neigh_reap_task; | |
295 | 313 | struct ib_device *ca; |
296 | 314 | u8 port; |
297 | 315 | u16 pkey; |
298 | 316 | |
299 | 317 | |
... | ... | @@ -377,13 +395,16 @@ |
377 | 395 | #ifdef CONFIG_INFINIBAND_IPOIB_CM |
378 | 396 | struct ipoib_cm_tx *cm; |
379 | 397 | #endif |
380 | - union ib_gid dgid; | |
398 | + u8 daddr[INFINIBAND_ALEN]; | |
381 | 399 | struct sk_buff_head queue; |
382 | 400 | |
383 | - struct neighbour *neighbour; | |
384 | 401 | struct net_device *dev; |
385 | 402 | |
386 | 403 | struct list_head list; |
404 | + struct ipoib_neigh __rcu *hnext; | |
405 | + struct rcu_head rcu; | |
406 | + atomic_t refcnt; | |
407 | + unsigned long alive; | |
387 | 408 | }; |
388 | 409 | |
389 | 410 | #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) |
390 | 411 | |
391 | 412 | |
392 | 413 | |
... | ... | @@ -394,21 +415,17 @@ |
394 | 415 | return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; |
395 | 416 | } |
396 | 417 | |
397 | -/* | |
398 | - * We stash a pointer to our private neighbour information after our | |
399 | - * hardware address in neigh->ha. The ALIGN() expression here makes | |
400 | - * sure that this pointer is stored aligned so that an unaligned | |
401 | - * load is not needed to dereference it. | |
402 | - */ | |
403 | -static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh) | |
418 | +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); | |
419 | +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) | |
404 | 420 | { |
405 | - return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) + | |
406 | - INFINIBAND_ALEN, sizeof(void *)); | |
421 | + if (atomic_dec_and_test(&neigh->refcnt)) | |
422 | + ipoib_neigh_dtor(neigh); | |
407 | 423 | } |
408 | - | |
409 | -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh, | |
424 | +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); | |
425 | +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, | |
410 | 426 | struct net_device *dev); |
411 | -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); | |
427 | +void ipoib_neigh_free(struct ipoib_neigh *neigh); | |
428 | +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid); | |
412 | 429 | |
413 | 430 | extern struct workqueue_struct *ipoib_workqueue; |
414 | 431 | |
... | ... | @@ -425,7 +442,6 @@ |
425 | 442 | { |
426 | 443 | kref_put(&ah->ref, ipoib_free_ah); |
427 | 444 | } |
428 | - | |
429 | 445 | int ipoib_open(struct net_device *dev); |
430 | 446 | int ipoib_add_pkey_attr(struct net_device *dev); |
431 | 447 | int ipoib_add_umcast_attr(struct net_device *dev); |
... | ... | @@ -455,7 +471,7 @@ |
455 | 471 | |
456 | 472 | void ipoib_mcast_join_task(struct work_struct *work); |
457 | 473 | void ipoib_mcast_carrier_on_task(struct work_struct *work); |
458 | -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); | |
474 | +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); | |
459 | 475 | |
460 | 476 | void ipoib_mcast_restart_task(struct work_struct *work); |
461 | 477 | int ipoib_mcast_start_thread(struct net_device *dev); |
462 | 478 | |
... | ... | @@ -517,10 +533,10 @@ |
517 | 533 | test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); |
518 | 534 | } |
519 | 535 | |
520 | -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) | |
536 | +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) | |
521 | 537 | { |
522 | 538 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
523 | - return IPOIB_CM_SUPPORTED(n->ha) && | |
539 | + return IPOIB_CM_SUPPORTED(hwaddr) && | |
524 | 540 | test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); |
525 | 541 | } |
526 | 542 | |
... | ... | @@ -575,7 +591,7 @@ |
575 | 591 | { |
576 | 592 | return 0; |
577 | 593 | } |
578 | -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) | |
594 | +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) | |
579 | 595 | |
580 | 596 | { |
581 | 597 | return 0; |
drivers/infiniband/ulp/ipoib/ipoib_cm.c
... | ... | @@ -811,9 +811,7 @@ |
811 | 811 | if (neigh) { |
812 | 812 | neigh->cm = NULL; |
813 | 813 | list_del(&neigh->list); |
814 | - if (neigh->ah) | |
815 | - ipoib_put_ah(neigh->ah); | |
816 | - ipoib_neigh_free(dev, neigh); | |
814 | + ipoib_neigh_free(neigh); | |
817 | 815 | |
818 | 816 | tx->neigh = NULL; |
819 | 817 | } |
... | ... | @@ -1230,9 +1228,7 @@ |
1230 | 1228 | if (neigh) { |
1231 | 1229 | neigh->cm = NULL; |
1232 | 1230 | list_del(&neigh->list); |
1233 | - if (neigh->ah) | |
1234 | - ipoib_put_ah(neigh->ah); | |
1235 | - ipoib_neigh_free(dev, neigh); | |
1231 | + ipoib_neigh_free(neigh); | |
1236 | 1232 | |
1237 | 1233 | tx->neigh = NULL; |
1238 | 1234 | } |
... | ... | @@ -1279,7 +1275,7 @@ |
1279 | 1275 | list_move(&tx->list, &priv->cm.reap_list); |
1280 | 1276 | queue_work(ipoib_workqueue, &priv->cm.reap_task); |
1281 | 1277 | ipoib_dbg(priv, "Reap connection for gid %pI6\n", |
1282 | - tx->neigh->dgid.raw); | |
1278 | + tx->neigh->daddr + 4); | |
1283 | 1279 | tx->neigh = NULL; |
1284 | 1280 | } |
1285 | 1281 | } |
... | ... | @@ -1304,7 +1300,7 @@ |
1304 | 1300 | p = list_entry(priv->cm.start_list.next, typeof(*p), list); |
1305 | 1301 | list_del_init(&p->list); |
1306 | 1302 | neigh = p->neigh; |
1307 | - qpn = IPOIB_QPN(neigh->neighbour->ha); | |
1303 | + qpn = IPOIB_QPN(neigh->daddr); | |
1308 | 1304 | memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); |
1309 | 1305 | |
1310 | 1306 | spin_unlock_irqrestore(&priv->lock, flags); |
... | ... | @@ -1320,9 +1316,7 @@ |
1320 | 1316 | if (neigh) { |
1321 | 1317 | neigh->cm = NULL; |
1322 | 1318 | list_del(&neigh->list); |
1323 | - if (neigh->ah) | |
1324 | - ipoib_put_ah(neigh->ah); | |
1325 | - ipoib_neigh_free(dev, neigh); | |
1319 | + ipoib_neigh_free(neigh); | |
1326 | 1320 | } |
1327 | 1321 | list_del(&p->list); |
1328 | 1322 | kfree(p); |
drivers/infiniband/ulp/ipoib/ipoib_main.c
... | ... | @@ -46,7 +46,8 @@ |
46 | 46 | #include <linux/ip.h> |
47 | 47 | #include <linux/in.h> |
48 | 48 | |
49 | -#include <net/dst.h> | |
49 | +#include <linux/jhash.h> | |
50 | +#include <net/arp.h> | |
50 | 51 | |
51 | 52 | MODULE_AUTHOR("Roland Dreier"); |
52 | 53 | MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); |
... | ... | @@ -84,6 +85,7 @@ |
84 | 85 | |
85 | 86 | static void ipoib_add_one(struct ib_device *device); |
86 | 87 | static void ipoib_remove_one(struct ib_device *device); |
88 | +static void ipoib_neigh_reclaim(struct rcu_head *rp); | |
87 | 89 | |
88 | 90 | static struct ib_client ipoib_client = { |
89 | 91 | .name = "ipoib", |
90 | 92 | |
91 | 93 | |
92 | 94 | |
93 | 95 | |
... | ... | @@ -264,31 +266,16 @@ |
264 | 266 | |
265 | 267 | static void path_free(struct net_device *dev, struct ipoib_path *path) |
266 | 268 | { |
267 | - struct ipoib_dev_priv *priv = netdev_priv(dev); | |
268 | - struct ipoib_neigh *neigh, *tn; | |
269 | 269 | struct sk_buff *skb; |
270 | - unsigned long flags; | |
271 | 270 | |
272 | 271 | while ((skb = __skb_dequeue(&path->queue))) |
273 | 272 | dev_kfree_skb_irq(skb); |
274 | 273 | |
275 | - spin_lock_irqsave(&priv->lock, flags); | |
274 | + ipoib_dbg(netdev_priv(dev), "path_free\n"); | |
276 | 275 | |
277 | - list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { | |
278 | - /* | |
279 | - * It's safe to call ipoib_put_ah() inside priv->lock | |
280 | - * here, because we know that path->ah will always | |
281 | - * hold one more reference, so ipoib_put_ah() will | |
282 | - * never do more than decrement the ref count. | |
283 | - */ | |
284 | - if (neigh->ah) | |
285 | - ipoib_put_ah(neigh->ah); | |
276 | + /* remove all neigh connected to this path */ | |
277 | + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); | |
286 | 278 | |
287 | - ipoib_neigh_free(dev, neigh); | |
288 | - } | |
289 | - | |
290 | - spin_unlock_irqrestore(&priv->lock, flags); | |
291 | - | |
292 | 279 | if (path->ah) |
293 | 280 | ipoib_put_ah(path->ah); |
294 | 281 | |
295 | 282 | |
296 | 283 | |
... | ... | @@ -458,19 +445,15 @@ |
458 | 445 | } |
459 | 446 | kref_get(&path->ah->ref); |
460 | 447 | neigh->ah = path->ah; |
461 | - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, | |
462 | - sizeof(union ib_gid)); | |
463 | 448 | |
464 | - if (ipoib_cm_enabled(dev, neigh->neighbour)) { | |
449 | + if (ipoib_cm_enabled(dev, neigh->daddr)) { | |
465 | 450 | if (!ipoib_cm_get(neigh)) |
466 | 451 | ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, |
467 | 452 | path, |
468 | 453 | neigh)); |
469 | 454 | if (!ipoib_cm_get(neigh)) { |
470 | 455 | list_del(&neigh->list); |
471 | - if (neigh->ah) | |
472 | - ipoib_put_ah(neigh->ah); | |
473 | - ipoib_neigh_free(dev, neigh); | |
456 | + ipoib_neigh_free(neigh); | |
474 | 457 | continue; |
475 | 458 | } |
476 | 459 | } |
477 | 460 | |
... | ... | @@ -555,15 +538,15 @@ |
555 | 538 | return 0; |
556 | 539 | } |
557 | 540 | |
558 | -/* called with rcu_read_lock */ | |
559 | -static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_device *dev) | |
541 | +static void neigh_add_path(struct sk_buff *skb, u8 *daddr, | |
542 | + struct net_device *dev) | |
560 | 543 | { |
561 | 544 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
562 | 545 | struct ipoib_path *path; |
563 | 546 | struct ipoib_neigh *neigh; |
564 | 547 | unsigned long flags; |
565 | 548 | |
566 | - neigh = ipoib_neigh_alloc(n, skb->dev); | |
549 | + neigh = ipoib_neigh_alloc(daddr, dev); | |
567 | 550 | if (!neigh) { |
568 | 551 | ++dev->stats.tx_dropped; |
569 | 552 | dev_kfree_skb_any(skb); |
570 | 553 | |
... | ... | @@ -572,9 +555,9 @@ |
572 | 555 | |
573 | 556 | spin_lock_irqsave(&priv->lock, flags); |
574 | 557 | |
575 | - path = __path_find(dev, n->ha + 4); | |
558 | + path = __path_find(dev, daddr + 4); | |
576 | 559 | if (!path) { |
577 | - path = path_rec_create(dev, n->ha + 4); | |
560 | + path = path_rec_create(dev, daddr + 4); | |
578 | 561 | if (!path) |
579 | 562 | goto err_path; |
580 | 563 | |
581 | 564 | |
582 | 565 | |
... | ... | @@ -586,17 +569,13 @@ |
586 | 569 | if (path->ah) { |
587 | 570 | kref_get(&path->ah->ref); |
588 | 571 | neigh->ah = path->ah; |
589 | - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, | |
590 | - sizeof(union ib_gid)); | |
591 | 572 | |
592 | - if (ipoib_cm_enabled(dev, neigh->neighbour)) { | |
573 | + if (ipoib_cm_enabled(dev, neigh->daddr)) { | |
593 | 574 | if (!ipoib_cm_get(neigh)) |
594 | 575 | ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); |
595 | 576 | if (!ipoib_cm_get(neigh)) { |
596 | 577 | list_del(&neigh->list); |
597 | - if (neigh->ah) | |
598 | - ipoib_put_ah(neigh->ah); | |
599 | - ipoib_neigh_free(dev, neigh); | |
578 | + ipoib_neigh_free(neigh); | |
600 | 579 | goto err_drop; |
601 | 580 | } |
602 | 581 | if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) |
... | ... | @@ -608,7 +587,8 @@ |
608 | 587 | } |
609 | 588 | } else { |
610 | 589 | spin_unlock_irqrestore(&priv->lock, flags); |
611 | - ipoib_send(dev, skb, path->ah, IPOIB_QPN(n->ha)); | |
590 | + ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr)); | |
591 | + ipoib_neigh_put(neigh); | |
612 | 592 | return; |
613 | 593 | } |
614 | 594 | } else { |
615 | 595 | |
616 | 596 | |
617 | 597 | |
... | ... | @@ -621,37 +601,22 @@ |
621 | 601 | } |
622 | 602 | |
623 | 603 | spin_unlock_irqrestore(&priv->lock, flags); |
604 | + ipoib_neigh_put(neigh); | |
624 | 605 | return; |
625 | 606 | |
626 | 607 | err_list: |
627 | 608 | list_del(&neigh->list); |
628 | 609 | |
629 | 610 | err_path: |
630 | - ipoib_neigh_free(dev, neigh); | |
611 | + ipoib_neigh_free(neigh); | |
631 | 612 | err_drop: |
632 | 613 | ++dev->stats.tx_dropped; |
633 | 614 | dev_kfree_skb_any(skb); |
634 | 615 | |
635 | 616 | spin_unlock_irqrestore(&priv->lock, flags); |
617 | + ipoib_neigh_put(neigh); | |
636 | 618 | } |
637 | 619 | |
638 | -/* called with rcu_read_lock */ | |
639 | -static void ipoib_path_lookup(struct sk_buff *skb, struct neighbour *n, struct net_device *dev) | |
640 | -{ | |
641 | - struct ipoib_dev_priv *priv = netdev_priv(skb->dev); | |
642 | - | |
643 | - /* Look up path record for unicasts */ | |
644 | - if (n->ha[4] != 0xff) { | |
645 | - neigh_add_path(skb, n, dev); | |
646 | - return; | |
647 | - } | |
648 | - | |
649 | - /* Add in the P_Key for multicasts */ | |
650 | - n->ha[8] = (priv->pkey >> 8) & 0xff; | |
651 | - n->ha[9] = priv->pkey & 0xff; | |
652 | - ipoib_mcast_send(dev, n->ha + 4, skb); | |
653 | -} | |
654 | - | |
655 | 620 | static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, |
656 | 621 | struct ipoib_cb *cb) |
657 | 622 | { |
658 | 623 | |
659 | 624 | |
660 | 625 | |
661 | 626 | |
662 | 627 | |
663 | 628 | |
664 | 629 | |
665 | 630 | |
666 | 631 | |
667 | 632 | |
668 | 633 | |
669 | 634 | |
... | ... | @@ -710,96 +675,80 @@ |
710 | 675 | { |
711 | 676 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
712 | 677 | struct ipoib_neigh *neigh; |
713 | - struct neighbour *n = NULL; | |
678 | + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; | |
679 | + struct ipoib_header *header; | |
714 | 680 | unsigned long flags; |
715 | 681 | |
716 | - rcu_read_lock(); | |
717 | - if (likely(skb_dst(skb))) { | |
718 | - n = dst_neigh_lookup_skb(skb_dst(skb), skb); | |
719 | - if (!n) { | |
682 | + header = (struct ipoib_header *) skb->data; | |
683 | + | |
684 | + if (unlikely(cb->hwaddr[4] == 0xff)) { | |
685 | + /* multicast, arrange "if" according to probability */ | |
686 | + if ((header->proto != htons(ETH_P_IP)) && | |
687 | + (header->proto != htons(ETH_P_IPV6)) && | |
688 | + (header->proto != htons(ETH_P_ARP)) && | |
689 | + (header->proto != htons(ETH_P_RARP))) { | |
690 | + /* ethertype not supported by IPoIB */ | |
720 | 691 | ++dev->stats.tx_dropped; |
721 | 692 | dev_kfree_skb_any(skb); |
722 | - goto unlock; | |
693 | + return NETDEV_TX_OK; | |
723 | 694 | } |
695 | + /* Add in the P_Key for multicast*/ | |
696 | + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; | |
697 | + cb->hwaddr[9] = priv->pkey & 0xff; | |
698 | + | |
699 | + neigh = ipoib_neigh_get(dev, cb->hwaddr); | |
700 | + if (likely(neigh)) | |
701 | + goto send_using_neigh; | |
702 | + ipoib_mcast_send(dev, cb->hwaddr, skb); | |
703 | + return NETDEV_TX_OK; | |
724 | 704 | } |
725 | - if (likely(n)) { | |
726 | - if (unlikely(!*to_ipoib_neigh(n))) { | |
727 | - ipoib_path_lookup(skb, n, dev); | |
728 | - goto unlock; | |
729 | - } | |
730 | 705 | |
731 | - neigh = *to_ipoib_neigh(n); | |
732 | - | |
733 | - if (unlikely((memcmp(&neigh->dgid.raw, | |
734 | - n->ha + 4, | |
735 | - sizeof(union ib_gid))) || | |
736 | - (neigh->dev != dev))) { | |
737 | - spin_lock_irqsave(&priv->lock, flags); | |
738 | - /* | |
739 | - * It's safe to call ipoib_put_ah() inside | |
740 | - * priv->lock here, because we know that | |
741 | - * path->ah will always hold one more reference, | |
742 | - * so ipoib_put_ah() will never do more than | |
743 | - * decrement the ref count. | |
744 | - */ | |
745 | - if (neigh->ah) | |
746 | - ipoib_put_ah(neigh->ah); | |
747 | - list_del(&neigh->list); | |
748 | - ipoib_neigh_free(dev, neigh); | |
749 | - spin_unlock_irqrestore(&priv->lock, flags); | |
750 | - ipoib_path_lookup(skb, n, dev); | |
751 | - goto unlock; | |
706 | + /* unicast, arrange "switch" according to probability */ | |
707 | + switch (header->proto) { | |
708 | + case htons(ETH_P_IP): | |
709 | + case htons(ETH_P_IPV6): | |
710 | + neigh = ipoib_neigh_get(dev, cb->hwaddr); | |
711 | + if (unlikely(!neigh)) { | |
712 | + neigh_add_path(skb, cb->hwaddr, dev); | |
713 | + return NETDEV_TX_OK; | |
752 | 714 | } |
715 | + break; | |
716 | + case htons(ETH_P_ARP): | |
717 | + case htons(ETH_P_RARP): | |
718 | + /* for unicast ARP and RARP should always perform path find */ | |
719 | + unicast_arp_send(skb, dev, cb); | |
720 | + return NETDEV_TX_OK; | |
721 | + default: | |
722 | + /* ethertype not supported by IPoIB */ | |
723 | + ++dev->stats.tx_dropped; | |
724 | + dev_kfree_skb_any(skb); | |
725 | + return NETDEV_TX_OK; | |
726 | + } | |
753 | 727 | |
754 | - if (ipoib_cm_get(neigh)) { | |
755 | - if (ipoib_cm_up(neigh)) { | |
756 | - ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); | |
757 | - goto unlock; | |
758 | - } | |
759 | - } else if (neigh->ah) { | |
760 | - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(n->ha)); | |
761 | - goto unlock; | |
728 | +send_using_neigh: | |
729 | + /* note we now hold a ref to neigh */ | |
730 | + if (ipoib_cm_get(neigh)) { | |
731 | + if (ipoib_cm_up(neigh)) { | |
732 | + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); | |
733 | + goto unref; | |
762 | 734 | } |
735 | + } else if (neigh->ah) { | |
736 | + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); | |
737 | + goto unref; | |
738 | + } | |
763 | 739 | |
764 | - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { | |
765 | - spin_lock_irqsave(&priv->lock, flags); | |
766 | - __skb_queue_tail(&neigh->queue, skb); | |
767 | - spin_unlock_irqrestore(&priv->lock, flags); | |
768 | - } else { | |
769 | - ++dev->stats.tx_dropped; | |
770 | - dev_kfree_skb_any(skb); | |
771 | - } | |
740 | + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { | |
741 | + spin_lock_irqsave(&priv->lock, flags); | |
742 | + __skb_queue_tail(&neigh->queue, skb); | |
743 | + spin_unlock_irqrestore(&priv->lock, flags); | |
772 | 744 | } else { |
773 | - struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; | |
745 | + ++dev->stats.tx_dropped; | |
746 | + dev_kfree_skb_any(skb); | |
747 | + } | |
774 | 748 | |
775 | - if (cb->hwaddr[4] == 0xff) { | |
776 | - /* Add in the P_Key for multicast*/ | |
777 | - cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; | |
778 | - cb->hwaddr[9] = priv->pkey & 0xff; | |
749 | +unref: | |
750 | + ipoib_neigh_put(neigh); | |
779 | 751 | |
780 | - ipoib_mcast_send(dev, cb->hwaddr + 4, skb); | |
781 | - } else { | |
782 | - /* unicast GID -- should be ARP or RARP reply */ | |
783 | - | |
784 | - if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && | |
785 | - (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { | |
786 | - ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n", | |
787 | - skb_dst(skb) ? "neigh" : "dst", | |
788 | - be16_to_cpup((__be16 *) skb->data), | |
789 | - IPOIB_QPN(cb->hwaddr), | |
790 | - cb->hwaddr + 4); | |
791 | - dev_kfree_skb_any(skb); | |
792 | - ++dev->stats.tx_dropped; | |
793 | - goto unlock; | |
794 | - } | |
795 | - | |
796 | - unicast_arp_send(skb, dev, cb); | |
797 | - } | |
798 | - } | |
799 | -unlock: | |
800 | - if (n) | |
801 | - neigh_release(n); | |
802 | - rcu_read_unlock(); | |
803 | 752 | return NETDEV_TX_OK; |
804 | 753 | } |
805 | 754 | |
... | ... | @@ -821,6 +770,7 @@ |
821 | 770 | const void *daddr, const void *saddr, unsigned len) |
822 | 771 | { |
823 | 772 | struct ipoib_header *header; |
773 | + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; | |
824 | 774 | |
825 | 775 | header = (struct ipoib_header *) skb_push(skb, sizeof *header); |
826 | 776 | |
827 | 777 | |
... | ... | @@ -828,14 +778,11 @@ |
828 | 778 | header->reserved = 0; |
829 | 779 | |
830 | 780 | /* |
831 | - * If we don't have a dst_entry structure, stuff the | |
781 | + * we don't rely on dst_entry structure, always stuff the | |
832 | 782 | * destination address into skb->cb so we can figure out where |
833 | 783 | * to send the packet later. |
834 | 784 | */ |
835 | - if (!skb_dst(skb)) { | |
836 | - struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; | |
837 | - memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); | |
838 | - } | |
785 | + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); | |
839 | 786 | |
840 | 787 | return 0; |
841 | 788 | } |
842 | 789 | |
843 | 790 | |
844 | 791 | |
845 | 792 | |
846 | 793 | |
847 | 794 | |
848 | 795 | |
849 | 796 | |
850 | 797 | |
851 | 798 | |
852 | 799 | |
853 | 800 | |
854 | 801 | |
855 | 802 | |
856 | 803 | |
857 | 804 | |
858 | 805 | |
859 | 806 | |
860 | 807 | |
861 | 808 | |
862 | 809 | |
863 | 810 | |
864 | 811 | |
865 | 812 | |
866 | 813 | |
... | ... | @@ -852,86 +799,438 @@ |
852 | 799 | queue_work(ipoib_workqueue, &priv->restart_task); |
853 | 800 | } |
854 | 801 | |
855 | -static void ipoib_neigh_cleanup(struct neighbour *n) | |
802 | +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) | |
856 | 803 | { |
857 | - struct ipoib_neigh *neigh; | |
858 | - struct ipoib_dev_priv *priv = netdev_priv(n->dev); | |
804 | + /* | |
805 | + * Use only the address parts that contributes to spreading | |
806 | + * The subnet prefix is not used as one can not connect to | |
807 | + * same remote port (GUID) using the same remote QPN via two | |
808 | + * different subnets. | |
809 | + */ | |
810 | + /* qpn octets[1:4) & port GUID octets[12:20) */ | |
811 | + u32 *daddr_32 = (u32 *) daddr; | |
812 | + u32 hv; | |
813 | + | |
814 | + hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0); | |
815 | + return hv & htbl->mask; | |
816 | +} | |
817 | + | |
818 | +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) | |
819 | +{ | |
820 | + struct ipoib_dev_priv *priv = netdev_priv(dev); | |
821 | + struct ipoib_neigh_table *ntbl = &priv->ntbl; | |
822 | + struct ipoib_neigh_hash *htbl; | |
823 | + struct ipoib_neigh *neigh = NULL; | |
824 | + u32 hash_val; | |
825 | + | |
826 | + rcu_read_lock_bh(); | |
827 | + | |
828 | + htbl = rcu_dereference_bh(ntbl->htbl); | |
829 | + | |
830 | + if (!htbl) | |
831 | + goto out_unlock; | |
832 | + | |
833 | + hash_val = ipoib_addr_hash(htbl, daddr); | |
834 | + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); | |
835 | + neigh != NULL; | |
836 | + neigh = rcu_dereference_bh(neigh->hnext)) { | |
837 | + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { | |
838 | + /* found, take one ref on behalf of the caller */ | |
839 | + if (!atomic_inc_not_zero(&neigh->refcnt)) { | |
840 | + /* deleted */ | |
841 | + neigh = NULL; | |
842 | + goto out_unlock; | |
843 | + } | |
844 | + neigh->alive = jiffies; | |
845 | + goto out_unlock; | |
846 | + } | |
847 | + } | |
848 | + | |
849 | +out_unlock: | |
850 | + rcu_read_unlock_bh(); | |
851 | + return neigh; | |
852 | +} | |
853 | + | |
854 | +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) | |
855 | +{ | |
856 | + struct ipoib_neigh_table *ntbl = &priv->ntbl; | |
857 | + struct ipoib_neigh_hash *htbl; | |
858 | + unsigned long neigh_obsolete; | |
859 | + unsigned long dt; | |
859 | 860 | unsigned long flags; |
860 | - struct ipoib_ah *ah = NULL; | |
861 | + int i; | |
861 | 862 | |
862 | - neigh = *to_ipoib_neigh(n); | |
863 | - if (neigh) | |
864 | - priv = netdev_priv(neigh->dev); | |
865 | - else | |
863 | + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) | |
866 | 864 | return; |
867 | - ipoib_dbg(priv, | |
868 | - "neigh_cleanup for %06x %pI6\n", | |
869 | - IPOIB_QPN(n->ha), | |
870 | - n->ha + 4); | |
871 | 865 | |
872 | - spin_lock_irqsave(&priv->lock, flags); | |
866 | + write_lock_bh(&ntbl->rwlock); | |
873 | 867 | |
874 | - if (neigh->ah) | |
875 | - ah = neigh->ah; | |
876 | - list_del(&neigh->list); | |
877 | - ipoib_neigh_free(n->dev, neigh); | |
868 | + htbl = rcu_dereference_protected(ntbl->htbl, | |
869 | + lockdep_is_held(&ntbl->rwlock)); | |
878 | 870 | |
879 | - spin_unlock_irqrestore(&priv->lock, flags); | |
871 | + if (!htbl) | |
872 | + goto out_unlock; | |
880 | 873 | |
881 | - if (ah) | |
882 | - ipoib_put_ah(ah); | |
874 | + /* neigh is obsolete if it was idle for two GC periods */ | |
875 | + dt = 2 * arp_tbl.gc_interval; | |
876 | + neigh_obsolete = jiffies - dt; | |
877 | + /* handle possible race condition */ | |
878 | + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) | |
879 | + goto out_unlock; | |
880 | + | |
881 | + for (i = 0; i < htbl->size; i++) { | |
882 | + struct ipoib_neigh *neigh; | |
883 | + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; | |
884 | + | |
885 | + while ((neigh = rcu_dereference_protected(*np, | |
886 | + lockdep_is_held(&ntbl->rwlock))) != NULL) { | |
887 | + /* was the neigh idle for two GC periods */ | |
888 | + if (time_after(neigh_obsolete, neigh->alive)) { | |
889 | + rcu_assign_pointer(*np, | |
890 | + rcu_dereference_protected(neigh->hnext, | |
891 | + lockdep_is_held(&ntbl->rwlock))); | |
892 | + /* remove from path/mc list */ | |
893 | + spin_lock_irqsave(&priv->lock, flags); | |
894 | + list_del(&neigh->list); | |
895 | + spin_unlock_irqrestore(&priv->lock, flags); | |
896 | + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); | |
897 | + } else { | |
898 | + np = &neigh->hnext; | |
899 | + } | |
900 | + | |
901 | + } | |
902 | + } | |
903 | + | |
904 | +out_unlock: | |
905 | + write_unlock_bh(&ntbl->rwlock); | |
883 | 906 | } |
884 | 907 | |
885 | -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, | |
908 | +static void ipoib_reap_neigh(struct work_struct *work) | |
909 | +{ | |
910 | + struct ipoib_dev_priv *priv = | |
911 | + container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); | |
912 | + | |
913 | + __ipoib_reap_neigh(priv); | |
914 | + | |
915 | + if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) | |
916 | + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, | |
917 | + arp_tbl.gc_interval); | |
918 | +} | |
919 | + | |
920 | + | |
921 | +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, | |
886 | 922 | struct net_device *dev) |
887 | 923 | { |
888 | 924 | struct ipoib_neigh *neigh; |
889 | 925 | |
890 | - neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); | |
926 | + neigh = kzalloc(sizeof *neigh, GFP_ATOMIC); | |
891 | 927 | if (!neigh) |
892 | 928 | return NULL; |
893 | 929 | |
894 | - neigh->neighbour = neighbour; | |
895 | 930 | neigh->dev = dev; |
896 | - memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); | |
897 | - *to_ipoib_neigh(neighbour) = neigh; | |
931 | + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); | |
898 | 932 | skb_queue_head_init(&neigh->queue); |
933 | + INIT_LIST_HEAD(&neigh->list); | |
899 | 934 | ipoib_cm_set(neigh, NULL); |
935 | + /* one ref on behalf of the caller */ | |
936 | + atomic_set(&neigh->refcnt, 1); | |
900 | 937 | |
901 | 938 | return neigh; |
902 | 939 | } |
903 | 940 | |
904 | -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) | |
941 | +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, | |
942 | + struct net_device *dev) | |
905 | 943 | { |
944 | + struct ipoib_dev_priv *priv = netdev_priv(dev); | |
945 | + struct ipoib_neigh_table *ntbl = &priv->ntbl; | |
946 | + struct ipoib_neigh_hash *htbl; | |
947 | + struct ipoib_neigh *neigh; | |
948 | + u32 hash_val; | |
949 | + | |
950 | + write_lock_bh(&ntbl->rwlock); | |
951 | + | |
952 | + htbl = rcu_dereference_protected(ntbl->htbl, | |
953 | + lockdep_is_held(&ntbl->rwlock)); | |
954 | + if (!htbl) { | |
955 | + neigh = NULL; | |
956 | + goto out_unlock; | |
957 | + } | |
958 | + | |
959 | + /* need to add a new neigh, but maybe some other thread succeeded? | |
960 | + * recalc hash, maybe hash resize took place so we do a search | |
961 | + */ | |
962 | + hash_val = ipoib_addr_hash(htbl, daddr); | |
963 | + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], | |
964 | + lockdep_is_held(&ntbl->rwlock)); | |
965 | + neigh != NULL; | |
966 | + neigh = rcu_dereference_protected(neigh->hnext, | |
967 | + lockdep_is_held(&ntbl->rwlock))) { | |
968 | + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { | |
969 | + /* found, take one ref on behalf of the caller */ | |
970 | + if (!atomic_inc_not_zero(&neigh->refcnt)) { | |
971 | + /* deleted */ | |
972 | + neigh = NULL; | |
973 | + break; | |
974 | + } | |
975 | + neigh->alive = jiffies; | |
976 | + goto out_unlock; | |
977 | + } | |
978 | + } | |
979 | + | |
980 | + neigh = ipoib_neigh_ctor(daddr, dev); | |
981 | + if (!neigh) | |
982 | + goto out_unlock; | |
983 | + | |
984 | + /* one ref on behalf of the hash table */ | |
985 | + atomic_inc(&neigh->refcnt); | |
986 | + neigh->alive = jiffies; | |
987 | + /* put in hash */ | |
988 | + rcu_assign_pointer(neigh->hnext, | |
989 | + rcu_dereference_protected(htbl->buckets[hash_val], | |
990 | + lockdep_is_held(&ntbl->rwlock))); | |
991 | + rcu_assign_pointer(htbl->buckets[hash_val], neigh); | |
992 | + atomic_inc(&ntbl->entries); | |
993 | + | |
994 | +out_unlock: | |
995 | + write_unlock_bh(&ntbl->rwlock); | |
996 | + | |
997 | + return neigh; | |
998 | +} | |
999 | + | |
1000 | +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) | |
1001 | +{ | |
1002 | + /* neigh reference count was dropped to zero */ | 
1003 | + struct net_device *dev = neigh->dev; | |
1004 | + struct ipoib_dev_priv *priv = netdev_priv(dev); | |
906 | 1005 | struct sk_buff *skb; |
907 | - *to_ipoib_neigh(neigh->neighbour) = NULL; | |
1006 | + if (neigh->ah) | |
1007 | + ipoib_put_ah(neigh->ah); | |
908 | 1008 | while ((skb = __skb_dequeue(&neigh->queue))) { |
909 | 1009 | ++dev->stats.tx_dropped; |
910 | 1010 | dev_kfree_skb_any(skb); |
911 | 1011 | } |
912 | 1012 | if (ipoib_cm_get(neigh)) |
913 | 1013 | ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); |
1014 | + ipoib_dbg(netdev_priv(dev), | |
1015 | + "neigh free for %06x %pI6\n", | |
1016 | + IPOIB_QPN(neigh->daddr), | |
1017 | + neigh->daddr + 4); | |
914 | 1018 | kfree(neigh); |
1019 | + if (atomic_dec_and_test(&priv->ntbl.entries)) { | |
1020 | + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) | |
1021 | + complete(&priv->ntbl.flushed); | |
1022 | + } | |
915 | 1023 | } |
916 | 1024 | |
917 | -static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) | |
1025 | +static void ipoib_neigh_reclaim(struct rcu_head *rp) | |
918 | 1026 | { |
919 | - parms->neigh_cleanup = ipoib_neigh_cleanup; | |
1027 | + /* Called as a result of removal from hash table */ | |
1028 | + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); | |
1029 | + /* note TX context may hold another ref */ | |
1030 | + ipoib_neigh_put(neigh); | |
1031 | +} | |
920 | 1032 | |
1033 | +void ipoib_neigh_free(struct ipoib_neigh *neigh) | |
1034 | +{ | |
1035 | + struct net_device *dev = neigh->dev; | |
1036 | + struct ipoib_dev_priv *priv = netdev_priv(dev); | |
1037 | + struct ipoib_neigh_table *ntbl = &priv->ntbl; | |
1038 | + struct ipoib_neigh_hash *htbl; | |
1039 | + struct ipoib_neigh __rcu **np; | |
1040 | + struct ipoib_neigh *n; | |
1041 | + u32 hash_val; | |
1042 | + | |
1043 | + write_lock_bh(&ntbl->rwlock); | |
1044 | + | |
1045 | + htbl = rcu_dereference_protected(ntbl->htbl, | |
1046 | + lockdep_is_held(&ntbl->rwlock)); | |
1047 | + if (!htbl) | |
1048 | + goto out_unlock; | |
1049 | + | |
1050 | + hash_val = ipoib_addr_hash(htbl, neigh->daddr); | |
1051 | + np = &htbl->buckets[hash_val]; | |
1052 | + for (n = rcu_dereference_protected(*np, | |
1053 | + lockdep_is_held(&ntbl->rwlock)); | |
1054 | + n != NULL; | |
1055 | + n = rcu_dereference_protected(neigh->hnext, | |
1056 | + lockdep_is_held(&ntbl->rwlock))) { | |
1057 | + if (n == neigh) { | |
1058 | + /* found */ | |
1059 | + rcu_assign_pointer(*np, | |
1060 | + rcu_dereference_protected(neigh->hnext, | |
1061 | + lockdep_is_held(&ntbl->rwlock))); | |
1062 | + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); | |
1063 | + goto out_unlock; | |
1064 | + } else { | |
1065 | + np = &n->hnext; | |
1066 | + } | |
1067 | + } | |
1068 | + | |
1069 | +out_unlock: | |
1070 | + write_unlock_bh(&ntbl->rwlock); | |
1071 | + | |
1072 | +} | |
1073 | + | |
1074 | +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) | |
1075 | +{ | |
1076 | + struct ipoib_neigh_table *ntbl = &priv->ntbl; | |
1077 | + struct ipoib_neigh_hash *htbl; | |
1078 | + struct ipoib_neigh **buckets; | |
1079 | + u32 size; | |
1080 | + | |
1081 | + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); | |
1082 | + ntbl->htbl = NULL; | |
1083 | + rwlock_init(&ntbl->rwlock); | |
1084 | + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); | |
1085 | + if (!htbl) | |
1086 | + return -ENOMEM; | |
1087 | + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | |
1088 | + size = roundup_pow_of_two(arp_tbl.gc_thresh3); | |
1089 | + buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL); | |
1090 | + if (!buckets) { | |
1091 | + kfree(htbl); | |
1092 | + return -ENOMEM; | |
1093 | + } | |
1094 | + htbl->size = size; | |
1095 | + htbl->mask = (size - 1); | |
1096 | + htbl->buckets = buckets; | |
1097 | + ntbl->htbl = htbl; | |
1098 | + atomic_set(&ntbl->entries, 0); | |
1099 | + | |
1100 | + /* start garbage collection */ | |
1101 | + clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | |
1102 | + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, | |
1103 | + arp_tbl.gc_interval); | |
1104 | + | |
921 | 1105 | return 0; |
922 | 1106 | } |
923 | 1107 | |
1108 | +static void neigh_hash_free_rcu(struct rcu_head *head) | |
1109 | +{ | |
1110 | + struct ipoib_neigh_hash *htbl = container_of(head, | |
1111 | + struct ipoib_neigh_hash, | |
1112 | + rcu); | |
1113 | + struct ipoib_neigh __rcu **buckets = htbl->buckets; | |
1114 | + | |
1115 | + kfree(buckets); | |
1116 | + kfree(htbl); | |
1117 | +} | |
1118 | + | |
1119 | +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) | |
1120 | +{ | |
1121 | + struct ipoib_dev_priv *priv = netdev_priv(dev); | |
1122 | + struct ipoib_neigh_table *ntbl = &priv->ntbl; | |
1123 | + struct ipoib_neigh_hash *htbl; | |
1124 | + unsigned long flags; | |
1125 | + int i; | |
1126 | + | |
1127 | + /* remove all neigh connected to a given path or mcast */ | |
1128 | + write_lock_bh(&ntbl->rwlock); | |
1129 | + | |
1130 | + htbl = rcu_dereference_protected(ntbl->htbl, | |
1131 | + lockdep_is_held(&ntbl->rwlock)); | |
1132 | + | |
1133 | + if (!htbl) | |
1134 | + goto out_unlock; | |
1135 | + | |
1136 | + for (i = 0; i < htbl->size; i++) { | |
1137 | + struct ipoib_neigh *neigh; | |
1138 | + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; | |
1139 | + | |
1140 | + while ((neigh = rcu_dereference_protected(*np, | |
1141 | + lockdep_is_held(&ntbl->rwlock))) != NULL) { | |
1142 | + /* delete neighs belonging to this parent */ | 
1143 | + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { | |
1144 | + rcu_assign_pointer(*np, | |
1145 | + rcu_dereference_protected(neigh->hnext, | |
1146 | + lockdep_is_held(&ntbl->rwlock))); | |
1147 | + /* remove from parent list */ | |
1148 | + spin_lock_irqsave(&priv->lock, flags); | |
1149 | + list_del(&neigh->list); | |
1150 | + spin_unlock_irqrestore(&priv->lock, flags); | |
1151 | + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); | |
1152 | + } else { | |
1153 | + np = &neigh->hnext; | |
1154 | + } | |
1155 | + | |
1156 | + } | |
1157 | + } | |
1158 | +out_unlock: | |
1159 | + write_unlock_bh(&ntbl->rwlock); | |
1160 | +} | |
1161 | + | |
1162 | +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) | |
1163 | +{ | |
1164 | + struct ipoib_neigh_table *ntbl = &priv->ntbl; | |
1165 | + struct ipoib_neigh_hash *htbl; | |
1166 | + unsigned long flags; | |
1167 | + int i; | |
1168 | + | |
1169 | + write_lock_bh(&ntbl->rwlock); | |
1170 | + | |
1171 | + htbl = rcu_dereference_protected(ntbl->htbl, | |
1172 | + lockdep_is_held(&ntbl->rwlock)); | |
1173 | + if (!htbl) | |
1174 | + goto out_unlock; | |
1175 | + | |
1176 | + for (i = 0; i < htbl->size; i++) { | |
1177 | + struct ipoib_neigh *neigh; | |
1178 | + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; | |
1179 | + | |
1180 | + while ((neigh = rcu_dereference_protected(*np, | |
1181 | + lockdep_is_held(&ntbl->rwlock))) != NULL) { | |
1182 | + rcu_assign_pointer(*np, | |
1183 | + rcu_dereference_protected(neigh->hnext, | |
1184 | + lockdep_is_held(&ntbl->rwlock))); | |
1185 | + /* remove from path/mc list */ | |
1186 | + spin_lock_irqsave(&priv->lock, flags); | |
1187 | + list_del(&neigh->list); | |
1188 | + spin_unlock_irqrestore(&priv->lock, flags); | |
1189 | + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); | |
1190 | + } | |
1191 | + } | |
1192 | + | |
1193 | + rcu_assign_pointer(ntbl->htbl, NULL); | |
1194 | + call_rcu(&htbl->rcu, neigh_hash_free_rcu); | |
1195 | + | |
1196 | +out_unlock: | |
1197 | + write_unlock_bh(&ntbl->rwlock); | |
1198 | +} | |
1199 | + | |
1200 | +static void ipoib_neigh_hash_uninit(struct net_device *dev) | |
1201 | +{ | |
1202 | + struct ipoib_dev_priv *priv = netdev_priv(dev); | |
1203 | + int stopped; | |
1204 | + | |
1205 | + ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); | |
1206 | + init_completion(&priv->ntbl.flushed); | |
1207 | + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); | |
1208 | + | |
1209 | + /* Stop GC; if called due to an init failure, the work must be cancelled */ | 
1210 | + stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | |
1211 | + if (!stopped) | |
1212 | + cancel_delayed_work(&priv->neigh_reap_task); | |
1213 | + | |
1214 | + if (atomic_read(&priv->ntbl.entries)) { | |
1215 | + ipoib_flush_neighs(priv); | |
1216 | + wait_for_completion(&priv->ntbl.flushed); | |
1217 | + } | |
1218 | +} | |
1219 | + | |
1220 | + | |
924 | 1221 | int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) |
925 | 1222 | { |
926 | 1223 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
927 | 1224 | |
1225 | + if (ipoib_neigh_hash_init(priv) < 0) | |
1226 | + goto out; | |
928 | 1227 | /* Allocate RX/TX "rings" to hold queued skbs */ |
929 | 1228 | priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, |
930 | 1229 | GFP_KERNEL); |
931 | 1230 | if (!priv->rx_ring) { |
932 | 1231 | printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", |
933 | 1232 | ca->name, ipoib_recvq_size); |
934 | - goto out; | |
1233 | + goto out_neigh_hash_cleanup; | |
935 | 1234 | } |
936 | 1235 | |
937 | 1236 | priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); |
... | ... | @@ -954,6 +1253,8 @@ |
954 | 1253 | out_rx_ring_cleanup: |
955 | 1254 | kfree(priv->rx_ring); |
956 | 1255 | |
1256 | +out_neigh_hash_cleanup: | |
1257 | + ipoib_neigh_hash_uninit(dev); | |
957 | 1258 | out: |
958 | 1259 | return -ENOMEM; |
959 | 1260 | } |
... | ... | @@ -966,6 +1267,9 @@ |
966 | 1267 | |
967 | 1268 | /* Delete any child interfaces first */ |
968 | 1269 | list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { |
1270 | + /* Stop GC on child */ | |
1271 | + set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); | |
1272 | + cancel_delayed_work(&cpriv->neigh_reap_task); | |
969 | 1273 | unregister_netdev(cpriv->dev); |
970 | 1274 | ipoib_dev_cleanup(cpriv->dev); |
971 | 1275 | free_netdev(cpriv->dev); |
... | ... | @@ -978,6 +1282,8 @@ |
978 | 1282 | |
979 | 1283 | priv->rx_ring = NULL; |
980 | 1284 | priv->tx_ring = NULL; |
1285 | + | |
1286 | + ipoib_neigh_hash_uninit(dev); | |
981 | 1287 | } |
982 | 1288 | |
983 | 1289 | static const struct header_ops ipoib_header_ops = { |
... | ... | @@ -992,7 +1298,6 @@ |
992 | 1298 | .ndo_start_xmit = ipoib_start_xmit, |
993 | 1299 | .ndo_tx_timeout = ipoib_timeout, |
994 | 1300 | .ndo_set_rx_mode = ipoib_set_mcast_list, |
995 | - .ndo_neigh_setup = ipoib_neigh_setup_dev, | |
996 | 1301 | }; |
997 | 1302 | |
998 | 1303 | static void ipoib_setup(struct net_device *dev) |
... | ... | @@ -1041,6 +1346,7 @@ |
1041 | 1346 | INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); |
1042 | 1347 | INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); |
1043 | 1348 | INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); |
1349 | + INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); | |
1044 | 1350 | } |
1045 | 1351 | |
1046 | 1352 | struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) |
... | ... | @@ -1281,6 +1587,9 @@ |
1281 | 1587 | |
1282 | 1588 | register_failed: |
1283 | 1589 | ib_unregister_event_handler(&priv->event_handler); |
1590 | + /* Stop GC if started before flush */ | |
1591 | + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | |
1592 | + cancel_delayed_work(&priv->neigh_reap_task); | |
1284 | 1593 | flush_workqueue(ipoib_workqueue); |
1285 | 1594 | |
1286 | 1595 | event_failed: |
... | ... | @@ -1347,6 +1656,9 @@ |
1347 | 1656 | dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); |
1348 | 1657 | rtnl_unlock(); |
1349 | 1658 | |
1659 | + /* Stop GC */ | |
1660 | + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | |
1661 | + cancel_delayed_work(&priv->neigh_reap_task); | |
1350 | 1662 | flush_workqueue(ipoib_workqueue); |
1351 | 1663 | |
1352 | 1664 | unregister_netdev(priv->dev); |
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
... | ... | @@ -69,29 +69,14 @@ |
69 | 69 | static void ipoib_mcast_free(struct ipoib_mcast *mcast) |
70 | 70 | { |
71 | 71 | struct net_device *dev = mcast->dev; |
72 | - struct ipoib_dev_priv *priv = netdev_priv(dev); | |
73 | - struct ipoib_neigh *neigh, *tmp; | |
74 | 72 | int tx_dropped = 0; |
75 | 73 | |
76 | 74 | ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", |
77 | 75 | mcast->mcmember.mgid.raw); |
78 | 76 | |
79 | - spin_lock_irq(&priv->lock); | |
77 | + /* remove all neigh connected to this mcast */ | |
78 | + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw); | |
80 | 79 | |
81 | - list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) { | |
82 | - /* | |
83 | - * It's safe to call ipoib_put_ah() inside priv->lock | |
84 | - * here, because we know that mcast->ah will always | |
85 | - * hold one more reference, so ipoib_put_ah() will | |
86 | - * never do more than decrement the ref count. | |
87 | - */ | |
88 | - if (neigh->ah) | |
89 | - ipoib_put_ah(neigh->ah); | |
90 | - ipoib_neigh_free(dev, neigh); | |
91 | - } | |
92 | - | |
93 | - spin_unlock_irq(&priv->lock); | |
94 | - | |
95 | 80 | if (mcast->ah) |
96 | 81 | ipoib_put_ah(mcast->ah); |
97 | 82 | |
98 | 83 | |
99 | 84 | |
100 | 85 | |
101 | 86 | |
... | ... | @@ -655,18 +640,13 @@ |
655 | 640 | return 0; |
656 | 641 | } |
657 | 642 | |
658 | -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) | |
643 | +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) | |
659 | 644 | { |
660 | 645 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
661 | - struct dst_entry *dst = skb_dst(skb); | |
662 | 646 | struct ipoib_mcast *mcast; |
663 | - struct neighbour *n; | |
664 | 647 | unsigned long flags; |
648 | + void *mgid = daddr + 4; | |
665 | 649 | |
666 | - n = NULL; | |
667 | - if (dst) | |
668 | - n = dst_neigh_lookup_skb(dst, skb); | |
669 | - | |
670 | 650 | spin_lock_irqsave(&priv->lock, flags); |
671 | 651 | |
672 | 652 | if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || |
673 | 653 | |
674 | 654 | |
675 | 655 | |
676 | 656 | |
... | ... | @@ -721,28 +701,29 @@ |
721 | 701 | |
722 | 702 | out: |
723 | 703 | if (mcast && mcast->ah) { |
724 | - if (n) { | |
725 | - if (!*to_ipoib_neigh(n)) { | |
726 | - struct ipoib_neigh *neigh; | |
704 | + struct ipoib_neigh *neigh; | |
727 | 705 | |
728 | - neigh = ipoib_neigh_alloc(n, skb->dev); | |
729 | - if (neigh) { | |
730 | - kref_get(&mcast->ah->ref); | |
731 | - neigh->ah = mcast->ah; | |
732 | - list_add_tail(&neigh->list, | |
733 | - &mcast->neigh_list); | |
734 | - } | |
706 | + spin_unlock_irqrestore(&priv->lock, flags); | |
707 | + neigh = ipoib_neigh_get(dev, daddr); | |
708 | + spin_lock_irqsave(&priv->lock, flags); | |
709 | + if (!neigh) { | |
710 | + spin_unlock_irqrestore(&priv->lock, flags); | |
711 | + neigh = ipoib_neigh_alloc(daddr, dev); | |
712 | + spin_lock_irqsave(&priv->lock, flags); | |
713 | + if (neigh) { | |
714 | + kref_get(&mcast->ah->ref); | |
715 | + neigh->ah = mcast->ah; | |
716 | + list_add_tail(&neigh->list, &mcast->neigh_list); | |
735 | 717 | } |
736 | - neigh_release(n); | |
737 | 718 | } |
738 | 719 | spin_unlock_irqrestore(&priv->lock, flags); |
739 | 720 | ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); |
721 | + if (neigh) | |
722 | + ipoib_neigh_put(neigh); | |
740 | 723 | return; |
741 | 724 | } |
742 | 725 | |
743 | 726 | unlock: |
744 | - if (n) | |
745 | - neigh_release(n); | |
746 | 727 | spin_unlock_irqrestore(&priv->lock, flags); |
747 | 728 | } |
748 | 729 |