Commit 29b4433d991c88d86ca48a4c1cc33c671475be4b
Committed by
David S. Miller
1 parent
f0b9f47251
Exists in
master
and in
7 other branches
net: percpu net_device refcount
We tried very hard to remove all possible dev_hold()/dev_put() pairs in the network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers, some app servers, mmap af_packet).

We can switch to a percpu refcount implementation, now that the dynamic per_cpu infrastructure is mature. On a 64-CPU machine, this consumes 256 bytes per device.

On x86, the dev_hold(dev) code is:

before:
    lock incl 0x280(%ebx)
after:
    movl 0x260(%ebx),%eax
    incl fs:(%eax)

Stress bench (sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE):

Before:
    real 1m1.662s
    user 0m14.373s
    sys  12m55.960s

After:
    real 0m51.179s
    user 0m15.329s
    sys  10m15.942s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 4 changed files with 41 additions and 14 deletions Side-by-side Diff
drivers/infiniband/hw/nes/nes_cm.c
... | ... | @@ -2701,7 +2701,7 @@ |
2701 | 2701 | nesibdev = nesvnic->nesibdev; |
2702 | 2702 | |
2703 | 2703 | nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", |
2704 | - atomic_read(&nesvnic->netdev->refcnt)); | |
2704 | + netdev_refcnt_read(nesvnic->netdev)); | |
2705 | 2705 | |
2706 | 2706 | if (nesqp->active_conn) { |
2707 | 2707 | |
... | ... | @@ -2791,7 +2791,7 @@ |
2791 | 2791 | atomic_inc(&cm_accepts); |
2792 | 2792 | |
2793 | 2793 | nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", |
2794 | - atomic_read(&nesvnic->netdev->refcnt)); | |
2794 | + netdev_refcnt_read(nesvnic->netdev)); | |
2795 | 2795 | |
2796 | 2796 | /* allocate the ietf frame and space for private data */ |
2797 | 2797 | nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, |
drivers/infiniband/hw/nes/nes_verbs.c
... | ... | @@ -785,7 +785,7 @@ |
785 | 785 | |
786 | 786 | nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", |
787 | 787 | nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, |
788 | - atomic_read(&nesvnic->netdev->refcnt)); | |
788 | + netdev_refcnt_read(nesvnic->netdev)); | |
789 | 789 | |
790 | 790 | err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, |
791 | 791 | nesadapter->max_pd, &pd_num, &nesadapter->next_pd); |
... | ... | @@ -1416,7 +1416,7 @@ |
1416 | 1416 | /* update the QP table */ |
1417 | 1417 | nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; |
1418 | 1418 | nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", |
1419 | - atomic_read(&nesvnic->netdev->refcnt)); | |
1419 | + netdev_refcnt_read(nesvnic->netdev)); | |
1420 | 1420 | |
1421 | 1421 | return &nesqp->ibqp; |
1422 | 1422 | } |
include/linux/netdevice.h
... | ... | @@ -1026,7 +1026,7 @@ |
1026 | 1026 | struct timer_list watchdog_timer; |
1027 | 1027 | |
1028 | 1028 | /* Number of references to this device */ |
1029 | - atomic_t refcnt ____cacheline_aligned_in_smp; | |
1029 | + int __percpu *pcpu_refcnt; | |
1030 | 1030 | |
1031 | 1031 | /* delayed register/unregister */ |
1032 | 1032 | struct list_head todo_list; |
... | ... | @@ -1330,6 +1330,7 @@ |
1330 | 1330 | unregister_netdevice_queue(dev, NULL); |
1331 | 1331 | } |
1332 | 1332 | |
1333 | +extern int netdev_refcnt_read(const struct net_device *dev); | |
1333 | 1334 | extern void free_netdev(struct net_device *dev); |
1334 | 1335 | extern void synchronize_net(void); |
1335 | 1336 | extern int register_netdevice_notifier(struct notifier_block *nb); |
... | ... | @@ -1798,7 +1799,7 @@ |
1798 | 1799 | */ |
1799 | 1800 | static inline void dev_put(struct net_device *dev) |
1800 | 1801 | { |
1801 | - atomic_dec(&dev->refcnt); | |
1802 | + irqsafe_cpu_dec(*dev->pcpu_refcnt); | |
1802 | 1803 | } |
1803 | 1804 | |
1804 | 1805 | /** |
... | ... | @@ -1809,7 +1810,7 @@ |
1809 | 1810 | */ |
1810 | 1811 | static inline void dev_hold(struct net_device *dev) |
1811 | 1812 | { |
1812 | - atomic_inc(&dev->refcnt); | |
1813 | + irqsafe_cpu_inc(*dev->pcpu_refcnt); | |
1813 | 1814 | } |
1814 | 1815 | |
1815 | 1816 | /* Carrier loss detection, dial on demand. The functions netif_carrier_on |
net/core/dev.c
... | ... | @@ -5192,9 +5192,6 @@ |
5192 | 5192 | */ |
5193 | 5193 | dev->reg_state = NETREG_DUMMY; |
5194 | 5194 | |
5195 | - /* initialize the ref count */ | |
5196 | - atomic_set(&dev->refcnt, 1); | |
5197 | - | |
5198 | 5195 | /* NAPI wants this */ |
5199 | 5196 | INIT_LIST_HEAD(&dev->napi_list); |
5200 | 5197 | |
... | ... | @@ -5202,6 +5199,11 @@ |
5202 | 5199 | set_bit(__LINK_STATE_PRESENT, &dev->state); |
5203 | 5200 | set_bit(__LINK_STATE_START, &dev->state); |
5204 | 5201 | |
5202 | + /* Note : We dont allocate pcpu_refcnt for dummy devices, | |
5203 | + * because users of this 'device' dont need to change | |
5204 | + * its refcount. | |
5205 | + */ | |
5206 | + | |
5205 | 5207 | return 0; |
5206 | 5208 | } |
5207 | 5209 | EXPORT_SYMBOL_GPL(init_dummy_netdev); |
... | ... | @@ -5243,6 +5245,16 @@ |
5243 | 5245 | } |
5244 | 5246 | EXPORT_SYMBOL(register_netdev); |
5245 | 5247 | |
5248 | +int netdev_refcnt_read(const struct net_device *dev) | |
5249 | +{ | |
5250 | + int i, refcnt = 0; | |
5251 | + | |
5252 | + for_each_possible_cpu(i) | |
5253 | + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); | |
5254 | + return refcnt; | |
5255 | +} | |
5256 | +EXPORT_SYMBOL(netdev_refcnt_read); | |
5257 | + | |
5246 | 5258 | /* |
5247 | 5259 | * netdev_wait_allrefs - wait until all references are gone. |
5248 | 5260 | * |
5249 | 5261 | |
... | ... | @@ -5257,11 +5269,14 @@ |
5257 | 5269 | static void netdev_wait_allrefs(struct net_device *dev) |
5258 | 5270 | { |
5259 | 5271 | unsigned long rebroadcast_time, warning_time; |
5272 | + int refcnt; | |
5260 | 5273 | |
5261 | 5274 | linkwatch_forget_dev(dev); |
5262 | 5275 | |
5263 | 5276 | rebroadcast_time = warning_time = jiffies; |
5264 | - while (atomic_read(&dev->refcnt) != 0) { | |
5277 | + refcnt = netdev_refcnt_read(dev); | |
5278 | + | |
5279 | + while (refcnt != 0) { | |
5265 | 5280 | if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { |
5266 | 5281 | rtnl_lock(); |
5267 | 5282 | |
5268 | 5283 | |
... | ... | @@ -5288,11 +5303,13 @@ |
5288 | 5303 | |
5289 | 5304 | msleep(250); |
5290 | 5305 | |
5306 | + refcnt = netdev_refcnt_read(dev); | |
5307 | + | |
5291 | 5308 | if (time_after(jiffies, warning_time + 10 * HZ)) { |
5292 | 5309 | printk(KERN_EMERG "unregister_netdevice: " |
5293 | 5310 | "waiting for %s to become free. Usage " |
5294 | 5311 | "count = %d\n", |
5295 | - dev->name, atomic_read(&dev->refcnt)); | |
5312 | + dev->name, refcnt); | |
5296 | 5313 | warning_time = jiffies; |
5297 | 5314 | } |
5298 | 5315 | } |
... | ... | @@ -5350,7 +5367,7 @@ |
5350 | 5367 | netdev_wait_allrefs(dev); |
5351 | 5368 | |
5352 | 5369 | /* paranoia */ |
5353 | - BUG_ON(atomic_read(&dev->refcnt)); | |
5370 | + BUG_ON(netdev_refcnt_read(dev)); | |
5354 | 5371 | WARN_ON(rcu_dereference_raw(dev->ip_ptr)); |
5355 | 5372 | WARN_ON(dev->ip6_ptr); |
5356 | 5373 | WARN_ON(dev->dn_ptr); |
5357 | 5374 | |
... | ... | @@ -5520,9 +5537,13 @@ |
5520 | 5537 | dev = PTR_ALIGN(p, NETDEV_ALIGN); |
5521 | 5538 | dev->padded = (char *)dev - (char *)p; |
5522 | 5539 | |
5523 | - if (dev_addr_init(dev)) | |
5540 | + dev->pcpu_refcnt = alloc_percpu(int); | |
5541 | + if (!dev->pcpu_refcnt) | |
5524 | 5542 | goto free_tx; |
5525 | 5543 | |
5544 | + if (dev_addr_init(dev)) | |
5545 | + goto free_pcpu; | |
5546 | + | |
5526 | 5547 | dev_mc_init(dev); |
5527 | 5548 | dev_uc_init(dev); |
5528 | 5549 | |
... | ... | @@ -5553,6 +5574,8 @@ |
5553 | 5574 | |
5554 | 5575 | free_tx: |
5555 | 5576 | kfree(tx); |
5577 | +free_pcpu: | |
5578 | + free_percpu(dev->pcpu_refcnt); | |
5556 | 5579 | free_p: |
5557 | 5580 | kfree(p); |
5558 | 5581 | return NULL; |
... | ... | @@ -5585,6 +5608,9 @@ |
5585 | 5608 | |
5586 | 5609 | list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) |
5587 | 5610 | netif_napi_del(p); |
5611 | + | |
5612 | + free_percpu(dev->pcpu_refcnt); | |
5613 | + dev->pcpu_refcnt = NULL; | |
5588 | 5614 | |
5589 | 5615 | /* Compatibility with error handling in drivers */ |
5590 | 5616 | if (dev->reg_state == NETREG_UNINITIALIZED) { |