Commit 29b4433d991c88d86ca48a4c1cc33c671475be4b

Authored by Eric Dumazet
Committed by David S. Miller
1 parent f0b9f47251

net: percpu net_device refcount

We tried very hard to remove all possible dev_hold()/dev_put() pairs in
the network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers, some
app servers, mmap af_packet).

We can switch to a percpu refcount implementation, now that the dynamic
per_cpu infrastructure is mature. On a 64-CPU machine, this consumes
256 bytes per device (one 4-byte int per possible CPU).

On x86, the code for dev_hold(dev) is:

before:
        lock    incl 0x280(%ebx)
after:
        movl    0x260(%ebx),%eax
        incl    fs:(%eax)

Stress bench :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)

Before:

real    1m1.662s
user    0m14.373s
sys     12m55.960s

After:

real    0m51.179s
user    0m15.329s
sys     10m15.942s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 4 changed files with 41 additions and 14 deletions Side-by-side Diff

drivers/infiniband/hw/nes/nes_cm.c
... ... @@ -2701,7 +2701,7 @@
2701 2701 nesibdev = nesvnic->nesibdev;
2702 2702  
2703 2703 nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
2704   - atomic_read(&nesvnic->netdev->refcnt));
  2704 + netdev_refcnt_read(nesvnic->netdev));
2705 2705  
2706 2706 if (nesqp->active_conn) {
2707 2707  
... ... @@ -2791,7 +2791,7 @@
2791 2791 atomic_inc(&cm_accepts);
2792 2792  
2793 2793 nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
2794   - atomic_read(&nesvnic->netdev->refcnt));
  2794 + netdev_refcnt_read(nesvnic->netdev));
2795 2795  
2796 2796 /* allocate the ietf frame and space for private data */
2797 2797 nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
drivers/infiniband/hw/nes/nes_verbs.c
... ... @@ -785,7 +785,7 @@
785 785  
786 786 nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
787 787 nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
788   - atomic_read(&nesvnic->netdev->refcnt));
  788 + netdev_refcnt_read(nesvnic->netdev));
789 789  
790 790 err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
791 791 nesadapter->max_pd, &pd_num, &nesadapter->next_pd);
... ... @@ -1416,7 +1416,7 @@
1416 1416 /* update the QP table */
1417 1417 nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
1418 1418 nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
1419   - atomic_read(&nesvnic->netdev->refcnt));
  1419 + netdev_refcnt_read(nesvnic->netdev));
1420 1420  
1421 1421 return &nesqp->ibqp;
1422 1422 }
include/linux/netdevice.h
... ... @@ -1026,7 +1026,7 @@
1026 1026 struct timer_list watchdog_timer;
1027 1027  
1028 1028 /* Number of references to this device */
1029   - atomic_t refcnt ____cacheline_aligned_in_smp;
  1029 + int __percpu *pcpu_refcnt;
1030 1030  
1031 1031 /* delayed register/unregister */
1032 1032 struct list_head todo_list;
... ... @@ -1330,6 +1330,7 @@
1330 1330 unregister_netdevice_queue(dev, NULL);
1331 1331 }
1332 1332  
  1333 +extern int netdev_refcnt_read(const struct net_device *dev);
1333 1334 extern void free_netdev(struct net_device *dev);
1334 1335 extern void synchronize_net(void);
1335 1336 extern int register_netdevice_notifier(struct notifier_block *nb);
... ... @@ -1798,7 +1799,7 @@
1798 1799 */
1799 1800 static inline void dev_put(struct net_device *dev)
1800 1801 {
1801   - atomic_dec(&dev->refcnt);
  1802 + irqsafe_cpu_dec(*dev->pcpu_refcnt);
1802 1803 }
1803 1804  
1804 1805 /**
... ... @@ -1809,7 +1810,7 @@
1809 1810 */
1810 1811 static inline void dev_hold(struct net_device *dev)
1811 1812 {
1812   - atomic_inc(&dev->refcnt);
  1813 + irqsafe_cpu_inc(*dev->pcpu_refcnt);
1813 1814 }
1814 1815  
1815 1816 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
... ... @@ -5192,9 +5192,6 @@
5192 5192 */
5193 5193 dev->reg_state = NETREG_DUMMY;
5194 5194  
5195   - /* initialize the ref count */
5196   - atomic_set(&dev->refcnt, 1);
5197   -
5198 5195 /* NAPI wants this */
5199 5196 INIT_LIST_HEAD(&dev->napi_list);
5200 5197  
... ... @@ -5202,6 +5199,11 @@
5202 5199 set_bit(__LINK_STATE_PRESENT, &dev->state);
5203 5200 set_bit(__LINK_STATE_START, &dev->state);
5204 5201  
  5202 + /* Note : We dont allocate pcpu_refcnt for dummy devices,
  5203 + * because users of this 'device' dont need to change
  5204 + * its refcount.
  5205 + */
  5206 +
5205 5207 return 0;
5206 5208 }
5207 5209 EXPORT_SYMBOL_GPL(init_dummy_netdev);
... ... @@ -5243,6 +5245,16 @@
5243 5245 }
5244 5246 EXPORT_SYMBOL(register_netdev);
5245 5247  
  5248 +int netdev_refcnt_read(const struct net_device *dev)
  5249 +{
  5250 + int i, refcnt = 0;
  5251 +
  5252 + for_each_possible_cpu(i)
  5253 + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
  5254 + return refcnt;
  5255 +}
  5256 +EXPORT_SYMBOL(netdev_refcnt_read);
  5257 +
5246 5258 /*
5247 5259 * netdev_wait_allrefs - wait until all references are gone.
5248 5260 *
5249 5261  
... ... @@ -5257,11 +5269,14 @@
5257 5269 static void netdev_wait_allrefs(struct net_device *dev)
5258 5270 {
5259 5271 unsigned long rebroadcast_time, warning_time;
  5272 + int refcnt;
5260 5273  
5261 5274 linkwatch_forget_dev(dev);
5262 5275  
5263 5276 rebroadcast_time = warning_time = jiffies;
5264   - while (atomic_read(&dev->refcnt) != 0) {
  5277 + refcnt = netdev_refcnt_read(dev);
  5278 +
  5279 + while (refcnt != 0) {
5265 5280 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5266 5281 rtnl_lock();
5267 5282  
5268 5283  
... ... @@ -5288,11 +5303,13 @@
5288 5303  
5289 5304 msleep(250);
5290 5305  
  5306 + refcnt = netdev_refcnt_read(dev);
  5307 +
5291 5308 if (time_after(jiffies, warning_time + 10 * HZ)) {
5292 5309 printk(KERN_EMERG "unregister_netdevice: "
5293 5310 "waiting for %s to become free. Usage "
5294 5311 "count = %d\n",
5295   - dev->name, atomic_read(&dev->refcnt));
  5312 + dev->name, refcnt);
5296 5313 warning_time = jiffies;
5297 5314 }
5298 5315 }
... ... @@ -5350,7 +5367,7 @@
5350 5367 netdev_wait_allrefs(dev);
5351 5368  
5352 5369 /* paranoia */
5353   - BUG_ON(atomic_read(&dev->refcnt));
  5370 + BUG_ON(netdev_refcnt_read(dev));
5354 5371 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5355 5372 WARN_ON(dev->ip6_ptr);
5356 5373 WARN_ON(dev->dn_ptr);
5357 5374  
... ... @@ -5520,9 +5537,13 @@
5520 5537 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5521 5538 dev->padded = (char *)dev - (char *)p;
5522 5539  
5523   - if (dev_addr_init(dev))
  5540 + dev->pcpu_refcnt = alloc_percpu(int);
  5541 + if (!dev->pcpu_refcnt)
5524 5542 goto free_tx;
5525 5543  
  5544 + if (dev_addr_init(dev))
  5545 + goto free_pcpu;
  5546 +
5526 5547 dev_mc_init(dev);
5527 5548 dev_uc_init(dev);
5528 5549  
... ... @@ -5553,6 +5574,8 @@
5553 5574  
5554 5575 free_tx:
5555 5576 kfree(tx);
  5577 +free_pcpu:
  5578 + free_percpu(dev->pcpu_refcnt);
5556 5579 free_p:
5557 5580 kfree(p);
5558 5581 return NULL;
... ... @@ -5585,6 +5608,9 @@
5585 5608  
5586 5609 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5587 5610 netif_napi_del(p);
  5611 +
  5612 + free_percpu(dev->pcpu_refcnt);
  5613 + dev->pcpu_refcnt = NULL;
5588 5614  
5589 5615 /* Compatibility with error handling in drivers */
5590 5616 if (dev->reg_state == NETREG_UNINITIALIZED) {