Commit da3bbdd4632c0171406b2677e31494afa5bde2f8

Authored by Kentaro Makita
Committed by Linus Torvalds
1 parent 3c82d0ce2c

fix soft lock up at NFS mount via per-SB LRU-list of unused dentries

[Summary]

 Split the LRU list of unused dentries into one list per superblock, to
 avoid soft lockups during NFS mounts and when remounting any filesystem.

 I previously posted this here:
 http://lkml.org/lkml/2008/3/5/590

[Description]

- background

  dentry_unused is the global list of dentries that are no longer
  referenced.  It grows whenever references on directories or files are
  released, and it can become very long when there is a lot of free memory.
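
  For context, this is roughly how dentries end up on that list,
  condensed from the dput() path this patch modifies (before the patch
  the target is the single global dentry_unused list):

    /* an unreferenced dentry is parked on the unused LRU in dput()
     * instead of being freed immediately */
    if (list_empty(&dentry->d_lru)) {
            dentry->d_flags |= DCACHE_REFERENCED;
            list_add(&dentry->d_lru, &dentry_unused);   /* global list */
            dentry_stat.nr_unused++;
    }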

- the problem

  When shrink_dcache_sb() is called, it scans the whole dentry_unused
  list linearly under dcache_lock; whenever dentry->d_sb differs from
  the given superblock it simply moves on to the next dentry.  This scan
  is very expensive when there are many entries, and very ineffective
  when there are many superblocks.
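
  A condensed excerpt of the old shrink_dcache_sb() (removed by this
  patch, visible in the diff below) shows the pattern; both of its
  passes walk the single global list and merely skip dentries that
  belong to other superblocks:

    list_for_each_prev_safe(tmp, next, &dentry_unused) {
            dentry = list_entry(tmp, struct dentry, d_lru);
            if (dentry->d_sb != sb)
                    continue;       /* cost paid for a foreign dentry */
            list_move_tail(tmp, &dentry_unused);
    }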

  In other words, when we only need to shrink the unused dentries of one
  superblock, we end up scanning the unused dentries of every superblock
  in the system.  For example, to unmount a filesystem we may only need
  to scan 500 dentries, yet we scan 1,000,000 or more unused dentries
  belonging to other superblocks.

  In our case, at NFS mount time (*), shrink_dcache_sb() is called to
  shrink the unused dentries of NFS, but it ends up scanning 100,000,000
  unused dentries belonging to other superblocks in the system, such as
  local ext3 filesystems.  I have heard that mounting NFS took about one
  minute on some systems in production use.

(*) NFS uses a virtual filesystem in the rpc layer, so NFS is affected
  by this problem.

  100,000,000 is a realistic number on large systems.

  A per-superblock LRU of unused dentries can reduce this cost to a
  reasonable level; a sketch of the new per-superblock state follows.
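
  Concretely, the patch (full diff below) gives each superblock its own
  LRU list and counter, and small helpers keep the per-superblock and
  global counters in sync:

    /* new per-superblock state, protected by dcache_lock */
    struct list_head s_dentry_lru;      /* unused dentry LRU */
    int s_nr_dentry_unused;             /* # of dentries on the LRU */

    static void dentry_lru_add(struct dentry *dentry)
    {
            list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
            dentry->d_sb->s_nr_dentry_unused++;
            dentry_stat.nr_unused++;
    }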

- How to fix

  I found that this problem is solved by David Chinner's "Per-superblock
  unused dentry LRU lists V3" (1), so I rebased it and added a fix to
  make reclaim fair, as requested in Andrew Morton's comments (2).

  1) http://lkml.org/lkml/2006/5/25/318
  2) http://lkml.org/lkml/2006/5/25/320

  Split the LRU list of unused dentries into one list per superblock.
  An NFS mount then only has to check the dentries of its own superblock
  instead of all of them.  However, this split breaks the global LRU
  ordering of unused dentries, so I've attempted to keep reclaim of
  unused dentries fair by calculating the number of dentries to scan on
  each superblock as follows:

  number of dentries to scan on this sb =
  count * (number of dentries on this sb / number of dentries in the machine)
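
  A condensed excerpt of the new prune_dcache() (shown in full in the
  diff below) illustrates how this is computed; the division is done
  first, as a ratio, so that the multiplication cannot overflow:

    int unused = dentry_stat.nr_unused;
    int prune_ratio, w_count;

    if (count >= unused)
            prune_ratio = 1;
    else
            prune_ratio = unused / count;
    ...
    /* per-sb share, roughly count * (s_nr_dentry_unused / unused) */
    if (prune_ratio != 1)
            w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
    else
            w_count = sb->s_nr_dentry_unused;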

- ToDo
 - I still have to measure performance numbers and run stress tests.

 - If a superblock is unmounted while prune_dcache() is scanning it, we
  cannot step to the next superblock because the current one has gone
  away.  In that case we restart the scan from the first superblock,
  which makes reclaim of unused dentries slightly unfair toward that
  first superblock.  I think this happens very rarely, though.  (See the
  sketch after this list.)
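
  The restart handling mentioned above, condensed from the new
  prune_dcache() (see the diff below):

    /* if the superblock we just scanned is going away we cannot step
     * to the next one safely, so we restart from the head of the
     * super_blocks list */
    if (__put_super_and_need_restart(sb) && count > 0) {
            spin_unlock(&sb_lock);
            goto restart;
    }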

- Test Results

  Results on 6GB boxes with an excessive number of unused dentries.

Without patch:

$ cat /proc/sys/fs/dentry-state
10181835        10180203        45      0       0       0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real    0m1.830s
user    0m0.001s
sys     0m1.653s

With this patch:

$ cat /proc/sys/fs/dentry-state
10236610        10234751        45      0       0       0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real    0m0.106s
user    0m0.002s
sys     0m0.032s

[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 185 additions and 155 deletions

... ... @@ -61,7 +61,6 @@
61 61 static unsigned int d_hash_mask __read_mostly;
62 62 static unsigned int d_hash_shift __read_mostly;
63 63 static struct hlist_head *dentry_hashtable __read_mostly;
64   -static LIST_HEAD(dentry_unused);
65 64  
66 65 /* Statistics gathering. */
67 66 struct dentry_stat_t dentry_stat = {
... ... @@ -96,14 +95,6 @@
96 95 call_rcu(&dentry->d_u.d_rcu, d_callback);
97 96 }
98 97  
99   -static void dentry_lru_remove(struct dentry *dentry)
100   -{
101   - if (!list_empty(&dentry->d_lru)) {
102   - list_del_init(&dentry->d_lru);
103   - dentry_stat.nr_unused--;
104   - }
105   -}
106   -
107 98 /*
108 99 * Release the dentry's inode, using the filesystem
109 100 * d_iput() operation if defined.
... ... @@ -130,6 +121,41 @@
130 121 }
131 122 }
132 123  
  124 +/*
  125 + * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
  126 + */
  127 +static void dentry_lru_add(struct dentry *dentry)
  128 +{
  129 + list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
  130 + dentry->d_sb->s_nr_dentry_unused++;
  131 + dentry_stat.nr_unused++;
  132 +}
  133 +
  134 +static void dentry_lru_add_tail(struct dentry *dentry)
  135 +{
  136 + list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
  137 + dentry->d_sb->s_nr_dentry_unused++;
  138 + dentry_stat.nr_unused++;
  139 +}
  140 +
  141 +static void dentry_lru_del(struct dentry *dentry)
  142 +{
  143 + if (!list_empty(&dentry->d_lru)) {
  144 + list_del(&dentry->d_lru);
  145 + dentry->d_sb->s_nr_dentry_unused--;
  146 + dentry_stat.nr_unused--;
  147 + }
  148 +}
  149 +
  150 +static void dentry_lru_del_init(struct dentry *dentry)
  151 +{
  152 + if (likely(!list_empty(&dentry->d_lru))) {
  153 + list_del_init(&dentry->d_lru);
  154 + dentry->d_sb->s_nr_dentry_unused--;
  155 + dentry_stat.nr_unused--;
  156 + }
  157 +}
  158 +
133 159 /**
134 160 * d_kill - kill dentry and return parent
135 161 * @dentry: dentry to kill
... ... @@ -212,8 +238,7 @@
212 238 goto kill_it;
213 239 if (list_empty(&dentry->d_lru)) {
214 240 dentry->d_flags |= DCACHE_REFERENCED;
215   - list_add(&dentry->d_lru, &dentry_unused);
216   - dentry_stat.nr_unused++;
  241 + dentry_lru_add(dentry);
217 242 }
218 243 spin_unlock(&dentry->d_lock);
219 244 spin_unlock(&dcache_lock);
... ... @@ -222,7 +247,8 @@
222 247 unhash_it:
223 248 __d_drop(dentry);
224 249 kill_it:
225   - dentry_lru_remove(dentry);
  250 + /* if dentry was on the d_lru list delete it from there */
  251 + dentry_lru_del(dentry);
226 252 dentry = d_kill(dentry);
227 253 if (dentry)
228 254 goto repeat;
... ... @@ -290,7 +316,7 @@
290 316 static inline struct dentry * __dget_locked(struct dentry *dentry)
291 317 {
292 318 atomic_inc(&dentry->d_count);
293   - dentry_lru_remove(dentry);
  319 + dentry_lru_del_init(dentry);
294 320 return dentry;
295 321 }
296 322  
... ... @@ -406,133 +432,167 @@
406 432  
407 433 if (dentry->d_op && dentry->d_op->d_delete)
408 434 dentry->d_op->d_delete(dentry);
409   - dentry_lru_remove(dentry);
  435 + dentry_lru_del_init(dentry);
410 436 __d_drop(dentry);
411 437 dentry = d_kill(dentry);
412 438 spin_lock(&dcache_lock);
413 439 }
414 440 }
415 441  
416   -/**
417   - * prune_dcache - shrink the dcache
418   - * @count: number of entries to try and free
419   - * @sb: if given, ignore dentries for other superblocks
420   - * which are being unmounted.
421   - *
422   - * Shrink the dcache. This is done when we need
423   - * more memory, or simply when we need to unmount
424   - * something (at which point we need to unuse
425   - * all dentries).
426   - *
427   - * This function may fail to free any resources if
428   - * all the dentries are in use.
  442 +/*
  443 + * Shrink the dentry LRU on a given superblock.
  444 + * @sb : superblock to shrink dentry LRU.
  445 + * @count: If count is NULL, we prune all dentries on superblock.
  446 + * @flags: If flags is non-zero, we need to do special processing based on
  447 + * which flags are set. This means we don't need to maintain multiple
  448 + * similar copies of this loop.
429 449 */
430   -
431   -static void prune_dcache(int count, struct super_block *sb)
  450 +static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
432 451 {
  452 + LIST_HEAD(referenced);
  453 + LIST_HEAD(tmp);
  454 + struct dentry *dentry;
  455 + int cnt = 0;
  456 +
  457 + BUG_ON(!sb);
  458 + BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
433 459 spin_lock(&dcache_lock);
434   - for (; count ; count--) {
435   - struct dentry *dentry;
436   - struct list_head *tmp;
437   - struct rw_semaphore *s_umount;
  460 + if (count != NULL)
  461 + /* called from prune_dcache() and shrink_dcache_parent() */
  462 + cnt = *count;
  463 +restart:
  464 + if (count == NULL)
  465 + list_splice_init(&sb->s_dentry_lru, &tmp);
  466 + else {
  467 + while (!list_empty(&sb->s_dentry_lru)) {
  468 + dentry = list_entry(sb->s_dentry_lru.prev,
  469 + struct dentry, d_lru);
  470 + BUG_ON(dentry->d_sb != sb);
438 471  
439   - cond_resched_lock(&dcache_lock);
440   -
441   - tmp = dentry_unused.prev;
442   - if (sb) {
443   - /* Try to find a dentry for this sb, but don't try
444   - * too hard, if they aren't near the tail they will
445   - * be moved down again soon
  472 + spin_lock(&dentry->d_lock);
  473 + /*
  474 + * If we are honouring the DCACHE_REFERENCED flag and
  475 + * the dentry has this flag set, don't free it. Clear
  476 + * the flag and put it back on the LRU.
446 477 */
447   - int skip = count;
448   - while (skip && tmp != &dentry_unused &&
449   - list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
450   - skip--;
451   - tmp = tmp->prev;
  478 + if ((flags & DCACHE_REFERENCED)
  479 + && (dentry->d_flags & DCACHE_REFERENCED)) {
  480 + dentry->d_flags &= ~DCACHE_REFERENCED;
  481 + list_move_tail(&dentry->d_lru, &referenced);
  482 + spin_unlock(&dentry->d_lock);
  483 + } else {
  484 + list_move_tail(&dentry->d_lru, &tmp);
  485 + spin_unlock(&dentry->d_lock);
  486 + cnt--;
  487 + if (!cnt)
  488 + break;
452 489 }
453 490 }
454   - if (tmp == &dentry_unused)
455   - break;
456   - list_del_init(tmp);
457   - prefetch(dentry_unused.prev);
458   - dentry_stat.nr_unused--;
459   - dentry = list_entry(tmp, struct dentry, d_lru);
460   -
461   - spin_lock(&dentry->d_lock);
  491 + }
  492 + while (!list_empty(&tmp)) {
  493 + dentry = list_entry(tmp.prev, struct dentry, d_lru);
  494 + dentry_lru_del_init(dentry);
  495 + spin_lock(&dentry->d_lock);
462 496 /*
463 497 * We found an inuse dentry which was not removed from
464   - * dentry_unused because of laziness during lookup. Do not free
465   - * it - just keep it off the dentry_unused list.
  498 + * the LRU because of laziness during lookup. Do not free
  499 + * it - just keep it off the LRU list.
466 500 */
467   - if (atomic_read(&dentry->d_count)) {
468   - spin_unlock(&dentry->d_lock);
  501 + if (atomic_read(&dentry->d_count)) {
  502 + spin_unlock(&dentry->d_lock);
469 503 continue;
470 504 }
471   - /* If the dentry was recently referenced, don't free it. */
472   - if (dentry->d_flags & DCACHE_REFERENCED) {
473   - dentry->d_flags &= ~DCACHE_REFERENCED;
474   - list_add(&dentry->d_lru, &dentry_unused);
475   - dentry_stat.nr_unused++;
476   - spin_unlock(&dentry->d_lock);
  505 + prune_one_dentry(dentry);
  506 + /* dentry->d_lock was dropped in prune_one_dentry() */
  507 + cond_resched_lock(&dcache_lock);
  508 + }
  509 + if (count == NULL && !list_empty(&sb->s_dentry_lru))
  510 + goto restart;
  511 + if (count != NULL)
  512 + *count = cnt;
  513 + if (!list_empty(&referenced))
  514 + list_splice(&referenced, &sb->s_dentry_lru);
  515 + spin_unlock(&dcache_lock);
  516 +}
  517 +
  518 +/**
  519 + * prune_dcache - shrink the dcache
  520 + * @count: number of entries to try to free
  521 + *
  522 + * Shrink the dcache. This is done when we need more memory, or simply when we
  523 + * need to unmount something (at which point we need to unuse all dentries).
  524 + *
  525 + * This function may fail to free any resources if all the dentries are in use.
  526 + */
  527 +static void prune_dcache(int count)
  528 +{
  529 + struct super_block *sb;
  530 + int w_count;
  531 + int unused = dentry_stat.nr_unused;
  532 + int prune_ratio;
  533 + int pruned;
  534 +
  535 + if (unused == 0 || count == 0)
  536 + return;
  537 + spin_lock(&dcache_lock);
  538 +restart:
  539 + if (count >= unused)
  540 + prune_ratio = 1;
  541 + else
  542 + prune_ratio = unused / count;
  543 + spin_lock(&sb_lock);
  544 + list_for_each_entry(sb, &super_blocks, s_list) {
  545 + if (sb->s_nr_dentry_unused == 0)
477 546 continue;
478   - }
479   - /*
480   - * If the dentry is not DCACHED_REFERENCED, it is time
481   - * to remove it from the dcache, provided the super block is
482   - * NULL (which means we are trying to reclaim memory)
483   - * or this dentry belongs to the same super block that
484   - * we want to shrink.
  547 + sb->s_count++;
  548 + /* Now, we reclaim unused dentries with fairness.
  549 + * We reclaim them same percentage from each superblock.
  550 + * We calculate number of dentries to scan on this sb
  551 + * as follows, but the implementation is arranged to avoid
  552 + * overflows:
  553 + * number of dentries to scan on this sb =
  554 + * count * (number of dentries on this sb /
  555 + * number of dentries in the machine)
485 556 */
  557 + spin_unlock(&sb_lock);
  558 + if (prune_ratio != 1)
  559 + w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
  560 + else
  561 + w_count = sb->s_nr_dentry_unused;
  562 + pruned = w_count;
486 563 /*
487   - * If this dentry is for "my" filesystem, then I can prune it
488   - * without taking the s_umount lock (I already hold it).
  564 + * We need to be sure this filesystem isn't being unmounted,
  565 + * otherwise we could race with generic_shutdown_super(), and
  566 + * end up holding a reference to an inode while the filesystem
  567 + * is unmounted. So we try to get s_umount, and make sure
  568 + * s_root isn't NULL.
489 569 */
490   - if (sb && dentry->d_sb == sb) {
491   - prune_one_dentry(dentry);
492   - continue;
493   - }
494   - /*
495   - * ...otherwise we need to be sure this filesystem isn't being
496   - * unmounted, otherwise we could race with
497   - * generic_shutdown_super(), and end up holding a reference to
498   - * an inode while the filesystem is unmounted.
499   - * So we try to get s_umount, and make sure s_root isn't NULL.
500   - * (Take a local copy of s_umount to avoid a use-after-free of
501   - * `dentry').
502   - */
503   - s_umount = &dentry->d_sb->s_umount;
504   - if (down_read_trylock(s_umount)) {
505   - if (dentry->d_sb->s_root != NULL) {
506   - prune_one_dentry(dentry);
507   - up_read(s_umount);
508   - continue;
  570 + if (down_read_trylock(&sb->s_umount)) {
  571 + if ((sb->s_root != NULL) &&
  572 + (!list_empty(&sb->s_dentry_lru))) {
  573 + spin_unlock(&dcache_lock);
  574 + __shrink_dcache_sb(sb, &w_count,
  575 + DCACHE_REFERENCED);
  576 + pruned -= w_count;
  577 + spin_lock(&dcache_lock);
509 578 }
510   - up_read(s_umount);
  579 + up_read(&sb->s_umount);
511 580 }
512   - spin_unlock(&dentry->d_lock);
  581 + spin_lock(&sb_lock);
  582 + count -= pruned;
513 583 /*
514   - * Insert dentry at the head of the list as inserting at the
515   - * tail leads to a cycle.
  584 + * restart only when sb is no longer on the list and
  585 + * we have more work to do.
516 586 */
517   - list_add(&dentry->d_lru, &dentry_unused);
518   - dentry_stat.nr_unused++;
  587 + if (__put_super_and_need_restart(sb) && count > 0) {
  588 + spin_unlock(&sb_lock);
  589 + goto restart;
  590 + }
519 591 }
  592 + spin_unlock(&sb_lock);
520 593 spin_unlock(&dcache_lock);
521 594 }
522 595  
523   -/*
524   - * Shrink the dcache for the specified super block.
525   - * This allows us to unmount a device without disturbing
526   - * the dcache for the other devices.
527   - *
528   - * This implementation makes just two traversals of the
529   - * unused list. On the first pass we move the selected
530   - * dentries to the most recent end, and on the second
531   - * pass we free them. The second pass must restart after
532   - * each dput(), but since the target dentries are all at
533   - * the end, it's really just a single traversal.
534   - */
535   -
536 596 /**
537 597 * shrink_dcache_sb - shrink dcache for a superblock
538 598 * @sb: superblock
539 599  
... ... @@ -541,44 +601,9 @@
541 601 * is used to free the dcache before unmounting a file
542 602 * system
543 603 */
544   -
545 604 void shrink_dcache_sb(struct super_block * sb)
546 605 {
547   - struct list_head *tmp, *next;
548   - struct dentry *dentry;
549   -
550   - /*
551   - * Pass one ... move the dentries for the specified
552   - * superblock to the most recent end of the unused list.
553   - */
554   - spin_lock(&dcache_lock);
555   - list_for_each_prev_safe(tmp, next, &dentry_unused) {
556   - dentry = list_entry(tmp, struct dentry, d_lru);
557   - if (dentry->d_sb != sb)
558   - continue;
559   - list_move_tail(tmp, &dentry_unused);
560   - }
561   -
562   - /*
563   - * Pass two ... free the dentries for this superblock.
564   - */
565   -repeat:
566   - list_for_each_prev_safe(tmp, next, &dentry_unused) {
567   - dentry = list_entry(tmp, struct dentry, d_lru);
568   - if (dentry->d_sb != sb)
569   - continue;
570   - dentry_stat.nr_unused--;
571   - list_del_init(tmp);
572   - spin_lock(&dentry->d_lock);
573   - if (atomic_read(&dentry->d_count)) {
574   - spin_unlock(&dentry->d_lock);
575   - continue;
576   - }
577   - prune_one_dentry(dentry);
578   - cond_resched_lock(&dcache_lock);
579   - goto repeat;
580   - }
581   - spin_unlock(&dcache_lock);
  606 + __shrink_dcache_sb(sb, NULL, 0);
582 607 }
583 608  
584 609 /*
... ... @@ -595,7 +620,7 @@
595 620  
596 621 /* detach this root from the system */
597 622 spin_lock(&dcache_lock);
598   - dentry_lru_remove(dentry);
  623 + dentry_lru_del_init(dentry);
599 624 __d_drop(dentry);
600 625 spin_unlock(&dcache_lock);
601 626  
... ... @@ -609,7 +634,7 @@
609 634 spin_lock(&dcache_lock);
610 635 list_for_each_entry(loop, &dentry->d_subdirs,
611 636 d_u.d_child) {
612   - dentry_lru_remove(loop);
  637 + dentry_lru_del_init(loop);
613 638 __d_drop(loop);
614 639 cond_resched_lock(&dcache_lock);
615 640 }
616 641  
... ... @@ -791,14 +816,13 @@
791 816 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
792 817 next = tmp->next;
793 818  
794   - dentry_lru_remove(dentry);
  819 + dentry_lru_del_init(dentry);
795 820 /*
796 821 * move only zero ref count dentries to the end
797 822 * of the unused list for prune_dcache
798 823 */
799 824 if (!atomic_read(&dentry->d_count)) {
800   - list_add_tail(&dentry->d_lru, &dentry_unused);
801   - dentry_stat.nr_unused++;
  825 + dentry_lru_add_tail(dentry);
802 826 found++;
803 827 }
... ... @@ -840,10 +864,11 @@
840 864  
841 865 void shrink_dcache_parent(struct dentry * parent)
842 866 {
  867 + struct super_block *sb = parent->d_sb;
843 868 int found;
844 869  
845 870 while ((found = select_parent(parent)) != 0)
846   - prune_dcache(found, parent->d_sb);
  871 + __shrink_dcache_sb(sb, &found, 0);
847 872 }
848 873  
849 874 /*
... ... @@ -863,7 +888,7 @@
863 888 if (nr) {
864 889 if (!(gfp_mask & __GFP_FS))
865 890 return -1;
866   - prune_dcache(nr, NULL);
  891 + prune_dcache(nr);
867 892 }
868 893 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
869 894 }
... ... @@ -1215,7 +1240,7 @@
1215 1240 * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
1216 1241 * lookup is going on.
1217 1242 *
1218   - * dentry_unused list is not updated even if lookup finds the required dentry
  1243 + * The dentry unused LRU is not updated even if lookup finds the required dentry
1219 1244 * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
1220 1245 * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
1221 1246 * acquisition.
... ... @@ -70,6 +70,7 @@
70 70 INIT_LIST_HEAD(&s->s_instances);
71 71 INIT_HLIST_HEAD(&s->s_anon);
72 72 INIT_LIST_HEAD(&s->s_inodes);
  73 + INIT_LIST_HEAD(&s->s_dentry_lru);
73 74 init_rwsem(&s->s_umount);
74 75 mutex_init(&s->s_lock);
75 76 lockdep_set_class(&s->s_umount, &type->s_umount_key);
... ... @@ -1025,6 +1025,7 @@
1025 1025 extern struct list_head super_blocks;
1026 1026 extern spinlock_t sb_lock;
1027 1027  
  1028 +#define sb_entry(list) list_entry((list), struct super_block, s_list)
1028 1029 #define S_BIAS (1<<30)
1029 1030 struct super_block {
1030 1031 struct list_head s_list; /* Keep this first */
... ... @@ -1058,6 +1059,9 @@
1058 1059 struct list_head s_more_io; /* parked for more writeback */
1059 1060 struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
1060 1061 struct list_head s_files;
  1062 + /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
  1063 + struct list_head s_dentry_lru; /* unused dentry lru */
  1064 + int s_nr_dentry_unused; /* # of dentry on lru */
1061 1065  
1062 1066 struct block_device *s_bdev;
1063 1067 struct mtd_info *s_mtd;