Commit b0d40c92adafde7c2d81203ce7c1c69275f41140

Authored by Dave Chinner
Committed by Al Viro
1 parent 12ad3ab661

superblock: introduce per-sb cache shrinker infrastructure

With context based shrinkers, we can implement a per-superblock
shrinker that shrinks the caches attached to the superblock. We
currently have global shrinkers for the inode and dentry caches that
split their work up into per-superblock operations via a coarse
proportioning method that does not batch very well.  The global
shrinkers also
have a dependency - dentries pin inodes - so we have to be very
careful about how we register the global shrinkers so that the
implicit call order is always correct.

With a per-sb shrinker callout, we can encode this dependency
directly into the per-sb shrinker, hence avoiding the need for
strictly ordering shrinker registrations. We also have no need for
any proportioning code, as the shrinker subsystem already provides
this functionality across all shrinkers. Allowing the shrinker to
operate on a single superblock at a time means that we do fewer
superblock list traversals and less locking, and reclaim should batch
more effectively. This should result in less CPU overhead for reclaim
and potentially faster reclaim of items from each filesystem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
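
As a concrete, hypothetical illustration of the proportioning described
above, the following user-space sketch mirrors the arithmetic of the new
prune_super() callback in the diff below; the sample numbers are made up
for illustration only.

/*
 * Illustrative user-space sketch (not kernel code) of how prune_super()
 * splits one scan request between the dentry and inode caches.
 */
#include <stdio.h>

int main(void)
{
	int nr_to_scan = 128;		/* scan count handed in by the shrinker core */
	int nr_dentry_unused = 300;	/* sb->s_nr_dentry_unused */
	int nr_inodes_unused = 100;	/* sb->s_nr_inodes_unused */

	/* proportion the scan between the two caches; +1 avoids division by zero */
	int total = nr_dentry_unused + nr_inodes_unused + 1;
	int dentry_count = (nr_to_scan * nr_dentry_unused) / total;

	/* the dcache is pruned first because dentries pin inodes */
	printf("prune %d dentries, then %d inodes\n",
	       dentry_count, nr_to_scan - dentry_count);
	return 0;
}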

Showing 6 changed files with 121 additions and 257 deletions

fs/dcache.c
... ... @@ -743,13 +743,11 @@
743 743 *
744 744 * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
745 745 */
746   -static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
  746 +static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
747 747 {
748   - /* called from prune_dcache() and shrink_dcache_parent() */
749 748 struct dentry *dentry;
750 749 LIST_HEAD(referenced);
751 750 LIST_HEAD(tmp);
752   - int cnt = *count;
753 751  
754 752 relock:
755 753 spin_lock(&dcache_lru_lock);
... ... @@ -777,7 +775,7 @@
777 775 } else {
778 776 list_move_tail(&dentry->d_lru, &tmp);
779 777 spin_unlock(&dentry->d_lock);
780   - if (!--cnt)
  778 + if (!--count)
781 779 break;
782 780 }
783 781 cond_resched_lock(&dcache_lru_lock);
... ... @@ -787,83 +785,22 @@
787 785 spin_unlock(&dcache_lru_lock);
788 786  
789 787 shrink_dentry_list(&tmp);
790   -
791   - *count = cnt;
792 788 }
793 789  
794 790 /**
795   - * prune_dcache - shrink the dcache
796   - * @count: number of entries to try to free
  791 + * prune_dcache_sb - shrink the dcache
  792 + * @nr_to_scan: number of entries to try to free
797 793 *
798   - * Shrink the dcache. This is done when we need more memory, or simply when we
799   - * need to unmount something (at which point we need to unuse all dentries).
  794 + * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
  795 + * done when we need more memory and is called from the superblock shrinker
  796 + * function.
800 797 *
801   - * This function may fail to free any resources if all the dentries are in use.
  798 + * This function may fail to free any resources if all the dentries are in
  799 + * use.
802 800 */
803   -static void prune_dcache(int count)
  801 +void prune_dcache_sb(struct super_block *sb, int nr_to_scan)
804 802 {
805   - struct super_block *sb, *p = NULL;
806   - int w_count;
807   - int unused = dentry_stat.nr_unused;
808   - int prune_ratio;
809   - int pruned;
810   -
811   - if (unused == 0 || count == 0)
812   - return;
813   - if (count >= unused)
814   - prune_ratio = 1;
815   - else
816   - prune_ratio = unused / count;
817   - spin_lock(&sb_lock);
818   - list_for_each_entry(sb, &super_blocks, s_list) {
819   - if (list_empty(&sb->s_instances))
820   - continue;
821   - if (sb->s_nr_dentry_unused == 0)
822   - continue;
823   - sb->s_count++;
824   - /* Now, we reclaim unused dentrins with fairness.
825   - * We reclaim them same percentage from each superblock.
826   - * We calculate number of dentries to scan on this sb
827   - * as follows, but the implementation is arranged to avoid
828   - * overflows:
829   - * number of dentries to scan on this sb =
830   - * count * (number of dentries on this sb /
831   - * number of dentries in the machine)
832   - */
833   - spin_unlock(&sb_lock);
834   - if (prune_ratio != 1)
835   - w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
836   - else
837   - w_count = sb->s_nr_dentry_unused;
838   - pruned = w_count;
839   - /*
840   - * We need to be sure this filesystem isn't being unmounted,
841   - * otherwise we could race with generic_shutdown_super(), and
842   - * end up holding a reference to an inode while the filesystem
843   - * is unmounted. So we try to get s_umount, and make sure
844   - * s_root isn't NULL.
845   - */
846   - if (down_read_trylock(&sb->s_umount)) {
847   - if ((sb->s_root != NULL) &&
848   - (!list_empty(&sb->s_dentry_lru))) {
849   - __shrink_dcache_sb(sb, &w_count,
850   - DCACHE_REFERENCED);
851   - pruned -= w_count;
852   - }
853   - up_read(&sb->s_umount);
854   - }
855   - spin_lock(&sb_lock);
856   - if (p)
857   - __put_super(p);
858   - count -= pruned;
859   - p = sb;
860   - /* more work left to do? */
861   - if (count <= 0)
862   - break;
863   - }
864   - if (p)
865   - __put_super(p);
866   - spin_unlock(&sb_lock);
  803 + __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED);
867 804 }
868 805  
869 806 /**
... ... @@ -1238,42 +1175,10 @@
1238 1175 int found;
1239 1176  
1240 1177 while ((found = select_parent(parent)) != 0)
1241   - __shrink_dcache_sb(sb, &found, 0);
  1178 + __shrink_dcache_sb(sb, found, 0);
1242 1179 }
1243 1180 EXPORT_SYMBOL(shrink_dcache_parent);
1244 1181  
1245   -/*
1246   - * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain.
1247   - *
1248   - * We need to avoid reentering the filesystem if the caller is performing a
1249   - * GFP_NOFS allocation attempt. One example deadlock is:
1250   - *
1251   - * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
1252   - * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode->
1253   - * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK.
1254   - *
1255   - * In this case we return -1 to tell the caller that we baled.
1256   - */
1257   -static int shrink_dcache_memory(struct shrinker *shrink,
1258   - struct shrink_control *sc)
1259   -{
1260   - int nr = sc->nr_to_scan;
1261   - gfp_t gfp_mask = sc->gfp_mask;
1262   -
1263   - if (nr) {
1264   - if (!(gfp_mask & __GFP_FS))
1265   - return -1;
1266   - prune_dcache(nr);
1267   - }
1268   -
1269   - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
1270   -}
1271   -
1272   -static struct shrinker dcache_shrinker = {
1273   - .shrink = shrink_dcache_memory,
1274   - .seeks = DEFAULT_SEEKS,
1275   -};
1276   -
1277 1182 /**
1278 1183 * __d_alloc - allocate a dcache entry
1279 1184 * @sb: filesystem it will belong to
... ... @@ -3083,8 +2988,6 @@
3083 2988 */
3084 2989 dentry_cache = KMEM_CACHE(dentry,
3085 2990 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
3086   -
3087   - register_shrinker(&dcache_shrinker);
3088 2991  
3089 2992 /* Hash may have been set up in dcache_init_early */
3090 2993 if (!hashdist)
fs/inode.c
... ... @@ -73,7 +73,7 @@
73 73 *
74 74 * We don't actually need it to protect anything in the umount path,
75 75 * but only need to cycle through it to make sure any inode that
76   - * prune_icache took off the LRU list has been fully torn down by the
  76 + * prune_icache_sb took off the LRU list has been fully torn down by the
77 77 * time we are past evict_inodes.
78 78 */
79 79 static DECLARE_RWSEM(iprune_sem);
... ... @@ -544,7 +544,7 @@
544 544 dispose_list(&dispose);
545 545  
546 546 /*
547   - * Cycle through iprune_sem to make sure any inode that prune_icache
  547 + * Cycle through iprune_sem to make sure any inode that prune_icache_sb
548 548 * moved off the list before we took the lock has been fully torn
549 549 * down.
550 550 */
... ... @@ -612,9 +612,10 @@
612 612 }
613 613  
614 614 /*
615   - * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
616   - * temporary list and then are freed outside sb->s_inode_lru_lock by
617   - * dispose_list().
  615 + * Walk the superblock inode LRU for freeable inodes and attempt to free them.
  616 + * This is called from the superblock shrinker function with a number of inodes
  617 + * to trim from the LRU. Inodes to be freed are moved to a temporary list and
  618 + * then are freed outside inode_lock by dispose_list().
618 619 *
619 620 * Any inodes which are pinned purely because of attached pagecache have their
620 621 * pagecache removed. If the inode has metadata buffers attached to
... ... @@ -628,14 +629,15 @@
628 629 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
629 630 * with this flag set because they are the inodes that are out of order.
630 631 */
631   -static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
  632 +void prune_icache_sb(struct super_block *sb, int nr_to_scan)
632 633 {
633 634 LIST_HEAD(freeable);
634 635 int nr_scanned;
635 636 unsigned long reap = 0;
636 637  
  638 + down_read(&iprune_sem);
637 639 spin_lock(&sb->s_inode_lru_lock);
638   - for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) {
  640 + for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
639 641 struct inode *inode;
640 642  
641 643 if (list_empty(&sb->s_inode_lru))
... ... @@ -707,111 +709,11 @@
707 709 else
708 710 __count_vm_events(PGINODESTEAL, reap);
709 711 spin_unlock(&sb->s_inode_lru_lock);
710   - *nr_to_scan = nr_scanned;
711 712  
712 713 dispose_list(&freeable);
713   -}
714   -
715   -static void prune_icache(int count)
716   -{
717   - struct super_block *sb, *p = NULL;
718   - int w_count;
719   - int unused = inodes_stat.nr_unused;
720   - int prune_ratio;
721   - int pruned;
722   -
723   - if (unused == 0 || count == 0)
724   - return;
725   - down_read(&iprune_sem);
726   - if (count >= unused)
727   - prune_ratio = 1;
728   - else
729   - prune_ratio = unused / count;
730   - spin_lock(&sb_lock);
731   - list_for_each_entry(sb, &super_blocks, s_list) {
732   - if (list_empty(&sb->s_instances))
733   - continue;
734   - if (sb->s_nr_inodes_unused == 0)
735   - continue;
736   - sb->s_count++;
737   - /* Now, we reclaim unused dentrins with fairness.
738   - * We reclaim them same percentage from each superblock.
739   - * We calculate number of dentries to scan on this sb
740   - * as follows, but the implementation is arranged to avoid
741   - * overflows:
742   - * number of dentries to scan on this sb =
743   - * count * (number of dentries on this sb /
744   - * number of dentries in the machine)
745   - */
746   - spin_unlock(&sb_lock);
747   - if (prune_ratio != 1)
748   - w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1;
749   - else
750   - w_count = sb->s_nr_inodes_unused;
751   - pruned = w_count;
752   - /*
753   - * We need to be sure this filesystem isn't being unmounted,
754   - * otherwise we could race with generic_shutdown_super(), and
755   - * end up holding a reference to an inode while the filesystem
756   - * is unmounted. So we try to get s_umount, and make sure
757   - * s_root isn't NULL.
758   - */
759   - if (down_read_trylock(&sb->s_umount)) {
760   - if ((sb->s_root != NULL) &&
761   - (!list_empty(&sb->s_dentry_lru))) {
762   - shrink_icache_sb(sb, &w_count);
763   - pruned -= w_count;
764   - }
765   - up_read(&sb->s_umount);
766   - }
767   - spin_lock(&sb_lock);
768   - if (p)
769   - __put_super(p);
770   - count -= pruned;
771   - p = sb;
772   - /* more work left to do? */
773   - if (count <= 0)
774   - break;
775   - }
776   - if (p)
777   - __put_super(p);
778   - spin_unlock(&sb_lock);
779 714 up_read(&iprune_sem);
780 715 }
781 716  
782   -/*
783   - * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
784   - * "unused" means that no dentries are referring to the inodes: the files are
785   - * not open and the dcache references to those inodes have already been
786   - * reclaimed.
787   - *
788   - * This function is passed the number of inodes to scan, and it returns the
789   - * total number of remaining possibly-reclaimable inodes.
790   - */
791   -static int shrink_icache_memory(struct shrinker *shrink,
792   - struct shrink_control *sc)
793   -{
794   - int nr = sc->nr_to_scan;
795   - gfp_t gfp_mask = sc->gfp_mask;
796   -
797   - if (nr) {
798   - /*
799   - * Nasty deadlock avoidance. We may hold various FS locks,
800   - * and we don't want to recurse into the FS that called us
801   - * in clear_inode() and friends..
802   - */
803   - if (!(gfp_mask & __GFP_FS))
804   - return -1;
805   - prune_icache(nr);
806   - }
807   - return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
808   -}
809   -
810   -static struct shrinker icache_shrinker = {
811   - .shrink = shrink_icache_memory,
812   - .seeks = DEFAULT_SEEKS,
813   -};
814   -
815 717 static void __wait_on_freeing_inode(struct inode *inode);
816 718 /*
817 719 * Called with the inode lock held.
... ... @@ -1691,7 +1593,6 @@
1691 1593 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
1692 1594 SLAB_MEM_SPREAD),
1693 1595 init_once);
1694   - register_shrinker(&icache_shrinker);
1695 1596  
1696 1597 /* Hash may have been set up in inode_init_early */
1697 1598 if (!hashdist)
fs/super.c
... ... @@ -38,6 +38,48 @@
38 38 LIST_HEAD(super_blocks);
39 39 DEFINE_SPINLOCK(sb_lock);
40 40  
  41 +/*
  42 + * One thing we have to be careful of with a per-sb shrinker is that we don't
  43 + * drop the last active reference to the superblock from within the shrinker.
  44 + * If that happens we could trigger unregistering the shrinker from within the
  45 + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  46 + * take a passive reference to the superblock to avoid this from occurring.
  47 + */
  48 +static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
  49 +{
  50 + struct super_block *sb;
  51 + int count;
  52 +
  53 + sb = container_of(shrink, struct super_block, s_shrink);
  54 +
  55 + /*
  56 + * Deadlock avoidance. We may hold various FS locks, and we don't want
  57 + * to recurse into the FS that called us in clear_inode() and friends..
  58 + */
  59 + if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
  60 + return -1;
  61 +
  62 + if (!grab_super_passive(sb))
  63 + return -1;
  64 +
  65 + if (sc->nr_to_scan) {
  66 + /* proportion the scan between the two caches */
  67 + int total;
  68 +
  69 + total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;
  70 + count = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total;
  71 +
  72 + /* prune dcache first as icache is pinned by it */
  73 + prune_dcache_sb(sb, count);
  74 + prune_icache_sb(sb, sc->nr_to_scan - count);
  75 + }
  76 +
  77 + count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100)
  78 + * sysctl_vfs_cache_pressure;
  79 + drop_super(sb);
  80 + return count;
  81 +}
  82 +
41 83 /**
42 84 * alloc_super - create new superblock
43 85 * @type: filesystem type superblock should belong to
... ... @@ -116,6 +158,9 @@
116 158 s->s_op = &default_op;
117 159 s->s_time_gran = 1000000000;
118 160 s->cleancache_poolid = -1;
  161 +
  162 + s->s_shrink.seeks = DEFAULT_SEEKS;
  163 + s->s_shrink.shrink = prune_super;
119 164 }
120 165 out:
121 166 return s;
... ... @@ -183,6 +228,10 @@
183 228 if (atomic_dec_and_test(&s->s_active)) {
184 229 cleancache_flush_fs(s);
185 230 fs->kill_sb(s);
  231 +
  232 + /* caches are now gone, we can safely kill the shrinker now */
  233 + unregister_shrinker(&s->s_shrink);
  234 +
186 235 /*
187 236 * We need to call rcu_barrier so all the delayed rcu free
188 237 * inodes are flushed before we release the fs module.
... ... @@ -311,7 +360,6 @@
311 360 {
312 361 const struct super_operations *sop = sb->s_op;
313 362  
314   -
315 363 if (sb->s_root) {
316 364 shrink_dcache_for_umount(sb);
317 365 sync_filesystem(sb);
... ... @@ -399,6 +447,7 @@
399 447 list_add(&s->s_instances, &type->fs_supers);
400 448 spin_unlock(&sb_lock);
401 449 get_filesystem(type);
  450 + register_shrinker(&s->s_shrink);
402 451 return s;
403 452 }
404 453  
include/linux/fs.h
... ... @@ -393,6 +393,7 @@
393 393 #include <linux/semaphore.h>
394 394 #include <linux/fiemap.h>
395 395 #include <linux/rculist_bl.h>
  396 +#include <linux/shrinker.h>
396 397 #include <linux/atomic.h>
397 398  
398 399 #include <asm/byteorder.h>
... ... @@ -1444,7 +1445,13 @@
1444 1445 * Saved pool identifier for cleancache (-1 means none)
1445 1446 */
1446 1447 int cleancache_poolid;
  1448 +
  1449 + struct shrinker s_shrink; /* per-sb shrinker handle */
1447 1450 };
  1451 +
  1452 +/* superblock cache pruning functions */
  1453 +extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
  1454 +extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
1448 1455  
1449 1456 extern struct timespec current_fs_time(struct super_block *sb);
1450 1457  
include/linux/mm.h
... ... @@ -15,6 +15,7 @@
15 15 #include <linux/range.h>
16 16 #include <linux/pfn.h>
17 17 #include <linux/bit_spinlock.h>
  18 +#include <linux/shrinker.h>
18 19  
19 20 struct mempolicy;
20 21 struct anon_vma;
... ... @@ -1120,45 +1121,6 @@
1120 1121 {
1121 1122 }
1122 1123 #endif
1123   -
1124   -/*
1125   - * This struct is used to pass information from page reclaim to the shrinkers.
1126   - * We consolidate the values for easier extention later.
1127   - */
1128   -struct shrink_control {
1129   - gfp_t gfp_mask;
1130   -
1131   - /* How many slab objects shrinker() should scan and try to reclaim */
1132   - unsigned long nr_to_scan;
1133   -};
1134   -
1135   -/*
1136   - * A callback you can register to apply pressure to ageable caches.
1137   - *
1138   - * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
1139   - * and a 'gfpmask'. It should look through the least-recently-used
1140   - * 'nr_to_scan' entries and attempt to free them up. It should return
1141   - * the number of objects which remain in the cache. If it returns -1, it means
1142   - * it cannot do any scanning at this time (eg. there is a risk of deadlock).
1143   - *
1144   - * The 'gfpmask' refers to the allocation we are currently trying to
1145   - * fulfil.
1146   - *
1147   - * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
1148   - * querying the cache size, so a fastpath for that case is appropriate.
1149   - */
1150   -struct shrinker {
1151   - int (*shrink)(struct shrinker *, struct shrink_control *sc);
1152   - int seeks; /* seeks to recreate an obj */
1153   - long batch; /* reclaim batch size, 0 = default */
1154   -
1155   - /* These are for internal use */
1156   - struct list_head list;
1157   - long nr; /* objs pending delete */
1158   -};
1159   -#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
1160   -extern void register_shrinker(struct shrinker *);
1161   -extern void unregister_shrinker(struct shrinker *);
1162 1124  
1163 1125 int vma_wants_writenotify(struct vm_area_struct *vma);
1164 1126  
include/linux/shrinker.h
  1 +#ifndef _LINUX_SHRINKER_H
  2 +#define _LINUX_SHRINKER_H
  3 +
  4 +/*
  5 + * This struct is used to pass information from page reclaim to the shrinkers.
   6 + * We consolidate the values for easier extension later.
  7 + */
  8 +struct shrink_control {
  9 + gfp_t gfp_mask;
  10 +
  11 + /* How many slab objects shrinker() should scan and try to reclaim */
  12 + unsigned long nr_to_scan;
  13 +};
  14 +
  15 +/*
  16 + * A callback you can register to apply pressure to ageable caches.
  17 + *
  18 + * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
  19 + * and a 'gfpmask'. It should look through the least-recently-used
  20 + * 'nr_to_scan' entries and attempt to free them up. It should return
  21 + * the number of objects which remain in the cache. If it returns -1, it means
  22 + * it cannot do any scanning at this time (eg. there is a risk of deadlock).
  23 + *
  24 + * The 'gfpmask' refers to the allocation we are currently trying to
  25 + * fulfil.
  26 + *
  27 + * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
  28 + * querying the cache size, so a fastpath for that case is appropriate.
  29 + */
  30 +struct shrinker {
  31 + int (*shrink)(struct shrinker *, struct shrink_control *sc);
  32 + int seeks; /* seeks to recreate an obj */
  33 + long batch; /* reclaim batch size, 0 = default */
  34 +
  35 + /* These are for internal use */
  36 + struct list_head list;
  37 + long nr; /* objs pending delete */
  38 +};
  39 +#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
  40 +extern void register_shrinker(struct shrinker *);
  41 +extern void unregister_shrinker(struct shrinker *);
  42 +#endif
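
For readers coming to this interface for the first time, here is a minimal,
hypothetical module built against the header above.  Only struct shrinker,
struct shrink_control, DEFAULT_SEEKS, register_shrinker()/unregister_shrinker()
and the callback contract (return -1 when scanning is unsafe, report the
cache size when nr_to_scan is 0) come from this commit; the demo module, its
names and its "cache" counter are invented for illustration.

/*
 * Hypothetical example shrinker: the "cache" is just a counter standing in
 * for reclaimable objects.
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/shrinker.h>

static int demo_cache_objects = 1000;	/* pretend objects we could reclaim */

static int demo_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	if (sc->nr_to_scan) {
		/* don't recurse into a filesystem from a GFP_NOFS allocation */
		if (!(sc->gfp_mask & __GFP_FS))
			return -1;
		if (demo_cache_objects > (int)sc->nr_to_scan)
			demo_cache_objects -= sc->nr_to_scan;
		else
			demo_cache_objects = 0;
	}
	/* nr_to_scan == 0 is a size query, so just report what remains */
	return demo_cache_objects;
}

static struct shrinker demo_shrinker = {
	.shrink	= demo_shrink,
	.seeks	= DEFAULT_SEEKS,
};

static int __init demo_init(void)
{
	register_shrinker(&demo_shrinker);
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_shrinker(&demo_shrinker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");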