Commit b0d40c92adafde7c2d81203ce7c1c69275f41140

Authored by Dave Chinner
Committed by Al Viro
1 parent 12ad3ab661

superblock: introduce per-sb cache shrinker infrastructure

With context based shrinkers, we can implement a per-superblock
shrinker that shrinks the caches attached to the superblock. We
currently have global shrinkers for the inode and dentry caches that
split their work up into per-superblock operations via a coarse
proportioning method that does not batch very well.  The global
shrinkers also
have a dependency - dentries pin inodes - so we have to be very
careful about how we register the global shrinkers so that the
implicit call order is always correct.

With a per-sb shrinker callout, we can encode this dependency
directly into the per-sb shrinker, hence avoiding the need for
strictly ordering shrinker registrations. We also have no need for
any proportioning code, as the shrinker subsystem already provides
this functionality across all shrinkers. Allowing the shrinker to
operate on a single superblock at a time means that we do fewer
superblock list traversals and less locking, and reclaim should batch
more effectively. This should result in less CPU overhead for reclaim
and potentially faster reclaim of items from each filesystem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
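
As a concrete, hypothetical illustration of the proportioning described
above, the following user-space sketch mirrors the arithmetic of the new
prune_super() callback in the diff below; the sample numbers are made up
for illustration only.

/*
 * Illustrative user-space sketch (not kernel code) of how prune_super()
 * splits one scan request between the dentry and inode caches.
 */
#include <stdio.h>

int main(void)
{
	int nr_to_scan = 128;		/* scan count handed in by the shrinker core */
	int nr_dentry_unused = 300;	/* sb->s_nr_dentry_unused */
	int nr_inodes_unused = 100;	/* sb->s_nr_inodes_unused */

	/* proportion the scan between the two caches; +1 avoids division by zero */
	int total = nr_dentry_unused + nr_inodes_unused + 1;
	int dentry_count = (nr_to_scan * nr_dentry_unused) / total;

	/* the dcache is pruned first because dentries pin inodes */
	printf("prune %d dentries, then %d inodes\n",
	       dentry_count, nr_to_scan - dentry_count);
	return 0;
}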

Showing 6 changed files with 121 additions and 257 deletions

fs/dcache.c
... ... @@ -743,13 +743,11 @@
743 743 *
744 744 * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
745 745 */
746   -static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
  746 +static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
747 747 {
748   - /* called from prune_dcache() and shrink_dcache_parent() */
749 748 struct dentry *dentry;
750 749 LIST_HEAD(referenced);
751 750 LIST_HEAD(tmp);
752   - int cnt = *count;
753 751  
754 752 relock:
755 753 spin_lock(&dcache_lru_lock);
... ... @@ -777,7 +775,7 @@
777 775 } else {
778 776 list_move_tail(&dentry->d_lru, &tmp);
779 777 spin_unlock(&dentry->d_lock);
780   - if (!--cnt)
  778 + if (!--count)
781 779 break;
782 780 }
783 781 cond_resched_lock(&dcache_lru_lock);
... ... @@ -787,83 +785,22 @@
787 785 spin_unlock(&dcache_lru_lock);
788 786  
789 787 shrink_dentry_list(&tmp);
790   -
791   - *count = cnt;
792 788 }
793 789  
794 790 /**
795   - * prune_dcache - shrink the dcache
796   - * @count: number of entries to try to free
  791 + * prune_dcache_sb - shrink the dcache
  792 + * @nr_to_scan: number of entries to try to free
797 793 *
798   - * Shrink the dcache. This is done when we need more memory, or simply when we
799   - * need to unmount something (at which point we need to unuse all dentries).
  794 + * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
  795 + * done when we need more memory and is called from the superblock shrinker
  796 + * function.
800 797 *
801   - * This function may fail to free any resources if all the dentries are in use.
  798 + * This function may fail to free any resources if all the dentries are in
  799 + * use.
802 800 */
803   -static void prune_dcache(int count)
  801 +void prune_dcache_sb(struct super_block *sb, int nr_to_scan)
804 802 {
805   - struct super_block *sb, *p = NULL;
806   - int w_count;
807   - int unused = dentry_stat.nr_unused;
808   - int prune_ratio;
809   - int pruned;
810   -
811   - if (unused == 0 || count == 0)
812   - return;
813   - if (count >= unused)
814   - prune_ratio = 1;
815   - else
816   - prune_ratio = unused / count;
817   - spin_lock(&sb_lock);
818   - list_for_each_entry(sb, &super_blocks, s_list) {
819   - if (list_empty(&sb->s_instances))
820   - continue;
821   - if (sb->s_nr_dentry_unused == 0)
822   - continue;
823   - sb->s_count++;
824   - /* Now, we reclaim unused dentrins with fairness.
825   - * We reclaim them same percentage from each superblock.
826   - * We calculate number of dentries to scan on this sb
827   - * as follows, but the implementation is arranged to avoid
828   - * overflows:
829   - * number of dentries to scan on this sb =
830   - * count * (number of dentries on this sb /
831   - * number of dentries in the machine)
832   - */
833   - spin_unlock(&sb_lock);
834   - if (prune_ratio != 1)
835   - w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
836   - else
837   - w_count = sb->s_nr_dentry_unused;
838   - pruned = w_count;
839   - /*
840   - * We need to be sure this filesystem isn't being unmounted,
841   - * otherwise we could race with generic_shutdown_super(), and
842   - * end up holding a reference to an inode while the filesystem
843   - * is unmounted. So we try to get s_umount, and make sure
844   - * s_root isn't NULL.
845   - */
846   - if (down_read_trylock(&sb->s_umount)) {
847   - if ((sb->s_root != NULL) &&
848   - (!list_empty(&sb->s_dentry_lru))) {
849   - __shrink_dcache_sb(sb, &w_count,
850   - DCACHE_REFERENCED);
851   - pruned -= w_count;
852   - }
853   - up_read(&sb->s_umount);
854   - }
855   - spin_lock(&sb_lock);
856   - if (p)
857   - __put_super(p);
858   - count -= pruned;
859   - p = sb;
860   - /* more work left to do? */
861   - if (count <= 0)
862   - break;
863   - }
864   - if (p)
865   - __put_super(p);
866   - spin_unlock(&sb_lock);
  803 + __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED);
867 804 }
868 805  
869 806 /**
... ... @@ -1238,42 +1175,10 @@
1238 1175 int found;
1239 1176  
1240 1177 while ((found = select_parent(parent)) != 0)
1241   - __shrink_dcache_sb(sb, &found, 0);
  1178 + __shrink_dcache_sb(sb, found, 0);
1242 1179 }
1243 1180 EXPORT_SYMBOL(shrink_dcache_parent);
1244 1181  
1245   -/*
1246   - * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain.
1247   - *
1248   - * We need to avoid reentering the filesystem if the caller is performing a
1249   - * GFP_NOFS allocation attempt. One example deadlock is:
1250   - *
1251   - * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
1252   - * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode->
1253   - * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK.
1254   - *
1255   - * In this case we return -1 to tell the caller that we baled.
1256   - */
1257   -static int shrink_dcache_memory(struct shrinker *shrink,
1258   - struct shrink_control *sc)
1259   -{
1260   - int nr = sc->nr_to_scan;
1261   - gfp_t gfp_mask = sc->gfp_mask;
1262   -
1263   - if (nr) {
1264   - if (!(gfp_mask & __GFP_FS))
1265   - return -1;
1266   - prune_dcache(nr);
1267   - }
1268   -
1269   - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
1270   -}
1271   -
1272   -static struct shrinker dcache_shrinker = {
1273   - .shrink = shrink_dcache_memory,
1274   - .seeks = DEFAULT_SEEKS,
1275   -};
1276   -
1277 1182 /**
1278 1183 * __d_alloc - allocate a dcache entry
1279 1184 * @sb: filesystem it will belong to
... ... @@ -3083,8 +2988,6 @@
3083 2988 */
3084 2989 dentry_cache = KMEM_CACHE(dentry,
3085 2990 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
3086   -
3087   - register_shrinker(&dcache_shrinker);
3088 2991  
3089 2992 /* Hash may have been set up in dcache_init_early */
3090 2993 if (!hashdist)
fs/inode.c
... ... @@ -73,7 +73,7 @@
73 73 *
74 74 * We don't actually need it to protect anything in the umount path,
75 75 * but only need to cycle through it to make sure any inode that
76   - * prune_icache took off the LRU list has been fully torn down by the
  76 + * prune_icache_sb took off the LRU list has been fully torn down by the
77 77 * time we are past evict_inodes.
78 78 */
79 79 static DECLARE_RWSEM(iprune_sem);
... ... @@ -544,7 +544,7 @@
544 544 dispose_list(&dispose);
545 545  
546 546 /*
547   - * Cycle through iprune_sem to make sure any inode that prune_icache
  547 + * Cycle through iprune_sem to make sure any inode that prune_icache_sb
548 548 * moved off the list before we took the lock has been fully torn
549 549 * down.
550 550 */
... ... @@ -612,9 +612,10 @@
612 612 }
613 613  
614 614 /*
615   - * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
616   - * temporary list and then are freed outside sb->s_inode_lru_lock by
617   - * dispose_list().
  615 + * Walk the superblock inode LRU for freeable inodes and attempt to free them.
  616 + * This is called from the superblock shrinker function with a number of inodes
  617 + * to trim from the LRU. Inodes to be freed are moved to a temporary list and
  618 + * then are freed outside inode_lock by dispose_list().
618 619 *
619 620 * Any inodes which are pinned purely because of attached pagecache have their
620 621 * pagecache removed. If the inode has metadata buffers attached to
... ... @@ -628,14 +629,15 @@
628 629 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
629 630 * with this flag set because they are the inodes that are out of order.
630 631 */
631   -static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
  632 +void prune_icache_sb(struct super_block *sb, int nr_to_scan)
632 633 {
633 634 LIST_HEAD(freeable);
634 635 int nr_scanned;
635 636 unsigned long reap = 0;
636 637  
  638 + down_read(&iprune_sem);
637 639 spin_lock(&sb->s_inode_lru_lock);
638   - for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) {
  640 + for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
639 641 struct inode *inode;
640 642  
641 643 if (list_empty(&sb->s_inode_lru))
... ... @@ -707,111 +709,11 @@
707 709 else
708 710 __count_vm_events(PGINODESTEAL, reap);
709 711 spin_unlock(&sb->s_inode_lru_lock);
710   - *nr_to_scan = nr_scanned;
711 712  
712 713 dispose_list(&freeable);
713   -}
714   -
715   -static void prune_icache(int count)
716   -{
717   - struct super_block *sb, *p = NULL;
718   - int w_count;
719   - int unused = inodes_stat.nr_unused;
720   - int prune_ratio;
721   - int pruned;
722   -
723   - if (unused == 0 || count == 0)
724   - return;
725   - down_read(&iprune_sem);
726   - if (count >= unused)
727   - prune_ratio = 1;
728   - else
729   - prune_ratio = unused / count;
730   - spin_lock(&sb_lock);
731   - list_for_each_entry(sb, &super_blocks, s_list) {
732   - if (list_empty(&sb->s_instances))
733   - continue;
734   - if (sb->s_nr_inodes_unused == 0)
735   - continue;
736   - sb->s_count++;
737   - /* Now, we reclaim unused dentrins with fairness.
738   - * We reclaim them same percentage from each superblock.
739   - * We calculate number of dentries to scan on this sb
740   - * as follows, but the implementation is arranged to avoid
741   - * overflows:
742   - * number of dentries to scan on this sb =
743   - * count * (number of dentries on this sb /
744   - * number of dentries in the machine)
745   - */
746   - spin_unlock(&sb_lock);
747   - if (prune_ratio != 1)
748   - w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1;
749   - else
750   - w_count = sb->s_nr_inodes_unused;
751   - pruned = w_count;
752   - /*
753   - * We need to be sure this filesystem isn't being unmounted,
754   - * otherwise we could race with generic_shutdown_super(), and
755   - * end up holding a reference to an inode while the filesystem
756   - * is unmounted. So we try to get s_umount, and make sure
757   - * s_root isn't NULL.
758   - */
759   - if (down_read_trylock(&sb->s_umount)) {
760   - if ((sb->s_root != NULL) &&
761   - (!list_empty(&sb->s_dentry_lru))) {
762   - shrink_icache_sb(sb, &w_count);
763   - pruned -= w_count;
764   - }
765   - up_read(&sb->s_umount);
766   - }
767   - spin_lock(&sb_lock);
768   - if (p)
769   - __put_super(p);
770   - count -= pruned;
771   - p = sb;
772   - /* more work left to do? */
773   - if (count <= 0)
774   - break;
775   - }
776   - if (p)
777   - __put_super(p);
778   - spin_unlock(&sb_lock);
779 714 up_read(&iprune_sem);
780 715 }
781 716  
782   -/*
783   - * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
784   - * "unused" means that no dentries are referring to the inodes: the files are
785   - * not open and the dcache references to those inodes have already been
786   - * reclaimed.
787   - *
788   - * This function is passed the number of inodes to scan, and it returns the
789   - * total number of remaining possibly-reclaimable inodes.
790   - */
791   -static int shrink_icache_memory(struct shrinker *shrink,
792   - struct shrink_control *sc)
793   -{
794   - int nr = sc->nr_to_scan;
795   - gfp_t gfp_mask = sc->gfp_mask;
796   -
797   - if (nr) {
798   - /*
799   - * Nasty deadlock avoidance. We may hold various FS locks,
800   - * and we don't want to recurse into the FS that called us
801   - * in clear_inode() and friends..
802   - */
803   - if (!(gfp_mask & __GFP_FS))
804   - return -1;
805   - prune_icache(nr);
806   - }
807   - return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
808   -}
809   -
810   -static struct shrinker icache_shrinker = {
811   - .shrink = shrink_icache_memory,
812   - .seeks = DEFAULT_SEEKS,
813   -};
814   -
815 717 static void __wait_on_freeing_inode(struct inode *inode);
816 718 /*
817 719 * Called with the inode lock held.
... ... @@ -1691,7 +1593,6 @@
1691 1593 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
1692 1594 SLAB_MEM_SPREAD),
1693 1595 init_once);
1694   - register_shrinker(&icache_shrinker);
1695 1596  
1696 1597 /* Hash may have been set up in inode_init_early */
1697 1598 if (!hashdist)
fs/super.c
... ... @@ -38,6 +38,48 @@
38 38 LIST_HEAD(super_blocks);
39 39 DEFINE_SPINLOCK(sb_lock);
40 40  
  41 +/*
  42 + * One thing we have to be careful of with a per-sb shrinker is that we don't
  43 + * drop the last active reference to the superblock from within the shrinker.
  44 + * If that happens we could trigger unregistering the shrinker from within the
  45 + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  46 + * take a passive reference to the superblock to avoid this from occurring.
  47 + */
  48 +static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
  49 +{
  50 + struct super_block *sb;
  51 + int count;
  52 +
  53 + sb = container_of(shrink, struct super_block, s_shrink);
  54 +
  55 + /*
  56 + * Deadlock avoidance. We may hold various FS locks, and we don't want
  57 + * to recurse into the FS that called us in clear_inode() and friends..
  58 + */
  59 + if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
  60 + return -1;
  61 +
  62 + if (!grab_super_passive(sb))
  63 + return -1;
  64 +
  65 + if (sc->nr_to_scan) {
  66 + /* proportion the scan between the two caches */
  67 + int total;
  68 +
  69 + total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;
  70 + count = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total;
  71 +
  72 + /* prune dcache first as icache is pinned by it */
  73 + prune_dcache_sb(sb, count);
  74 + prune_icache_sb(sb, sc->nr_to_scan - count);
  75 + }
  76 +
  77 + count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100)
  78 + * sysctl_vfs_cache_pressure;
  79 + drop_super(sb);
  80 + return count;
  81 +}
  82 +
41 83 /**
42 84 * alloc_super - create new superblock
43 85 * @type: filesystem type superblock should belong to
... ... @@ -116,6 +158,9 @@
116 158 s->s_op = &default_op;
117 159 s->s_time_gran = 1000000000;
118 160 s->cleancache_poolid = -1;
  161 +
  162 + s->s_shrink.seeks = DEFAULT_SEEKS;
  163 + s->s_shrink.shrink = prune_super;
119 164 }
120 165 out:
121 166 return s;
... ... @@ -183,6 +228,10 @@
183 228 if (atomic_dec_and_test(&s->s_active)) {
184 229 cleancache_flush_fs(s);
185 230 fs->kill_sb(s);
  231 +
  232 + /* caches are now gone, we can safely kill the shrinker now */
  233 + unregister_shrinker(&s->s_shrink);
  234 +
186 235 /*
187 236 * We need to call rcu_barrier so all the delayed rcu free
188 237 * inodes are flushed before we release the fs module.
... ... @@ -311,7 +360,6 @@
311 360 {
312 361 const struct super_operations *sop = sb->s_op;
313 362  
314   -
315 363 if (sb->s_root) {
316 364 shrink_dcache_for_umount(sb);
317 365 sync_filesystem(sb);
... ... @@ -399,6 +447,7 @@
399 447 list_add(&s->s_instances, &type->fs_supers);
400 448 spin_unlock(&sb_lock);
401 449 get_filesystem(type);
  450 + register_shrinker(&s->s_shrink);
402 451 return s;
403 452 }
404 453  
include/linux/fs.h
... ... @@ -393,6 +393,7 @@
393 393 #include <linux/semaphore.h>
394 394 #include <linux/fiemap.h>
395 395 #include <linux/rculist_bl.h>
  396 +#include <linux/shrinker.h>
396 397 #include <linux/atomic.h>
397 398  
398 399 #include <asm/byteorder.h>
... ... @@ -1444,7 +1445,13 @@
1444 1445 * Saved pool identifier for cleancache (-1 means none)
1445 1446 */
1446 1447 int cleancache_poolid;
  1448 +
  1449 + struct shrinker s_shrink; /* per-sb shrinker handle */
1447 1450 };
  1451 +
  1452 +/* superblock cache pruning functions */
  1453 +extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
  1454 +extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
1448 1455  
1449 1456 extern struct timespec current_fs_time(struct super_block *sb);
1450 1457  
include/linux/mm.h
... ... @@ -15,6 +15,7 @@
15 15 #include <linux/range.h>
16 16 #include <linux/pfn.h>
17 17 #include <linux/bit_spinlock.h>
  18 +#include <linux/shrinker.h>
18 19  
19 20 struct mempolicy;
20 21 struct anon_vma;
... ... @@ -1120,45 +1121,6 @@
1120 1121 {
1121 1122 }
1122 1123 #endif
1123   -
1124   -/*
1125   - * This struct is used to pass information from page reclaim to the shrinkers.
1126   - * We consolidate the values for easier extention later.
1127   - */
1128   -struct shrink_control {
1129   - gfp_t gfp_mask;
1130   -
1131   - /* How many slab objects shrinker() should scan and try to reclaim */
1132   - unsigned long nr_to_scan;
1133   -};
1134   -
1135   -/*
1136   - * A callback you can register to apply pressure to ageable caches.
1137   - *
1138   - * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
1139   - * and a 'gfpmask'. It should look through the least-recently-used
1140   - * 'nr_to_scan' entries and attempt to free them up. It should return
1141   - * the number of objects which remain in the cache. If it returns -1, it means
1142   - * it cannot do any scanning at this time (eg. there is a risk of deadlock).
1143   - *
1144   - * The 'gfpmask' refers to the allocation we are currently trying to
1145   - * fulfil.
1146   - *
1147   - * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
1148   - * querying the cache size, so a fastpath for that case is appropriate.
1149   - */
1150   -struct shrinker {
1151   - int (*shrink)(struct shrinker *, struct shrink_control *sc);
1152   - int seeks; /* seeks to recreate an obj */
1153   - long batch; /* reclaim batch size, 0 = default */
1154   -
1155   - /* These are for internal use */
1156   - struct list_head list;
1157   - long nr; /* objs pending delete */
1158   -};
1159   -#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
1160   -extern void register_shrinker(struct shrinker *);
1161   -extern void unregister_shrinker(struct shrinker *);
1162 1124  
1163 1125 int vma_wants_writenotify(struct vm_area_struct *vma);
1164 1126  
include/linux/shrinker.h
  1 +#ifndef _LINUX_SHRINKER_H
  2 +#define _LINUX_SHRINKER_H
  3 +
  4 +/*
  5 + * This struct is used to pass information from page reclaim to the shrinkers.
   6 + * We consolidate the values for easier extension later.
  7 + */
  8 +struct shrink_control {
  9 + gfp_t gfp_mask;
  10 +
  11 + /* How many slab objects shrinker() should scan and try to reclaim */
  12 + unsigned long nr_to_scan;
  13 +};
  14 +
  15 +/*
  16 + * A callback you can register to apply pressure to ageable caches.
  17 + *
  18 + * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
  19 + * and a 'gfpmask'. It should look through the least-recently-used
  20 + * 'nr_to_scan' entries and attempt to free them up. It should return
  21 + * the number of objects which remain in the cache. If it returns -1, it means
  22 + * it cannot do any scanning at this time (eg. there is a risk of deadlock).
  23 + *
  24 + * The 'gfpmask' refers to the allocation we are currently trying to
  25 + * fulfil.
  26 + *
  27 + * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
  28 + * querying the cache size, so a fastpath for that case is appropriate.
  29 + */
  30 +struct shrinker {
  31 + int (*shrink)(struct shrinker *, struct shrink_control *sc);
  32 + int seeks; /* seeks to recreate an obj */
  33 + long batch; /* reclaim batch size, 0 = default */
  34 +
  35 + /* These are for internal use */
  36 + struct list_head list;
  37 + long nr; /* objs pending delete */
  38 +};
  39 +#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
  40 +extern void register_shrinker(struct shrinker *);
  41 +extern void unregister_shrinker(struct shrinker *);
  42 +#endif
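
For readers coming to this interface for the first time, here is a minimal,
hypothetical module built against the header above.  Only struct shrinker,
struct shrink_control, DEFAULT_SEEKS, register_shrinker()/unregister_shrinker()
and the callback contract (return -1 when scanning is unsafe, report the
cache size when nr_to_scan is 0) come from this commit; the demo module, its
names and its "cache" counter are invented for illustration.

/*
 * Hypothetical example shrinker: the "cache" is just a counter standing in
 * for reclaimable objects.
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/shrinker.h>

static int demo_cache_objects = 1000;	/* pretend objects we could reclaim */

static int demo_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	if (sc->nr_to_scan) {
		/* don't recurse into a filesystem from a GFP_NOFS allocation */
		if (!(sc->gfp_mask & __GFP_FS))
			return -1;
		if (demo_cache_objects > (int)sc->nr_to_scan)
			demo_cache_objects -= sc->nr_to_scan;
		else
			demo_cache_objects = 0;
	}
	/* nr_to_scan == 0 is a size query, so just report what remains */
	return demo_cache_objects;
}

static struct shrinker demo_shrinker = {
	.shrink	= demo_shrink,
	.seeks	= DEFAULT_SEEKS,
};

static int __init demo_init(void)
{
	register_shrinker(&demo_shrinker);
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_shrinker(&demo_shrinker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");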