commit 48a066e72d970a3e225a9c18690d570c736fc455
parent 42c326082d
Author: Al Viro <viro@zeniv.linux.org.uk>

RCU'd vfsmounts

* RCU-delayed freeing of vfsmounts
* vfsmount_lock replaced with a seqlock (mount_lock); a reader-side sketch
follows this list
* sequence number from mount_lock is stored in nameidata->m_seq and
used when we exit RCU mode
* new vfsmount flag - MNT_SYNC_UMOUNT.  Set by umount_tree() when its
caller knows that vfsmount will have no surviving references.
* synchronize_rcu() done between unlocking namespace_sem in namespace_unlock()
and doing pending mntput().
* new helper: legitimize_mnt(mnt, seq).  Checks the mount_lock sequence
number against seq, then grabs a reference to mnt.  Then it rechecks mount_lock
to close the race and either returns success or drops the reference it
has acquired.  The subtle point is that in case of MNT_SYNC_UMOUNT we can
simply decrement the refcount and sod off - aforementioned synchronize_rcu()
makes sure that the final mntput() won't come until we leave RCU mode.  We need
that, since we don't want to end up with some lazy pathwalk racing with
umount() and stealing the final mntput() from it - the caller of umount() may
expect it to return only once the fs is shut down, and we don't want to break
that.  In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do a
full-blown mntput() in case of a mount_lock sequence number mismatch happening
just as we'd grabbed the reference, but in those cases we won't be stealing
the final mntput() from anything that would care.  (A caller-side sketch
follows this list.)
* mntput_no_expire() doesn't lock anything on the fast path now.  Incidentally,
SMP and UP cases are handled the same way - no ifdefs there.
* normal pathname resolution does *not* do any writes to mount_lock.  It does,
of course, bump the refcounts of vfsmount and dentry at the very end, but
that's it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

6 files changed, 136 insertions(+), 83 deletions(-)

fs/dcache.c
@@ -2887,24 +2887,28 @@
 	struct vfsmount *vfsmnt = path->mnt;
 	struct mount *mnt = real_mount(vfsmnt);
 	int error = 0;
-	unsigned seq = 0;
+	unsigned seq, m_seq = 0;
 	char *bptr;
 	int blen;
 
-	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
+restart_mnt:
+	read_seqbegin_or_lock(&mount_lock, &m_seq);
+	seq = 0;
 restart:
 	bptr = *buffer;
 	blen = *buflen;
+	error = 0;
 	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
 		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+			struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
 			/* Global root? */
-			if (mnt_has_parent(mnt)) {
-				dentry = mnt->mnt_mountpoint;
-				mnt = mnt->mnt_parent;
+			if (mnt != parent) {
+				dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+				mnt = parent;
 				vfsmnt = &mnt->mnt;
 				continue;
 			}
@@ -2938,7 +2942,11 @@
 		goto restart;
 	}
 	done_seqretry(&rename_lock, seq);
-	br_read_unlock(&vfsmount_lock);
+	if (need_seqretry(&mount_lock, m_seq)) {
+		m_seq = 1;
+		goto restart_mnt;
+	}
+	done_seqretry(&mount_lock, m_seq);
 
 	if (error >= 0 && bptr == *buffer) {
 		if (--blen < 0)
fs/mount.h
@@ -1,7 +1,6 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
-#include <linux/lglock.h>
 
 struct mnt_namespace {
 	atomic_t count;
@@ -30,6 +29,7 @@
 	struct mount *mnt_parent;
 	struct dentry *mnt_mountpoint;
 	struct vfsmount mnt;
+	struct rcu_head mnt_rcu;
 #ifdef CONFIG_SMP
 	struct mnt_pcp __percpu *mnt_pcp;
 #else
@@ -80,21 +80,23 @@
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
+extern bool legitimize_mnt(struct vfsmount *, unsigned);
+
 static inline void get_mnt_ns(struct mnt_namespace *ns)
 {
 	atomic_inc(&ns->count);
 }
 
-extern struct lglock vfsmount_lock;
+extern seqlock_t mount_lock;
 
 static inline void lock_mount_hash(void)
 {
-	br_write_lock(&vfsmount_lock);
+	write_seqlock(&mount_lock);
 }
 
 static inline void unlock_mount_hash(void)
 {
-	br_write_unlock(&vfsmount_lock);
+	write_sequnlock(&mount_lock);
 }
 
 struct proc_mounts {
fs/namei.c
@@ -484,14 +484,12 @@
 
 static inline void lock_rcu_walk(void)
 {
-	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
 }
 
 static inline void unlock_rcu_walk(void)
 {
 	rcu_read_unlock();
-	br_read_unlock(&vfsmount_lock);
 }
 
 /**
@@ -512,26 +510,23 @@
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	/*
-	 * Get a reference to the parent first: we're
-	 * going to make "path_put(nd->path)" valid in
-	 * non-RCU context for "terminate_walk()".
-	 *
-	 * If this doesn't work, return immediately with
-	 * RCU walking still active (and then we will do
-	 * the RCU walk cleanup in terminate_walk()).
+	 * After legitimizing the bastards, terminate_walk()
+	 * will do the right thing for non-RCU mode, and all our
+	 * subsequent exit cases should rcu_read_unlock()
+	 * before returning.  Do vfsmount first; if dentry
+	 * can't be legitimized, just set nd->path.dentry to NULL
+	 * and rely on dput(NULL) being a no-op.
 	 */
-	if (!lockref_get_not_dead(&parent->d_lockref))
+	if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
 		return -ECHILD;
-
-	/*
-	 * After the mntget(), we terminate_walk() will do
-	 * the right thing for non-RCU mode, and all our
-	 * subsequent exit cases should unlock_rcu_walk()
-	 * before returning.
-	 */
-	mntget(nd->path.mnt);
 	nd->flags &= ~LOOKUP_RCU;
 
+	if (!lockref_get_not_dead(&parent->d_lockref)) {
+		nd->path.dentry = NULL;
+		unlock_rcu_walk();
+		return -ECHILD;
+	}
+
 	/*
 	 * For a negative lookup, the lookup sequence point is the parents
 	 * sequence point, and it only needs to revalidate the parent dentry.
@@ -608,16 +603,21 @@
 	if (!(nd->flags & LOOKUP_ROOT))
 		nd->root.mnt = NULL;
 
+	if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+		unlock_rcu_walk();
+		return -ECHILD;
+	}
 	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
 		unlock_rcu_walk();
+		mntput(nd->path.mnt);
 		return -ECHILD;
 	}
 	if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
 		unlock_rcu_walk();
 		dput(dentry);
+		mntput(nd->path.mnt);
 		return -ECHILD;
 	}
-	mntget(nd->path.mnt);
 	unlock_rcu_walk();
 }
 
@@ -909,15 +909,15 @@
 	struct mount *parent;
 	struct dentry *mountpoint;
 
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	parent = mnt->mnt_parent;
 	if (parent == mnt) {
-		br_read_unlock(&vfsmount_lock);
+		read_sequnlock_excl(&mount_lock);
 		return 0;
 	}
 	mntget(&parent->mnt);
 	mountpoint = dget(mnt->mnt_mountpoint);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -1048,8 +1048,8 @@
 
 		/* Something is mounted on this dentry in another
 		 * namespace and/or whatever was mounted there in this
-		 * namespace got unmounted before we managed to get the
-		 * vfsmount_lock */
+		 * namespace got unmounted before lookup_mnt() could
+		 * get it */
 	}
 
 	/* Handle an automount point */
@@ -1864,6 +1864,7 @@
 		if (flags & LOOKUP_RCU) {
 			lock_rcu_walk();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			nd->m_seq = read_seqbegin(&mount_lock);
 		} else {
 			path_get(&nd->path);
 		}
@@ -1872,6 +1873,7 @@
 
 	nd->root.mnt = NULL;
 
+	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
 			lock_rcu_walk();
fs/namespace.c
@@ -53,7 +53,7 @@
  * It should be taken for write in all cases where the vfsmount
  * tree or hash is modified or when a vfsmount structure is modified.
  */
-DEFINE_BRLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -547,16 +547,38 @@
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+	struct mount *mnt;
+	if (read_seqretry(&mount_lock, seq))
+		return false;
+	if (bastard == NULL)
+		return true;
+	mnt = real_mount(bastard);
+	mnt_add_count(mnt, 1);
+	if (likely(!read_seqretry(&mount_lock, seq)))
+		return true;
+	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+		mnt_add_count(mnt, -1);
+		return false;
+	}
+	rcu_read_unlock();
+	mntput(bastard);
+	rcu_read_lock();
+	return false;
+}
+
 /*
  * find the first mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * call under rcu_read_lock()
  */
 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head *head = mount_hashtable + hash(mnt, dentry);
 	struct mount *p;
 
-	list_for_each_entry(p, head, mnt_hash)
+	list_for_each_entry_rcu(p, head, mnt_hash)
 		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
 			return p;
 	return NULL;
@@ -564,7 +586,7 @@
 
 /*
  * find the last mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * mount_lock must be held.
  */
 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -596,17 +618,17 @@
 struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct mount *child_mnt;
+	struct vfsmount *m;
+	unsigned seq;
 
-	br_read_lock(&vfsmount_lock);
-	child_mnt = __lookup_mnt(path->mnt, path->dentry);
-	if (child_mnt) {
-		mnt_add_count(child_mnt, 1);
-		br_read_unlock(&vfsmount_lock);
-		return &child_mnt->mnt;
-	} else {
-		br_read_unlock(&vfsmount_lock);
-		return NULL;
-	}
+	rcu_read_lock();
+	do {
+		seq = read_seqbegin(&mount_lock);
+		child_mnt = __lookup_mnt(path->mnt, path->dentry);
+		m = child_mnt ? &child_mnt->mnt : NULL;
+	} while (!legitimize_mnt(m, seq));
+	rcu_read_unlock();
+	return m;
 }
 
 static struct mountpoint *new_mountpoint(struct dentry *dentry)
@@ -874,38 +896,46 @@
 	return ERR_PTR(err);
 }
 
+static void delayed_free(struct rcu_head *head)
+{
+	struct mount *mnt = container_of(head, struct mount, mnt_rcu);
+	kfree(mnt->mnt_devname);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_pcp);
+#endif
+	kmem_cache_free(mnt_cache, mnt);
+}
+
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
-#ifdef CONFIG_SMP
-	br_read_lock(&vfsmount_lock);
-	if (likely(mnt->mnt_ns)) {
-		/* shouldn't be the last one */
-		mnt_add_count(mnt, -1);
-		br_read_unlock(&vfsmount_lock);
+	rcu_read_lock();
+	mnt_add_count(mnt, -1);
+	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+		rcu_read_unlock();
 		return;
 	}
-	br_read_unlock(&vfsmount_lock);
-
 	lock_mount_hash();
-	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
+		rcu_read_unlock();
 		unlock_mount_hash();
 		return;
 	}
-#else
-	mnt_add_count(mnt, -1);
-	if (likely(mnt_get_count(mnt)))
-		return;
-	lock_mount_hash();
-#endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
+		rcu_read_unlock();
 		unlock_mount_hash();
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
+	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+		rcu_read_unlock();
+		unlock_mount_hash();
+		return;
+	}
+	mnt->mnt.mnt_flags |= MNT_DOOMED;
+	rcu_read_unlock();
 
 	list_del(&mnt->mnt_instance);
 	unlock_mount_hash();
@@ -924,7 +954,8 @@
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
-	free_vfsmnt(mnt);
+	mnt_free_id(mnt);
+	call_rcu(&mnt->mnt_rcu, delayed_free);
 }
 
 void mntput(struct vfsmount *mnt)
@@ -1137,6 +1168,8 @@
 	list_splice_init(&unmounted, &head);
 	up_write(&namespace_sem);
 
+	synchronize_rcu();
+
 	while (!list_empty(&head)) {
 		mnt = list_first_entry(&head, struct mount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
@@ -1152,10 +1185,13 @@
 }
 
 /*
- * vfsmount lock must be held for write
+ * mount_lock must be held
  * namespace_sem must be held for write
+ * how = 0 => just this tree, don't propagate
+ * how = 1 => propagate; we know that nobody else has reference to any victims
+ * how = 2 => lazy umount
 */
-void umount_tree(struct mount *mnt, int propagate)
+void umount_tree(struct mount *mnt, int how)
 {
 	LIST_HEAD(tmp_list);
 	struct mount *p;
@@ -1163,7 +1199,7 @@
 	for (p = mnt; p; p = next_mnt(p, mnt))
 		list_move(&p->mnt_hash, &tmp_list);
 
-	if (propagate)
+	if (how)
 		propagate_umount(&tmp_list);
 
 	list_for_each_entry(p, &tmp_list, mnt_hash) {
@@ -1171,6 +1207,8 @@
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
+		if (how < 2)
+			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
 		list_del_init(&p->mnt_child);
 		if (mnt_has_parent(p)) {
 			put_mountpoint(p->mnt_mp);
@@ -1262,14 +1300,18 @@
 	lock_mount_hash();
 	event++;
 
-	if (!(flags & MNT_DETACH))
-		shrink_submounts(mnt);
-
-	retval = -EBUSY;
-	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
+	if (flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, 1);
+			umount_tree(mnt, 2);
 		retval = 0;
+	} else {
+		shrink_submounts(mnt);
+		retval = -EBUSY;
+		if (!propagate_mount_busy(mnt, 2)) {
+			if (!list_empty(&mnt->mnt_list))
+				umount_tree(mnt, 1);
+			retval = 0;
+		}
 	}
 	unlock_mount_hash();
 	namespace_unlock();
@@ -1955,7 +1997,7 @@
 	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
 
 	mp = lock_mount(path);
 	if (IS_ERR(mp))
@@ -2172,7 +2214,7 @@
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
  *
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
  */
 static void shrink_submounts(struct mount *mnt)
 {
@@ -2558,7 +2600,7 @@
 /*
  * Return true if path is reachable from root
  *
- * namespace_sem or vfsmount_lock is held
+ * namespace_sem or mount_lock is held
 */
 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
			 const struct path *root)
@@ -2573,9 +2615,9 @@
 int path_is_under(struct path *path1, struct path *path2)
 {
 	int res;
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2748,8 +2790,6 @@
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
-	br_lock_init(&vfsmount_lock);
-
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2788,9 +2828,8 @@
 {
 	/* release long term mount so mount point can be released */
 	if (!IS_ERR_OR_NULL(mnt)) {
-		lock_mount_hash();
 		real_mount(mnt)->mnt_ns = NULL;
-		unlock_mount_hash();
+		synchronize_rcu();	/* yecchhh... */
 		mntput(mnt);
 	}
 }
include/linux/mount.h
@@ -49,6 +49,8 @@
 
 #define MNT_LOCK_READONLY	0x400000
 #define MNT_LOCKED		0x800000
+#define MNT_DOOMED		0x1000000
+#define MNT_SYNC_UMOUNT		0x2000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
include/linux/namei.h
@@ -16,7 +16,7 @@
 	struct path	root;
 	struct inode	*inode; /* path.dentry.d_inode */
 	unsigned int	flags;
-	unsigned	seq;
+	unsigned	seq, m_seq;
 	int		last_type;
 	unsigned	depth;
 	char *saved_names[MAX_NESTED_LINKS + 1];