commit 48a066e72d970a3e225a9c18690d570c736fc455
parent 42c326082d
Author: Al Viro <viro@zeniv.linux.org.uk>

RCU'd vfsmounts

* RCU-delayed freeing of vfsmounts
* vfsmount_lock replaced with a seqlock (mount_lock); a reader-side sketch
follows this list
* sequence number from mount_lock is stored in nameidata->m_seq and
used when we exit RCU mode
* new vfsmount flag - MNT_SYNC_UMOUNT.  Set by umount_tree() when its
caller knows that vfsmount will have no surviving references.
* synchronize_rcu() done between unlocking namespace_sem in namespace_unlock()
and doing pending mntput().
* new helper: legitimize_mnt(mnt, seq).  Checks the mount_lock sequence
number against seq, then grabs a reference to mnt.  Then it rechecks mount_lock
to close the race and either returns success or drops the reference it
has acquired.  The subtle point is that in case of MNT_SYNC_UMOUNT we can
simply decrement the refcount and sod off - aforementioned synchronize_rcu()
makes sure that the final mntput() won't come until we leave RCU mode.  We need
that, since we don't want to end up with some lazy pathwalk racing with
umount() and stealing the final mntput() from it - the caller of umount() may
expect it to return only once the fs is shut down, and we don't want to break
that.  In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do a
full-blown mntput() in case of a mount_lock sequence number mismatch happening
just as we'd grabbed the reference, but in those cases we won't be stealing
the final mntput() from anything that would care.  (A caller-side sketch
follows this list.)
* mntput_no_expire() doesn't lock anything on the fast path now.  Incidentally,
SMP and UP cases are handled the same way - no ifdefs there.
* normal pathname resolution does *not* do any writes to mount_lock.  It does,
of course, bump the refcounts of vfsmount and dentry at the very end, but
that's it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

6 files changed, 136 insertions(+), 83 deletions(-)

fs/dcache.c
@@ -2887,24 +2887,28 @@
 	struct vfsmount *vfsmnt = path->mnt;
 	struct mount *mnt = real_mount(vfsmnt);
 	int error = 0;
-	unsigned seq = 0;
+	unsigned seq, m_seq = 0;
 	char *bptr;
 	int blen;
 
-	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
+restart_mnt:
+	read_seqbegin_or_lock(&mount_lock, &m_seq);
+	seq = 0;
 restart:
 	bptr = *buffer;
 	blen = *buflen;
+	error = 0;
 	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
 		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+			struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
 			/* Global root? */
-			if (mnt_has_parent(mnt)) {
-				dentry = mnt->mnt_mountpoint;
-				mnt = mnt->mnt_parent;
+			if (mnt != parent) {
+				dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+				mnt = parent;
 				vfsmnt = &mnt->mnt;
 				continue;
 			}
@@ -2938,7 +2942,11 @@
 		goto restart;
 	}
 	done_seqretry(&rename_lock, seq);
-	br_read_unlock(&vfsmount_lock);
+	if (need_seqretry(&mount_lock, m_seq)) {
+		m_seq = 1;
+		goto restart_mnt;
+	}
+	done_seqretry(&mount_lock, m_seq);
 
 	if (error >= 0 && bptr == *buffer) {
 		if (--blen < 0)
fs/mount.h
@@ -1,7 +1,6 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
-#include <linux/lglock.h>
 
 struct mnt_namespace {
 	atomic_t count;
@@ -30,6 +29,7 @@
 	struct mount *mnt_parent;
 	struct dentry *mnt_mountpoint;
 	struct vfsmount mnt;
+	struct rcu_head mnt_rcu;
 #ifdef CONFIG_SMP
 	struct mnt_pcp __percpu *mnt_pcp;
 #else
@@ -80,21 +80,23 @@
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
+extern bool legitimize_mnt(struct vfsmount *, unsigned);
+
 static inline void get_mnt_ns(struct mnt_namespace *ns)
 {
 	atomic_inc(&ns->count);
 }
 
-extern struct lglock vfsmount_lock;
+extern seqlock_t mount_lock;
 
 static inline void lock_mount_hash(void)
 {
-	br_write_lock(&vfsmount_lock);
+	write_seqlock(&mount_lock);
 }
 
 static inline void unlock_mount_hash(void)
 {
-	br_write_unlock(&vfsmount_lock);
+	write_sequnlock(&mount_lock);
 }
 
 struct proc_mounts {
fs/namei.c
@@ -484,14 +484,12 @@
 
 static inline void lock_rcu_walk(void)
 {
-	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
 }
 
 static inline void unlock_rcu_walk(void)
 {
 	rcu_read_unlock();
-	br_read_unlock(&vfsmount_lock);
 }
 
 /**
@@ -512,26 +510,23 @@
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	/*
-	 * Get a reference to the parent first: we're
-	 * going to make "path_put(nd->path)" valid in
-	 * non-RCU context for "terminate_walk()".
-	 *
-	 * If this doesn't work, return immediately with
-	 * RCU walking still active (and then we will do
-	 * the RCU walk cleanup in terminate_walk()).
+	 * After legitimizing the bastards, terminate_walk()
+	 * will do the right thing for non-RCU mode, and all our
+	 * subsequent exit cases should rcu_read_unlock()
+	 * before returning.  Do vfsmount first; if dentry
+	 * can't be legitimized, just set nd->path.dentry to NULL
+	 * and rely on dput(NULL) being a no-op.
 	 */
-	if (!lockref_get_not_dead(&parent->d_lockref))
+	if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
 		return -ECHILD;
-
-	/*
-	 * After the mntget(), we terminate_walk() will do
-	 * the right thing for non-RCU mode, and all our
-	 * subsequent exit cases should unlock_rcu_walk()
-	 * before returning.
-	 */
-	mntget(nd->path.mnt);
 	nd->flags &= ~LOOKUP_RCU;
 
+	if (!lockref_get_not_dead(&parent->d_lockref)) {
+		nd->path.dentry = NULL;
+		unlock_rcu_walk();
+		return -ECHILD;
+	}
+
 	/*
 	 * For a negative lookup, the lookup sequence point is the parents
 	 * sequence point, and it only needs to revalidate the parent dentry.
@@ -608,16 +603,21 @@
 	if (!(nd->flags & LOOKUP_ROOT))
 		nd->root.mnt = NULL;
 
+	if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+		unlock_rcu_walk();
+		return -ECHILD;
+	}
 	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
 		unlock_rcu_walk();
+		mntput(nd->path.mnt);
 		return -ECHILD;
 	}
 	if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
 		unlock_rcu_walk();
 		dput(dentry);
+		mntput(nd->path.mnt);
 		return -ECHILD;
 	}
-	mntget(nd->path.mnt);
 	unlock_rcu_walk();
 }
 
@@ -909,15 +909,15 @@
 	struct mount *parent;
 	struct dentry *mountpoint;
 
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	parent = mnt->mnt_parent;
 	if (parent == mnt) {
-		br_read_unlock(&vfsmount_lock);
+		read_sequnlock_excl(&mount_lock);
 		return 0;
 	}
 	mntget(&parent->mnt);
 	mountpoint = dget(mnt->mnt_mountpoint);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -1048,8 +1048,8 @@
 
 		/* Something is mounted on this dentry in another
 		 * namespace and/or whatever was mounted there in this
-		 * namespace got unmounted before we managed to get the
-		 * vfsmount_lock */
+		 * namespace got unmounted before lookup_mnt() could
+		 * get it */
 	}
 
 	/* Handle an automount point */
@@ -1864,6 +1864,7 @@
 		if (flags & LOOKUP_RCU) {
 			lock_rcu_walk();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			nd->m_seq = read_seqbegin(&mount_lock);
 		} else {
 			path_get(&nd->path);
 		}
@@ -1872,6 +1873,7 @@
 
 	nd->root.mnt = NULL;
 
+	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
 			lock_rcu_walk();
fs/namespace.c
@@ -53,7 +53,7 @@
  * It should be taken for write in all cases where the vfsmount
  * tree or hash is modified or when a vfsmount structure is modified.
  */
-DEFINE_BRLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -547,16 +547,38 @@
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+	struct mount *mnt;
+	if (read_seqretry(&mount_lock, seq))
+		return false;
+	if (bastard == NULL)
+		return true;
+	mnt = real_mount(bastard);
+	mnt_add_count(mnt, 1);
+	if (likely(!read_seqretry(&mount_lock, seq)))
+		return true;
+	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+		mnt_add_count(mnt, -1);
+		return false;
+	}
+	rcu_read_unlock();
+	mntput(bastard);
+	rcu_read_lock();
+	return false;
+}
+
 /*
  * find the first mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * call under rcu_read_lock()
  */
 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head *head = mount_hashtable + hash(mnt, dentry);
 	struct mount *p;
 
-	list_for_each_entry(p, head, mnt_hash)
+	list_for_each_entry_rcu(p, head, mnt_hash)
 		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
 			return p;
 	return NULL;
@@ -564,7 +586,7 @@
 
 /*
  * find the last mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * mount_lock must be held.
  */
 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -596,17 +618,17 @@
 struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct mount *child_mnt;
+	struct vfsmount *m;
+	unsigned seq;
 
-	br_read_lock(&vfsmount_lock);
-	child_mnt = __lookup_mnt(path->mnt, path->dentry);
-	if (child_mnt) {
-		mnt_add_count(child_mnt, 1);
-		br_read_unlock(&vfsmount_lock);
-		return &child_mnt->mnt;
-	} else {
-		br_read_unlock(&vfsmount_lock);
-		return NULL;
-	}
+	rcu_read_lock();
+	do {
+		seq = read_seqbegin(&mount_lock);
+		child_mnt = __lookup_mnt(path->mnt, path->dentry);
+		m = child_mnt ? &child_mnt->mnt : NULL;
+	} while (!legitimize_mnt(m, seq));
+	rcu_read_unlock();
+	return m;
 }
 
 static struct mountpoint *new_mountpoint(struct dentry *dentry)
@@ -874,38 +896,46 @@
 	return ERR_PTR(err);
 }
 
+static void delayed_free(struct rcu_head *head)
+{
+	struct mount *mnt = container_of(head, struct mount, mnt_rcu);
+	kfree(mnt->mnt_devname);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_pcp);
+#endif
+	kmem_cache_free(mnt_cache, mnt);
+}
+
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
-#ifdef CONFIG_SMP
-	br_read_lock(&vfsmount_lock);
-	if (likely(mnt->mnt_ns)) {
-		/* shouldn't be the last one */
-		mnt_add_count(mnt, -1);
-		br_read_unlock(&vfsmount_lock);
+	rcu_read_lock();
+	mnt_add_count(mnt, -1);
+	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+		rcu_read_unlock();
 		return;
 	}
-	br_read_unlock(&vfsmount_lock);
-
 	lock_mount_hash();
-	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
+		rcu_read_unlock();
 		unlock_mount_hash();
 		return;
 	}
-#else
-	mnt_add_count(mnt, -1);
-	if (likely(mnt_get_count(mnt)))
-		return;
-	lock_mount_hash();
-#endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
+		rcu_read_unlock();
 		unlock_mount_hash();
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
+	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+		rcu_read_unlock();
+		unlock_mount_hash();
+		return;
+	}
+	mnt->mnt.mnt_flags |= MNT_DOOMED;
+	rcu_read_unlock();
 
 	list_del(&mnt->mnt_instance);
 	unlock_mount_hash();
@@ -924,7 +954,8 @@
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
-	free_vfsmnt(mnt);
+	mnt_free_id(mnt);
+	call_rcu(&mnt->mnt_rcu, delayed_free);
 }
 
 void mntput(struct vfsmount *mnt)
@@ -1137,6 +1168,8 @@
 	list_splice_init(&unmounted, &head);
 	up_write(&namespace_sem);
 
+	synchronize_rcu();
+
 	while (!list_empty(&head)) {
 		mnt = list_first_entry(&head, struct mount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
@@ -1152,10 +1185,13 @@
 }
 
 /*
- * vfsmount lock must be held for write
+ * mount_lock must be held
  * namespace_sem must be held for write
+ * how = 0 => just this tree, don't propagate
+ * how = 1 => propagate; we know that nobody else has reference to any victims
+ * how = 2 => lazy umount
 */
-void umount_tree(struct mount *mnt, int propagate)
+void umount_tree(struct mount *mnt, int how)
 {
 	LIST_HEAD(tmp_list);
 	struct mount *p;
@@ -1163,7 +1199,7 @@
 	for (p = mnt; p; p = next_mnt(p, mnt))
 		list_move(&p->mnt_hash, &tmp_list);
 
-	if (propagate)
+	if (how)
 		propagate_umount(&tmp_list);
 
 	list_for_each_entry(p, &tmp_list, mnt_hash) {
@@ -1171,6 +1207,8 @@
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
+		if (how < 2)
+			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
 		list_del_init(&p->mnt_child);
 		if (mnt_has_parent(p)) {
 			put_mountpoint(p->mnt_mp);
@@ -1262,14 +1300,18 @@
 	lock_mount_hash();
 	event++;
 
-	if (!(flags & MNT_DETACH))
-		shrink_submounts(mnt);
-
-	retval = -EBUSY;
-	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
+	if (flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, 1);
+			umount_tree(mnt, 2);
 		retval = 0;
+	} else {
+		shrink_submounts(mnt);
+		retval = -EBUSY;
+		if (!propagate_mount_busy(mnt, 2)) {
+			if (!list_empty(&mnt->mnt_list))
+				umount_tree(mnt, 1);
+			retval = 0;
+		}
 	}
 	unlock_mount_hash();
 	namespace_unlock();
@@ -1955,7 +1997,7 @@
 	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
 
 	mp = lock_mount(path);
 	if (IS_ERR(mp))
@@ -2172,7 +2214,7 @@
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
  *
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
  */
 static void shrink_submounts(struct mount *mnt)
 {
@@ -2558,7 +2600,7 @@
 /*
  * Return true if path is reachable from root
  *
- * namespace_sem or vfsmount_lock is held
+ * namespace_sem or mount_lock is held
 */
 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
			 const struct path *root)
@@ -2573,9 +2615,9 @@
 int path_is_under(struct path *path1, struct path *path2)
 {
 	int res;
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2748,8 +2790,6 @@
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
-	br_lock_init(&vfsmount_lock);
-
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2788,9 +2828,8 @@
 {
 	/* release long term mount so mount point can be released */
 	if (!IS_ERR_OR_NULL(mnt)) {
-		lock_mount_hash();
 		real_mount(mnt)->mnt_ns = NULL;
-		unlock_mount_hash();
+		synchronize_rcu();	/* yecchhh... */
 		mntput(mnt);
 	}
 }
include/linux/mount.h
@@ -49,6 +49,8 @@
 
 #define MNT_LOCK_READONLY	0x400000
 #define MNT_LOCKED		0x800000
+#define MNT_DOOMED		0x1000000
+#define MNT_SYNC_UMOUNT		0x2000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
include/linux/namei.h
@@ -16,7 +16,7 @@
 	struct path	root;
 	struct inode	*inode; /* path.dentry.d_inode */
 	unsigned int	flags;
-	unsigned	seq;
+	unsigned	seq, m_seq;
 	int		last_type;
 	unsigned	depth;
 	char *saved_names[MAX_NESTED_LINKS + 1];