Commit 80d4d8397a96b79d4562f68596ba33432ab47cd1

Authored by Eric W. Biederman
Committed by Greg Kroah-Hartman
1 parent 16811f0192

mnt: Implicitly add MNT_NODEV on remount when it was implicitly added by mount

commit 3e1866410f11356a9fd869beb3e95983dc79c067 upstream.

Now that remount properly enforces the rule that you can't remove
nodev, at least sandstorm.io breaks when performing a remount.

It turns out that there is an easy, intuitive solution: implicitly
add nodev on remount when nodev was implicitly added on mount.

Tested-by: Cedric Bosdonnat <cbosdonnat@suse.com>
Tested-by: Richard Weinberger <richard@nod.at>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 7 additions and 1 deletion (inline diff)

1 /* 1 /*
2 * linux/fs/namespace.c 2 * linux/fs/namespace.c
3 * 3 *
4 * (C) Copyright Al Viro 2000, 2001 4 * (C) Copyright Al Viro 2000, 2001
5 * Released under GPL v2. 5 * Released under GPL v2.
6 * 6 *
7 * Based on code from fs/super.c, copyright Linus Torvalds and others. 7 * Based on code from fs/super.c, copyright Linus Torvalds and others.
8 * Heavily rewritten. 8 * Heavily rewritten.
9 */ 9 */
10 10
11 #include <linux/syscalls.h> 11 #include <linux/syscalls.h>
12 #include <linux/export.h> 12 #include <linux/export.h>
13 #include <linux/capability.h> 13 #include <linux/capability.h>
14 #include <linux/mnt_namespace.h> 14 #include <linux/mnt_namespace.h>
15 #include <linux/user_namespace.h> 15 #include <linux/user_namespace.h>
16 #include <linux/namei.h> 16 #include <linux/namei.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 #include <linux/init.h> /* init_rootfs */ 19 #include <linux/init.h> /* init_rootfs */
20 #include <linux/fs_struct.h> /* get_fs_root et.al. */ 20 #include <linux/fs_struct.h> /* get_fs_root et.al. */
21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
22 #include <linux/uaccess.h> 22 #include <linux/uaccess.h>
23 #include <linux/proc_ns.h> 23 #include <linux/proc_ns.h>
24 #include <linux/magic.h> 24 #include <linux/magic.h>
25 #include <linux/bootmem.h> 25 #include <linux/bootmem.h>
26 #include <linux/task_work.h> 26 #include <linux/task_work.h>
27 #include "pnode.h" 27 #include "pnode.h"
28 #include "internal.h" 28 #include "internal.h"
29 29
/*
 * Sizing parameters for the mount (m_) and mountpoint (mp_) hash tables
 * below; set up once at boot (hence __read_mostly).
 */
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;
34 34
/* "mhash_entries=" boot parameter: requested mount hash table size. */
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	/* Returning 0 tells the early-param code the option was not handled. */
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);
44 44
/* "mphash_entries=" boot parameter: requested mountpoint hash table size. */
static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	/* Returning 0 tells the early-param code the option was not handled. */
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);
54 54
static u64 event;			/* mount-tree change counter */
static DEFINE_IDA(mnt_id_ida);		/* allocator for mnt->mnt_id */
static DEFINE_IDA(mnt_group_ida);	/* allocator for peer-group ids */
static DEFINE_SPINLOCK(mnt_id_lock);	/* serializes mnt_id alloc vs free */
static int mnt_id_start = 0;		/* lowest id that may be free */
static int mnt_group_start = 1;		/* lowest group id that may be free */

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
80 80
81 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) 81 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
82 { 82 {
83 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 83 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
84 tmp += ((unsigned long)dentry / L1_CACHE_BYTES); 84 tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
85 tmp = tmp + (tmp >> m_hash_shift); 85 tmp = tmp + (tmp >> m_hash_shift);
86 return &mount_hashtable[tmp & m_hash_mask]; 86 return &mount_hashtable[tmp & m_hash_mask];
87 } 87 }
88 88
89 static inline struct hlist_head *mp_hash(struct dentry *dentry) 89 static inline struct hlist_head *mp_hash(struct dentry *dentry)
90 { 90 {
91 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); 91 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
92 tmp = tmp + (tmp >> mp_hash_shift); 92 tmp = tmp + (tmp >> mp_hash_shift);
93 return &mountpoint_hashtable[tmp & mp_hash_mask]; 93 return &mountpoint_hashtable[tmp & mp_hash_mask];
94 } 94 }
95 95
/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
 */
/* Assign a unique mnt_id to @mnt.  Returns 0 or a negative errno. */
static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	/* Preallocate outside the spinlock: ida_pre_get() may sleep. */
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	/* -EAGAIN: the IDA needs more preallocated memory; replenish and retry. */
	if (res == -EAGAIN)
		goto retry;

	return res;
}
116 116
/* Release @mnt's mnt_id back to the allocator. */
static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	/* Keep mnt_id_start pointing at the lowest id that may be free. */
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}
126 126
/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	/* No retry loop here: namespace_sem excludes concurrent allocators. */
	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}
147 147
/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	/* 0 means "no peer group" for this mount. */
	mnt->mnt_group_id = 0;
}
159 159
/*
 * vfsmount lock must be held for read
 */
/* Adjust @mnt's reference count by @n (may be negative). */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	/* Per-cpu counter: only the summed total is meaningful. */
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}
173 173
/*
 * vfsmount lock must be held for write
 */
/* Sum the per-cpu reference counts into @mnt's total. */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}
192 192
/*
 * Allocate and minimally initialize a struct mount: assign a unique
 * mnt_id, duplicate @name as the device name, set the initial reference
 * count to 1 and initialize all list linkage.  Returns NULL on any
 * allocation failure; partially-built state is unwound via the goto chain.
 */
static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		/* Fresh mount starts with one reference. */
		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}
245 245
246 /* 246 /*
247 * Most r/o checks on a fs are for operations that take 247 * Most r/o checks on a fs are for operations that take
248 * discrete amounts of time, like a write() or unlink(). 248 * discrete amounts of time, like a write() or unlink().
249 * We must keep track of when those operations start 249 * We must keep track of when those operations start
250 * (for permission checks) and when they end, so that 250 * (for permission checks) and when they end, so that
251 * we can determine when writes are able to occur to 251 * we can determine when writes are able to occur to
252 * a filesystem. 252 * a filesystem.
253 */ 253 */
254 /* 254 /*
255 * __mnt_is_readonly: check whether a mount is read-only 255 * __mnt_is_readonly: check whether a mount is read-only
256 * @mnt: the mount to check for its write status 256 * @mnt: the mount to check for its write status
257 * 257 *
258 * This shouldn't be used directly ouside of the VFS. 258 * This shouldn't be used directly ouside of the VFS.
259 * It does not guarantee that the filesystem will stay 259 * It does not guarantee that the filesystem will stay
260 * r/w, just that it is right *now*. This can not and 260 * r/w, just that it is right *now*. This can not and
261 * should not be used in place of IS_RDONLY(inode). 261 * should not be used in place of IS_RDONLY(inode).
262 * mnt_want/drop_write() will _keep_ the filesystem 262 * mnt_want/drop_write() will _keep_ the filesystem
263 * r/w. 263 * r/w.
264 */ 264 */
265 int __mnt_is_readonly(struct vfsmount *mnt) 265 int __mnt_is_readonly(struct vfsmount *mnt)
266 { 266 {
267 if (mnt->mnt_flags & MNT_READONLY) 267 if (mnt->mnt_flags & MNT_READONLY)
268 return 1; 268 return 1;
269 if (mnt->mnt_sb->s_flags & MS_RDONLY) 269 if (mnt->mnt_sb->s_flags & MS_RDONLY)
270 return 1; 270 return 1;
271 return 0; 271 return 0;
272 } 272 }
273 EXPORT_SYMBOL_GPL(__mnt_is_readonly); 273 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
274 274
/* Bump @mnt's active-writer count (per-cpu on SMP). */
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}
283 283
/* Drop @mnt's active-writer count; pairs with mnt_inc_writers(). */
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}
292 292
/*
 * Sum @mnt's per-cpu writer counts.  Only reliable while writers are
 * held off (see the MNT_WRITE_HOLD discussion in mnt_make_readonly()).
 */
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}
308 308
/* Like __mnt_is_readonly() but also honors an in-progress r/o remount. */
static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}
317 317
/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, __mnt_drop_write() must be
 * called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		/* Lost the race with a r/o remount: back out our count. */
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}
363 363
/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success. When the write operation is
 * finished, mnt_drop_write() must be called. This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	/* Don't hold freeze protection if the write ref was refused. */
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
384 384
/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	/* No MNT_WRITE_HOLD spin needed: a write ref is already held. */
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);
408 408
/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}
423 423
/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	/* Drop freeze protection again if the write ref was refused. */
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
442 442
/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}
457 457
/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again. Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
472 472
/* File-based counterpart of __mnt_drop_write(). */
void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}
477 477
/* File-based counterpart of mnt_drop_write(); pairs with mnt_want_write_file(). */
void mnt_drop_write_file(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);
483 483
/*
 * Flip @mnt to read-only, failing with -EBUSY if any writes are in
 * flight.  Uses MNT_WRITE_HOLD to stall new writers while the per-cpu
 * writer counts are summed.
 */
static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}
525 525
/* Clear MNT_READONLY again; inverse of mnt_make_readonly()'s success path. */
static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}
532 532
/*
 * Prepare @sb for a read-only remount: hold off new writers on every
 * mount of this superblock, fail with -EBUSY if any writes (or pending
 * file removals) are outstanding, and on success set s_readonly_remount
 * so mnt_is_readonly() refuses new write refs during the transition.
 * MNT_WRITE_HOLD is always dropped again before returning.
 */
int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		/* Pairs with the smp_rmb() in mnt_is_readonly(). */
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}
568 568
/*
 * Release the memory owned by @mnt itself: the device-name copy, the
 * per-cpu writer counters (SMP only) and finally the mount structure.
 * Callers must ensure no RCU readers can still see @mnt (see
 * delayed_free_vfsmnt() for the RCU-deferred path).
 */
static void free_vfsmnt(struct mount *mnt)
{
	kfree(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}
577 577
/* RCU callback: free a mount once all RCU readers are done with it. */
static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}
582 582
/*
 * Try to convert an RCU-protected vfsmount reference into a real,
 * counted reference.  Call under rcu_read_lock().
 *
 * Returns true if @bastard (possibly NULL) was observed in a stable
 * mount-lock sequence window and, when non-NULL, its refcount was bumped.
 * Returns false if the seqcount moved, in which case any speculative
 * count is undone: for MNT_SYNC_UMOUNT the count is simply dropped,
 * otherwise mntput() must run outside the RCU read section so it can
 * perform the (possibly blocking) teardown.
 */
/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return false;
	if (bastard == NULL)
		return true;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	/* recheck: if nothing changed since the bump, the ref is good */
	if (likely(!read_seqretry(&mount_lock, seq)))
		return true;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return false;
	}
	/* drop the ref outside RCU; mntput() may sleep in teardown */
	rcu_read_unlock();
	mntput(bastard);
	rcu_read_lock();
	return false;
}
604 604
/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 *
 * Walks the mount hash chain for the (@mnt, @dentry) pair and returns
 * the first matching child mount, or NULL if nothing is mounted there.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}
619 619
/*
 * find the last mount at @dentry on vfsmount @mnt.
 * mount_lock must be held.
 *
 * Mounts shadowing the same mountpoint sit consecutively on the hash
 * chain (see attach_shadowed()), so continue from the first match and
 * remember the last entry that still matches.
 */
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
	struct mount *p, *res;
	res = p = __lookup_mnt(mnt, dentry);
	if (!p)
		goto out;
	hlist_for_each_entry_continue(p, mnt_hash) {
		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
			break;
		res = p;
	}
out:
	return res;
}
638 638
/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	/* retry until the lookup and the refcount bump happen in one
	 * stable mount_lock sequence window */
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}
670 670
/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 * current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	/* fast path: d_mountpoint() already filters most dentries */
	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}
705 705
/*
 * Find an existing struct mountpoint for @dentry in the mountpoint hash
 * and take a reference on it.  Returns NULL if none exists, or
 * ERR_PTR(-ENOENT) if the dentry has been unlinked in the meantime.
 */
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}
722 722
/*
 * Allocate a fresh struct mountpoint for @dentry, mark the dentry as
 * mounted (DCACHE_MOUNTED via d_set_mounted()) and hash the mountpoint.
 * Returns the new mountpoint with m_count == 1, or an ERR_PTR on
 * allocation failure or when d_set_mounted() refuses (e.g. dead dentry).
 */
static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;
	int ret;

	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!mp)
		return ERR_PTR(-ENOMEM);

	ret = d_set_mounted(dentry);
	if (ret) {
		kfree(mp);
		return ERR_PTR(ret);
	}

	mp->m_dentry = dentry;
	mp->m_count = 1;
	hlist_add_head(&mp->m_hash, chain);
	INIT_HLIST_HEAD(&mp->m_list);
	return mp;
}
745 745
/*
 * Drop a reference on @mp.  On the last put, clear DCACHE_MOUNTED on the
 * dentry, unhash the mountpoint and free it.  The m_list of mounts using
 * this mountpoint must already be empty by then.
 */
static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}
758 758
759 static inline int check_mnt(struct mount *mnt) 759 static inline int check_mnt(struct mount *mnt)
760 { 760 {
761 return mnt->mnt_ns == current->nsproxy->mnt_ns; 761 return mnt->mnt_ns == current->nsproxy->mnt_ns;
762 } 762 }
763 763
/*
 * vfsmount lock must be held for write
 *
 * Bump the global mount event counter into @ns and wake any pollers
 * (e.g. /proc/mounts watchers) waiting on the namespace.
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}
774 774
/*
 * vfsmount lock must be held for write
 *
 * Like touch_mnt_namespace() but does not advance the global counter:
 * only propagates the current event count, and only wakes pollers if
 * the namespace hasn't already seen this event.
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}
785 785
/*
 * vfsmount lock must be held for write
 *
 * Detach @mnt from its parent.  The old attachment point (parent mount +
 * mountpoint dentry) is returned through @old_path; @mnt becomes its own
 * parent, rooted at its own mnt_root, and is removed from the child list,
 * mount hash and mountpoint list.  The mountpoint reference is dropped.
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}
801 801
/*
 * vfsmount lock must be held for write
 *
 * Record @child_mnt as mounted on @mp under parent @mnt: take references
 * on the mountpoint, the parent mount and the mountpoint dentry, and link
 * the child onto the mountpoint's m_list.  Does NOT hash the child — see
 * attach_mnt()/attach_shadowed() for that.
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
816 816
/*
 * vfsmount lock must be held for write
 *
 * Fully attach @mnt under @parent at @mp: set the mountpoint linkage and
 * make the mount visible in the mount hash and the parent's child list.
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}
828 828
/*
 * Hash @mnt and link it into @parent's child list, preserving shadow
 * ordering: if @shadows is an earlier mount on the same mountpoint, the
 * new mount is inserted directly behind it on both the hash chain and
 * the child list (so __lookup_mnt() still finds the first mount and
 * __lookup_mnt_last() the last); otherwise it is added at the head/tail
 * as a non-shadowing mount.
 */
static void attach_shadowed(struct mount *mnt,
			struct mount *parent,
			struct mount *shadows)
{
	if (shadows) {
		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
		list_add(&mnt->mnt_child, &shadows->mnt_child);
	} else {
		hlist_add_head_rcu(&mnt->mnt_hash,
				m_hash(&parent->mnt, mnt->mnt_mountpoint));
		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	}
}
842 842
/*
 * vfsmount lock must be held for write
 *
 * Commit a (possibly multi-mount) tree rooted at @mnt into its parent's
 * namespace: tag every mount in the tree with the namespace, splice the
 * whole tree onto the namespace's mount list, hash the root respecting
 * shadow order, and notify namespace pollers.
 */
static void commit_tree(struct mount *mnt, struct mount *shadows)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	/* pull the whole subtree onto a private list to tag it */
	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	attach_shadowed(mnt, parent, shadows);
	touch_mnt_namespace(n);
}
864 864
/*
 * Depth-first successor of @p in the mount tree rooted at @root:
 * descend into the first child if any, otherwise climb until a parent
 * with a further sibling is found.  Returns NULL once the walk returns
 * to @root with no more siblings left.
 */
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		/* no children: climb until we find an unvisited sibling */
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}
880 880
881 static struct mount *skip_mnt_tree(struct mount *p) 881 static struct mount *skip_mnt_tree(struct mount *p)
882 { 882 {
883 struct list_head *prev = p->mnt_mounts.prev; 883 struct list_head *prev = p->mnt_mounts.prev;
884 while (prev != &p->mnt_mounts) { 884 while (prev != &p->mnt_mounts) {
885 p = list_entry(prev, struct mount, mnt_child); 885 p = list_entry(prev, struct mount, mnt_child);
886 prev = p->mnt_mounts.prev; 886 prev = p->mnt_mounts.prev;
887 } 887 }
888 return p; 888 return p;
889 } 889 }
890 890
/*
 * Create a new mount of filesystem @type with the given mount @flags,
 * device @name and fs-specific @data.  Allocates the mount, asks the
 * filesystem for its root dentry via mount_fs(), and links the mount
 * onto the superblock's s_mounts list.  MS_KERNMOUNT marks the result
 * as a kernel-internal mount (MNT_INTERNAL).
 *
 * Returns the new vfsmount or an ERR_PTR on failure.
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	/* a freshly created mount is its own parent, rooted at fs root */
	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
924 924
/*
 * Duplicate mount @old, rooted at @root (which must be within @old's
 * filesystem).  @flag is a mask of CL_* bits controlling how the clone
 * relates to the original:
 *
 *  - CL_SLAVE / CL_PRIVATE / CL_SHARED_TO_SLAVE: not a propagation peer
 *    of the original (group id reset);
 *  - CL_MAKE_SHARED: allocate a peer group id and mark shared;
 *  - CL_UNPRIVILEGED: clone is for an unprivileged user namespace, so
 *    lock the current atime/readonly/nodev/nosuid/noexec settings with
 *    the corresponding MNT_LOCK_* bits so they can't be cleared on
 *    remount, and set MNT_LOCKED to keep what's underneath hidden;
 *  - CL_EXPIRE: keep the clone on the same expiry list as the original.
 *
 * Returns the new mount (with a new superblock reference) or an ERR_PTR.
 */
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
	/* Don't allow unprivileged users to change mount flags */
	if (flag & CL_UNPRIVILEGED) {
		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

		if (mnt->mnt.mnt_flags & MNT_READONLY)
			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;

		if (mnt->mnt.mnt_flags & MNT_NODEV)
			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;

		if (mnt->mnt.mnt_flags & MNT_NOSUID)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;

		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
	}

	/* Don't allow unprivileged users to reveal what is under a mount */
	if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
		mnt->mnt.mnt_flags |= MNT_LOCKED;

	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	unlock_mount_hash();

	/* wire up propagation relationships per the CL_* flags */
	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

	return mnt;

 out_free:
	mnt_free_id(mnt);
	free_vfsmnt(mnt);
	return ERR_PTR(err);
}
1007 1007
/*
 * Final teardown of a mount whose refcount has dropped to zero: kill any
 * remaining pins, notify fsnotify, drop the root dentry and superblock
 * references, release the mount id, and free the structure after an RCU
 * grace period.
 */
static void cleanup_mnt(struct mount *mnt)
{
	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair. If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	/*
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
	 */
	WARN_ON(mnt_get_writers(mnt));
	if (unlikely(mnt->mnt_pins.first))
		mnt_pin_kill(mnt);
	fsnotify_vfsmount_delete(&mnt->mnt);
	dput(mnt->mnt.mnt_root);
	deactivate_super(mnt->mnt.mnt_sb);
	mnt_free_id(mnt);
	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}
1029 1029
1030 static void __cleanup_mnt(struct rcu_head *head) 1030 static void __cleanup_mnt(struct rcu_head *head)
1031 { 1031 {
1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu)); 1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1033 } 1033 }
1034 1034
/* Mounts queued for deferred teardown from contexts that can't sleep. */
static LLIST_HEAD(delayed_mntput_list);
/* Workqueue handler: drain the deferred list and clean each mount up. */
static void delayed_mntput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_mntput_list);
	struct llist_node *next;

	for (; node; node = next) {
		next = llist_next(node);
		cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
	}
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1047 1047
/*
 * Drop a reference on @mnt without touching expiry state.  If this was
 * the last reference (and the mount is no longer in a namespace), mark
 * the mount MNT_DOOMED, unlink it from the superblock, and tear it down:
 * directly for kernel-internal mounts, via task_work for normal process
 * context, or via the delayed workqueue otherwise (e.g. kthreads).
 * The RCU read section keeps @mnt alive across the count manipulation.
 */
static void mntput_no_expire(struct mount *mnt)
{
	rcu_read_lock();
	mnt_add_count(mnt, -1);
	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
		rcu_read_unlock();
		return;
	}
	lock_mount_hash();
	if (mnt_get_count(mnt)) {
		/* someone else still holds a reference */
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
		/* another path already committed to tearing this down */
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	mnt->mnt.mnt_flags |= MNT_DOOMED;
	rcu_read_unlock();

	list_del(&mnt->mnt_instance);
	unlock_mount_hash();

	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
		struct task_struct *task = current;
		if (likely(!(task->flags & PF_KTHREAD))) {
			/* defer to task_work so teardown runs in a
			 * context that may sleep */
			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
			if (!task_work_add(task, &mnt->mnt_rcu, true))
				return;
		}
		if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
			schedule_delayed_work(&delayed_mntput_work, 1);
		return;
	}
	cleanup_mnt(mnt);
}
1086 1086
/*
 * Drop a reference on @mnt (NULL is a no-op), clearing the expiry mark
 * first so an in-use mount isn't auto-expired.
 */
void mntput(struct vfsmount *mnt)
{
	if (mnt) {
		struct mount *m = real_mount(mnt);
		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
		if (unlikely(m->mnt_expiry_mark))
			m->mnt_expiry_mark = 0;
		mntput_no_expire(m);
	}
}
EXPORT_SYMBOL(mntput);
1098 1098
1099 struct vfsmount *mntget(struct vfsmount *mnt) 1099 struct vfsmount *mntget(struct vfsmount *mnt)
1100 { 1100 {
1101 if (mnt) 1101 if (mnt)
1102 mnt_add_count(real_mount(mnt), 1); 1102 mnt_add_count(real_mount(mnt), 1);
1103 return mnt; 1103 return mnt;
1104 } 1104 }
1105 EXPORT_SYMBOL(mntget); 1105 EXPORT_SYMBOL(mntget);
1106 1106
/*
 * Clone the mount at @path as a private, kernel-internal mount
 * (CL_PRIVATE + MNT_INTERNAL).  Returns the clone or an ERR_PTR.
 */
struct vfsmount *mnt_clone_internal(struct path *path)
{
	struct mount *p;
	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
	if (IS_ERR(p))
		return ERR_CAST(p);
	p->mnt.mnt_flags |= MNT_INTERNAL;
	return &p->mnt;
}
1116 1116
/* Emit @s to @m, escaping whitespace and backslashes for /proc output. */
static inline void mangle(struct seq_file *m, const char *s)
{
	seq_escape(m, s, " \t\n\\");
}
1121 1121
/*
 * Simple .show_options callback for filesystems which don't want to
 * implement more complex mount option showing.
 *
 * See also save_mount_options().
 *
 * Prints the saved s_options string (escaped) after a leading comma.
 * s_options is read under RCU because replace_mount_options() may swap
 * and free it concurrently.
 */
int generic_show_options(struct seq_file *m, struct dentry *root)
{
	const char *options;

	rcu_read_lock();
	options = rcu_dereference(root->d_sb->s_options);

	if (options != NULL && options[0]) {
		seq_putc(m, ',');
		mangle(m, options);
	}
	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL(generic_show_options);
1144 1144
/*
 * If filesystem uses generic_show_options(), this function should be
 * called from the fill_super() callback.
 *
 * The .remount_fs callback usually needs to be handled in a special
 * way, to make sure, that previous options are not overwritten if the
 * remount fails.
 *
 * Also note, that if the filesystem's .remount_fs function doesn't
 * reset all options to their default value, but changes only newly
 * given options, then the displayed options will not reflect reality
 * any more.
 */
void save_mount_options(struct super_block *sb, char *options)
{
	/* Must only be called once per superblock (from fill_super()). */
	BUG_ON(sb->s_options);
	/* Publish the copy for RCU readers in generic_show_options(). */
	rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
}
EXPORT_SYMBOL(save_mount_options);
1164 1164
/*
 * Swap in a new saved-options string, taking ownership of @options.
 * The old string is freed only after an RCU grace period so that
 * concurrent generic_show_options() readers never see freed memory.
 */
void replace_mount_options(struct super_block *sb, char *options)
{
	char *old = sb->s_options;
	rcu_assign_pointer(sb->s_options, options);
	if (old) {
		/* Wait for RCU readers of the old string before freeing. */
		synchronize_rcu();
		kfree(old);
	}
}
EXPORT_SYMBOL(replace_mount_options);
1175 1175
#ifdef CONFIG_PROC_FS
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_mounts *p = proc_mounts(m);

	down_read(&namespace_sem);
	/*
	 * Fast path: if the namespace has not changed since the cached
	 * position was recorded, resume from the cached entry (or the
	 * one right after it) instead of rescanning from the list head.
	 */
	if (p->cached_event == p->ns->event) {
		void *v = p->cached_mount;
		if (*pos == p->cached_index)
			return v;
		if (*pos == p->cached_index + 1) {
			v = seq_list_next(v, &p->ns->list, &p->cached_index);
			return p->cached_mount = v;
		}
	}

	/* Slow path: walk to *pos from the head and refresh the cache. */
	p->cached_event = p->ns->event;
	p->cached_mount = seq_list_start(&p->ns->list, *pos);
	p->cached_index = *pos;
	return p->cached_mount;
}
1198 1198
1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1200 { 1200 {
1201 struct proc_mounts *p = proc_mounts(m); 1201 struct proc_mounts *p = proc_mounts(m);
1202 1202
1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos); 1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1204 p->cached_index = *pos; 1204 p->cached_index = *pos;
1205 return p->cached_mount; 1205 return p->cached_mount;
1206 } 1206 }
1207 1207
static void m_stop(struct seq_file *m, void *v)
{
	/* Pairs with the down_read() taken in m_start(). */
	up_read(&namespace_sem);
}
1212 1212
1213 static int m_show(struct seq_file *m, void *v) 1213 static int m_show(struct seq_file *m, void *v)
1214 { 1214 {
1215 struct proc_mounts *p = proc_mounts(m); 1215 struct proc_mounts *p = proc_mounts(m);
1216 struct mount *r = list_entry(v, struct mount, mnt_list); 1216 struct mount *r = list_entry(v, struct mount, mnt_list);
1217 return p->show(m, &r->mnt); 1217 return p->show(m, &r->mnt);
1218 } 1218 }
1219 1219
/* seq_file operations backing the mount tables shown under /proc. */
const struct seq_operations mounts_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= m_show,
};
#endif /* CONFIG_PROC_FS */
1227 1227
/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * Walks every mount in the tree rooted at @m and compares the actual
 * reference counts against the minimum expected for an idle tree.
 * Returns 1 when nothing (open files, pwds, chroots, busy submounts)
 * holds an extra reference, 0 otherwise.
 */
int may_umount_tree(struct vfsmount *m)
{
	struct mount *root = real_mount(m);
	struct mount *p;
	int refs = 0;
	int min_refs = 0;

	BUG_ON(!m);

	/* write lock needed for mnt_get_count */
	lock_mount_hash();
	for (p = root; p; p = next_mnt(p, root)) {
		refs += mnt_get_count(p);
		min_refs += 2;
	}
	unlock_mount_hash();

	return refs <= min_refs;
}

EXPORT_SYMBOL(may_umount_tree);
1259 1259
1260 /** 1260 /**
1261 * may_umount - check if a mount point is busy 1261 * may_umount - check if a mount point is busy
1262 * @mnt: root of mount 1262 * @mnt: root of mount
1263 * 1263 *
1264 * This is called to check if a mount point has any 1264 * This is called to check if a mount point has any
1265 * open files, pwds, chroots or sub mounts. If the 1265 * open files, pwds, chroots or sub mounts. If the
1266 * mount has sub mounts this will return busy 1266 * mount has sub mounts this will return busy
1267 * regardless of whether the sub mounts are busy. 1267 * regardless of whether the sub mounts are busy.
1268 * 1268 *
1269 * Doesn't take quota and stuff into account. IOW, in some cases it will 1269 * Doesn't take quota and stuff into account. IOW, in some cases it will
1270 * give false negatives. The main reason why it's here is that we need 1270 * give false negatives. The main reason why it's here is that we need
1271 * a non-destructive way to look for easily umountable filesystems. 1271 * a non-destructive way to look for easily umountable filesystems.
1272 */ 1272 */
1273 int may_umount(struct vfsmount *mnt) 1273 int may_umount(struct vfsmount *mnt)
1274 { 1274 {
1275 int ret = 1; 1275 int ret = 1;
1276 down_read(&namespace_sem); 1276 down_read(&namespace_sem);
1277 lock_mount_hash(); 1277 lock_mount_hash();
1278 if (propagate_mount_busy(real_mount(mnt), 2)) 1278 if (propagate_mount_busy(real_mount(mnt), 2))
1279 ret = 0; 1279 ret = 0;
1280 unlock_mount_hash(); 1280 unlock_mount_hash();
1281 up_read(&namespace_sem); 1281 up_read(&namespace_sem);
1282 return ret; 1282 return ret;
1283 } 1283 }
1284 1284
1285 EXPORT_SYMBOL(may_umount); 1285 EXPORT_SYMBOL(may_umount);
1286 1286
static HLIST_HEAD(unmounted);	/* protected by namespace_sem */

/*
 * Drop namespace_sem and dispose of every mount that umount_tree()
 * queued on "unmounted" while the lock was held.  The final mntput()s
 * happen only after an RCU grace period, so lockless path walks that
 * may still reference these mounts have finished first.
 */
static void namespace_unlock(void)
{
	struct mount *mnt;
	struct hlist_head head = unmounted;

	if (likely(hlist_empty(&head))) {
		up_write(&namespace_sem);
		return;
	}

	/* Steal the list: repoint pprev at the local copy, reset global. */
	head.first->pprev = &head.first;
	INIT_HLIST_HEAD(&unmounted);

	/* undo decrements we'd done in umount_tree() */
	hlist_for_each_entry(mnt, &head, mnt_hash)
		if (mnt->mnt_ex_mountpoint.mnt)
			mntget(mnt->mnt_ex_mountpoint.mnt);

	up_write(&namespace_sem);

	/* Let any RCU-protected walkers finish before tearing down. */
	synchronize_rcu();

	while (!hlist_empty(&head)) {
		mnt = hlist_entry(head.first, struct mount, mnt_hash);
		hlist_del_init(&mnt->mnt_hash);
		if (mnt->mnt_ex_mountpoint.mnt)
			path_put(&mnt->mnt_ex_mountpoint);
		mntput(&mnt->mnt);
	}
}

/* Take namespace_sem for write; released by namespace_unlock(). */
static inline void namespace_lock(void)
{
	down_write(&namespace_sem);
}
1324 1324
/*
 * mount_lock must be held
 * namespace_sem must be held for write
 * how = 0 => just this tree, don't propagate
 * how = 1 => propagate; we know that nobody else has reference to any victims
 * how = 2 => lazy umount
 */
void umount_tree(struct mount *mnt, int how)
{
	HLIST_HEAD(tmp_list);
	struct mount *p;
	struct mount *last = NULL;

	/* Collect the whole subtree rooted at @mnt onto a private list. */
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		hlist_del_init_rcu(&p->mnt_hash);
		hlist_add_head(&p->mnt_hash, &tmp_list);
	}

	hlist_for_each_entry(p, &tmp_list, mnt_hash)
		list_del_init(&p->mnt_child);

	if (how)
		propagate_umount(&tmp_list);

	/* Detach each victim from its namespace and its parent. */
	hlist_for_each_entry(p, &tmp_list, mnt_hash) {
		list_del_init(&p->mnt_expire);
		list_del_init(&p->mnt_list);
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
		if (how < 2)
			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
		if (mnt_has_parent(p)) {
			hlist_del_init(&p->mnt_mp_list);
			put_mountpoint(p->mnt_mp);
			mnt_add_count(p->mnt_parent, -1);
			/* move the reference to mountpoint into ->mnt_ex_mountpoint */
			p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
			p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
			p->mnt_mountpoint = p->mnt.mnt_root;
			p->mnt_parent = p;
			p->mnt_mp = NULL;
		}
		change_mnt_propagation(p, MS_PRIVATE);
		last = p;
	}
	/*
	 * Splice tmp_list onto the global "unmounted" list; the actual
	 * teardown happens later in namespace_unlock().
	 */
	if (last) {
		last->mnt_hash.next = unmounted.first;
		if (unmounted.first)
			unmounted.first->pprev = &last->mnt_hash.next;
		unmounted.first = tmp_list.first;
		unmounted.first->pprev = &unmounted.first;
	}
}
1378 1378
static void shrink_submounts(struct mount *mnt);

/*
 * Core of umount(2): detach @mnt from the namespace according to
 * @flags (MNT_FORCE, MNT_DETACH, MNT_EXPIRE).  Returns 0 on success
 * or a negative errno.
 */
static int do_umount(struct mount *mnt, int flags)
{
	struct super_block *sb = mnt->mnt.mnt_sb;
	int retval;

	retval = security_sb_umount(&mnt->mnt, flags);
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 * (1) the mark is already set (the mark is cleared by mntput())
	 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
		if (&mnt->mnt == current->fs->root.mnt ||
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

		/*
		 * probably don't strictly need the lock here if we examined
		 * all race cases, but it's a slowpath.
		 */
		lock_mount_hash();
		if (mnt_get_count(mnt) != 2) {
			unlock_mount_hash();
			return -EBUSY;
		}
		unlock_mount_hash();

		if (!xchg(&mnt->mnt_expiry_mark, 1))
			return -EAGAIN;
	}

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee thats tricky lets do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. Thats for the mount program to worry
	 * about for the moment.
	 */

	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
		sb->s_op->umount_begin(sb);
	}

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(9). Then init(8) could umount root and exec /reboot.
	 */
	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		down_write(&sb->s_umount);
		if (!(sb->s_flags & MS_RDONLY))
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
		up_write(&sb->s_umount);
		return retval;
	}

	namespace_lock();
	lock_mount_hash();
	event++;

	if (flags & MNT_DETACH) {
		/* Lazy umount: detach now, finish when the last user drops. */
		if (!list_empty(&mnt->mnt_list))
			umount_tree(mnt, 2);
		retval = 0;
	} else {
		shrink_submounts(mnt);
		retval = -EBUSY;
		/* Refuse unless the whole propagation group is idle. */
		if (!propagate_mount_busy(mnt, 2)) {
			if (!list_empty(&mnt->mnt_list))
				umount_tree(mnt, 1);
			retval = 0;
		}
	}
	unlock_mount_hash();
	namespace_unlock();
	return retval;
}
1474 1474
/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to loose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry)
{
	struct mountpoint *mp;
	struct mount *mnt;

	namespace_lock();
	mp = lookup_mountpoint(dentry);
	if (!mp)
		goto out_unlock;	/* nothing is mounted here */

	/* Lazily unmount every mount still attached to this mountpoint. */
	lock_mount_hash();
	while (!hlist_empty(&mp->m_list)) {
		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
		umount_tree(mnt, 2);
	}
	unlock_mount_hash();
	put_mountpoint(mp);
out_unlock:
	namespace_unlock();
}
1505 1505
/*
 * Is the caller allowed to modify his namespace?
 * Requires CAP_SYS_ADMIN in the user namespace owning the caller's
 * mount namespace.
 */
static inline bool may_mount(void)
{
	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
1513 1513
/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	struct path path;
	struct mount *mnt;
	int retval;
	int lookup_flags = 0;

	/* Reject any flag bits we don't understand. */
	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
		return -EINVAL;

	if (!may_mount())
		return -EPERM;

	if (!(flags & UMOUNT_NOFOLLOW))
		lookup_flags |= LOOKUP_FOLLOW;

	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
	if (retval)
		goto out;
	mnt = real_mount(path.mnt);
	retval = -EINVAL;
	if (path.dentry != path.mnt->mnt_root)
		goto dput_and_out;	/* target is not the root of a mount */
	if (!check_mnt(mnt))
		goto dput_and_out;	/* not attached in our namespace */
	if (mnt->mnt.mnt_flags & MNT_LOCKED)
		goto dput_and_out;	/* locked against this namespace */

	retval = do_umount(mnt, flags);
dput_and_out:
	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
	dput(path.dentry);
	mntput_no_expire(mnt);
out:
	return retval;
}
1558 1558
#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 * The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
	return sys_umount(name, 0);
}

#endif
1570 1570
1571 static bool is_mnt_ns_file(struct dentry *dentry) 1571 static bool is_mnt_ns_file(struct dentry *dentry)
1572 { 1572 {
1573 /* Is this a proxy for a mount namespace? */ 1573 /* Is this a proxy for a mount namespace? */
1574 struct inode *inode = dentry->d_inode; 1574 struct inode *inode = dentry->d_inode;
1575 struct proc_ns *ei; 1575 struct proc_ns *ei;
1576 1576
1577 if (!proc_ns_inode(inode)) 1577 if (!proc_ns_inode(inode))
1578 return false; 1578 return false;
1579 1579
1580 ei = get_proc_ns(inode); 1580 ei = get_proc_ns(inode);
1581 if (ei->ns_ops != &mntns_operations) 1581 if (ei->ns_ops != &mntns_operations)
1582 return false; 1582 return false;
1583 1583
1584 return true; 1584 return true;
1585 } 1585 }
1586 1586
1587 static bool mnt_ns_loop(struct dentry *dentry) 1587 static bool mnt_ns_loop(struct dentry *dentry)
1588 { 1588 {
1589 /* Could bind mounting the mount namespace inode cause a 1589 /* Could bind mounting the mount namespace inode cause a
1590 * mount namespace loop? 1590 * mount namespace loop?
1591 */ 1591 */
1592 struct mnt_namespace *mnt_ns; 1592 struct mnt_namespace *mnt_ns;
1593 if (!is_mnt_ns_file(dentry)) 1593 if (!is_mnt_ns_file(dentry))
1594 return false; 1594 return false;
1595 1595
1596 mnt_ns = get_proc_ns(dentry->d_inode)->ns; 1596 mnt_ns = get_proc_ns(dentry->d_inode)->ns;
1597 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1597 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1598 } 1598 }
1599 1599
/*
 * Clone the mount subtree of @mnt that hangs below @dentry.  @flag is
 * a mask of CL_* options controlling what may be copied (unbindable
 * mounts, mount-namespace files, locked state).  Returns the root of
 * the new tree, or an ERR_PTR() on failure; on failure any partially
 * built copy is torn down.
 */
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
					int flag)
{
	struct mount *res, *p, *q, *r, *parent;

	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
		return ERR_PTR(-EINVAL);

	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
		return ERR_PTR(-EINVAL);

	/* Clone the root of the tree first. */
	res = q = clone_mnt(mnt, dentry, flag);
	if (IS_ERR(q))
		return q;

	q->mnt.mnt_flags &= ~MNT_LOCKED;
	q->mnt_mountpoint = mnt->mnt_mountpoint;

	/*
	 * Walk the original tree; p tracks our position in the source,
	 * q the corresponding position in the copy.
	 */
	p = mnt;
	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
		struct mount *s;
		if (!is_subdir(r->mnt_mountpoint, dentry))
			continue;

		for (s = r; s; s = next_mnt(s, r)) {
			struct mount *t = NULL;
			if (!(flag & CL_COPY_UNBINDABLE) &&
			    IS_MNT_UNBINDABLE(s)) {
				s = skip_mnt_tree(s);
				continue;
			}
			if (!(flag & CL_COPY_MNT_NS_FILE) &&
			    is_mnt_ns_file(s->mnt.mnt_root)) {
				s = skip_mnt_tree(s);
				continue;
			}
			/* Climb back up until p is the parent of s. */
			while (p != s->mnt_parent) {
				p = p->mnt_parent;
				q = q->mnt_parent;
			}
			p = s;
			parent = q;
			q = clone_mnt(p, p->mnt.mnt_root, flag);
			if (IS_ERR(q))
				goto out;
			lock_mount_hash();
			list_add_tail(&q->mnt_list, &res->mnt_list);
			mnt_set_mountpoint(parent, p->mnt_mp, q);
			/*
			 * If the last child copied onto @parent shares the
			 * same mountpoint, attach the clone behind it to
			 * preserve the original shadowing order.
			 */
			if (!list_empty(&parent->mnt_mounts)) {
				t = list_last_entry(&parent->mnt_mounts,
					struct mount, mnt_child);
				if (t->mnt_mp != p->mnt_mp)
					t = NULL;
			}
			attach_shadowed(q, parent, t);
			unlock_mount_hash();
		}
	}
	return res;
out:
	/* Tear down whatever part of the copy we managed to build. */
	if (res) {
		lock_mount_hash();
		umount_tree(res, 0);
		unlock_mount_hash();
	}
	return q;
}
1667 1667
1668 /* Caller should check returned pointer for errors */ 1668 /* Caller should check returned pointer for errors */
1669 1669
1670 struct vfsmount *collect_mounts(struct path *path) 1670 struct vfsmount *collect_mounts(struct path *path)
1671 { 1671 {
1672 struct mount *tree; 1672 struct mount *tree;
1673 namespace_lock(); 1673 namespace_lock();
1674 tree = copy_tree(real_mount(path->mnt), path->dentry, 1674 tree = copy_tree(real_mount(path->mnt), path->dentry,
1675 CL_COPY_ALL | CL_PRIVATE); 1675 CL_COPY_ALL | CL_PRIVATE);
1676 namespace_unlock(); 1676 namespace_unlock();
1677 if (IS_ERR(tree)) 1677 if (IS_ERR(tree))
1678 return ERR_CAST(tree); 1678 return ERR_CAST(tree);
1679 return &tree->mnt; 1679 return &tree->mnt;
1680 } 1680 }
1681 1681
/* Release a tree previously obtained from collect_mounts(). */
void drop_collected_mounts(struct vfsmount *mnt)
{
	namespace_lock();
	lock_mount_hash();
	umount_tree(real_mount(mnt), 0);
	unlock_mount_hash();
	namespace_unlock();
}
1690 1690
1691 /** 1691 /**
1692 * clone_private_mount - create a private clone of a path 1692 * clone_private_mount - create a private clone of a path
1693 * 1693 *
1694 * This creates a new vfsmount, which will be the clone of @path. The new will 1694 * This creates a new vfsmount, which will be the clone of @path. The new will
1695 * not be attached anywhere in the namespace and will be private (i.e. changes 1695 * not be attached anywhere in the namespace and will be private (i.e. changes
1696 * to the originating mount won't be propagated into this). 1696 * to the originating mount won't be propagated into this).
1697 * 1697 *
1698 * Release with mntput(). 1698 * Release with mntput().
1699 */ 1699 */
1700 struct vfsmount *clone_private_mount(struct path *path) 1700 struct vfsmount *clone_private_mount(struct path *path)
1701 { 1701 {
1702 struct mount *old_mnt = real_mount(path->mnt); 1702 struct mount *old_mnt = real_mount(path->mnt);
1703 struct mount *new_mnt; 1703 struct mount *new_mnt;
1704 1704
1705 if (IS_MNT_UNBINDABLE(old_mnt)) 1705 if (IS_MNT_UNBINDABLE(old_mnt))
1706 return ERR_PTR(-EINVAL); 1706 return ERR_PTR(-EINVAL);
1707 1707
1708 down_read(&namespace_sem); 1708 down_read(&namespace_sem);
1709 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); 1709 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
1710 up_read(&namespace_sem); 1710 up_read(&namespace_sem);
1711 if (IS_ERR(new_mnt)) 1711 if (IS_ERR(new_mnt))
1712 return ERR_CAST(new_mnt); 1712 return ERR_CAST(new_mnt);
1713 1713
1714 return &new_mnt->mnt; 1714 return &new_mnt->mnt;
1715 } 1715 }
1716 EXPORT_SYMBOL_GPL(clone_private_mount); 1716 EXPORT_SYMBOL_GPL(clone_private_mount);
1717 1717
1718 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, 1718 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1719 struct vfsmount *root) 1719 struct vfsmount *root)
1720 { 1720 {
1721 struct mount *mnt; 1721 struct mount *mnt;
1722 int res = f(root, arg); 1722 int res = f(root, arg);
1723 if (res) 1723 if (res)
1724 return res; 1724 return res;
1725 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { 1725 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
1726 res = f(&mnt->mnt, arg); 1726 res = f(&mnt->mnt, arg);
1727 if (res) 1727 if (res)
1728 return res; 1728 return res;
1729 } 1729 }
1730 return 0; 1730 return 0;
1731 } 1731 }
1732 1732
1733 static void cleanup_group_ids(struct mount *mnt, struct mount *end) 1733 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
1734 { 1734 {
1735 struct mount *p; 1735 struct mount *p;
1736 1736
1737 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 1737 for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1738 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 1738 if (p->mnt_group_id && !IS_MNT_SHARED(p))
1739 mnt_release_group_id(p); 1739 mnt_release_group_id(p);
1740 } 1740 }
1741 } 1741 }
1742 1742
1743 static int invent_group_ids(struct mount *mnt, bool recurse) 1743 static int invent_group_ids(struct mount *mnt, bool recurse)
1744 { 1744 {
1745 struct mount *p; 1745 struct mount *p;
1746 1746
1747 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { 1747 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
1748 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { 1748 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
1749 int err = mnt_alloc_group_id(p); 1749 int err = mnt_alloc_group_id(p);
1750 if (err) { 1750 if (err) {
1751 cleanup_group_ids(mnt, p); 1751 cleanup_group_ids(mnt, p);
1752 return err; 1752 return err;
1753 } 1753 }
1754 } 1754 }
1755 } 1755 }
1756 1756
1757 return 0; 1757 return 0;
1758 } 1758 }
1759 1759
1760 /* 1760 /*
1761 * @source_mnt : mount tree to be attached 1761 * @source_mnt : mount tree to be attached
1762 * @nd : place the mount tree @source_mnt is attached 1762 * @nd : place the mount tree @source_mnt is attached
1763 * @parent_nd : if non-null, detach the source_mnt from its parent and 1763 * @parent_nd : if non-null, detach the source_mnt from its parent and
1764 * store the parent mount and mountpoint dentry. 1764 * store the parent mount and mountpoint dentry.
1765 * (done when source_mnt is moved) 1765 * (done when source_mnt is moved)
1766 * 1766 *
 * NOTE: the table below explains the semantics when a source mount
1768 * of a given type is attached to a destination mount of a given type. 1768 * of a given type is attached to a destination mount of a given type.
1769 * --------------------------------------------------------------------------- 1769 * ---------------------------------------------------------------------------
1770 * | BIND MOUNT OPERATION | 1770 * | BIND MOUNT OPERATION |
1771 * |************************************************************************** 1771 * |**************************************************************************
1772 * | source-->| shared | private | slave | unbindable | 1772 * | source-->| shared | private | slave | unbindable |
1773 * | dest | | | | | 1773 * | dest | | | | |
1774 * | | | | | | | 1774 * | | | | | | |
1775 * | v | | | | | 1775 * | v | | | | |
1776 * |************************************************************************** 1776 * |**************************************************************************
1777 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 1777 * | shared | shared (++) | shared (+) | shared(+++)| invalid |
1778 * | | | | | | 1778 * | | | | | |
1779 * |non-shared| shared (+) | private | slave (*) | invalid | 1779 * |non-shared| shared (+) | private | slave (*) | invalid |
1780 * *************************************************************************** 1780 * ***************************************************************************
1781 * A bind operation clones the source mount and mounts the clone on the 1781 * A bind operation clones the source mount and mounts the clone on the
1782 * destination mount. 1782 * destination mount.
1783 * 1783 *
1784 * (++) the cloned mount is propagated to all the mounts in the propagation 1784 * (++) the cloned mount is propagated to all the mounts in the propagation
1785 * tree of the destination mount and the cloned mount is added to 1785 * tree of the destination mount and the cloned mount is added to
1786 * the peer group of the source mount. 1786 * the peer group of the source mount.
1787 * (+) the cloned mount is created under the destination mount and is marked 1787 * (+) the cloned mount is created under the destination mount and is marked
1788 * as shared. The cloned mount is added to the peer group of the source 1788 * as shared. The cloned mount is added to the peer group of the source
1789 * mount. 1789 * mount.
1790 * (+++) the mount is propagated to all the mounts in the propagation tree 1790 * (+++) the mount is propagated to all the mounts in the propagation tree
1791 * of the destination mount and the cloned mount is made slave 1791 * of the destination mount and the cloned mount is made slave
1792 * of the same master as that of the source mount. The cloned mount 1792 * of the same master as that of the source mount. The cloned mount
1793 * is marked as 'shared and slave'. 1793 * is marked as 'shared and slave'.
1794 * (*) the cloned mount is made a slave of the same master as that of the 1794 * (*) the cloned mount is made a slave of the same master as that of the
1795 * source mount. 1795 * source mount.
1796 * 1796 *
1797 * --------------------------------------------------------------------------- 1797 * ---------------------------------------------------------------------------
1798 * | MOVE MOUNT OPERATION | 1798 * | MOVE MOUNT OPERATION |
1799 * |************************************************************************** 1799 * |**************************************************************************
1800 * | source-->| shared | private | slave | unbindable | 1800 * | source-->| shared | private | slave | unbindable |
1801 * | dest | | | | | 1801 * | dest | | | | |
1802 * | | | | | | | 1802 * | | | | | | |
1803 * | v | | | | | 1803 * | v | | | | |
1804 * |************************************************************************** 1804 * |**************************************************************************
1805 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 1805 * | shared | shared (+) | shared (+) | shared(+++) | invalid |
1806 * | | | | | | 1806 * | | | | | |
1807 * |non-shared| shared (+*) | private | slave (*) | unbindable | 1807 * |non-shared| shared (+*) | private | slave (*) | unbindable |
1808 * *************************************************************************** 1808 * ***************************************************************************
1809 * 1809 *
1810 * (+) the mount is moved to the destination. And is then propagated to 1810 * (+) the mount is moved to the destination. And is then propagated to
1811 * all the mounts in the propagation tree of the destination mount. 1811 * all the mounts in the propagation tree of the destination mount.
1812 * (+*) the mount is moved to the destination. 1812 * (+*) the mount is moved to the destination.
1813 * (+++) the mount is moved to the destination and is then propagated to 1813 * (+++) the mount is moved to the destination and is then propagated to
1814 * all the mounts belonging to the destination mount's propagation tree. 1814 * all the mounts belonging to the destination mount's propagation tree.
1815 * the mount is marked as 'shared and slave'. 1815 * the mount is marked as 'shared and slave'.
1816 * (*) the mount continues to be a slave at the new location. 1816 * (*) the mount continues to be a slave at the new location.
1817 * 1817 *
1818 * if the source mount is a tree, the operations explained above is 1818 * if the source mount is a tree, the operations explained above is
1819 * applied to each mount in the tree. 1819 * applied to each mount in the tree.
1820 * Must be called without spinlocks held, since this function can sleep 1820 * Must be called without spinlocks held, since this function can sleep
1821 * in allocations. 1821 * in allocations.
1822 */ 1822 */
static int attach_recursive_mnt(struct mount *source_mnt,
			struct mount *dest_mnt,
			struct mountpoint *dest_mp,
			struct path *parent_path)
{
	HLIST_HEAD(tree_list);
	struct mount *child, *p;
	struct hlist_node *n;
	int err;

	if (IS_MNT_SHARED(dest_mnt)) {
		/*
		 * Destination is shared: the whole source tree becomes
		 * shared and copies of it must be propagated to every
		 * peer/slave of the destination.  Allocate peer-group
		 * ids first so failure is handled before anything is
		 * hashed; propagate_mnt() collects the copies on
		 * tree_list for committing below.
		 */
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
		lock_mount_hash();
		if (err)
			goto out_cleanup_ids;
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	} else {
		lock_mount_hash();
	}
	if (parent_path) {
		/* Move: detach from the old parent, attach at the new one. */
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, dest_mnt, dest_mp);
		touch_mnt_namespace(source_mnt->mnt_ns);
	} else {
		/* Fresh attach: hook the tree up and hash it in. */
		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
		commit_tree(source_mnt, NULL);
	}

	/* Commit each propagated copy under its respective parent. */
	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
		struct mount *q;
		hlist_del_init(&child->mnt_hash);
		q = __lookup_mnt_last(&child->mnt_parent->mnt,
				      child->mnt_mountpoint);
		commit_tree(child, q);
	}
	unlock_mount_hash();

	return 0;

 out_cleanup_ids:
	/* Tear down any copies propagate_mnt() managed to create. */
	while (!hlist_empty(&tree_list)) {
		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
		umount_tree(child, 0);
	}
	unlock_mount_hash();
	cleanup_group_ids(source_mnt, NULL);
 out:
	return err;
}
1876 1876
/*
 * Find the final mountpoint for @path — following anything already
 * mounted on top of it — take the dentry's i_mutex plus namespace_sem,
 * and return a pinned struct mountpoint.  Pair with unlock_mount().
 */
static struct mountpoint *lock_mount(struct path *path)
{
	struct vfsmount *mnt;
	struct dentry *dentry = path->dentry;
retry:
	mutex_lock(&dentry->d_inode->i_mutex);
	/* The dentry was flagged as unusable for mounting (e.g. removed). */
	if (unlikely(cant_mount(dentry))) {
		mutex_unlock(&dentry->d_inode->i_mutex);
		return ERR_PTR(-ENOENT);
	}
	namespace_lock();
	mnt = lookup_mnt(path);
	if (likely(!mnt)) {
		/* Nothing mounted here: reuse or create the mountpoint.
		 * On success both locks stay held for the caller. */
		struct mountpoint *mp = lookup_mountpoint(dentry);
		if (!mp)
			mp = new_mountpoint(dentry);
		if (IS_ERR(mp)) {
			namespace_unlock();
			mutex_unlock(&dentry->d_inode->i_mutex);
			return mp;
		}
		return mp;
	}
	/*
	 * Something is mounted on this dentry: drop the locks, step up
	 * onto the covering mount's root and try again from there.
	 */
	namespace_unlock();
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	path_put(path);
	path->mnt = mnt;
	dentry = path->dentry = dget(mnt->mnt_root);
	goto retry;
}
1907 1907
1908 static void unlock_mount(struct mountpoint *where) 1908 static void unlock_mount(struct mountpoint *where)
1909 { 1909 {
1910 struct dentry *dentry = where->m_dentry; 1910 struct dentry *dentry = where->m_dentry;
1911 put_mountpoint(where); 1911 put_mountpoint(where);
1912 namespace_unlock(); 1912 namespace_unlock();
1913 mutex_unlock(&dentry->d_inode->i_mutex); 1913 mutex_unlock(&dentry->d_inode->i_mutex);
1914 } 1914 }
1915 1915
1916 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) 1916 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
1917 { 1917 {
1918 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) 1918 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
1919 return -EINVAL; 1919 return -EINVAL;
1920 1920
1921 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != 1921 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
1922 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) 1922 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
1923 return -ENOTDIR; 1923 return -ENOTDIR;
1924 1924
1925 return attach_recursive_mnt(mnt, p, mp, NULL); 1925 return attach_recursive_mnt(mnt, p, mp, NULL);
1926 } 1926 }
1927 1927
1928 /* 1928 /*
1929 * Sanity check the flags to change_mnt_propagation. 1929 * Sanity check the flags to change_mnt_propagation.
1930 */ 1930 */
1931 1931
1932 static int flags_to_propagation_type(int flags) 1932 static int flags_to_propagation_type(int flags)
1933 { 1933 {
1934 int type = flags & ~(MS_REC | MS_SILENT); 1934 int type = flags & ~(MS_REC | MS_SILENT);
1935 1935
1936 /* Fail if any non-propagation flags are set */ 1936 /* Fail if any non-propagation flags are set */
1937 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1937 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1938 return 0; 1938 return 0;
1939 /* Only one propagation flag should be set */ 1939 /* Only one propagation flag should be set */
1940 if (!is_power_of_2(type)) 1940 if (!is_power_of_2(type))
1941 return 0; 1941 return 0;
1942 return type; 1942 return type;
1943 } 1943 }
1944 1944
/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct path *path, int flag)
{
	struct mount *m;
	struct mount *mnt = real_mount(path->mnt);
	int recurse = flag & MS_REC;
	int type;
	int err = 0;

	/* Propagation type may only be changed at a mount's root. */
	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

	namespace_lock();
	if (type == MS_SHARED) {
		/* Going shared needs peer-group ids allocated up front,
		 * while failure can still be reported cleanly. */
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

	lock_mount_hash();
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	unlock_mount_hash();

 out_unlock:
	namespace_unlock();
	return err;
}
1979 1979
1980 static bool has_locked_children(struct mount *mnt, struct dentry *dentry) 1980 static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
1981 { 1981 {
1982 struct mount *child; 1982 struct mount *child;
1983 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 1983 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
1984 if (!is_subdir(child->mnt_mountpoint, dentry)) 1984 if (!is_subdir(child->mnt_mountpoint, dentry))
1985 continue; 1985 continue;
1986 1986
1987 if (child->mnt.mnt_flags & MNT_LOCKED) 1987 if (child->mnt.mnt_flags & MNT_LOCKED)
1988 return true; 1988 return true;
1989 } 1989 }
1990 return false; 1990 return false;
1991 } 1991 }
1992 1992
/*
 * do loopback mount.
 */
static int do_loopback(struct path *path, const char *old_name,
				int recurse)
{
	struct path old_path;
	struct mount *mnt = NULL, *old, *parent;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
	if (err)
		return err;

	err = -EINVAL;
	/* Binding a mount-namespace file could create a namespace loop. */
	if (mnt_ns_loop(old_path.dentry))
		goto out;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	parent = real_mount(path->mnt);

	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old))
		goto out2;

	/* Source and target must both be in the caller's namespace. */
	if (!check_mnt(parent) || !check_mnt(old))
		goto out2;

	/* A non-recursive bind must not strip away locked children. */
	if (!recurse && has_locked_children(old, old_path.dentry))
		goto out2;

	if (recurse)
		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
	else
		mnt = clone_mnt(old, old_path.dentry, 0);

	if (IS_ERR(mnt)) {
		err = PTR_ERR(mnt);
		goto out2;
	}

	/* The clone is a fresh mount; it is never locked to its parent. */
	mnt->mnt.mnt_flags &= ~MNT_LOCKED;

	err = graft_tree(mnt, parent, mp);
	if (err) {
		/* Attachment failed: dispose of the clone(s). */
		lock_mount_hash();
		umount_tree(mnt, 0);
		unlock_mount_hash();
	}
out2:
	unlock_mount(mp);
out:
	path_put(&old_path);
	return err;
}
2055 2055
2056 static int change_mount_flags(struct vfsmount *mnt, int ms_flags) 2056 static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
2057 { 2057 {
2058 int error = 0; 2058 int error = 0;
2059 int readonly_request = 0; 2059 int readonly_request = 0;
2060 2060
2061 if (ms_flags & MS_RDONLY) 2061 if (ms_flags & MS_RDONLY)
2062 readonly_request = 1; 2062 readonly_request = 1;
2063 if (readonly_request == __mnt_is_readonly(mnt)) 2063 if (readonly_request == __mnt_is_readonly(mnt))
2064 return 0; 2064 return 0;
2065 2065
2066 if (readonly_request) 2066 if (readonly_request)
2067 error = mnt_make_readonly(real_mount(mnt)); 2067 error = mnt_make_readonly(real_mount(mnt));
2068 else 2068 else
2069 __mnt_unmake_readonly(real_mount(mnt)); 2069 __mnt_unmake_readonly(real_mount(mnt));
2070 return error; 2070 return error;
2071 } 2071 }
2072 2072
/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int flags, int mnt_flags,
		      void *data)
{
	int err;
	struct super_block *sb = path->mnt->mnt_sb;
	struct mount *mnt = real_mount(path->mnt);

	if (!check_mnt(mnt))
		return -EINVAL;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	/* Don't allow changing of locked mnt flags.
	 *
	 * No locks need to be held here while testing the various
	 * MNT_LOCK flags because those flags can never be cleared
	 * once they are set.
	 */
	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
	    !(mnt_flags & MNT_READONLY)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
	    !(mnt_flags & MNT_NODEV)) {
		/* Was the nodev implicitly added in mount?  If the mount
		 * was created in a user namespace on a filesystem without
		 * FS_USERNS_DEV_MOUNT, MNT_NODEV was forced on at mount
		 * time; re-add it implicitly here instead of failing the
		 * remount for a flag the user never asked for. */
		if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
		    !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			mnt_flags |= MNT_NODEV;
		} else {
			return -EPERM;
		}
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
	    !(mnt_flags & MNT_NOSUID)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
	    !(mnt_flags & MNT_NOEXEC)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
	    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
		return -EPERM;
	}

	err = security_sb_remount(sb, data);
	if (err)
		return err;

	down_write(&sb->s_umount);
	if (flags & MS_BIND)
		/* Per-mount remount: only the read-only state changes. */
		err = change_mount_flags(path->mnt, flags);
	else if (!capable(CAP_SYS_ADMIN))
		err = -EPERM;
	else
		err = do_remount_sb(sb, flags, data, 0);
	if (!err) {
		lock_mount_hash();
		/* Keep flags that userspace is not allowed to touch. */
		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
		mnt->mnt.mnt_flags = mnt_flags;
		touch_mnt_namespace(mnt->mnt_ns);
		unlock_mount_hash();
	}
	up_write(&sb->s_umount);
	return err;
}
2139 2145
/* Return 1 if any mount in the subtree rooted at @mnt is unbindable. */
static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p = mnt;

	while (p) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
		p = next_mnt(p, mnt);
	}

	return 0;
}
2149 2155
/*
 * Move the mount at @old_name so it becomes attached at @path
 * (MS_MOVE).  The source must be a root of a mount in the caller's
 * namespace, not locked to its parent, and the move must not create a
 * cycle or violate propagation rules.
 */
static int do_move_mount(struct path *path, const char *old_name)
{
	struct path old_path, parent_path;
	struct mount *p;
	struct mount *old;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	p = real_mount(path->mnt);

	err = -EINVAL;
	/* Both ends must live in the caller's mount namespace. */
	if (!check_mnt(p) || !check_mnt(old))
		goto out1;

	/* A mount locked to its parent may not be moved away from it. */
	if (old->mnt.mnt_flags & MNT_LOCKED)
		goto out1;

	err = -EINVAL;
	/* Only a whole mount (its root) can be moved. */
	if (old_path.dentry != old_path.mnt->mnt_root)
		goto out1;

	if (!mnt_has_parent(old))
		goto out1;

	/* Directory onto directory, non-directory onto non-directory. */
	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_path.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (IS_MNT_SHARED(old->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
		goto out1;
	err = -ELOOP;
	/* Moving a mount underneath its own subtree would create a cycle. */
	for (; mnt_has_parent(p); p = p->mnt_parent)
		if (p == old)
			goto out1;

	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
	if (err)
		goto out1;

	/* if the mount is moved, it should no longer be expire
	 * automatically */
	list_del_init(&old->mnt_expire);
out1:
	unlock_mount(mp);
out:
	/* On success, parent_path holds the old attachment to drop. */
	if (!err)
		path_put(&parent_path);
	path_put(&old_path);
	return err;
}
2219 2225
2220 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 2226 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
2221 { 2227 {
2222 int err; 2228 int err;
2223 const char *subtype = strchr(fstype, '.'); 2229 const char *subtype = strchr(fstype, '.');
2224 if (subtype) { 2230 if (subtype) {
2225 subtype++; 2231 subtype++;
2226 err = -EINVAL; 2232 err = -EINVAL;
2227 if (!subtype[0]) 2233 if (!subtype[0])
2228 goto err; 2234 goto err;
2229 } else 2235 } else
2230 subtype = ""; 2236 subtype = "";
2231 2237
2232 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); 2238 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
2233 err = -ENOMEM; 2239 err = -ENOMEM;
2234 if (!mnt->mnt_sb->s_subtype) 2240 if (!mnt->mnt_sb->s_subtype)
2235 goto err; 2241 goto err;
2236 return mnt; 2242 return mnt;
2237 2243
2238 err: 2244 err:
2239 mntput(mnt); 2245 mntput(mnt);
2240 return ERR_PTR(err); 2246 return ERR_PTR(err);
2241 } 2247 }
2242 2248
2243 /* 2249 /*
2244 * add a mount into a namespace's mount tree 2250 * add a mount into a namespace's mount tree
2245 */ 2251 */
2246 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) 2252 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
2247 { 2253 {
2248 struct mountpoint *mp; 2254 struct mountpoint *mp;
2249 struct mount *parent; 2255 struct mount *parent;
2250 int err; 2256 int err;
2251 2257
2252 mnt_flags &= ~MNT_INTERNAL_FLAGS; 2258 mnt_flags &= ~MNT_INTERNAL_FLAGS;
2253 2259
2254 mp = lock_mount(path); 2260 mp = lock_mount(path);
2255 if (IS_ERR(mp)) 2261 if (IS_ERR(mp))
2256 return PTR_ERR(mp); 2262 return PTR_ERR(mp);
2257 2263
2258 parent = real_mount(path->mnt); 2264 parent = real_mount(path->mnt);
2259 err = -EINVAL; 2265 err = -EINVAL;
2260 if (unlikely(!check_mnt(parent))) { 2266 if (unlikely(!check_mnt(parent))) {
2261 /* that's acceptable only for automounts done in private ns */ 2267 /* that's acceptable only for automounts done in private ns */
2262 if (!(mnt_flags & MNT_SHRINKABLE)) 2268 if (!(mnt_flags & MNT_SHRINKABLE))
2263 goto unlock; 2269 goto unlock;
2264 /* ... and for those we'd better have mountpoint still alive */ 2270 /* ... and for those we'd better have mountpoint still alive */
2265 if (!parent->mnt_ns) 2271 if (!parent->mnt_ns)
2266 goto unlock; 2272 goto unlock;
2267 } 2273 }
2268 2274
2269 /* Refuse the same filesystem on the same mount point */ 2275 /* Refuse the same filesystem on the same mount point */
2270 err = -EBUSY; 2276 err = -EBUSY;
2271 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && 2277 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2272 path->mnt->mnt_root == path->dentry) 2278 path->mnt->mnt_root == path->dentry)
2273 goto unlock; 2279 goto unlock;
2274 2280
2275 err = -EINVAL; 2281 err = -EINVAL;
2276 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) 2282 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
2277 goto unlock; 2283 goto unlock;
2278 2284
2279 newmnt->mnt.mnt_flags = mnt_flags; 2285 newmnt->mnt.mnt_flags = mnt_flags;
2280 err = graft_tree(newmnt, parent, mp); 2286 err = graft_tree(newmnt, parent, mp);
2281 2287
2282 unlock: 2288 unlock:
2283 unlock_mount(mp); 2289 unlock_mount(mp);
2284 return err; 2290 return err;
2285 } 2291 }
2286 2292
/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, const char *fstype, int flags,
			int mnt_flags, const char *name, void *data)
{
	struct file_system_type *type;
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct vfsmount *mnt;
	int err;

	if (!fstype)
		return -EINVAL;

	/* get_fs_type() takes a reference; drop it via put_filesystem() below. */
	type = get_fs_type(fstype);
	if (!type)
		return -ENODEV;

	if (user_ns != &init_user_ns) {
		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
			put_filesystem(type);
			return -EPERM;
		}
		/* Only in special cases allow devices from mounts
		 * created outside the initial user namespace.
		 *
		 * MNT_LOCK_NODEV marks the implicit nodev as locked so a
		 * later remount cannot simply clear it (see do_remount,
		 * elsewhere in this file).
		 */
		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			flags |= MS_NODEV;
			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
		}
	}

	mnt = vfs_kern_mount(type, flags, name, data);
	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
	    !mnt->mnt_sb->s_subtype)
		mnt = fs_set_subtype(mnt, fstype);

	put_filesystem(type);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	/* On failure we own the mount reference and must drop it. */
	err = do_add_mount(real_mount(mnt), path, mnt_flags);
	if (err)
		mntput(mnt);
	return err;
}
2334 2340
/*
 * Attach a freshly triggered automount @m at @path, or tear it down on
 * failure.  Consumes both references held on @m either way.
 */
int finish_automount(struct vfsmount *m, struct path *path)
{
	struct mount *mnt = real_mount(m);
	int err;
	/* The new mount record should have at least 2 refs to prevent it being
	 * expired before we get a chance to add it
	 */
	BUG_ON(mnt_get_count(mnt) < 2);

	/* Mounting a filesystem on its own root would loop forever. */
	if (m->mnt_sb == path->mnt->mnt_sb &&
	    m->mnt_root == path->dentry) {
		err = -ELOOP;
		goto fail;
	}

	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
	if (!err)
		return 0;
fail:
	/* remove m from any expiration list it may be on */
	if (!list_empty(&mnt->mnt_expire)) {
		namespace_lock();
		list_del_init(&mnt->mnt_expire);
		namespace_unlock();
	}
	/* Drop both of the references asserted by the BUG_ON above. */
	mntput(m);
	mntput(m);
	return err;
}
2364 2370
/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
	/* namespace_sem protects the expiry list linkage. */
	namespace_lock();

	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

	namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);
2379 2385
/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct mount *mnt, *next;
	LIST_HEAD(graveyard);

	if (list_empty(mounts))
		return;

	namespace_lock();
	lock_mount_hash();

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
			propagate_mount_busy(mnt, 1))
			continue;
		list_move(&mnt->mnt_expire, &graveyard);
	}
	/* Unmount everything that ended up in the graveyard. */
	while (!list_empty(&graveyard)) {
		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
		touch_mnt_namespace(mnt->mnt_ns);
		umount_tree(mnt, 1);
	}
	unlock_mount_hash();
	namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
2418 2424
/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 *
 * Returns the number of mounts moved.  Uses an explicit goto-based
 * depth-first walk instead of recursion.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
	struct mount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the d_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}
2464 2470
/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
	LIST_HEAD(graveyard);
	struct mount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		/* umount_tree() may expose new shrinkable submounts,
		 * hence the outer retry loop. */
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct mount,
						mnt_expire);
			touch_mnt_namespace(m->mnt_ns);
			umount_tree(m, 1);
		}
	}
}
2486 2492
2487 /* 2493 /*
2488 * Some copy_from_user() implementations do not return the exact number of 2494 * Some copy_from_user() implementations do not return the exact number of
2489 * bytes remaining to copy on a fault. But copy_mount_options() requires that. 2495 * bytes remaining to copy on a fault. But copy_mount_options() requires that.
2490 * Note that this function differs from copy_from_user() in that it will oops 2496 * Note that this function differs from copy_from_user() in that it will oops
2491 * on bad values of `to', rather than returning a short copy. 2497 * on bad values of `to', rather than returning a short copy.
2492 */ 2498 */
2493 static long exact_copy_from_user(void *to, const void __user * from, 2499 static long exact_copy_from_user(void *to, const void __user * from,
2494 unsigned long n) 2500 unsigned long n)
2495 { 2501 {
2496 char *t = to; 2502 char *t = to;
2497 const char __user *f = from; 2503 const char __user *f = from;
2498 char c; 2504 char c;
2499 2505
2500 if (!access_ok(VERIFY_READ, from, n)) 2506 if (!access_ok(VERIFY_READ, from, n))
2501 return n; 2507 return n;
2502 2508
2503 while (n) { 2509 while (n) {
2504 if (__get_user(c, f)) { 2510 if (__get_user(c, f)) {
2505 memset(t, 0, n); 2511 memset(t, 0, n);
2506 break; 2512 break;
2507 } 2513 }
2508 *t++ = c; 2514 *t++ = c;
2509 f++; 2515 f++;
2510 n--; 2516 n--;
2511 } 2517 }
2512 return n; 2518 return n;
2513 } 2519 }
2514 2520
/*
 * Copy the userspace mount-options blob at @data into a freshly allocated
 * page, stored in *@where.  Returns 0 on success (including data == NULL,
 * in which case *where stays 0), -ENOMEM or -EFAULT on failure.  The caller
 * frees the page.
 */
int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	/* i is the number of bytes actually copied before any fault. */
	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		/* Nothing copyable at all -> bad user pointer. */
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}
2547 2553
2548 char *copy_mount_string(const void __user *data) 2554 char *copy_mount_string(const void __user *data)
2549 { 2555 {
2550 return data ? strndup_user(data, PAGE_SIZE) : NULL; 2556 return data ? strndup_user(data, PAGE_SIZE) : NULL;
2551 } 2557 }
2552 2558
/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
		const char *type_page, unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	retval = user_path(dir_name, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (!retval && !may_mount())
		retval = -EPERM;
	if (retval)
		goto dput_out;

	/* Default to relatime unless overriden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	/* The default atime for remount is preservation */
	if ((flags & MS_REMOUNT) &&
	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
		       MS_STRICTATIME)) == 0)) {
		mnt_flags &= ~MNT_ATIME_MASK;
		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
	}

	/* Strip the MS_* flags that were translated to MNT_* flags above
	 * (or are kernel-internal) before handing off. */
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
		   MS_STRICTATIME);

	/* Dispatch on the operation kind encoded in the remaining flags. */
	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}
2642 2648
/* Release a mount namespace's resources; kfree() must come last. */
static void free_mnt_ns(struct mnt_namespace *ns)
{
	proc_free_inum(ns->proc_inum);
	put_user_ns(ns->user_ns);
	kfree(ns);
}
2649 2655
/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  A 64bit
 * number incrementing at 10Ghz will take 12,427 years to wrap which
 * is effectively never, so we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

/*
 * Allocate and initialize an empty mount namespace owned by @user_ns.
 * Returns the namespace or an ERR_PTR on allocation/inum failure.
 */
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
{
	struct mnt_namespace *new_ns;
	int ret;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
	ret = proc_alloc_inum(&new_ns->proc_inum);
	if (ret) {
		kfree(new_ns);
		return ERR_PTR(ret);
	}
	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
	atomic_set(&new_ns->count, 1);
	new_ns->root = NULL;
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;
	new_ns->user_ns = get_user_ns(user_ns);
	return new_ns;
}
2681 2687
/*
 * Duplicate the caller's mount namespace for CLONE_NEWNS.  Without
 * CLONE_NEWNS just takes another reference on @ns.  When @new_fs is
 * given, its root/pwd vfsmounts are switched over to the copies.
 */
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct user_namespace *user_ns, struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct mount *p, *q;
	struct mount *old;
	struct mount *new;
	int copy_flags;

	BUG_ON(!ns);

	if (likely(!(flags & CLONE_NEWNS))) {
		get_mnt_ns(ns);
		return ns;
	}

	old = ns->root;

	new_ns = alloc_mnt_ns(user_ns);
	if (IS_ERR(new_ns))
		return new_ns;

	namespace_lock();
	/* First pass: copy the tree topology */
	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
	if (IS_ERR(new)) {
		namespace_unlock();
		free_mnt_ns(new_ns);
		return ERR_CAST(new);
	}
	new_ns->root = new;
	list_add_tail(&new_ns->list, &new->mnt_list);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = old;
	q = new;
	while (p) {
		q->mnt_ns = new_ns;
		if (new_fs) {
			if (&p->mnt == new_fs->root.mnt) {
				new_fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;
			}
			if (&p->mnt == new_fs->pwd.mnt) {
				new_fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
			}
		}
		p = next_mnt(p, old);
		q = next_mnt(q, new);
		if (!q)
			break;
		/* The copy may have skipped some mounts (e.g. unbindable
		 * ones); keep p aligned with q by matching roots. */
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(p, old);
	}
	namespace_unlock();

	/* Drop the references displaced from new_fs, outside the lock. */
	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);

	return new_ns;
}
2754 2760
/**
 * create_mnt_ns - creates a private namespace and adds a root filesystem
 * @mnt: pointer to the new root filesystem mountpoint
 *
 * Consumes the reference on @m on failure.
 */
static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
{
	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
	if (!IS_ERR(new_ns)) {
		struct mount *mnt = real_mount(m);
		mnt->mnt_ns = new_ns;
		new_ns->root = mnt;
		list_add(&mnt->mnt_list, &new_ns->list);
	} else {
		mntput(m);
	}
	return new_ns;
}
2772 2778
/*
 * Mount @mnt in a throw-away private namespace, look up @name relative to
 * its root, and return the resulting dentry with its superblock pinned
 * (s_active taken, s_umount held for write).  Consumes the reference on
 * @mnt.  Returns an ERR_PTR on failure.
 */
struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
{
	struct mnt_namespace *ns;
	struct super_block *s;
	struct path path;
	int err;

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		return ERR_CAST(ns);

	err = vfs_path_lookup(mnt->mnt_root, mnt,
			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

	put_mnt_ns(ns);

	if (err)
		return ERR_PTR(err);

	/* trade a vfsmount reference for active sb one */
	s = path.mnt->mnt_sb;
	atomic_inc(&s->s_active);
	mntput(path.mnt);
	/* lock the sucker */
	down_write(&s->s_umount);
	/* ... and return the root of (sub)tree on it */
	return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);
2802 2808
/*
 * mount(2) entry point: copy the string and options arguments in from
 * userspace, then hand off to do_mount().  dir_name stays a user pointer;
 * do_mount() resolves it itself.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dev;
	unsigned long data_page;

	kernel_type = copy_mount_string(type);
	ret = PTR_ERR(kernel_type);
	if (IS_ERR(kernel_type))
		goto out_type;

	kernel_dev = copy_mount_string(dev_name);
	ret = PTR_ERR(kernel_dev);
	if (IS_ERR(kernel_dev))
		goto out_dev;

	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
		(void *) data_page);

	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	kfree(kernel_type);
out_type:
	return ret;
}
2836 2842
/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
			 const struct path *root)
{
	/* Climb the mount tree: each step replaces (mnt, dentry) with the
	 * parent mount and the dentry this mount is attached on, stopping
	 * once we reach root's vfsmount or a mount with no parent. */
	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
		dentry = mnt->mnt_mountpoint;
		mnt = mnt->mnt_parent;
	}
	/* Reachable only if we ended up on root's mount AND the final
	 * dentry sits at or below root's dentry. */
	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}
2851 2857
2852 int path_is_under(struct path *path1, struct path *path2) 2858 int path_is_under(struct path *path1, struct path *path2)
2853 { 2859 {
2854 int res; 2860 int res;
2855 read_seqlock_excl(&mount_lock); 2861 read_seqlock_excl(&mount_lock);
2856 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); 2862 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
2857 read_sequnlock_excl(&mount_lock); 2863 read_sequnlock_excl(&mount_lock);
2858 return res; 2864 return res;
2859 } 2865 }
2860 EXPORT_SYMBOL(path_is_under); 2866 EXPORT_SYMBOL(path_is_under);
2861 2867
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and  must not be on the
 * same file  system as the current process root. The put_old  must  be
 * underneath new_root,  i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	struct path new, old, parent_path, root_parent, root;
	struct mount *new_mnt, *root_mnt, *old_mnt;
	struct mountpoint *old_mp, *root_mp;
	int error;

	if (!may_mount())
		return -EPERM;

	error = user_path_dir(new_root, &new);
	if (error)
		goto out0;

	error = user_path_dir(put_old, &old);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old, &new);
	if (error)
		goto out2;

	get_fs_root(current->fs, &root);
	/* lock_mount() takes namespace_sem and pins the mountpoint of old */
	old_mp = lock_mount(&old);
	error = PTR_ERR(old_mp);
	if (IS_ERR(old_mp))
		goto out3;

	error = -EINVAL;
	new_mnt = real_mount(new.mnt);
	root_mnt = real_mount(root.mnt);
	old_mnt = real_mount(old.mnt);
	/* Refuse if any of the mounts involved is part of a shared
	 * propagation group - pivoting would have to propagate. */
	if (IS_MNT_SHARED(old_mnt) ||
		IS_MNT_SHARED(new_mnt->mnt_parent) ||
		IS_MNT_SHARED(root_mnt->mnt_parent))
		goto out4;
	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
		goto out4;
	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
		goto out4;
	error = -ENOENT;
	if (d_unlinked(new.dentry))
		goto out4;
	error = -EBUSY;
	if (new_mnt == root_mnt || old_mnt == root_mnt)
		goto out4; /* loop, on the same file system */
	error = -EINVAL;
	if (root.mnt->mnt_root != root.dentry)
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(root_mnt))
		goto out4; /* not attached */
	root_mp = root_mnt->mnt_mp;
	if (new.mnt->mnt_root != new.dentry)
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(new_mnt))
		goto out4; /* not attached */
	/* make sure we can reach put_old from new_root */
	if (!is_path_reachable(old_mnt, old.dentry, &new))
		goto out4;
	/* make certain new is below the root */
	if (!is_path_reachable(new_mnt, new.dentry, &root))
		goto out4;
	root_mp->m_count++; /* pin it so it won't go away */
	lock_mount_hash();
	detach_mnt(new_mnt, &parent_path);
	detach_mnt(root_mnt, &root_parent);
	/* If the old root was locked in place, transfer the lock to the
	 * new root so an unprivileged namespace can't expose what was
	 * underneath. */
	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
	}
	/* mount old root on put_old */
	attach_mnt(root_mnt, old_mnt, old_mp);
	/* mount new_root on / */
	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	unlock_mount_hash();
	/* Point root/cwd of every task that used the old root at new */
	chroot_fs_refs(&root, &new);
	put_mountpoint(root_mp);
	error = 0;
out4:
	unlock_mount(old_mp);
	if (!error) {
		path_put(&root_parent);
		path_put(&parent_path);
	}
out3:
	path_put(&root);
out2:
	path_put(&old);
out1:
	path_put(&new);
out0:
	return error;
}
2982 2988
/*
 * Mount rootfs and install it as the initial mount namespace, then make
 * it the root and cwd of the boot task.  Any failure here is fatal.
 */
static void __init init_mount_tree(void)
{
	struct vfsmount *mnt;
	struct mnt_namespace *ns;
	struct path root;
	struct file_system_type *type;

	type = get_fs_type("rootfs");
	if (!type)
		panic("Can't find rootfs type");
	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
	/* vfs_kern_mount() holds its own reference on the type */
	put_filesystem(type);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		panic("Can't allocate initial namespace");

	init_task.nsproxy->mnt_ns = ns;
	get_mnt_ns(ns);

	root.mnt = mnt;
	root.dentry = mnt->mnt_root;

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);
}
3011 3017
/*
 * Boot-time initialisation of the mount subsystem: slab cache for
 * struct mount, the mount/mountpoint hash tables, sysfs, and finally
 * the initial rootfs mount tree.
 */
void __init mnt_init(void)
{
	unsigned u;
	int err;

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/* Hash tables are sized from boot memory (mhash_entries /
	 * mphash_entries command-line overrides; 2^19 default scale). */
	mount_hashtable = alloc_large_system_hash("Mount-cache",
				sizeof(struct hlist_head),
				mhash_entries, 19,
				0,
				&m_hash_shift, &m_hash_mask, 0, 0);
	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				sizeof(struct hlist_head),
				mphash_entries, 19,
				0,
				&mp_hash_shift, &mp_hash_mask, 0, 0);

	if (!mount_hashtable || !mountpoint_hashtable)
		panic("Failed to allocate mount hash table\n");

	for (u = 0; u <= m_hash_mask; u++)
		INIT_HLIST_HEAD(&mount_hashtable[u]);
	for (u = 0; u <= mp_hash_mask; u++)
		INIT_HLIST_HEAD(&mountpoint_hashtable[u]);

	kernfs_init();

	/* sysfs failing is survivable; warn and carry on */
	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	init_rootfs();
	init_mount_tree();
}
3051 3057
3052 void put_mnt_ns(struct mnt_namespace *ns) 3058 void put_mnt_ns(struct mnt_namespace *ns)
3053 { 3059 {
3054 if (!atomic_dec_and_test(&ns->count)) 3060 if (!atomic_dec_and_test(&ns->count))
3055 return; 3061 return;
3056 drop_collected_mounts(&ns->root->mnt); 3062 drop_collected_mounts(&ns->root->mnt);
3057 free_mnt_ns(ns); 3063 free_mnt_ns(ns);
3058 } 3064 }
3059 3065
3060 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 3066 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
3061 { 3067 {
3062 struct vfsmount *mnt; 3068 struct vfsmount *mnt;
3063 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); 3069 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
3064 if (!IS_ERR(mnt)) { 3070 if (!IS_ERR(mnt)) {
3065 /* 3071 /*
3066 * it is a longterm mount, don't release mnt until 3072 * it is a longterm mount, don't release mnt until
3067 * we unmount before file sys is unregistered 3073 * we unmount before file sys is unregistered
3068 */ 3074 */
3069 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; 3075 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
3070 } 3076 }
3071 return mnt; 3077 return mnt;
3072 } 3078 }
3073 EXPORT_SYMBOL_GPL(kern_mount_data); 3079 EXPORT_SYMBOL_GPL(kern_mount_data);
3074 3080
/*
 * Undo kern_mount_data(): clear the long-term marker and drop the
 * reference.  Safe to call with NULL or an ERR_PTR.
 */
void kern_unmount(struct vfsmount *mnt)
{
	/* release long term mount so mount point can be released */
	if (!IS_ERR_OR_NULL(mnt)) {
		real_mount(mnt)->mnt_ns = NULL;
		/* wait for RCU walkers that may still see mnt_ns set */
		synchronize_rcu();	/* yecchhh... */
		mntput(mnt);
	}
}
EXPORT_SYMBOL(kern_unmount);
3085 3091
/* Does this vfsmount belong to the caller's mount namespace? */
bool our_mnt(struct vfsmount *mnt)
{
	return check_mnt(real_mount(mnt));
}
3090 3096
bool current_chrooted(void)
{
	/* Does the current process have a non-standard root */
	struct path ns_root;
	struct path fs_root;
	bool chrooted;

	/* Find the namespace root, following anything mounted on top of
	 * it so we compare against the effective root. */
	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
	ns_root.dentry = ns_root.mnt->mnt_root;
	path_get(&ns_root);
	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
		;

	get_fs_root(current->fs, &fs_root);

	/* chrooted iff the task's root differs from the namespace root */
	chrooted = !path_equal(&fs_root, &ns_root);

	path_put(&fs_root);
	path_put(&ns_root);

	return chrooted;
}
3114 3120
/*
 * Is every instance of this filesystem type in the caller's mount
 * namespace fully visible, i.e. is there at least one mount of it whose
 * children hide nothing but empty directories?
 */
bool fs_fully_visible(struct file_system_type *type)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool visible = false;

	if (unlikely(!ns))
		return false;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		struct mount *child;
		if (mnt->mnt.mnt_sb->s_type != type)
			continue;

		/* This mount is not fully visible if there are any child mounts
		 * that cover anything except for empty directories.
		 */
		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
			struct inode *inode = child->mnt_mountpoint->d_inode;
			if (!S_ISDIR(inode->i_mode))
				goto next;
			/* i_nlink > 2 means the directory has entries,
			 * so the child mount is covering something. */
			if (inode->i_nlink > 2)
				goto next;
		}
		/* found a candidate whose children hide nothing */
		visible = true;
		goto found;
	next:	;
	}
found:
	up_read(&namespace_sem);
	return visible;
}
3148 3154
3149 static void *mntns_get(struct task_struct *task) 3155 static void *mntns_get(struct task_struct *task)
3150 { 3156 {
3151 struct mnt_namespace *ns = NULL; 3157 struct mnt_namespace *ns = NULL;
3152 struct nsproxy *nsproxy; 3158 struct nsproxy *nsproxy;
3153 3159
3154 task_lock(task); 3160 task_lock(task);
3155 nsproxy = task->nsproxy; 3161 nsproxy = task->nsproxy;
3156 if (nsproxy) { 3162 if (nsproxy) {
3157 ns = nsproxy->mnt_ns; 3163 ns = nsproxy->mnt_ns;
3158 get_mnt_ns(ns); 3164 get_mnt_ns(ns);
3159 } 3165 }
3160 task_unlock(task); 3166 task_unlock(task);
3161 3167
3162 return ns; 3168 return ns;
3163 } 3169 }
3164 3170
/* proc_ns_operations.put: drop the reference taken by mntns_get() */
static void mntns_put(void *ns)
{
	put_mnt_ns(ns);
}
3169 3175
/*
 * proc_ns_operations.install: switch the caller (via setns) into the
 * given mount namespace and reset its root/cwd to that namespace's
 * effective root.
 */
static int mntns_install(struct nsproxy *nsproxy, void *ns)
{
	struct fs_struct *fs = current->fs;
	struct mnt_namespace *mnt_ns = ns;
	struct path root;

	/* Need admin rights over the target namespace, plus chroot and
	 * admin capability in the caller's own user namespace. */
	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
	    !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

	/* Refuse if fs_struct is shared (e.g. CLONE_FS threads) -
	 * changing root out from under siblings would be unsafe. */
	if (fs->users != 1)
		return -EINVAL;

	get_mnt_ns(mnt_ns);
	put_mnt_ns(nsproxy->mnt_ns);
	nsproxy->mnt_ns = mnt_ns;

	/* Find the root, following mounts stacked on top of it */
	root.mnt    = &mnt_ns->root->mnt;
	root.dentry = mnt_ns->root->mnt.mnt_root;
	path_get(&root);
	while(d_mountpoint(root.dentry) && follow_down_one(&root))
		;

	/* Update the pwd and root */
	set_fs_pwd(fs, &root);
	set_fs_root(fs, &root);

	path_put(&root);
	return 0;
}
3202 3208
/* proc_ns_operations.inum: proc inode number identifying this namespace */
static unsigned int mntns_inum(void *ns)
{
	struct mnt_namespace *mnt_ns = ns;
	return mnt_ns->proc_inum;
}
3208 3214
/* Hooks backing /proc/<pid>/ns/mnt and setns(CLONE_NEWNS) */
const struct proc_ns_operations mntns_operations = {
	.name		= "mnt",
	.type		= CLONE_NEWNS,
	.get		= mntns_get,
	.put		= mntns_put,
	.install	= mntns_install,
	.inum		= mntns_inum,
};
3217 3223