Commit 877c27dbaacad7b8c8c37c62ab9f5726f48c15ad

Authored by Eric W. Biederman
Committed by Greg Kroah-Hartman
1 parent 9e2a8e62a4

mnt: Fix a memory stomp in umount

commit c297abfdf15b4480704d6b566ca5ca9438b12456 upstream.

While reviewing the code of umount_tree I realized that when we append
to a preexisting unmounted list we do not change pprev of the former
first item in the list.

This means that later, in namespace_unlock, hlist_del_init(&mnt->mnt_hash)
on the former first item of the list will stomp unmounted.first, leaving
it set to some random mount point which we are likely to free soon.

This isn't likely to hit, but if it does I don't know how anyone could
track it down.
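
To see why, recall how hlist deletion works: each node's pprev points at
the location that points to it, and deletion writes the node's next back
through that pointer. A minimal sketch of the failure mode (simplified
types for illustration, not the kernel's actual list.h):

    struct hlist_node { struct hlist_node *next, **pprev; };
    struct hlist_head { struct hlist_node *first; };

    static void hlist_del(struct hlist_node *n)
    {
            /* Whatever still points at @n is overwritten with n->next --
             * or stomped, if n->pprev has gone stale. */
            *n->pprev = n->next;
            if (n->next)
                    n->next->pprev = n->pprev;
    }

    /* The buggy open-coded splice: prepend @from (whose final node is
     * @last) onto @to without fixing the old first element's pprev. */
    static void buggy_splice(struct hlist_head *to, struct hlist_head *from,
                             struct hlist_node *last)
    {
            last->next = to->first;         /* old first now follows @last, */
            to->first = from->first;        /* but old_first->pprev still   */
            to->first->pprev = &to->first;  /* points at &to->first, so a   */
    }                                       /* later delete stomps to->first. */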

[ This happened because we don't have all the same operations for
  hlists as we do for normal doubly-linked lists. In particular,
  list_splice() is easy on our standard doubly-linked lists, while
  hlist_splice() doesn't exist and needs both the start and end entries
  of the hlist.  And commit 38129a13e6e7 incorrectly open-coded that
  missing hlist_splice().

  We should think about making these kinds of "mindless" conversions
  easier to get right by adding the missing hlist helpers. - Linus ]
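
A helper in the spirit Linus describes might look like the sketch below
(hypothetical; no such helper existed at the time of this commit). Because
a plain hlist carries no tail pointer, the caller must supply both ends of
the source list, and both splice points need their pprev fixed up:

    /* Hypothetical hlist_splice(): move the chain running from @first
     * to @last onto the front of @to. */
    static inline void hlist_splice(struct hlist_node *first,
                                    struct hlist_node *last,
                                    struct hlist_head *to)
    {
            last->next = to->first;
            if (to->first)                          /* the step the hlist   */
                    to->first->pprev = &last->next; /* conversion missed    */
            to->first = first;
            first->pprev = &to->first;
    }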

Fixes: 38129a13e6e7 ("switch mnt_hash to hlist")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 2 additions and 0 deletions
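
(The context shown below is truncated before the changed hunk in
umount_tree(). Going by the commit message and the two-line addition
count, the fix patches the former first element's pprev when appending to
a non-empty unmounted list, roughly:

    	last->mnt_hash.next = unmounted.first;
    +	if (unmounted.first)
    +		unmounted.first->pprev = &last->mnt_hash.next;
    	unmounted.first = tmp_list.first;
    	unmounted.first->pprev = &unmounted.first;

reconstructed here for readability; the hunk itself is not visible in this
excerpt.)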

/*
 * linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 * Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et.al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
#include <linux/task_work.h>
#include "pnode.h"
#include "internal.h"

static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);

static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}

/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
 */
static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}

static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly ouside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*. This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (mnt->mnt_sb->s_flags & MS_RDONLY)
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt it read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, __mnt_drop_write() must be
 * called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success. When the write operation is
 * finished, mnt_drop_write() must be called. This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file who's mount on which to take a write
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file who's mount on which to take a write
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again. Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);

static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}

static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}

static void free_vfsmnt(struct mount *mnt)
{
	kfree(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return false;
	if (bastard == NULL)
		return true;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	if (likely(!read_seqretry(&mount_lock, seq)))
		return true;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return false;
	}
	rcu_read_unlock();
	mntput(bastard);
	rcu_read_lock();
	return false;
}

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/*
 * find the last mount at @dentry on vfsmount @mnt.
 * mount_lock must be held.
 */
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
	struct mount *p, *res;
	res = p = __lookup_mnt(mnt, dentry);
	if (!p)
		goto out;
	hlist_for_each_entry_continue(p, mnt_hash) {
		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
			break;
		res = p;
	}
out:
	return res;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically. If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 * current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline. For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}

static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}

static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;
	int ret;

	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!mp)
		return ERR_PTR(-ENOMEM);

	ret = d_set_mounted(dentry);
	if (ret) {
		kfree(mp);
		return ERR_PTR(ret);
	}

	mp->m_dentry = dentry;
	mp->m_count = 1;
	hlist_add_head(&mp->m_hash, chain);
	INIT_HLIST_HEAD(&mp->m_list);
	return mp;
}

static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

static void attach_shadowed(struct mount *mnt,
			struct mount *parent,
			struct mount *shadows)
{
	if (shadows) {
		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
		list_add(&mnt->mnt_child, &shadows->mnt_child);
	} else {
		hlist_add_head_rcu(&mnt->mnt_hash,
				m_hash(&parent->mnt, mnt->mnt_mountpoint));
		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt, struct mount *shadows)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	attach_shadowed(mnt, parent, shadows);
	touch_mnt_namespace(n);
}

static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
	/* Don't allow unprivileged users to change mount flags */
	if (flag & CL_UNPRIVILEGED) {
		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

		if (mnt->mnt.mnt_flags & MNT_READONLY)
			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;

		if (mnt->mnt.mnt_flags & MNT_NODEV)
			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;

		if (mnt->mnt.mnt_flags & MNT_NOSUID)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;

		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
	}

	/* Don't allow unprivileged users to reveal what is under a mount */
	if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
		mnt->mnt.mnt_flags |= MNT_LOCKED;

	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	unlock_mount_hash();

	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

	return mnt;

out_free:
	mnt_free_id(mnt);
	free_vfsmnt(mnt);
	return ERR_PTR(err);
}

static void cleanup_mnt(struct mount *mnt)
{
	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair. If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	/*
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
	 */
	WARN_ON(mnt_get_writers(mnt));
	if (unlikely(mnt->mnt_pins.first))
		mnt_pin_kill(mnt);
	fsnotify_vfsmount_delete(&mnt->mnt);
	dput(mnt->mnt.mnt_root);
	deactivate_super(mnt->mnt.mnt_sb);
	mnt_free_id(mnt);
	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

static void __cleanup_mnt(struct rcu_head *head)
{
	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
}

static LLIST_HEAD(delayed_mntput_list);
static void delayed_mntput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_mntput_list);
	struct llist_node *next;

	for (; node; node = next) {
		next = llist_next(node);
		cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
	}
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

static void mntput_no_expire(struct mount *mnt)
{
	rcu_read_lock();
	mnt_add_count(mnt, -1);
	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
		rcu_read_unlock();
		return;
	}
	lock_mount_hash();
	if (mnt_get_count(mnt)) {
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	mnt->mnt.mnt_flags |= MNT_DOOMED;
	rcu_read_unlock();

	list_del(&mnt->mnt_instance);
1071 unlock_mount_hash(); 1071 unlock_mount_hash();
1072 1072
1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { 1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1074 struct task_struct *task = current; 1074 struct task_struct *task = current;
1075 if (likely(!(task->flags & PF_KTHREAD))) { 1075 if (likely(!(task->flags & PF_KTHREAD))) {
1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt); 1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1077 if (!task_work_add(task, &mnt->mnt_rcu, true)) 1077 if (!task_work_add(task, &mnt->mnt_rcu, true))
1078 return; 1078 return;
1079 } 1079 }
1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) 1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1081 schedule_delayed_work(&delayed_mntput_work, 1); 1081 schedule_delayed_work(&delayed_mntput_work, 1);
1082 return; 1082 return;
1083 } 1083 }
1084 cleanup_mnt(mnt); 1084 cleanup_mnt(mnt);
1085 } 1085 }
1086 1086
1087 void mntput(struct vfsmount *mnt) 1087 void mntput(struct vfsmount *mnt)
1088 { 1088 {
1089 if (mnt) { 1089 if (mnt) {
1090 struct mount *m = real_mount(mnt); 1090 struct mount *m = real_mount(mnt);
1091 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ 1091 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1092 if (unlikely(m->mnt_expiry_mark)) 1092 if (unlikely(m->mnt_expiry_mark))
1093 m->mnt_expiry_mark = 0; 1093 m->mnt_expiry_mark = 0;
1094 mntput_no_expire(m); 1094 mntput_no_expire(m);
1095 } 1095 }
1096 } 1096 }
1097 EXPORT_SYMBOL(mntput); 1097 EXPORT_SYMBOL(mntput);
1098 1098
1099 struct vfsmount *mntget(struct vfsmount *mnt) 1099 struct vfsmount *mntget(struct vfsmount *mnt)
1100 { 1100 {
1101 if (mnt) 1101 if (mnt)
1102 mnt_add_count(real_mount(mnt), 1); 1102 mnt_add_count(real_mount(mnt), 1);
1103 return mnt; 1103 return mnt;
1104 } 1104 }
1105 EXPORT_SYMBOL(mntget); 1105 EXPORT_SYMBOL(mntget);
1106 1106
1107 struct vfsmount *mnt_clone_internal(struct path *path) 1107 struct vfsmount *mnt_clone_internal(struct path *path)
1108 { 1108 {
1109 struct mount *p; 1109 struct mount *p;
1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); 1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1111 if (IS_ERR(p)) 1111 if (IS_ERR(p))
1112 return ERR_CAST(p); 1112 return ERR_CAST(p);
1113 p->mnt.mnt_flags |= MNT_INTERNAL; 1113 p->mnt.mnt_flags |= MNT_INTERNAL;
1114 return &p->mnt; 1114 return &p->mnt;
1115 } 1115 }
1116 1116
1117 static inline void mangle(struct seq_file *m, const char *s) 1117 static inline void mangle(struct seq_file *m, const char *s)
1118 { 1118 {
1119 seq_escape(m, s, " \t\n\\"); 1119 seq_escape(m, s, " \t\n\\");
1120 } 1120 }
1121 1121
1122 /* 1122 /*
1123 * Simple .show_options callback for filesystems which don't want to 1123 * Simple .show_options callback for filesystems which don't want to
1124 * implement more complex mount option showing. 1124 * implement more complex mount option showing.
1125 * 1125 *
1126 * See also save_mount_options(). 1126 * See also save_mount_options().
1127 */ 1127 */
1128 int generic_show_options(struct seq_file *m, struct dentry *root) 1128 int generic_show_options(struct seq_file *m, struct dentry *root)
1129 { 1129 {
1130 const char *options; 1130 const char *options;
1131 1131
1132 rcu_read_lock(); 1132 rcu_read_lock();
1133 options = rcu_dereference(root->d_sb->s_options); 1133 options = rcu_dereference(root->d_sb->s_options);
1134 1134
1135 if (options != NULL && options[0]) { 1135 if (options != NULL && options[0]) {
1136 seq_putc(m, ','); 1136 seq_putc(m, ',');
1137 mangle(m, options); 1137 mangle(m, options);
1138 } 1138 }
1139 rcu_read_unlock(); 1139 rcu_read_unlock();
1140 1140
1141 return 0; 1141 return 0;
1142 } 1142 }
1143 EXPORT_SYMBOL(generic_show_options); 1143 EXPORT_SYMBOL(generic_show_options);
1144 1144
1145 /* 1145 /*
1146 * If filesystem uses generic_show_options(), this function should be 1146 * If filesystem uses generic_show_options(), this function should be
1147 * called from the fill_super() callback. 1147 * called from the fill_super() callback.
1148 * 1148 *
1149 * The .remount_fs callback usually needs to be handled in a special 1149 * The .remount_fs callback usually needs to be handled in a special
1150 * way, to make sure, that previous options are not overwritten if the 1150 * way, to make sure, that previous options are not overwritten if the
1151 * remount fails. 1151 * remount fails.
1152 * 1152 *
1153 * Also note, that if the filesystem's .remount_fs function doesn't 1153 * Also note, that if the filesystem's .remount_fs function doesn't
1154 * reset all options to their default value, but changes only newly 1154 * reset all options to their default value, but changes only newly
1155 * given options, then the displayed options will not reflect reality 1155 * given options, then the displayed options will not reflect reality
1156 * any more. 1156 * any more.
1157 */ 1157 */
1158 void save_mount_options(struct super_block *sb, char *options) 1158 void save_mount_options(struct super_block *sb, char *options)
1159 { 1159 {
1160 BUG_ON(sb->s_options); 1160 BUG_ON(sb->s_options);
1161 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); 1161 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
1162 } 1162 }
1163 EXPORT_SYMBOL(save_mount_options); 1163 EXPORT_SYMBOL(save_mount_options);
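A minimal sketch of the pattern the comments above describe, for a hypothetical filesystem (the examplefs names are placeholders, not real kernel code): the mount-time option string is stashed with save_mount_options() from fill_super(), and generic_show_options() replays it later:

#include <linux/fs.h>

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	save_mount_options(sb, data);	/* keep a copy for ->show_options */
	/* ... usual superblock setup elided ... */
	return 0;
}

static const struct super_operations examplefs_sops = {
	.show_options	= generic_show_options,
};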
1164 1164
1165 void replace_mount_options(struct super_block *sb, char *options) 1165 void replace_mount_options(struct super_block *sb, char *options)
1166 { 1166 {
1167 char *old = sb->s_options; 1167 char *old = sb->s_options;
1168 rcu_assign_pointer(sb->s_options, options); 1168 rcu_assign_pointer(sb->s_options, options);
1169 if (old) { 1169 if (old) {
1170 synchronize_rcu(); 1170 synchronize_rcu();
1171 kfree(old); 1171 kfree(old);
1172 } 1172 }
1173 } 1173 }
1174 EXPORT_SYMBOL(replace_mount_options); 1174 EXPORT_SYMBOL(replace_mount_options);
1175 1175
1176 #ifdef CONFIG_PROC_FS 1176 #ifdef CONFIG_PROC_FS
1177 /* iterator; we want it to have access to namespace_sem, thus here... */ 1177 /* iterator; we want it to have access to namespace_sem, thus here... */
1178 static void *m_start(struct seq_file *m, loff_t *pos) 1178 static void *m_start(struct seq_file *m, loff_t *pos)
1179 { 1179 {
1180 struct proc_mounts *p = proc_mounts(m); 1180 struct proc_mounts *p = proc_mounts(m);
1181 1181
1182 down_read(&namespace_sem); 1182 down_read(&namespace_sem);
1183 if (p->cached_event == p->ns->event) { 1183 if (p->cached_event == p->ns->event) {
1184 void *v = p->cached_mount; 1184 void *v = p->cached_mount;
1185 if (*pos == p->cached_index) 1185 if (*pos == p->cached_index)
1186 return v; 1186 return v;
1187 if (*pos == p->cached_index + 1) { 1187 if (*pos == p->cached_index + 1) {
1188 v = seq_list_next(v, &p->ns->list, &p->cached_index); 1188 v = seq_list_next(v, &p->ns->list, &p->cached_index);
1189 return p->cached_mount = v; 1189 return p->cached_mount = v;
1190 } 1190 }
1191 } 1191 }
1192 1192
1193 p->cached_event = p->ns->event; 1193 p->cached_event = p->ns->event;
1194 p->cached_mount = seq_list_start(&p->ns->list, *pos); 1194 p->cached_mount = seq_list_start(&p->ns->list, *pos);
1195 p->cached_index = *pos; 1195 p->cached_index = *pos;
1196 return p->cached_mount; 1196 return p->cached_mount;
1197 } 1197 }
1198 1198
1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1200 { 1200 {
1201 struct proc_mounts *p = proc_mounts(m); 1201 struct proc_mounts *p = proc_mounts(m);
1202 1202
1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos); 1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1204 p->cached_index = *pos; 1204 p->cached_index = *pos;
1205 return p->cached_mount; 1205 return p->cached_mount;
1206 } 1206 }
1207 1207
1208 static void m_stop(struct seq_file *m, void *v) 1208 static void m_stop(struct seq_file *m, void *v)
1209 { 1209 {
1210 up_read(&namespace_sem); 1210 up_read(&namespace_sem);
1211 } 1211 }
1212 1212
1213 static int m_show(struct seq_file *m, void *v) 1213 static int m_show(struct seq_file *m, void *v)
1214 { 1214 {
1215 struct proc_mounts *p = proc_mounts(m); 1215 struct proc_mounts *p = proc_mounts(m);
1216 struct mount *r = list_entry(v, struct mount, mnt_list); 1216 struct mount *r = list_entry(v, struct mount, mnt_list);
1217 return p->show(m, &r->mnt); 1217 return p->show(m, &r->mnt);
1218 } 1218 }
1219 1219
1220 const struct seq_operations mounts_op = { 1220 const struct seq_operations mounts_op = {
1221 .start = m_start, 1221 .start = m_start,
1222 .next = m_next, 1222 .next = m_next,
1223 .stop = m_stop, 1223 .stop = m_stop,
1224 .show = m_show, 1224 .show = m_show,
1225 }; 1225 };
1226 #endif /* CONFIG_PROC_FS */ 1226 #endif /* CONFIG_PROC_FS */
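The cached_mount/cached_index fast path in m_start() exists because seq_file re-enters ->start at the current position on every read(2); without the cache each entry would be found by walking the namespace list from its head, making a chunked read of a large mount table quadratic overall. A small userspace illustration (buffer size arbitrary):

#include <fcntl.h>
#include <unistd.h>

/* Each read(2) past the seq_file buffer re-enters m_start() with the
 * next position; the cached index above lets it resume in O(1). */
static void dump_mounts(void)
{
	char buf[256];
	ssize_t n;
	int fd = open("/proc/self/mounts", O_RDONLY);

	if (fd < 0)
		return;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		write(STDOUT_FILENO, buf, n);
	close(fd);
}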
1227 1227
1228 /** 1228 /**
1229 * may_umount_tree - check if a mount tree is busy 1229 * may_umount_tree - check if a mount tree is busy
1230 * @mnt: root of mount tree 1230 * @mnt: root of mount tree
1231 * 1231 *
1232 * This is called to check if a tree of mounts has any 1232 * This is called to check if a tree of mounts has any
1233 * open files, pwds, chroots or sub mounts that are 1233 * open files, pwds, chroots or sub mounts that are
1234 * busy. 1234 * busy.
1235 */ 1235 */
1236 int may_umount_tree(struct vfsmount *m) 1236 int may_umount_tree(struct vfsmount *m)
1237 { 1237 {
1238 struct mount *mnt = real_mount(m); 1238 struct mount *mnt = real_mount(m);
1239 int actual_refs = 0; 1239 int actual_refs = 0;
1240 int minimum_refs = 0; 1240 int minimum_refs = 0;
1241 struct mount *p; 1241 struct mount *p;
1242 BUG_ON(!m); 1242 BUG_ON(!m);
1243 1243
1244 /* write lock needed for mnt_get_count */ 1244 /* write lock needed for mnt_get_count */
1245 lock_mount_hash(); 1245 lock_mount_hash();
1246 for (p = mnt; p; p = next_mnt(p, mnt)) { 1246 for (p = mnt; p; p = next_mnt(p, mnt)) {
1247 actual_refs += mnt_get_count(p); 1247 actual_refs += mnt_get_count(p);
1248 minimum_refs += 2; 1248 minimum_refs += 2;
1249 } 1249 }
1250 unlock_mount_hash(); 1250 unlock_mount_hash();
1251 1251
1252 if (actual_refs > minimum_refs) 1252 if (actual_refs > minimum_refs)
1253 return 0; 1253 return 0;
1254 1254
1255 return 1; 1255 return 1;
1256 } 1256 }
1257 1257
1258 EXPORT_SYMBOL(may_umount_tree); 1258 EXPORT_SYMBOL(may_umount_tree);
1259 1259
1260 /** 1260 /**
1261 * may_umount - check if a mount point is busy 1261 * may_umount - check if a mount point is busy
1262 * @mnt: root of mount 1262 * @mnt: root of mount
1263 * 1263 *
1264 * This is called to check if a mount point has any 1264 * This is called to check if a mount point has any
1265 * open files, pwds, chroots or sub mounts. If the 1265 * open files, pwds, chroots or sub mounts. If the
1266 * mount has sub mounts this will return busy 1266 * mount has sub mounts this will return busy
1267 * regardless of whether the sub mounts are busy. 1267 * regardless of whether the sub mounts are busy.
1268 * 1268 *
1269 * Doesn't take quota and stuff into account. IOW, in some cases it will 1269 * Doesn't take quota and stuff into account. IOW, in some cases it will
1270 * give false negatives. The main reason why it's here is that we need 1270 * give false negatives. The main reason why it's here is that we need
1271 * a non-destructive way to look for easily umountable filesystems. 1271 * a non-destructive way to look for easily umountable filesystems.
1272 */ 1272 */
1273 int may_umount(struct vfsmount *mnt) 1273 int may_umount(struct vfsmount *mnt)
1274 { 1274 {
1275 int ret = 1; 1275 int ret = 1;
1276 down_read(&namespace_sem); 1276 down_read(&namespace_sem);
1277 lock_mount_hash(); 1277 lock_mount_hash();
1278 if (propagate_mount_busy(real_mount(mnt), 2)) 1278 if (propagate_mount_busy(real_mount(mnt), 2))
1279 ret = 0; 1279 ret = 0;
1280 unlock_mount_hash(); 1280 unlock_mount_hash();
1281 up_read(&namespace_sem); 1281 up_read(&namespace_sem);
1282 return ret; 1282 return ret;
1283 } 1283 }
1284 1284
1285 EXPORT_SYMBOL(may_umount); 1285 EXPORT_SYMBOL(may_umount);
1286 1286
1287 static HLIST_HEAD(unmounted); /* protected by namespace_sem */ 1287 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1288 1288
1289 static void namespace_unlock(void) 1289 static void namespace_unlock(void)
1290 { 1290 {
1291 struct mount *mnt; 1291 struct mount *mnt;
1292 struct hlist_head head = unmounted; 1292 struct hlist_head head = unmounted;
1293 1293
1294 if (likely(hlist_empty(&head))) { 1294 if (likely(hlist_empty(&head))) {
1295 up_write(&namespace_sem); 1295 up_write(&namespace_sem);
1296 return; 1296 return;
1297 } 1297 }
1298 1298
1299 head.first->pprev = &head.first; 1299 head.first->pprev = &head.first;
1300 INIT_HLIST_HEAD(&unmounted); 1300 INIT_HLIST_HEAD(&unmounted);
1301 1301
1302 /* undo decrements we'd done in umount_tree() */ 1302 /* undo decrements we'd done in umount_tree() */
1303 hlist_for_each_entry(mnt, &head, mnt_hash) 1303 hlist_for_each_entry(mnt, &head, mnt_hash)
1304 if (mnt->mnt_ex_mountpoint.mnt) 1304 if (mnt->mnt_ex_mountpoint.mnt)
1305 mntget(mnt->mnt_ex_mountpoint.mnt); 1305 mntget(mnt->mnt_ex_mountpoint.mnt);
1306 1306
1307 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1308 1308
1309 synchronize_rcu(); 1309 synchronize_rcu();
1310 1310
1311 while (!hlist_empty(&head)) { 1311 while (!hlist_empty(&head)) {
1312 mnt = hlist_entry(head.first, struct mount, mnt_hash); 1312 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1313 hlist_del_init(&mnt->mnt_hash); 1313 hlist_del_init(&mnt->mnt_hash);
1314 if (mnt->mnt_ex_mountpoint.mnt) 1314 if (mnt->mnt_ex_mountpoint.mnt)
1315 path_put(&mnt->mnt_ex_mountpoint); 1315 path_put(&mnt->mnt_ex_mountpoint);
1316 mntput(&mnt->mnt); 1316 mntput(&mnt->mnt);
1317 } 1317 }
1318 } 1318 }
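namespace_unlock() above moves the whole unmounted list into a local head by structure assignment, which is why the head.first->pprev fixup is needed: a plain copy leaves the first node's pprev aimed at the old global head, so the later hlist_del_init() would write through a stale pointer. A minimal sketch of the same move as a helper (hypothetical; <linux/list.h> provides no such primitive here):

#include <linux/list.h>

/* Hypothetical hlist_move_head(): transplant an entire hlist onto a
 * new head. Safe in this context only because namespace_sem excludes
 * concurrent writers while the move happens. */
static inline void hlist_move_head(struct hlist_head *old,
				   struct hlist_head *new)
{
	new->first = old->first;
	if (new->first)
		new->first->pprev = &new->first; /* re-aim pprev at the copy */
	INIT_HLIST_HEAD(old);
}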
1319 1319
1320 static inline void namespace_lock(void) 1320 static inline void namespace_lock(void)
1321 { 1321 {
1322 down_write(&namespace_sem); 1322 down_write(&namespace_sem);
1323 } 1323 }
1324 1324
1325 /* 1325 /*
1326 * mount_lock must be held 1326 * mount_lock must be held
1327 * namespace_sem must be held for write 1327 * namespace_sem must be held for write
1328 * how = 0 => just this tree, don't propagate 1328 * how = 0 => just this tree, don't propagate
1329  * how = 1 => propagate; we know that nobody else has a reference to any victims 1331  * how = 1 => propagate; we know that nobody else has a reference to any victims
1330 * how = 2 => lazy umount 1330 * how = 2 => lazy umount
1331 */ 1331 */
1332 void umount_tree(struct mount *mnt, int how) 1332 void umount_tree(struct mount *mnt, int how)
1333 { 1333 {
1334 HLIST_HEAD(tmp_list); 1334 HLIST_HEAD(tmp_list);
1335 struct mount *p; 1335 struct mount *p;
1336 struct mount *last = NULL; 1336 struct mount *last = NULL;
1337 1337
1338 for (p = mnt; p; p = next_mnt(p, mnt)) { 1338 for (p = mnt; p; p = next_mnt(p, mnt)) {
1339 hlist_del_init_rcu(&p->mnt_hash); 1339 hlist_del_init_rcu(&p->mnt_hash);
1340 hlist_add_head(&p->mnt_hash, &tmp_list); 1340 hlist_add_head(&p->mnt_hash, &tmp_list);
1341 } 1341 }
1342 1342
1343 hlist_for_each_entry(p, &tmp_list, mnt_hash) 1343 hlist_for_each_entry(p, &tmp_list, mnt_hash)
1344 list_del_init(&p->mnt_child); 1344 list_del_init(&p->mnt_child);
1345 1345
1346 if (how) 1346 if (how)
1347 propagate_umount(&tmp_list); 1347 propagate_umount(&tmp_list);
1348 1348
1349 hlist_for_each_entry(p, &tmp_list, mnt_hash) { 1349 hlist_for_each_entry(p, &tmp_list, mnt_hash) {
1350 list_del_init(&p->mnt_expire); 1350 list_del_init(&p->mnt_expire);
1351 list_del_init(&p->mnt_list); 1351 list_del_init(&p->mnt_list);
1352 __touch_mnt_namespace(p->mnt_ns); 1352 __touch_mnt_namespace(p->mnt_ns);
1353 p->mnt_ns = NULL; 1353 p->mnt_ns = NULL;
1354 if (how < 2) 1354 if (how < 2)
1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1356 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1357 hlist_del_init(&p->mnt_mp_list); 1357 hlist_del_init(&p->mnt_mp_list);
1358 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1359 mnt_add_count(p->mnt_parent, -1); 1359 mnt_add_count(p->mnt_parent, -1);
1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */
1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; 1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1363 p->mnt_mountpoint = p->mnt.mnt_root; 1363 p->mnt_mountpoint = p->mnt.mnt_root;
1364 p->mnt_parent = p; 1364 p->mnt_parent = p;
1365 p->mnt_mp = NULL; 1365 p->mnt_mp = NULL;
1366 } 1366 }
1367 change_mnt_propagation(p, MS_PRIVATE); 1367 change_mnt_propagation(p, MS_PRIVATE);
1368 last = p; 1368 last = p;
1369 } 1369 }
1370 if (last) { 1370 if (last) {
1371 last->mnt_hash.next = unmounted.first; 1371 last->mnt_hash.next = unmounted.first;
1372 if (unmounted.first)
1373 unmounted.first->pprev = &last->mnt_hash.next;
1372 unmounted.first = tmp_list.first; 1374 unmounted.first = tmp_list.first;
1373 unmounted.first->pprev = &unmounted.first; 1375 unmounted.first->pprev = &unmounted.first;
1374 } 1376 }
1375 } 1377 }
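The two lines added at the end of umount_tree() are the substance of this commit: when tmp_list is prepended to a non-empty unmounted list, the former first entry's pprev must be redirected to &last->mnt_hash.next, otherwise a later hlist_del_init() on that entry writes through the stale pprev and stomps unmounted.first. The open-coded splice could be expressed as a generic helper along these lines (hypothetical; hlists have no splice primitive at this point):

#include <linux/list.h>

/* Hypothetical hlist_splice(): prepend every entry of @from onto @to,
 * performing both pprev fixups that the open-coded version above does. */
static inline void hlist_splice(struct hlist_head *from,
				struct hlist_head *to)
{
	struct hlist_node *last = from->first;

	if (!last)
		return;
	while (last->next)
		last = last->next;
	last->next = to->first;
	if (to->first)
		to->first->pprev = &last->next;	/* the line this commit adds */
	to->first = from->first;
	to->first->pprev = &to->first;
	INIT_HLIST_HEAD(from);
}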
1376 1378
1377 static void shrink_submounts(struct mount *mnt); 1379 static void shrink_submounts(struct mount *mnt);
1378 1380
1379 static int do_umount(struct mount *mnt, int flags) 1381 static int do_umount(struct mount *mnt, int flags)
1380 { 1382 {
1381 struct super_block *sb = mnt->mnt.mnt_sb; 1383 struct super_block *sb = mnt->mnt.mnt_sb;
1382 int retval; 1384 int retval;
1383 1385
1384 retval = security_sb_umount(&mnt->mnt, flags); 1386 retval = security_sb_umount(&mnt->mnt, flags);
1385 if (retval) 1387 if (retval)
1386 return retval; 1388 return retval;
1387 1389
1388 /* 1390 /*
1389 * Allow userspace to request a mountpoint be expired rather than 1391 * Allow userspace to request a mountpoint be expired rather than
1390 * unmounting unconditionally. Unmount only happens if: 1392 * unmounting unconditionally. Unmount only happens if:
1391 * (1) the mark is already set (the mark is cleared by mntput()) 1393 * (1) the mark is already set (the mark is cleared by mntput())
1392 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] 1394 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1393 */ 1395 */
1394 if (flags & MNT_EXPIRE) { 1396 if (flags & MNT_EXPIRE) {
1395 if (&mnt->mnt == current->fs->root.mnt || 1397 if (&mnt->mnt == current->fs->root.mnt ||
1396 flags & (MNT_FORCE | MNT_DETACH)) 1398 flags & (MNT_FORCE | MNT_DETACH))
1397 return -EINVAL; 1399 return -EINVAL;
1398 1400
1399 /* 1401 /*
1400 * probably don't strictly need the lock here if we examined 1402 * probably don't strictly need the lock here if we examined
1401 * all race cases, but it's a slowpath. 1403 * all race cases, but it's a slowpath.
1402 */ 1404 */
1403 lock_mount_hash(); 1405 lock_mount_hash();
1404 if (mnt_get_count(mnt) != 2) { 1406 if (mnt_get_count(mnt) != 2) {
1405 unlock_mount_hash(); 1407 unlock_mount_hash();
1406 return -EBUSY; 1408 return -EBUSY;
1407 } 1409 }
1408 unlock_mount_hash(); 1410 unlock_mount_hash();
1409 1411
1410 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1412 if (!xchg(&mnt->mnt_expiry_mark, 1))
1411 return -EAGAIN; 1413 return -EAGAIN;
1412 } 1414 }
1413 1415
1414 /* 1416 /*
1415 * If we may have to abort operations to get out of this 1417 * If we may have to abort operations to get out of this
1416 * mount, and they will themselves hold resources we must 1418 * mount, and they will themselves hold resources we must
1417 * allow the fs to do things. In the Unix tradition of 1419 * allow the fs to do things. In the Unix tradition of
1418  * 'Gee, that's tricky, let's do it in userspace' the umount_begin 1420  * 'Gee, that's tricky, let's do it in userspace' the umount_begin
1419 * might fail to complete on the first run through as other tasks 1421 * might fail to complete on the first run through as other tasks
1420  * must return, and the like. That's for the mount program to worry 1422  * must return, and the like. That's for the mount program to worry
1421 * about for the moment. 1423 * about for the moment.
1422 */ 1424 */
1423 1425
1424 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1426 if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1425 sb->s_op->umount_begin(sb); 1427 sb->s_op->umount_begin(sb);
1426 } 1428 }
1427 1429
1428 /* 1430 /*
1429 * No sense to grab the lock for this test, but test itself looks 1431 * No sense to grab the lock for this test, but test itself looks
1430 * somewhat bogus. Suggestions for better replacement? 1432 * somewhat bogus. Suggestions for better replacement?
1431 * Ho-hum... In principle, we might treat that as umount + switch 1433 * Ho-hum... In principle, we might treat that as umount + switch
1432 * to rootfs. GC would eventually take care of the old vfsmount. 1434 * to rootfs. GC would eventually take care of the old vfsmount.
1433 * Actually it makes sense, especially if rootfs would contain a 1435 * Actually it makes sense, especially if rootfs would contain a
1434 * /reboot - static binary that would close all descriptors and 1436 * /reboot - static binary that would close all descriptors and
1435  * call reboot(2). Then init(8) could umount root and exec /reboot. 1437  * call reboot(2). Then init(8) could umount root and exec /reboot.
1436 */ 1438 */
1437 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { 1439 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1438 /* 1440 /*
1439 * Special case for "unmounting" root ... 1441 * Special case for "unmounting" root ...
1440 * we just try to remount it readonly. 1442 * we just try to remount it readonly.
1441 */ 1443 */
1442 if (!capable(CAP_SYS_ADMIN)) 1444 if (!capable(CAP_SYS_ADMIN))
1443 return -EPERM; 1445 return -EPERM;
1444 down_write(&sb->s_umount); 1446 down_write(&sb->s_umount);
1445 if (!(sb->s_flags & MS_RDONLY)) 1447 if (!(sb->s_flags & MS_RDONLY))
1446 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1448 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
1447 up_write(&sb->s_umount); 1449 up_write(&sb->s_umount);
1448 return retval; 1450 return retval;
1449 } 1451 }
1450 1452
1451 namespace_lock(); 1453 namespace_lock();
1452 lock_mount_hash(); 1454 lock_mount_hash();
1453 event++; 1455 event++;
1454 1456
1455 if (flags & MNT_DETACH) { 1457 if (flags & MNT_DETACH) {
1456 if (!list_empty(&mnt->mnt_list)) 1458 if (!list_empty(&mnt->mnt_list))
1457 umount_tree(mnt, 2); 1459 umount_tree(mnt, 2);
1458 retval = 0; 1460 retval = 0;
1459 } else { 1461 } else {
1460 shrink_submounts(mnt); 1462 shrink_submounts(mnt);
1461 retval = -EBUSY; 1463 retval = -EBUSY;
1462 if (!propagate_mount_busy(mnt, 2)) { 1464 if (!propagate_mount_busy(mnt, 2)) {
1463 if (!list_empty(&mnt->mnt_list)) 1465 if (!list_empty(&mnt->mnt_list))
1464 umount_tree(mnt, 1); 1466 umount_tree(mnt, 1);
1465 retval = 0; 1467 retval = 0;
1466 } 1468 }
1467 } 1469 }
1468 unlock_mount_hash(); 1470 unlock_mount_hash();
1469 namespace_unlock(); 1471 namespace_unlock();
1470 return retval; 1472 return retval;
1471 } 1473 }
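The MNT_EXPIRE branch of do_umount() implements a two-pass protocol for userspace expiry daemons: the first umount2() call only arms the expiry mark and fails with EAGAIN, any intervening use of the mount clears the mark via mntput(), and a later call unmounts only if the mark survived untouched. A minimal userspace sketch under those semantics (the mountpoint path is a placeholder):

#include <sys/mount.h>
#include <errno.h>

/* Returns 0 if the mount was unmounted, 1 if the expiry mark was just
 * armed (retry on the next cycle), -1 on real errors such as EBUSY. */
static int expire_pass(const char *mountpoint)
{
	if (umount2(mountpoint, MNT_EXPIRE) == 0)
		return 0;	/* unused since the previous pass: gone now */
	if (errno == EAGAIN)
		return 1;	/* mark set; check again on the next cycle */
	return -1;
}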
1472 1474
1473 /* 1475 /*
1474 * __detach_mounts - lazily unmount all mounts on the specified dentry 1476 * __detach_mounts - lazily unmount all mounts on the specified dentry
1475 * 1477 *
1476  * During unlink, rmdir, and d_drop it is possible to lose the path 1478  * During unlink, rmdir, and d_drop it is possible to lose the path
1477 * to an existing mountpoint, and wind up leaking the mount. 1479 * to an existing mountpoint, and wind up leaking the mount.
1478 * detach_mounts allows lazily unmounting those mounts instead of 1480 * detach_mounts allows lazily unmounting those mounts instead of
1479 * leaking them. 1481 * leaking them.
1480 * 1482 *
1481 * The caller may hold dentry->d_inode->i_mutex. 1483 * The caller may hold dentry->d_inode->i_mutex.
1482 */ 1484 */
1483 void __detach_mounts(struct dentry *dentry) 1485 void __detach_mounts(struct dentry *dentry)
1484 { 1486 {
1485 struct mountpoint *mp; 1487 struct mountpoint *mp;
1486 struct mount *mnt; 1488 struct mount *mnt;
1487 1489
1488 namespace_lock(); 1490 namespace_lock();
1489 mp = lookup_mountpoint(dentry); 1491 mp = lookup_mountpoint(dentry);
1490 if (!mp) 1492 if (!mp)
1491 goto out_unlock; 1493 goto out_unlock;
1492 1494
1493 lock_mount_hash(); 1495 lock_mount_hash();
1494 while (!hlist_empty(&mp->m_list)) { 1496 while (!hlist_empty(&mp->m_list)) {
1495 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); 1497 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1496 umount_tree(mnt, 2); 1498 umount_tree(mnt, 2);
1497 } 1499 }
1498 unlock_mount_hash(); 1500 unlock_mount_hash();
1499 put_mountpoint(mp); 1501 put_mountpoint(mp);
1500 out_unlock: 1502 out_unlock:
1501 namespace_unlock(); 1503 namespace_unlock();
1502 } 1504 }
1503 1505
1504 /* 1506 /*
1505 * Is the caller allowed to modify his namespace? 1507 * Is the caller allowed to modify his namespace?
1506 */ 1508 */
1507 static inline bool may_mount(void) 1509 static inline bool may_mount(void)
1508 { 1510 {
1509 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); 1511 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1510 } 1512 }
1511 1513
1512 /* 1514 /*
1513 * Now umount can handle mount points as well as block devices. 1515 * Now umount can handle mount points as well as block devices.
1514 * This is important for filesystems which use unnamed block devices. 1516 * This is important for filesystems which use unnamed block devices.
1515 * 1517 *
1516 * We now support a flag for forced unmount like the other 'big iron' 1518 * We now support a flag for forced unmount like the other 'big iron'
1517 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD 1519 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1518 */ 1520 */
1519 1521
1520 SYSCALL_DEFINE2(umount, char __user *, name, int, flags) 1522 SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1521 { 1523 {
1522 struct path path; 1524 struct path path;
1523 struct mount *mnt; 1525 struct mount *mnt;
1524 int retval; 1526 int retval;
1525 int lookup_flags = 0; 1527 int lookup_flags = 0;
1526 1528
1527 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) 1529 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1528 return -EINVAL; 1530 return -EINVAL;
1529 1531
1530 if (!may_mount()) 1532 if (!may_mount())
1531 return -EPERM; 1533 return -EPERM;
1532 1534
1533 if (!(flags & UMOUNT_NOFOLLOW)) 1535 if (!(flags & UMOUNT_NOFOLLOW))
1534 lookup_flags |= LOOKUP_FOLLOW; 1536 lookup_flags |= LOOKUP_FOLLOW;
1535 1537
1536 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); 1538 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1537 if (retval) 1539 if (retval)
1538 goto out; 1540 goto out;
1539 mnt = real_mount(path.mnt); 1541 mnt = real_mount(path.mnt);
1540 retval = -EINVAL; 1542 retval = -EINVAL;
1541 if (path.dentry != path.mnt->mnt_root) 1543 if (path.dentry != path.mnt->mnt_root)
1542 goto dput_and_out; 1544 goto dput_and_out;
1543 if (!check_mnt(mnt)) 1545 if (!check_mnt(mnt))
1544 goto dput_and_out; 1546 goto dput_and_out;
1545 if (mnt->mnt.mnt_flags & MNT_LOCKED) 1547 if (mnt->mnt.mnt_flags & MNT_LOCKED)
1546 goto dput_and_out; 1548 goto dput_and_out;
1547 1549
1548 retval = do_umount(mnt, flags); 1550 retval = do_umount(mnt, flags);
1549 dput_and_out: 1551 dput_and_out:
1550 /* we mustn't call path_put() as that would clear mnt_expiry_mark */ 1552 /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1551 dput(path.dentry); 1553 dput(path.dentry);
1552 mntput_no_expire(mnt); 1554 mntput_no_expire(mnt);
1553 out: 1555 out:
1554 return retval; 1556 return retval;
1555 } 1557 }
1556 1558
1557 #ifdef __ARCH_WANT_SYS_OLDUMOUNT 1559 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1558 1560
1559 /* 1561 /*
1560 * The 2.0 compatible umount. No flags. 1562 * The 2.0 compatible umount. No flags.
1561 */ 1563 */
1562 SYSCALL_DEFINE1(oldumount, char __user *, name) 1564 SYSCALL_DEFINE1(oldumount, char __user *, name)
1563 { 1565 {
1564 return sys_umount(name, 0); 1566 return sys_umount(name, 0);
1565 } 1567 }
1566 1568
1567 #endif 1569 #endif
1568 1570
1569 static bool is_mnt_ns_file(struct dentry *dentry) 1571 static bool is_mnt_ns_file(struct dentry *dentry)
1570 { 1572 {
1571 /* Is this a proxy for a mount namespace? */ 1573 /* Is this a proxy for a mount namespace? */
1572 struct inode *inode = dentry->d_inode; 1574 struct inode *inode = dentry->d_inode;
1573 struct proc_ns *ei; 1575 struct proc_ns *ei;
1574 1576
1575 if (!proc_ns_inode(inode)) 1577 if (!proc_ns_inode(inode))
1576 return false; 1578 return false;
1577 1579
1578 ei = get_proc_ns(inode); 1580 ei = get_proc_ns(inode);
1579 if (ei->ns_ops != &mntns_operations) 1581 if (ei->ns_ops != &mntns_operations)
1580 return false; 1582 return false;
1581 1583
1582 return true; 1584 return true;
1583 } 1585 }
1584 1586
1585 static bool mnt_ns_loop(struct dentry *dentry) 1587 static bool mnt_ns_loop(struct dentry *dentry)
1586 { 1588 {
1587 /* Could bind mounting the mount namespace inode cause a 1589 /* Could bind mounting the mount namespace inode cause a
1588 * mount namespace loop? 1590 * mount namespace loop?
1589 */ 1591 */
1590 struct mnt_namespace *mnt_ns; 1592 struct mnt_namespace *mnt_ns;
1591 if (!is_mnt_ns_file(dentry)) 1593 if (!is_mnt_ns_file(dentry))
1592 return false; 1594 return false;
1593 1595
1594 mnt_ns = get_proc_ns(dentry->d_inode)->ns; 1596 mnt_ns = get_proc_ns(dentry->d_inode)->ns;
1595 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1597 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1596 } 1598 }
1597 1599
1598 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1600 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1599 int flag) 1601 int flag)
1600 { 1602 {
1601 struct mount *res, *p, *q, *r, *parent; 1603 struct mount *res, *p, *q, *r, *parent;
1602 1604
1603 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) 1605 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1604 return ERR_PTR(-EINVAL); 1606 return ERR_PTR(-EINVAL);
1605 1607
1606 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) 1608 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1607 return ERR_PTR(-EINVAL); 1609 return ERR_PTR(-EINVAL);
1608 1610
1609 res = q = clone_mnt(mnt, dentry, flag); 1611 res = q = clone_mnt(mnt, dentry, flag);
1610 if (IS_ERR(q)) 1612 if (IS_ERR(q))
1611 return q; 1613 return q;
1612 1614
1613 q->mnt.mnt_flags &= ~MNT_LOCKED; 1615 q->mnt.mnt_flags &= ~MNT_LOCKED;
1614 q->mnt_mountpoint = mnt->mnt_mountpoint; 1616 q->mnt_mountpoint = mnt->mnt_mountpoint;
1615 1617
1616 p = mnt; 1618 p = mnt;
1617 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { 1619 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1618 struct mount *s; 1620 struct mount *s;
1619 if (!is_subdir(r->mnt_mountpoint, dentry)) 1621 if (!is_subdir(r->mnt_mountpoint, dentry))
1620 continue; 1622 continue;
1621 1623
1622 for (s = r; s; s = next_mnt(s, r)) { 1624 for (s = r; s; s = next_mnt(s, r)) {
1623 struct mount *t = NULL; 1625 struct mount *t = NULL;
1624 if (!(flag & CL_COPY_UNBINDABLE) && 1626 if (!(flag & CL_COPY_UNBINDABLE) &&
1625 IS_MNT_UNBINDABLE(s)) { 1627 IS_MNT_UNBINDABLE(s)) {
1626 s = skip_mnt_tree(s); 1628 s = skip_mnt_tree(s);
1627 continue; 1629 continue;
1628 } 1630 }
1629 if (!(flag & CL_COPY_MNT_NS_FILE) && 1631 if (!(flag & CL_COPY_MNT_NS_FILE) &&
1630 is_mnt_ns_file(s->mnt.mnt_root)) { 1632 is_mnt_ns_file(s->mnt.mnt_root)) {
1631 s = skip_mnt_tree(s); 1633 s = skip_mnt_tree(s);
1632 continue; 1634 continue;
1633 } 1635 }
1634 while (p != s->mnt_parent) { 1636 while (p != s->mnt_parent) {
1635 p = p->mnt_parent; 1637 p = p->mnt_parent;
1636 q = q->mnt_parent; 1638 q = q->mnt_parent;
1637 } 1639 }
1638 p = s; 1640 p = s;
1639 parent = q; 1641 parent = q;
1640 q = clone_mnt(p, p->mnt.mnt_root, flag); 1642 q = clone_mnt(p, p->mnt.mnt_root, flag);
1641 if (IS_ERR(q)) 1643 if (IS_ERR(q))
1642 goto out; 1644 goto out;
1643 lock_mount_hash(); 1645 lock_mount_hash();
1644 list_add_tail(&q->mnt_list, &res->mnt_list); 1646 list_add_tail(&q->mnt_list, &res->mnt_list);
1645 mnt_set_mountpoint(parent, p->mnt_mp, q); 1647 mnt_set_mountpoint(parent, p->mnt_mp, q);
1646 if (!list_empty(&parent->mnt_mounts)) { 1648 if (!list_empty(&parent->mnt_mounts)) {
1647 t = list_last_entry(&parent->mnt_mounts, 1649 t = list_last_entry(&parent->mnt_mounts,
1648 struct mount, mnt_child); 1650 struct mount, mnt_child);
1649 if (t->mnt_mp != p->mnt_mp) 1651 if (t->mnt_mp != p->mnt_mp)
1650 t = NULL; 1652 t = NULL;
1651 } 1653 }
1652 attach_shadowed(q, parent, t); 1654 attach_shadowed(q, parent, t);
1653 unlock_mount_hash(); 1655 unlock_mount_hash();
1654 } 1656 }
1655 } 1657 }
1656 return res; 1658 return res;
1657 out: 1659 out:
1658 if (res) { 1660 if (res) {
1659 lock_mount_hash(); 1661 lock_mount_hash();
1660 umount_tree(res, 0); 1662 umount_tree(res, 0);
1661 unlock_mount_hash(); 1663 unlock_mount_hash();
1662 } 1664 }
1663 return q; 1665 return q;
1664 } 1666 }
1665 1667
1666 /* Caller should check returned pointer for errors */ 1668 /* Caller should check returned pointer for errors */
1667 1669
1668 struct vfsmount *collect_mounts(struct path *path) 1670 struct vfsmount *collect_mounts(struct path *path)
1669 { 1671 {
1670 struct mount *tree; 1672 struct mount *tree;
1671 namespace_lock(); 1673 namespace_lock();
1672 tree = copy_tree(real_mount(path->mnt), path->dentry, 1674 tree = copy_tree(real_mount(path->mnt), path->dentry,
1673 CL_COPY_ALL | CL_PRIVATE); 1675 CL_COPY_ALL | CL_PRIVATE);
1674 namespace_unlock(); 1676 namespace_unlock();
1675 if (IS_ERR(tree)) 1677 if (IS_ERR(tree))
1676 return ERR_CAST(tree); 1678 return ERR_CAST(tree);
1677 return &tree->mnt; 1679 return &tree->mnt;
1678 } 1680 }
1679 1681
1680 void drop_collected_mounts(struct vfsmount *mnt) 1682 void drop_collected_mounts(struct vfsmount *mnt)
1681 { 1683 {
1682 namespace_lock(); 1684 namespace_lock();
1683 lock_mount_hash(); 1685 lock_mount_hash();
1684 umount_tree(real_mount(mnt), 0); 1686 umount_tree(real_mount(mnt), 0);
1685 unlock_mount_hash(); 1687 unlock_mount_hash();
1686 namespace_unlock(); 1688 namespace_unlock();
1687 } 1689 }
1688 1690
1689 /** 1691 /**
1690 * clone_private_mount - create a private clone of a path 1692 * clone_private_mount - create a private clone of a path
1691 * 1693 *
1692 * This creates a new vfsmount, which will be the clone of @path. The new will 1694 * This creates a new vfsmount, which will be the clone of @path. The new will
1693 * not be attached anywhere in the namespace and will be private (i.e. changes 1695 * not be attached anywhere in the namespace and will be private (i.e. changes
1694 * to the originating mount won't be propagated into this). 1696 * to the originating mount won't be propagated into this).
1695 * 1697 *
1696 * Release with mntput(). 1698 * Release with mntput().
1697 */ 1699 */
1698 struct vfsmount *clone_private_mount(struct path *path) 1700 struct vfsmount *clone_private_mount(struct path *path)
1699 { 1701 {
1700 struct mount *old_mnt = real_mount(path->mnt); 1702 struct mount *old_mnt = real_mount(path->mnt);
1701 struct mount *new_mnt; 1703 struct mount *new_mnt;
1702 1704
1703 if (IS_MNT_UNBINDABLE(old_mnt)) 1705 if (IS_MNT_UNBINDABLE(old_mnt))
1704 return ERR_PTR(-EINVAL); 1706 return ERR_PTR(-EINVAL);
1705 1707
1706 down_read(&namespace_sem); 1708 down_read(&namespace_sem);
1707 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); 1709 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
1708 up_read(&namespace_sem); 1710 up_read(&namespace_sem);
1709 if (IS_ERR(new_mnt)) 1711 if (IS_ERR(new_mnt))
1710 return ERR_CAST(new_mnt); 1712 return ERR_CAST(new_mnt);
1711 1713
1712 return &new_mnt->mnt; 1714 return &new_mnt->mnt;
1713 } 1715 }
1714 EXPORT_SYMBOL_GPL(clone_private_mount); 1716 EXPORT_SYMBOL_GPL(clone_private_mount);
1715 1717
1716 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, 1718 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1717 struct vfsmount *root) 1719 struct vfsmount *root)
1718 { 1720 {
1719 struct mount *mnt; 1721 struct mount *mnt;
1720 int res = f(root, arg); 1722 int res = f(root, arg);
1721 if (res) 1723 if (res)
1722 return res; 1724 return res;
1723 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { 1725 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
1724 res = f(&mnt->mnt, arg); 1726 res = f(&mnt->mnt, arg);
1725 if (res) 1727 if (res)
1726 return res; 1728 return res;
1727 } 1729 }
1728 return 0; 1730 return 0;
1729 } 1731 }
1730 1732
1731 static void cleanup_group_ids(struct mount *mnt, struct mount *end) 1733 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
1732 { 1734 {
1733 struct mount *p; 1735 struct mount *p;
1734 1736
1735 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 1737 for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1736 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 1738 if (p->mnt_group_id && !IS_MNT_SHARED(p))
1737 mnt_release_group_id(p); 1739 mnt_release_group_id(p);
1738 } 1740 }
1739 } 1741 }
1740 1742
1741 static int invent_group_ids(struct mount *mnt, bool recurse) 1743 static int invent_group_ids(struct mount *mnt, bool recurse)
1742 { 1744 {
1743 struct mount *p; 1745 struct mount *p;
1744 1746
1745 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { 1747 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
1746 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { 1748 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
1747 int err = mnt_alloc_group_id(p); 1749 int err = mnt_alloc_group_id(p);
1748 if (err) { 1750 if (err) {
1749 cleanup_group_ids(mnt, p); 1751 cleanup_group_ids(mnt, p);
1750 return err; 1752 return err;
1751 } 1753 }
1752 } 1754 }
1753 } 1755 }
1754 1756
1755 return 0; 1757 return 0;
1756 } 1758 }
1757 1759
1758 /* 1760 /*
1759 * @source_mnt : mount tree to be attached 1761 * @source_mnt : mount tree to be attached
1760 * @nd : place the mount tree @source_mnt is attached 1762 * @nd : place the mount tree @source_mnt is attached
1761 * @parent_nd : if non-null, detach the source_mnt from its parent and 1763 * @parent_nd : if non-null, detach the source_mnt from its parent and
1762 * store the parent mount and mountpoint dentry. 1764 * store the parent mount and mountpoint dentry.
1763 * (done when source_mnt is moved) 1765 * (done when source_mnt is moved)
1764 * 1766 *
1765  * NOTE: the table below explains the semantics when a source mount 1767  * NOTE: the table below explains the semantics when a source mount
1766 * of a given type is attached to a destination mount of a given type. 1768 * of a given type is attached to a destination mount of a given type.
1767 * --------------------------------------------------------------------------- 1769 * ---------------------------------------------------------------------------
1768 * | BIND MOUNT OPERATION | 1770 * | BIND MOUNT OPERATION |
1769 * |************************************************************************** 1771 * |**************************************************************************
1770 * | source-->| shared | private | slave | unbindable | 1772 * | source-->| shared | private | slave | unbindable |
1771 * | dest | | | | | 1773 * | dest | | | | |
1772 * | | | | | | | 1774 * | | | | | | |
1773 * | v | | | | | 1775 * | v | | | | |
1774 * |************************************************************************** 1776 * |**************************************************************************
1775 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 1777 * | shared | shared (++) | shared (+) | shared(+++)| invalid |
1776 * | | | | | | 1778 * | | | | | |
1777 * |non-shared| shared (+) | private | slave (*) | invalid | 1779 * |non-shared| shared (+) | private | slave (*) | invalid |
1778 * *************************************************************************** 1780 * ***************************************************************************
1779 * A bind operation clones the source mount and mounts the clone on the 1781 * A bind operation clones the source mount and mounts the clone on the
1780 * destination mount. 1782 * destination mount.
1781 * 1783 *
1782 * (++) the cloned mount is propagated to all the mounts in the propagation 1784 * (++) the cloned mount is propagated to all the mounts in the propagation
1783 * tree of the destination mount and the cloned mount is added to 1785 * tree of the destination mount and the cloned mount is added to
1784 * the peer group of the source mount. 1786 * the peer group of the source mount.
1785 * (+) the cloned mount is created under the destination mount and is marked 1787 * (+) the cloned mount is created under the destination mount and is marked
1786 * as shared. The cloned mount is added to the peer group of the source 1788 * as shared. The cloned mount is added to the peer group of the source
1787 * mount. 1789 * mount.
1788 * (+++) the mount is propagated to all the mounts in the propagation tree 1790 * (+++) the mount is propagated to all the mounts in the propagation tree
1789 * of the destination mount and the cloned mount is made slave 1791 * of the destination mount and the cloned mount is made slave
1790 * of the same master as that of the source mount. The cloned mount 1792 * of the same master as that of the source mount. The cloned mount
1791 * is marked as 'shared and slave'. 1793 * is marked as 'shared and slave'.
1792 * (*) the cloned mount is made a slave of the same master as that of the 1794 * (*) the cloned mount is made a slave of the same master as that of the
1793 * source mount. 1795 * source mount.
1794 * 1796 *
1795 * --------------------------------------------------------------------------- 1797 * ---------------------------------------------------------------------------
1796 * | MOVE MOUNT OPERATION | 1798 * | MOVE MOUNT OPERATION |
1797 * |************************************************************************** 1799 * |**************************************************************************
1798 * | source-->| shared | private | slave | unbindable | 1800 * | source-->| shared | private | slave | unbindable |
1799 * | dest | | | | | 1801 * | dest | | | | |
1800 * | | | | | | | 1802 * | | | | | | |
1801 * | v | | | | | 1803 * | v | | | | |
1802 * |************************************************************************** 1804 * |**************************************************************************
1803 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 1805 * | shared | shared (+) | shared (+) | shared(+++) | invalid |
1804 * | | | | | | 1806 * | | | | | |
1805 * |non-shared| shared (+*) | private | slave (*) | unbindable | 1807 * |non-shared| shared (+*) | private | slave (*) | unbindable |
1806 * *************************************************************************** 1808 * ***************************************************************************
1807 * 1809 *
1808 * (+) the mount is moved to the destination. And is then propagated to 1810 * (+) the mount is moved to the destination. And is then propagated to
1809 * all the mounts in the propagation tree of the destination mount. 1811 * all the mounts in the propagation tree of the destination mount.
1810 * (+*) the mount is moved to the destination. 1812 * (+*) the mount is moved to the destination.
1811 * (+++) the mount is moved to the destination and is then propagated to 1813 * (+++) the mount is moved to the destination and is then propagated to
1812 * all the mounts belonging to the destination mount's propagation tree. 1814 * all the mounts belonging to the destination mount's propagation tree.
1813 * the mount is marked as 'shared and slave'. 1815 * the mount is marked as 'shared and slave'.
1814 * (*) the mount continues to be a slave at the new location. 1816 * (*) the mount continues to be a slave at the new location.
1815 * 1817 *
1816  * if the source mount is a tree, the operations explained above are 1818  * if the source mount is a tree, the operations explained above are
1817 * applied to each mount in the tree. 1819 * applied to each mount in the tree.
1818 * Must be called without spinlocks held, since this function can sleep 1820 * Must be called without spinlocks held, since this function can sleep
1819 * in allocations. 1821 * in allocations.
1820 */ 1822 */
1821 static int attach_recursive_mnt(struct mount *source_mnt, 1823 static int attach_recursive_mnt(struct mount *source_mnt,
1822 struct mount *dest_mnt, 1824 struct mount *dest_mnt,
1823 struct mountpoint *dest_mp, 1825 struct mountpoint *dest_mp,
1824 struct path *parent_path) 1826 struct path *parent_path)
1825 { 1827 {
1826 HLIST_HEAD(tree_list); 1828 HLIST_HEAD(tree_list);
1827 struct mount *child, *p; 1829 struct mount *child, *p;
1828 struct hlist_node *n; 1830 struct hlist_node *n;
1829 int err; 1831 int err;
1830 1832
1831 if (IS_MNT_SHARED(dest_mnt)) { 1833 if (IS_MNT_SHARED(dest_mnt)) {
1832 err = invent_group_ids(source_mnt, true); 1834 err = invent_group_ids(source_mnt, true);
1833 if (err) 1835 if (err)
1834 goto out; 1836 goto out;
1835 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); 1837 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
1836 lock_mount_hash(); 1838 lock_mount_hash();
1837 if (err) 1839 if (err)
1838 goto out_cleanup_ids; 1840 goto out_cleanup_ids;
1839 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1841 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1840 set_mnt_shared(p); 1842 set_mnt_shared(p);
1841 } else { 1843 } else {
1842 lock_mount_hash(); 1844 lock_mount_hash();
1843 } 1845 }
1844 if (parent_path) { 1846 if (parent_path) {
1845 detach_mnt(source_mnt, parent_path); 1847 detach_mnt(source_mnt, parent_path);
1846 attach_mnt(source_mnt, dest_mnt, dest_mp); 1848 attach_mnt(source_mnt, dest_mnt, dest_mp);
1847 touch_mnt_namespace(source_mnt->mnt_ns); 1849 touch_mnt_namespace(source_mnt->mnt_ns);
1848 } else { 1850 } else {
1849 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); 1851 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
1850 commit_tree(source_mnt, NULL); 1852 commit_tree(source_mnt, NULL);
1851 } 1853 }
1852 1854
1853 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { 1855 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
1854 struct mount *q; 1856 struct mount *q;
1855 hlist_del_init(&child->mnt_hash); 1857 hlist_del_init(&child->mnt_hash);
1856 q = __lookup_mnt_last(&child->mnt_parent->mnt, 1858 q = __lookup_mnt_last(&child->mnt_parent->mnt,
1857 child->mnt_mountpoint); 1859 child->mnt_mountpoint);
1858 commit_tree(child, q); 1860 commit_tree(child, q);
1859 } 1861 }
1860 unlock_mount_hash(); 1862 unlock_mount_hash();
1861 1863
1862 return 0; 1864 return 0;
1863 1865
1864 out_cleanup_ids: 1866 out_cleanup_ids:
1865 while (!hlist_empty(&tree_list)) { 1867 while (!hlist_empty(&tree_list)) {
1866 child = hlist_entry(tree_list.first, struct mount, mnt_hash); 1868 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
1867 umount_tree(child, 0); 1869 umount_tree(child, 0);
1868 } 1870 }
1869 unlock_mount_hash(); 1871 unlock_mount_hash();
1870 cleanup_group_ids(source_mnt, NULL); 1872 cleanup_group_ids(source_mnt, NULL);
1871 out: 1873 out:
1872 return err; 1874 return err;
1873 } 1875 }
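As a concrete instance of the BIND MOUNT table above: binding from a shared source leaves the clone in the source's peer group, so later mounts propagate between the two trees. A hedged sketch using mount(2) directly (paths are placeholders, /mnt/a is assumed to already be a mount point, error handling elided):

#include <sys/mount.h>

static void bind_from_shared_source(void)
{
	mount(NULL, "/mnt/a", NULL, MS_SHARED, NULL);	/* source becomes shared */
	mount("/mnt/a", "/mnt/b", NULL, MS_BIND, NULL);	/* clone joins a's peer group */
	/* A mount created under /mnt/a now appears under /mnt/b as well;
	 * this is the shared-source column of the table above. */
	mount("none", "/mnt/a/sub", "tmpfs", 0, NULL);
}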
1874 1876
1875 static struct mountpoint *lock_mount(struct path *path) 1877 static struct mountpoint *lock_mount(struct path *path)
1876 { 1878 {
1877 struct vfsmount *mnt; 1879 struct vfsmount *mnt;
1878 struct dentry *dentry = path->dentry; 1880 struct dentry *dentry = path->dentry;
1879 retry: 1881 retry:
1880 mutex_lock(&dentry->d_inode->i_mutex); 1882 mutex_lock(&dentry->d_inode->i_mutex);
1881 if (unlikely(cant_mount(dentry))) { 1883 if (unlikely(cant_mount(dentry))) {
1882 mutex_unlock(&dentry->d_inode->i_mutex); 1884 mutex_unlock(&dentry->d_inode->i_mutex);
1883 return ERR_PTR(-ENOENT); 1885 return ERR_PTR(-ENOENT);
1884 } 1886 }
1885 namespace_lock(); 1887 namespace_lock();
1886 mnt = lookup_mnt(path); 1888 mnt = lookup_mnt(path);
1887 if (likely(!mnt)) { 1889 if (likely(!mnt)) {
1888 struct mountpoint *mp = lookup_mountpoint(dentry); 1890 struct mountpoint *mp = lookup_mountpoint(dentry);
1889 if (!mp) 1891 if (!mp)
1890 mp = new_mountpoint(dentry); 1892 mp = new_mountpoint(dentry);
1891 if (IS_ERR(mp)) { 1893 if (IS_ERR(mp)) {
1892 namespace_unlock(); 1894 namespace_unlock();
1893 mutex_unlock(&dentry->d_inode->i_mutex); 1895 mutex_unlock(&dentry->d_inode->i_mutex);
1894 return mp; 1896 return mp;
1895 } 1897 }
1896 return mp; 1898 return mp;
1897 } 1899 }
1898 namespace_unlock(); 1900 namespace_unlock();
1899 mutex_unlock(&path->dentry->d_inode->i_mutex); 1901 mutex_unlock(&path->dentry->d_inode->i_mutex);
1900 path_put(path); 1902 path_put(path);
1901 path->mnt = mnt; 1903 path->mnt = mnt;
1902 dentry = path->dentry = dget(mnt->mnt_root); 1904 dentry = path->dentry = dget(mnt->mnt_root);
1903 goto retry; 1905 goto retry;
1904 } 1906 }
1905 1907
1906 static void unlock_mount(struct mountpoint *where) 1908 static void unlock_mount(struct mountpoint *where)
1907 { 1909 {
1908 struct dentry *dentry = where->m_dentry; 1910 struct dentry *dentry = where->m_dentry;
1909 put_mountpoint(where); 1911 put_mountpoint(where);
1910 namespace_unlock(); 1912 namespace_unlock();
1911 mutex_unlock(&dentry->d_inode->i_mutex); 1913 mutex_unlock(&dentry->d_inode->i_mutex);
1912 } 1914 }
1913 1915
1914 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) 1916 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
1915 { 1917 {
1916 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) 1918 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
1917 return -EINVAL; 1919 return -EINVAL;
1918 1920
1919 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != 1921 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
1920 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) 1922 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
1921 return -ENOTDIR; 1923 return -ENOTDIR;
1922 1924
1923 return attach_recursive_mnt(mnt, p, mp, NULL); 1925 return attach_recursive_mnt(mnt, p, mp, NULL);
1924 } 1926 }
1925 1927
1926 /* 1928 /*
1927 * Sanity check the flags to change_mnt_propagation. 1929 * Sanity check the flags to change_mnt_propagation.
1928 */ 1930 */
1929 1931
1930 static int flags_to_propagation_type(int flags) 1932 static int flags_to_propagation_type(int flags)
1931 { 1933 {
1932 int type = flags & ~(MS_REC | MS_SILENT); 1934 int type = flags & ~(MS_REC | MS_SILENT);
1933 1935
1934 /* Fail if any non-propagation flags are set */ 1936 /* Fail if any non-propagation flags are set */
1935 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1937 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1936 return 0; 1938 return 0;
1937 /* Only one propagation flag should be set */ 1939 /* Only one propagation flag should be set */
1938 if (!is_power_of_2(type)) 1940 if (!is_power_of_2(type))
1939 return 0; 1941 return 0;
1940 return type; 1942 return type;
1941 } 1943 }
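flags_to_propagation_type() accepts exactly one of MS_SHARED, MS_PRIVATE, MS_SLAVE or MS_UNBINDABLE, optionally combined with MS_REC (MS_SILENT is masked off). From userspace this corresponds to a mount(2) call carrying only the propagation flag, for example (the mountpoint is a placeholder):

#include <sys/mount.h>

/* Rough equivalent of "mount --make-rshared <mountpoint>": source,
 * fstype and data are ignored for a pure propagation change. */
static int make_rshared(const char *mountpoint)
{
	return mount(NULL, mountpoint, NULL, MS_SHARED | MS_REC, NULL);
}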

/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct path *path, int flag)
{
	struct mount *m;
	struct mount *mnt = real_mount(path->mnt);
	int recurse = flag & MS_REC;
	int type;
	int err = 0;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

	namespace_lock();
	if (type == MS_SHARED) {
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

	lock_mount_hash();
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	unlock_mount_hash();

out_unlock:
	namespace_unlock();
	return err;
}

static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
	struct mount *child;
	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
		if (!is_subdir(child->mnt_mountpoint, dentry))
			continue;

		if (child->mnt.mnt_flags & MNT_LOCKED)
			return true;
	}
	return false;
}

/*
 * do loopback mount.
 */
static int do_loopback(struct path *path, const char *old_name,
				int recurse)
{
	struct path old_path;
	struct mount *mnt = NULL, *old, *parent;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
	if (err)
		return err;

	err = -EINVAL;
	if (mnt_ns_loop(old_path.dentry))
		goto out;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	parent = real_mount(path->mnt);

	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old))
		goto out2;

	if (!check_mnt(parent) || !check_mnt(old))
		goto out2;

	if (!recurse && has_locked_children(old, old_path.dentry))
		goto out2;

	if (recurse)
		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
	else
		mnt = clone_mnt(old, old_path.dentry, 0);

	if (IS_ERR(mnt)) {
		err = PTR_ERR(mnt);
		goto out2;
	}

	mnt->mnt.mnt_flags &= ~MNT_LOCKED;

	err = graft_tree(mnt, parent, mp);
	if (err) {
		lock_mount_hash();
		umount_tree(mnt, 0);
		unlock_mount_hash();
	}
out2:
	unlock_mount(mp);
out:
	path_put(&old_path);
	return err;
}
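/*
 * Illustrative sketch (editor's addition, not from this file): do_loopback()
 * is what services MS_BIND from userspace. The two branches above map to a
 * plain bind (clone_mnt()) and a recursive bind (copy_tree()). The paths
 * are placeholders.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Non-recursive bind: submounts under /src are not carried over. */
	if (mount("/src", "/dst", NULL, MS_BIND, NULL) != 0)
		perror("bind");

	/* Recursive bind ("rbind"): the whole tree under /src is cloned. */
	if (mount("/src", "/dst", NULL, MS_BIND | MS_REC, NULL) != 0)
		perror("rbind");
	return 0;
}
#endif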

static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
{
	int error = 0;
	int readonly_request = 0;

	if (ms_flags & MS_RDONLY)
		readonly_request = 1;
	if (readonly_request == __mnt_is_readonly(mnt))
		return 0;

	if (readonly_request)
		error = mnt_make_readonly(real_mount(mnt));
	else
		__mnt_unmake_readonly(real_mount(mnt));
	return error;
}

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int flags, int mnt_flags,
		      void *data)
{
	int err;
	struct super_block *sb = path->mnt->mnt_sb;
	struct mount *mnt = real_mount(path->mnt);

	if (!check_mnt(mnt))
		return -EINVAL;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	/* Don't allow changing of locked mnt flags.
	 *
	 * No locks need to be held here while testing the various
	 * MNT_LOCK flags because those flags can never be cleared
	 * once they are set.
	 */
	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
	    !(mnt_flags & MNT_READONLY)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
	    !(mnt_flags & MNT_NODEV)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
	    !(mnt_flags & MNT_NOSUID)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
	    !(mnt_flags & MNT_NOEXEC)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
	    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
		return -EPERM;
	}

	err = security_sb_remount(sb, data);
	if (err)
		return err;

	down_write(&sb->s_umount);
	if (flags & MS_BIND)
		err = change_mount_flags(path->mnt, flags);
	else if (!capable(CAP_SYS_ADMIN))
		err = -EPERM;
	else
		err = do_remount_sb(sb, flags, data, 0);
	if (!err) {
		lock_mount_hash();
		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
		mnt->mnt.mnt_flags = mnt_flags;
		touch_mnt_namespace(mnt->mnt_ns);
		unlock_mount_hash();
	}
	up_write(&sb->s_umount);
	return err;
}
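/*
 * Illustrative sketch (editor's addition, not from this file): with
 * MS_REMOUNT|MS_BIND only the per-mountpoint flags change, via
 * change_mount_flags(); without MS_BIND the request also reaches the
 * superblock through do_remount_sb(). If a MNT_LOCK_* flag was set (for
 * instance by a more privileged mount namespace), trying to clear the
 * matching flag fails with EPERM, per the checks above. "/mnt" is a
 * placeholder for an existing mount point.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Flip a mount read-only without touching superblock options. */
	if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL))
		perror("remount ro");
	return 0;
}
#endif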

static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}

static int do_move_mount(struct path *path, const char *old_name)
{
	struct path old_path, parent_path;
	struct mount *p;
	struct mount *old;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	p = real_mount(path->mnt);

	err = -EINVAL;
	if (!check_mnt(p) || !check_mnt(old))
		goto out1;

	if (old->mnt.mnt_flags & MNT_LOCKED)
		goto out1;

	err = -EINVAL;
	if (old_path.dentry != old_path.mnt->mnt_root)
		goto out1;

	if (!mnt_has_parent(old))
		goto out1;

	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_path.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (IS_MNT_SHARED(old->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
		goto out1;
	err = -ELOOP;
	for (; mnt_has_parent(p); p = p->mnt_parent)
		if (p == old)
			goto out1;

	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
	if (err)
		goto out1;

	/* if the mount is moved, it should no longer be expired
	 * automatically */
	list_del_init(&old->mnt_expire);
out1:
	unlock_mount(mp);
out:
	if (!err)
		path_put(&parent_path);
	path_put(&old_path);
	return err;
}
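/*
 * Illustrative sketch (editor's addition, not from this file): MS_MOVE
 * detaches an existing mount and reattaches it elsewhere; the -ELOOP walk
 * above is what prevents moving a mount underneath itself. The paths are
 * placeholders, and /olddir is assumed to be the root of a mount.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Equivalent of "mount --move /olddir /newdir". */
	if (mount("/olddir", "/newdir", NULL, MS_MOVE, NULL) != 0)
		perror("move");	/* EINVAL if /olddir sits in a shared parent */
	return 0;
}
#endif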

static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
{
	int err;
	const char *subtype = strchr(fstype, '.');
	if (subtype) {
		subtype++;
		err = -EINVAL;
		if (!subtype[0])
			goto err;
	} else
		subtype = "";

	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
	err = -ENOMEM;
	if (!mnt->mnt_sb->s_subtype)
		goto err;
	return mnt;

err:
	mntput(mnt);
	return ERR_PTR(err);
}

/*
 * add a mount into a namespace's mount tree
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
	struct mountpoint *mp;
	struct mount *parent;
	int err;

	mnt_flags &= ~MNT_INTERNAL_FLAGS;

	mp = lock_mount(path);
	if (IS_ERR(mp))
		return PTR_ERR(mp);

	parent = real_mount(path->mnt);
	err = -EINVAL;
	if (unlikely(!check_mnt(parent))) {
		/* that's acceptable only for automounts done in private ns */
		if (!(mnt_flags & MNT_SHRINKABLE))
			goto unlock;
		/* ... and for those we'd better have mountpoint still alive */
		if (!parent->mnt_ns)
			goto unlock;
	}

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	err = -EINVAL;
	if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt.mnt_flags = mnt_flags;
	err = graft_tree(newmnt, parent, mp);

unlock:
	unlock_mount(mp);
	return err;
}
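/*
 * Illustrative sketch (editor's addition, not from this file): the -EBUSY
 * check above rejects only the *same* filesystem instance stacked on the
 * same mount point; a second, distinct instance still mounts fine. tmpfs
 * creates a fresh superblock per mount, while proc reuses one superblock
 * per pid namespace, so the proc case below is assumed to fail with EBUSY
 * on a system where /proc is already mounted.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Two distinct tmpfs superblocks stack on top of each other. */
	mount("tmpfs", "/mnt", "tmpfs", 0, NULL);
	mount("tmpfs", "/mnt", "tmpfs", 0, NULL);

	/* Same proc superblock onto its own mount point: EBUSY. */
	if (mount("proc", "/proc", "proc", 0, NULL) != 0)
		perror("proc");	/* expected: Device or resource busy */
	return 0;
}
#endif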

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, const char *fstype, int flags,
			int mnt_flags, const char *name, void *data)
{
	struct file_system_type *type;
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct vfsmount *mnt;
	int err;

	if (!fstype)
		return -EINVAL;

	type = get_fs_type(fstype);
	if (!type)
		return -ENODEV;

	if (user_ns != &init_user_ns) {
		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
			put_filesystem(type);
			return -EPERM;
		}
		/* Only in special cases allow devices from mounts
		 * created outside the initial user namespace.
		 */
		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			flags |= MS_NODEV;
			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
		}
	}

	mnt = vfs_kern_mount(type, flags, name, data);
	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
	    !mnt->mnt_sb->s_subtype)
		mnt = fs_set_subtype(mnt, fstype);

	put_filesystem(type);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	err = do_add_mount(real_mount(mnt), path, mnt_flags);
	if (err)
		mntput(mnt);
	return err;
}

int finish_automount(struct vfsmount *m, struct path *path)
{
	struct mount *mnt = real_mount(m);
	int err;
	/* The new mount record should have at least 2 refs to prevent it being
	 * expired before we get a chance to add it
	 */
	BUG_ON(mnt_get_count(mnt) < 2);

	if (m->mnt_sb == path->mnt->mnt_sb &&
	    m->mnt_root == path->dentry) {
		err = -ELOOP;
		goto fail;
	}

	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
	if (!err)
		return 0;
fail:
	/* remove m from any expiration list it may be on */
	if (!list_empty(&mnt->mnt_expire)) {
		namespace_lock();
		list_del_init(&mnt->mnt_expire);
		namespace_unlock();
	}
	mntput(m);
	mntput(m);
	return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
	namespace_lock();

	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

	namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct mount *mnt, *next;
	LIST_HEAD(graveyard);

	if (list_empty(mounts))
		return;

	namespace_lock();
	lock_mount_hash();

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
			propagate_mount_busy(mnt, 1))
			continue;
		list_move(&mnt->mnt_expire, &graveyard);
	}
	while (!list_empty(&graveyard)) {
		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
		touch_mnt_namespace(mnt->mnt_ns);
		umount_tree(mnt, 1);
	}
	unlock_mount_hash();
	namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
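/*
 * Illustrative sketch (editor's addition, not from this file): how a
 * filesystem typically drives this pair. Automounted vfsmounts go on a
 * private list via mnt_set_expiry(), and a periodic worker calls
 * mark_mounts_for_expiry(); a mount must sit unused across two sweeps
 * before it is unmounted. All names below (example_automount_list,
 * example_expiry_work, example_build_submount) are hypothetical.
 */
#if 0	/* example only; not part of namespace.c */
static LIST_HEAD(example_automount_list);
static void example_expiry_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(example_expiry, example_expiry_work);

static void example_expiry_work(struct work_struct *work)
{
	/* First sweep sets mnt_expiry_mark; a mount still unused by the
	 * next sweep gets umount_tree()'d. */
	mark_mounts_for_expiry(&example_automount_list);
	schedule_delayed_work(&example_expiry, 10 * HZ);	/* re-arm */
}

static struct vfsmount *example_d_automount(struct path *path)
{
	struct vfsmount *mnt = example_build_submount(path);	/* hypothetical */

	if (!IS_ERR(mnt))
		mnt_set_expiry(mnt, &example_automount_list);
	return mnt;
}
#endif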

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
	struct mount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the mnt_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
	LIST_HEAD(graveyard);
	struct mount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct mount,
						mnt_expire);
			touch_mnt_namespace(m->mnt_ns);
			umount_tree(m, 1);
		}
	}
}

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault. But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}

int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid. Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}

char *copy_mount_string(const void __user *data)
{
	return data ? strndup_user(data, PAGE_SIZE) : NULL;
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
	      const char *type_page, unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	retval = user_path(dir_name, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (!retval && !may_mount())
		retval = -EPERM;
	if (retval)
		goto dput_out;

	/* Default to relatime unless overridden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	/* The default atime for remount is preservation */
	if ((flags & MS_REMOUNT) &&
	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
		       MS_STRICTATIME)) == 0)) {
		mnt_flags &= ~MNT_ATIME_MASK;
		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
	}

	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT |
		   MS_STRICTATIME);

	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}
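/*
 * Illustrative sketch (editor's addition, not from this file): the dispatch
 * above, seen from userspace. One mount(2) call, where "flags" picks the
 * branch and "data" is the fs-specific option string copied in by
 * copy_mount_options(). Device and options below are placeholders.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* No REMOUNT/BIND/MOVE/propagation flag is set, so this takes the
	 * do_new_mount() branch; "errors=remount-ro" lands in data_page. */
	if (mount("/dev/sda1", "/mnt", "ext4", MS_NOATIME,
		  "errors=remount-ro") != 0)
		perror("mount");
	return 0;
}
#endif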

static void free_mnt_ns(struct mnt_namespace *ns)
{
	proc_free_inum(ns->proc_inum);
	put_user_ns(ns->user_ns);
	kfree(ns);
}

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops. A 64bit
 * number incrementing at 10GHz will take 12,427 years to wrap which
 * is effectively never, so we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
{
	struct mnt_namespace *new_ns;
	int ret;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
	ret = proc_alloc_inum(&new_ns->proc_inum);
	if (ret) {
		kfree(new_ns);
		return ERR_PTR(ret);
	}
	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
	atomic_set(&new_ns->count, 1);
	new_ns->root = NULL;
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;
	new_ns->user_ns = get_user_ns(user_ns);
	return new_ns;
}

struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct user_namespace *user_ns, struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct mount *p, *q;
	struct mount *old;
	struct mount *new;
	int copy_flags;

	BUG_ON(!ns);

	if (likely(!(flags & CLONE_NEWNS))) {
		get_mnt_ns(ns);
		return ns;
	}

	old = ns->root;

	new_ns = alloc_mnt_ns(user_ns);
	if (IS_ERR(new_ns))
		return new_ns;

	namespace_lock();
	/* First pass: copy the tree topology */
	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
	if (IS_ERR(new)) {
		namespace_unlock();
		free_mnt_ns(new_ns);
		return ERR_CAST(new);
	}
	new_ns->root = new;
	list_add_tail(&new_ns->list, &new->mnt_list);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace. We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = old;
	q = new;
	while (p) {
		q->mnt_ns = new_ns;
		if (new_fs) {
			if (&p->mnt == new_fs->root.mnt) {
				new_fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;
			}
			if (&p->mnt == new_fs->pwd.mnt) {
				new_fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
			}
		}
		p = next_mnt(p, old);
		q = next_mnt(q, new);
		if (!q)
			break;
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(p, old);
	}
	namespace_unlock();

	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);

	return new_ns;
}
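/*
 * Illustrative sketch (editor's addition, not from this file): copy_mnt_ns()
 * runs on the CLONE_NEWNS path of clone(2)/unshare(2). After the unshare
 * below, mount changes stay private to this process, assuming propagation
 * is switched off first (systems that boot with / shared would otherwise
 * propagate changes back).
 */
#if 0	/* example only; not part of namespace.c */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (unshare(CLONE_NEWNS) != 0) {	/* triggers copy_mnt_ns() */
		perror("unshare");
		return 1;
	}
	/* Undo shared propagation so our mounts don't leak back out. */
	mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);
	/* This tmpfs is now visible only in the new namespace. */
	mount("tmpfs", "/mnt", "tmpfs", 0, NULL);
	return 0;
}
#endif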

/**
 * create_mnt_ns - creates a private namespace and adds a root filesystem
 * @mnt: pointer to the new root filesystem mountpoint
 */
static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
{
	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
	if (!IS_ERR(new_ns)) {
		struct mount *mnt = real_mount(m);
		mnt->mnt_ns = new_ns;
		new_ns->root = mnt;
		list_add(&mnt->mnt_list, &new_ns->list);
	} else {
		mntput(m);
	}
	return new_ns;
}

struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
{
	struct mnt_namespace *ns;
	struct super_block *s;
	struct path path;
	int err;

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		return ERR_CAST(ns);

	err = vfs_path_lookup(mnt->mnt_root, mnt,
			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

	put_mnt_ns(ns);

	if (err)
		return ERR_PTR(err);

	/* trade a vfsmount reference for active sb one */
	s = path.mnt->mnt_sb;
	atomic_inc(&s->s_active);
	mntput(path.mnt);
	/* lock the sucker */
	down_write(&s->s_umount);
	/* ... and return the root of (sub)tree on it */
	return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dev;
	unsigned long data_page;

	kernel_type = copy_mount_string(type);
	ret = PTR_ERR(kernel_type);
	if (IS_ERR(kernel_type))
		goto out_type;

	kernel_dev = copy_mount_string(dev_name);
	ret = PTR_ERR(kernel_dev);
	if (IS_ERR(kernel_dev))
		goto out_dev;

	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
		       (void *) data_page);

	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	kfree(kernel_type);
out_type:
	return ret;
}

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
			 const struct path *root)
{
	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
		dentry = mnt->mnt_mountpoint;
		mnt = mnt->mnt_parent;
	}
	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}

int path_is_under(struct path *path1, struct path *path2)
{
	int res;
	read_seqlock_excl(&mount_lock);
	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
	read_sequnlock_excl(&mount_lock);
	return res;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	struct path new, old, parent_path, root_parent, root;
	struct mount *new_mnt, *root_mnt, *old_mnt;
	struct mountpoint *old_mp, *root_mp;
	int error;

	if (!may_mount())
		return -EPERM;

	error = user_path_dir(new_root, &new);
	if (error)
		goto out0;

	error = user_path_dir(put_old, &old);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old, &new);
	if (error)
		goto out2;

	get_fs_root(current->fs, &root);
	old_mp = lock_mount(&old);
	error = PTR_ERR(old_mp);
	if (IS_ERR(old_mp))
		goto out3;

	error = -EINVAL;
	new_mnt = real_mount(new.mnt);
	root_mnt = real_mount(root.mnt);
	old_mnt = real_mount(old.mnt);
	if (IS_MNT_SHARED(old_mnt) ||
		IS_MNT_SHARED(new_mnt->mnt_parent) ||
		IS_MNT_SHARED(root_mnt->mnt_parent))
		goto out4;
	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
		goto out4;
	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
		goto out4;
	error = -ENOENT;
	if (d_unlinked(new.dentry))
		goto out4;
	error = -EBUSY;
	if (new_mnt == root_mnt || old_mnt == root_mnt)
		goto out4; /* loop, on the same file system */
	error = -EINVAL;
	if (root.mnt->mnt_root != root.dentry)
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(root_mnt))
		goto out4; /* not attached */
	root_mp = root_mnt->mnt_mp;
	if (new.mnt->mnt_root != new.dentry)
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(new_mnt))
		goto out4; /* not attached */
	/* make sure we can reach put_old from new_root */
	if (!is_path_reachable(old_mnt, old.dentry, &new))
		goto out4;
	/* make certain new is below the root */
	if (!is_path_reachable(new_mnt, new.dentry, &root))
		goto out4;
	root_mp->m_count++; /* pin it so it won't go away */
	lock_mount_hash();
	detach_mnt(new_mnt, &parent_path);
	detach_mnt(root_mnt, &root_parent);
	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
	}
	/* mount old root on put_old */
	attach_mnt(root_mnt, old_mnt, old_mp);
	/* mount new_root on / */
	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	unlock_mount_hash();
	chroot_fs_refs(&root, &new);
	put_mountpoint(root_mp);
	error = 0;
out4:
	unlock_mount(old_mp);
	if (!error) {
		path_put(&root_parent);
		path_put(&parent_path);
	}
out3:
	path_put(&root);
out2:
	path_put(&old);
out1:
	path_put(&new);
out0:
	return error;
}
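/*
 * Illustrative sketch (editor's addition, not from this file): the usual
 * calling sequence for the syscall above, in the style of an initramfs
 * switch-over or container runtime. Paths are placeholders; put_old must
 * satisfy the is_path_reachable() checks, i.e. live underneath new_root.
 */
#if 0	/* example only; not part of namespace.c */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Shared mounts fail the IS_MNT_SHARED() checks; go private first. */
	mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);

	/* new_root must itself be a mount point; a self-bind ensures that. */
	mount("/newroot", "/newroot", NULL, MS_BIND, NULL);
	mkdir("/newroot/put_old", 0700);
	chdir("/newroot");

	/* glibc ships no wrapper, so go through syscall(2). */
	if (syscall(SYS_pivot_root, ".", "put_old") != 0) {
		perror("pivot_root");
		return 1;
	}
	chroot(".");

	/* The old root is now reachable at /put_old; detach it. */
	umount2("/put_old", MNT_DETACH);
	return 0;
}
#endif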

static void __init init_mount_tree(void)
{
        struct vfsmount *mnt;
        struct mnt_namespace *ns;
        struct path root;
        struct file_system_type *type;

        type = get_fs_type("rootfs");
        if (!type)
                panic("Can't find rootfs type");
        mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
        put_filesystem(type);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");

        ns = create_mnt_ns(mnt);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");

        init_task.nsproxy->mnt_ns = ns;
        get_mnt_ns(ns);

        root.mnt = mnt;
        root.dentry = mnt->mnt_root;

        set_fs_pwd(current->fs, &root);
        set_fs_root(current->fs, &root);
}

void __init mnt_init(void)
{
        unsigned u;
        int err;

        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
                                mhash_entries, 19,
                                0,
                                &m_hash_shift, &m_hash_mask, 0, 0);
        mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
                                sizeof(struct hlist_head),
                                mphash_entries, 19,
                                0,
                                &mp_hash_shift, &mp_hash_mask, 0, 0);

        if (!mount_hashtable || !mountpoint_hashtable)
                panic("Failed to allocate mount hash table\n");

        for (u = 0; u <= m_hash_mask; u++)
                INIT_HLIST_HEAD(&mount_hashtable[u]);
        for (u = 0; u <= mp_hash_mask; u++)
                INIT_HLIST_HEAD(&mountpoint_hashtable[u]);

        kernfs_init();

        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                        __func__, err);
        fs_kobj = kobject_create_and_add("fs", NULL);
        if (!fs_kobj)
                printk(KERN_WARNING "%s: kobj create error\n", __func__);
        init_rootfs();
        init_mount_tree();
}
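mnt_init() sizes both tables with alloc_large_system_hash() and records the shift/mask pair it hands back. Those values are then used to fold the (parent vfsmount, mountpoint dentry) pair into a bucket index; the lookup helper lives earlier in fs/namespace.c and is not part of this hunk, but it works roughly like this sketch (reconstructed, may differ in detail from the exact upstream lines):

        static inline struct hlist_head *m_hash(struct vfsmount *mnt,
                                                struct dentry *dentry)
        {
                unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
                tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
                tmp = tmp + (tmp >> m_hash_shift);
                return &mount_hashtable[tmp & m_hash_mask];
        }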

void put_mnt_ns(struct mnt_namespace *ns)
{
        if (!atomic_dec_and_test(&ns->count))
                return;
        drop_collected_mounts(&ns->root->mnt);
        free_mnt_ns(ns);
}

struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
{
        struct vfsmount *mnt;
        mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
        if (!IS_ERR(mnt)) {
                /*
                 * it is a longterm mount, don't release mnt until
                 * we unmount before file sys is unregistered
                 */
                real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
        }
        return mnt;
}
EXPORT_SYMBOL_GPL(kern_mount_data);

void kern_unmount(struct vfsmount *mnt)
{
        /* release long term mount so mount point can be released */
        if (!IS_ERR_OR_NULL(mnt)) {
                real_mount(mnt)->mnt_ns = NULL;
                synchronize_rcu(); /* yecchhh... */
                mntput(mnt);
        }
}
EXPORT_SYMBOL(kern_unmount);
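kern_mount_data() backs the kern_mount() convenience macro (kern_mount(type) expands to kern_mount_data(type, NULL)); tagging mnt_ns with MNT_NS_INTERNAL keeps the mount pinned until kern_unmount() clears it. A hedged sketch of the usual pairing in a filesystem module, where my_fs_type and my_mnt are hypothetical names:

        static struct vfsmount *my_mnt;         /* hypothetical module state */

        static int __init my_fs_pin(void)
        {
                my_mnt = kern_mount(&my_fs_type);       /* hypothetical fs type */
                if (IS_ERR(my_mnt))
                        return PTR_ERR(my_mnt);
                return 0;
        }

        static void __exit my_fs_unpin(void)
        {
                kern_unmount(my_mnt);   /* tolerates ERR_PTR and NULL */
        }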

bool our_mnt(struct vfsmount *mnt)
{
        return check_mnt(real_mount(mnt));
}

bool current_chrooted(void)
{
        /* Does the current process have a non-standard root */
        struct path ns_root;
        struct path fs_root;
        bool chrooted;

        /* Find the namespace root */
        ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
        ns_root.dentry = ns_root.mnt->mnt_root;
        path_get(&ns_root);
        while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
                ;

        get_fs_root(current->fs, &fs_root);

        chrooted = !path_equal(&fs_root, &ns_root);

        path_put(&fs_root);
        path_put(&ns_root);

        return chrooted;
}
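current_chrooted() compares the task's fs root against the namespace root after following overmounts down. A rough userspace analogue compares the identity of "/" with init's root as exposed through /proc/1/root; this is only an approximation — it assumes PID 1 shares the mount namespace, is not itself chrooted, and that the caller may stat that magic symlink (usually requires root):

        #include <stdbool.h>
        #include <sys/stat.h>

        static bool probably_chrooted(void)
        {
                struct stat self, init;

                if (stat("/", &self) || stat("/proc/1/root/.", &init))
                        return false;   /* cannot tell */
                return self.st_dev != init.st_dev || self.st_ino != init.st_ino;
        }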

bool fs_fully_visible(struct file_system_type *type)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mount *mnt;
        bool visible = false;

        if (unlikely(!ns))
                return false;

        down_read(&namespace_sem);
        list_for_each_entry(mnt, &ns->list, mnt_list) {
                struct mount *child;
                if (mnt->mnt.mnt_sb->s_type != type)
                        continue;

                /* This mount is not fully visible if there are any child mounts
                 * that cover anything except for empty directories.
                 */
                list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
                        struct inode *inode = child->mnt_mountpoint->d_inode;
                        if (!S_ISDIR(inode->i_mode))
                                goto next;
                        if (inode->i_nlink > 2)
                                goto next;
                }
                visible = true;
                goto found;
        next:   ;
        }
found:
        up_read(&namespace_sem);
        return visible;
}
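The i_nlink > 2 test is the traditional empty-directory heuristic: on most Unix filesystems a fresh directory carries two links ("." plus its entry in the parent) and gains one per subdirectory, so anything above 2 implies children. It is a heuristic, not a guarantee (some filesystems, btrfs for one, report a link count of 1 for directories). A quick userspace illustration with a hypothetical path:

        #include <stdio.h>
        #include <sys/stat.h>

        int main(void)
        {
                struct stat st;

                /* /tmp/somedir is a stand-in: empty -> st_nlink is
                 * typically 2, rising by one per subdirectory */
                if (stat("/tmp/somedir", &st) == 0)
                        printf("nlink = %lu\n", (unsigned long)st.st_nlink);
                return 0;
        }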

static void *mntns_get(struct task_struct *task)
{
        struct mnt_namespace *ns = NULL;
        struct nsproxy *nsproxy;

        task_lock(task);
        nsproxy = task->nsproxy;
        if (nsproxy) {
                ns = nsproxy->mnt_ns;
                get_mnt_ns(ns);
        }
        task_unlock(task);

        return ns;
}

static void mntns_put(void *ns)
{
        put_mnt_ns(ns);
}

static int mntns_install(struct nsproxy *nsproxy, void *ns)
{
        struct fs_struct *fs = current->fs;
        struct mnt_namespace *mnt_ns = ns;
        struct path root;

        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
            !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
                return -EPERM;

        if (fs->users != 1)
                return -EINVAL;

        get_mnt_ns(mnt_ns);
        put_mnt_ns(nsproxy->mnt_ns);
        nsproxy->mnt_ns = mnt_ns;

        /* Find the root */
        root.mnt = &mnt_ns->root->mnt;
        root.dentry = mnt_ns->root->mnt.mnt_root;
        path_get(&root);
        while (d_mountpoint(root.dentry) && follow_down_one(&root))
                ;

        /* Update the pwd and root */
        set_fs_pwd(fs, &root);
        set_fs_root(fs, &root);

        path_put(&root);
        return 0;
}
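mntns_install() is the kernel side of setns(2) on a /proc/&lt;pid&gt;/ns/mnt descriptor; note the fs->users != 1 check, which rejects callers sharing their fs_struct (e.g. multithreaded processes), and the capability tests behind the -EPERM path above. A minimal userspace sketch, with 1234 as a hypothetical target pid:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <sched.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/proc/1234/ns/mnt", O_RDONLY);   /* hypothetical pid */

                if (fd < 0 || setns(fd, CLONE_NEWNS) < 0) {
                        perror("setns");
                        return 1;
                }
                close(fd);
                /* now inside the target mount namespace; root and cwd
                 * were reset to its root by mntns_install() */
                return 0;
        }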

static unsigned int mntns_inum(void *ns)
{
        struct mnt_namespace *mnt_ns = ns;
        return mnt_ns->proc_inum;
}

const struct proc_ns_operations mntns_operations = {
        .name           = "mnt",
        .type           = CLONE_NEWNS,
        .get            = mntns_get,
        .put            = mntns_put,
        .install        = mntns_install,
        .inum           = mntns_inum,
};