Commit e149ed2b805fefdccf7ccdfc19eca22fdd4514ac

Authored by Al Viro
Parent: f77c80142e

take the targets of /proc/*/ns/* symlinks to separate fs

New pseudo-filesystem: nsfs.  Targets of /proc/*/ns/* live there now.
It's not mountable (not even registered, so it's not in /proc/filesystems,
etc.).  Files on it *are* bindable - we explicitly permit that in do_loopback().
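
For instance, a minimal userspace sketch of that bindability (not part of this
commit; /tmp/netns is an arbitrary, pre-created empty file), keeping a network
namespace alive past the death of its last task:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Bind the namespace file somewhere persistent; do_loopback()
         * explicitly permits this even though nsfs itself can't be
         * mounted. */
        if (mount("/proc/self/ns/net", "/tmp/netns", NULL, MS_BIND, NULL)) {
                perror("mount");
                return 1;
        }
        return 0;
}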

This stuff lives in fs/nsfs.c now; proc_ns_fget() moved there as well.
get_proc_ns() is a macro now (it simply returns ->i_private; it would
have been an inline, if not for a header-ordering headache).
proc_ns_inode() is an ex-parrot (gone).  The interface used in procfs is
ns_get_path(path, task, ops) and ns_get_name(buf, size, task, ops).
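
In prototype form, that interface looks roughly like this (a sketch built from
the argument lists above; the return types are assumptions, not spelled out in
this message):

void *ns_get_path(struct path *path, struct task_struct *task,
                  const struct proc_ns_operations *ns_ops);
int ns_get_name(char *buf, size_t size, struct task_struct *task,
                const struct proc_ns_operations *ns_ops);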

Dentries and inodes are never hashed; a non-counting reference to dentry
is stashed in ns_common (removed by ->d_prune()) and reused by ns_get_path()
if present.  See ns_get_path()/ns_prune_dentry/nsfs_evict() for details
of that mechanism.
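
Schematically, the stash-and-reuse works like this (a simplified sketch; the
field name ->stashed and the helper ns_reuse_stashed() are assumed names, and
the real code must also handle the race between stashing and pruning):

/* ->d_prune(): forget the non-counting reference before the dentry dies */
static void ns_prune_dentry(struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;
        if (inode) {
                struct ns_common *ns = inode->i_private;
                ns->stashed = NULL;
        }
}

/* ns_get_path() fast path (sketch): reuse the stashed dentry if the
 * non-counting reference can still be turned into a real one */
static bool ns_reuse_stashed(struct ns_common *ns, struct vfsmount *mnt,
                             struct path *path)
{
        struct dentry *dentry = ns->stashed;
        if (!dentry || !lockref_get_not_dead(&dentry->d_lockref))
                return false;   /* pruned or dying; allocate a fresh one */
        path->mnt = mntget(mnt);
        path->dentry = dentry;
        return true;
}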

As a result, proc_ns_follow_link() has stopped poking in nd->path.mnt;
it does nd_jump_link() on a consistent <vfsmount,dentry> pair it gets
from ns_get_path().
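
Schematically, the symlink resolution now reduces to something like this
(a sketch of the era's calling conventions; get_proc_task() and the
PROC_I(inode)->ns_ops lookup are assumed from the surrounding procfs code):

static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
{
        struct inode *inode = dentry->d_inode;
        struct task_struct *task;
        struct path ns_path;
        void *error = ERR_PTR(-EACCES);

        task = get_proc_task(inode);
        if (!task)
                return error;

        if (ptrace_may_access(task, PTRACE_MODE_READ)) {
                error = ns_get_path(&ns_path, task, PROC_I(inode)->ns_ops);
                if (!error)
                        nd_jump_link(nd, &ns_path); /* consistent pair */
        }
        put_task_struct(task);
        return error;
}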

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 10 changed files with 208 additions and 161 deletions

--- a/fs/Makefile
+++ b/fs/Makefile
 #
 # Makefile for the Linux filesystems.
 #
 # 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
 # Rewritten to use lists instead of if-statements.
 #

 obj-y := open.o read_write.o file_table.o super.o \
         char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
         ioctl.o readdir.o select.o dcache.o inode.o \
         attr.o bad_inode.o file.o filesystems.o namespace.o \
         seq_file.o xattr.o libfs.o fs-writeback.o \
         pnode.o splice.o sync.o utimes.o \
-        stack.o fs_struct.o statfs.o fs_pin.o
+        stack.o fs_struct.o statfs.o fs_pin.o nsfs.o

 ifeq ($(CONFIG_BLOCK),y)
 obj-y += buffer.o block_dev.o direct-io.o mpage.o
 else
 obj-y += no-block.o
 endif

 obj-$(CONFIG_PROC_FS) += proc_namespace.o

 obj-y += notify/
 obj-$(CONFIG_EPOLL) += eventpoll.o
 obj-$(CONFIG_ANON_INODES) += anon_inodes.o
 obj-$(CONFIG_SIGNALFD) += signalfd.o
 obj-$(CONFIG_TIMERFD) += timerfd.o
 obj-$(CONFIG_EVENTFD) += eventfd.o
 obj-$(CONFIG_AIO) += aio.o
 obj-$(CONFIG_FILE_LOCKING) += locks.o
 obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
 obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o

 obj-$(CONFIG_FS_MBCACHE) += mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
 obj-$(CONFIG_NFS_COMMON) += nfs_common/
 obj-$(CONFIG_COREDUMP) += coredump.o
 obj-$(CONFIG_SYSCTL) += drop_caches.o

 obj-$(CONFIG_FHANDLE) += fhandle.o

 obj-y += quota/

 obj-$(CONFIG_PROC_FS) += proc/
 obj-$(CONFIG_KERNFS) += kernfs/
 obj-$(CONFIG_SYSFS) += sysfs/
 obj-$(CONFIG_CONFIGFS_FS) += configfs/
 obj-y += devpts/

 obj-$(CONFIG_PROFILING) += dcookies.o
 obj-$(CONFIG_DLM) += dlm/

 # Do not add any filesystems before this line
 obj-$(CONFIG_FSCACHE) += fscache/
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS) += ext2/
 # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
 # unless explicitly requested by rootfstype
 obj-$(CONFIG_EXT4_FS) += ext4/
 obj-$(CONFIG_JBD) += jbd/
 obj-$(CONFIG_JBD2) += jbd2/
 obj-$(CONFIG_CRAMFS) += cramfs/
 obj-$(CONFIG_SQUASHFS) += squashfs/
 obj-y += ramfs/
 obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
 obj-$(CONFIG_CODA_FS) += coda/
 obj-$(CONFIG_MINIX_FS) += minix/
 obj-$(CONFIG_FAT_FS) += fat/
 obj-$(CONFIG_BFS_FS) += bfs/
 obj-$(CONFIG_ISO9660_FS) += isofs/
 obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS) += hfs/
 obj-$(CONFIG_ECRYPT_FS) += ecryptfs/
 obj-$(CONFIG_VXFS_FS) += freevxfs/
 obj-$(CONFIG_NFS_FS) += nfs/
 obj-$(CONFIG_EXPORTFS) += exportfs/
 obj-$(CONFIG_NFSD) += nfsd/
 obj-$(CONFIG_LOCKD) += lockd/
 obj-$(CONFIG_NLS) += nls/
 obj-$(CONFIG_SYSV_FS) += sysv/
 obj-$(CONFIG_CIFS) += cifs/
 obj-$(CONFIG_NCP_FS) += ncpfs/
 obj-$(CONFIG_HPFS_FS) += hpfs/
 obj-$(CONFIG_NTFS_FS) += ntfs/
 obj-$(CONFIG_UFS_FS) += ufs/
 obj-$(CONFIG_EFS_FS) += efs/
 obj-$(CONFIG_JFFS2_FS) += jffs2/
 obj-$(CONFIG_LOGFS) += logfs/
 obj-$(CONFIG_UBIFS_FS) += ubifs/
 obj-$(CONFIG_AFFS_FS) += affs/
 obj-$(CONFIG_ROMFS_FS) += romfs/
 obj-$(CONFIG_QNX4FS_FS) += qnx4/
 obj-$(CONFIG_QNX6FS_FS) += qnx6/
 obj-$(CONFIG_AUTOFS4_FS) += autofs4/
 obj-$(CONFIG_ADFS_FS) += adfs/
 obj-$(CONFIG_FUSE_FS) += fuse/
 obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
 obj-$(CONFIG_UDF_FS) += udf/
 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
 obj-$(CONFIG_OMFS_FS) += omfs/
 obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS) += xfs/
 obj-$(CONFIG_9P_FS) += 9p/
 obj-$(CONFIG_AFS_FS) += afs/
 obj-$(CONFIG_NILFS2_FS) += nilfs2/
 obj-$(CONFIG_BEFS_FS) += befs/
 obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS) += hppfs/
 obj-$(CONFIG_CACHEFILES) += cachefiles/
 obj-$(CONFIG_DEBUG_FS) += debugfs/
 obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
 obj-$(CONFIG_GFS2_FS) += gfs2/
 obj-$(CONFIG_F2FS_FS) += f2fs/
 obj-y += exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS) += ceph/
 obj-$(CONFIG_PSTORE) += pstore/
 obj-$(CONFIG_EFIVAR_FS) += efivarfs/

--- a/fs/internal.h
+++ b/fs/internal.h
 /* fs/ internal definitions
  *
  * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */

 struct super_block;
 struct file_system_type;
 struct linux_binprm;
 struct path;
 struct mount;

 /*
  * block_dev.c
  */
 #ifdef CONFIG_BLOCK
 extern void __init bdev_cache_init(void);

 extern int __sync_blockdev(struct block_device *bdev, int wait);

 #else
 static inline void bdev_cache_init(void)
 {
 }

 static inline int __sync_blockdev(struct block_device *bdev, int wait)
 {
         return 0;
 }
 #endif

 /*
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);

 /*
  * char_dev.c
  */
 extern void __init chrdev_init(void);

 /*
  * namei.c
  */
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
                 const char *, unsigned int, struct path *);

 /*
  * namespace.c
  */
 extern int copy_mount_options(const void __user *, unsigned long *);
 extern char *copy_mount_string(const void __user *);

 extern struct vfsmount *lookup_mnt(struct path *);
 extern int finish_automount(struct vfsmount *, struct path *);

 extern int sb_prepare_remount_readonly(struct super_block *);

 extern void __init mnt_init(void);

 extern int __mnt_want_write(struct vfsmount *);
 extern int __mnt_want_write_file(struct file *);
 extern void __mnt_drop_write(struct vfsmount *);
 extern void __mnt_drop_write_file(struct file *);

 /*
  * fs_struct.c
  */
 extern void chroot_fs_refs(const struct path *, const struct path *);

 /*
  * file_table.c
  */
 extern struct file *get_empty_filp(void);

 /*
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
 extern bool grab_super_passive(struct super_block *sb);
 extern struct dentry *mount_fs(struct file_system_type *,
                 int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);

 /*
  * open.c
  */
 struct open_flags {
         int open_flag;
         umode_t mode;
         int acc_mode;
         int intent;
         int lookup_flags;
 };
 extern struct file *do_filp_open(int dfd, struct filename *pathname,
                 const struct open_flags *op);
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
                 const char *, const struct open_flags *);

 extern long do_handle_open(int mountdirfd,
                 struct file_handle __user *ufh, int open_flag);
 extern int open_check_o_direct(struct file *f);

 /*
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
 extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
                 int nid);
 extern void inode_add_lru(struct inode *inode);

 /*
  * fs-writeback.c
  */
 extern void inode_wb_list_del(struct inode *inode);

 extern long get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);

 /*
  * dcache.c
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
 extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
                 int nid);

 /*
  * read_write.c
  */
 extern int rw_verify_area(int, struct file *, const loff_t *, size_t);

 /*
  * pipe.c
  */
 extern const struct file_operations pipefifo_fops;

 /*
  * fs_pin.c
  */
 extern void sb_pin_kill(struct super_block *sb);
 extern void mnt_pin_kill(struct mount *m);
+
+/*
+ * fs/nsfs.c
+ */
+extern struct dentry_operations ns_dentry_operations;

1 /* 1 /*
2 * linux/fs/namespace.c 2 * linux/fs/namespace.c
3 * 3 *
4 * (C) Copyright Al Viro 2000, 2001 4 * (C) Copyright Al Viro 2000, 2001
5 * Released under GPL v2. 5 * Released under GPL v2.
6 * 6 *
7 * Based on code from fs/super.c, copyright Linus Torvalds and others. 7 * Based on code from fs/super.c, copyright Linus Torvalds and others.
8 * Heavily rewritten. 8 * Heavily rewritten.
9 */ 9 */
10 10
11 #include <linux/syscalls.h> 11 #include <linux/syscalls.h>
12 #include <linux/export.h> 12 #include <linux/export.h>
13 #include <linux/capability.h> 13 #include <linux/capability.h>
14 #include <linux/mnt_namespace.h> 14 #include <linux/mnt_namespace.h>
15 #include <linux/user_namespace.h> 15 #include <linux/user_namespace.h>
16 #include <linux/namei.h> 16 #include <linux/namei.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 #include <linux/init.h> /* init_rootfs */ 19 #include <linux/init.h> /* init_rootfs */
20 #include <linux/fs_struct.h> /* get_fs_root et.al. */ 20 #include <linux/fs_struct.h> /* get_fs_root et.al. */
21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
22 #include <linux/uaccess.h> 22 #include <linux/uaccess.h>
23 #include <linux/proc_ns.h> 23 #include <linux/proc_ns.h>
24 #include <linux/magic.h> 24 #include <linux/magic.h>
25 #include <linux/bootmem.h> 25 #include <linux/bootmem.h>
26 #include <linux/task_work.h> 26 #include <linux/task_work.h>
27 #include "pnode.h" 27 #include "pnode.h"
28 #include "internal.h" 28 #include "internal.h"
29 29
30 static unsigned int m_hash_mask __read_mostly; 30 static unsigned int m_hash_mask __read_mostly;
31 static unsigned int m_hash_shift __read_mostly; 31 static unsigned int m_hash_shift __read_mostly;
32 static unsigned int mp_hash_mask __read_mostly; 32 static unsigned int mp_hash_mask __read_mostly;
33 static unsigned int mp_hash_shift __read_mostly; 33 static unsigned int mp_hash_shift __read_mostly;
34 34
35 static __initdata unsigned long mhash_entries; 35 static __initdata unsigned long mhash_entries;
36 static int __init set_mhash_entries(char *str) 36 static int __init set_mhash_entries(char *str)
37 { 37 {
38 if (!str) 38 if (!str)
39 return 0; 39 return 0;
40 mhash_entries = simple_strtoul(str, &str, 0); 40 mhash_entries = simple_strtoul(str, &str, 0);
41 return 1; 41 return 1;
42 } 42 }
43 __setup("mhash_entries=", set_mhash_entries); 43 __setup("mhash_entries=", set_mhash_entries);
44 44
45 static __initdata unsigned long mphash_entries; 45 static __initdata unsigned long mphash_entries;
46 static int __init set_mphash_entries(char *str) 46 static int __init set_mphash_entries(char *str)
47 { 47 {
48 if (!str) 48 if (!str)
49 return 0; 49 return 0;
50 mphash_entries = simple_strtoul(str, &str, 0); 50 mphash_entries = simple_strtoul(str, &str, 0);
51 return 1; 51 return 1;
52 } 52 }
53 __setup("mphash_entries=", set_mphash_entries); 53 __setup("mphash_entries=", set_mphash_entries);
54 54
55 static u64 event; 55 static u64 event;
56 static DEFINE_IDA(mnt_id_ida); 56 static DEFINE_IDA(mnt_id_ida);
57 static DEFINE_IDA(mnt_group_ida); 57 static DEFINE_IDA(mnt_group_ida);
58 static DEFINE_SPINLOCK(mnt_id_lock); 58 static DEFINE_SPINLOCK(mnt_id_lock);
59 static int mnt_id_start = 0; 59 static int mnt_id_start = 0;
60 static int mnt_group_start = 1; 60 static int mnt_group_start = 1;
61 61
62 static struct hlist_head *mount_hashtable __read_mostly; 62 static struct hlist_head *mount_hashtable __read_mostly;
63 static struct hlist_head *mountpoint_hashtable __read_mostly; 63 static struct hlist_head *mountpoint_hashtable __read_mostly;
64 static struct kmem_cache *mnt_cache __read_mostly; 64 static struct kmem_cache *mnt_cache __read_mostly;
65 static DECLARE_RWSEM(namespace_sem); 65 static DECLARE_RWSEM(namespace_sem);
66 66
67 /* /sys/fs */ 67 /* /sys/fs */
68 struct kobject *fs_kobj; 68 struct kobject *fs_kobj;
69 EXPORT_SYMBOL_GPL(fs_kobj); 69 EXPORT_SYMBOL_GPL(fs_kobj);
70 70
71 /* 71 /*
72 * vfsmount lock may be taken for read to prevent changes to the 72 * vfsmount lock may be taken for read to prevent changes to the
73 * vfsmount hash, ie. during mountpoint lookups or walking back 73 * vfsmount hash, ie. during mountpoint lookups or walking back
74 * up the tree. 74 * up the tree.
75 * 75 *
76 * It should be taken for write in all cases where the vfsmount 76 * It should be taken for write in all cases where the vfsmount
77 * tree or hash is modified or when a vfsmount structure is modified. 77 * tree or hash is modified or when a vfsmount structure is modified.
78 */ 78 */
79 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); 79 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
80 80
81 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) 81 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
82 { 82 {
83 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 83 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
84 tmp += ((unsigned long)dentry / L1_CACHE_BYTES); 84 tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
85 tmp = tmp + (tmp >> m_hash_shift); 85 tmp = tmp + (tmp >> m_hash_shift);
86 return &mount_hashtable[tmp & m_hash_mask]; 86 return &mount_hashtable[tmp & m_hash_mask];
87 } 87 }
88 88
89 static inline struct hlist_head *mp_hash(struct dentry *dentry) 89 static inline struct hlist_head *mp_hash(struct dentry *dentry)
90 { 90 {
91 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); 91 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
92 tmp = tmp + (tmp >> mp_hash_shift); 92 tmp = tmp + (tmp >> mp_hash_shift);
93 return &mountpoint_hashtable[tmp & mp_hash_mask]; 93 return &mountpoint_hashtable[tmp & mp_hash_mask];
94 } 94 }
95 95
96 /* 96 /*
97 * allocation is serialized by namespace_sem, but we need the spinlock to 97 * allocation is serialized by namespace_sem, but we need the spinlock to
98 * serialize with freeing. 98 * serialize with freeing.
99 */ 99 */
100 static int mnt_alloc_id(struct mount *mnt) 100 static int mnt_alloc_id(struct mount *mnt)
101 { 101 {
102 int res; 102 int res;
103 103
104 retry: 104 retry:
105 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 105 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
106 spin_lock(&mnt_id_lock); 106 spin_lock(&mnt_id_lock);
107 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); 107 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
108 if (!res) 108 if (!res)
109 mnt_id_start = mnt->mnt_id + 1; 109 mnt_id_start = mnt->mnt_id + 1;
110 spin_unlock(&mnt_id_lock); 110 spin_unlock(&mnt_id_lock);
111 if (res == -EAGAIN) 111 if (res == -EAGAIN)
112 goto retry; 112 goto retry;
113 113
114 return res; 114 return res;
115 } 115 }
116 116
117 static void mnt_free_id(struct mount *mnt) 117 static void mnt_free_id(struct mount *mnt)
118 { 118 {
119 int id = mnt->mnt_id; 119 int id = mnt->mnt_id;
120 spin_lock(&mnt_id_lock); 120 spin_lock(&mnt_id_lock);
121 ida_remove(&mnt_id_ida, id); 121 ida_remove(&mnt_id_ida, id);
122 if (mnt_id_start > id) 122 if (mnt_id_start > id)
123 mnt_id_start = id; 123 mnt_id_start = id;
124 spin_unlock(&mnt_id_lock); 124 spin_unlock(&mnt_id_lock);
125 } 125 }
126 126
127 /* 127 /*
128 * Allocate a new peer group ID 128 * Allocate a new peer group ID
129 * 129 *
130 * mnt_group_ida is protected by namespace_sem 130 * mnt_group_ida is protected by namespace_sem
131 */ 131 */
132 static int mnt_alloc_group_id(struct mount *mnt) 132 static int mnt_alloc_group_id(struct mount *mnt)
133 { 133 {
134 int res; 134 int res;
135 135
136 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) 136 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
137 return -ENOMEM; 137 return -ENOMEM;
138 138
139 res = ida_get_new_above(&mnt_group_ida, 139 res = ida_get_new_above(&mnt_group_ida,
140 mnt_group_start, 140 mnt_group_start,
141 &mnt->mnt_group_id); 141 &mnt->mnt_group_id);
142 if (!res) 142 if (!res)
143 mnt_group_start = mnt->mnt_group_id + 1; 143 mnt_group_start = mnt->mnt_group_id + 1;
144 144
145 return res; 145 return res;
146 } 146 }
147 147
148 /* 148 /*
149 * Release a peer group ID 149 * Release a peer group ID
150 */ 150 */
151 void mnt_release_group_id(struct mount *mnt) 151 void mnt_release_group_id(struct mount *mnt)
152 { 152 {
153 int id = mnt->mnt_group_id; 153 int id = mnt->mnt_group_id;
154 ida_remove(&mnt_group_ida, id); 154 ida_remove(&mnt_group_ida, id);
155 if (mnt_group_start > id) 155 if (mnt_group_start > id)
156 mnt_group_start = id; 156 mnt_group_start = id;
157 mnt->mnt_group_id = 0; 157 mnt->mnt_group_id = 0;
158 } 158 }
159 159
160 /* 160 /*
161 * vfsmount lock must be held for read 161 * vfsmount lock must be held for read
162 */ 162 */
163 static inline void mnt_add_count(struct mount *mnt, int n) 163 static inline void mnt_add_count(struct mount *mnt, int n)
164 { 164 {
165 #ifdef CONFIG_SMP 165 #ifdef CONFIG_SMP
166 this_cpu_add(mnt->mnt_pcp->mnt_count, n); 166 this_cpu_add(mnt->mnt_pcp->mnt_count, n);
167 #else 167 #else
168 preempt_disable(); 168 preempt_disable();
169 mnt->mnt_count += n; 169 mnt->mnt_count += n;
170 preempt_enable(); 170 preempt_enable();
171 #endif 171 #endif
172 } 172 }
173 173
174 /* 174 /*
175 * vfsmount lock must be held for write 175 * vfsmount lock must be held for write
176 */ 176 */
177 unsigned int mnt_get_count(struct mount *mnt) 177 unsigned int mnt_get_count(struct mount *mnt)
178 { 178 {
179 #ifdef CONFIG_SMP 179 #ifdef CONFIG_SMP
180 unsigned int count = 0; 180 unsigned int count = 0;
181 int cpu; 181 int cpu;
182 182
183 for_each_possible_cpu(cpu) { 183 for_each_possible_cpu(cpu) {
184 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; 184 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
185 } 185 }
186 186
187 return count; 187 return count;
188 #else 188 #else
189 return mnt->mnt_count; 189 return mnt->mnt_count;
190 #endif 190 #endif
191 } 191 }
192 192
193 static struct mount *alloc_vfsmnt(const char *name) 193 static struct mount *alloc_vfsmnt(const char *name)
194 { 194 {
195 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 195 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
196 if (mnt) { 196 if (mnt) {
197 int err; 197 int err;
198 198
199 err = mnt_alloc_id(mnt); 199 err = mnt_alloc_id(mnt);
200 if (err) 200 if (err)
201 goto out_free_cache; 201 goto out_free_cache;
202 202
203 if (name) { 203 if (name) {
204 mnt->mnt_devname = kstrdup(name, GFP_KERNEL); 204 mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
205 if (!mnt->mnt_devname) 205 if (!mnt->mnt_devname)
206 goto out_free_id; 206 goto out_free_id;
207 } 207 }
208 208
209 #ifdef CONFIG_SMP 209 #ifdef CONFIG_SMP
210 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); 210 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
211 if (!mnt->mnt_pcp) 211 if (!mnt->mnt_pcp)
212 goto out_free_devname; 212 goto out_free_devname;
213 213
214 this_cpu_add(mnt->mnt_pcp->mnt_count, 1); 214 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
215 #else 215 #else
216 mnt->mnt_count = 1; 216 mnt->mnt_count = 1;
217 mnt->mnt_writers = 0; 217 mnt->mnt_writers = 0;
218 #endif 218 #endif
219 219
220 INIT_HLIST_NODE(&mnt->mnt_hash); 220 INIT_HLIST_NODE(&mnt->mnt_hash);
221 INIT_LIST_HEAD(&mnt->mnt_child); 221 INIT_LIST_HEAD(&mnt->mnt_child);
222 INIT_LIST_HEAD(&mnt->mnt_mounts); 222 INIT_LIST_HEAD(&mnt->mnt_mounts);
223 INIT_LIST_HEAD(&mnt->mnt_list); 223 INIT_LIST_HEAD(&mnt->mnt_list);
224 INIT_LIST_HEAD(&mnt->mnt_expire); 224 INIT_LIST_HEAD(&mnt->mnt_expire);
225 INIT_LIST_HEAD(&mnt->mnt_share); 225 INIT_LIST_HEAD(&mnt->mnt_share);
226 INIT_LIST_HEAD(&mnt->mnt_slave_list); 226 INIT_LIST_HEAD(&mnt->mnt_slave_list);
227 INIT_LIST_HEAD(&mnt->mnt_slave); 227 INIT_LIST_HEAD(&mnt->mnt_slave);
228 INIT_HLIST_NODE(&mnt->mnt_mp_list); 228 INIT_HLIST_NODE(&mnt->mnt_mp_list);
229 #ifdef CONFIG_FSNOTIFY 229 #ifdef CONFIG_FSNOTIFY
230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
231 #endif 231 #endif
232 } 232 }
233 return mnt; 233 return mnt;
234 234
235 #ifdef CONFIG_SMP 235 #ifdef CONFIG_SMP
236 out_free_devname: 236 out_free_devname:
237 kfree(mnt->mnt_devname); 237 kfree(mnt->mnt_devname);
238 #endif 238 #endif
239 out_free_id: 239 out_free_id:
240 mnt_free_id(mnt); 240 mnt_free_id(mnt);
241 out_free_cache: 241 out_free_cache:
242 kmem_cache_free(mnt_cache, mnt); 242 kmem_cache_free(mnt_cache, mnt);
243 return NULL; 243 return NULL;
244 } 244 }
245 245
246 /* 246 /*
247 * Most r/o checks on a fs are for operations that take 247 * Most r/o checks on a fs are for operations that take
248 * discrete amounts of time, like a write() or unlink(). 248 * discrete amounts of time, like a write() or unlink().
249 * We must keep track of when those operations start 249 * We must keep track of when those operations start
250 * (for permission checks) and when they end, so that 250 * (for permission checks) and when they end, so that
251 * we can determine when writes are able to occur to 251 * we can determine when writes are able to occur to
252 * a filesystem. 252 * a filesystem.
253 */ 253 */
254 /* 254 /*
255 * __mnt_is_readonly: check whether a mount is read-only 255 * __mnt_is_readonly: check whether a mount is read-only
256 * @mnt: the mount to check for its write status 256 * @mnt: the mount to check for its write status
257 * 257 *
258 * This shouldn't be used directly ouside of the VFS. 258 * This shouldn't be used directly ouside of the VFS.
259 * It does not guarantee that the filesystem will stay 259 * It does not guarantee that the filesystem will stay
260 * r/w, just that it is right *now*. This can not and 260 * r/w, just that it is right *now*. This can not and
261 * should not be used in place of IS_RDONLY(inode). 261 * should not be used in place of IS_RDONLY(inode).
262 * mnt_want/drop_write() will _keep_ the filesystem 262 * mnt_want/drop_write() will _keep_ the filesystem
263 * r/w. 263 * r/w.
264 */ 264 */
265 int __mnt_is_readonly(struct vfsmount *mnt) 265 int __mnt_is_readonly(struct vfsmount *mnt)
266 { 266 {
267 if (mnt->mnt_flags & MNT_READONLY) 267 if (mnt->mnt_flags & MNT_READONLY)
268 return 1; 268 return 1;
269 if (mnt->mnt_sb->s_flags & MS_RDONLY) 269 if (mnt->mnt_sb->s_flags & MS_RDONLY)
270 return 1; 270 return 1;
271 return 0; 271 return 0;
272 } 272 }
273 EXPORT_SYMBOL_GPL(__mnt_is_readonly); 273 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
274 274
275 static inline void mnt_inc_writers(struct mount *mnt) 275 static inline void mnt_inc_writers(struct mount *mnt)
276 { 276 {
277 #ifdef CONFIG_SMP 277 #ifdef CONFIG_SMP
278 this_cpu_inc(mnt->mnt_pcp->mnt_writers); 278 this_cpu_inc(mnt->mnt_pcp->mnt_writers);
279 #else 279 #else
280 mnt->mnt_writers++; 280 mnt->mnt_writers++;
281 #endif 281 #endif
282 } 282 }
283 283
284 static inline void mnt_dec_writers(struct mount *mnt) 284 static inline void mnt_dec_writers(struct mount *mnt)
285 { 285 {
286 #ifdef CONFIG_SMP 286 #ifdef CONFIG_SMP
287 this_cpu_dec(mnt->mnt_pcp->mnt_writers); 287 this_cpu_dec(mnt->mnt_pcp->mnt_writers);
288 #else 288 #else
289 mnt->mnt_writers--; 289 mnt->mnt_writers--;
290 #endif 290 #endif
291 } 291 }
292 292
293 static unsigned int mnt_get_writers(struct mount *mnt) 293 static unsigned int mnt_get_writers(struct mount *mnt)
294 { 294 {
295 #ifdef CONFIG_SMP 295 #ifdef CONFIG_SMP
296 unsigned int count = 0; 296 unsigned int count = 0;
297 int cpu; 297 int cpu;
298 298
299 for_each_possible_cpu(cpu) { 299 for_each_possible_cpu(cpu) {
300 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; 300 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
301 } 301 }
302 302
303 return count; 303 return count;
304 #else 304 #else
305 return mnt->mnt_writers; 305 return mnt->mnt_writers;
306 #endif 306 #endif
307 } 307 }
308 308
309 static int mnt_is_readonly(struct vfsmount *mnt) 309 static int mnt_is_readonly(struct vfsmount *mnt)
310 { 310 {
311 if (mnt->mnt_sb->s_readonly_remount) 311 if (mnt->mnt_sb->s_readonly_remount)
312 return 1; 312 return 1;
313 /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ 313 /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
314 smp_rmb(); 314 smp_rmb();
315 return __mnt_is_readonly(mnt); 315 return __mnt_is_readonly(mnt);
316 } 316 }
317 317
318 /* 318 /*
319 * Most r/o & frozen checks on a fs are for operations that take discrete 319 * Most r/o & frozen checks on a fs are for operations that take discrete
320 * amounts of time, like a write() or unlink(). We must keep track of when 320 * amounts of time, like a write() or unlink(). We must keep track of when
321 * those operations start (for permission checks) and when they end, so that we 321 * those operations start (for permission checks) and when they end, so that we
322 * can determine when writes are able to occur to a filesystem. 322 * can determine when writes are able to occur to a filesystem.
323 */ 323 */
324 /** 324 /**
325 * __mnt_want_write - get write access to a mount without freeze protection 325 * __mnt_want_write - get write access to a mount without freeze protection
326 * @m: the mount on which to take a write 326 * @m: the mount on which to take a write
327 * 327 *
328 * This tells the low-level filesystem that a write is about to be performed to 328 * This tells the low-level filesystem that a write is about to be performed to
329 * it, and makes sure that writes are allowed (mnt it read-write) before 329 * it, and makes sure that writes are allowed (mnt it read-write) before
330 * returning success. This operation does not protect against filesystem being 330 * returning success. This operation does not protect against filesystem being
331 * frozen. When the write operation is finished, __mnt_drop_write() must be 331 * frozen. When the write operation is finished, __mnt_drop_write() must be
332 * called. This is effectively a refcount. 332 * called. This is effectively a refcount.
333 */ 333 */
334 int __mnt_want_write(struct vfsmount *m) 334 int __mnt_want_write(struct vfsmount *m)
335 { 335 {
336 struct mount *mnt = real_mount(m); 336 struct mount *mnt = real_mount(m);
337 int ret = 0; 337 int ret = 0;
338 338
339 preempt_disable(); 339 preempt_disable();
340 mnt_inc_writers(mnt); 340 mnt_inc_writers(mnt);
341 /* 341 /*
342 * The store to mnt_inc_writers must be visible before we pass 342 * The store to mnt_inc_writers must be visible before we pass
343 * MNT_WRITE_HOLD loop below, so that the slowpath can see our 343 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
344 * incremented count after it has set MNT_WRITE_HOLD. 344 * incremented count after it has set MNT_WRITE_HOLD.
345 */ 345 */
346 smp_mb(); 346 smp_mb();
347 while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) 347 while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
348 cpu_relax(); 348 cpu_relax();
349 /* 349 /*
350 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will 350 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
351 * be set to match its requirements. So we must not load that until 351 * be set to match its requirements. So we must not load that until
352 * MNT_WRITE_HOLD is cleared. 352 * MNT_WRITE_HOLD is cleared.
353 */ 353 */
354 smp_rmb(); 354 smp_rmb();
355 if (mnt_is_readonly(m)) { 355 if (mnt_is_readonly(m)) {
356 mnt_dec_writers(mnt); 356 mnt_dec_writers(mnt);
357 ret = -EROFS; 357 ret = -EROFS;
358 } 358 }
359 preempt_enable(); 359 preempt_enable();
360 360
361 return ret; 361 return ret;
362 } 362 }
363 363
364 /** 364 /**
365 * mnt_want_write - get write access to a mount 365 * mnt_want_write - get write access to a mount
366 * @m: the mount on which to take a write 366 * @m: the mount on which to take a write
367 * 367 *
368 * This tells the low-level filesystem that a write is about to be performed to 368 * This tells the low-level filesystem that a write is about to be performed to
369 * it, and makes sure that writes are allowed (mount is read-write, filesystem 369 * it, and makes sure that writes are allowed (mount is read-write, filesystem
370 * is not frozen) before returning success. When the write operation is 370 * is not frozen) before returning success. When the write operation is
371 * finished, mnt_drop_write() must be called. This is effectively a refcount. 371 * finished, mnt_drop_write() must be called. This is effectively a refcount.
372 */ 372 */
373 int mnt_want_write(struct vfsmount *m) 373 int mnt_want_write(struct vfsmount *m)
374 { 374 {
375 int ret; 375 int ret;
376 376
377 sb_start_write(m->mnt_sb); 377 sb_start_write(m->mnt_sb);
378 ret = __mnt_want_write(m); 378 ret = __mnt_want_write(m);
379 if (ret) 379 if (ret)
380 sb_end_write(m->mnt_sb); 380 sb_end_write(m->mnt_sb);
381 return ret; 381 return ret;
382 } 382 }
383 EXPORT_SYMBOL_GPL(mnt_want_write); 383 EXPORT_SYMBOL_GPL(mnt_want_write);
384 384
385 /** 385 /**
386 * mnt_clone_write - get write access to a mount 386 * mnt_clone_write - get write access to a mount
387 * @mnt: the mount on which to take a write 387 * @mnt: the mount on which to take a write
388 * 388 *
389 * This is effectively like mnt_want_write, except 389 * This is effectively like mnt_want_write, except
390 * it must only be used to take an extra write reference 390 * it must only be used to take an extra write reference
391 * on a mountpoint that we already know has a write reference 391 * on a mountpoint that we already know has a write reference
392 * on it. This allows some optimisation. 392 * on it. This allows some optimisation.
393 * 393 *
394 * After finished, mnt_drop_write must be called as usual to 394 * After finished, mnt_drop_write must be called as usual to
395 * drop the reference. 395 * drop the reference.
396 */ 396 */
397 int mnt_clone_write(struct vfsmount *mnt) 397 int mnt_clone_write(struct vfsmount *mnt)
398 { 398 {
399 /* superblock may be r/o */ 399 /* superblock may be r/o */
400 if (__mnt_is_readonly(mnt)) 400 if (__mnt_is_readonly(mnt))
401 return -EROFS; 401 return -EROFS;
402 preempt_disable(); 402 preempt_disable();
403 mnt_inc_writers(real_mount(mnt)); 403 mnt_inc_writers(real_mount(mnt));
404 preempt_enable(); 404 preempt_enable();
405 return 0; 405 return 0;
406 } 406 }
407 EXPORT_SYMBOL_GPL(mnt_clone_write); 407 EXPORT_SYMBOL_GPL(mnt_clone_write);
408 408
409 /** 409 /**
410 * __mnt_want_write_file - get write access to a file's mount 410 * __mnt_want_write_file - get write access to a file's mount
411 * @file: the file who's mount on which to take a write 411 * @file: the file who's mount on which to take a write
412 * 412 *
413 * This is like __mnt_want_write, but it takes a file and can 413 * This is like __mnt_want_write, but it takes a file and can
414 * do some optimisations if the file is open for write already 414 * do some optimisations if the file is open for write already
415 */ 415 */
416 int __mnt_want_write_file(struct file *file) 416 int __mnt_want_write_file(struct file *file)
417 { 417 {
418 if (!(file->f_mode & FMODE_WRITER)) 418 if (!(file->f_mode & FMODE_WRITER))
419 return __mnt_want_write(file->f_path.mnt); 419 return __mnt_want_write(file->f_path.mnt);
420 else 420 else
421 return mnt_clone_write(file->f_path.mnt); 421 return mnt_clone_write(file->f_path.mnt);
422 } 422 }
423 423
424 /** 424 /**
425 * mnt_want_write_file - get write access to a file's mount 425 * mnt_want_write_file - get write access to a file's mount
426 * @file: the file who's mount on which to take a write 426 * @file: the file who's mount on which to take a write
427 * 427 *
428 * This is like mnt_want_write, but it takes a file and can 428 * This is like mnt_want_write, but it takes a file and can
429 * do some optimisations if the file is open for write already 429 * do some optimisations if the file is open for write already
430 */ 430 */
431 int mnt_want_write_file(struct file *file) 431 int mnt_want_write_file(struct file *file)
432 { 432 {
433 int ret; 433 int ret;
434 434
435 sb_start_write(file->f_path.mnt->mnt_sb); 435 sb_start_write(file->f_path.mnt->mnt_sb);
436 ret = __mnt_want_write_file(file); 436 ret = __mnt_want_write_file(file);
437 if (ret) 437 if (ret)
438 sb_end_write(file->f_path.mnt->mnt_sb); 438 sb_end_write(file->f_path.mnt->mnt_sb);
439 return ret; 439 return ret;
440 } 440 }
441 EXPORT_SYMBOL_GPL(mnt_want_write_file); 441 EXPORT_SYMBOL_GPL(mnt_want_write_file);
442 442
443 /** 443 /**
444 * __mnt_drop_write - give up write access to a mount 444 * __mnt_drop_write - give up write access to a mount
445 * @mnt: the mount on which to give up write access 445 * @mnt: the mount on which to give up write access
446 * 446 *
447 * Tells the low-level filesystem that we are done 447 * Tells the low-level filesystem that we are done
448 * performing writes to it. Must be matched with 448 * performing writes to it. Must be matched with
449 * __mnt_want_write() call above. 449 * __mnt_want_write() call above.
450 */ 450 */
451 void __mnt_drop_write(struct vfsmount *mnt) 451 void __mnt_drop_write(struct vfsmount *mnt)
452 { 452 {
453 preempt_disable(); 453 preempt_disable();
454 mnt_dec_writers(real_mount(mnt)); 454 mnt_dec_writers(real_mount(mnt));
455 preempt_enable(); 455 preempt_enable();
456 } 456 }
457 457
458 /** 458 /**
459 * mnt_drop_write - give up write access to a mount 459 * mnt_drop_write - give up write access to a mount
460 * @mnt: the mount on which to give up write access 460 * @mnt: the mount on which to give up write access
461 * 461 *
462 * Tells the low-level filesystem that we are done performing writes to it and 462 * Tells the low-level filesystem that we are done performing writes to it and
463 * also allows filesystem to be frozen again. Must be matched with 463 * also allows filesystem to be frozen again. Must be matched with
464 * mnt_want_write() call above. 464 * mnt_want_write() call above.
465 */ 465 */
466 void mnt_drop_write(struct vfsmount *mnt) 466 void mnt_drop_write(struct vfsmount *mnt)
467 { 467 {
468 __mnt_drop_write(mnt); 468 __mnt_drop_write(mnt);
469 sb_end_write(mnt->mnt_sb); 469 sb_end_write(mnt->mnt_sb);
470 } 470 }
471 EXPORT_SYMBOL_GPL(mnt_drop_write); 471 EXPORT_SYMBOL_GPL(mnt_drop_write);
472 472
473 void __mnt_drop_write_file(struct file *file) 473 void __mnt_drop_write_file(struct file *file)
474 { 474 {
475 __mnt_drop_write(file->f_path.mnt); 475 __mnt_drop_write(file->f_path.mnt);
476 } 476 }
477 477
478 void mnt_drop_write_file(struct file *file) 478 void mnt_drop_write_file(struct file *file)
479 { 479 {
480 mnt_drop_write(file->f_path.mnt); 480 mnt_drop_write(file->f_path.mnt);
481 } 481 }
482 EXPORT_SYMBOL(mnt_drop_write_file); 482 EXPORT_SYMBOL(mnt_drop_write_file);
483 483
484 static int mnt_make_readonly(struct mount *mnt) 484 static int mnt_make_readonly(struct mount *mnt)
485 { 485 {
486 int ret = 0; 486 int ret = 0;
487 487
488 lock_mount_hash(); 488 lock_mount_hash();
489 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; 489 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
490 /* 490 /*
491 * After storing MNT_WRITE_HOLD, we'll read the counters. This store 491 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
492 * should be visible before we do. 492 * should be visible before we do.
493 */ 493 */
494 smp_mb(); 494 smp_mb();
495 495
496 /* 496 /*
497 * With writers on hold, if this value is zero, then there are 497 * With writers on hold, if this value is zero, then there are
498 * definitely no active writers (although held writers may subsequently 498 * definitely no active writers (although held writers may subsequently
499 * increment the count, they'll have to wait, and decrement it after 499 * increment the count, they'll have to wait, and decrement it after
500 * seeing MNT_READONLY). 500 * seeing MNT_READONLY).
501 * 501 *
502 * It is OK to have counter incremented on one CPU and decremented on 502 * It is OK to have counter incremented on one CPU and decremented on
503 * another: the sum will add up correctly. The danger would be when we 503 * another: the sum will add up correctly. The danger would be when we
504 * sum up each counter, if we read a counter before it is incremented, 504 * sum up each counter, if we read a counter before it is incremented,
505 * but then read another CPU's count which it has been subsequently 505 * but then read another CPU's count which it has been subsequently
506 * decremented from -- we would see more decrements than we should. 506 * decremented from -- we would see more decrements than we should.
507 * MNT_WRITE_HOLD protects against this scenario, because 507 * MNT_WRITE_HOLD protects against this scenario, because
508 * mnt_want_write first increments count, then smp_mb, then spins on 508 * mnt_want_write first increments count, then smp_mb, then spins on
509 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while 509 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
510 * we're counting up here. 510 * we're counting up here.
511 */ 511 */
512 if (mnt_get_writers(mnt) > 0) 512 if (mnt_get_writers(mnt) > 0)
513 ret = -EBUSY; 513 ret = -EBUSY;
514 else 514 else
515 mnt->mnt.mnt_flags |= MNT_READONLY; 515 mnt->mnt.mnt_flags |= MNT_READONLY;
516 /* 516 /*
517 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers 517 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
518 * that become unheld will see MNT_READONLY. 518 * that become unheld will see MNT_READONLY.
519 */ 519 */
520 smp_wmb(); 520 smp_wmb();
521 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 521 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
522 unlock_mount_hash(); 522 unlock_mount_hash();
523 return ret; 523 return ret;
524 } 524 }
525 525
526 static void __mnt_unmake_readonly(struct mount *mnt) 526 static void __mnt_unmake_readonly(struct mount *mnt)
527 { 527 {
528 lock_mount_hash(); 528 lock_mount_hash();
529 mnt->mnt.mnt_flags &= ~MNT_READONLY; 529 mnt->mnt.mnt_flags &= ~MNT_READONLY;
530 unlock_mount_hash(); 530 unlock_mount_hash();
531 } 531 }
532 532
533 int sb_prepare_remount_readonly(struct super_block *sb) 533 int sb_prepare_remount_readonly(struct super_block *sb)
534 { 534 {
535 struct mount *mnt; 535 struct mount *mnt;
536 int err = 0; 536 int err = 0;
537 537
538 /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ 538 /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
539 if (atomic_long_read(&sb->s_remove_count)) 539 if (atomic_long_read(&sb->s_remove_count))
540 return -EBUSY; 540 return -EBUSY;
541 541
542 lock_mount_hash(); 542 lock_mount_hash();
543 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { 543 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
544 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { 544 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
545 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; 545 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
546 smp_mb(); 546 smp_mb();
547 if (mnt_get_writers(mnt) > 0) { 547 if (mnt_get_writers(mnt) > 0) {
548 err = -EBUSY; 548 err = -EBUSY;
549 break; 549 break;
550 } 550 }
551 } 551 }
552 } 552 }
553 if (!err && atomic_long_read(&sb->s_remove_count)) 553 if (!err && atomic_long_read(&sb->s_remove_count))
554 err = -EBUSY; 554 err = -EBUSY;
555 555
556 if (!err) { 556 if (!err) {
557 sb->s_readonly_remount = 1; 557 sb->s_readonly_remount = 1;
558 smp_wmb(); 558 smp_wmb();
559 } 559 }
560 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { 560 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
561 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) 561 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
562 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 562 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
563 } 563 }
564 unlock_mount_hash(); 564 unlock_mount_hash();
565 565
566 return err; 566 return err;
567 } 567 }
568 568
569 static void free_vfsmnt(struct mount *mnt) 569 static void free_vfsmnt(struct mount *mnt)
570 { 570 {
571 kfree(mnt->mnt_devname); 571 kfree(mnt->mnt_devname);
572 #ifdef CONFIG_SMP 572 #ifdef CONFIG_SMP
573 free_percpu(mnt->mnt_pcp); 573 free_percpu(mnt->mnt_pcp);
574 #endif 574 #endif
575 kmem_cache_free(mnt_cache, mnt); 575 kmem_cache_free(mnt_cache, mnt);
576 } 576 }
577 577
578 static void delayed_free_vfsmnt(struct rcu_head *head) 578 static void delayed_free_vfsmnt(struct rcu_head *head)
579 { 579 {
580 free_vfsmnt(container_of(head, struct mount, mnt_rcu)); 580 free_vfsmnt(container_of(head, struct mount, mnt_rcu));
581 } 581 }
582 582
583 /* call under rcu_read_lock */ 583 /* call under rcu_read_lock */
584 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) 584 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
585 { 585 {
586 struct mount *mnt; 586 struct mount *mnt;
587 if (read_seqretry(&mount_lock, seq)) 587 if (read_seqretry(&mount_lock, seq))
588 return false; 588 return false;
589 if (bastard == NULL) 589 if (bastard == NULL)
590 return true; 590 return true;
591 mnt = real_mount(bastard); 591 mnt = real_mount(bastard);
592 mnt_add_count(mnt, 1); 592 mnt_add_count(mnt, 1);
593 if (likely(!read_seqretry(&mount_lock, seq))) 593 if (likely(!read_seqretry(&mount_lock, seq)))
594 return true; 594 return true;
595 if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { 595 if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
596 mnt_add_count(mnt, -1); 596 mnt_add_count(mnt, -1);
597 return false; 597 return false;
598 } 598 }
599 rcu_read_unlock(); 599 rcu_read_unlock();
600 mntput(bastard); 600 mntput(bastard);
601 rcu_read_lock(); 601 rcu_read_lock();
602 return false; 602 return false;
603 } 603 }
604 604
605 /* 605 /*
606 * find the first mount at @dentry on vfsmount @mnt. 606 * find the first mount at @dentry on vfsmount @mnt.
607 * call under rcu_read_lock() 607 * call under rcu_read_lock()
608 */ 608 */
609 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 609 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
610 { 610 {
611 struct hlist_head *head = m_hash(mnt, dentry); 611 struct hlist_head *head = m_hash(mnt, dentry);
612 struct mount *p; 612 struct mount *p;
613 613
614 hlist_for_each_entry_rcu(p, head, mnt_hash) 614 hlist_for_each_entry_rcu(p, head, mnt_hash)
615 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) 615 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
616 return p; 616 return p;
617 return NULL; 617 return NULL;
618 } 618 }
619 619
620 /* 620 /*
621 * find the last mount at @dentry on vfsmount @mnt. 621 * find the last mount at @dentry on vfsmount @mnt.
622 * mount_lock must be held. 622 * mount_lock must be held.
623 */ 623 */
624 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) 624 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
625 { 625 {
626 struct mount *p, *res; 626 struct mount *p, *res;
627 res = p = __lookup_mnt(mnt, dentry); 627 res = p = __lookup_mnt(mnt, dentry);
628 if (!p) 628 if (!p)
629 goto out; 629 goto out;
630 hlist_for_each_entry_continue(p, mnt_hash) { 630 hlist_for_each_entry_continue(p, mnt_hash) {
631 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) 631 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
632 break; 632 break;
633 res = p; 633 res = p;
634 } 634 }
635 out: 635 out:
636 return res; 636 return res;
637 } 637 }
638 638
639 /* 639 /*
640 * lookup_mnt - Return the first child mount mounted at path 640 * lookup_mnt - Return the first child mount mounted at path
641 * 641 *
642 * "First" means first mounted chronologically. If you create the 642 * "First" means first mounted chronologically. If you create the
643 * following mounts: 643 * following mounts:
644 * 644 *
645 * mount /dev/sda1 /mnt 645 * mount /dev/sda1 /mnt
646 * mount /dev/sda2 /mnt 646 * mount /dev/sda2 /mnt
647 * mount /dev/sda3 /mnt 647 * mount /dev/sda3 /mnt
648 * 648 *
649 * Then lookup_mnt() on the base /mnt dentry in the root mount will 649 * Then lookup_mnt() on the base /mnt dentry in the root mount will
650 * return successively the root dentry and vfsmount of /dev/sda1, then 650 * return successively the root dentry and vfsmount of /dev/sda1, then
651 * /dev/sda2, then /dev/sda3, then NULL. 651 * /dev/sda2, then /dev/sda3, then NULL.
652 * 652 *
653 * lookup_mnt takes a reference to the found vfsmount. 653 * lookup_mnt takes a reference to the found vfsmount.
654 */ 654 */
655 struct vfsmount *lookup_mnt(struct path *path) 655 struct vfsmount *lookup_mnt(struct path *path)
656 { 656 {
657 struct mount *child_mnt; 657 struct mount *child_mnt;
658 struct vfsmount *m; 658 struct vfsmount *m;
659 unsigned seq; 659 unsigned seq;
660 660
661 rcu_read_lock(); 661 rcu_read_lock();
662 do { 662 do {
663 seq = read_seqbegin(&mount_lock); 663 seq = read_seqbegin(&mount_lock);
664 child_mnt = __lookup_mnt(path->mnt, path->dentry); 664 child_mnt = __lookup_mnt(path->mnt, path->dentry);
665 m = child_mnt ? &child_mnt->mnt : NULL; 665 m = child_mnt ? &child_mnt->mnt : NULL;
666 } while (!legitimize_mnt(m, seq)); 666 } while (!legitimize_mnt(m, seq));
667 rcu_read_unlock(); 667 rcu_read_unlock();
668 return m; 668 return m;
669 } 669 }
670 670
671 /* 671 /*
672 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the 672 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
673 * current mount namespace. 673 * current mount namespace.
674 * 674 *
675 * The common case is dentries are not mountpoints at all and that 675 * The common case is dentries are not mountpoints at all and that
676 * test is handled inline. For the slow case when we are actually 676 * test is handled inline. For the slow case when we are actually
677 * dealing with a mountpoint of some kind, walk through all of the 677 * dealing with a mountpoint of some kind, walk through all of the
678 * mounts in the current mount namespace and test to see if the dentry 678 * mounts in the current mount namespace and test to see if the dentry
679 * is a mountpoint. 679 * is a mountpoint.
680 * 680 *
681 * The mount_hashtable is not usable in the context because we 681 * The mount_hashtable is not usable in the context because we
682 * need to identify all mounts that may be in the current mount 682 * need to identify all mounts that may be in the current mount
683 * namespace not just a mount that happens to have some specified 683 * namespace not just a mount that happens to have some specified
684 * parent mount. 684 * parent mount.
685 */ 685 */
686 bool __is_local_mountpoint(struct dentry *dentry) 686 bool __is_local_mountpoint(struct dentry *dentry)
687 { 687 {
688 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 688 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
689 struct mount *mnt; 689 struct mount *mnt;
690 bool is_covered = false; 690 bool is_covered = false;
691 691
692 if (!d_mountpoint(dentry)) 692 if (!d_mountpoint(dentry))
693 goto out; 693 goto out;
694 694
695 down_read(&namespace_sem); 695 down_read(&namespace_sem);
696 list_for_each_entry(mnt, &ns->list, mnt_list) { 696 list_for_each_entry(mnt, &ns->list, mnt_list) {
697 is_covered = (mnt->mnt_mountpoint == dentry); 697 is_covered = (mnt->mnt_mountpoint == dentry);
698 if (is_covered) 698 if (is_covered)
699 break; 699 break;
700 } 700 }
701 up_read(&namespace_sem); 701 up_read(&namespace_sem);
702 out: 702 out:
703 return is_covered; 703 return is_covered;
704 } 704 }
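
For reference, the fast-path wrapper mentioned in the comment above lives in fs/mount.h and looks roughly like this (paraphrased sketch):

static inline bool is_local_mountpoint(struct dentry *dentry)
{
	if (!d_mountpoint(dentry))
		return false;		/* common case, no semaphore taken */
	return __is_local_mountpoint(dentry);
}
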
705 705
706 static struct mountpoint *lookup_mountpoint(struct dentry *dentry) 706 static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
707 { 707 {
708 struct hlist_head *chain = mp_hash(dentry); 708 struct hlist_head *chain = mp_hash(dentry);
709 struct mountpoint *mp; 709 struct mountpoint *mp;
710 710
711 hlist_for_each_entry(mp, chain, m_hash) { 711 hlist_for_each_entry(mp, chain, m_hash) {
712 if (mp->m_dentry == dentry) { 712 if (mp->m_dentry == dentry) {
713 /* might be worth a WARN_ON() */ 713 /* might be worth a WARN_ON() */
714 if (d_unlinked(dentry)) 714 if (d_unlinked(dentry))
715 return ERR_PTR(-ENOENT); 715 return ERR_PTR(-ENOENT);
716 mp->m_count++; 716 mp->m_count++;
717 return mp; 717 return mp;
718 } 718 }
719 } 719 }
720 return NULL; 720 return NULL;
721 } 721 }
722 722
723 static struct mountpoint *new_mountpoint(struct dentry *dentry) 723 static struct mountpoint *new_mountpoint(struct dentry *dentry)
724 { 724 {
725 struct hlist_head *chain = mp_hash(dentry); 725 struct hlist_head *chain = mp_hash(dentry);
726 struct mountpoint *mp; 726 struct mountpoint *mp;
727 int ret; 727 int ret;
728 728
729 mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); 729 mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
730 if (!mp) 730 if (!mp)
731 return ERR_PTR(-ENOMEM); 731 return ERR_PTR(-ENOMEM);
732 732
733 ret = d_set_mounted(dentry); 733 ret = d_set_mounted(dentry);
734 if (ret) { 734 if (ret) {
735 kfree(mp); 735 kfree(mp);
736 return ERR_PTR(ret); 736 return ERR_PTR(ret);
737 } 737 }
738 738
739 mp->m_dentry = dentry; 739 mp->m_dentry = dentry;
740 mp->m_count = 1; 740 mp->m_count = 1;
741 hlist_add_head(&mp->m_hash, chain); 741 hlist_add_head(&mp->m_hash, chain);
742 INIT_HLIST_HEAD(&mp->m_list); 742 INIT_HLIST_HEAD(&mp->m_list);
743 return mp; 743 return mp;
744 } 744 }
745 745
746 static void put_mountpoint(struct mountpoint *mp) 746 static void put_mountpoint(struct mountpoint *mp)
747 { 747 {
748 if (!--mp->m_count) { 748 if (!--mp->m_count) {
749 struct dentry *dentry = mp->m_dentry; 749 struct dentry *dentry = mp->m_dentry;
750 BUG_ON(!hlist_empty(&mp->m_list)); 750 BUG_ON(!hlist_empty(&mp->m_list));
751 spin_lock(&dentry->d_lock); 751 spin_lock(&dentry->d_lock);
752 dentry->d_flags &= ~DCACHE_MOUNTED; 752 dentry->d_flags &= ~DCACHE_MOUNTED;
753 spin_unlock(&dentry->d_lock); 753 spin_unlock(&dentry->d_lock);
754 hlist_del(&mp->m_hash); 754 hlist_del(&mp->m_hash);
755 kfree(mp); 755 kfree(mp);
756 } 756 }
757 } 757 }
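
Taken together, lookup_mountpoint()/new_mountpoint()/put_mountpoint() form a small refcounted registry keyed by dentry. A sketch of the usual find-or-create pattern (get_mountpoint_sketch() is hypothetical; the real callers in this file wrap it in mount locking):

/* caller holds namespace_sem */
static struct mountpoint *get_mountpoint_sketch(struct dentry *dentry)
{
	struct mountpoint *mp = lookup_mountpoint(dentry);	/* takes a ref */

	if (IS_ERR(mp))
		return mp;			/* dentry was unlinked under us */
	if (!mp)
		mp = new_mountpoint(dentry);	/* fresh mp, m_count == 1 */
	return mp;				/* balance with put_mountpoint() */
}
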
758 758
759 static inline int check_mnt(struct mount *mnt) 759 static inline int check_mnt(struct mount *mnt)
760 { 760 {
761 return mnt->mnt_ns == current->nsproxy->mnt_ns; 761 return mnt->mnt_ns == current->nsproxy->mnt_ns;
762 } 762 }
763 763
764 /* 764 /*
765 * vfsmount lock must be held for write 765 * vfsmount lock must be held for write
766 */ 766 */
767 static void touch_mnt_namespace(struct mnt_namespace *ns) 767 static void touch_mnt_namespace(struct mnt_namespace *ns)
768 { 768 {
769 if (ns) { 769 if (ns) {
770 ns->event = ++event; 770 ns->event = ++event;
771 wake_up_interruptible(&ns->poll); 771 wake_up_interruptible(&ns->poll);
772 } 772 }
773 } 773 }
774 774
775 /* 775 /*
776 * vfsmount lock must be held for write 776 * vfsmount lock must be held for write
777 */ 777 */
778 static void __touch_mnt_namespace(struct mnt_namespace *ns) 778 static void __touch_mnt_namespace(struct mnt_namespace *ns)
779 { 779 {
780 if (ns && ns->event != event) { 780 if (ns && ns->event != event) {
781 ns->event = event; 781 ns->event = event;
782 wake_up_interruptible(&ns->poll); 782 wake_up_interruptible(&ns->poll);
783 } 783 }
784 } 784 }
785 785
786 /* 786 /*
787 * vfsmount lock must be held for write 787 * vfsmount lock must be held for write
788 */ 788 */
789 static void detach_mnt(struct mount *mnt, struct path *old_path) 789 static void detach_mnt(struct mount *mnt, struct path *old_path)
790 { 790 {
791 old_path->dentry = mnt->mnt_mountpoint; 791 old_path->dentry = mnt->mnt_mountpoint;
792 old_path->mnt = &mnt->mnt_parent->mnt; 792 old_path->mnt = &mnt->mnt_parent->mnt;
793 mnt->mnt_parent = mnt; 793 mnt->mnt_parent = mnt;
794 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 794 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
795 list_del_init(&mnt->mnt_child); 795 list_del_init(&mnt->mnt_child);
796 hlist_del_init_rcu(&mnt->mnt_hash); 796 hlist_del_init_rcu(&mnt->mnt_hash);
797 hlist_del_init(&mnt->mnt_mp_list); 797 hlist_del_init(&mnt->mnt_mp_list);
798 put_mountpoint(mnt->mnt_mp); 798 put_mountpoint(mnt->mnt_mp);
799 mnt->mnt_mp = NULL; 799 mnt->mnt_mp = NULL;
800 } 800 }
801 801
802 /* 802 /*
803 * vfsmount lock must be held for write 803 * vfsmount lock must be held for write
804 */ 804 */
805 void mnt_set_mountpoint(struct mount *mnt, 805 void mnt_set_mountpoint(struct mount *mnt,
806 struct mountpoint *mp, 806 struct mountpoint *mp,
807 struct mount *child_mnt) 807 struct mount *child_mnt)
808 { 808 {
809 mp->m_count++; 809 mp->m_count++;
810 mnt_add_count(mnt, 1); /* essentially, that's mntget */ 810 mnt_add_count(mnt, 1); /* essentially, that's mntget */
811 child_mnt->mnt_mountpoint = dget(mp->m_dentry); 811 child_mnt->mnt_mountpoint = dget(mp->m_dentry);
812 child_mnt->mnt_parent = mnt; 812 child_mnt->mnt_parent = mnt;
813 child_mnt->mnt_mp = mp; 813 child_mnt->mnt_mp = mp;
814 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); 814 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
815 } 815 }
816 816
817 /* 817 /*
818 * vfsmount lock must be held for write 818 * vfsmount lock must be held for write
819 */ 819 */
820 static void attach_mnt(struct mount *mnt, 820 static void attach_mnt(struct mount *mnt,
821 struct mount *parent, 821 struct mount *parent,
822 struct mountpoint *mp) 822 struct mountpoint *mp)
823 { 823 {
824 mnt_set_mountpoint(parent, mp, mnt); 824 mnt_set_mountpoint(parent, mp, mnt);
825 hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); 825 hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
826 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 826 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
827 } 827 }
828 828
829 static void attach_shadowed(struct mount *mnt, 829 static void attach_shadowed(struct mount *mnt,
830 struct mount *parent, 830 struct mount *parent,
831 struct mount *shadows) 831 struct mount *shadows)
832 { 832 {
833 if (shadows) { 833 if (shadows) {
834 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); 834 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
835 list_add(&mnt->mnt_child, &shadows->mnt_child); 835 list_add(&mnt->mnt_child, &shadows->mnt_child);
836 } else { 836 } else {
837 hlist_add_head_rcu(&mnt->mnt_hash, 837 hlist_add_head_rcu(&mnt->mnt_hash,
838 m_hash(&parent->mnt, mnt->mnt_mountpoint)); 838 m_hash(&parent->mnt, mnt->mnt_mountpoint));
839 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 839 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
840 } 840 }
841 } 841 }
842 842
843 /* 843 /*
844 * vfsmount lock must be held for write 844 * vfsmount lock must be held for write
845 */ 845 */
846 static void commit_tree(struct mount *mnt, struct mount *shadows) 846 static void commit_tree(struct mount *mnt, struct mount *shadows)
847 { 847 {
848 struct mount *parent = mnt->mnt_parent; 848 struct mount *parent = mnt->mnt_parent;
849 struct mount *m; 849 struct mount *m;
850 LIST_HEAD(head); 850 LIST_HEAD(head);
851 struct mnt_namespace *n = parent->mnt_ns; 851 struct mnt_namespace *n = parent->mnt_ns;
852 852
853 BUG_ON(parent == mnt); 853 BUG_ON(parent == mnt);
854 854
855 list_add_tail(&head, &mnt->mnt_list); 855 list_add_tail(&head, &mnt->mnt_list);
856 list_for_each_entry(m, &head, mnt_list) 856 list_for_each_entry(m, &head, mnt_list)
857 m->mnt_ns = n; 857 m->mnt_ns = n;
858 858
859 list_splice(&head, n->list.prev); 859 list_splice(&head, n->list.prev);
860 860
861 attach_shadowed(mnt, parent, shadows); 861 attach_shadowed(mnt, parent, shadows);
862 touch_mnt_namespace(n); 862 touch_mnt_namespace(n);
863 } 863 }
864 864
865 static struct mount *next_mnt(struct mount *p, struct mount *root) 865 static struct mount *next_mnt(struct mount *p, struct mount *root)
866 { 866 {
867 struct list_head *next = p->mnt_mounts.next; 867 struct list_head *next = p->mnt_mounts.next;
868 if (next == &p->mnt_mounts) { 868 if (next == &p->mnt_mounts) {
869 while (1) { 869 while (1) {
870 if (p == root) 870 if (p == root)
871 return NULL; 871 return NULL;
872 next = p->mnt_child.next; 872 next = p->mnt_child.next;
873 if (next != &p->mnt_parent->mnt_mounts) 873 if (next != &p->mnt_parent->mnt_mounts)
874 break; 874 break;
875 p = p->mnt_parent; 875 p = p->mnt_parent;
876 } 876 }
877 } 877 }
878 return list_entry(next, struct mount, mnt_child); 878 return list_entry(next, struct mount, mnt_child);
879 } 879 }
880 880
881 static struct mount *skip_mnt_tree(struct mount *p) 881 static struct mount *skip_mnt_tree(struct mount *p)
882 { 882 {
883 struct list_head *prev = p->mnt_mounts.prev; 883 struct list_head *prev = p->mnt_mounts.prev;
884 while (prev != &p->mnt_mounts) { 884 while (prev != &p->mnt_mounts) {
885 p = list_entry(prev, struct mount, mnt_child); 885 p = list_entry(prev, struct mount, mnt_child);
886 prev = p->mnt_mounts.prev; 886 prev = p->mnt_mounts.prev;
887 } 887 }
888 return p; 888 return p;
889 } 889 }
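
next_mnt() yields a pre-order, depth-first walk of a mount tree, and skip_mnt_tree() jumps to the last node of a subtree so a walk can leap over it. A minimal sketch of the iteration pattern (count_tree() is hypothetical; compare may_umount_tree() and umount_tree() below):

static unsigned int count_tree(struct mount *root)
{
	struct mount *p;
	unsigned int n = 0;

	/* visits root first, then its children depth-first */
	for (p = root; p; p = next_mnt(p, root))
		n++;
	return n;
}
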
890 890
891 struct vfsmount * 891 struct vfsmount *
892 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) 892 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
893 { 893 {
894 struct mount *mnt; 894 struct mount *mnt;
895 struct dentry *root; 895 struct dentry *root;
896 896
897 if (!type) 897 if (!type)
898 return ERR_PTR(-ENODEV); 898 return ERR_PTR(-ENODEV);
899 899
900 mnt = alloc_vfsmnt(name); 900 mnt = alloc_vfsmnt(name);
901 if (!mnt) 901 if (!mnt)
902 return ERR_PTR(-ENOMEM); 902 return ERR_PTR(-ENOMEM);
903 903
904 if (flags & MS_KERNMOUNT) 904 if (flags & MS_KERNMOUNT)
905 mnt->mnt.mnt_flags = MNT_INTERNAL; 905 mnt->mnt.mnt_flags = MNT_INTERNAL;
906 906
907 root = mount_fs(type, flags, name, data); 907 root = mount_fs(type, flags, name, data);
908 if (IS_ERR(root)) { 908 if (IS_ERR(root)) {
909 mnt_free_id(mnt); 909 mnt_free_id(mnt);
910 free_vfsmnt(mnt); 910 free_vfsmnt(mnt);
911 return ERR_CAST(root); 911 return ERR_CAST(root);
912 } 912 }
913 913
914 mnt->mnt.mnt_root = root; 914 mnt->mnt.mnt_root = root;
915 mnt->mnt.mnt_sb = root->d_sb; 915 mnt->mnt.mnt_sb = root->d_sb;
916 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 916 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
917 mnt->mnt_parent = mnt; 917 mnt->mnt_parent = mnt;
918 lock_mount_hash(); 918 lock_mount_hash();
919 list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); 919 list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
920 unlock_mount_hash(); 920 unlock_mount_hash();
921 return &mnt->mnt; 921 return &mnt->mnt;
922 } 922 }
923 EXPORT_SYMBOL_GPL(vfs_kern_mount); 923 EXPORT_SYMBOL_GPL(vfs_kern_mount);
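
Kernel-internal users normally reach this through the kern_mount() wrapper, which passes MS_KERNMOUNT so the resulting mount is flagged MNT_INTERNAL (see above). A sketch of the typical init-time use, in the spirit of the internal nsfs mount this commit introduces (example_fs_type and example_mnt are hypothetical):

static struct vfsmount *example_mnt;

static int __init example_init(void)
{
	/* kern_mount(t) == vfs_kern_mount(t, MS_KERNMOUNT, t->name, NULL) */
	example_mnt = kern_mount(&example_fs_type);
	if (IS_ERR(example_mnt))
		return PTR_ERR(example_mnt);
	return 0;
}
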
924 924
925 static struct mount *clone_mnt(struct mount *old, struct dentry *root, 925 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
926 int flag) 926 int flag)
927 { 927 {
928 struct super_block *sb = old->mnt.mnt_sb; 928 struct super_block *sb = old->mnt.mnt_sb;
929 struct mount *mnt; 929 struct mount *mnt;
930 int err; 930 int err;
931 931
932 mnt = alloc_vfsmnt(old->mnt_devname); 932 mnt = alloc_vfsmnt(old->mnt_devname);
933 if (!mnt) 933 if (!mnt)
934 return ERR_PTR(-ENOMEM); 934 return ERR_PTR(-ENOMEM);
935 935
936 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) 936 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
937 mnt->mnt_group_id = 0; /* not a peer of original */ 937 mnt->mnt_group_id = 0; /* not a peer of original */
938 else 938 else
939 mnt->mnt_group_id = old->mnt_group_id; 939 mnt->mnt_group_id = old->mnt_group_id;
940 940
941 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { 941 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
942 err = mnt_alloc_group_id(mnt); 942 err = mnt_alloc_group_id(mnt);
943 if (err) 943 if (err)
944 goto out_free; 944 goto out_free;
945 } 945 }
946 946
947 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); 947 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
948 /* Don't allow unprivileged users to change mount flags */ 948 /* Don't allow unprivileged users to change mount flags */
949 if (flag & CL_UNPRIVILEGED) { 949 if (flag & CL_UNPRIVILEGED) {
950 mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; 950 mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
951 951
952 if (mnt->mnt.mnt_flags & MNT_READONLY) 952 if (mnt->mnt.mnt_flags & MNT_READONLY)
953 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; 953 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
954 954
955 if (mnt->mnt.mnt_flags & MNT_NODEV) 955 if (mnt->mnt.mnt_flags & MNT_NODEV)
956 mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; 956 mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
957 957
958 if (mnt->mnt.mnt_flags & MNT_NOSUID) 958 if (mnt->mnt.mnt_flags & MNT_NOSUID)
959 mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; 959 mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
960 960
961 if (mnt->mnt.mnt_flags & MNT_NOEXEC) 961 if (mnt->mnt.mnt_flags & MNT_NOEXEC)
962 mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; 962 mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
963 } 963 }
964 964
965 /* Don't allow unprivileged users to reveal what is under a mount */ 965 /* Don't allow unprivileged users to reveal what is under a mount */
966 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) 966 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
967 mnt->mnt.mnt_flags |= MNT_LOCKED; 967 mnt->mnt.mnt_flags |= MNT_LOCKED;
968 968
969 atomic_inc(&sb->s_active); 969 atomic_inc(&sb->s_active);
970 mnt->mnt.mnt_sb = sb; 970 mnt->mnt.mnt_sb = sb;
971 mnt->mnt.mnt_root = dget(root); 971 mnt->mnt.mnt_root = dget(root);
972 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 972 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
973 mnt->mnt_parent = mnt; 973 mnt->mnt_parent = mnt;
974 lock_mount_hash(); 974 lock_mount_hash();
975 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 975 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
976 unlock_mount_hash(); 976 unlock_mount_hash();
977 977
978 if ((flag & CL_SLAVE) || 978 if ((flag & CL_SLAVE) ||
979 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { 979 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
980 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 980 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
981 mnt->mnt_master = old; 981 mnt->mnt_master = old;
982 CLEAR_MNT_SHARED(mnt); 982 CLEAR_MNT_SHARED(mnt);
983 } else if (!(flag & CL_PRIVATE)) { 983 } else if (!(flag & CL_PRIVATE)) {
984 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) 984 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
985 list_add(&mnt->mnt_share, &old->mnt_share); 985 list_add(&mnt->mnt_share, &old->mnt_share);
986 if (IS_MNT_SLAVE(old)) 986 if (IS_MNT_SLAVE(old))
987 list_add(&mnt->mnt_slave, &old->mnt_slave); 987 list_add(&mnt->mnt_slave, &old->mnt_slave);
988 mnt->mnt_master = old->mnt_master; 988 mnt->mnt_master = old->mnt_master;
989 } 989 }
990 if (flag & CL_MAKE_SHARED) 990 if (flag & CL_MAKE_SHARED)
991 set_mnt_shared(mnt); 991 set_mnt_shared(mnt);
992 992
993 /* stick the duplicate mount on the same expiry list 993 /* stick the duplicate mount on the same expiry list
994 * as the original if that was on one */ 994 * as the original if that was on one */
995 if (flag & CL_EXPIRE) { 995 if (flag & CL_EXPIRE) {
996 if (!list_empty(&old->mnt_expire)) 996 if (!list_empty(&old->mnt_expire))
997 list_add(&mnt->mnt_expire, &old->mnt_expire); 997 list_add(&mnt->mnt_expire, &old->mnt_expire);
998 } 998 }
999 999
1000 return mnt; 1000 return mnt;
1001 1001
1002 out_free: 1002 out_free:
1003 mnt_free_id(mnt); 1003 mnt_free_id(mnt);
1004 free_vfsmnt(mnt); 1004 free_vfsmnt(mnt);
1005 return ERR_PTR(err); 1005 return ERR_PTR(err);
1006 } 1006 }
1007 1007
1008 static void cleanup_mnt(struct mount *mnt) 1008 static void cleanup_mnt(struct mount *mnt)
1009 { 1009 {
1010 /* 1010 /*
1011 * This probably indicates that somebody messed 1011 * This probably indicates that somebody messed
1012 * up a mnt_want/drop_write() pair. If this 1012 * up a mnt_want/drop_write() pair. If this
1013 * happens, the filesystem was probably unable 1013 * happens, the filesystem was probably unable
1014 * to make r/w->r/o transitions. 1014 * to make r/w->r/o transitions.
1015 */ 1015 */
1016 /* 1016 /*
1017 * The locking used to deal with mnt_count decrement provides barriers, 1017 * The locking used to deal with mnt_count decrement provides barriers,
1018 * so mnt_get_writers() below is safe. 1018 * so mnt_get_writers() below is safe.
1019 */ 1019 */
1020 WARN_ON(mnt_get_writers(mnt)); 1020 WARN_ON(mnt_get_writers(mnt));
1021 if (unlikely(mnt->mnt_pins.first)) 1021 if (unlikely(mnt->mnt_pins.first))
1022 mnt_pin_kill(mnt); 1022 mnt_pin_kill(mnt);
1023 fsnotify_vfsmount_delete(&mnt->mnt); 1023 fsnotify_vfsmount_delete(&mnt->mnt);
1024 dput(mnt->mnt.mnt_root); 1024 dput(mnt->mnt.mnt_root);
1025 deactivate_super(mnt->mnt.mnt_sb); 1025 deactivate_super(mnt->mnt.mnt_sb);
1026 mnt_free_id(mnt); 1026 mnt_free_id(mnt);
1027 call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); 1027 call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1028 } 1028 }
1029 1029
1030 static void __cleanup_mnt(struct rcu_head *head) 1030 static void __cleanup_mnt(struct rcu_head *head)
1031 { 1031 {
1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu)); 1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1033 } 1033 }
1034 1034
1035 static LLIST_HEAD(delayed_mntput_list); 1035 static LLIST_HEAD(delayed_mntput_list);
1036 static void delayed_mntput(struct work_struct *unused) 1036 static void delayed_mntput(struct work_struct *unused)
1037 { 1037 {
1038 struct llist_node *node = llist_del_all(&delayed_mntput_list); 1038 struct llist_node *node = llist_del_all(&delayed_mntput_list);
1039 struct llist_node *next; 1039 struct llist_node *next;
1040 1040
1041 for (; node; node = next) { 1041 for (; node; node = next) {
1042 next = llist_next(node); 1042 next = llist_next(node);
1043 cleanup_mnt(llist_entry(node, struct mount, mnt_llist)); 1043 cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
1044 } 1044 }
1045 } 1045 }
1046 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); 1046 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1047 1047
1048 static void mntput_no_expire(struct mount *mnt) 1048 static void mntput_no_expire(struct mount *mnt)
1049 { 1049 {
1050 rcu_read_lock(); 1050 rcu_read_lock();
1051 mnt_add_count(mnt, -1); 1051 mnt_add_count(mnt, -1);
1052 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ 1052 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
1053 rcu_read_unlock(); 1053 rcu_read_unlock();
1054 return; 1054 return;
1055 } 1055 }
1056 lock_mount_hash(); 1056 lock_mount_hash();
1057 if (mnt_get_count(mnt)) { 1057 if (mnt_get_count(mnt)) {
1058 rcu_read_unlock(); 1058 rcu_read_unlock();
1059 unlock_mount_hash(); 1059 unlock_mount_hash();
1060 return; 1060 return;
1061 } 1061 }
1062 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { 1062 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1063 rcu_read_unlock(); 1063 rcu_read_unlock();
1064 unlock_mount_hash(); 1064 unlock_mount_hash();
1065 return; 1065 return;
1066 } 1066 }
1067 mnt->mnt.mnt_flags |= MNT_DOOMED; 1067 mnt->mnt.mnt_flags |= MNT_DOOMED;
1068 rcu_read_unlock(); 1068 rcu_read_unlock();
1069 1069
1070 list_del(&mnt->mnt_instance); 1070 list_del(&mnt->mnt_instance);
1071 unlock_mount_hash(); 1071 unlock_mount_hash();
1072 1072
1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { 1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1074 struct task_struct *task = current; 1074 struct task_struct *task = current;
1075 if (likely(!(task->flags & PF_KTHREAD))) { 1075 if (likely(!(task->flags & PF_KTHREAD))) {
1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt); 1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1077 if (!task_work_add(task, &mnt->mnt_rcu, true)) 1077 if (!task_work_add(task, &mnt->mnt_rcu, true))
1078 return; 1078 return;
1079 } 1079 }
1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) 1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1081 schedule_delayed_work(&delayed_mntput_work, 1); 1081 schedule_delayed_work(&delayed_mntput_work, 1);
1082 return; 1082 return;
1083 } 1083 }
1084 cleanup_mnt(mnt); 1084 cleanup_mnt(mnt);
1085 } 1085 }
1086 1086
1087 void mntput(struct vfsmount *mnt) 1087 void mntput(struct vfsmount *mnt)
1088 { 1088 {
1089 if (mnt) { 1089 if (mnt) {
1090 struct mount *m = real_mount(mnt); 1090 struct mount *m = real_mount(mnt);
1091 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ 1091 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1092 if (unlikely(m->mnt_expiry_mark)) 1092 if (unlikely(m->mnt_expiry_mark))
1093 m->mnt_expiry_mark = 0; 1093 m->mnt_expiry_mark = 0;
1094 mntput_no_expire(m); 1094 mntput_no_expire(m);
1095 } 1095 }
1096 } 1096 }
1097 EXPORT_SYMBOL(mntput); 1097 EXPORT_SYMBOL(mntput);
1098 1098
1099 struct vfsmount *mntget(struct vfsmount *mnt) 1099 struct vfsmount *mntget(struct vfsmount *mnt)
1100 { 1100 {
1101 if (mnt) 1101 if (mnt)
1102 mnt_add_count(real_mount(mnt), 1); 1102 mnt_add_count(real_mount(mnt), 1);
1103 return mnt; 1103 return mnt;
1104 } 1104 }
1105 EXPORT_SYMBOL(mntget); 1105 EXPORT_SYMBOL(mntget);
1106 1106
1107 struct vfsmount *mnt_clone_internal(struct path *path) 1107 struct vfsmount *mnt_clone_internal(struct path *path)
1108 { 1108 {
1109 struct mount *p; 1109 struct mount *p;
1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); 1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1111 if (IS_ERR(p)) 1111 if (IS_ERR(p))
1112 return ERR_CAST(p); 1112 return ERR_CAST(p);
1113 p->mnt.mnt_flags |= MNT_INTERNAL; 1113 p->mnt.mnt_flags |= MNT_INTERNAL;
1114 return &p->mnt; 1114 return &p->mnt;
1115 } 1115 }
1116 1116
1117 static inline void mangle(struct seq_file *m, const char *s) 1117 static inline void mangle(struct seq_file *m, const char *s)
1118 { 1118 {
1119 seq_escape(m, s, " \t\n\\"); 1119 seq_escape(m, s, " \t\n\\");
1120 } 1120 }
1121 1121
1122 /* 1122 /*
1123 * Simple .show_options callback for filesystems which don't want to 1123 * Simple .show_options callback for filesystems which don't want to
1124 * implement more complex mount option showing. 1124 * implement more complex mount option showing.
1125 * 1125 *
1126 * See also save_mount_options(). 1126 * See also save_mount_options().
1127 */ 1127 */
1128 int generic_show_options(struct seq_file *m, struct dentry *root) 1128 int generic_show_options(struct seq_file *m, struct dentry *root)
1129 { 1129 {
1130 const char *options; 1130 const char *options;
1131 1131
1132 rcu_read_lock(); 1132 rcu_read_lock();
1133 options = rcu_dereference(root->d_sb->s_options); 1133 options = rcu_dereference(root->d_sb->s_options);
1134 1134
1135 if (options != NULL && options[0]) { 1135 if (options != NULL && options[0]) {
1136 seq_putc(m, ','); 1136 seq_putc(m, ',');
1137 mangle(m, options); 1137 mangle(m, options);
1138 } 1138 }
1139 rcu_read_unlock(); 1139 rcu_read_unlock();
1140 1140
1141 return 0; 1141 return 0;
1142 } 1142 }
1143 EXPORT_SYMBOL(generic_show_options); 1143 EXPORT_SYMBOL(generic_show_options);
1144 1144
1145 /* 1145 /*
1146 * If filesystem uses generic_show_options(), this function should be 1146 * If filesystem uses generic_show_options(), this function should be
1147 * called from the fill_super() callback. 1147 * called from the fill_super() callback.
1148 * 1148 *
1149 * The .remount_fs callback usually needs to be handled in a special 1149 * The .remount_fs callback usually needs to be handled in a special
1150 * way, to make sure, that previous options are not overwritten if the 1150 * way, to make sure, that previous options are not overwritten if the
1151 * remount fails. 1151 * remount fails.
1152 * 1152 *
1153 * Also note, that if the filesystem's .remount_fs function doesn't 1153 * Also note, that if the filesystem's .remount_fs function doesn't
1154 * reset all options to their default value, but changes only newly 1154 * reset all options to their default value, but changes only newly
1155 * given options, then the displayed options will not reflect reality 1155 * given options, then the displayed options will not reflect reality
1156 * any more. 1156 * any more.
1157 */ 1157 */
1158 void save_mount_options(struct super_block *sb, char *options) 1158 void save_mount_options(struct super_block *sb, char *options)
1159 { 1159 {
1160 BUG_ON(sb->s_options); 1160 BUG_ON(sb->s_options);
1161 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); 1161 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
1162 } 1162 }
1163 EXPORT_SYMBOL(save_mount_options); 1163 EXPORT_SYMBOL(save_mount_options);
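
A minimal sketch of the pairing the two comments above describe (example_fill_super() and example_super_ops are hypothetical): the filesystem saves the raw option string at mount time and lets generic_show_options() replay it.

static const struct super_operations example_super_ops = {
	.show_options	= generic_show_options,
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	save_mount_options(sb, data);	/* data is the raw mount option string */
	sb->s_op = &example_super_ops;
	/* ... allocate the root inode, set sb fields, etc ... */
	return 0;
}
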
1164 1164
1165 void replace_mount_options(struct super_block *sb, char *options) 1165 void replace_mount_options(struct super_block *sb, char *options)
1166 { 1166 {
1167 char *old = sb->s_options; 1167 char *old = sb->s_options;
1168 rcu_assign_pointer(sb->s_options, options); 1168 rcu_assign_pointer(sb->s_options, options);
1169 if (old) { 1169 if (old) {
1170 synchronize_rcu(); 1170 synchronize_rcu();
1171 kfree(old); 1171 kfree(old);
1172 } 1172 }
1173 } 1173 }
1174 EXPORT_SYMBOL(replace_mount_options); 1174 EXPORT_SYMBOL(replace_mount_options);
1175 1175
1176 #ifdef CONFIG_PROC_FS 1176 #ifdef CONFIG_PROC_FS
1177 /* iterator; we want it to have access to namespace_sem, thus here... */ 1177 /* iterator; we want it to have access to namespace_sem, thus here... */
1178 static void *m_start(struct seq_file *m, loff_t *pos) 1178 static void *m_start(struct seq_file *m, loff_t *pos)
1179 { 1179 {
1180 struct proc_mounts *p = proc_mounts(m); 1180 struct proc_mounts *p = proc_mounts(m);
1181 1181
1182 down_read(&namespace_sem); 1182 down_read(&namespace_sem);
1183 if (p->cached_event == p->ns->event) { 1183 if (p->cached_event == p->ns->event) {
1184 void *v = p->cached_mount; 1184 void *v = p->cached_mount;
1185 if (*pos == p->cached_index) 1185 if (*pos == p->cached_index)
1186 return v; 1186 return v;
1187 if (*pos == p->cached_index + 1) { 1187 if (*pos == p->cached_index + 1) {
1188 v = seq_list_next(v, &p->ns->list, &p->cached_index); 1188 v = seq_list_next(v, &p->ns->list, &p->cached_index);
1189 return p->cached_mount = v; 1189 return p->cached_mount = v;
1190 } 1190 }
1191 } 1191 }
1192 1192
1193 p->cached_event = p->ns->event; 1193 p->cached_event = p->ns->event;
1194 p->cached_mount = seq_list_start(&p->ns->list, *pos); 1194 p->cached_mount = seq_list_start(&p->ns->list, *pos);
1195 p->cached_index = *pos; 1195 p->cached_index = *pos;
1196 return p->cached_mount; 1196 return p->cached_mount;
1197 } 1197 }
1198 1198
1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1200 { 1200 {
1201 struct proc_mounts *p = proc_mounts(m); 1201 struct proc_mounts *p = proc_mounts(m);
1202 1202
1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos); 1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1204 p->cached_index = *pos; 1204 p->cached_index = *pos;
1205 return p->cached_mount; 1205 return p->cached_mount;
1206 } 1206 }
1207 1207
1208 static void m_stop(struct seq_file *m, void *v) 1208 static void m_stop(struct seq_file *m, void *v)
1209 { 1209 {
1210 up_read(&namespace_sem); 1210 up_read(&namespace_sem);
1211 } 1211 }
1212 1212
1213 static int m_show(struct seq_file *m, void *v) 1213 static int m_show(struct seq_file *m, void *v)
1214 { 1214 {
1215 struct proc_mounts *p = proc_mounts(m); 1215 struct proc_mounts *p = proc_mounts(m);
1216 struct mount *r = list_entry(v, struct mount, mnt_list); 1216 struct mount *r = list_entry(v, struct mount, mnt_list);
1217 return p->show(m, &r->mnt); 1217 return p->show(m, &r->mnt);
1218 } 1218 }
1219 1219
1220 const struct seq_operations mounts_op = { 1220 const struct seq_operations mounts_op = {
1221 .start = m_start, 1221 .start = m_start,
1222 .next = m_next, 1222 .next = m_next,
1223 .stop = m_stop, 1223 .stop = m_stop,
1224 .show = m_show, 1224 .show = m_show,
1225 }; 1225 };
1226 #endif /* CONFIG_PROC_FS */ 1226 #endif /* CONFIG_PROC_FS */
1227 1227
1228 /** 1228 /**
1229 * may_umount_tree - check if a mount tree is busy 1229 * may_umount_tree - check if a mount tree is busy
1230 * @m: root of mount tree 1230 * @m: root of mount tree
1231 * 1231 *
1232 * This is called to check if a tree of mounts has any 1232 * This is called to check if a tree of mounts has any
1233 * open files, pwds, chroots or sub mounts that are 1233 * open files, pwds, chroots or sub mounts that are
1234 * busy. 1234 * busy.
1235 */ 1235 */
1236 int may_umount_tree(struct vfsmount *m) 1236 int may_umount_tree(struct vfsmount *m)
1237 { 1237 {
1238 struct mount *mnt = real_mount(m); 1238 struct mount *mnt = real_mount(m);
1239 int actual_refs = 0; 1239 int actual_refs = 0;
1240 int minimum_refs = 0; 1240 int minimum_refs = 0;
1241 struct mount *p; 1241 struct mount *p;
1242 BUG_ON(!m); 1242 BUG_ON(!m);
1243 1243
1244 /* write lock needed for mnt_get_count */ 1244 /* write lock needed for mnt_get_count */
1245 lock_mount_hash(); 1245 lock_mount_hash();
1246 for (p = mnt; p; p = next_mnt(p, mnt)) { 1246 for (p = mnt; p; p = next_mnt(p, mnt)) {
1247 actual_refs += mnt_get_count(p); 1247 actual_refs += mnt_get_count(p);
1248 minimum_refs += 2; 1248 minimum_refs += 2;
1249 } 1249 }
1250 unlock_mount_hash(); 1250 unlock_mount_hash();
1251 1251
1252 if (actual_refs > minimum_refs) 1252 if (actual_refs > minimum_refs)
1253 return 0; 1253 return 0;
1254 1254
1255 return 1; 1255 return 1;
1256 } 1256 }
1257 1257
1258 EXPORT_SYMBOL(may_umount_tree); 1258 EXPORT_SYMBOL(may_umount_tree);
1259 1259
1260 /** 1260 /**
1261 * may_umount - check if a mount point is busy 1261 * may_umount - check if a mount point is busy
1262 * @mnt: root of mount 1262 * @mnt: root of mount
1263 * 1263 *
1264 * This is called to check if a mount point has any 1264 * This is called to check if a mount point has any
1265 * open files, pwds, chroots or sub mounts. If the 1265 * open files, pwds, chroots or sub mounts. If the
1266 * mount has sub mounts this will return busy 1266 * mount has sub mounts this will return busy
1267 * regardless of whether the sub mounts are busy. 1267 * regardless of whether the sub mounts are busy.
1268 * 1268 *
1269 * Doesn't take quota and stuff into account. IOW, in some cases it will 1269 * Doesn't take quota and stuff into account. IOW, in some cases it will
1270 * give false negatives. The main reason why it's here is that we need 1270 * give false negatives. The main reason why it's here is that we need
1271 * a non-destructive way to look for easily umountable filesystems. 1271 * a non-destructive way to look for easily umountable filesystems.
1272 */ 1272 */
1273 int may_umount(struct vfsmount *mnt) 1273 int may_umount(struct vfsmount *mnt)
1274 { 1274 {
1275 int ret = 1; 1275 int ret = 1;
1276 down_read(&namespace_sem); 1276 down_read(&namespace_sem);
1277 lock_mount_hash(); 1277 lock_mount_hash();
1278 if (propagate_mount_busy(real_mount(mnt), 2)) 1278 if (propagate_mount_busy(real_mount(mnt), 2))
1279 ret = 0; 1279 ret = 0;
1280 unlock_mount_hash(); 1280 unlock_mount_hash();
1281 up_read(&namespace_sem); 1281 up_read(&namespace_sem);
1282 return ret; 1282 return ret;
1283 } 1283 }
1284 1284
1285 EXPORT_SYMBOL(may_umount); 1285 EXPORT_SYMBOL(may_umount);
1286 1286
1287 static HLIST_HEAD(unmounted); /* protected by namespace_sem */ 1287 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1288 1288
1289 static void namespace_unlock(void) 1289 static void namespace_unlock(void)
1290 { 1290 {
1291 struct mount *mnt; 1291 struct mount *mnt;
1292 struct hlist_head head = unmounted; 1292 struct hlist_head head = unmounted;
1293 1293
1294 if (likely(hlist_empty(&head))) { 1294 if (likely(hlist_empty(&head))) {
1295 up_write(&namespace_sem); 1295 up_write(&namespace_sem);
1296 return; 1296 return;
1297 } 1297 }
1298 1298
1299 head.first->pprev = &head.first; 1299 head.first->pprev = &head.first;
1300 INIT_HLIST_HEAD(&unmounted); 1300 INIT_HLIST_HEAD(&unmounted);
1301 1301
1302 /* undo decrements we'd done in umount_tree() */ 1302 /* undo decrements we'd done in umount_tree() */
1303 hlist_for_each_entry(mnt, &head, mnt_hash) 1303 hlist_for_each_entry(mnt, &head, mnt_hash)
1304 if (mnt->mnt_ex_mountpoint.mnt) 1304 if (mnt->mnt_ex_mountpoint.mnt)
1305 mntget(mnt->mnt_ex_mountpoint.mnt); 1305 mntget(mnt->mnt_ex_mountpoint.mnt);
1306 1306
1307 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1308 1308
1309 synchronize_rcu(); 1309 synchronize_rcu();
1310 1310
1311 while (!hlist_empty(&head)) { 1311 while (!hlist_empty(&head)) {
1312 mnt = hlist_entry(head.first, struct mount, mnt_hash); 1312 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1313 hlist_del_init(&mnt->mnt_hash); 1313 hlist_del_init(&mnt->mnt_hash);
1314 if (mnt->mnt_ex_mountpoint.mnt) 1314 if (mnt->mnt_ex_mountpoint.mnt)
1315 path_put(&mnt->mnt_ex_mountpoint); 1315 path_put(&mnt->mnt_ex_mountpoint);
1316 mntput(&mnt->mnt); 1316 mntput(&mnt->mnt);
1317 } 1317 }
1318 } 1318 }
1319 1319
1320 static inline void namespace_lock(void) 1320 static inline void namespace_lock(void)
1321 { 1321 {
1322 down_write(&namespace_sem); 1322 down_write(&namespace_sem);
1323 } 1323 }
1324 1324
1325 /* 1325 /*
1326 * mount_lock must be held 1326 * mount_lock must be held
1327 * namespace_sem must be held for write 1327 * namespace_sem must be held for write
1328 * how = 0 => just this tree, don't propagate 1328 * how = 0 => just this tree, don't propagate
1329 * how = 1 => propagate; we know that nobody else has reference to any victims 1329 * how = 1 => propagate; we know that nobody else has reference to any victims
1330 * how = 2 => lazy umount 1330 * how = 2 => lazy umount
1331 */ 1331 */
1332 void umount_tree(struct mount *mnt, int how) 1332 void umount_tree(struct mount *mnt, int how)
1333 { 1333 {
1334 HLIST_HEAD(tmp_list); 1334 HLIST_HEAD(tmp_list);
1335 struct mount *p; 1335 struct mount *p;
1336 struct mount *last = NULL; 1336 struct mount *last = NULL;
1337 1337
1338 for (p = mnt; p; p = next_mnt(p, mnt)) { 1338 for (p = mnt; p; p = next_mnt(p, mnt)) {
1339 hlist_del_init_rcu(&p->mnt_hash); 1339 hlist_del_init_rcu(&p->mnt_hash);
1340 hlist_add_head(&p->mnt_hash, &tmp_list); 1340 hlist_add_head(&p->mnt_hash, &tmp_list);
1341 } 1341 }
1342 1342
1343 hlist_for_each_entry(p, &tmp_list, mnt_hash) 1343 hlist_for_each_entry(p, &tmp_list, mnt_hash)
1344 list_del_init(&p->mnt_child); 1344 list_del_init(&p->mnt_child);
1345 1345
1346 if (how) 1346 if (how)
1347 propagate_umount(&tmp_list); 1347 propagate_umount(&tmp_list);
1348 1348
1349 hlist_for_each_entry(p, &tmp_list, mnt_hash) { 1349 hlist_for_each_entry(p, &tmp_list, mnt_hash) {
1350 list_del_init(&p->mnt_expire); 1350 list_del_init(&p->mnt_expire);
1351 list_del_init(&p->mnt_list); 1351 list_del_init(&p->mnt_list);
1352 __touch_mnt_namespace(p->mnt_ns); 1352 __touch_mnt_namespace(p->mnt_ns);
1353 p->mnt_ns = NULL; 1353 p->mnt_ns = NULL;
1354 if (how < 2) 1354 if (how < 2)
1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1356 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1357 hlist_del_init(&p->mnt_mp_list); 1357 hlist_del_init(&p->mnt_mp_list);
1358 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1359 mnt_add_count(p->mnt_parent, -1); 1359 mnt_add_count(p->mnt_parent, -1);
1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */
1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; 1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1363 p->mnt_mountpoint = p->mnt.mnt_root; 1363 p->mnt_mountpoint = p->mnt.mnt_root;
1364 p->mnt_parent = p; 1364 p->mnt_parent = p;
1365 p->mnt_mp = NULL; 1365 p->mnt_mp = NULL;
1366 } 1366 }
1367 change_mnt_propagation(p, MS_PRIVATE); 1367 change_mnt_propagation(p, MS_PRIVATE);
1368 last = p; 1368 last = p;
1369 } 1369 }
1370 if (last) { 1370 if (last) {
1371 last->mnt_hash.next = unmounted.first; 1371 last->mnt_hash.next = unmounted.first;
1372 unmounted.first = tmp_list.first; 1372 unmounted.first = tmp_list.first;
1373 unmounted.first->pprev = &unmounted.first; 1373 unmounted.first->pprev = &unmounted.first;
1374 } 1374 }
1375 } 1375 }
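
For orientation, the callers visible in this file pick "how" as follows (a summary of the code below, not new behaviour):

/*
 * how = 0: teardown paths that never published the tree
 *          (copy_tree() failure, drop_collected_mounts())
 * how = 1: regular umount in do_umount() - propagation, with the
 *          guarantee that nobody else holds references to the victims
 * how = 2: lazy detach - umount(MNT_DETACH) and __detach_mounts()
 */
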
1376 1376
1377 static void shrink_submounts(struct mount *mnt); 1377 static void shrink_submounts(struct mount *mnt);
1378 1378
1379 static int do_umount(struct mount *mnt, int flags) 1379 static int do_umount(struct mount *mnt, int flags)
1380 { 1380 {
1381 struct super_block *sb = mnt->mnt.mnt_sb; 1381 struct super_block *sb = mnt->mnt.mnt_sb;
1382 int retval; 1382 int retval;
1383 1383
1384 retval = security_sb_umount(&mnt->mnt, flags); 1384 retval = security_sb_umount(&mnt->mnt, flags);
1385 if (retval) 1385 if (retval)
1386 return retval; 1386 return retval;
1387 1387
1388 /* 1388 /*
1389 * Allow userspace to request a mountpoint be expired rather than 1389 * Allow userspace to request a mountpoint be expired rather than
1390 * unmounting unconditionally. Unmount only happens if: 1390 * unmounting unconditionally. Unmount only happens if:
1391 * (1) the mark is already set (the mark is cleared by mntput()) 1391 * (1) the mark is already set (the mark is cleared by mntput())
1392 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] 1392 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1393 */ 1393 */
1394 if (flags & MNT_EXPIRE) { 1394 if (flags & MNT_EXPIRE) {
1395 if (&mnt->mnt == current->fs->root.mnt || 1395 if (&mnt->mnt == current->fs->root.mnt ||
1396 flags & (MNT_FORCE | MNT_DETACH)) 1396 flags & (MNT_FORCE | MNT_DETACH))
1397 return -EINVAL; 1397 return -EINVAL;
1398 1398
1399 /* 1399 /*
1400 * probably don't strictly need the lock here if we examined 1400 * probably don't strictly need the lock here if we examined
1401 * all race cases, but it's a slowpath. 1401 * all race cases, but it's a slowpath.
1402 */ 1402 */
1403 lock_mount_hash(); 1403 lock_mount_hash();
1404 if (mnt_get_count(mnt) != 2) { 1404 if (mnt_get_count(mnt) != 2) {
1405 unlock_mount_hash(); 1405 unlock_mount_hash();
1406 return -EBUSY; 1406 return -EBUSY;
1407 } 1407 }
1408 unlock_mount_hash(); 1408 unlock_mount_hash();
1409 1409
1410 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1410 if (!xchg(&mnt->mnt_expiry_mark, 1))
1411 return -EAGAIN; 1411 return -EAGAIN;
1412 } 1412 }
1413 1413
1414 /* 1414 /*
1415 * If we may have to abort operations to get out of this 1415 * If we may have to abort operations to get out of this
1416 * mount, and they will themselves hold resources we must 1416 * mount, and they will themselves hold resources we must
1417 * allow the fs to do things. In the Unix tradition of 1417 * allow the fs to do things. In the Unix tradition of
1418 * 'Gee that's tricky, let's do it in userspace' the umount_begin 1418 * 'Gee that's tricky, let's do it in userspace' the umount_begin
1419 * might fail to complete on the first run through as other tasks 1419 * might fail to complete on the first run through as other tasks
1420 * must return, and the like. That's for the mount program to worry 1420 * must return, and the like. That's for the mount program to worry
1421 * about for the moment. 1421 * about for the moment.
1422 */ 1422 */
1423 1423
1424 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1424 if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1425 sb->s_op->umount_begin(sb); 1425 sb->s_op->umount_begin(sb);
1426 } 1426 }
1427 1427
1428 /* 1428 /*
1429 * No sense to grab the lock for this test, but test itself looks 1429 * No sense to grab the lock for this test, but test itself looks
1430 * somewhat bogus. Suggestions for better replacement? 1430 * somewhat bogus. Suggestions for better replacement?
1431 * Ho-hum... In principle, we might treat that as umount + switch 1431 * Ho-hum... In principle, we might treat that as umount + switch
1432 * to rootfs. GC would eventually take care of the old vfsmount. 1432 * to rootfs. GC would eventually take care of the old vfsmount.
1433 * Actually it makes sense, especially if rootfs would contain a 1433 * Actually it makes sense, especially if rootfs would contain a
1434 * /reboot - static binary that would close all descriptors and 1434 * /reboot - static binary that would close all descriptors and
1435 * call reboot(2). Then init(8) could umount root and exec /reboot. 1435 * call reboot(2). Then init(8) could umount root and exec /reboot.
1436 */ 1436 */
1437 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { 1437 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1438 /* 1438 /*
1439 * Special case for "unmounting" root ... 1439 * Special case for "unmounting" root ...
1440 * we just try to remount it readonly. 1440 * we just try to remount it readonly.
1441 */ 1441 */
1442 if (!capable(CAP_SYS_ADMIN)) 1442 if (!capable(CAP_SYS_ADMIN))
1443 return -EPERM; 1443 return -EPERM;
1444 down_write(&sb->s_umount); 1444 down_write(&sb->s_umount);
1445 if (!(sb->s_flags & MS_RDONLY)) 1445 if (!(sb->s_flags & MS_RDONLY))
1446 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1446 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
1447 up_write(&sb->s_umount); 1447 up_write(&sb->s_umount);
1448 return retval; 1448 return retval;
1449 } 1449 }
1450 1450
1451 namespace_lock(); 1451 namespace_lock();
1452 lock_mount_hash(); 1452 lock_mount_hash();
1453 event++; 1453 event++;
1454 1454
1455 if (flags & MNT_DETACH) { 1455 if (flags & MNT_DETACH) {
1456 if (!list_empty(&mnt->mnt_list)) 1456 if (!list_empty(&mnt->mnt_list))
1457 umount_tree(mnt, 2); 1457 umount_tree(mnt, 2);
1458 retval = 0; 1458 retval = 0;
1459 } else { 1459 } else {
1460 shrink_submounts(mnt); 1460 shrink_submounts(mnt);
1461 retval = -EBUSY; 1461 retval = -EBUSY;
1462 if (!propagate_mount_busy(mnt, 2)) { 1462 if (!propagate_mount_busy(mnt, 2)) {
1463 if (!list_empty(&mnt->mnt_list)) 1463 if (!list_empty(&mnt->mnt_list))
1464 umount_tree(mnt, 1); 1464 umount_tree(mnt, 1);
1465 retval = 0; 1465 retval = 0;
1466 } 1466 }
1467 } 1467 }
1468 unlock_mount_hash(); 1468 unlock_mount_hash();
1469 namespace_unlock(); 1469 namespace_unlock();
1470 return retval; 1470 return retval;
1471 } 1471 }
1472 1472
1473 /* 1473 /*
1474 * __detach_mounts - lazily unmount all mounts on the specified dentry 1474 * __detach_mounts - lazily unmount all mounts on the specified dentry
1475 * 1475 *
1476 * During unlink, rmdir, and d_drop it is possible to lose the path 1476 * During unlink, rmdir, and d_drop it is possible to lose the path
1477 * to an existing mountpoint, and wind up leaking the mount. 1477 * to an existing mountpoint, and wind up leaking the mount.
1478 * detach_mounts allows lazily unmounting those mounts instead of 1478 * detach_mounts allows lazily unmounting those mounts instead of
1479 * leaking them. 1479 * leaking them.
1480 * 1480 *
1481 * The caller may hold dentry->d_inode->i_mutex. 1481 * The caller may hold dentry->d_inode->i_mutex.
1482 */ 1482 */
1483 void __detach_mounts(struct dentry *dentry) 1483 void __detach_mounts(struct dentry *dentry)
1484 { 1484 {
1485 struct mountpoint *mp; 1485 struct mountpoint *mp;
1486 struct mount *mnt; 1486 struct mount *mnt;
1487 1487
1488 namespace_lock(); 1488 namespace_lock();
1489 mp = lookup_mountpoint(dentry); 1489 mp = lookup_mountpoint(dentry);
1490 if (!mp) 1490 if (!mp)
1491 goto out_unlock; 1491 goto out_unlock;
1492 1492
1493 lock_mount_hash(); 1493 lock_mount_hash();
1494 while (!hlist_empty(&mp->m_list)) { 1494 while (!hlist_empty(&mp->m_list)) {
1495 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); 1495 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1496 umount_tree(mnt, 2); 1496 umount_tree(mnt, 2);
1497 } 1497 }
1498 unlock_mount_hash(); 1498 unlock_mount_hash();
1499 put_mountpoint(mp); 1499 put_mountpoint(mp);
1500 out_unlock: 1500 out_unlock:
1501 namespace_unlock(); 1501 namespace_unlock();
1502 } 1502 }
1503 1503
1504 /* 1504 /*
1505 * Is the caller allowed to modify his namespace? 1505 * Is the caller allowed to modify his namespace?
1506 */ 1506 */
1507 static inline bool may_mount(void) 1507 static inline bool may_mount(void)
1508 { 1508 {
1509 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); 1509 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1510 } 1510 }
1511 1511
1512 /* 1512 /*
1513 * Now umount can handle mount points as well as block devices. 1513 * Now umount can handle mount points as well as block devices.
1514 * This is important for filesystems which use unnamed block devices. 1514 * This is important for filesystems which use unnamed block devices.
1515 * 1515 *
1516 * We now support a flag for forced unmount like the other 'big iron' 1516 * We now support a flag for forced unmount like the other 'big iron'
1517 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD 1517 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1518 */ 1518 */
1519 1519
1520 SYSCALL_DEFINE2(umount, char __user *, name, int, flags) 1520 SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1521 { 1521 {
1522 struct path path; 1522 struct path path;
1523 struct mount *mnt; 1523 struct mount *mnt;
1524 int retval; 1524 int retval;
1525 int lookup_flags = 0; 1525 int lookup_flags = 0;
1526 1526
1527 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) 1527 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1528 return -EINVAL; 1528 return -EINVAL;
1529 1529
1530 if (!may_mount()) 1530 if (!may_mount())
1531 return -EPERM; 1531 return -EPERM;
1532 1532
1533 if (!(flags & UMOUNT_NOFOLLOW)) 1533 if (!(flags & UMOUNT_NOFOLLOW))
1534 lookup_flags |= LOOKUP_FOLLOW; 1534 lookup_flags |= LOOKUP_FOLLOW;
1535 1535
1536 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); 1536 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1537 if (retval) 1537 if (retval)
1538 goto out; 1538 goto out;
1539 mnt = real_mount(path.mnt); 1539 mnt = real_mount(path.mnt);
1540 retval = -EINVAL; 1540 retval = -EINVAL;
1541 if (path.dentry != path.mnt->mnt_root) 1541 if (path.dentry != path.mnt->mnt_root)
1542 goto dput_and_out; 1542 goto dput_and_out;
1543 if (!check_mnt(mnt)) 1543 if (!check_mnt(mnt))
1544 goto dput_and_out; 1544 goto dput_and_out;
1545 if (mnt->mnt.mnt_flags & MNT_LOCKED) 1545 if (mnt->mnt.mnt_flags & MNT_LOCKED)
1546 goto dput_and_out; 1546 goto dput_and_out;
1547 1547
1548 retval = do_umount(mnt, flags); 1548 retval = do_umount(mnt, flags);
1549 dput_and_out: 1549 dput_and_out:
1550 /* we mustn't call path_put() as that would clear mnt_expiry_mark */ 1550 /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1551 dput(path.dentry); 1551 dput(path.dentry);
1552 mntput_no_expire(mnt); 1552 mntput_no_expire(mnt);
1553 out: 1553 out:
1554 return retval; 1554 return retval;
1555 } 1555 }
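
From userspace, the MNT_EXPIRE dance handled by do_umount() above is a two-call pattern; a sketch per umount(2), with a hypothetical expire_mount() helper:

#include <sys/mount.h>
#include <errno.h>
#include <unistd.h>

/* Returns 0 once the mount has expired and been unmounted. */
static int expire_mount(const char *target, unsigned int grace)
{
	if (umount2(target, MNT_EXPIRE) == 0)
		return 0;			/* mark was set earlier; gone now */
	if (errno != EAGAIN)
		return -1;			/* EBUSY etc.: still in use */
	sleep(grace);				/* first call set the expiry mark */
	return umount2(target, MNT_EXPIRE);	/* 0 if untouched meanwhile */
}
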
1556 1556
1557 #ifdef __ARCH_WANT_SYS_OLDUMOUNT 1557 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1558 1558
1559 /* 1559 /*
1560 * The 2.0 compatible umount. No flags. 1560 * The 2.0 compatible umount. No flags.
1561 */ 1561 */
1562 SYSCALL_DEFINE1(oldumount, char __user *, name) 1562 SYSCALL_DEFINE1(oldumount, char __user *, name)
1563 { 1563 {
1564 return sys_umount(name, 0); 1564 return sys_umount(name, 0);
1565 } 1565 }
1566 1566
1567 #endif 1567 #endif
1568 1568
1569 static bool is_mnt_ns_file(struct dentry *dentry) 1569 static bool is_mnt_ns_file(struct dentry *dentry)
1570 { 1570 {
1571 /* Is this a proxy for a mount namespace? */ 1571 /* Is this a proxy for a mount namespace? */
1572 struct inode *inode = dentry->d_inode; 1572 return dentry->d_op == &ns_dentry_operations &&
1573 return proc_ns_inode(inode) && dentry->d_fsdata == &mntns_operations; 1573 dentry->d_fsdata == &mntns_operations;
1574 } 1574 }
1575 1575
1576 struct mnt_namespace *to_mnt_ns(struct ns_common *ns) 1576 struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
1577 { 1577 {
1578 return container_of(ns, struct mnt_namespace, ns); 1578 return container_of(ns, struct mnt_namespace, ns);
1579 } 1579 }
1580 1580
1581 static bool mnt_ns_loop(struct dentry *dentry) 1581 static bool mnt_ns_loop(struct dentry *dentry)
1582 { 1582 {
1583 /* Could bind mounting the mount namespace inode cause a 1583 /* Could bind mounting the mount namespace inode cause a
1584 * mount namespace loop? 1584 * mount namespace loop?
1585 */ 1585 */
1586 struct mnt_namespace *mnt_ns; 1586 struct mnt_namespace *mnt_ns;
1587 if (!is_mnt_ns_file(dentry)) 1587 if (!is_mnt_ns_file(dentry))
1588 return false; 1588 return false;
1589 1589
1590 mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); 1590 mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
1591 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1591 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1592 } 1592 }
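
The seq comparison encodes "you may only bind namespace files of namespaces created after your own", which is what rules out reference cycles:

/*
 * Example: a task in mount namespace A (seq 5) may bind-mount
 * /proc/N/ns/mnt only if it refers to a namespace with seq > 5,
 * i.e. one created strictly after A.  "Created after" is a strict
 * order, so no set of namespaces can keep each other pinned alive
 * through such bind mounts.
 */
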
1593 1593
1594 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1594 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1595 int flag) 1595 int flag)
1596 { 1596 {
1597 struct mount *res, *p, *q, *r, *parent; 1597 struct mount *res, *p, *q, *r, *parent;
1598 1598
1599 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) 1599 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1600 return ERR_PTR(-EINVAL); 1600 return ERR_PTR(-EINVAL);
1601 1601
1602 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) 1602 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1603 return ERR_PTR(-EINVAL); 1603 return ERR_PTR(-EINVAL);
1604 1604
1605 res = q = clone_mnt(mnt, dentry, flag); 1605 res = q = clone_mnt(mnt, dentry, flag);
1606 if (IS_ERR(q)) 1606 if (IS_ERR(q))
1607 return q; 1607 return q;
1608 1608
1609 q->mnt.mnt_flags &= ~MNT_LOCKED; 1609 q->mnt.mnt_flags &= ~MNT_LOCKED;
1610 q->mnt_mountpoint = mnt->mnt_mountpoint; 1610 q->mnt_mountpoint = mnt->mnt_mountpoint;
1611 1611
1612 p = mnt; 1612 p = mnt;
1613 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { 1613 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1614 struct mount *s; 1614 struct mount *s;
1615 if (!is_subdir(r->mnt_mountpoint, dentry)) 1615 if (!is_subdir(r->mnt_mountpoint, dentry))
1616 continue; 1616 continue;
1617 1617
1618 for (s = r; s; s = next_mnt(s, r)) { 1618 for (s = r; s; s = next_mnt(s, r)) {
1619 struct mount *t = NULL; 1619 struct mount *t = NULL;
1620 if (!(flag & CL_COPY_UNBINDABLE) && 1620 if (!(flag & CL_COPY_UNBINDABLE) &&
1621 IS_MNT_UNBINDABLE(s)) { 1621 IS_MNT_UNBINDABLE(s)) {
1622 s = skip_mnt_tree(s); 1622 s = skip_mnt_tree(s);
1623 continue; 1623 continue;
1624 } 1624 }
1625 if (!(flag & CL_COPY_MNT_NS_FILE) && 1625 if (!(flag & CL_COPY_MNT_NS_FILE) &&
1626 is_mnt_ns_file(s->mnt.mnt_root)) { 1626 is_mnt_ns_file(s->mnt.mnt_root)) {
1627 s = skip_mnt_tree(s); 1627 s = skip_mnt_tree(s);
1628 continue; 1628 continue;
1629 } 1629 }
1630 while (p != s->mnt_parent) { 1630 while (p != s->mnt_parent) {
1631 p = p->mnt_parent; 1631 p = p->mnt_parent;
1632 q = q->mnt_parent; 1632 q = q->mnt_parent;
1633 } 1633 }
1634 p = s; 1634 p = s;
1635 parent = q; 1635 parent = q;
1636 q = clone_mnt(p, p->mnt.mnt_root, flag); 1636 q = clone_mnt(p, p->mnt.mnt_root, flag);
1637 if (IS_ERR(q)) 1637 if (IS_ERR(q))
1638 goto out; 1638 goto out;
1639 lock_mount_hash(); 1639 lock_mount_hash();
1640 list_add_tail(&q->mnt_list, &res->mnt_list); 1640 list_add_tail(&q->mnt_list, &res->mnt_list);
1641 mnt_set_mountpoint(parent, p->mnt_mp, q); 1641 mnt_set_mountpoint(parent, p->mnt_mp, q);
1642 if (!list_empty(&parent->mnt_mounts)) { 1642 if (!list_empty(&parent->mnt_mounts)) {
1643 t = list_last_entry(&parent->mnt_mounts, 1643 t = list_last_entry(&parent->mnt_mounts,
1644 struct mount, mnt_child); 1644 struct mount, mnt_child);
1645 if (t->mnt_mp != p->mnt_mp) 1645 if (t->mnt_mp != p->mnt_mp)
1646 t = NULL; 1646 t = NULL;
1647 } 1647 }
1648 attach_shadowed(q, parent, t); 1648 attach_shadowed(q, parent, t);
1649 unlock_mount_hash(); 1649 unlock_mount_hash();
1650 } 1650 }
1651 } 1651 }
1652 return res; 1652 return res;
1653 out: 1653 out:
1654 if (res) { 1654 if (res) {
1655 lock_mount_hash(); 1655 lock_mount_hash();
1656 umount_tree(res, 0); 1656 umount_tree(res, 0);
1657 unlock_mount_hash(); 1657 unlock_mount_hash();
1658 } 1658 }
1659 return q; 1659 return q;
1660 } 1660 }
1661 1661
1662 /* Caller should check returned pointer for errors */ 1662 /* Caller should check returned pointer for errors */
1663 1663
1664 struct vfsmount *collect_mounts(struct path *path) 1664 struct vfsmount *collect_mounts(struct path *path)
1665 { 1665 {
1666 struct mount *tree; 1666 struct mount *tree;
1667 namespace_lock(); 1667 namespace_lock();
1668 tree = copy_tree(real_mount(path->mnt), path->dentry, 1668 tree = copy_tree(real_mount(path->mnt), path->dentry,
1669 CL_COPY_ALL | CL_PRIVATE); 1669 CL_COPY_ALL | CL_PRIVATE);
1670 namespace_unlock(); 1670 namespace_unlock();
1671 if (IS_ERR(tree)) 1671 if (IS_ERR(tree))
1672 return ERR_CAST(tree); 1672 return ERR_CAST(tree);
1673 return &tree->mnt; 1673 return &tree->mnt;
1674 } 1674 }
1675 1675
1676 void drop_collected_mounts(struct vfsmount *mnt) 1676 void drop_collected_mounts(struct vfsmount *mnt)
1677 { 1677 {
1678 namespace_lock(); 1678 namespace_lock();
1679 lock_mount_hash(); 1679 lock_mount_hash();
1680 umount_tree(real_mount(mnt), 0); 1680 umount_tree(real_mount(mnt), 0);
1681 unlock_mount_hash(); 1681 unlock_mount_hash();
1682 namespace_unlock(); 1682 namespace_unlock();
1683 } 1683 }
1684 1684
1685 /** 1685 /**
1686 * clone_private_mount - create a private clone of a path 1686 * clone_private_mount - create a private clone of a path
1687 * 1687 *
1688 * This creates a new vfsmount, which will be the clone of @path. The new 1688 * This creates a new vfsmount, which will be the clone of @path. The new
1689 * mount will not be attached anywhere in the namespace and will be private 1689 * mount will not be attached anywhere in the namespace and will be private
1690 * (i.e. changes to the originating mount won't be propagated into it). 1690 * (i.e. changes to the originating mount won't be propagated into it).
1691 * 1691 *
1692 * Release with mntput(). 1692 * Release with mntput().
1693 */ 1693 */
1694 struct vfsmount *clone_private_mount(struct path *path) 1694 struct vfsmount *clone_private_mount(struct path *path)
1695 { 1695 {
1696 struct mount *old_mnt = real_mount(path->mnt); 1696 struct mount *old_mnt = real_mount(path->mnt);
1697 struct mount *new_mnt; 1697 struct mount *new_mnt;
1698 1698
1699 if (IS_MNT_UNBINDABLE(old_mnt)) 1699 if (IS_MNT_UNBINDABLE(old_mnt))
1700 return ERR_PTR(-EINVAL); 1700 return ERR_PTR(-EINVAL);
1701 1701
1702 down_read(&namespace_sem); 1702 down_read(&namespace_sem);
1703 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); 1703 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
1704 up_read(&namespace_sem); 1704 up_read(&namespace_sem);
1705 if (IS_ERR(new_mnt)) 1705 if (IS_ERR(new_mnt))
1706 return ERR_CAST(new_mnt); 1706 return ERR_CAST(new_mnt);
1707 1707
1708 return &new_mnt->mnt; 1708 return &new_mnt->mnt;
1709 } 1709 }
1710 EXPORT_SYMBOL_GPL(clone_private_mount); 1710 EXPORT_SYMBOL_GPL(clone_private_mount);
1711 1711
1712 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, 1712 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1713 struct vfsmount *root) 1713 struct vfsmount *root)
1714 { 1714 {
1715 struct mount *mnt; 1715 struct mount *mnt;
1716 int res = f(root, arg); 1716 int res = f(root, arg);
1717 if (res) 1717 if (res)
1718 return res; 1718 return res;
1719 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { 1719 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
1720 res = f(&mnt->mnt, arg); 1720 res = f(&mnt->mnt, arg);
1721 if (res) 1721 if (res)
1722 return res; 1722 return res;
1723 } 1723 }
1724 return 0; 1724 return 0;
1725 } 1725 }
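
collect_mounts(), iterate_mounts() and drop_collected_mounts() work as a unit: snapshot a subtree privately, visit every mount in the snapshot, then dispose of it (the pattern the audit subsystem relies on). A hedged sketch; all example_* names are made up:

	static int example_count_one(struct vfsmount *mnt, void *arg)
	{
		(*(int *)arg)++;
		return 0;		/* non-zero stops the walk early */
	}

	static int example_count_tree(struct path *path)
	{
		struct vfsmount *snapshot = collect_mounts(path);
		int n = 0;

		if (IS_ERR(snapshot))
			return PTR_ERR(snapshot);
		iterate_mounts(example_count_one, &n, snapshot);
		drop_collected_mounts(snapshot);
		return n;
	}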
1726 1726
1727 static void cleanup_group_ids(struct mount *mnt, struct mount *end) 1727 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
1728 { 1728 {
1729 struct mount *p; 1729 struct mount *p;
1730 1730
1731 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 1731 for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1732 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 1732 if (p->mnt_group_id && !IS_MNT_SHARED(p))
1733 mnt_release_group_id(p); 1733 mnt_release_group_id(p);
1734 } 1734 }
1735 } 1735 }
1736 1736
1737 static int invent_group_ids(struct mount *mnt, bool recurse) 1737 static int invent_group_ids(struct mount *mnt, bool recurse)
1738 { 1738 {
1739 struct mount *p; 1739 struct mount *p;
1740 1740
1741 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { 1741 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
1742 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { 1742 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
1743 int err = mnt_alloc_group_id(p); 1743 int err = mnt_alloc_group_id(p);
1744 if (err) { 1744 if (err) {
1745 cleanup_group_ids(mnt, p); 1745 cleanup_group_ids(mnt, p);
1746 return err; 1746 return err;
1747 } 1747 }
1748 } 1748 }
1749 } 1749 }
1750 1750
1751 return 0; 1751 return 0;
1752 } 1752 }
1753 1753
1754 /* 1754 /*
1755 * @source_mnt : mount tree to be attached 1755 * @source_mnt : mount tree to be attached
1756 * @nd : place where the mount tree @source_mnt is attached 1756 * @nd : place where the mount tree @source_mnt is attached
1757 * @parent_nd : if non-null, detach the source_mnt from its parent and 1757 * @parent_nd : if non-null, detach the source_mnt from its parent and
1758 * store the parent mount and mountpoint dentry. 1758 * store the parent mount and mountpoint dentry.
1759 * (done when source_mnt is moved) 1759 * (done when source_mnt is moved)
1760 * 1760 *
1761 * NOTE: the table below explains the semantics when a source mount 1761 * NOTE: the table below explains the semantics when a source mount
1762 * of a given type is attached to a destination mount of a given type. 1762 * of a given type is attached to a destination mount of a given type.
1763 * --------------------------------------------------------------------------- 1763 * ---------------------------------------------------------------------------
1764 * | BIND MOUNT OPERATION | 1764 * | BIND MOUNT OPERATION |
1765 * |************************************************************************** 1765 * |**************************************************************************
1766 * | source-->| shared | private | slave | unbindable | 1766 * | source-->| shared | private | slave | unbindable |
1767 * | dest | | | | | 1767 * | dest | | | | |
1768 * | | | | | | | 1768 * | | | | | | |
1769 * | v | | | | | 1769 * | v | | | | |
1770 * |************************************************************************** 1770 * |**************************************************************************
1771 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 1771 * | shared | shared (++) | shared (+) | shared(+++)| invalid |
1772 * | | | | | | 1772 * | | | | | |
1773 * |non-shared| shared (+) | private | slave (*) | invalid | 1773 * |non-shared| shared (+) | private | slave (*) | invalid |
1774 * *************************************************************************** 1774 * ***************************************************************************
1775 * A bind operation clones the source mount and mounts the clone on the 1775 * A bind operation clones the source mount and mounts the clone on the
1776 * destination mount. 1776 * destination mount.
1777 * 1777 *
1778 * (++) the cloned mount is propagated to all the mounts in the propagation 1778 * (++) the cloned mount is propagated to all the mounts in the propagation
1779 * tree of the destination mount and the cloned mount is added to 1779 * tree of the destination mount and the cloned mount is added to
1780 * the peer group of the source mount. 1780 * the peer group of the source mount.
1781 * (+) the cloned mount is created under the destination mount and is marked 1781 * (+) the cloned mount is created under the destination mount and is marked
1782 * as shared. The cloned mount is added to the peer group of the source 1782 * as shared. The cloned mount is added to the peer group of the source
1783 * mount. 1783 * mount.
1784 * (+++) the mount is propagated to all the mounts in the propagation tree 1784 * (+++) the mount is propagated to all the mounts in the propagation tree
1785 * of the destination mount and the cloned mount is made slave 1785 * of the destination mount and the cloned mount is made slave
1786 * of the same master as that of the source mount. The cloned mount 1786 * of the same master as that of the source mount. The cloned mount
1787 * is marked as 'shared and slave'. 1787 * is marked as 'shared and slave'.
1788 * (*) the cloned mount is made a slave of the same master as that of the 1788 * (*) the cloned mount is made a slave of the same master as that of the
1789 * source mount. 1789 * source mount.
1790 * 1790 *
1791 * --------------------------------------------------------------------------- 1791 * ---------------------------------------------------------------------------
1792 * | MOVE MOUNT OPERATION | 1792 * | MOVE MOUNT OPERATION |
1793 * |************************************************************************** 1793 * |**************************************************************************
1794 * | source-->| shared | private | slave | unbindable | 1794 * | source-->| shared | private | slave | unbindable |
1795 * | dest | | | | | 1795 * | dest | | | | |
1796 * | | | | | | | 1796 * | | | | | | |
1797 * | v | | | | | 1797 * | v | | | | |
1798 * |************************************************************************** 1798 * |**************************************************************************
1799 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 1799 * | shared | shared (+) | shared (+) | shared(+++) | invalid |
1800 * | | | | | | 1800 * | | | | | |
1801 * |non-shared| shared (+*) | private | slave (*) | unbindable | 1801 * |non-shared| shared (+*) | private | slave (*) | unbindable |
1802 * *************************************************************************** 1802 * ***************************************************************************
1803 * 1803 *
1804 * (+) the mount is moved to the destination. And is then propagated to 1804 * (+) the mount is moved to the destination. And is then propagated to
1805 * all the mounts in the propagation tree of the destination mount. 1805 * all the mounts in the propagation tree of the destination mount.
1806 * (+*) the mount is moved to the destination. 1806 * (+*) the mount is moved to the destination.
1807 * (+++) the mount is moved to the destination and is then propagated to 1807 * (+++) the mount is moved to the destination and is then propagated to
1808 * all the mounts belonging to the destination mount's propagation tree. 1808 * all the mounts belonging to the destination mount's propagation tree.
1809 * the mount is marked as 'shared and slave'. 1809 * the mount is marked as 'shared and slave'.
1810 * (*) the mount continues to be a slave at the new location. 1810 * (*) the mount continues to be a slave at the new location.
1811 * 1811 *
1812 * if the source mount is a tree, the operations explained above are 1812 * if the source mount is a tree, the operations explained above are
1813 * applied to each mount in the tree. 1813 * applied to each mount in the tree.
1814 * Must be called without spinlocks held, since this function can sleep 1814 * Must be called without spinlocks held, since this function can sleep
1815 * in allocations. 1815 * in allocations.
1816 */ 1816 */
1817 static int attach_recursive_mnt(struct mount *source_mnt, 1817 static int attach_recursive_mnt(struct mount *source_mnt,
1818 struct mount *dest_mnt, 1818 struct mount *dest_mnt,
1819 struct mountpoint *dest_mp, 1819 struct mountpoint *dest_mp,
1820 struct path *parent_path) 1820 struct path *parent_path)
1821 { 1821 {
1822 HLIST_HEAD(tree_list); 1822 HLIST_HEAD(tree_list);
1823 struct mount *child, *p; 1823 struct mount *child, *p;
1824 struct hlist_node *n; 1824 struct hlist_node *n;
1825 int err; 1825 int err;
1826 1826
1827 if (IS_MNT_SHARED(dest_mnt)) { 1827 if (IS_MNT_SHARED(dest_mnt)) {
1828 err = invent_group_ids(source_mnt, true); 1828 err = invent_group_ids(source_mnt, true);
1829 if (err) 1829 if (err)
1830 goto out; 1830 goto out;
1831 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); 1831 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
1832 lock_mount_hash(); 1832 lock_mount_hash();
1833 if (err) 1833 if (err)
1834 goto out_cleanup_ids; 1834 goto out_cleanup_ids;
1835 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1835 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1836 set_mnt_shared(p); 1836 set_mnt_shared(p);
1837 } else { 1837 } else {
1838 lock_mount_hash(); 1838 lock_mount_hash();
1839 } 1839 }
1840 if (parent_path) { 1840 if (parent_path) {
1841 detach_mnt(source_mnt, parent_path); 1841 detach_mnt(source_mnt, parent_path);
1842 attach_mnt(source_mnt, dest_mnt, dest_mp); 1842 attach_mnt(source_mnt, dest_mnt, dest_mp);
1843 touch_mnt_namespace(source_mnt->mnt_ns); 1843 touch_mnt_namespace(source_mnt->mnt_ns);
1844 } else { 1844 } else {
1845 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); 1845 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
1846 commit_tree(source_mnt, NULL); 1846 commit_tree(source_mnt, NULL);
1847 } 1847 }
1848 1848
1849 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { 1849 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
1850 struct mount *q; 1850 struct mount *q;
1851 hlist_del_init(&child->mnt_hash); 1851 hlist_del_init(&child->mnt_hash);
1852 q = __lookup_mnt_last(&child->mnt_parent->mnt, 1852 q = __lookup_mnt_last(&child->mnt_parent->mnt,
1853 child->mnt_mountpoint); 1853 child->mnt_mountpoint);
1854 commit_tree(child, q); 1854 commit_tree(child, q);
1855 } 1855 }
1856 unlock_mount_hash(); 1856 unlock_mount_hash();
1857 1857
1858 return 0; 1858 return 0;
1859 1859
1860 out_cleanup_ids: 1860 out_cleanup_ids:
1861 while (!hlist_empty(&tree_list)) { 1861 while (!hlist_empty(&tree_list)) {
1862 child = hlist_entry(tree_list.first, struct mount, mnt_hash); 1862 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
1863 umount_tree(child, 0); 1863 umount_tree(child, 0);
1864 } 1864 }
1865 unlock_mount_hash(); 1865 unlock_mount_hash();
1866 cleanup_group_ids(source_mnt, NULL); 1866 cleanup_group_ids(source_mnt, NULL);
1867 out: 1867 out:
1868 return err; 1868 return err;
1869 } 1869 }
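
The shared/private cells of the bind table are easiest to see from userspace with two bind mounts and one propagation change. A hedged sketch with mount(2), assuming /A is already the root of a mount and that /A/dir, /B and /src exist:

	mount(NULL, "/A", NULL, MS_SHARED, NULL);	/* dest-to-be becomes shared */
	mount("/A", "/B", NULL, MS_BIND, NULL);		/* non-shared dest, shared
							 * source: clone is shared,
							 * joins /A's peer group (+) */
	mount("/src", "/A/dir", NULL, MS_BIND, NULL);	/* shared dest: the clone is
							 * propagated to the peer, so
							 * it appears at /B/dir too */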
1870 1870
1871 static struct mountpoint *lock_mount(struct path *path) 1871 static struct mountpoint *lock_mount(struct path *path)
1872 { 1872 {
1873 struct vfsmount *mnt; 1873 struct vfsmount *mnt;
1874 struct dentry *dentry = path->dentry; 1874 struct dentry *dentry = path->dentry;
1875 retry: 1875 retry:
1876 mutex_lock(&dentry->d_inode->i_mutex); 1876 mutex_lock(&dentry->d_inode->i_mutex);
1877 if (unlikely(cant_mount(dentry))) { 1877 if (unlikely(cant_mount(dentry))) {
1878 mutex_unlock(&dentry->d_inode->i_mutex); 1878 mutex_unlock(&dentry->d_inode->i_mutex);
1879 return ERR_PTR(-ENOENT); 1879 return ERR_PTR(-ENOENT);
1880 } 1880 }
1881 namespace_lock(); 1881 namespace_lock();
1882 mnt = lookup_mnt(path); 1882 mnt = lookup_mnt(path);
1883 if (likely(!mnt)) { 1883 if (likely(!mnt)) {
1884 struct mountpoint *mp = lookup_mountpoint(dentry); 1884 struct mountpoint *mp = lookup_mountpoint(dentry);
1885 if (!mp) 1885 if (!mp)
1886 mp = new_mountpoint(dentry); 1886 mp = new_mountpoint(dentry);
1887 if (IS_ERR(mp)) { 1887 if (IS_ERR(mp)) {
1888 namespace_unlock(); 1888 namespace_unlock();
1889 mutex_unlock(&dentry->d_inode->i_mutex); 1889 mutex_unlock(&dentry->d_inode->i_mutex);
1890 return mp; 1890 return mp;
1891 } 1891 }
1892 return mp; 1892 return mp;
1893 } 1893 }
1894 namespace_unlock(); 1894 namespace_unlock();
1895 mutex_unlock(&path->dentry->d_inode->i_mutex); 1895 mutex_unlock(&path->dentry->d_inode->i_mutex);
1896 path_put(path); 1896 path_put(path);
1897 path->mnt = mnt; 1897 path->mnt = mnt;
1898 dentry = path->dentry = dget(mnt->mnt_root); 1898 dentry = path->dentry = dget(mnt->mnt_root);
1899 goto retry; 1899 goto retry;
1900 } 1900 }
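
Spelled out, the retry loop gives lock_mount() this behaviour (a reading of the code above, not new semantics):

	/*
	 * lock_mount(path):
	 *   1. take i_mutex on the candidate mountpoint
	 *   2. if a mount already sits on that dentry, drop the locks,
	 *      retarget path to the root of that covering mount, retry
	 *   3. otherwise find or create the struct mountpoint and return
	 *      it with namespace_sem and i_mutex still held
	 * so a caller always stacks its new mount on top of whatever
	 * raced in first, never underneath it.
	 */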
1901 1901
1902 static void unlock_mount(struct mountpoint *where) 1902 static void unlock_mount(struct mountpoint *where)
1903 { 1903 {
1904 struct dentry *dentry = where->m_dentry; 1904 struct dentry *dentry = where->m_dentry;
1905 put_mountpoint(where); 1905 put_mountpoint(where);
1906 namespace_unlock(); 1906 namespace_unlock();
1907 mutex_unlock(&dentry->d_inode->i_mutex); 1907 mutex_unlock(&dentry->d_inode->i_mutex);
1908 } 1908 }
1909 1909
1910 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) 1910 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
1911 { 1911 {
1912 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) 1912 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
1913 return -EINVAL; 1913 return -EINVAL;
1914 1914
1915 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != 1915 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
1916 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) 1916 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
1917 return -ENOTDIR; 1917 return -ENOTDIR;
1918 1918
1919 return attach_recursive_mnt(mnt, p, mp, NULL); 1919 return attach_recursive_mnt(mnt, p, mp, NULL);
1920 } 1920 }
1921 1921
1922 /* 1922 /*
1923 * Sanity check the flags to change_mnt_propagation. 1923 * Sanity check the flags to change_mnt_propagation.
1924 */ 1924 */
1925 1925
1926 static int flags_to_propagation_type(int flags) 1926 static int flags_to_propagation_type(int flags)
1927 { 1927 {
1928 int type = flags & ~(MS_REC | MS_SILENT); 1928 int type = flags & ~(MS_REC | MS_SILENT);
1929 1929
1930 /* Fail if any non-propagation flags are set */ 1930 /* Fail if any non-propagation flags are set */
1931 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1931 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1932 return 0; 1932 return 0;
1933 /* Only one propagation flag should be set */ 1933 /* Only one propagation flag should be set */
1934 if (!is_power_of_2(type)) 1934 if (!is_power_of_2(type))
1935 return 0; 1935 return 0;
1936 return type; 1936 return type;
1937 } 1937 }
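
Concretely, for a few hypothetical inputs (the results follow directly from the two checks above):

	flags_to_propagation_type(MS_SHARED | MS_REC);		/* MS_SHARED	  */
	flags_to_propagation_type(MS_SHARED | MS_PRIVATE);	/* 0: two types set */
	flags_to_propagation_type(MS_SLAVE | MS_RDONLY);	/* 0: stray
								 * non-propagation
								 * flag present   */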
1938 1938
1939 /* 1939 /*
1940 * recursively change the type of the mountpoint. 1940 * recursively change the type of the mountpoint.
1941 */ 1941 */
1942 static int do_change_type(struct path *path, int flag) 1942 static int do_change_type(struct path *path, int flag)
1943 { 1943 {
1944 struct mount *m; 1944 struct mount *m;
1945 struct mount *mnt = real_mount(path->mnt); 1945 struct mount *mnt = real_mount(path->mnt);
1946 int recurse = flag & MS_REC; 1946 int recurse = flag & MS_REC;
1947 int type; 1947 int type;
1948 int err = 0; 1948 int err = 0;
1949 1949
1950 if (path->dentry != path->mnt->mnt_root) 1950 if (path->dentry != path->mnt->mnt_root)
1951 return -EINVAL; 1951 return -EINVAL;
1952 1952
1953 type = flags_to_propagation_type(flag); 1953 type = flags_to_propagation_type(flag);
1954 if (!type) 1954 if (!type)
1955 return -EINVAL; 1955 return -EINVAL;
1956 1956
1957 namespace_lock(); 1957 namespace_lock();
1958 if (type == MS_SHARED) { 1958 if (type == MS_SHARED) {
1959 err = invent_group_ids(mnt, recurse); 1959 err = invent_group_ids(mnt, recurse);
1960 if (err) 1960 if (err)
1961 goto out_unlock; 1961 goto out_unlock;
1962 } 1962 }
1963 1963
1964 lock_mount_hash(); 1964 lock_mount_hash();
1965 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1965 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1966 change_mnt_propagation(m, type); 1966 change_mnt_propagation(m, type);
1967 unlock_mount_hash(); 1967 unlock_mount_hash();
1968 1968
1969 out_unlock: 1969 out_unlock:
1970 namespace_unlock(); 1970 namespace_unlock();
1971 return err; 1971 return err;
1972 } 1972 }
1973 1973
1974 static bool has_locked_children(struct mount *mnt, struct dentry *dentry) 1974 static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
1975 { 1975 {
1976 struct mount *child; 1976 struct mount *child;
1977 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 1977 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
1978 if (!is_subdir(child->mnt_mountpoint, dentry)) 1978 if (!is_subdir(child->mnt_mountpoint, dentry))
1979 continue; 1979 continue;
1980 1980
1981 if (child->mnt.mnt_flags & MNT_LOCKED) 1981 if (child->mnt.mnt_flags & MNT_LOCKED)
1982 return true; 1982 return true;
1983 } 1983 }
1984 return false; 1984 return false;
1985 } 1985 }
1986 1986
1987 /* 1987 /*
1988 * do loopback mount. 1988 * do loopback mount.
1989 */ 1989 */
1990 static int do_loopback(struct path *path, const char *old_name, 1990 static int do_loopback(struct path *path, const char *old_name,
1991 int recurse) 1991 int recurse)
1992 { 1992 {
1993 struct path old_path; 1993 struct path old_path;
1994 struct mount *mnt = NULL, *old, *parent; 1994 struct mount *mnt = NULL, *old, *parent;
1995 struct mountpoint *mp; 1995 struct mountpoint *mp;
1996 int err; 1996 int err;
1997 if (!old_name || !*old_name) 1997 if (!old_name || !*old_name)
1998 return -EINVAL; 1998 return -EINVAL;
1999 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); 1999 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
2000 if (err) 2000 if (err)
2001 return err; 2001 return err;
2002 2002
2003 err = -EINVAL; 2003 err = -EINVAL;
2004 if (mnt_ns_loop(old_path.dentry)) 2004 if (mnt_ns_loop(old_path.dentry))
2005 goto out; 2005 goto out;
2006 2006
2007 mp = lock_mount(path); 2007 mp = lock_mount(path);
2008 err = PTR_ERR(mp); 2008 err = PTR_ERR(mp);
2009 if (IS_ERR(mp)) 2009 if (IS_ERR(mp))
2010 goto out; 2010 goto out;
2011 2011
2012 old = real_mount(old_path.mnt); 2012 old = real_mount(old_path.mnt);
2013 parent = real_mount(path->mnt); 2013 parent = real_mount(path->mnt);
2014 2014
2015 err = -EINVAL; 2015 err = -EINVAL;
2016 if (IS_MNT_UNBINDABLE(old)) 2016 if (IS_MNT_UNBINDABLE(old))
2017 goto out2; 2017 goto out2;
2018 2018
2019 if (!check_mnt(parent) || !check_mnt(old)) 2019 if (!check_mnt(parent))
2020 goto out2;
2021
2022 if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
2020 goto out2; 2023 goto out2;
2021 2024
2022 if (!recurse && has_locked_children(old, old_path.dentry)) 2025 if (!recurse && has_locked_children(old, old_path.dentry))
2023 goto out2; 2026 goto out2;
2024 2027
2025 if (recurse) 2028 if (recurse)
2026 mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); 2029 mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
2027 else 2030 else
2028 mnt = clone_mnt(old, old_path.dentry, 0); 2031 mnt = clone_mnt(old, old_path.dentry, 0);
2029 2032
2030 if (IS_ERR(mnt)) { 2033 if (IS_ERR(mnt)) {
2031 err = PTR_ERR(mnt); 2034 err = PTR_ERR(mnt);
2032 goto out2; 2035 goto out2;
2033 } 2036 }
2034 2037
2035 mnt->mnt.mnt_flags &= ~MNT_LOCKED; 2038 mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2036 2039
2037 err = graft_tree(mnt, parent, mp); 2040 err = graft_tree(mnt, parent, mp);
2038 if (err) { 2041 if (err) {
2039 lock_mount_hash(); 2042 lock_mount_hash();
2040 umount_tree(mnt, 0); 2043 umount_tree(mnt, 0);
2041 unlock_mount_hash(); 2044 unlock_mount_hash();
2042 } 2045 }
2043 out2: 2046 out2:
2044 unlock_mount(mp); 2047 unlock_mount(mp);
2045 out: 2048 out:
2046 path_put(&old_path); 2049 path_put(&old_path);
2047 return err; 2050 return err;
2048 } 2051 }
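
The relaxed check above is where this commit's "files on nsfs are bindable" promise is kept: an nsfs dentry fails check_mnt() (the internal nsfs mount belongs to no mount namespace) but is recognized by its d_op and allowed through. From userspace this enables the usual namespace keep-alive trick; a hedged sketch, assuming /run/netns exists:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mount.h>
	#include <unistd.h>

	/* pin the current net namespace so it outlives this process */
	close(open("/run/netns/keep", O_CREAT | O_WRONLY, 0600));
	if (mount("/proc/self/ns/net", "/run/netns/keep",
		  NULL, MS_BIND, NULL) < 0)
		perror("bind of nsfs file");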
2049 2052
2050 static int change_mount_flags(struct vfsmount *mnt, int ms_flags) 2053 static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
2051 { 2054 {
2052 int error = 0; 2055 int error = 0;
2053 int readonly_request = 0; 2056 int readonly_request = 0;
2054 2057
2055 if (ms_flags & MS_RDONLY) 2058 if (ms_flags & MS_RDONLY)
2056 readonly_request = 1; 2059 readonly_request = 1;
2057 if (readonly_request == __mnt_is_readonly(mnt)) 2060 if (readonly_request == __mnt_is_readonly(mnt))
2058 return 0; 2061 return 0;
2059 2062
2060 if (readonly_request) 2063 if (readonly_request)
2061 error = mnt_make_readonly(real_mount(mnt)); 2064 error = mnt_make_readonly(real_mount(mnt));
2062 else 2065 else
2063 __mnt_unmake_readonly(real_mount(mnt)); 2066 __mnt_unmake_readonly(real_mount(mnt));
2064 return error; 2067 return error;
2065 } 2068 }
2066 2069
2067 /* 2070 /*
2068 * change filesystem flags. dir should be a physical root of filesystem. 2071 * change filesystem flags. dir should be a physical root of filesystem.
2069 * If you've mounted a non-root directory somewhere and want to do remount 2072 * If you've mounted a non-root directory somewhere and want to do remount
2070 * on it - tough luck. 2073 * on it - tough luck.
2071 */ 2074 */
2072 static int do_remount(struct path *path, int flags, int mnt_flags, 2075 static int do_remount(struct path *path, int flags, int mnt_flags,
2073 void *data) 2076 void *data)
2074 { 2077 {
2075 int err; 2078 int err;
2076 struct super_block *sb = path->mnt->mnt_sb; 2079 struct super_block *sb = path->mnt->mnt_sb;
2077 struct mount *mnt = real_mount(path->mnt); 2080 struct mount *mnt = real_mount(path->mnt);
2078 2081
2079 if (!check_mnt(mnt)) 2082 if (!check_mnt(mnt))
2080 return -EINVAL; 2083 return -EINVAL;
2081 2084
2082 if (path->dentry != path->mnt->mnt_root) 2085 if (path->dentry != path->mnt->mnt_root)
2083 return -EINVAL; 2086 return -EINVAL;
2084 2087
2085 /* Don't allow changing of locked mnt flags. 2088 /* Don't allow changing of locked mnt flags.
2086 * 2089 *
2087 * No locks need to be held here while testing the various 2090 * No locks need to be held here while testing the various
2088 * MNT_LOCK flags because those flags can never be cleared 2091 * MNT_LOCK flags because those flags can never be cleared
2089 * once they are set. 2092 * once they are set.
2090 */ 2093 */
2091 if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && 2094 if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
2092 !(mnt_flags & MNT_READONLY)) { 2095 !(mnt_flags & MNT_READONLY)) {
2093 return -EPERM; 2096 return -EPERM;
2094 } 2097 }
2095 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && 2098 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
2096 !(mnt_flags & MNT_NODEV)) { 2099 !(mnt_flags & MNT_NODEV)) {
2097 return -EPERM; 2100 return -EPERM;
2098 } 2101 }
2099 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && 2102 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
2100 !(mnt_flags & MNT_NOSUID)) { 2103 !(mnt_flags & MNT_NOSUID)) {
2101 return -EPERM; 2104 return -EPERM;
2102 } 2105 }
2103 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && 2106 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
2104 !(mnt_flags & MNT_NOEXEC)) { 2107 !(mnt_flags & MNT_NOEXEC)) {
2105 return -EPERM; 2108 return -EPERM;
2106 } 2109 }
2107 if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && 2110 if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
2108 ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { 2111 ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
2109 return -EPERM; 2112 return -EPERM;
2110 } 2113 }
2111 2114
2112 err = security_sb_remount(sb, data); 2115 err = security_sb_remount(sb, data);
2113 if (err) 2116 if (err)
2114 return err; 2117 return err;
2115 2118
2116 down_write(&sb->s_umount); 2119 down_write(&sb->s_umount);
2117 if (flags & MS_BIND) 2120 if (flags & MS_BIND)
2118 err = change_mount_flags(path->mnt, flags); 2121 err = change_mount_flags(path->mnt, flags);
2119 else if (!capable(CAP_SYS_ADMIN)) 2122 else if (!capable(CAP_SYS_ADMIN))
2120 err = -EPERM; 2123 err = -EPERM;
2121 else 2124 else
2122 err = do_remount_sb(sb, flags, data, 0); 2125 err = do_remount_sb(sb, flags, data, 0);
2123 if (!err) { 2126 if (!err) {
2124 lock_mount_hash(); 2127 lock_mount_hash();
2125 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; 2128 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2126 mnt->mnt.mnt_flags = mnt_flags; 2129 mnt->mnt.mnt_flags = mnt_flags;
2127 touch_mnt_namespace(mnt->mnt_ns); 2130 touch_mnt_namespace(mnt->mnt_ns);
2128 unlock_mount_hash(); 2131 unlock_mount_hash();
2129 } 2132 }
2130 up_write(&sb->s_umount); 2133 up_write(&sb->s_umount);
2131 return err; 2134 return err;
2132 } 2135 }
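
The MNT_LOCK_* wall above is what keeps user namespaces honest: a flag the kernel forced on at mount time may be kept but never remounted away. The userspace view, hedged, assuming /mnt is a mount on which the kernel set MNT_LOCK_NODEV:

	/* omits MS_NODEV, so it would clear a locked flag: EPERM */
	mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND, NULL);

	/* carries MS_NODEV forward, so the locked flag survives: ok */
	mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND | MS_NODEV, NULL);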
2133 2136
2134 static inline int tree_contains_unbindable(struct mount *mnt) 2137 static inline int tree_contains_unbindable(struct mount *mnt)
2135 { 2138 {
2136 struct mount *p; 2139 struct mount *p;
2137 for (p = mnt; p; p = next_mnt(p, mnt)) { 2140 for (p = mnt; p; p = next_mnt(p, mnt)) {
2138 if (IS_MNT_UNBINDABLE(p)) 2141 if (IS_MNT_UNBINDABLE(p))
2139 return 1; 2142 return 1;
2140 } 2143 }
2141 return 0; 2144 return 0;
2142 } 2145 }
2143 2146
2144 static int do_move_mount(struct path *path, const char *old_name) 2147 static int do_move_mount(struct path *path, const char *old_name)
2145 { 2148 {
2146 struct path old_path, parent_path; 2149 struct path old_path, parent_path;
2147 struct mount *p; 2150 struct mount *p;
2148 struct mount *old; 2151 struct mount *old;
2149 struct mountpoint *mp; 2152 struct mountpoint *mp;
2150 int err; 2153 int err;
2151 if (!old_name || !*old_name) 2154 if (!old_name || !*old_name)
2152 return -EINVAL; 2155 return -EINVAL;
2153 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); 2156 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
2154 if (err) 2157 if (err)
2155 return err; 2158 return err;
2156 2159
2157 mp = lock_mount(path); 2160 mp = lock_mount(path);
2158 err = PTR_ERR(mp); 2161 err = PTR_ERR(mp);
2159 if (IS_ERR(mp)) 2162 if (IS_ERR(mp))
2160 goto out; 2163 goto out;
2161 2164
2162 old = real_mount(old_path.mnt); 2165 old = real_mount(old_path.mnt);
2163 p = real_mount(path->mnt); 2166 p = real_mount(path->mnt);
2164 2167
2165 err = -EINVAL; 2168 err = -EINVAL;
2166 if (!check_mnt(p) || !check_mnt(old)) 2169 if (!check_mnt(p) || !check_mnt(old))
2167 goto out1; 2170 goto out1;
2168 2171
2169 if (old->mnt.mnt_flags & MNT_LOCKED) 2172 if (old->mnt.mnt_flags & MNT_LOCKED)
2170 goto out1; 2173 goto out1;
2171 2174
2172 err = -EINVAL; 2175 err = -EINVAL;
2173 if (old_path.dentry != old_path.mnt->mnt_root) 2176 if (old_path.dentry != old_path.mnt->mnt_root)
2174 goto out1; 2177 goto out1;
2175 2178
2176 if (!mnt_has_parent(old)) 2179 if (!mnt_has_parent(old))
2177 goto out1; 2180 goto out1;
2178 2181
2179 if (S_ISDIR(path->dentry->d_inode->i_mode) != 2182 if (S_ISDIR(path->dentry->d_inode->i_mode) !=
2180 S_ISDIR(old_path.dentry->d_inode->i_mode)) 2183 S_ISDIR(old_path.dentry->d_inode->i_mode))
2181 goto out1; 2184 goto out1;
2182 /* 2185 /*
2183 * Don't move a mount residing in a shared parent. 2186 * Don't move a mount residing in a shared parent.
2184 */ 2187 */
2185 if (IS_MNT_SHARED(old->mnt_parent)) 2188 if (IS_MNT_SHARED(old->mnt_parent))
2186 goto out1; 2189 goto out1;
2187 /* 2190 /*
2188 * Don't move a mount tree containing unbindable mounts to a destination 2191 * Don't move a mount tree containing unbindable mounts to a destination
2189 * mount which is shared. 2192 * mount which is shared.
2190 */ 2193 */
2191 if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) 2194 if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
2192 goto out1; 2195 goto out1;
2193 err = -ELOOP; 2196 err = -ELOOP;
2194 for (; mnt_has_parent(p); p = p->mnt_parent) 2197 for (; mnt_has_parent(p); p = p->mnt_parent)
2195 if (p == old) 2198 if (p == old)
2196 goto out1; 2199 goto out1;
2197 2200
2198 err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); 2201 err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
2199 if (err) 2202 if (err)
2200 goto out1; 2203 goto out1;
2201 2204
2202 /* if the mount is moved, it should no longer expire 2205 /* if the mount is moved, it should no longer expire
2203 * automatically */ 2206 * automatically */
2204 list_del_init(&old->mnt_expire); 2207 list_del_init(&old->mnt_expire);
2205 out1: 2208 out1:
2206 unlock_mount(mp); 2209 unlock_mount(mp);
2207 out: 2210 out:
2208 if (!err) 2211 if (!err)
2209 path_put(&parent_path); 2212 path_put(&parent_path);
2210 path_put(&old_path); 2213 path_put(&old_path);
2211 return err; 2214 return err;
2212 } 2215 }
2213 2216
2214 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 2217 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
2215 { 2218 {
2216 int err; 2219 int err;
2217 const char *subtype = strchr(fstype, '.'); 2220 const char *subtype = strchr(fstype, '.');
2218 if (subtype) { 2221 if (subtype) {
2219 subtype++; 2222 subtype++;
2220 err = -EINVAL; 2223 err = -EINVAL;
2221 if (!subtype[0]) 2224 if (!subtype[0])
2222 goto err; 2225 goto err;
2223 } else 2226 } else
2224 subtype = ""; 2227 subtype = "";
2225 2228
2226 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); 2229 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
2227 err = -ENOMEM; 2230 err = -ENOMEM;
2228 if (!mnt->mnt_sb->s_subtype) 2231 if (!mnt->mnt_sb->s_subtype)
2229 goto err; 2232 goto err;
2230 return mnt; 2233 return mnt;
2231 2234
2232 err: 2235 err:
2233 mntput(mnt); 2236 mntput(mnt);
2234 return ERR_PTR(err); 2237 return ERR_PTR(err);
2235 } 2238 }
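
For FS_HAS_SUBTYPE filesystems (FUSE, in practice) the piece after the first dot in the type string becomes sb->s_subtype:

	/*
	 * fstype "fuse.sshfs"  ->  s_subtype = "sshfs"; the mount shows
	 *                          up as type "fuse.sshfs" in /proc/mounts
	 * fstype "fuse."       ->  -EINVAL, the subtype is empty
	 */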
2236 2239
2237 /* 2240 /*
2238 * add a mount into a namespace's mount tree 2241 * add a mount into a namespace's mount tree
2239 */ 2242 */
2240 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) 2243 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
2241 { 2244 {
2242 struct mountpoint *mp; 2245 struct mountpoint *mp;
2243 struct mount *parent; 2246 struct mount *parent;
2244 int err; 2247 int err;
2245 2248
2246 mnt_flags &= ~MNT_INTERNAL_FLAGS; 2249 mnt_flags &= ~MNT_INTERNAL_FLAGS;
2247 2250
2248 mp = lock_mount(path); 2251 mp = lock_mount(path);
2249 if (IS_ERR(mp)) 2252 if (IS_ERR(mp))
2250 return PTR_ERR(mp); 2253 return PTR_ERR(mp);
2251 2254
2252 parent = real_mount(path->mnt); 2255 parent = real_mount(path->mnt);
2253 err = -EINVAL; 2256 err = -EINVAL;
2254 if (unlikely(!check_mnt(parent))) { 2257 if (unlikely(!check_mnt(parent))) {
2255 /* that's acceptable only for automounts done in private ns */ 2258 /* that's acceptable only for automounts done in private ns */
2256 if (!(mnt_flags & MNT_SHRINKABLE)) 2259 if (!(mnt_flags & MNT_SHRINKABLE))
2257 goto unlock; 2260 goto unlock;
2258 /* ... and for those we'd better have mountpoint still alive */ 2261 /* ... and for those we'd better have mountpoint still alive */
2259 if (!parent->mnt_ns) 2262 if (!parent->mnt_ns)
2260 goto unlock; 2263 goto unlock;
2261 } 2264 }
2262 2265
2263 /* Refuse the same filesystem on the same mount point */ 2266 /* Refuse the same filesystem on the same mount point */
2264 err = -EBUSY; 2267 err = -EBUSY;
2265 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && 2268 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2266 path->mnt->mnt_root == path->dentry) 2269 path->mnt->mnt_root == path->dentry)
2267 goto unlock; 2270 goto unlock;
2268 2271
2269 err = -EINVAL; 2272 err = -EINVAL;
2270 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) 2273 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
2271 goto unlock; 2274 goto unlock;
2272 2275
2273 newmnt->mnt.mnt_flags = mnt_flags; 2276 newmnt->mnt.mnt_flags = mnt_flags;
2274 err = graft_tree(newmnt, parent, mp); 2277 err = graft_tree(newmnt, parent, mp);
2275 2278
2276 unlock: 2279 unlock:
2277 unlock_mount(mp); 2280 unlock_mount(mp);
2278 return err; 2281 return err;
2279 } 2282 }
2280 2283
2281 /* 2284 /*
2282 * create a new mount for userspace and request it to be added into the 2285 * create a new mount for userspace and request it to be added into the
2283 * namespace's tree 2286 * namespace's tree
2284 */ 2287 */
2285 static int do_new_mount(struct path *path, const char *fstype, int flags, 2288 static int do_new_mount(struct path *path, const char *fstype, int flags,
2286 int mnt_flags, const char *name, void *data) 2289 int mnt_flags, const char *name, void *data)
2287 { 2290 {
2288 struct file_system_type *type; 2291 struct file_system_type *type;
2289 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; 2292 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2290 struct vfsmount *mnt; 2293 struct vfsmount *mnt;
2291 int err; 2294 int err;
2292 2295
2293 if (!fstype) 2296 if (!fstype)
2294 return -EINVAL; 2297 return -EINVAL;
2295 2298
2296 type = get_fs_type(fstype); 2299 type = get_fs_type(fstype);
2297 if (!type) 2300 if (!type)
2298 return -ENODEV; 2301 return -ENODEV;
2299 2302
2300 if (user_ns != &init_user_ns) { 2303 if (user_ns != &init_user_ns) {
2301 if (!(type->fs_flags & FS_USERNS_MOUNT)) { 2304 if (!(type->fs_flags & FS_USERNS_MOUNT)) {
2302 put_filesystem(type); 2305 put_filesystem(type);
2303 return -EPERM; 2306 return -EPERM;
2304 } 2307 }
2305 /* Only in special cases allow devices from mounts 2308 /* Only in special cases allow devices from mounts
2306 * created outside the initial user namespace. 2309 * created outside the initial user namespace.
2307 */ 2310 */
2308 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 2311 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
2309 flags |= MS_NODEV; 2312 flags |= MS_NODEV;
2310 mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; 2313 mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
2311 } 2314 }
2312 } 2315 }
2313 2316
2314 mnt = vfs_kern_mount(type, flags, name, data); 2317 mnt = vfs_kern_mount(type, flags, name, data);
2315 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && 2318 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
2316 !mnt->mnt_sb->s_subtype) 2319 !mnt->mnt_sb->s_subtype)
2317 mnt = fs_set_subtype(mnt, fstype); 2320 mnt = fs_set_subtype(mnt, fstype);
2318 2321
2319 put_filesystem(type); 2322 put_filesystem(type);
2320 if (IS_ERR(mnt)) 2323 if (IS_ERR(mnt))
2321 return PTR_ERR(mnt); 2324 return PTR_ERR(mnt);
2322 2325
2323 err = do_add_mount(real_mount(mnt), path, mnt_flags); 2326 err = do_add_mount(real_mount(mnt), path, mnt_flags);
2324 if (err) 2327 if (err)
2325 mntput(mnt); 2328 mntput(mnt);
2326 return err; 2329 return err;
2327 } 2330 }
2328 2331
2329 int finish_automount(struct vfsmount *m, struct path *path) 2332 int finish_automount(struct vfsmount *m, struct path *path)
2330 { 2333 {
2331 struct mount *mnt = real_mount(m); 2334 struct mount *mnt = real_mount(m);
2332 int err; 2335 int err;
2333 /* The new mount record should have at least 2 refs to prevent it from being 2336 /* The new mount record should have at least 2 refs to prevent it from being
2334 * expired before we get a chance to add it 2337 * expired before we get a chance to add it
2335 */ 2338 */
2336 BUG_ON(mnt_get_count(mnt) < 2); 2339 BUG_ON(mnt_get_count(mnt) < 2);
2337 2340
2338 if (m->mnt_sb == path->mnt->mnt_sb && 2341 if (m->mnt_sb == path->mnt->mnt_sb &&
2339 m->mnt_root == path->dentry) { 2342 m->mnt_root == path->dentry) {
2340 err = -ELOOP; 2343 err = -ELOOP;
2341 goto fail; 2344 goto fail;
2342 } 2345 }
2343 2346
2344 err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); 2347 err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
2345 if (!err) 2348 if (!err)
2346 return 0; 2349 return 0;
2347 fail: 2350 fail:
2348 /* remove m from any expiration list it may be on */ 2351 /* remove m from any expiration list it may be on */
2349 if (!list_empty(&mnt->mnt_expire)) { 2352 if (!list_empty(&mnt->mnt_expire)) {
2350 namespace_lock(); 2353 namespace_lock();
2351 list_del_init(&mnt->mnt_expire); 2354 list_del_init(&mnt->mnt_expire);
2352 namespace_unlock(); 2355 namespace_unlock();
2353 } 2356 }
2354 mntput(m); 2357 mntput(m);
2355 mntput(m); 2358 mntput(m);
2356 return err; 2359 return err;
2357 } 2360 }
2358 2361
2359 /** 2362 /**
2360 * mnt_set_expiry - Put a mount on an expiration list 2363 * mnt_set_expiry - Put a mount on an expiration list
2361 * @mnt: The mount to list. 2364 * @mnt: The mount to list.
2362 * @expiry_list: The list to add the mount to. 2365 * @expiry_list: The list to add the mount to.
2363 */ 2366 */
2364 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) 2367 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
2365 { 2368 {
2366 namespace_lock(); 2369 namespace_lock();
2367 2370
2368 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); 2371 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
2369 2372
2370 namespace_unlock(); 2373 namespace_unlock();
2371 } 2374 }
2372 EXPORT_SYMBOL(mnt_set_expiry); 2375 EXPORT_SYMBOL(mnt_set_expiry);
2373 2376
2374 /* 2377 /*
2375 * process a list of expirable mountpoints with the intent of discarding any 2378 * process a list of expirable mountpoints with the intent of discarding any
2376 * mountpoints that aren't in use and haven't been touched since we last 2379 * mountpoints that aren't in use and haven't been touched since we last
2377 * came here 2380 * came here
2378 */ 2381 */
2379 void mark_mounts_for_expiry(struct list_head *mounts) 2382 void mark_mounts_for_expiry(struct list_head *mounts)
2380 { 2383 {
2381 struct mount *mnt, *next; 2384 struct mount *mnt, *next;
2382 LIST_HEAD(graveyard); 2385 LIST_HEAD(graveyard);
2383 2386
2384 if (list_empty(mounts)) 2387 if (list_empty(mounts))
2385 return; 2388 return;
2386 2389
2387 namespace_lock(); 2390 namespace_lock();
2388 lock_mount_hash(); 2391 lock_mount_hash();
2389 2392
2390 /* extract from the expiration list every vfsmount that matches the 2393 /* extract from the expiration list every vfsmount that matches the
2391 * following criteria: 2394 * following criteria:
2392 * - only referenced by its parent vfsmount 2395 * - only referenced by its parent vfsmount
2393 * - still marked for expiry (marked on the last call here; marks are 2396 * - still marked for expiry (marked on the last call here; marks are
2394 * cleared by mntput()) 2397 * cleared by mntput())
2395 */ 2398 */
2396 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { 2399 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
2397 if (!xchg(&mnt->mnt_expiry_mark, 1) || 2400 if (!xchg(&mnt->mnt_expiry_mark, 1) ||
2398 propagate_mount_busy(mnt, 1)) 2401 propagate_mount_busy(mnt, 1))
2399 continue; 2402 continue;
2400 list_move(&mnt->mnt_expire, &graveyard); 2403 list_move(&mnt->mnt_expire, &graveyard);
2401 } 2404 }
2402 while (!list_empty(&graveyard)) { 2405 while (!list_empty(&graveyard)) {
2403 mnt = list_first_entry(&graveyard, struct mount, mnt_expire); 2406 mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
2404 touch_mnt_namespace(mnt->mnt_ns); 2407 touch_mnt_namespace(mnt->mnt_ns);
2405 umount_tree(mnt, 1); 2408 umount_tree(mnt, 1);
2406 } 2409 }
2407 unlock_mount_hash(); 2410 unlock_mount_hash();
2408 namespace_unlock(); 2411 namespace_unlock();
2409 } 2412 }
2410 2413
2411 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 2414 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
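
mnt_set_expiry() and mark_mounts_for_expiry() form a two-pass reaper: one pass only sets mnt_expiry_mark (mntput() clears it again on any use), and a mount still marked on the next pass gets unmounted. A hedged consumer sketch, loosely in the style of the AFS/NFS automount code; every example_* name is made up:

	static LIST_HEAD(example_expiry_list);

	/* called from ->d_automount() after finish_automount() succeeds */
	static void example_track(struct vfsmount *newmnt)
	{
		mnt_set_expiry(newmnt, &example_expiry_list);
	}

	/* from a periodic worker: pass N marks, pass N+1 reaps whatever
	 * stayed marked, i.e. went unused for a whole period */
	static void example_reap(struct work_struct *work)
	{
		mark_mounts_for_expiry(&example_expiry_list);
	}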
2412 2415
2413 /* 2416 /*
2414 * Ripoff of 'select_parent()' 2417 * Ripoff of 'select_parent()'
2415 * 2418 *
2416 * search the list of submounts for a given mountpoint, and move any 2419 * search the list of submounts for a given mountpoint, and move any
2417 * shrinkable submounts to the 'graveyard' list. 2420 * shrinkable submounts to the 'graveyard' list.
2418 */ 2421 */
2419 static int select_submounts(struct mount *parent, struct list_head *graveyard) 2422 static int select_submounts(struct mount *parent, struct list_head *graveyard)
2420 { 2423 {
2421 struct mount *this_parent = parent; 2424 struct mount *this_parent = parent;
2422 struct list_head *next; 2425 struct list_head *next;
2423 int found = 0; 2426 int found = 0;
2424 2427
2425 repeat: 2428 repeat:
2426 next = this_parent->mnt_mounts.next; 2429 next = this_parent->mnt_mounts.next;
2427 resume: 2430 resume:
2428 while (next != &this_parent->mnt_mounts) { 2431 while (next != &this_parent->mnt_mounts) {
2429 struct list_head *tmp = next; 2432 struct list_head *tmp = next;
2430 struct mount *mnt = list_entry(tmp, struct mount, mnt_child); 2433 struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
2431 2434
2432 next = tmp->next; 2435 next = tmp->next;
2433 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) 2436 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
2434 continue; 2437 continue;
2435 /* 2438 /*
2436 * Descend a level if the mnt_mounts list is non-empty. 2439 * Descend a level if the mnt_mounts list is non-empty.
2437 */ 2440 */
2438 if (!list_empty(&mnt->mnt_mounts)) { 2441 if (!list_empty(&mnt->mnt_mounts)) {
2439 this_parent = mnt; 2442 this_parent = mnt;
2440 goto repeat; 2443 goto repeat;
2441 } 2444 }
2442 2445
2443 if (!propagate_mount_busy(mnt, 1)) { 2446 if (!propagate_mount_busy(mnt, 1)) {
2444 list_move_tail(&mnt->mnt_expire, graveyard); 2447 list_move_tail(&mnt->mnt_expire, graveyard);
2445 found++; 2448 found++;
2446 } 2449 }
2447 } 2450 }
2448 /* 2451 /*
2449 * All done at this level ... ascend and resume the search 2452 * All done at this level ... ascend and resume the search
2450 */ 2453 */
2451 if (this_parent != parent) { 2454 if (this_parent != parent) {
2452 next = this_parent->mnt_child.next; 2455 next = this_parent->mnt_child.next;
2453 this_parent = this_parent->mnt_parent; 2456 this_parent = this_parent->mnt_parent;
2454 goto resume; 2457 goto resume;
2455 } 2458 }
2456 return found; 2459 return found;
2457 } 2460 }
2458 2461
2459 /* 2462 /*
2460 * process a list of expirable mountpoints with the intent of discarding any 2463 * process a list of expirable mountpoints with the intent of discarding any
2461 * submounts of a specific parent mountpoint 2464 * submounts of a specific parent mountpoint
2462 * 2465 *
2463 * mount_lock must be held for write 2466 * mount_lock must be held for write
2464 */ 2467 */
2465 static void shrink_submounts(struct mount *mnt) 2468 static void shrink_submounts(struct mount *mnt)
2466 { 2469 {
2467 LIST_HEAD(graveyard); 2470 LIST_HEAD(graveyard);
2468 struct mount *m; 2471 struct mount *m;
2469 2472
2470 /* extract submounts of 'mountpoint' from the expiration list */ 2473 /* extract submounts of 'mountpoint' from the expiration list */
2471 while (select_submounts(mnt, &graveyard)) { 2474 while (select_submounts(mnt, &graveyard)) {
2472 while (!list_empty(&graveyard)) { 2475 while (!list_empty(&graveyard)) {
2473 m = list_first_entry(&graveyard, struct mount, 2476 m = list_first_entry(&graveyard, struct mount,
2474 mnt_expire); 2477 mnt_expire);
2475 touch_mnt_namespace(m->mnt_ns); 2478 touch_mnt_namespace(m->mnt_ns);
2476 umount_tree(m, 1); 2479 umount_tree(m, 1);
2477 } 2480 }
2478 } 2481 }
2479 } 2482 }
2480 2483
2481 /* 2484 /*
2482 * Some copy_from_user() implementations do not return the exact number of 2485 * Some copy_from_user() implementations do not return the exact number of
2483 * bytes remaining to copy on a fault. But copy_mount_options() requires that. 2486 * bytes remaining to copy on a fault. But copy_mount_options() requires that.
2484 * Note that this function differs from copy_from_user() in that it will oops 2487 * Note that this function differs from copy_from_user() in that it will oops
2485 * on bad values of `to', rather than returning a short copy. 2488 * on bad values of `to', rather than returning a short copy.
2486 */ 2489 */
2487 static long exact_copy_from_user(void *to, const void __user * from, 2490 static long exact_copy_from_user(void *to, const void __user * from,
2488 unsigned long n) 2491 unsigned long n)
2489 { 2492 {
2490 char *t = to; 2493 char *t = to;
2491 const char __user *f = from; 2494 const char __user *f = from;
2492 char c; 2495 char c;
2493 2496
2494 if (!access_ok(VERIFY_READ, from, n)) 2497 if (!access_ok(VERIFY_READ, from, n))
2495 return n; 2498 return n;
2496 2499
2497 while (n) { 2500 while (n) {
2498 if (__get_user(c, f)) { 2501 if (__get_user(c, f)) {
2499 memset(t, 0, n); 2502 memset(t, 0, n);
2500 break; 2503 break;
2501 } 2504 }
2502 *t++ = c; 2505 *t++ = c;
2503 f++; 2506 f++;
2504 n--; 2507 n--;
2505 } 2508 }
2506 return n; 2509 return n;
2507 } 2510 }
2508 2511
2509 int copy_mount_options(const void __user * data, unsigned long *where) 2512 int copy_mount_options(const void __user * data, unsigned long *where)
2510 { 2513 {
2511 int i; 2514 int i;
2512 unsigned long page; 2515 unsigned long page;
2513 unsigned long size; 2516 unsigned long size;
2514 2517
2515 *where = 0; 2518 *where = 0;
2516 if (!data) 2519 if (!data)
2517 return 0; 2520 return 0;
2518 2521
2519 if (!(page = __get_free_page(GFP_KERNEL))) 2522 if (!(page = __get_free_page(GFP_KERNEL)))
2520 return -ENOMEM; 2523 return -ENOMEM;
2521 2524
2522 /* We only care that *some* data at the address the user 2525 /* We only care that *some* data at the address the user
2523 * gave us is valid. Just in case, we'll zero 2526 * gave us is valid. Just in case, we'll zero
2524 * the remainder of the page. 2527 * the remainder of the page.
2525 */ 2528 */
2526 /* copy_from_user cannot cross TASK_SIZE ! */ 2529 /* copy_from_user cannot cross TASK_SIZE ! */
2527 size = TASK_SIZE - (unsigned long)data; 2530 size = TASK_SIZE - (unsigned long)data;
2528 if (size > PAGE_SIZE) 2531 if (size > PAGE_SIZE)
2529 size = PAGE_SIZE; 2532 size = PAGE_SIZE;
2530 2533
2531 i = size - exact_copy_from_user((void *)page, data, size); 2534 i = size - exact_copy_from_user((void *)page, data, size);
2532 if (!i) { 2535 if (!i) {
2533 free_page(page); 2536 free_page(page);
2534 return -EFAULT; 2537 return -EFAULT;
2535 } 2538 }
2536 if (i != PAGE_SIZE) 2539 if (i != PAGE_SIZE)
2537 memset((char *)page + i, 0, PAGE_SIZE - i); 2540 memset((char *)page + i, 0, PAGE_SIZE - i);
2538 *where = page; 2541 *where = page;
2539 return 0; 2542 return 0;
2540 } 2543 }
2541 2544
2542 char *copy_mount_string(const void __user *data) 2545 char *copy_mount_string(const void __user *data)
2543 { 2546 {
2544 return data ? strndup_user(data, PAGE_SIZE) : NULL; 2547 return data ? strndup_user(data, PAGE_SIZE) : NULL;
2545 } 2548 }
2546 2549
2547 /* 2550 /*
2548 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to 2551 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
2549 * be given to the mount() call (ie: read-only, no-dev, no-suid etc). 2552 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
2550 * 2553 *
2551 * data is a (void *) that can point to any structure up to 2554 * data is a (void *) that can point to any structure up to
2552 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent 2555 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
2553 * information (or be NULL). 2556 * information (or be NULL).
2554 * 2557 *
2555 * Pre-0.97 versions of mount() didn't have a flags word. 2558 * Pre-0.97 versions of mount() didn't have a flags word.
2556 * When the flags word was introduced its top half was required 2559 * When the flags word was introduced its top half was required
2557 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. 2560 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
2558 * Therefore, if this magic number is present, it carries no information 2561 * Therefore, if this magic number is present, it carries no information
2559 * and must be discarded. 2562 * and must be discarded.
2560 */ 2563 */
2561 long do_mount(const char *dev_name, const char __user *dir_name, 2564 long do_mount(const char *dev_name, const char __user *dir_name,
2562 const char *type_page, unsigned long flags, void *data_page) 2565 const char *type_page, unsigned long flags, void *data_page)
2563 { 2566 {
2564 struct path path; 2567 struct path path;
2565 int retval = 0; 2568 int retval = 0;
2566 int mnt_flags = 0; 2569 int mnt_flags = 0;
2567 2570
2568 /* Discard magic */ 2571 /* Discard magic */
2569 if ((flags & MS_MGC_MSK) == MS_MGC_VAL) 2572 if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
2570 flags &= ~MS_MGC_MSK; 2573 flags &= ~MS_MGC_MSK;
2571 2574
2572 /* Basic sanity checks */ 2575 /* Basic sanity checks */
2573 if (data_page) 2576 if (data_page)
2574 ((char *)data_page)[PAGE_SIZE - 1] = 0; 2577 ((char *)data_page)[PAGE_SIZE - 1] = 0;
2575 2578
2576 /* ... and get the mountpoint */ 2579 /* ... and get the mountpoint */
2577 retval = user_path(dir_name, &path); 2580 retval = user_path(dir_name, &path);
2578 if (retval) 2581 if (retval)
2579 return retval; 2582 return retval;
2580 2583
2581 retval = security_sb_mount(dev_name, &path, 2584 retval = security_sb_mount(dev_name, &path,
2582 type_page, flags, data_page); 2585 type_page, flags, data_page);
2583 if (!retval && !may_mount()) 2586 if (!retval && !may_mount())
2584 retval = -EPERM; 2587 retval = -EPERM;
2585 if (retval) 2588 if (retval)
2586 goto dput_out; 2589 goto dput_out;
2587 2590
2588 /* Default to relatime unless overridden */ 2591 /* Default to relatime unless overridden */
2589 if (!(flags & MS_NOATIME)) 2592 if (!(flags & MS_NOATIME))
2590 mnt_flags |= MNT_RELATIME; 2593 mnt_flags |= MNT_RELATIME;
2591 2594
2592 /* Separate the per-mountpoint flags */ 2595 /* Separate the per-mountpoint flags */
2593 if (flags & MS_NOSUID) 2596 if (flags & MS_NOSUID)
2594 mnt_flags |= MNT_NOSUID; 2597 mnt_flags |= MNT_NOSUID;
2595 if (flags & MS_NODEV) 2598 if (flags & MS_NODEV)
2596 mnt_flags |= MNT_NODEV; 2599 mnt_flags |= MNT_NODEV;
2597 if (flags & MS_NOEXEC) 2600 if (flags & MS_NOEXEC)
2598 mnt_flags |= MNT_NOEXEC; 2601 mnt_flags |= MNT_NOEXEC;
2599 if (flags & MS_NOATIME) 2602 if (flags & MS_NOATIME)
2600 mnt_flags |= MNT_NOATIME; 2603 mnt_flags |= MNT_NOATIME;
2601 if (flags & MS_NODIRATIME) 2604 if (flags & MS_NODIRATIME)
2602 mnt_flags |= MNT_NODIRATIME; 2605 mnt_flags |= MNT_NODIRATIME;
2603 if (flags & MS_STRICTATIME) 2606 if (flags & MS_STRICTATIME)
2604 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); 2607 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
2605 if (flags & MS_RDONLY) 2608 if (flags & MS_RDONLY)
2606 mnt_flags |= MNT_READONLY; 2609 mnt_flags |= MNT_READONLY;
2607 2610
2608 /* The default atime for remount is preservation */ 2611 /* The default atime for remount is preservation */
2609 if ((flags & MS_REMOUNT) && 2612 if ((flags & MS_REMOUNT) &&
2610 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | 2613 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
2611 MS_STRICTATIME)) == 0)) { 2614 MS_STRICTATIME)) == 0)) {
2612 mnt_flags &= ~MNT_ATIME_MASK; 2615 mnt_flags &= ~MNT_ATIME_MASK;
2613 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; 2616 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
2614 } 2617 }
2615 2618
2616 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2619 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2617 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2620 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2618 MS_STRICTATIME); 2621 MS_STRICTATIME);
2619 2622
2620 if (flags & MS_REMOUNT) 2623 if (flags & MS_REMOUNT)
2621 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2624 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
2622 data_page); 2625 data_page);
2623 else if (flags & MS_BIND) 2626 else if (flags & MS_BIND)
2624 retval = do_loopback(&path, dev_name, flags & MS_REC); 2627 retval = do_loopback(&path, dev_name, flags & MS_REC);
2625 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 2628 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2626 retval = do_change_type(&path, flags); 2629 retval = do_change_type(&path, flags);
2627 else if (flags & MS_MOVE) 2630 else if (flags & MS_MOVE)
2628 retval = do_move_mount(&path, dev_name); 2631 retval = do_move_mount(&path, dev_name);
2629 else 2632 else
2630 retval = do_new_mount(&path, type_page, flags, mnt_flags, 2633 retval = do_new_mount(&path, type_page, flags, mnt_flags,
2631 dev_name, data_page); 2634 dev_name, data_page);
2632 dput_out: 2635 dput_out:
2633 path_put(&path); 2636 path_put(&path);
2634 return retval; 2637 return retval;
2635 } 2638 }
2636 2639
2637 static void free_mnt_ns(struct mnt_namespace *ns) 2640 static void free_mnt_ns(struct mnt_namespace *ns)
2638 { 2641 {
2639 ns_free_inum(&ns->ns); 2642 ns_free_inum(&ns->ns);
2640 put_user_ns(ns->user_ns); 2643 put_user_ns(ns->user_ns);
2641 kfree(ns); 2644 kfree(ns);
2642 } 2645 }
2643 2646
2644 /* 2647 /*
2645 * Assign a sequence number so we can detect when we attempt to bind 2648 * Assign a sequence number so we can detect when we attempt to bind
2646 * mount a reference to an older mount namespace into the current 2649 * mount a reference to an older mount namespace into the current
2647 * mount namespace, preventing reference counting loops. A 64bit 2650 * mount namespace, preventing reference counting loops. A 64bit
2648 * number incrementing even at 10GHz would take about 58 years to wrap, 2651 * number incrementing even at 10GHz would take about 58 years to wrap,
2649 * which at realistic creation rates is effectively never, so we can ignore it. 2652 * which at realistic creation rates is effectively never, so we can ignore it.
2650 */ 2653 */
2651 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); 2654 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2652 2655
2653 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) 2656 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
2654 { 2657 {
2655 struct mnt_namespace *new_ns; 2658 struct mnt_namespace *new_ns;
2656 int ret; 2659 int ret;
2657 2660
2658 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2661 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2659 if (!new_ns) 2662 if (!new_ns)
2660 return ERR_PTR(-ENOMEM); 2663 return ERR_PTR(-ENOMEM);
2661 ret = ns_alloc_inum(&new_ns->ns); 2664 ret = ns_alloc_inum(&new_ns->ns);
2662 if (ret) { 2665 if (ret) {
2663 kfree(new_ns); 2666 kfree(new_ns);
2664 return ERR_PTR(ret); 2667 return ERR_PTR(ret);
2665 } 2668 }
2666 new_ns->ns.ops = &mntns_operations; 2669 new_ns->ns.ops = &mntns_operations;
2667 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); 2670 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2668 atomic_set(&new_ns->count, 1); 2671 atomic_set(&new_ns->count, 1);
2669 new_ns->root = NULL; 2672 new_ns->root = NULL;
2670 INIT_LIST_HEAD(&new_ns->list); 2673 INIT_LIST_HEAD(&new_ns->list);
2671 init_waitqueue_head(&new_ns->poll); 2674 init_waitqueue_head(&new_ns->poll);
2672 new_ns->event = 0; 2675 new_ns->event = 0;
2673 new_ns->user_ns = get_user_ns(user_ns); 2676 new_ns->user_ns = get_user_ns(user_ns);
2674 return new_ns; 2677 return new_ns;
2675 } 2678 }
2676 2679
2677 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2680 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2678 struct user_namespace *user_ns, struct fs_struct *new_fs) 2681 struct user_namespace *user_ns, struct fs_struct *new_fs)
2679 { 2682 {
2680 struct mnt_namespace *new_ns; 2683 struct mnt_namespace *new_ns;
2681 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2684 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
2682 struct mount *p, *q; 2685 struct mount *p, *q;
2683 struct mount *old; 2686 struct mount *old;
2684 struct mount *new; 2687 struct mount *new;
2685 int copy_flags; 2688 int copy_flags;
2686 2689
2687 BUG_ON(!ns); 2690 BUG_ON(!ns);
2688 2691
2689 if (likely(!(flags & CLONE_NEWNS))) { 2692 if (likely(!(flags & CLONE_NEWNS))) {
2690 get_mnt_ns(ns); 2693 get_mnt_ns(ns);
2691 return ns; 2694 return ns;
2692 } 2695 }
2693 2696
2694 old = ns->root; 2697 old = ns->root;
2695 2698
2696 new_ns = alloc_mnt_ns(user_ns); 2699 new_ns = alloc_mnt_ns(user_ns);
2697 if (IS_ERR(new_ns)) 2700 if (IS_ERR(new_ns))
2698 return new_ns; 2701 return new_ns;
2699 2702
2700 namespace_lock(); 2703 namespace_lock();
2701 /* First pass: copy the tree topology */ 2704 /* First pass: copy the tree topology */
2702 copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; 2705 copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
2703 if (user_ns != ns->user_ns) 2706 if (user_ns != ns->user_ns)
2704 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; 2707 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
2705 new = copy_tree(old, old->mnt.mnt_root, copy_flags); 2708 new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2706 if (IS_ERR(new)) { 2709 if (IS_ERR(new)) {
2707 namespace_unlock(); 2710 namespace_unlock();
2708 free_mnt_ns(new_ns); 2711 free_mnt_ns(new_ns);
2709 return ERR_CAST(new); 2712 return ERR_CAST(new);
2710 } 2713 }
2711 new_ns->root = new; 2714 new_ns->root = new;
2712 list_add_tail(&new_ns->list, &new->mnt_list); 2715 list_add_tail(&new_ns->list, &new->mnt_list);
2713 2716
2714 /* 2717 /*
2715 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 2718 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
2716 * as belonging to new namespace. We have already acquired a private 2719 * as belonging to new namespace. We have already acquired a private
2717 * fs_struct, so tsk->fs->lock is not needed. 2720 * fs_struct, so tsk->fs->lock is not needed.
2718 */ 2721 */
2719 p = old; 2722 p = old;
2720 q = new; 2723 q = new;
2721 while (p) { 2724 while (p) {
2722 q->mnt_ns = new_ns; 2725 q->mnt_ns = new_ns;
2723 if (new_fs) { 2726 if (new_fs) {
2724 if (&p->mnt == new_fs->root.mnt) { 2727 if (&p->mnt == new_fs->root.mnt) {
2725 new_fs->root.mnt = mntget(&q->mnt); 2728 new_fs->root.mnt = mntget(&q->mnt);
2726 rootmnt = &p->mnt; 2729 rootmnt = &p->mnt;
2727 } 2730 }
2728 if (&p->mnt == new_fs->pwd.mnt) { 2731 if (&p->mnt == new_fs->pwd.mnt) {
2729 new_fs->pwd.mnt = mntget(&q->mnt); 2732 new_fs->pwd.mnt = mntget(&q->mnt);
2730 pwdmnt = &p->mnt; 2733 pwdmnt = &p->mnt;
2731 } 2734 }
2732 } 2735 }
2733 p = next_mnt(p, old); 2736 p = next_mnt(p, old);
2734 q = next_mnt(q, new); 2737 q = next_mnt(q, new);
2735 if (!q) 2738 if (!q)
2736 break; 2739 break;
2737 while (p->mnt.mnt_root != q->mnt.mnt_root) 2740 while (p->mnt.mnt_root != q->mnt.mnt_root)
2738 p = next_mnt(p, old); 2741 p = next_mnt(p, old);
2739 } 2742 }
2740 namespace_unlock(); 2743 namespace_unlock();
2741 2744
2742 if (rootmnt) 2745 if (rootmnt)
2743 mntput(rootmnt); 2746 mntput(rootmnt);
2744 if (pwdmnt) 2747 if (pwdmnt)
2745 mntput(pwdmnt); 2748 mntput(pwdmnt);
2746 2749
2747 return new_ns; 2750 return new_ns;
2748 } 2751 }
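copy_mnt_ns() is the kernel side of clone(2)/unshare(2) with CLONE_NEWNS: the first pass above duplicates the mount tree, the second rewires the caller's root and cwd onto the copies. A minimal, hedged userspace trigger (needs CAP_SYS_ADMIN; error handling abbreviated):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <sys/mount.h>

        int main(void)
        {
                if (unshare(CLONE_NEWNS) == -1) {       /* ends up in copy_mnt_ns() */
                        perror("unshare");
                        return 1;
                }
                /* keep later mounts from propagating back to the old namespace */
                if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1)
                        perror("mount");
                return 0;
        }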
2749 2752
2750 /** 2753 /**
2751 * create_mnt_ns - creates a private namespace and adds a root filesystem 2754 * create_mnt_ns - creates a private namespace and adds a root filesystem
2752 * @m: pointer to the new root filesystem mountpoint 2755 * @m: pointer to the new root filesystem mountpoint
2753 */ 2756 */
2754 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2757 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
2755 { 2758 {
2756 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); 2759 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2757 if (!IS_ERR(new_ns)) { 2760 if (!IS_ERR(new_ns)) {
2758 struct mount *mnt = real_mount(m); 2761 struct mount *mnt = real_mount(m);
2759 mnt->mnt_ns = new_ns; 2762 mnt->mnt_ns = new_ns;
2760 new_ns->root = mnt; 2763 new_ns->root = mnt;
2761 list_add(&mnt->mnt_list, &new_ns->list); 2764 list_add(&mnt->mnt_list, &new_ns->list);
2762 } else { 2765 } else {
2763 mntput(m); 2766 mntput(m);
2764 } 2767 }
2765 return new_ns; 2768 return new_ns;
2766 } 2769 }
2767 2770
2768 struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) 2771 struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
2769 { 2772 {
2770 struct mnt_namespace *ns; 2773 struct mnt_namespace *ns;
2771 struct super_block *s; 2774 struct super_block *s;
2772 struct path path; 2775 struct path path;
2773 int err; 2776 int err;
2774 2777
2775 ns = create_mnt_ns(mnt); 2778 ns = create_mnt_ns(mnt);
2776 if (IS_ERR(ns)) 2779 if (IS_ERR(ns))
2777 return ERR_CAST(ns); 2780 return ERR_CAST(ns);
2778 2781
2779 err = vfs_path_lookup(mnt->mnt_root, mnt, 2782 err = vfs_path_lookup(mnt->mnt_root, mnt,
2780 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 2783 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2781 2784
2782 put_mnt_ns(ns); 2785 put_mnt_ns(ns);
2783 2786
2784 if (err) 2787 if (err)
2785 return ERR_PTR(err); 2788 return ERR_PTR(err);
2786 2789
2787 /* trade a vfsmount reference for active sb one */ 2790 /* trade a vfsmount reference for active sb one */
2788 s = path.mnt->mnt_sb; 2791 s = path.mnt->mnt_sb;
2789 atomic_inc(&s->s_active); 2792 atomic_inc(&s->s_active);
2790 mntput(path.mnt); 2793 mntput(path.mnt);
2791 /* lock the sucker */ 2794 /* lock the sucker */
2792 down_write(&s->s_umount); 2795 down_write(&s->s_umount);
2793 /* ... and return the root of (sub)tree on it */ 2796 /* ... and return the root of (sub)tree on it */
2794 return path.dentry; 2797 return path.dentry;
2795 } 2798 }
2796 EXPORT_SYMBOL(mount_subtree); 2799 EXPORT_SYMBOL(mount_subtree);
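mount_subtree() lets a filesystem return the dentry of an interior path as the root of what ends up mounted; the temporary namespace built around the lookup is dropped before returning, trading the vfsmount reference for an active-superblock one. A hedged kernel-context fragment of a caller (inner_mnt and the path are hypothetical):

        struct dentry *root = mount_subtree(inner_mnt, "/export/sub");
        if (IS_ERR(root))
                return root;    /* the temporary namespace is already gone */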
2797 2800
2798 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2801 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2799 char __user *, type, unsigned long, flags, void __user *, data) 2802 char __user *, type, unsigned long, flags, void __user *, data)
2800 { 2803 {
2801 int ret; 2804 int ret;
2802 char *kernel_type; 2805 char *kernel_type;
2803 char *kernel_dev; 2806 char *kernel_dev;
2804 unsigned long data_page; 2807 unsigned long data_page;
2805 2808
2806 kernel_type = copy_mount_string(type); 2809 kernel_type = copy_mount_string(type);
2807 ret = PTR_ERR(kernel_type); 2810 ret = PTR_ERR(kernel_type);
2808 if (IS_ERR(kernel_type)) 2811 if (IS_ERR(kernel_type))
2809 goto out_type; 2812 goto out_type;
2810 2813
2811 kernel_dev = copy_mount_string(dev_name); 2814 kernel_dev = copy_mount_string(dev_name);
2812 ret = PTR_ERR(kernel_dev); 2815 ret = PTR_ERR(kernel_dev);
2813 if (IS_ERR(kernel_dev)) 2816 if (IS_ERR(kernel_dev))
2814 goto out_dev; 2817 goto out_dev;
2815 2818
2816 ret = copy_mount_options(data, &data_page); 2819 ret = copy_mount_options(data, &data_page);
2817 if (ret < 0) 2820 if (ret < 0)
2818 goto out_data; 2821 goto out_data;
2819 2822
2820 ret = do_mount(kernel_dev, dir_name, kernel_type, flags, 2823 ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
2821 (void *) data_page); 2824 (void *) data_page);
2822 2825
2823 free_page(data_page); 2826 free_page(data_page);
2824 out_data: 2827 out_data:
2825 kfree(kernel_dev); 2828 kfree(kernel_dev);
2826 out_dev: 2829 out_dev:
2827 kfree(kernel_type); 2830 kfree(kernel_type);
2828 out_type: 2831 out_type:
2829 return ret; 2832 return ret;
2830 } 2833 }
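The five parameters unpacked above are exactly the userspace mount(2) arguments; copy_mount_options() copies the data blob into a single page. For illustration (mount point hypothetical, needs privilege):

        #include <stdio.h>
        #include <sys/mount.h>

        int main(void)
        {
                /* dev_name, dir_name, type, flags, data, in that order */
                if (mount("tmpfs", "/mnt/scratch", "tmpfs", 0, "size=16m") == -1)
                        perror("mount");
                return 0;
        }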
2831 2834
2832 /* 2835 /*
2833 * Return true if path is reachable from root 2836 * Return true if path is reachable from root
2834 * 2837 *
2835 * namespace_sem or mount_lock is held 2838 * namespace_sem or mount_lock is held
2836 */ 2839 */
2837 bool is_path_reachable(struct mount *mnt, struct dentry *dentry, 2840 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
2838 const struct path *root) 2841 const struct path *root)
2839 { 2842 {
2840 while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { 2843 while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
2841 dentry = mnt->mnt_mountpoint; 2844 dentry = mnt->mnt_mountpoint;
2842 mnt = mnt->mnt_parent; 2845 mnt = mnt->mnt_parent;
2843 } 2846 }
2844 return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); 2847 return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
2845 } 2848 }
2846 2849
2847 int path_is_under(struct path *path1, struct path *path2) 2850 int path_is_under(struct path *path1, struct path *path2)
2848 { 2851 {
2849 int res; 2852 int res;
2850 read_seqlock_excl(&mount_lock); 2853 read_seqlock_excl(&mount_lock);
2851 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); 2854 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
2852 read_sequnlock_excl(&mount_lock); 2855 read_sequnlock_excl(&mount_lock);
2853 return res; 2856 return res;
2854 } 2857 }
2855 EXPORT_SYMBOL(path_is_under); 2858 EXPORT_SYMBOL(path_is_under);
2856 2859
2857 /* 2860 /*
2858 * pivot_root Semantics: 2861 * pivot_root Semantics:
2859 * Moves the root file system of the current process to the directory put_old, 2862 * Moves the root file system of the current process to the directory put_old,
2860 * makes new_root the new root file system of the current process, and sets 2863 * makes new_root the new root file system of the current process, and sets
2861 * root/cwd of all processes which had them on the current root to new_root. 2864 * root/cwd of all processes which had them on the current root to new_root.
2862 * 2865 *
2863 * Restrictions: 2866 * Restrictions:
2864 * The new_root and put_old must be directories, and must not be on the 2867 * The new_root and put_old must be directories, and must not be on the
2865 * same file system as the current process root. The put_old must be 2868 * same file system as the current process root. The put_old must be
2866 * underneath new_root, i.e. adding a non-zero number of /.. to the string 2869 * underneath new_root, i.e. adding a non-zero number of /.. to the string
2867 * pointed to by put_old must yield the same directory as new_root. No other 2870 * pointed to by put_old must yield the same directory as new_root. No other
2868 * file system may be mounted on put_old. After all, new_root is a mountpoint. 2871 * file system may be mounted on put_old. After all, new_root is a mountpoint.
2869 * 2872 *
2870 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. 2873 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
2871 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives 2874 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
2872 * in this situation. 2875 * in this situation.
2873 * 2876 *
2874 * Notes: 2877 * Notes:
2875 * - we don't move root/cwd if they are not at the root (reason: if something 2878 * - we don't move root/cwd if they are not at the root (reason: if something
2876 * cared enough to change them, it's probably wrong to force them elsewhere) 2879 * cared enough to change them, it's probably wrong to force them elsewhere)
2877 * - it's okay to pick a root that isn't the root of a file system, e.g. 2880 * - it's okay to pick a root that isn't the root of a file system, e.g.
2878 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, 2881 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
2879 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root 2882 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
2880 * first. 2883 * first.
2881 */ 2884 */
2882 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, 2885 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2883 const char __user *, put_old) 2886 const char __user *, put_old)
2884 { 2887 {
2885 struct path new, old, parent_path, root_parent, root; 2888 struct path new, old, parent_path, root_parent, root;
2886 struct mount *new_mnt, *root_mnt, *old_mnt; 2889 struct mount *new_mnt, *root_mnt, *old_mnt;
2887 struct mountpoint *old_mp, *root_mp; 2890 struct mountpoint *old_mp, *root_mp;
2888 int error; 2891 int error;
2889 2892
2890 if (!may_mount()) 2893 if (!may_mount())
2891 return -EPERM; 2894 return -EPERM;
2892 2895
2893 error = user_path_dir(new_root, &new); 2896 error = user_path_dir(new_root, &new);
2894 if (error) 2897 if (error)
2895 goto out0; 2898 goto out0;
2896 2899
2897 error = user_path_dir(put_old, &old); 2900 error = user_path_dir(put_old, &old);
2898 if (error) 2901 if (error)
2899 goto out1; 2902 goto out1;
2900 2903
2901 error = security_sb_pivotroot(&old, &new); 2904 error = security_sb_pivotroot(&old, &new);
2902 if (error) 2905 if (error)
2903 goto out2; 2906 goto out2;
2904 2907
2905 get_fs_root(current->fs, &root); 2908 get_fs_root(current->fs, &root);
2906 old_mp = lock_mount(&old); 2909 old_mp = lock_mount(&old);
2907 error = PTR_ERR(old_mp); 2910 error = PTR_ERR(old_mp);
2908 if (IS_ERR(old_mp)) 2911 if (IS_ERR(old_mp))
2909 goto out3; 2912 goto out3;
2910 2913
2911 error = -EINVAL; 2914 error = -EINVAL;
2912 new_mnt = real_mount(new.mnt); 2915 new_mnt = real_mount(new.mnt);
2913 root_mnt = real_mount(root.mnt); 2916 root_mnt = real_mount(root.mnt);
2914 old_mnt = real_mount(old.mnt); 2917 old_mnt = real_mount(old.mnt);
2915 if (IS_MNT_SHARED(old_mnt) || 2918 if (IS_MNT_SHARED(old_mnt) ||
2916 IS_MNT_SHARED(new_mnt->mnt_parent) || 2919 IS_MNT_SHARED(new_mnt->mnt_parent) ||
2917 IS_MNT_SHARED(root_mnt->mnt_parent)) 2920 IS_MNT_SHARED(root_mnt->mnt_parent))
2918 goto out4; 2921 goto out4;
2919 if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) 2922 if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
2920 goto out4; 2923 goto out4;
2921 if (new_mnt->mnt.mnt_flags & MNT_LOCKED) 2924 if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
2922 goto out4; 2925 goto out4;
2923 error = -ENOENT; 2926 error = -ENOENT;
2924 if (d_unlinked(new.dentry)) 2927 if (d_unlinked(new.dentry))
2925 goto out4; 2928 goto out4;
2926 error = -EBUSY; 2929 error = -EBUSY;
2927 if (new_mnt == root_mnt || old_mnt == root_mnt) 2930 if (new_mnt == root_mnt || old_mnt == root_mnt)
2928 goto out4; /* loop, on the same file system */ 2931 goto out4; /* loop, on the same file system */
2929 error = -EINVAL; 2932 error = -EINVAL;
2930 if (root.mnt->mnt_root != root.dentry) 2933 if (root.mnt->mnt_root != root.dentry)
2931 goto out4; /* not a mountpoint */ 2934 goto out4; /* not a mountpoint */
2932 if (!mnt_has_parent(root_mnt)) 2935 if (!mnt_has_parent(root_mnt))
2933 goto out4; /* not attached */ 2936 goto out4; /* not attached */
2934 root_mp = root_mnt->mnt_mp; 2937 root_mp = root_mnt->mnt_mp;
2935 if (new.mnt->mnt_root != new.dentry) 2938 if (new.mnt->mnt_root != new.dentry)
2936 goto out4; /* not a mountpoint */ 2939 goto out4; /* not a mountpoint */
2937 if (!mnt_has_parent(new_mnt)) 2940 if (!mnt_has_parent(new_mnt))
2938 goto out4; /* not attached */ 2941 goto out4; /* not attached */
2939 /* make sure we can reach put_old from new_root */ 2942 /* make sure we can reach put_old from new_root */
2940 if (!is_path_reachable(old_mnt, old.dentry, &new)) 2943 if (!is_path_reachable(old_mnt, old.dentry, &new))
2941 goto out4; 2944 goto out4;
2942 /* make certain new is below the root */ 2945 /* make certain new is below the root */
2943 if (!is_path_reachable(new_mnt, new.dentry, &root)) 2946 if (!is_path_reachable(new_mnt, new.dentry, &root))
2944 goto out4; 2947 goto out4;
2945 root_mp->m_count++; /* pin it so it won't go away */ 2948 root_mp->m_count++; /* pin it so it won't go away */
2946 lock_mount_hash(); 2949 lock_mount_hash();
2947 detach_mnt(new_mnt, &parent_path); 2950 detach_mnt(new_mnt, &parent_path);
2948 detach_mnt(root_mnt, &root_parent); 2951 detach_mnt(root_mnt, &root_parent);
2949 if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { 2952 if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
2950 new_mnt->mnt.mnt_flags |= MNT_LOCKED; 2953 new_mnt->mnt.mnt_flags |= MNT_LOCKED;
2951 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; 2954 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2952 } 2955 }
2953 /* mount old root on put_old */ 2956 /* mount old root on put_old */
2954 attach_mnt(root_mnt, old_mnt, old_mp); 2957 attach_mnt(root_mnt, old_mnt, old_mp);
2955 /* mount new_root on / */ 2958 /* mount new_root on / */
2956 attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); 2959 attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
2957 touch_mnt_namespace(current->nsproxy->mnt_ns); 2960 touch_mnt_namespace(current->nsproxy->mnt_ns);
2958 unlock_mount_hash(); 2961 unlock_mount_hash();
2959 chroot_fs_refs(&root, &new); 2962 chroot_fs_refs(&root, &new);
2960 put_mountpoint(root_mp); 2963 put_mountpoint(root_mp);
2961 error = 0; 2964 error = 0;
2962 out4: 2965 out4:
2963 unlock_mount(old_mp); 2966 unlock_mount(old_mp);
2964 if (!error) { 2967 if (!error) {
2965 path_put(&root_parent); 2968 path_put(&root_parent);
2966 path_put(&parent_path); 2969 path_put(&parent_path);
2967 } 2970 }
2968 out3: 2971 out3:
2969 path_put(&root); 2972 path_put(&root);
2970 out2: 2973 out2:
2971 path_put(&old); 2974 path_put(&old);
2972 out1: 2975 out1:
2973 path_put(&new); 2976 path_put(&new);
2974 out0: 2977 out0:
2975 return error; 2978 return error;
2976 } 2979 }
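The userspace sequence that satisfies the checks above: unshare the mount namespace, make new_root a mount point, and keep put_old underneath it. A hedged sketch (no glibc wrapper exists, so the raw syscall is used; paths are hypothetical, the directories are assumed to exist, and CAP_SYS_ADMIN is required):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <sys/mount.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        int main(void)
        {
                if (unshare(CLONE_NEWNS) == -1)
                        return perror("unshare"), 1;
                /* new_root must itself be a mount point; bind it over itself */
                if (mount("/tmp/newroot", "/tmp/newroot", NULL, MS_BIND, NULL) == -1)
                        return perror("bind"), 1;
                /* put_old must be reachable from new_root */
                if (syscall(SYS_pivot_root, "/tmp/newroot", "/tmp/newroot/old") == -1)
                        return perror("pivot_root"), 1;
                chdir("/");     /* step onto the new root */
                return 0;
        }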
2977 2980
2978 static void __init init_mount_tree(void) 2981 static void __init init_mount_tree(void)
2979 { 2982 {
2980 struct vfsmount *mnt; 2983 struct vfsmount *mnt;
2981 struct mnt_namespace *ns; 2984 struct mnt_namespace *ns;
2982 struct path root; 2985 struct path root;
2983 struct file_system_type *type; 2986 struct file_system_type *type;
2984 2987
2985 type = get_fs_type("rootfs"); 2988 type = get_fs_type("rootfs");
2986 if (!type) 2989 if (!type)
2987 panic("Can't find rootfs type"); 2990 panic("Can't find rootfs type");
2988 mnt = vfs_kern_mount(type, 0, "rootfs", NULL); 2991 mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
2989 put_filesystem(type); 2992 put_filesystem(type);
2990 if (IS_ERR(mnt)) 2993 if (IS_ERR(mnt))
2991 panic("Can't create rootfs"); 2994 panic("Can't create rootfs");
2992 2995
2993 ns = create_mnt_ns(mnt); 2996 ns = create_mnt_ns(mnt);
2994 if (IS_ERR(ns)) 2997 if (IS_ERR(ns))
2995 panic("Can't allocate initial namespace"); 2998 panic("Can't allocate initial namespace");
2996 2999
2997 init_task.nsproxy->mnt_ns = ns; 3000 init_task.nsproxy->mnt_ns = ns;
2998 get_mnt_ns(ns); 3001 get_mnt_ns(ns);
2999 3002
3000 root.mnt = mnt; 3003 root.mnt = mnt;
3001 root.dentry = mnt->mnt_root; 3004 root.dentry = mnt->mnt_root;
3002 3005
3003 set_fs_pwd(current->fs, &root); 3006 set_fs_pwd(current->fs, &root);
3004 set_fs_root(current->fs, &root); 3007 set_fs_root(current->fs, &root);
3005 } 3008 }
3006 3009
3007 void __init mnt_init(void) 3010 void __init mnt_init(void)
3008 { 3011 {
3009 unsigned u; 3012 unsigned u;
3010 int err; 3013 int err;
3011 3014
3012 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 3015 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
3013 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3016 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3014 3017
3015 mount_hashtable = alloc_large_system_hash("Mount-cache", 3018 mount_hashtable = alloc_large_system_hash("Mount-cache",
3016 sizeof(struct hlist_head), 3019 sizeof(struct hlist_head),
3017 mhash_entries, 19, 3020 mhash_entries, 19,
3018 0, 3021 0,
3019 &m_hash_shift, &m_hash_mask, 0, 0); 3022 &m_hash_shift, &m_hash_mask, 0, 0);
3020 mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", 3023 mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
3021 sizeof(struct hlist_head), 3024 sizeof(struct hlist_head),
3022 mphash_entries, 19, 3025 mphash_entries, 19,
3023 0, 3026 0,
3024 &mp_hash_shift, &mp_hash_mask, 0, 0); 3027 &mp_hash_shift, &mp_hash_mask, 0, 0);
3025 3028
3026 if (!mount_hashtable || !mountpoint_hashtable) 3029 if (!mount_hashtable || !mountpoint_hashtable)
3027 panic("Failed to allocate mount hash table\n"); 3030 panic("Failed to allocate mount hash table\n");
3028 3031
3029 for (u = 0; u <= m_hash_mask; u++) 3032 for (u = 0; u <= m_hash_mask; u++)
3030 INIT_HLIST_HEAD(&mount_hashtable[u]); 3033 INIT_HLIST_HEAD(&mount_hashtable[u]);
3031 for (u = 0; u <= mp_hash_mask; u++) 3034 for (u = 0; u <= mp_hash_mask; u++)
3032 INIT_HLIST_HEAD(&mountpoint_hashtable[u]); 3035 INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
3033 3036
3034 kernfs_init(); 3037 kernfs_init();
3035 3038
3036 err = sysfs_init(); 3039 err = sysfs_init();
3037 if (err) 3040 if (err)
3038 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 3041 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
3039 __func__, err); 3042 __func__, err);
3040 fs_kobj = kobject_create_and_add("fs", NULL); 3043 fs_kobj = kobject_create_and_add("fs", NULL);
3041 if (!fs_kobj) 3044 if (!fs_kobj)
3042 printk(KERN_WARNING "%s: kobj create error\n", __func__); 3045 printk(KERN_WARNING "%s: kobj create error\n", __func__);
3043 init_rootfs(); 3046 init_rootfs();
3044 init_mount_tree(); 3047 init_mount_tree();
3045 } 3048 }
3046 3049
3047 void put_mnt_ns(struct mnt_namespace *ns) 3050 void put_mnt_ns(struct mnt_namespace *ns)
3048 { 3051 {
3049 if (!atomic_dec_and_test(&ns->count)) 3052 if (!atomic_dec_and_test(&ns->count))
3050 return; 3053 return;
3051 drop_collected_mounts(&ns->root->mnt); 3054 drop_collected_mounts(&ns->root->mnt);
3052 free_mnt_ns(ns); 3055 free_mnt_ns(ns);
3053 } 3056 }
3054 3057
3055 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 3058 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
3056 { 3059 {
3057 struct vfsmount *mnt; 3060 struct vfsmount *mnt;
3058 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); 3061 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
3059 if (!IS_ERR(mnt)) { 3062 if (!IS_ERR(mnt)) {
3060 /* 3063 /*
3061 * it is a long-term mount; don't release mnt until 3064 * it is a long-term mount; don't release mnt until
3062 * we unmount, just before the filesystem is unregistered 3065 * we unmount, just before the filesystem is unregistered
3063 */ 3066 */
3064 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; 3067 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
3065 } 3068 }
3066 return mnt; 3069 return mnt;
3067 } 3070 }
3068 EXPORT_SYMBOL_GPL(kern_mount_data); 3071 EXPORT_SYMBOL_GPL(kern_mount_data);
3069 3072
3070 void kern_unmount(struct vfsmount *mnt) 3073 void kern_unmount(struct vfsmount *mnt)
3071 { 3074 {
3072 /* release long term mount so mount point can be released */ 3075 /* release long term mount so mount point can be released */
3073 if (!IS_ERR_OR_NULL(mnt)) { 3076 if (!IS_ERR_OR_NULL(mnt)) {
3074 real_mount(mnt)->mnt_ns = NULL; 3077 real_mount(mnt)->mnt_ns = NULL;
3075 synchronize_rcu(); /* yecchhh... */ 3078 synchronize_rcu(); /* yecchhh... */
3076 mntput(mnt); 3079 mntput(mnt);
3077 } 3080 }
3078 } 3081 }
3079 EXPORT_SYMBOL(kern_unmount); 3082 EXPORT_SYMBOL(kern_unmount);
3080 3083
3081 bool our_mnt(struct vfsmount *mnt) 3084 bool our_mnt(struct vfsmount *mnt)
3082 { 3085 {
3083 return check_mnt(real_mount(mnt)); 3086 return check_mnt(real_mount(mnt));
3084 } 3087 }
3085 3088
3086 bool current_chrooted(void) 3089 bool current_chrooted(void)
3087 { 3090 {
3088 /* Does the current process have a non-standard root */ 3091 /* Does the current process have a non-standard root */
3089 struct path ns_root; 3092 struct path ns_root;
3090 struct path fs_root; 3093 struct path fs_root;
3091 bool chrooted; 3094 bool chrooted;
3092 3095
3093 /* Find the namespace root */ 3096 /* Find the namespace root */
3094 ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; 3097 ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
3095 ns_root.dentry = ns_root.mnt->mnt_root; 3098 ns_root.dentry = ns_root.mnt->mnt_root;
3096 path_get(&ns_root); 3099 path_get(&ns_root);
3097 while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) 3100 while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
3098 ; 3101 ;
3099 3102
3100 get_fs_root(current->fs, &fs_root); 3103 get_fs_root(current->fs, &fs_root);
3101 3104
3102 chrooted = !path_equal(&fs_root, &ns_root); 3105 chrooted = !path_equal(&fs_root, &ns_root);
3103 3106
3104 path_put(&fs_root); 3107 path_put(&fs_root);
3105 path_put(&ns_root); 3108 path_put(&ns_root);
3106 3109
3107 return chrooted; 3110 return chrooted;
3108 } 3111 }
3109 3112
3110 bool fs_fully_visible(struct file_system_type *type) 3113 bool fs_fully_visible(struct file_system_type *type)
3111 { 3114 {
3112 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 3115 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
3113 struct mount *mnt; 3116 struct mount *mnt;
3114 bool visible = false; 3117 bool visible = false;
3115 3118
3116 if (unlikely(!ns)) 3119 if (unlikely(!ns))
3117 return false; 3120 return false;
3118 3121
3119 down_read(&namespace_sem); 3122 down_read(&namespace_sem);
3120 list_for_each_entry(mnt, &ns->list, mnt_list) { 3123 list_for_each_entry(mnt, &ns->list, mnt_list) {
3121 struct mount *child; 3124 struct mount *child;
3122 if (mnt->mnt.mnt_sb->s_type != type) 3125 if (mnt->mnt.mnt_sb->s_type != type)
3123 continue; 3126 continue;
3124 3127
3125 /* This mount is not fully visible if there are any child mounts 3128 /* This mount is not fully visible if there are any child mounts
3126 * that cover anything except for empty directories. 3129 * that cover anything except for empty directories.
3127 */ 3130 */
3128 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 3131 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
3129 struct inode *inode = child->mnt_mountpoint->d_inode; 3132 struct inode *inode = child->mnt_mountpoint->d_inode;
3130 if (!S_ISDIR(inode->i_mode)) 3133 if (!S_ISDIR(inode->i_mode))
3131 goto next; 3134 goto next;
3132 if (inode->i_nlink > 2) 3135 if (inode->i_nlink > 2)
3133 goto next; 3136 goto next;
3134 } 3137 }
3135 visible = true; 3138 visible = true;
3136 goto found; 3139 goto found;
3137 next: ; 3140 next: ;
3138 } 3141 }
3139 found: 3142 found:
3140 up_read(&namespace_sem); 3143 up_read(&namespace_sem);
3141 return visible; 3144 return visible;
3142 } 3145 }
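The i_nlink > 2 test leans on the classic directory link-count convention: an empty directory has exactly two links, "." and the entry in its parent, so a higher count implies subdirectories and therefore content hidden by the child mount. The same heuristic from userspace (hedged; filesystems that don't maintain per-subdirectory link counts defeat it):

        #include <stdio.h>
        #include <sys/stat.h>

        int main(void)
        {
                struct stat st;                 /* path is hypothetical */
                if (stat("/mnt/point", &st) == 0)
                        printf("possibly empty: %s\n",
                               st.st_nlink == 2 ? "yes" : "no");
                return 0;
        }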
3143 3146
3144 static struct ns_common *mntns_get(struct task_struct *task) 3147 static struct ns_common *mntns_get(struct task_struct *task)
3145 { 3148 {
3146 struct ns_common *ns = NULL; 3149 struct ns_common *ns = NULL;
3147 struct nsproxy *nsproxy; 3150 struct nsproxy *nsproxy;
3148 3151
3149 task_lock(task); 3152 task_lock(task);
3150 nsproxy = task->nsproxy; 3153 nsproxy = task->nsproxy;
3151 if (nsproxy) { 3154 if (nsproxy) {
3152 ns = &nsproxy->mnt_ns->ns; 3155 ns = &nsproxy->mnt_ns->ns;
3153 get_mnt_ns(to_mnt_ns(ns)); 3156 get_mnt_ns(to_mnt_ns(ns));
3154 } 3157 }
3155 task_unlock(task); 3158 task_unlock(task);
3156 3159
3157 return ns; 3160 return ns;
3158 } 3161 }
3159 3162
3160 static void mntns_put(struct ns_common *ns) 3163 static void mntns_put(struct ns_common *ns)
3161 { 3164 {
3162 put_mnt_ns(to_mnt_ns(ns)); 3165 put_mnt_ns(to_mnt_ns(ns));
3163 } 3166 }
3164 3167
3165 static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) 3168 static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
3166 { 3169 {
3167 struct fs_struct *fs = current->fs; 3170 struct fs_struct *fs = current->fs;
3168 struct mnt_namespace *mnt_ns = to_mnt_ns(ns); 3171 struct mnt_namespace *mnt_ns = to_mnt_ns(ns);
3169 struct path root; 3172 struct path root;
3170 3173
3171 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || 3174 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
3172 !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || 3175 !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
3173 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 3176 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
3174 return -EPERM; 3177 return -EPERM;
3175 3178
3176 if (fs->users != 1) 3179 if (fs->users != 1)
3177 return -EINVAL; 3180 return -EINVAL;
3178 3181
3179 get_mnt_ns(mnt_ns); 3182 get_mnt_ns(mnt_ns);
3180 put_mnt_ns(nsproxy->mnt_ns); 3183 put_mnt_ns(nsproxy->mnt_ns);
3181 nsproxy->mnt_ns = mnt_ns; 3184 nsproxy->mnt_ns = mnt_ns;
3182 3185
3183 /* Find the root */ 3186 /* Find the root */
3184 root.mnt = &mnt_ns->root->mnt; 3187 root.mnt = &mnt_ns->root->mnt;
3185 root.dentry = mnt_ns->root->mnt.mnt_root; 3188 root.dentry = mnt_ns->root->mnt.mnt_root;
3186 path_get(&root); 3189 path_get(&root);
3187 while (d_mountpoint(root.dentry) && follow_down_one(&root)) 3190 while (d_mountpoint(root.dentry) && follow_down_one(&root))
3188 ; 3191 ;
3189 3192
3190 /* Update the pwd and root */ 3193 /* Update the pwd and root */
3191 set_fs_pwd(fs, &root); 3194 set_fs_pwd(fs, &root);
3192 set_fs_root(fs, &root); 3195 set_fs_root(fs, &root);
3193 3196
3194 path_put(&root); 3197 path_put(&root);
3195 return 0; 3198 return 0;
3196 } 3199 }
3197 3200
3198 const struct proc_ns_operations mntns_operations = { 3201 const struct proc_ns_operations mntns_operations = {
3199 .name = "mnt", 3202 .name = "mnt",
3200 .type = CLONE_NEWNS, 3203 .type = CLONE_NEWNS,
3201 .get = mntns_get, 3204 .get = mntns_get,
3202 .put = mntns_put, 3205 .put = mntns_put,
3203 .install = mntns_install, 3206 .install = mntns_install,
3204 }; 3207 };
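mntns_install() is reached through setns(2) on a mount-namespace fd; the triple capability check and the fs->users == 1 test above are why a multithreaded process (which shares its fs_struct) cannot switch mount namespaces. A minimal, hedged userspace counterpart (argument checking omitted):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <sched.h>
        #include <stdio.h>

        int main(int argc, char **argv)
        {
                int fd = open(argv[1], O_RDONLY);       /* e.g. /proc/<pid>/ns/mnt */
                if (fd < 0)
                        return perror("open"), 1;
                if (setns(fd, CLONE_NEWNS) == -1)       /* lands in mntns_install() */
                        return perror("setns"), 1;
                return 0;
        }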
3205 3208
File was created 1 #include <linux/mount.h>
2 #include <linux/file.h>
3 #include <linux/fs.h>
4 #include <linux/proc_ns.h>
5 #include <linux/magic.h>
6 #include <linux/ktime.h>
7
8 static struct vfsmount *nsfs_mnt;
9
10 static const struct file_operations ns_file_operations = {
11 .llseek = no_llseek,
12 };
13
14 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
15 {
16 struct inode *inode = dentry->d_inode;
17 const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
18
19 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
20 ns_ops->name, inode->i_ino);
21 }
22
23 static void ns_prune_dentry(struct dentry *dentry)
24 {
25 struct inode *inode = dentry->d_inode;
26 if (inode) {
27 struct ns_common *ns = inode->i_private;
28 atomic_long_set(&ns->stashed, 0);
29 }
30 }
31
32 const struct dentry_operations ns_dentry_operations =
33 {
34 .d_prune = ns_prune_dentry,
35 .d_delete = always_delete_dentry,
36 .d_dname = ns_dname,
37 };
38
39 static void nsfs_evict(struct inode *inode)
40 {
41 struct ns_common *ns = inode->i_private;
42 clear_inode(inode);
43 ns->ops->put(ns);
44 }
45
46 void *ns_get_path(struct path *path, struct task_struct *task,
47 const struct proc_ns_operations *ns_ops)
48 {
49 struct vfsmount *mnt = mntget(nsfs_mnt);
50 struct qstr qname = { .name = "", };
51 struct dentry *dentry;
52 struct inode *inode;
53 struct ns_common *ns;
54 unsigned long d;
55
56 again:
57 ns = ns_ops->get(task);
58 if (!ns) {
59 mntput(mnt);
60 return ERR_PTR(-ENOENT);
61 }
62 rcu_read_lock();
63 d = atomic_long_read(&ns->stashed);
64 if (!d)
65 goto slow;
66 dentry = (struct dentry *)d;
67 if (!lockref_get_not_dead(&dentry->d_lockref))
68 goto slow;
69 rcu_read_unlock();
70 ns_ops->put(ns);
71 got_it:
72 path->mnt = mnt;
73 path->dentry = dentry;
74 return NULL;
75 slow:
76 rcu_read_unlock();
77 inode = new_inode_pseudo(mnt->mnt_sb);
78 if (!inode) {
79 ns_ops->put(ns);
80 mntput(mnt);
81 return ERR_PTR(-ENOMEM);
82 }
83 inode->i_ino = ns->inum;
84 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
85 inode->i_flags |= S_IMMUTABLE;
86 inode->i_mode = S_IFREG | S_IRUGO;
87 inode->i_fop = &ns_file_operations;
88 inode->i_private = ns;
89
90 dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
91 if (!dentry) {
92 iput(inode);
93 mntput(mnt);
94 return ERR_PTR(-ENOMEM);
95 }
96 d_instantiate(dentry, inode);
97 dentry->d_fsdata = (void *)ns_ops;
98 d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
99 if (d) {
100 d_delete(dentry); /* make sure ->d_prune() does nothing */
101 dput(dentry);
102 cpu_relax();
103 goto again;
104 }
105 goto got_it;
106 }
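ns->stashed is a one-slot lockless cache: the fast path grabs the stashed dentry under RCU with lockref_get_not_dead(), and the slow path publishes a freshly built dentry with cmpxchg, dropping its own copy and retrying if another CPU won the race. A hedged userspace analogue of the publish step, in C11 atomics:

        #include <stdatomic.h>

        static _Atomic(void *) slot;    /* plays the role of ns->stashed */

        /* Publish 'mine' unless something is already stashed; return whatever
         * ends up in the slot. A loser must drop its own copy, as the code
         * above does with d_delete()/dput() before looping to 'again'. */
        void *get_or_stash(void *mine)
        {
                void *expected = NULL;
                if (atomic_compare_exchange_strong(&slot, &expected, mine))
                        return mine;
                return expected;
        }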
107
108 int ns_get_name(char *buf, size_t size, struct task_struct *task,
109 const struct proc_ns_operations *ns_ops)
110 {
111 struct ns_common *ns;
112 int res = -ENOENT;
113 ns = ns_ops->get(task);
114 if (ns) {
115 res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
116 ns_ops->put(ns);
117 }
118 return res;
119 }
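ns_get_name() produces the "name:[inum]" text reported for the /proc/*/ns/* links, matching what ns_dname() generates for the nsfs dentries themselves. Observed from userspace:

        #include <limits.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[PATH_MAX];
                ssize_t n = readlink("/proc/self/ns/mnt", buf, sizeof(buf) - 1);
                if (n < 0)
                        return perror("readlink"), 1;
                buf[n] = '\0';
                printf("%s\n", buf);    /* e.g. "mnt:[4026531840]" */
                return 0;
        }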
120
121 struct file *proc_ns_fget(int fd)
122 {
123 struct file *file;
124
125 file = fget(fd);
126 if (!file)
127 return ERR_PTR(-EBADF);
128
129 if (file->f_op != &ns_file_operations)
130 goto out_invalid;
131
132 return file;
133
134 out_invalid:
135 fput(file);
136 return ERR_PTR(-EINVAL);
137 }
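proc_ns_fget() is the validation helper for setns(2): any fd whose file isn't backed by ns_file_operations is rejected before the per-type ->install() hook runs. A hedged kernel-context fragment of the caller side:

        struct file *file = proc_ns_fget(fd);
        if (IS_ERR(file))
                return PTR_ERR(file);   /* -EBADF or -EINVAL from above */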
138
139 static const struct super_operations nsfs_ops = {
140 .statfs = simple_statfs,
141 .evict_inode = nsfs_evict,
142 };
143 static struct dentry *nsfs_mount(struct file_system_type *fs_type,
144 int flags, const char *dev_name, void *data)
145 {
146 return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
147 &ns_dentry_operations, NSFS_MAGIC);
148 }
149 static struct file_system_type nsfs = {
150 .name = "nsfs",
151 .mount = nsfs_mount,
152 .kill_sb = kill_anon_super,
153 };
154
155 void __init nsfs_init(void)
156 {
157 nsfs_mnt = kern_mount(&nsfs);
158 if (IS_ERR(nsfs_mnt))
159 panic("can't set nsfs up\n");
160 nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
161 }
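Clearing MS_NOUSER is what makes these files attachable from userspace, so a namespace can be pinned by bind-mounting its nsfs file even after its last process exits. A hedged illustration (pid and paths hypothetical; the target file must already exist; needs privilege):

        #include <stdio.h>
        #include <sys/mount.h>

        int main(void)
        {
                if (mount("/proc/1234/ns/net", "/run/netns/pinned",
                          NULL, MS_BIND, NULL) == -1)
                        perror("mount");
                return 0;
        }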
162
1 /* 1 /*
2 * linux/fs/proc/inode.c 2 * linux/fs/proc/inode.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/time.h> 7 #include <linux/time.h>
8 #include <linux/proc_fs.h> 8 #include <linux/proc_fs.h>
9 #include <linux/kernel.h> 9 #include <linux/kernel.h>
10 #include <linux/pid_namespace.h> 10 #include <linux/pid_namespace.h>
11 #include <linux/mm.h> 11 #include <linux/mm.h>
12 #include <linux/string.h> 12 #include <linux/string.h>
13 #include <linux/stat.h> 13 #include <linux/stat.h>
14 #include <linux/completion.h> 14 #include <linux/completion.h>
15 #include <linux/poll.h> 15 #include <linux/poll.h>
16 #include <linux/printk.h> 16 #include <linux/printk.h>
17 #include <linux/file.h> 17 #include <linux/file.h>
18 #include <linux/limits.h> 18 #include <linux/limits.h>
19 #include <linux/init.h> 19 #include <linux/init.h>
20 #include <linux/module.h> 20 #include <linux/module.h>
21 #include <linux/sysctl.h> 21 #include <linux/sysctl.h>
22 #include <linux/seq_file.h> 22 #include <linux/seq_file.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/mount.h> 24 #include <linux/mount.h>
25 #include <linux/magic.h> 25 #include <linux/magic.h>
26 26
27 #include <asm/uaccess.h> 27 #include <asm/uaccess.h>
28 28
29 #include "internal.h" 29 #include "internal.h"
30 30
31 static void proc_evict_inode(struct inode *inode) 31 static void proc_evict_inode(struct inode *inode)
32 { 32 {
33 struct proc_dir_entry *de; 33 struct proc_dir_entry *de;
34 struct ctl_table_header *head; 34 struct ctl_table_header *head;
35 struct ns_common *ns;
36 35
37 truncate_inode_pages_final(&inode->i_data); 36 truncate_inode_pages_final(&inode->i_data);
38 clear_inode(inode); 37 clear_inode(inode);
39 38
40 /* Stop tracking associated processes */ 39 /* Stop tracking associated processes */
41 put_pid(PROC_I(inode)->pid); 40 put_pid(PROC_I(inode)->pid);
42 41
43 /* Let go of any associated proc directory entry */ 42 /* Let go of any associated proc directory entry */
44 de = PROC_I(inode)->pde; 43 de = PROC_I(inode)->pde;
45 if (de) 44 if (de)
46 pde_put(de); 45 pde_put(de);
47 head = PROC_I(inode)->sysctl; 46 head = PROC_I(inode)->sysctl;
48 if (head) { 47 if (head) {
49 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); 48 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
50 sysctl_head_put(head); 49 sysctl_head_put(head);
51 } 50 }
52 /* Release any associated namespace */
53 ns = PROC_I(inode)->ns.ns;
54 if (ns && ns->ops)
55 ns->ops->put(ns);
56 } 51 }
57 52
58 static struct kmem_cache * proc_inode_cachep; 53 static struct kmem_cache * proc_inode_cachep;
59 54
60 static struct inode *proc_alloc_inode(struct super_block *sb) 55 static struct inode *proc_alloc_inode(struct super_block *sb)
61 { 56 {
62 struct proc_inode *ei; 57 struct proc_inode *ei;
63 struct inode *inode; 58 struct inode *inode;
64 59
65 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); 60 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
66 if (!ei) 61 if (!ei)
67 return NULL; 62 return NULL;
68 ei->pid = NULL; 63 ei->pid = NULL;
69 ei->fd = 0; 64 ei->fd = 0;
70 ei->op.proc_get_link = NULL; 65 ei->op.proc_get_link = NULL;
71 ei->pde = NULL; 66 ei->pde = NULL;
72 ei->sysctl = NULL; 67 ei->sysctl = NULL;
73 ei->sysctl_entry = NULL; 68 ei->sysctl_entry = NULL;
74 ei->ns.ns = NULL; 69 ei->ns.ns = NULL;
75 ei->ns.ns_ops = NULL; 70 ei->ns.ns_ops = NULL;
76 inode = &ei->vfs_inode; 71 inode = &ei->vfs_inode;
77 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 72 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
78 return inode; 73 return inode;
79 } 74 }
80 75
81 static void proc_i_callback(struct rcu_head *head) 76 static void proc_i_callback(struct rcu_head *head)
82 { 77 {
83 struct inode *inode = container_of(head, struct inode, i_rcu); 78 struct inode *inode = container_of(head, struct inode, i_rcu);
84 kmem_cache_free(proc_inode_cachep, PROC_I(inode)); 79 kmem_cache_free(proc_inode_cachep, PROC_I(inode));
85 } 80 }
86 81
87 static void proc_destroy_inode(struct inode *inode) 82 static void proc_destroy_inode(struct inode *inode)
88 { 83 {
89 call_rcu(&inode->i_rcu, proc_i_callback); 84 call_rcu(&inode->i_rcu, proc_i_callback);
90 } 85 }
91 86
92 static void init_once(void *foo) 87 static void init_once(void *foo)
93 { 88 {
94 struct proc_inode *ei = (struct proc_inode *) foo; 89 struct proc_inode *ei = (struct proc_inode *) foo;
95 90
96 inode_init_once(&ei->vfs_inode); 91 inode_init_once(&ei->vfs_inode);
97 } 92 }
98 93
99 void __init proc_init_inodecache(void) 94 void __init proc_init_inodecache(void)
100 { 95 {
101 proc_inode_cachep = kmem_cache_create("proc_inode_cache", 96 proc_inode_cachep = kmem_cache_create("proc_inode_cache",
102 sizeof(struct proc_inode), 97 sizeof(struct proc_inode),
103 0, (SLAB_RECLAIM_ACCOUNT| 98 0, (SLAB_RECLAIM_ACCOUNT|
104 SLAB_MEM_SPREAD|SLAB_PANIC), 99 SLAB_MEM_SPREAD|SLAB_PANIC),
105 init_once); 100 init_once);
106 } 101 }
107 102
108 static int proc_show_options(struct seq_file *seq, struct dentry *root) 103 static int proc_show_options(struct seq_file *seq, struct dentry *root)
109 { 104 {
110 struct super_block *sb = root->d_sb; 105 struct super_block *sb = root->d_sb;
111 struct pid_namespace *pid = sb->s_fs_info; 106 struct pid_namespace *pid = sb->s_fs_info;
112 107
113 if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) 108 if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
114 seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); 109 seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
115 if (pid->hide_pid != 0) 110 if (pid->hide_pid != 0)
116 seq_printf(seq, ",hidepid=%u", pid->hide_pid); 111 seq_printf(seq, ",hidepid=%u", pid->hide_pid);
117 112
118 return 0; 113 return 0;
119 } 114 }
120 115
121 static const struct super_operations proc_sops = { 116 static const struct super_operations proc_sops = {
122 .alloc_inode = proc_alloc_inode, 117 .alloc_inode = proc_alloc_inode,
123 .destroy_inode = proc_destroy_inode, 118 .destroy_inode = proc_destroy_inode,
124 .drop_inode = generic_delete_inode, 119 .drop_inode = generic_delete_inode,
125 .evict_inode = proc_evict_inode, 120 .evict_inode = proc_evict_inode,
126 .statfs = simple_statfs, 121 .statfs = simple_statfs,
127 .remount_fs = proc_remount, 122 .remount_fs = proc_remount,
128 .show_options = proc_show_options, 123 .show_options = proc_show_options,
129 }; 124 };
130 125
131 enum {BIAS = -1U<<31}; 126 enum {BIAS = -1U<<31};
132 127
133 static inline int use_pde(struct proc_dir_entry *pde) 128 static inline int use_pde(struct proc_dir_entry *pde)
134 { 129 {
135 return atomic_inc_unless_negative(&pde->in_use); 130 return atomic_inc_unless_negative(&pde->in_use);
136 } 131 }
137 132
138 static void unuse_pde(struct proc_dir_entry *pde) 133 static void unuse_pde(struct proc_dir_entry *pde)
139 { 134 {
140 if (atomic_dec_return(&pde->in_use) == BIAS) 135 if (atomic_dec_return(&pde->in_use) == BIAS)
141 complete(pde->pde_unload_completion); 136 complete(pde->pde_unload_completion);
142 } 137 }
143 138
144 /* pde is locked */ 139 /* pde is locked */
145 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) 140 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
146 { 141 {
147 if (pdeo->closing) { 142 if (pdeo->closing) {
148 /* somebody else is doing that, just wait */ 143 /* somebody else is doing that, just wait */
149 DECLARE_COMPLETION_ONSTACK(c); 144 DECLARE_COMPLETION_ONSTACK(c);
150 pdeo->c = &c; 145 pdeo->c = &c;
151 spin_unlock(&pde->pde_unload_lock); 146 spin_unlock(&pde->pde_unload_lock);
152 wait_for_completion(&c); 147 wait_for_completion(&c);
153 spin_lock(&pde->pde_unload_lock); 148 spin_lock(&pde->pde_unload_lock);
154 } else { 149 } else {
155 struct file *file; 150 struct file *file;
156 pdeo->closing = 1; 151 pdeo->closing = 1;
157 spin_unlock(&pde->pde_unload_lock); 152 spin_unlock(&pde->pde_unload_lock);
158 file = pdeo->file; 153 file = pdeo->file;
159 pde->proc_fops->release(file_inode(file), file); 154 pde->proc_fops->release(file_inode(file), file);
160 spin_lock(&pde->pde_unload_lock); 155 spin_lock(&pde->pde_unload_lock);
161 list_del_init(&pdeo->lh); 156 list_del_init(&pdeo->lh);
162 if (pdeo->c) 157 if (pdeo->c)
163 complete(pdeo->c); 158 complete(pdeo->c);
164 kfree(pdeo); 159 kfree(pdeo);
165 } 160 }
166 } 161 }
167 162
168 void proc_entry_rundown(struct proc_dir_entry *de) 163 void proc_entry_rundown(struct proc_dir_entry *de)
169 { 164 {
170 DECLARE_COMPLETION_ONSTACK(c); 165 DECLARE_COMPLETION_ONSTACK(c);
171 /* Wait until all existing callers into module are done. */ 166 /* Wait until all existing callers into module are done. */
172 de->pde_unload_completion = &c; 167 de->pde_unload_completion = &c;
173 if (atomic_add_return(BIAS, &de->in_use) != BIAS) 168 if (atomic_add_return(BIAS, &de->in_use) != BIAS)
174 wait_for_completion(&c); 169 wait_for_completion(&c);
175 170
176 spin_lock(&de->pde_unload_lock); 171 spin_lock(&de->pde_unload_lock);
177 while (!list_empty(&de->pde_openers)) { 172 while (!list_empty(&de->pde_openers)) {
178 struct pde_opener *pdeo; 173 struct pde_opener *pdeo;
179 pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); 174 pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
180 close_pdeo(de, pdeo); 175 close_pdeo(de, pdeo);
181 } 176 }
182 spin_unlock(&de->pde_unload_lock); 177 spin_unlock(&de->pde_unload_lock);
183 } 178 }
184 179
185 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) 180 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
186 { 181 {
187 struct proc_dir_entry *pde = PDE(file_inode(file)); 182 struct proc_dir_entry *pde = PDE(file_inode(file));
188 loff_t rv = -EINVAL; 183 loff_t rv = -EINVAL;
189 if (use_pde(pde)) { 184 if (use_pde(pde)) {
190 loff_t (*llseek)(struct file *, loff_t, int); 185 loff_t (*llseek)(struct file *, loff_t, int);
191 llseek = pde->proc_fops->llseek; 186 llseek = pde->proc_fops->llseek;
192 if (!llseek) 187 if (!llseek)
193 llseek = default_llseek; 188 llseek = default_llseek;
194 rv = llseek(file, offset, whence); 189 rv = llseek(file, offset, whence);
195 unuse_pde(pde); 190 unuse_pde(pde);
196 } 191 }
197 return rv; 192 return rv;
198 } 193 }
199 194
200 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 195 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
201 { 196 {
202 ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); 197 ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
203 struct proc_dir_entry *pde = PDE(file_inode(file)); 198 struct proc_dir_entry *pde = PDE(file_inode(file));
204 ssize_t rv = -EIO; 199 ssize_t rv = -EIO;
205 if (use_pde(pde)) { 200 if (use_pde(pde)) {
206 read = pde->proc_fops->read; 201 read = pde->proc_fops->read;
207 if (read) 202 if (read)
208 rv = read(file, buf, count, ppos); 203 rv = read(file, buf, count, ppos);
209 unuse_pde(pde); 204 unuse_pde(pde);
210 } 205 }
211 return rv; 206 return rv;
212 } 207 }
213 208
214 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 209 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
215 { 210 {
216 ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); 211 ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
217 struct proc_dir_entry *pde = PDE(file_inode(file)); 212 struct proc_dir_entry *pde = PDE(file_inode(file));
218 ssize_t rv = -EIO; 213 ssize_t rv = -EIO;
219 if (use_pde(pde)) { 214 if (use_pde(pde)) {
220 write = pde->proc_fops->write; 215 write = pde->proc_fops->write;
221 if (write) 216 if (write)
222 rv = write(file, buf, count, ppos); 217 rv = write(file, buf, count, ppos);
223 unuse_pde(pde); 218 unuse_pde(pde);
224 } 219 }
225 return rv; 220 return rv;
226 } 221 }
227 222
228 static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) 223 static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
229 { 224 {
230 struct proc_dir_entry *pde = PDE(file_inode(file)); 225 struct proc_dir_entry *pde = PDE(file_inode(file));
231 unsigned int rv = DEFAULT_POLLMASK; 226 unsigned int rv = DEFAULT_POLLMASK;
232 unsigned int (*poll)(struct file *, struct poll_table_struct *); 227 unsigned int (*poll)(struct file *, struct poll_table_struct *);
233 if (use_pde(pde)) { 228 if (use_pde(pde)) {
234 poll = pde->proc_fops->poll; 229 poll = pde->proc_fops->poll;
235 if (poll) 230 if (poll)
236 rv = poll(file, pts); 231 rv = poll(file, pts);
237 unuse_pde(pde); 232 unuse_pde(pde);
238 } 233 }
239 return rv; 234 return rv;
240 } 235 }
241 236
242 static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 237 static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
243 { 238 {
244 struct proc_dir_entry *pde = PDE(file_inode(file)); 239 struct proc_dir_entry *pde = PDE(file_inode(file));
245 long rv = -ENOTTY; 240 long rv = -ENOTTY;
246 long (*ioctl)(struct file *, unsigned int, unsigned long); 241 long (*ioctl)(struct file *, unsigned int, unsigned long);
247 if (use_pde(pde)) { 242 if (use_pde(pde)) {
248 ioctl = pde->proc_fops->unlocked_ioctl; 243 ioctl = pde->proc_fops->unlocked_ioctl;
249 if (ioctl) 244 if (ioctl)
250 rv = ioctl(file, cmd, arg); 245 rv = ioctl(file, cmd, arg);
251 unuse_pde(pde); 246 unuse_pde(pde);
252 } 247 }
253 return rv; 248 return rv;
254 } 249 }
255 250
256 #ifdef CONFIG_COMPAT 251 #ifdef CONFIG_COMPAT
257 static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 252 static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
258 { 253 {
259 struct proc_dir_entry *pde = PDE(file_inode(file)); 254 struct proc_dir_entry *pde = PDE(file_inode(file));
260 long rv = -ENOTTY; 255 long rv = -ENOTTY;
261 long (*compat_ioctl)(struct file *, unsigned int, unsigned long); 256 long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
262 if (use_pde(pde)) { 257 if (use_pde(pde)) {
263 compat_ioctl = pde->proc_fops->compat_ioctl; 258 compat_ioctl = pde->proc_fops->compat_ioctl;
264 if (compat_ioctl) 259 if (compat_ioctl)
265 rv = compat_ioctl(file, cmd, arg); 260 rv = compat_ioctl(file, cmd, arg);
266 unuse_pde(pde); 261 unuse_pde(pde);
267 } 262 }
268 return rv; 263 return rv;
269 } 264 }
270 #endif 265 #endif
271 266
272 static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) 267 static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
273 { 268 {
274 struct proc_dir_entry *pde = PDE(file_inode(file)); 269 struct proc_dir_entry *pde = PDE(file_inode(file));
275 int rv = -EIO; 270 int rv = -EIO;
276 int (*mmap)(struct file *, struct vm_area_struct *); 271 int (*mmap)(struct file *, struct vm_area_struct *);
277 if (use_pde(pde)) { 272 if (use_pde(pde)) {
278 mmap = pde->proc_fops->mmap; 273 mmap = pde->proc_fops->mmap;
279 if (mmap) 274 if (mmap)
280 rv = mmap(file, vma); 275 rv = mmap(file, vma);
281 unuse_pde(pde); 276 unuse_pde(pde);
282 } 277 }
283 return rv; 278 return rv;
284 } 279 }
285 280
286 static unsigned long 281 static unsigned long
287 proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, 282 proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
288 unsigned long len, unsigned long pgoff, 283 unsigned long len, unsigned long pgoff,
289 unsigned long flags) 284 unsigned long flags)
290 { 285 {
291 struct proc_dir_entry *pde = PDE(file_inode(file)); 286 struct proc_dir_entry *pde = PDE(file_inode(file));
292 unsigned long rv = -EIO; 287 unsigned long rv = -EIO;
293 288
294 if (use_pde(pde)) { 289 if (use_pde(pde)) {
295 typeof(proc_reg_get_unmapped_area) *get_area; 290 typeof(proc_reg_get_unmapped_area) *get_area;
296 291
297 get_area = pde->proc_fops->get_unmapped_area; 292 get_area = pde->proc_fops->get_unmapped_area;
298 #ifdef CONFIG_MMU 293 #ifdef CONFIG_MMU
299 if (!get_area) 294 if (!get_area)
300 get_area = current->mm->get_unmapped_area; 295 get_area = current->mm->get_unmapped_area;
301 #endif 296 #endif
302 297
303 if (get_area) 298 if (get_area)
304 rv = get_area(file, orig_addr, len, pgoff, flags); 299 rv = get_area(file, orig_addr, len, pgoff, flags);
305 else 300 else
306 rv = orig_addr; 301 rv = orig_addr;
307 unuse_pde(pde); 302 unuse_pde(pde);
308 } 303 }
309 return rv; 304 return rv;
310 } 305 }
311 306
312 static int proc_reg_open(struct inode *inode, struct file *file) 307 static int proc_reg_open(struct inode *inode, struct file *file)
313 { 308 {
314 struct proc_dir_entry *pde = PDE(inode); 309 struct proc_dir_entry *pde = PDE(inode);
315 int rv = 0; 310 int rv = 0;
316 int (*open)(struct inode *, struct file *); 311 int (*open)(struct inode *, struct file *);
317 int (*release)(struct inode *, struct file *); 312 int (*release)(struct inode *, struct file *);
318 struct pde_opener *pdeo; 313 struct pde_opener *pdeo;
319 314
320 /* 315 /*
321 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry 316 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
322 * sequence. ->release won't be called because ->proc_fops will be 317 * sequence. ->release won't be called because ->proc_fops will be
323 * cleared. Depending on complexity of ->release, consequences vary. 318 * cleared. Depending on complexity of ->release, consequences vary.
324 * 319 *
325 * We can't wait for mercy when close will be done for real, it's 320 * We can't wait for mercy when close will be done for real, it's
326 * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release 321 * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
327 * by hand in remove_proc_entry(). For this, save the opener's file 322 * by hand in remove_proc_entry(). For this, save the opener's file
328 * for later. 323 * for later.
329 */ 324 */
330 pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); 325 pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
331 if (!pdeo) 326 if (!pdeo)
332 return -ENOMEM; 327 return -ENOMEM;
333 328
334 if (!use_pde(pde)) { 329 if (!use_pde(pde)) {
335 kfree(pdeo); 330 kfree(pdeo);
336 return -ENOENT; 331 return -ENOENT;
337 } 332 }
338 open = pde->proc_fops->open; 333 open = pde->proc_fops->open;
339 release = pde->proc_fops->release; 334 release = pde->proc_fops->release;
340 335
341 if (open) 336 if (open)
342 rv = open(inode, file); 337 rv = open(inode, file);
343 338
344 if (rv == 0 && release) { 339 if (rv == 0 && release) {
345 /* To know what to release. */ 340 /* To know what to release. */
346 pdeo->file = file; 341 pdeo->file = file;
347 /* Strictly for "too late" ->release in proc_reg_release(). */ 342 /* Strictly for "too late" ->release in proc_reg_release(). */
348 spin_lock(&pde->pde_unload_lock); 343 spin_lock(&pde->pde_unload_lock);
349 list_add(&pdeo->lh, &pde->pde_openers); 344 list_add(&pdeo->lh, &pde->pde_openers);
350 spin_unlock(&pde->pde_unload_lock); 345 spin_unlock(&pde->pde_unload_lock);
351 } else 346 } else
352 kfree(pdeo); 347 kfree(pdeo);
353 348
354 unuse_pde(pde); 349 unuse_pde(pde);
355 return rv; 350 return rv;
356 } 351 }
357 352
358 static int proc_reg_release(struct inode *inode, struct file *file) 353 static int proc_reg_release(struct inode *inode, struct file *file)
359 { 354 {
360 struct proc_dir_entry *pde = PDE(inode); 355 struct proc_dir_entry *pde = PDE(inode);
361 struct pde_opener *pdeo; 356 struct pde_opener *pdeo;
362 spin_lock(&pde->pde_unload_lock); 357 spin_lock(&pde->pde_unload_lock);
363 list_for_each_entry(pdeo, &pde->pde_openers, lh) { 358 list_for_each_entry(pdeo, &pde->pde_openers, lh) {
364 if (pdeo->file == file) { 359 if (pdeo->file == file) {
365 close_pdeo(pde, pdeo); 360 close_pdeo(pde, pdeo);
366 break; 361 break;
367 } 362 }
368 } 363 }
369 spin_unlock(&pde->pde_unload_lock); 364 spin_unlock(&pde->pde_unload_lock);
370 return 0; 365 return 0;
371 } 366 }
372 367
373 static const struct file_operations proc_reg_file_ops = { 368 static const struct file_operations proc_reg_file_ops = {
374 .llseek = proc_reg_llseek, 369 .llseek = proc_reg_llseek,
375 .read = proc_reg_read, 370 .read = proc_reg_read,
376 .write = proc_reg_write, 371 .write = proc_reg_write,
377 .poll = proc_reg_poll, 372 .poll = proc_reg_poll,
378 .unlocked_ioctl = proc_reg_unlocked_ioctl, 373 .unlocked_ioctl = proc_reg_unlocked_ioctl,
379 #ifdef CONFIG_COMPAT 374 #ifdef CONFIG_COMPAT
380 .compat_ioctl = proc_reg_compat_ioctl, 375 .compat_ioctl = proc_reg_compat_ioctl,
381 #endif 376 #endif
382 .mmap = proc_reg_mmap, 377 .mmap = proc_reg_mmap,
383 .get_unmapped_area = proc_reg_get_unmapped_area, 378 .get_unmapped_area = proc_reg_get_unmapped_area,
384 .open = proc_reg_open, 379 .open = proc_reg_open,
385 .release = proc_reg_release, 380 .release = proc_reg_release,
386 }; 381 };
387 382
388 #ifdef CONFIG_COMPAT 383 #ifdef CONFIG_COMPAT
389 static const struct file_operations proc_reg_file_ops_no_compat = { 384 static const struct file_operations proc_reg_file_ops_no_compat = {
390 .llseek = proc_reg_llseek, 385 .llseek = proc_reg_llseek,
391 .read = proc_reg_read, 386 .read = proc_reg_read,
392 .write = proc_reg_write, 387 .write = proc_reg_write,
393 .poll = proc_reg_poll, 388 .poll = proc_reg_poll,
394 .unlocked_ioctl = proc_reg_unlocked_ioctl, 389 .unlocked_ioctl = proc_reg_unlocked_ioctl,
395 .mmap = proc_reg_mmap, 390 .mmap = proc_reg_mmap,
396 .get_unmapped_area = proc_reg_get_unmapped_area, 391 .get_unmapped_area = proc_reg_get_unmapped_area,
397 .open = proc_reg_open, 392 .open = proc_reg_open,
398 .release = proc_reg_release, 393 .release = proc_reg_release,
399 }; 394 };
400 #endif 395 #endif
401 396
402 struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) 397 struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
403 { 398 {
404 struct inode *inode = new_inode_pseudo(sb); 399 struct inode *inode = new_inode_pseudo(sb);
405 400
406 if (inode) { 401 if (inode) {
407 inode->i_ino = de->low_ino; 402 inode->i_ino = de->low_ino;
408 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 403 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
409 PROC_I(inode)->pde = de; 404 PROC_I(inode)->pde = de;
410 405
411 if (de->mode) { 406 if (de->mode) {
412 inode->i_mode = de->mode; 407 inode->i_mode = de->mode;
413 inode->i_uid = de->uid; 408 inode->i_uid = de->uid;
414 inode->i_gid = de->gid; 409 inode->i_gid = de->gid;
415 } 410 }
416 if (de->size) 411 if (de->size)
417 inode->i_size = de->size; 412 inode->i_size = de->size;
418 if (de->nlink) 413 if (de->nlink)
419 set_nlink(inode, de->nlink); 414 set_nlink(inode, de->nlink);
420 WARN_ON(!de->proc_iops); 415 WARN_ON(!de->proc_iops);
421 inode->i_op = de->proc_iops; 416 inode->i_op = de->proc_iops;
422 if (de->proc_fops) { 417 if (de->proc_fops) {
423 if (S_ISREG(inode->i_mode)) { 418 if (S_ISREG(inode->i_mode)) {
424 #ifdef CONFIG_COMPAT 419 #ifdef CONFIG_COMPAT
425 if (!de->proc_fops->compat_ioctl) 420 if (!de->proc_fops->compat_ioctl)
426 inode->i_fop = 421 inode->i_fop =
427 &proc_reg_file_ops_no_compat; 422 &proc_reg_file_ops_no_compat;
428 else 423 else
429 #endif 424 #endif
430 inode->i_fop = &proc_reg_file_ops; 425 inode->i_fop = &proc_reg_file_ops;
431 } else { 426 } else {
432 inode->i_fop = de->proc_fops; 427 inode->i_fop = de->proc_fops;
433 } 428 }
434 } 429 }
435 } else 430 } else
436 pde_put(de); 431 pde_put(de);
437 return inode; 432 return inode;
438 } 433 }
439 434
440 int proc_fill_super(struct super_block *s) 435 int proc_fill_super(struct super_block *s)
441 { 436 {
442 struct inode *root_inode; 437 struct inode *root_inode;
443 int ret; 438 int ret;
444 439
445 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; 440 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
446 s->s_blocksize = 1024; 441 s->s_blocksize = 1024;
447 s->s_blocksize_bits = 10; 442 s->s_blocksize_bits = 10;
448 s->s_magic = PROC_SUPER_MAGIC; 443 s->s_magic = PROC_SUPER_MAGIC;
449 s->s_op = &proc_sops; 444 s->s_op = &proc_sops;
450 s->s_time_gran = 1; 445 s->s_time_gran = 1;
451 446
452 pde_get(&proc_root); 447 pde_get(&proc_root);
453 root_inode = proc_get_inode(s, &proc_root); 448 root_inode = proc_get_inode(s, &proc_root);
454 if (!root_inode) { 449 if (!root_inode) {
455 pr_err("proc_fill_super: get root inode failed\n"); 450 pr_err("proc_fill_super: get root inode failed\n");
456 return -ENOMEM; 451 return -ENOMEM;
457 } 452 }
458 453
459 s->s_root = d_make_root(root_inode); 454 s->s_root = d_make_root(root_inode);
460 if (!s->s_root) { 455 if (!s->s_root) {
461 pr_err("proc_fill_super: allocate dentry failed\n"); 456 pr_err("proc_fill_super: allocate dentry failed\n");
462 return -ENOMEM; 457 return -ENOMEM;
463 } 458 }
464 459
465 ret = proc_setup_self(s); 460 ret = proc_setup_self(s);
466 if (ret) { 461 if (ret) {
467 return ret; 462 return ret;
468 } 463 }
469 return proc_setup_thread_self(s); 464 return proc_setup_thread_self(s);
470 } 465 }
471 466
fs/proc/namespaces.c
1 #include <linux/proc_fs.h> 1 #include <linux/proc_fs.h>
2 #include <linux/nsproxy.h> 2 #include <linux/nsproxy.h>
3 #include <linux/sched.h>
4 #include <linux/ptrace.h> 3 #include <linux/ptrace.h>
5 #include <linux/fs_struct.h>
6 #include <linux/mount.h>
7 #include <linux/path.h>
8 #include <linux/namei.h> 4 #include <linux/namei.h>
9 #include <linux/file.h> 5 #include <linux/file.h>
10 #include <linux/utsname.h> 6 #include <linux/utsname.h>
11 #include <net/net_namespace.h> 7 #include <net/net_namespace.h>
12 #include <linux/ipc_namespace.h> 8 #include <linux/ipc_namespace.h>
13 #include <linux/pid_namespace.h> 9 #include <linux/pid_namespace.h>
14 #include <linux/user_namespace.h> 10 #include <linux/user_namespace.h>
15 #include "internal.h" 11 #include "internal.h"
16 12
17 13
18 static const struct proc_ns_operations *ns_entries[] = { 14 static const struct proc_ns_operations *ns_entries[] = {
19 #ifdef CONFIG_NET_NS 15 #ifdef CONFIG_NET_NS
20 &netns_operations, 16 &netns_operations,
21 #endif 17 #endif
22 #ifdef CONFIG_UTS_NS 18 #ifdef CONFIG_UTS_NS
23 &utsns_operations, 19 &utsns_operations,
24 #endif 20 #endif
25 #ifdef CONFIG_IPC_NS 21 #ifdef CONFIG_IPC_NS
26 &ipcns_operations, 22 &ipcns_operations,
27 #endif 23 #endif
28 #ifdef CONFIG_PID_NS 24 #ifdef CONFIG_PID_NS
29 &pidns_operations, 25 &pidns_operations,
30 #endif 26 #endif
31 #ifdef CONFIG_USER_NS 27 #ifdef CONFIG_USER_NS
32 &userns_operations, 28 &userns_operations,
33 #endif 29 #endif
34 &mntns_operations, 30 &mntns_operations,
35 }; 31 };
36 32
37 static const struct file_operations ns_file_operations = {
38 .llseek = no_llseek,
39 };
40
41 static const struct inode_operations ns_inode_operations = {
42 .setattr = proc_setattr,
43 };
44
45 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
46 {
47 struct inode *inode = dentry->d_inode;
48 const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
49
50 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
51 ns_ops->name, inode->i_ino);
52 }
53
54 const struct dentry_operations ns_dentry_operations =
55 {
56 .d_delete = always_delete_dentry,
57 .d_dname = ns_dname,
58 };
59
60 static struct dentry *proc_ns_get_dentry(struct super_block *sb,
61 struct task_struct *task, const struct proc_ns_operations *ns_ops)
62 {
63 struct dentry *dentry, *result;
64 struct inode *inode;
65 struct proc_inode *ei;
66 struct qstr qname = { .name = "", };
67 struct ns_common *ns;
68
69 ns = ns_ops->get(task);
70 if (!ns)
71 return ERR_PTR(-ENOENT);
72
73 dentry = d_alloc_pseudo(sb, &qname);
74 if (!dentry) {
75 ns_ops->put(ns);
76 return ERR_PTR(-ENOMEM);
77 }
78 dentry->d_fsdata = (void *)ns_ops;
79
80 inode = iget_locked(sb, ns->inum);
81 if (!inode) {
82 dput(dentry);
83 ns_ops->put(ns);
84 return ERR_PTR(-ENOMEM);
85 }
86
87 ei = PROC_I(inode);
88 if (inode->i_state & I_NEW) {
89 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
90 inode->i_op = &ns_inode_operations;
91 inode->i_mode = S_IFREG | S_IRUGO;
92 inode->i_fop = &ns_file_operations;
93 ei->ns.ns_ops = ns_ops;
94 ei->ns.ns = ns;
95 unlock_new_inode(inode);
96 } else {
97 ns_ops->put(ns);
98 }
99
100 d_set_d_op(dentry, &ns_dentry_operations);
101 result = d_instantiate_unique(dentry, inode);
102 if (result) {
103 dput(dentry);
104 dentry = result;
105 }
106
107 return dentry;
108 }
109
110 static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) 33 static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
111 { 34 {
112 struct inode *inode = dentry->d_inode; 35 struct inode *inode = dentry->d_inode;
113 struct super_block *sb = inode->i_sb; 36 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
114 struct proc_inode *ei = PROC_I(inode);
115 struct task_struct *task; 37 struct task_struct *task;
116 struct path ns_path; 38 struct path ns_path;
117 void *error = ERR_PTR(-EACCES); 39 void *error = ERR_PTR(-EACCES);
118 40
119 task = get_proc_task(inode); 41 task = get_proc_task(inode);
120 if (!task) 42 if (!task)
121 goto out; 43 return error;
122 44
123 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 45 if (ptrace_may_access(task, PTRACE_MODE_READ)) {
124 goto out_put_task; 46 error = ns_get_path(&ns_path, task, ns_ops);
125 47 if (!error)
126 ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); 48 nd_jump_link(nd, &ns_path);
127 if (IS_ERR(ns_path.dentry)) {
128 error = ERR_CAST(ns_path.dentry);
129 goto out_put_task;
130 } 49 }
131
132 ns_path.mnt = mntget(nd->path.mnt);
133 nd_jump_link(nd, &ns_path);
134 error = NULL;
135
136 out_put_task:
137 put_task_struct(task); 50 put_task_struct(task);
138 out:
139 return error; 51 return error;
140 } 52 }
141 53
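The rewritten proc_ns_follow_link() above no longer fabricates a procfs dentry and borrows nd->path.mnt; it asks ns_get_path() for a complete <vfsmount, dentry> pair and jumps there. Inferred from this call site alone (the implementation lives in fs/nsfs.c, outside the hunks on this page), the contract is roughly:

    /* Inferred, not quoted from fs/nsfs.c: fills *path with counted
     * mnt + dentry references on success (consumed by nd_jump_link())
     * and returns NULL then, or an ERR_PTR() on failure. */
    void *ns_get_path(struct path *path, struct task_struct *task,
                      const struct proc_ns_operations *ns_ops);
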
142 static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) 54 static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
143 { 55 {
144 struct inode *inode = dentry->d_inode; 56 struct inode *inode = dentry->d_inode;
145 struct proc_inode *ei = PROC_I(inode); 57 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
146 const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
147 struct task_struct *task; 58 struct task_struct *task;
148 struct ns_common *ns;
149 char name[50]; 59 char name[50];
150 int res = -EACCES; 60 int res = -EACCES;
151 61
152 task = get_proc_task(inode); 62 task = get_proc_task(inode);
153 if (!task) 63 if (!task)
154 goto out; 64 return res;
155 65
156 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 66 if (ptrace_may_access(task, PTRACE_MODE_READ)) {
157 goto out_put_task; 67 res = ns_get_name(name, sizeof(name), task, ns_ops);
158 68 if (res >= 0)
159 res = -ENOENT; 69 res = readlink_copy(buffer, buflen, name);
160 ns = ns_ops->get(task); 70 }
161 if (!ns)
162 goto out_put_task;
163
164 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns->inum);
165 res = readlink_copy(buffer, buflen, name);
166 ns_ops->put(ns);
167 out_put_task:
168 put_task_struct(task); 71 put_task_struct(task);
169 out:
170 return res; 72 return res;
171 } 73 }
172 74
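The readlink path now delegates name generation to ns_get_name() instead of the open-coded ns_ops->get()/snprintf() pair it replaces; the format visible to userspace stays "name:[inum]". An illustrative userspace check (the inode number in the comment is made up):

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[64];
            ssize_t n = readlink("/proc/self/ns/uts", buf, sizeof(buf) - 1);

            if (n < 0)
                    return 1;
            buf[n] = '\0';
            printf("%s\n", buf);    /* e.g. "uts:[4026531838]" */
            return 0;
    }
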
173 static const struct inode_operations proc_ns_link_inode_operations = { 75 static const struct inode_operations proc_ns_link_inode_operations = {
174 .readlink = proc_ns_readlink, 76 .readlink = proc_ns_readlink,
175 .follow_link = proc_ns_follow_link, 77 .follow_link = proc_ns_follow_link,
176 .setattr = proc_setattr, 78 .setattr = proc_setattr,
177 }; 79 };
178 80
179 static int proc_ns_instantiate(struct inode *dir, 81 static int proc_ns_instantiate(struct inode *dir,
180 struct dentry *dentry, struct task_struct *task, const void *ptr) 82 struct dentry *dentry, struct task_struct *task, const void *ptr)
181 { 83 {
182 const struct proc_ns_operations *ns_ops = ptr; 84 const struct proc_ns_operations *ns_ops = ptr;
183 struct inode *inode; 85 struct inode *inode;
184 struct proc_inode *ei; 86 struct proc_inode *ei;
185 87
186 inode = proc_pid_make_inode(dir->i_sb, task); 88 inode = proc_pid_make_inode(dir->i_sb, task);
187 if (!inode) 89 if (!inode)
188 goto out; 90 goto out;
189 91
190 ei = PROC_I(inode); 92 ei = PROC_I(inode);
191 inode->i_mode = S_IFLNK|S_IRWXUGO; 93 inode->i_mode = S_IFLNK|S_IRWXUGO;
192 inode->i_op = &proc_ns_link_inode_operations; 94 inode->i_op = &proc_ns_link_inode_operations;
193 ei->ns.ns_ops = ns_ops; 95 ei->ns.ns_ops = ns_ops;
194 96
195 d_set_d_op(dentry, &pid_dentry_operations); 97 d_set_d_op(dentry, &pid_dentry_operations);
196 d_add(dentry, inode); 98 d_add(dentry, inode);
197 /* Close the race of the process dying before we return the dentry */ 99 /* Close the race of the process dying before we return the dentry */
198 if (pid_revalidate(dentry, 0)) 100 if (pid_revalidate(dentry, 0))
199 return 0; 101 return 0;
200 out: 102 out:
201 return -ENOENT; 103 return -ENOENT;
202 } 104 }
203 105
204 static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) 106 static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
205 { 107 {
206 struct task_struct *task = get_proc_task(file_inode(file)); 108 struct task_struct *task = get_proc_task(file_inode(file));
207 const struct proc_ns_operations **entry, **last; 109 const struct proc_ns_operations **entry, **last;
208 110
209 if (!task) 111 if (!task)
210 return -ENOENT; 112 return -ENOENT;
211 113
212 if (!dir_emit_dots(file, ctx)) 114 if (!dir_emit_dots(file, ctx))
213 goto out; 115 goto out;
214 if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) 116 if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
215 goto out; 117 goto out;
216 entry = ns_entries + (ctx->pos - 2); 118 entry = ns_entries + (ctx->pos - 2);
217 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; 119 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
218 while (entry <= last) { 120 while (entry <= last) {
219 const struct proc_ns_operations *ops = *entry; 121 const struct proc_ns_operations *ops = *entry;
220 if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), 122 if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
221 proc_ns_instantiate, task, ops)) 123 proc_ns_instantiate, task, ops))
222 break; 124 break;
223 ctx->pos++; 125 ctx->pos++;
224 entry++; 126 entry++;
225 } 127 }
226 out: 128 out:
227 put_task_struct(task); 129 put_task_struct(task);
228 return 0; 130 return 0;
229 } 131 }
230 132
231 const struct file_operations proc_ns_dir_operations = { 133 const struct file_operations proc_ns_dir_operations = {
232 .read = generic_read_dir, 134 .read = generic_read_dir,
233 .iterate = proc_ns_dir_readdir, 135 .iterate = proc_ns_dir_readdir,
234 }; 136 };
235 137
236 static struct dentry *proc_ns_dir_lookup(struct inode *dir, 138 static struct dentry *proc_ns_dir_lookup(struct inode *dir,
237 struct dentry *dentry, unsigned int flags) 139 struct dentry *dentry, unsigned int flags)
238 { 140 {
239 int error; 141 int error;
240 struct task_struct *task = get_proc_task(dir); 142 struct task_struct *task = get_proc_task(dir);
241 const struct proc_ns_operations **entry, **last; 143 const struct proc_ns_operations **entry, **last;
242 unsigned int len = dentry->d_name.len; 144 unsigned int len = dentry->d_name.len;
243 145
244 error = -ENOENT; 146 error = -ENOENT;
245 147
246 if (!task) 148 if (!task)
247 goto out_no_task; 149 goto out_no_task;
248 150
249 last = &ns_entries[ARRAY_SIZE(ns_entries)]; 151 last = &ns_entries[ARRAY_SIZE(ns_entries)];
250 for (entry = ns_entries; entry < last; entry++) { 152 for (entry = ns_entries; entry < last; entry++) {
251 if (strlen((*entry)->name) != len) 153 if (strlen((*entry)->name) != len)
252 continue; 154 continue;
253 if (!memcmp(dentry->d_name.name, (*entry)->name, len)) 155 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
254 break; 156 break;
255 } 157 }
256 if (entry == last) 158 if (entry == last)
257 goto out; 159 goto out;
258 160
259 error = proc_ns_instantiate(dir, dentry, task, *entry); 161 error = proc_ns_instantiate(dir, dentry, task, *entry);
260 out: 162 out:
261 put_task_struct(task); 163 put_task_struct(task);
262 out_no_task: 164 out_no_task:
263 return ERR_PTR(error); 165 return ERR_PTR(error);
264 } 166 }
265 167
266 const struct inode_operations proc_ns_dir_inode_operations = { 168 const struct inode_operations proc_ns_dir_inode_operations = {
267 .lookup = proc_ns_dir_lookup, 169 .lookup = proc_ns_dir_lookup,
268 .getattr = pid_getattr, 170 .getattr = pid_getattr,
269 .setattr = proc_setattr, 171 .setattr = proc_setattr,
270 }; 172 };
271
272 struct file *proc_ns_fget(int fd)
273 {
274 struct file *file;
275
276 file = fget(fd);
277 if (!file)
278 return ERR_PTR(-EBADF);
279
280 if (file->f_op != &ns_file_operations)
281 goto out_invalid;
282
283 return file;
284
285 out_invalid:
286 fput(file);
287 return ERR_PTR(-EINVAL);
288 }
289
290 struct ns_common *get_proc_ns(struct inode *inode)
291 {
292 return PROC_I(inode)->ns.ns;
293 }
294
295 bool proc_ns_inode(struct inode *inode)
296 {
297 return inode->i_fop == &ns_file_operations;
298 }
299 173
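With proc_ns_fget(), get_proc_ns() and proc_ns_inode() removed from this file, nothing changes at the userspace boundary: a namespace file is still opened from /proc/<pid>/ns/ and handed to setns(2). Minimal usage sketch (pid 1234 is a placeholder; joining a net namespace needs CAP_SYS_ADMIN):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/1234/ns/net", O_RDONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (setns(fd, CLONE_NEWNET) < 0) {  /* join that namespace */
                    perror("setns");
                    return 1;
            }
            close(fd);
            return 0;
    }
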
include/linux/ns_common.h
1 #ifndef _LINUX_NS_COMMON_H 1 #ifndef _LINUX_NS_COMMON_H
2 #define _LINUX_NS_COMMON_H 2 #define _LINUX_NS_COMMON_H
3 3
4 struct proc_ns_operations; 4 struct proc_ns_operations;
5 5
6 struct ns_common { 6 struct ns_common {
7 atomic_long_t stashed;
7 const struct proc_ns_operations *ops; 8 const struct proc_ns_operations *ops;
8 unsigned int inum; 9 unsigned int inum;
9 }; 10 };
10 11
11 #endif 12 #endif
12 13
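The new stashed word is an atomic_long_t, zeroed by ns_alloc_inum() (see the proc_ns.h hunk below). Judging by the type alone, it caches a dentry pointer so that repeated traversals of the same namespace link can reuse one dentry instead of allocating a fresh one each time. A hypothetical sketch of that reuse check; the names and details are assumptions, not the fs/nsfs.c code:

    /* Hypothetical: stashed is either 0 or the dentry currently
     * representing this namespace on the new filesystem. */
    struct dentry *d = (struct dentry *)atomic_long_read(&ns->stashed);
    if (d && lockref_get_not_dead(&d->d_lockref))
            return d;       /* cheap reuse; else build a new dentry */
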
include/linux/proc_ns.h
1 /* 1 /*
2 * procfs namespace bits 2 * procfs namespace bits
3 */ 3 */
4 #ifndef _LINUX_PROC_NS_H 4 #ifndef _LINUX_PROC_NS_H
5 #define _LINUX_PROC_NS_H 5 #define _LINUX_PROC_NS_H
6 6
7 #include <linux/ns_common.h>
8
7 struct pid_namespace; 9 struct pid_namespace;
8 struct nsproxy; 10 struct nsproxy;
9 struct ns_common; 11 struct path;
10 12
11 struct proc_ns_operations { 13 struct proc_ns_operations {
12 const char *name; 14 const char *name;
13 int type; 15 int type;
14 struct ns_common *(*get)(struct task_struct *task); 16 struct ns_common *(*get)(struct task_struct *task);
15 void (*put)(struct ns_common *ns); 17 void (*put)(struct ns_common *ns);
16 int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); 18 int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
17 }; 19 };
18 20
19 extern const struct proc_ns_operations netns_operations; 21 extern const struct proc_ns_operations netns_operations;
20 extern const struct proc_ns_operations utsns_operations; 22 extern const struct proc_ns_operations utsns_operations;
21 extern const struct proc_ns_operations ipcns_operations; 23 extern const struct proc_ns_operations ipcns_operations;
22 extern const struct proc_ns_operations pidns_operations; 24 extern const struct proc_ns_operations pidns_operations;
23 extern const struct proc_ns_operations userns_operations; 25 extern const struct proc_ns_operations userns_operations;
24 extern const struct proc_ns_operations mntns_operations; 26 extern const struct proc_ns_operations mntns_operations;
25 27
26 /* 28 /*
27 * We always define these enumerators 29 * We always define these enumerators
28 */ 30 */
29 enum { 31 enum {
30 PROC_ROOT_INO = 1, 32 PROC_ROOT_INO = 1,
31 PROC_IPC_INIT_INO = 0xEFFFFFFFU, 33 PROC_IPC_INIT_INO = 0xEFFFFFFFU,
32 PROC_UTS_INIT_INO = 0xEFFFFFFEU, 34 PROC_UTS_INIT_INO = 0xEFFFFFFEU,
33 PROC_USER_INIT_INO = 0xEFFFFFFDU, 35 PROC_USER_INIT_INO = 0xEFFFFFFDU,
34 PROC_PID_INIT_INO = 0xEFFFFFFCU, 36 PROC_PID_INIT_INO = 0xEFFFFFFCU,
35 }; 37 };
36 38
37 #ifdef CONFIG_PROC_FS 39 #ifdef CONFIG_PROC_FS
38 40
39 extern int pid_ns_prepare_proc(struct pid_namespace *ns); 41 extern int pid_ns_prepare_proc(struct pid_namespace *ns);
40 extern void pid_ns_release_proc(struct pid_namespace *ns); 42 extern void pid_ns_release_proc(struct pid_namespace *ns);
41 extern struct file *proc_ns_fget(int fd);
42 extern struct ns_common *get_proc_ns(struct inode *);
43 extern int proc_alloc_inum(unsigned int *pino); 43 extern int proc_alloc_inum(unsigned int *pino);
44 extern void proc_free_inum(unsigned int inum); 44 extern void proc_free_inum(unsigned int inum);
45 extern bool proc_ns_inode(struct inode *inode);
46 45
47 #else /* CONFIG_PROC_FS */ 46 #else /* CONFIG_PROC_FS */
48 47
49 static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } 48 static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
50 static inline void pid_ns_release_proc(struct pid_namespace *ns) {} 49 static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
51 50
52 static inline struct file *proc_ns_fget(int fd)
53 {
54 return ERR_PTR(-EINVAL);
55 }
56
57 static inline struct ns_common *get_proc_ns(struct inode *inode) { return NULL; }
58
59 static inline int proc_alloc_inum(unsigned int *inum) 51 static inline int proc_alloc_inum(unsigned int *inum)
60 { 52 {
61 *inum = 1; 53 *inum = 1;
62 return 0; 54 return 0;
63 } 55 }
64 static inline void proc_free_inum(unsigned int inum) {} 56 static inline void proc_free_inum(unsigned int inum) {}
65 static inline bool proc_ns_inode(struct inode *inode) { return false; }
66 57
67 #endif /* CONFIG_PROC_FS */ 58 #endif /* CONFIG_PROC_FS */
68 59
69 #define ns_alloc_inum(ns) proc_alloc_inum(&(ns)->inum) 60 static inline int ns_alloc_inum(struct ns_common *ns)
61 {
62 atomic_long_set(&ns->stashed, 0);
63 return proc_alloc_inum(&ns->inum);
64 }
65
70 #define ns_free_inum(ns) proc_free_inum((ns)->inum) 66 #define ns_free_inum(ns) proc_free_inum((ns)->inum)
67
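ns_alloc_inum() is promoted from a macro to a static inline so it can also clear the new stashed word before the inode number is handed out. A sketch of how a namespace constructor would use the pair (the helper name is hypothetical, not a real kernel function):

    static int init_ns_common(struct ns_common *ns,
                              const struct proc_ns_operations *ops)
    {
            int err = ns_alloc_inum(ns);    /* also zeroes ns->stashed */

            if (err)
                    return err;
            ns->ops = ops;
            return 0;
    }
    /* teardown side: ns_free_inum(ns); */
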
include/uapi/linux/magic.h
1 #ifndef __LINUX_MAGIC_H__ 1 #ifndef __LINUX_MAGIC_H__
2 #define __LINUX_MAGIC_H__ 2 #define __LINUX_MAGIC_H__
3 3
4 #define ADFS_SUPER_MAGIC 0xadf5 4 #define ADFS_SUPER_MAGIC 0xadf5
5 #define AFFS_SUPER_MAGIC 0xadff 5 #define AFFS_SUPER_MAGIC 0xadff
6 #define AFS_SUPER_MAGIC 0x5346414F 6 #define AFS_SUPER_MAGIC 0x5346414F
7 #define AUTOFS_SUPER_MAGIC 0x0187 7 #define AUTOFS_SUPER_MAGIC 0x0187
8 #define CODA_SUPER_MAGIC 0x73757245 8 #define CODA_SUPER_MAGIC 0x73757245
9 #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ 9 #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */
10 #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianness */ 10 #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianness */
11 #define DEBUGFS_MAGIC 0x64626720 11 #define DEBUGFS_MAGIC 0x64626720
12 #define SECURITYFS_MAGIC 0x73636673 12 #define SECURITYFS_MAGIC 0x73636673
13 #define SELINUX_MAGIC 0xf97cff8c 13 #define SELINUX_MAGIC 0xf97cff8c
14 #define SMACK_MAGIC 0x43415d53 /* "SMAC" */ 14 #define SMACK_MAGIC 0x43415d53 /* "SMAC" */
15 #define RAMFS_MAGIC 0x858458f6 /* some random number */ 15 #define RAMFS_MAGIC 0x858458f6 /* some random number */
16 #define TMPFS_MAGIC 0x01021994 16 #define TMPFS_MAGIC 0x01021994
17 #define HUGETLBFS_MAGIC 0x958458f6 /* some random number */ 17 #define HUGETLBFS_MAGIC 0x958458f6 /* some random number */
18 #define SQUASHFS_MAGIC 0x73717368 18 #define SQUASHFS_MAGIC 0x73717368
19 #define ECRYPTFS_SUPER_MAGIC 0xf15f 19 #define ECRYPTFS_SUPER_MAGIC 0xf15f
20 #define EFS_SUPER_MAGIC 0x414A53 20 #define EFS_SUPER_MAGIC 0x414A53
21 #define EXT2_SUPER_MAGIC 0xEF53 21 #define EXT2_SUPER_MAGIC 0xEF53
22 #define EXT3_SUPER_MAGIC 0xEF53 22 #define EXT3_SUPER_MAGIC 0xEF53
23 #define XENFS_SUPER_MAGIC 0xabba1974 23 #define XENFS_SUPER_MAGIC 0xabba1974
24 #define EXT4_SUPER_MAGIC 0xEF53 24 #define EXT4_SUPER_MAGIC 0xEF53
25 #define BTRFS_SUPER_MAGIC 0x9123683E 25 #define BTRFS_SUPER_MAGIC 0x9123683E
26 #define NILFS_SUPER_MAGIC 0x3434 26 #define NILFS_SUPER_MAGIC 0x3434
27 #define F2FS_SUPER_MAGIC 0xF2F52010 27 #define F2FS_SUPER_MAGIC 0xF2F52010
28 #define HPFS_SUPER_MAGIC 0xf995e849 28 #define HPFS_SUPER_MAGIC 0xf995e849
29 #define ISOFS_SUPER_MAGIC 0x9660 29 #define ISOFS_SUPER_MAGIC 0x9660
30 #define JFFS2_SUPER_MAGIC 0x72b6 30 #define JFFS2_SUPER_MAGIC 0x72b6
31 #define PSTOREFS_MAGIC 0x6165676C 31 #define PSTOREFS_MAGIC 0x6165676C
32 #define EFIVARFS_MAGIC 0xde5e81e4 32 #define EFIVARFS_MAGIC 0xde5e81e4
33 #define HOSTFS_SUPER_MAGIC 0x00c0ffee 33 #define HOSTFS_SUPER_MAGIC 0x00c0ffee
34 34
35 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ 35 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */
36 #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ 36 #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */
37 #define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */ 37 #define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */
38 #define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */ 38 #define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */
39 #define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */ 39 #define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */
40 40
41 #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ 41 #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */
42 #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ 42 #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */
43 #define NFS_SUPER_MAGIC 0x6969 43 #define NFS_SUPER_MAGIC 0x6969
44 #define OPENPROM_SUPER_MAGIC 0x9fa1 44 #define OPENPROM_SUPER_MAGIC 0x9fa1
45 #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ 45 #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */
46 #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */ 46 #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */
47 47
48 #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */ 48 #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */
49 /* used by file system utilities that 49 /* used by file system utilities that
50 look at the superblock, etc. */ 50 look at the superblock, etc. */
51 #define REISERFS_SUPER_MAGIC_STRING "ReIsErFs" 51 #define REISERFS_SUPER_MAGIC_STRING "ReIsErFs"
52 #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" 52 #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs"
53 #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs" 53 #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs"
54 54
55 #define SMB_SUPER_MAGIC 0x517B 55 #define SMB_SUPER_MAGIC 0x517B
56 #define CGROUP_SUPER_MAGIC 0x27e0eb 56 #define CGROUP_SUPER_MAGIC 0x27e0eb
57 57
58 58
59 #define STACK_END_MAGIC 0x57AC6E9D 59 #define STACK_END_MAGIC 0x57AC6E9D
60 60
61 #define V9FS_MAGIC 0x01021997 61 #define V9FS_MAGIC 0x01021997
62 62
63 #define BDEVFS_MAGIC 0x62646576 63 #define BDEVFS_MAGIC 0x62646576
64 #define BINFMTFS_MAGIC 0x42494e4d 64 #define BINFMTFS_MAGIC 0x42494e4d
65 #define DEVPTS_SUPER_MAGIC 0x1cd1 65 #define DEVPTS_SUPER_MAGIC 0x1cd1
66 #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA 66 #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA
67 #define PIPEFS_MAGIC 0x50495045 67 #define PIPEFS_MAGIC 0x50495045
68 #define PROC_SUPER_MAGIC 0x9fa0 68 #define PROC_SUPER_MAGIC 0x9fa0
69 #define SOCKFS_MAGIC 0x534F434B 69 #define SOCKFS_MAGIC 0x534F434B
70 #define SYSFS_MAGIC 0x62656572 70 #define SYSFS_MAGIC 0x62656572
71 #define USBDEVICE_SUPER_MAGIC 0x9fa2 71 #define USBDEVICE_SUPER_MAGIC 0x9fa2
72 #define MTD_INODE_FS_MAGIC 0x11307854 72 #define MTD_INODE_FS_MAGIC 0x11307854
73 #define ANON_INODE_FS_MAGIC 0x09041934 73 #define ANON_INODE_FS_MAGIC 0x09041934
74 #define BTRFS_TEST_MAGIC 0x73727279 74 #define BTRFS_TEST_MAGIC 0x73727279
75 #define NSFS_MAGIC 0x6e736673
75 76
76 #endif /* __LINUX_MAGIC_H__ */ 77 #endif /* __LINUX_MAGIC_H__ */
77 78
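NSFS_MAGIC is 0x6e736673, the ASCII bytes of "nsfs". On kernels carrying this change, userspace can detect that a descriptor refers to a namespace file with fstatfs() rather than by string-matching /proc paths:

    #include <sys/vfs.h>
    #include <linux/magic.h>        /* NSFS_MAGIC */

    /* 1 if fd is a namespace file, 0 if not, -1 on error */
    static int is_ns_fd(int fd)
    {
            struct statfs s;

            if (fstatfs(fd, &s) < 0)
                    return -1;
            return s.f_type == NSFS_MAGIC;
    }
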
init/main.c
1 /* 1 /*
2 * linux/init/main.c 2 * linux/init/main.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * GK 2/5/95 - Changed to support mounting root fs via NFS 6 * GK 2/5/95 - Changed to support mounting root fs via NFS
7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96
8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96
9 * Simplified starting of init: Michael A. Griffith <grif@acm.org> 9 * Simplified starting of init: Michael A. Griffith <grif@acm.org>
10 */ 10 */
11 11
12 #define DEBUG /* Enable initcall_debug */ 12 #define DEBUG /* Enable initcall_debug */
13 13
14 #include <linux/types.h> 14 #include <linux/types.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/proc_fs.h> 16 #include <linux/proc_fs.h>
17 #include <linux/kernel.h> 17 #include <linux/kernel.h>
18 #include <linux/syscalls.h> 18 #include <linux/syscalls.h>
19 #include <linux/stackprotector.h> 19 #include <linux/stackprotector.h>
20 #include <linux/string.h> 20 #include <linux/string.h>
21 #include <linux/ctype.h> 21 #include <linux/ctype.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 #include <linux/ioport.h> 23 #include <linux/ioport.h>
24 #include <linux/init.h> 24 #include <linux/init.h>
25 #include <linux/initrd.h> 25 #include <linux/initrd.h>
26 #include <linux/bootmem.h> 26 #include <linux/bootmem.h>
27 #include <linux/acpi.h> 27 #include <linux/acpi.h>
28 #include <linux/tty.h> 28 #include <linux/tty.h>
29 #include <linux/percpu.h> 29 #include <linux/percpu.h>
30 #include <linux/kmod.h> 30 #include <linux/kmod.h>
31 #include <linux/vmalloc.h> 31 #include <linux/vmalloc.h>
32 #include <linux/kernel_stat.h> 32 #include <linux/kernel_stat.h>
33 #include <linux/start_kernel.h> 33 #include <linux/start_kernel.h>
34 #include <linux/security.h> 34 #include <linux/security.h>
35 #include <linux/smp.h> 35 #include <linux/smp.h>
36 #include <linux/profile.h> 36 #include <linux/profile.h>
37 #include <linux/rcupdate.h> 37 #include <linux/rcupdate.h>
38 #include <linux/moduleparam.h> 38 #include <linux/moduleparam.h>
39 #include <linux/kallsyms.h> 39 #include <linux/kallsyms.h>
40 #include <linux/writeback.h> 40 #include <linux/writeback.h>
41 #include <linux/cpu.h> 41 #include <linux/cpu.h>
42 #include <linux/cpuset.h> 42 #include <linux/cpuset.h>
43 #include <linux/cgroup.h> 43 #include <linux/cgroup.h>
44 #include <linux/efi.h> 44 #include <linux/efi.h>
45 #include <linux/tick.h> 45 #include <linux/tick.h>
46 #include <linux/interrupt.h> 46 #include <linux/interrupt.h>
47 #include <linux/taskstats_kern.h> 47 #include <linux/taskstats_kern.h>
48 #include <linux/delayacct.h> 48 #include <linux/delayacct.h>
49 #include <linux/unistd.h> 49 #include <linux/unistd.h>
50 #include <linux/rmap.h> 50 #include <linux/rmap.h>
51 #include <linux/mempolicy.h> 51 #include <linux/mempolicy.h>
52 #include <linux/key.h> 52 #include <linux/key.h>
53 #include <linux/buffer_head.h> 53 #include <linux/buffer_head.h>
54 #include <linux/page_cgroup.h> 54 #include <linux/page_cgroup.h>
55 #include <linux/debug_locks.h> 55 #include <linux/debug_locks.h>
56 #include <linux/debugobjects.h> 56 #include <linux/debugobjects.h>
57 #include <linux/lockdep.h> 57 #include <linux/lockdep.h>
58 #include <linux/kmemleak.h> 58 #include <linux/kmemleak.h>
59 #include <linux/pid_namespace.h> 59 #include <linux/pid_namespace.h>
60 #include <linux/device.h> 60 #include <linux/device.h>
61 #include <linux/kthread.h> 61 #include <linux/kthread.h>
62 #include <linux/sched.h> 62 #include <linux/sched.h>
63 #include <linux/signal.h> 63 #include <linux/signal.h>
64 #include <linux/idr.h> 64 #include <linux/idr.h>
65 #include <linux/kgdb.h> 65 #include <linux/kgdb.h>
66 #include <linux/ftrace.h> 66 #include <linux/ftrace.h>
67 #include <linux/async.h> 67 #include <linux/async.h>
68 #include <linux/kmemcheck.h> 68 #include <linux/kmemcheck.h>
69 #include <linux/sfi.h> 69 #include <linux/sfi.h>
70 #include <linux/shmem_fs.h> 70 #include <linux/shmem_fs.h>
71 #include <linux/slab.h> 71 #include <linux/slab.h>
72 #include <linux/perf_event.h> 72 #include <linux/perf_event.h>
73 #include <linux/file.h> 73 #include <linux/file.h>
74 #include <linux/ptrace.h> 74 #include <linux/ptrace.h>
75 #include <linux/blkdev.h> 75 #include <linux/blkdev.h>
76 #include <linux/elevator.h> 76 #include <linux/elevator.h>
77 #include <linux/sched_clock.h> 77 #include <linux/sched_clock.h>
78 #include <linux/context_tracking.h> 78 #include <linux/context_tracking.h>
79 #include <linux/random.h> 79 #include <linux/random.h>
80 #include <linux/list.h> 80 #include <linux/list.h>
81 #include <linux/proc_ns.h>
81 82
82 #include <asm/io.h> 83 #include <asm/io.h>
83 #include <asm/bugs.h> 84 #include <asm/bugs.h>
84 #include <asm/setup.h> 85 #include <asm/setup.h>
85 #include <asm/sections.h> 86 #include <asm/sections.h>
86 #include <asm/cacheflush.h> 87 #include <asm/cacheflush.h>
87 88
88 #ifdef CONFIG_X86_LOCAL_APIC 89 #ifdef CONFIG_X86_LOCAL_APIC
89 #include <asm/smp.h> 90 #include <asm/smp.h>
90 #endif 91 #endif
91 92
92 static int kernel_init(void *); 93 static int kernel_init(void *);
93 94
94 extern void init_IRQ(void); 95 extern void init_IRQ(void);
95 extern void fork_init(unsigned long); 96 extern void fork_init(unsigned long);
96 extern void radix_tree_init(void); 97 extern void radix_tree_init(void);
97 #ifndef CONFIG_DEBUG_RODATA 98 #ifndef CONFIG_DEBUG_RODATA
98 static inline void mark_rodata_ro(void) { } 99 static inline void mark_rodata_ro(void) { }
99 #endif 100 #endif
100 101
101 /* 102 /*
102 * Debug helper: via this flag we know that we are in 'early bootup code' 103 * Debug helper: via this flag we know that we are in 'early bootup code'
103 * where only the boot processor is running with IRQ disabled. This means 104 * where only the boot processor is running with IRQ disabled. This means
104 * two things - IRQ must not be enabled before the flag is cleared and some 105 * two things - IRQ must not be enabled before the flag is cleared and some
105 * operations which are not allowed with IRQ disabled are allowed while the 106 * operations which are not allowed with IRQ disabled are allowed while the
106 * flag is set. 107 * flag is set.
107 */ 108 */
108 bool early_boot_irqs_disabled __read_mostly; 109 bool early_boot_irqs_disabled __read_mostly;
109 110
110 enum system_states system_state __read_mostly; 111 enum system_states system_state __read_mostly;
111 EXPORT_SYMBOL(system_state); 112 EXPORT_SYMBOL(system_state);
112 113
113 /* 114 /*
114 * Boot command-line arguments 115 * Boot command-line arguments
115 */ 116 */
116 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT 117 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT
117 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT 118 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT
118 119
119 extern void time_init(void); 120 extern void time_init(void);
120 /* Default late time init is NULL. archs can override this later. */ 121 /* Default late time init is NULL. archs can override this later. */
121 void (*__initdata late_time_init)(void); 122 void (*__initdata late_time_init)(void);
122 123
123 /* Untouched command line saved by arch-specific code. */ 124 /* Untouched command line saved by arch-specific code. */
124 char __initdata boot_command_line[COMMAND_LINE_SIZE]; 125 char __initdata boot_command_line[COMMAND_LINE_SIZE];
125 /* Untouched saved command line (eg. for /proc) */ 126 /* Untouched saved command line (eg. for /proc) */
126 char *saved_command_line; 127 char *saved_command_line;
127 /* Command line for parameter parsing */ 128 /* Command line for parameter parsing */
128 static char *static_command_line; 129 static char *static_command_line;
129 /* Command line for per-initcall parameter parsing */ 130 /* Command line for per-initcall parameter parsing */
130 static char *initcall_command_line; 131 static char *initcall_command_line;
131 132
132 static char *execute_command; 133 static char *execute_command;
133 static char *ramdisk_execute_command; 134 static char *ramdisk_execute_command;
134 135
135 /* 136 /*
136 * Used to generate warnings if static_key manipulation functions are used 137 * Used to generate warnings if static_key manipulation functions are used
137 * before jump_label_init is called. 138 * before jump_label_init is called.
138 */ 139 */
139 bool static_key_initialized __read_mostly; 140 bool static_key_initialized __read_mostly;
140 EXPORT_SYMBOL_GPL(static_key_initialized); 141 EXPORT_SYMBOL_GPL(static_key_initialized);
141 142
142 /* 143 /*
143 * If set, this is an indication to the drivers to reset the underlying 144 * If set, this is an indication to the drivers to reset the underlying
144 * device before going ahead with the initialization; otherwise the driver might 145 * device before going ahead with the initialization; otherwise the driver might
145 * rely on the BIOS and skip the reset operation. 146 * rely on the BIOS and skip the reset operation.
146 * 147 *
147 * This is useful if kernel is booting in an unreliable environment. 148 * This is useful if kernel is booting in an unreliable environment.
148 * For ex. kdump situation where previous kernel has crashed, BIOS has been 149 * For ex. kdump situation where previous kernel has crashed, BIOS has been
149 * skipped and devices will be in unknown state. 150 * skipped and devices will be in unknown state.
150 */ 151 */
151 unsigned int reset_devices; 152 unsigned int reset_devices;
152 EXPORT_SYMBOL(reset_devices); 153 EXPORT_SYMBOL(reset_devices);
153 154
154 static int __init set_reset_devices(char *str) 155 static int __init set_reset_devices(char *str)
155 { 156 {
156 reset_devices = 1; 157 reset_devices = 1;
157 return 1; 158 return 1;
158 } 159 }
159 160
160 __setup("reset_devices", set_reset_devices); 161 __setup("reset_devices", set_reset_devices);
161 162
162 static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; 163 static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
163 const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; 164 const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
164 static const char *panic_later, *panic_param; 165 static const char *panic_later, *panic_param;
165 166
166 extern const struct obs_kernel_param __setup_start[], __setup_end[]; 167 extern const struct obs_kernel_param __setup_start[], __setup_end[];
167 168
168 static int __init obsolete_checksetup(char *line) 169 static int __init obsolete_checksetup(char *line)
169 { 170 {
170 const struct obs_kernel_param *p; 171 const struct obs_kernel_param *p;
171 int had_early_param = 0; 172 int had_early_param = 0;
172 173
173 p = __setup_start; 174 p = __setup_start;
174 do { 175 do {
175 int n = strlen(p->str); 176 int n = strlen(p->str);
176 if (parameqn(line, p->str, n)) { 177 if (parameqn(line, p->str, n)) {
177 if (p->early) { 178 if (p->early) {
178 /* Already done in parse_early_param? 179 /* Already done in parse_early_param?
179 * (Needs exact match on param part). 180 * (Needs exact match on param part).
180 * Keep iterating, as we can have early 181 * Keep iterating, as we can have early
181 * params and __setups of same names 8( */ 182 * params and __setups of same names 8( */
182 if (line[n] == '\0' || line[n] == '=') 183 if (line[n] == '\0' || line[n] == '=')
183 had_early_param = 1; 184 had_early_param = 1;
184 } else if (!p->setup_func) { 185 } else if (!p->setup_func) {
185 pr_warn("Parameter %s is obsolete, ignored\n", 186 pr_warn("Parameter %s is obsolete, ignored\n",
186 p->str); 187 p->str);
187 return 1; 188 return 1;
188 } else if (p->setup_func(line + n)) 189 } else if (p->setup_func(line + n))
189 return 1; 190 return 1;
190 } 191 }
191 p++; 192 p++;
192 } while (p < __setup_end); 193 } while (p < __setup_end);
193 194
194 return had_early_param; 195 return had_early_param;
195 } 196 }
196 197
197 /* 198 /*
198 * This should be approx 2 Bo*oMips to start (note initial shift), and will 199 * This should be approx 2 Bo*oMips to start (note initial shift), and will
199 * still work even if initially too large, it will just take slightly longer 200 * still work even if initially too large, it will just take slightly longer
200 */ 201 */
201 unsigned long loops_per_jiffy = (1<<12); 202 unsigned long loops_per_jiffy = (1<<12);
202 EXPORT_SYMBOL(loops_per_jiffy); 203 EXPORT_SYMBOL(loops_per_jiffy);
203 204
204 static int __init debug_kernel(char *str) 205 static int __init debug_kernel(char *str)
205 { 206 {
206 console_loglevel = CONSOLE_LOGLEVEL_DEBUG; 207 console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
207 return 0; 208 return 0;
208 } 209 }
209 210
210 static int __init quiet_kernel(char *str) 211 static int __init quiet_kernel(char *str)
211 { 212 {
212 console_loglevel = CONSOLE_LOGLEVEL_QUIET; 213 console_loglevel = CONSOLE_LOGLEVEL_QUIET;
213 return 0; 214 return 0;
214 } 215 }
215 216
216 early_param("debug", debug_kernel); 217 early_param("debug", debug_kernel);
217 early_param("quiet", quiet_kernel); 218 early_param("quiet", quiet_kernel);
218 219
219 static int __init loglevel(char *str) 220 static int __init loglevel(char *str)
220 { 221 {
221 int newlevel; 222 int newlevel;
222 223
223 /* 224 /*
224 * Only update loglevel value when a correct setting was passed, 225 * Only update loglevel value when a correct setting was passed,
225 * to prevent blind crashes (when loglevel is set to 0) that 226 * to prevent blind crashes (when loglevel is set to 0) that
226 * are quite hard to debug 227 * are quite hard to debug
227 */ 228 */
228 if (get_option(&str, &newlevel)) { 229 if (get_option(&str, &newlevel)) {
229 console_loglevel = newlevel; 230 console_loglevel = newlevel;
230 return 0; 231 return 0;
231 } 232 }
232 233
233 return -EINVAL; 234 return -EINVAL;
234 } 235 }
235 236
236 early_param("loglevel", loglevel); 237 early_param("loglevel", loglevel);
237 238
238 /* Change NUL term back to "=", to make "param" the whole string. */ 239 /* Change NUL term back to "=", to make "param" the whole string. */
239 static int __init repair_env_string(char *param, char *val, const char *unused) 240 static int __init repair_env_string(char *param, char *val, const char *unused)
240 { 241 {
241 if (val) { 242 if (val) {
242 /* param=val or param="val"? */ 243 /* param=val or param="val"? */
243 if (val == param+strlen(param)+1) 244 if (val == param+strlen(param)+1)
244 val[-1] = '='; 245 val[-1] = '=';
245 else if (val == param+strlen(param)+2) { 246 else if (val == param+strlen(param)+2) {
246 val[-2] = '='; 247 val[-2] = '=';
247 memmove(val-1, val, strlen(val)+1); 248 memmove(val-1, val, strlen(val)+1);
248 val--; 249 val--;
249 } else 250 } else
250 BUG(); 251 BUG();
251 } 252 }
252 return 0; 253 return 0;
253 } 254 }
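A worked example of the repair above, since the in-place trick is easy to misread: parse_args() NUL-terminates the parameter name at the '=', and repair_env_string() writes the '=' back so the environment slot keeps one contiguous string.

    /* Illustrative: after parsing, "TERM=linux" has become
     *     param -> "TERM\0linux",  val -> "linux"
     * and writing '=' over the NUL restores "TERM=linux".
     * For a quoted value ("TERM=\"linux\"") val starts two bytes
     * past the name, hence the memmove() branch above. */
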
254 255
255 /* Anything after -- gets handed straight to init. */ 256 /* Anything after -- gets handed straight to init. */
256 static int __init set_init_arg(char *param, char *val, const char *unused) 257 static int __init set_init_arg(char *param, char *val, const char *unused)
257 { 258 {
258 unsigned int i; 259 unsigned int i;
259 260
260 if (panic_later) 261 if (panic_later)
261 return 0; 262 return 0;
262 263
263 repair_env_string(param, val, unused); 264 repair_env_string(param, val, unused);
264 265
265 for (i = 0; argv_init[i]; i++) { 266 for (i = 0; argv_init[i]; i++) {
266 if (i == MAX_INIT_ARGS) { 267 if (i == MAX_INIT_ARGS) {
267 panic_later = "init"; 268 panic_later = "init";
268 panic_param = param; 269 panic_param = param;
269 return 0; 270 return 0;
270 } 271 }
271 } 272 }
272 argv_init[i] = param; 273 argv_init[i] = param;
273 return 0; 274 return 0;
274 } 275 }
275 276
276 /* 277 /*
277 * Unknown boot options get handed to init, unless they look like 278 * Unknown boot options get handed to init, unless they look like
278 * unused parameters (modprobe will find them in /proc/cmdline). 279 * unused parameters (modprobe will find them in /proc/cmdline).
279 */ 280 */
280 static int __init unknown_bootoption(char *param, char *val, const char *unused) 281 static int __init unknown_bootoption(char *param, char *val, const char *unused)
281 { 282 {
282 repair_env_string(param, val, unused); 283 repair_env_string(param, val, unused);
283 284
284 /* Handle obsolete-style parameters */ 285 /* Handle obsolete-style parameters */
285 if (obsolete_checksetup(param)) 286 if (obsolete_checksetup(param))
286 return 0; 287 return 0;
287 288
288 /* Unused module parameter. */ 289 /* Unused module parameter. */
289 if (strchr(param, '.') && (!val || strchr(param, '.') < val)) 290 if (strchr(param, '.') && (!val || strchr(param, '.') < val))
290 return 0; 291 return 0;
291 292
292 if (panic_later) 293 if (panic_later)
293 return 0; 294 return 0;
294 295
295 if (val) { 296 if (val) {
296 /* Environment option */ 297 /* Environment option */
297 unsigned int i; 298 unsigned int i;
298 for (i = 0; envp_init[i]; i++) { 299 for (i = 0; envp_init[i]; i++) {
299 if (i == MAX_INIT_ENVS) { 300 if (i == MAX_INIT_ENVS) {
300 panic_later = "env"; 301 panic_later = "env";
301 panic_param = param; 302 panic_param = param;
302 } 303 }
303 if (!strncmp(param, envp_init[i], val - param)) 304 if (!strncmp(param, envp_init[i], val - param))
304 break; 305 break;
305 } 306 }
306 envp_init[i] = param; 307 envp_init[i] = param;
307 } else { 308 } else {
308 /* Command line option */ 309 /* Command line option */
309 unsigned int i; 310 unsigned int i;
310 for (i = 0; argv_init[i]; i++) { 311 for (i = 0; argv_init[i]; i++) {
311 if (i == MAX_INIT_ARGS) { 312 if (i == MAX_INIT_ARGS) {
312 panic_later = "init"; 313 panic_later = "init";
313 panic_param = param; 314 panic_param = param;
314 } 315 }
315 } 316 }
316 argv_init[i] = param; 317 argv_init[i] = param;
317 } 318 }
318 return 0; 319 return 0;
319 } 320 }
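To make the dispatch above concrete, here is how a few unknown options would be routed (the option strings are made up for illustration):

    /* "snd_hda_intel.power_save=1" -> '.' before '=': left for modprobe
     * "TERM=vt100"                 -> has a value:    envp_init[]
     * "single"                     -> no value:       argv_init[]   */
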
320 321
321 static int __init init_setup(char *str) 322 static int __init init_setup(char *str)
322 { 323 {
323 unsigned int i; 324 unsigned int i;
324 325
325 execute_command = str; 326 execute_command = str;
326 /* 327 /*
327 * In case LILO is going to boot us with default command line, 328 * In case LILO is going to boot us with default command line,
328 * it prepends "auto" before the whole cmdline which makes 329 * it prepends "auto" before the whole cmdline which makes
329 * the shell think it should execute a script with such name. 330 * the shell think it should execute a script with such name.
330 * So we ignore all arguments entered _before_ init=... [MJ] 331 * So we ignore all arguments entered _before_ init=... [MJ]
331 */ 332 */
332 for (i = 1; i < MAX_INIT_ARGS; i++) 333 for (i = 1; i < MAX_INIT_ARGS; i++)
333 argv_init[i] = NULL; 334 argv_init[i] = NULL;
334 return 1; 335 return 1;
335 } 336 }
336 __setup("init=", init_setup); 337 __setup("init=", init_setup);
337 338
338 static int __init rdinit_setup(char *str) 339 static int __init rdinit_setup(char *str)
339 { 340 {
340 unsigned int i; 341 unsigned int i;
341 342
342 ramdisk_execute_command = str; 343 ramdisk_execute_command = str;
343 /* See "auto" comment in init_setup */ 344 /* See "auto" comment in init_setup */
344 for (i = 1; i < MAX_INIT_ARGS; i++) 345 for (i = 1; i < MAX_INIT_ARGS; i++)
345 argv_init[i] = NULL; 346 argv_init[i] = NULL;
346 return 1; 347 return 1;
347 } 348 }
348 __setup("rdinit=", rdinit_setup); 349 __setup("rdinit=", rdinit_setup);
349 350
350 #ifndef CONFIG_SMP 351 #ifndef CONFIG_SMP
351 static const unsigned int setup_max_cpus = NR_CPUS; 352 static const unsigned int setup_max_cpus = NR_CPUS;
352 #ifdef CONFIG_X86_LOCAL_APIC 353 #ifdef CONFIG_X86_LOCAL_APIC
353 static void __init smp_init(void) 354 static void __init smp_init(void)
354 { 355 {
355 APIC_init_uniprocessor(); 356 APIC_init_uniprocessor();
356 } 357 }
357 #else 358 #else
358 #define smp_init() do { } while (0) 359 #define smp_init() do { } while (0)
359 #endif 360 #endif
360 361
361 static inline void setup_nr_cpu_ids(void) { } 362 static inline void setup_nr_cpu_ids(void) { }
362 static inline void smp_prepare_cpus(unsigned int maxcpus) { } 363 static inline void smp_prepare_cpus(unsigned int maxcpus) { }
363 #endif 364 #endif
364 365
365 /* 366 /*
366 * We need to store the untouched command line for future reference. 367 * We need to store the untouched command line for future reference.
367 * We also need to store the touched command line since the parameter 368 * We also need to store the touched command line since the parameter
368 * parsing is performed in place, and we should allow a component to 369 * parsing is performed in place, and we should allow a component to
369 * store reference of name/value for future reference. 370 * store reference of name/value for future reference.
370 */ 371 */
371 static void __init setup_command_line(char *command_line) 372 static void __init setup_command_line(char *command_line)
372 { 373 {
373 saved_command_line = 374 saved_command_line =
374 memblock_virt_alloc(strlen(boot_command_line) + 1, 0); 375 memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
375 initcall_command_line = 376 initcall_command_line =
376 memblock_virt_alloc(strlen(boot_command_line) + 1, 0); 377 memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
377 static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); 378 static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0);
378 strcpy(saved_command_line, boot_command_line); 379 strcpy(saved_command_line, boot_command_line);
379 strcpy(static_command_line, command_line); 380 strcpy(static_command_line, command_line);
380 } 381 }
381 382
382 /* 383 /*
383 * We need to finalize in a non-__init function or else race conditions 384 * We need to finalize in a non-__init function or else race conditions
384 * between the root thread and the init thread may cause start_kernel to 385 * between the root thread and the init thread may cause start_kernel to
385 * be reaped by free_initmem before the root thread has proceeded to 386 * be reaped by free_initmem before the root thread has proceeded to
386 * cpu_idle. 387 * cpu_idle.
387 * 388 *
388 * gcc-3.4 accidentally inlines this function, so use noinline. 389 * gcc-3.4 accidentally inlines this function, so use noinline.
389 */ 390 */
390 391
391 static __initdata DECLARE_COMPLETION(kthreadd_done); 392 static __initdata DECLARE_COMPLETION(kthreadd_done);
392 393
393 static noinline void __init_refok rest_init(void) 394 static noinline void __init_refok rest_init(void)
394 { 395 {
395 int pid; 396 int pid;
396 397
397 rcu_scheduler_starting(); 398 rcu_scheduler_starting();
398 /* 399 /*
399 * We need to spawn init first so that it obtains pid 1, however 400 * We need to spawn init first so that it obtains pid 1, however
400 * the init task will end up wanting to create kthreads, which, if 401 * the init task will end up wanting to create kthreads, which, if
401 * we schedule it before we create kthreadd, will OOPS. 402 * we schedule it before we create kthreadd, will OOPS.
402 */ 403 */
403 kernel_thread(kernel_init, NULL, CLONE_FS); 404 kernel_thread(kernel_init, NULL, CLONE_FS);
404 numa_default_policy(); 405 numa_default_policy();
405 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); 406 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
406 rcu_read_lock(); 407 rcu_read_lock();
407 kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); 408 kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
408 rcu_read_unlock(); 409 rcu_read_unlock();
409 complete(&kthreadd_done); 410 complete(&kthreadd_done);
410 411
411 /* 412 /*
412 * The boot idle thread must execute schedule() 413 * The boot idle thread must execute schedule()
413 * at least once to get things moving: 414 * at least once to get things moving:
414 */ 415 */
415 init_idle_bootup_task(current); 416 init_idle_bootup_task(current);
416 schedule_preempt_disabled(); 417 schedule_preempt_disabled();
417 /* Call into cpu_idle with preempt disabled */ 418 /* Call into cpu_idle with preempt disabled */
418 cpu_startup_entry(CPUHP_ONLINE); 419 cpu_startup_entry(CPUHP_ONLINE);
419 } 420 }
420 421
421 /* Check for early params. */ 422 /* Check for early params. */
422 static int __init do_early_param(char *param, char *val, const char *unused) 423 static int __init do_early_param(char *param, char *val, const char *unused)
423 { 424 {
424 const struct obs_kernel_param *p; 425 const struct obs_kernel_param *p;
425 426
426 for (p = __setup_start; p < __setup_end; p++) { 427 for (p = __setup_start; p < __setup_end; p++) {
427 if ((p->early && parameq(param, p->str)) || 428 if ((p->early && parameq(param, p->str)) ||
428 (strcmp(param, "console") == 0 && 429 (strcmp(param, "console") == 0 &&
429 strcmp(p->str, "earlycon") == 0) 430 strcmp(p->str, "earlycon") == 0)
430 ) { 431 ) {
431 if (p->setup_func(val) != 0) 432 if (p->setup_func(val) != 0)
432 pr_warn("Malformed early option '%s'\n", param); 433 pr_warn("Malformed early option '%s'\n", param);
433 } 434 }
434 } 435 }
435 /* We accept everything at this stage. */ 436 /* We accept everything at this stage. */
436 return 0; 437 return 0;
437 } 438 }
438 439
439 void __init parse_early_options(char *cmdline) 440 void __init parse_early_options(char *cmdline)
440 { 441 {
441 parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param); 442 parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param);
442 } 443 }
443 444
444 /* Arch code calls this early on, or if not, just before other parsing. */ 445 /* Arch code calls this early on, or if not, just before other parsing. */
445 void __init parse_early_param(void) 446 void __init parse_early_param(void)
446 { 447 {
447 static int done __initdata; 448 static int done __initdata;
448 static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; 449 static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
449 450
450 if (done) 451 if (done)
451 return; 452 return;
452 453
453 /* All fall through to do_early_param. */ 454 /* All fall through to do_early_param. */
454 strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); 455 strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
455 parse_early_options(tmp_cmdline); 456 parse_early_options(tmp_cmdline);
456 done = 1; 457 done = 1;
457 } 458 }
458 459
459 /* 460 /*
460 * Activate the first processor. 461 * Activate the first processor.
461 */ 462 */
462 463
463 static void __init boot_cpu_init(void) 464 static void __init boot_cpu_init(void)
464 { 465 {
465 int cpu = smp_processor_id(); 466 int cpu = smp_processor_id();
466 /* Mark the boot cpu "present", "online" etc for SMP and UP case */ 467 /* Mark the boot cpu "present", "online" etc for SMP and UP case */
467 set_cpu_online(cpu, true); 468 set_cpu_online(cpu, true);
468 set_cpu_active(cpu, true); 469 set_cpu_active(cpu, true);
469 set_cpu_present(cpu, true); 470 set_cpu_present(cpu, true);
470 set_cpu_possible(cpu, true); 471 set_cpu_possible(cpu, true);
471 } 472 }
472 473
473 void __init __weak smp_setup_processor_id(void) 474 void __init __weak smp_setup_processor_id(void)
474 { 475 {
475 } 476 }
476 477
477 # if THREAD_SIZE >= PAGE_SIZE 478 # if THREAD_SIZE >= PAGE_SIZE
478 void __init __weak thread_info_cache_init(void) 479 void __init __weak thread_info_cache_init(void)
479 { 480 {
480 } 481 }
481 #endif 482 #endif
482 483
483 /* 484 /*
484 * Set up kernel memory allocators 485 * Set up kernel memory allocators
485 */ 486 */
486 static void __init mm_init(void) 487 static void __init mm_init(void)
487 { 488 {
488 /* 489 /*
489 * page_cgroup requires contiguous pages, 490 * page_cgroup requires contiguous pages,
490 * bigger than MAX_ORDER unless SPARSEMEM. 491 * bigger than MAX_ORDER unless SPARSEMEM.
491 */ 492 */
492 page_cgroup_init_flatmem(); 493 page_cgroup_init_flatmem();
493 mem_init(); 494 mem_init();
494 kmem_cache_init(); 495 kmem_cache_init();
495 percpu_init_late(); 496 percpu_init_late();
496 pgtable_init(); 497 pgtable_init();
497 vmalloc_init(); 498 vmalloc_init();
498 } 499 }
499 500
500 asmlinkage __visible void __init start_kernel(void) 501 asmlinkage __visible void __init start_kernel(void)
501 { 502 {
502 char *command_line; 503 char *command_line;
503 char *after_dashes; 504 char *after_dashes;
504 505
505 /* 506 /*
506 * Need to run as early as possible, to initialize the 507 * Need to run as early as possible, to initialize the
507 * lockdep hash: 508 * lockdep hash:
508 */ 509 */
509 lockdep_init(); 510 lockdep_init();
510 set_task_stack_end_magic(&init_task); 511 set_task_stack_end_magic(&init_task);
511 smp_setup_processor_id(); 512 smp_setup_processor_id();
512 debug_objects_early_init(); 513 debug_objects_early_init();
513 514
514 /* 515 /*
515 * Set up the initial canary ASAP: 516 * Set up the initial canary ASAP:
516 */ 517 */
517 boot_init_stack_canary(); 518 boot_init_stack_canary();
518 519
519 cgroup_init_early(); 520 cgroup_init_early();
520 521
521 local_irq_disable(); 522 local_irq_disable();
522 early_boot_irqs_disabled = true; 523 early_boot_irqs_disabled = true;
523 524
524 /* 525 /*
525 * Interrupts are still disabled. Do necessary setups, then 526 * Interrupts are still disabled. Do necessary setups, then
526 * enable them 527 * enable them
527 */ 528 */
528 boot_cpu_init(); 529 boot_cpu_init();
529 page_address_init(); 530 page_address_init();
530 pr_notice("%s", linux_banner); 531 pr_notice("%s", linux_banner);
531 setup_arch(&command_line); 532 setup_arch(&command_line);
532 mm_init_cpumask(&init_mm); 533 mm_init_cpumask(&init_mm);
533 setup_command_line(command_line); 534 setup_command_line(command_line);
534 setup_nr_cpu_ids(); 535 setup_nr_cpu_ids();
535 setup_per_cpu_areas(); 536 setup_per_cpu_areas();
536 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 537 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
537 538
538 build_all_zonelists(NULL, NULL); 539 build_all_zonelists(NULL, NULL);
539 page_alloc_init(); 540 page_alloc_init();
540 541
541 pr_notice("Kernel command line: %s\n", boot_command_line); 542 pr_notice("Kernel command line: %s\n", boot_command_line);
542 parse_early_param(); 543 parse_early_param();
543 after_dashes = parse_args("Booting kernel", 544 after_dashes = parse_args("Booting kernel",
544 static_command_line, __start___param, 545 static_command_line, __start___param,
545 __stop___param - __start___param, 546 __stop___param - __start___param,
546 -1, -1, &unknown_bootoption); 547 -1, -1, &unknown_bootoption);
547 if (after_dashes) 548 if (after_dashes)
548 parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, 549 parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
549 set_init_arg); 550 set_init_arg);
550 551
551 jump_label_init(); 552 jump_label_init();
552 553
553 /* 554 /*
554 * These use large bootmem allocations and must precede 555 * These use large bootmem allocations and must precede
555 * kmem_cache_init() 556 * kmem_cache_init()
556 */ 557 */
557 setup_log_buf(0); 558 setup_log_buf(0);
558 pidhash_init(); 559 pidhash_init();
559 vfs_caches_init_early(); 560 vfs_caches_init_early();
560 sort_main_extable(); 561 sort_main_extable();
561 trap_init(); 562 trap_init();
562 mm_init(); 563 mm_init();
563 564
564 /* 565 /*
565 * Set up the scheduler prior to starting any interrupts (such as the 566 * Set up the scheduler prior to starting any interrupts (such as the
566 * timer interrupt). Full topology setup happens at smp_init() 567 * timer interrupt). Full topology setup happens at smp_init()
567 * time - but meanwhile we still have a functioning scheduler. 568 * time - but meanwhile we still have a functioning scheduler.
568 */ 569 */
569 sched_init(); 570 sched_init();
570 /* 571 /*
571 * Disable preemption - early bootup scheduling is extremely 572 * Disable preemption - early bootup scheduling is extremely
572 * fragile until we cpu_idle() for the first time. 573 * fragile until we cpu_idle() for the first time.
573 */ 574 */
574 preempt_disable(); 575 preempt_disable();
575 if (WARN(!irqs_disabled(), 576 if (WARN(!irqs_disabled(),
576 "Interrupts were enabled *very* early, fixing it\n")) 577 "Interrupts were enabled *very* early, fixing it\n"))
577 local_irq_disable(); 578 local_irq_disable();
578 idr_init_cache(); 579 idr_init_cache();
579 rcu_init(); 580 rcu_init();
580 context_tracking_init(); 581 context_tracking_init();
581 radix_tree_init(); 582 radix_tree_init();
582 /* init some links before init_ISA_irqs() */ 583 /* init some links before init_ISA_irqs() */
583 early_irq_init(); 584 early_irq_init();
584 init_IRQ(); 585 init_IRQ();
585 tick_init(); 586 tick_init();
586 rcu_init_nohz(); 587 rcu_init_nohz();
587 init_timers(); 588 init_timers();
588 hrtimers_init(); 589 hrtimers_init();
589 softirq_init(); 590 softirq_init();
590 timekeeping_init(); 591 timekeeping_init();
591 time_init(); 592 time_init();
592 sched_clock_postinit(); 593 sched_clock_postinit();
593 perf_event_init(); 594 perf_event_init();
594 profile_init(); 595 profile_init();
595 call_function_init(); 596 call_function_init();
596 WARN(!irqs_disabled(), "Interrupts were enabled early\n"); 597 WARN(!irqs_disabled(), "Interrupts were enabled early\n");
597 early_boot_irqs_disabled = false; 598 early_boot_irqs_disabled = false;
598 local_irq_enable(); 599 local_irq_enable();
599 600
600 kmem_cache_init_late(); 601 kmem_cache_init_late();
601 602
602 /* 603 /*
603 * HACK ALERT! This is early. We're enabling the console before 604 * HACK ALERT! This is early. We're enabling the console before
604 * we've done PCI setups etc, and console_init() must be aware of 605 * we've done PCI setups etc, and console_init() must be aware of
605 * this. But we do want output early, in case something goes wrong. 606 * this. But we do want output early, in case something goes wrong.
606 */ 607 */
607 console_init(); 608 console_init();
608 if (panic_later) 609 if (panic_later)
609 panic("Too many boot %s vars at `%s'", panic_later, 610 panic("Too many boot %s vars at `%s'", panic_later,
610 panic_param); 611 panic_param);
611 612
612 lockdep_info(); 613 lockdep_info();
613 614
614 /* 615 /*
615 * Need to run this when irqs are enabled, because it wants 616 * Need to run this when irqs are enabled, because it wants
616 * to self-test [hard/soft]-irqs on/off lock inversion bugs 617 * to self-test [hard/soft]-irqs on/off lock inversion bugs
617 * too: 618 * too:
618 */ 619 */
619 locking_selftest(); 620 locking_selftest();
620 621
621 #ifdef CONFIG_BLK_DEV_INITRD 622 #ifdef CONFIG_BLK_DEV_INITRD
622 if (initrd_start && !initrd_below_start_ok && 623 if (initrd_start && !initrd_below_start_ok &&
623 page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { 624 page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
624 pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n", 625 pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
625 page_to_pfn(virt_to_page((void *)initrd_start)), 626 page_to_pfn(virt_to_page((void *)initrd_start)),
626 min_low_pfn); 627 min_low_pfn);
627 initrd_start = 0; 628 initrd_start = 0;
628 } 629 }
629 #endif 630 #endif
630 page_cgroup_init(); 631 page_cgroup_init();
631 debug_objects_mem_init(); 632 debug_objects_mem_init();
632 kmemleak_init(); 633 kmemleak_init();
633 setup_per_cpu_pageset(); 634 setup_per_cpu_pageset();
634 numa_policy_init(); 635 numa_policy_init();
635 if (late_time_init) 636 if (late_time_init)
636 late_time_init(); 637 late_time_init();
637 sched_clock_init(); 638 sched_clock_init();
638 calibrate_delay(); 639 calibrate_delay();
639 pidmap_init(); 640 pidmap_init();
640 anon_vma_init(); 641 anon_vma_init();
641 acpi_early_init(); 642 acpi_early_init();
642 #ifdef CONFIG_X86 643 #ifdef CONFIG_X86
643 if (efi_enabled(EFI_RUNTIME_SERVICES)) 644 if (efi_enabled(EFI_RUNTIME_SERVICES))
644 efi_enter_virtual_mode(); 645 efi_enter_virtual_mode();
645 #endif 646 #endif
646 #ifdef CONFIG_X86_ESPFIX64 647 #ifdef CONFIG_X86_ESPFIX64
647 /* Should be run before the first non-init thread is created */ 648 /* Should be run before the first non-init thread is created */
648 init_espfix_bsp(); 649 init_espfix_bsp();
649 #endif 650 #endif
650 thread_info_cache_init(); 651 thread_info_cache_init();
651 cred_init(); 652 cred_init();
652 fork_init(totalram_pages); 653 fork_init(totalram_pages);
653 proc_caches_init(); 654 proc_caches_init();
654 buffer_init(); 655 buffer_init();
655 key_init(); 656 key_init();
656 security_init(); 657 security_init();
657 dbg_late_init(); 658 dbg_late_init();
658 vfs_caches_init(totalram_pages); 659 vfs_caches_init(totalram_pages);
659 signals_init(); 660 signals_init();
660 /* rootfs populating might need page-writeback */ 661 /* rootfs populating might need page-writeback */
661 page_writeback_init(); 662 page_writeback_init();
662 proc_root_init(); 663 proc_root_init();
664 nsfs_init();
663 cgroup_init(); 665 cgroup_init();
664 cpuset_init(); 666 cpuset_init();
665 taskstats_init_early(); 667 taskstats_init_early();
666 delayacct_init(); 668 delayacct_init();
667 669
668 check_bugs(); 670 check_bugs();
669 671
670 sfi_init_late(); 672 sfi_init_late();
671 673
672 if (efi_enabled(EFI_RUNTIME_SERVICES)) { 674 if (efi_enabled(EFI_RUNTIME_SERVICES)) {
673 efi_late_init(); 675 efi_late_init();
674 efi_free_boot_services(); 676 efi_free_boot_services();
675 } 677 }
676 678
677 ftrace_init(); 679 ftrace_init();
678 680
679 /* Do the rest non-__init'ed, we're now alive */ 681 /* Do the rest non-__init'ed, we're now alive */
680 rest_init(); 682 rest_init();
681 } 683 }
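
The one functional change to start_kernel() in this commit is the nsfs_init() call slotted in after proc_root_init() above: it pins the internal mount of the new pseudo-filesystem so the targets of /proc/*/ns/* symlinks have somewhere to live before any task looks at them. As a rough sketch of what fs/nsfs.c has to do at this point, assuming the stock mount_pseudo()/kern_mount() helpers for kernel-internal filesystems (nsfs_ops and ns_dentry_operations come from the rest of the patch, and the exact code may differ):

    static struct vfsmount *nsfs_mnt;

    /* Never register_filesystem()'d, so the type does not appear in
     * /proc/filesystems and cannot be mounted from userspace. */
    static struct dentry *nsfs_mount(struct file_system_type *fs_type,
                                     int flags, const char *dev_name, void *data)
    {
            return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
                                &ns_dentry_operations, NSFS_MAGIC);
    }

    static struct file_system_type nsfs = {
            .name    = "nsfs",
            .mount   = nsfs_mount,
            .kill_sb = kill_anon_super,
    };

    void __init nsfs_init(void)
    {
            /* Pin one internal instance for the life of the system. */
            nsfs_mnt = kern_mount(&nsfs);
            if (IS_ERR(nsfs_mnt))
                    panic("can't set nsfs up\n");
    }

With that mount pinned, ns_get_path() can hand nd_jump_link() a consistent <nsfs_mnt, dentry> pair without ever poking at the procfs mount.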
682 684
683 /* Call all constructor functions linked into the kernel. */ 685 /* Call all constructor functions linked into the kernel. */
684 static void __init do_ctors(void) 686 static void __init do_ctors(void)
685 { 687 {
686 #ifdef CONFIG_CONSTRUCTORS 688 #ifdef CONFIG_CONSTRUCTORS
687 ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; 689 ctor_fn_t *fn = (ctor_fn_t *) __ctors_start;
688 690
689 for (; fn < (ctor_fn_t *) __ctors_end; fn++) 691 for (; fn < (ctor_fn_t *) __ctors_end; fn++)
690 (*fn)(); 692 (*fn)();
691 #endif 693 #endif
692 } 694 }
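
do_ctors() just walks the linker-collected __ctors_start..__ctors_end table. Entries get there through the compiler's constructor machinery; in the kernel they are normally emitted by instrumentation such as gcov rather than written by hand, but as a hypothetical illustration (foo_ctor is not in the tree):

    #ifdef CONFIG_CONSTRUCTORS
    /* gcc collects constructor-attributed functions into the section
     * that do_ctors() iterates; this runs from do_basic_setup(),
     * before any initcall. */
    static void __attribute__((constructor)) foo_ctor(void)
    {
            pr_info("foo_ctor: ran before initcalls\n");
    }
    #endif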
693 695
694 bool initcall_debug; 696 bool initcall_debug;
695 core_param(initcall_debug, initcall_debug, bool, 0644); 697 core_param(initcall_debug, initcall_debug, bool, 0644);
696 698
697 #ifdef CONFIG_KALLSYMS 699 #ifdef CONFIG_KALLSYMS
698 struct blacklist_entry { 700 struct blacklist_entry {
699 struct list_head next; 701 struct list_head next;
700 char *buf; 702 char *buf;
701 }; 703 };
702 704
703 static __initdata_or_module LIST_HEAD(blacklisted_initcalls); 705 static __initdata_or_module LIST_HEAD(blacklisted_initcalls);
704 706
705 static int __init initcall_blacklist(char *str) 707 static int __init initcall_blacklist(char *str)
706 { 708 {
707 char *str_entry; 709 char *str_entry;
708 struct blacklist_entry *entry; 710 struct blacklist_entry *entry;
709 711
710 /* str argument is a comma-separated list of functions */ 712 /* str argument is a comma-separated list of functions */
711 do { 713 do {
712 str_entry = strsep(&str, ","); 714 str_entry = strsep(&str, ",");
713 if (str_entry) { 715 if (str_entry) {
714 pr_debug("blacklisting initcall %s\n", str_entry); 716 pr_debug("blacklisting initcall %s\n", str_entry);
715 entry = alloc_bootmem(sizeof(*entry)); 717 entry = alloc_bootmem(sizeof(*entry));
716 entry->buf = alloc_bootmem(strlen(str_entry) + 1); 718 entry->buf = alloc_bootmem(strlen(str_entry) + 1);
717 strcpy(entry->buf, str_entry); 719 strcpy(entry->buf, str_entry);
718 list_add(&entry->next, &blacklisted_initcalls); 720 list_add(&entry->next, &blacklisted_initcalls);
719 } 721 }
720 } while (str_entry); 722 } while (str_entry);
721 723
722 return 0; 724 return 0;
723 } 725 }
724 726
725 static bool __init_or_module initcall_blacklisted(initcall_t fn) 727 static bool __init_or_module initcall_blacklisted(initcall_t fn)
726 { 728 {
727 struct list_head *tmp; 729 struct list_head *tmp;
728 struct blacklist_entry *entry; 730 struct blacklist_entry *entry;
729 char *fn_name; 731 char *fn_name;
730 732
731 fn_name = kasprintf(GFP_KERNEL, "%pf", fn); 733 fn_name = kasprintf(GFP_KERNEL, "%pf", fn);
732 if (!fn_name) 734 if (!fn_name)
733 return false; 735 return false;
734 736
735 list_for_each(tmp, &blacklisted_initcalls) { 737 list_for_each(tmp, &blacklisted_initcalls) {
736 entry = list_entry(tmp, struct blacklist_entry, next); 738 entry = list_entry(tmp, struct blacklist_entry, next);
737 if (!strcmp(fn_name, entry->buf)) { 739 if (!strcmp(fn_name, entry->buf)) {
738 pr_debug("initcall %s blacklisted\n", fn_name); 740 pr_debug("initcall %s blacklisted\n", fn_name);
739 kfree(fn_name); 741 kfree(fn_name);
740 return true; 742 return true;
741 } 743 }
742 } 744 }
743 745
744 kfree(fn_name); 746 kfree(fn_name);
745 return false; 747 return false;
746 } 748 }
747 #else 749 #else
748 static int __init initcall_blacklist(char *str) 750 static int __init initcall_blacklist(char *str)
749 { 751 {
750 pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); 752 pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n");
751 return 0; 753 return 0;
752 } 754 }
753 755
754 static bool __init_or_module initcall_blacklisted(initcall_t fn) 756 static bool __init_or_module initcall_blacklisted(initcall_t fn)
755 { 757 {
756 return false; 758 return false;
757 } 759 }
758 #endif 760 #endif
759 __setup("initcall_blacklist=", initcall_blacklist); 761 __setup("initcall_blacklist=", initcall_blacklist);
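
Taken together, the two halves above implement a boot-time filter: initcall_blacklist() stashes the comma-separated names from the command line in bootmem, and initcall_blacklisted() compares each initcall's symbolic name (rendered with the %pf printk format, i.e. the bare symbol without offset) against that list. A hypothetical driver initcall would be suppressed with:

    initcall_blacklist=foo_driver_init

On kernels without CONFIG_KALLSYMS there are no symbol names to match against, so the parameter is accepted but does nothing beyond the pr_warn().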
760 762
761 static int __init_or_module do_one_initcall_debug(initcall_t fn) 763 static int __init_or_module do_one_initcall_debug(initcall_t fn)
762 { 764 {
763 ktime_t calltime, delta, rettime; 765 ktime_t calltime, delta, rettime;
764 unsigned long long duration; 766 unsigned long long duration;
765 int ret; 767 int ret;
766 768
767 printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); 769 printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current));
768 calltime = ktime_get(); 770 calltime = ktime_get();
769 ret = fn(); 771 ret = fn();
770 rettime = ktime_get(); 772 rettime = ktime_get();
771 delta = ktime_sub(rettime, calltime); 773 delta = ktime_sub(rettime, calltime);
772 duration = (unsigned long long) ktime_to_ns(delta) >> 10; 774 duration = (unsigned long long) ktime_to_ns(delta) >> 10;
773 printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", 775 printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n",
774 fn, ret, duration); 776 fn, ret, duration);
775 777
776 return ret; 778 return ret;
777 } 779 }
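
Booting with initcall_debug sends every initcall through this wrapper, bracketing it with output along these lines (symbol and timing are illustrative):

    calling  foo_driver_init+0x0/0x38 @ 1
    initcall foo_driver_init+0x0/0x38 returned 0 after 132 usecs

Note that the duration is ktime_to_ns(delta) >> 10, i.e. nanoseconds divided by 1024 rather than 1000, so the reported "usecs" run about 2% low: a cheap shift that is good enough for boot profiling.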
778 780
779 int __init_or_module do_one_initcall(initcall_t fn) 781 int __init_or_module do_one_initcall(initcall_t fn)
780 { 782 {
781 int count = preempt_count(); 783 int count = preempt_count();
782 int ret; 784 int ret;
783 char msgbuf[64]; 785 char msgbuf[64];
784 786
785 if (initcall_blacklisted(fn)) 787 if (initcall_blacklisted(fn))
786 return -EPERM; 788 return -EPERM;
787 789
788 if (initcall_debug) 790 if (initcall_debug)
789 ret = do_one_initcall_debug(fn); 791 ret = do_one_initcall_debug(fn);
790 else 792 else
791 ret = fn(); 793 ret = fn();
792 794
793 msgbuf[0] = 0; 795 msgbuf[0] = 0;
794 796
795 if (preempt_count() != count) { 797 if (preempt_count() != count) {
796 sprintf(msgbuf, "preemption imbalance "); 798 sprintf(msgbuf, "preemption imbalance ");
797 preempt_count_set(count); 799 preempt_count_set(count);
798 } 800 }
799 if (irqs_disabled()) { 801 if (irqs_disabled()) {
800 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); 802 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
801 local_irq_enable(); 803 local_irq_enable();
802 } 804 }
803 WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); 805 WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf);
804 806
805 return ret; 807 return ret;
806 } 808 }
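
Beyond dispatch, do_one_initcall() also polices each call: a preempt_count() imbalance is repaired with preempt_count_set(), interrupts left disabled are re-enabled, and either offence is reported through the single WARN at the end, so one misbehaving initcall cannot silently wedge the rest of boot.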
807 809
808 810
809 extern initcall_t __initcall_start[]; 811 extern initcall_t __initcall_start[];
810 extern initcall_t __initcall0_start[]; 812 extern initcall_t __initcall0_start[];
811 extern initcall_t __initcall1_start[]; 813 extern initcall_t __initcall1_start[];
812 extern initcall_t __initcall2_start[]; 814 extern initcall_t __initcall2_start[];
813 extern initcall_t __initcall3_start[]; 815 extern initcall_t __initcall3_start[];
814 extern initcall_t __initcall4_start[]; 816 extern initcall_t __initcall4_start[];
815 extern initcall_t __initcall5_start[]; 817 extern initcall_t __initcall5_start[];
816 extern initcall_t __initcall6_start[]; 818 extern initcall_t __initcall6_start[];
817 extern initcall_t __initcall7_start[]; 819 extern initcall_t __initcall7_start[];
818 extern initcall_t __initcall_end[]; 820 extern initcall_t __initcall_end[];
819 821
820 static initcall_t *initcall_levels[] __initdata = { 822 static initcall_t *initcall_levels[] __initdata = {
821 __initcall0_start, 823 __initcall0_start,
822 __initcall1_start, 824 __initcall1_start,
823 __initcall2_start, 825 __initcall2_start,
824 __initcall3_start, 826 __initcall3_start,
825 __initcall4_start, 827 __initcall4_start,
826 __initcall5_start, 828 __initcall5_start,
827 __initcall6_start, 829 __initcall6_start,
828 __initcall7_start, 830 __initcall7_start,
829 __initcall_end, 831 __initcall_end,
830 }; 832 };
831 833
832 /* Keep these in sync with initcalls in include/linux/init.h */ 834 /* Keep these in sync with initcalls in include/linux/init.h */
833 static char *initcall_level_names[] __initdata = { 835 static char *initcall_level_names[] __initdata = {
834 "early", 836 "early",
835 "core", 837 "core",
836 "postcore", 838 "postcore",
837 "arch", 839 "arch",
838 "subsys", 840 "subsys",
839 "fs", 841 "fs",
840 "device", 842 "device",
841 "late", 843 "late",
842 }; 844 };
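
Each of these names labels one of the __initcall<N>_start sections declared above, populated via the *_initcall() macros from include/linux/init.h. (The array's "early" is the level-0 slot used by pure_initcall(); the pre-SMP early_initcall() entries live before __initcall0_start and are run separately by do_pre_smp_initcalls() below.) A minimal sketch, with foo_bus_init as a hypothetical example:

    /* Lands in the "subsys" (level 4) section: after every "arch"
     * (level 3) initcall, before any "fs" (level 5) one. */
    static int __init foo_bus_init(void)
    {
            return 0;
    }
    subsys_initcall(foo_bus_init);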
843 845
844 static void __init do_initcall_level(int level) 846 static void __init do_initcall_level(int level)
845 { 847 {
846 initcall_t *fn; 848 initcall_t *fn;
847 849
848 strcpy(initcall_command_line, saved_command_line); 850 strcpy(initcall_command_line, saved_command_line);
849 parse_args(initcall_level_names[level], 851 parse_args(initcall_level_names[level],
850 initcall_command_line, __start___param, 852 initcall_command_line, __start___param,
851 __stop___param - __start___param, 853 __stop___param - __start___param,
852 level, level, 854 level, level,
853 &repair_env_string); 855 &repair_env_string);
854 856
855 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) 857 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
856 do_one_initcall(*fn); 858 do_one_initcall(*fn);
857 } 859 }
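
The command line is re-copied from saved_command_line on every level because parse_args() consumes its buffer in place; passing the level as both the minimum and maximum restricts each pass to parameters registered for exactly that level, so built-in module parameters take effect just before the corresponding initcalls run.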
858 860
859 static void __init do_initcalls(void) 861 static void __init do_initcalls(void)
860 { 862 {
861 int level; 863 int level;
862 864
863 for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) 865 for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
864 do_initcall_level(level); 866 do_initcall_level(level);
865 } 867 }
866 868
867 /* 869 /*
868 * Ok, the machine is now initialized. None of the devices 870 * Ok, the machine is now initialized. None of the devices
869 * have been touched yet, but the CPU subsystem is up and 871 * have been touched yet, but the CPU subsystem is up and
870 * running, and memory and process management works. 872 * running, and memory and process management works.
871 * 873 *
872 * Now we can finally start doing some real work.. 874 * Now we can finally start doing some real work..
873 */ 875 */
874 static void __init do_basic_setup(void) 876 static void __init do_basic_setup(void)
875 { 877 {
876 cpuset_init_smp(); 878 cpuset_init_smp();
877 usermodehelper_init(); 879 usermodehelper_init();
878 shmem_init(); 880 shmem_init();
879 driver_init(); 881 driver_init();
880 init_irq_proc(); 882 init_irq_proc();
881 do_ctors(); 883 do_ctors();
882 usermodehelper_enable(); 884 usermodehelper_enable();
883 do_initcalls(); 885 do_initcalls();
884 random_int_secret_init(); 886 random_int_secret_init();
885 } 887 }
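
Ordering matters here: usermodehelper_enable() immediately precedes do_initcalls(), so initcalls are already free to call request_module() and spawn userspace helpers, while driver_init() and init_irq_proc() have set up the device model and /proc/irq beforehand.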
886 888
887 static void __init do_pre_smp_initcalls(void) 889 static void __init do_pre_smp_initcalls(void)
888 { 890 {
889 initcall_t *fn; 891 initcall_t *fn;
890 892
891 for (fn = __initcall_start; fn < __initcall0_start; fn++) 893 for (fn = __initcall_start; fn < __initcall0_start; fn++)
892 do_one_initcall(*fn); 894 do_one_initcall(*fn);
893 } 895 }
894 896
895 /* 897 /*
896 * This function requests modules which should be loaded by default and is 898 * This function requests modules which should be loaded by default and is
897 * called twice right after initrd is mounted and right before init is 899 * called twice right after initrd is mounted and right before init is
898 * exec'd. If such modules are on either initrd or rootfs, they will be 900 * exec'd. If such modules are on either initrd or rootfs, they will be
899 * loaded before control is passed to userland. 901 * loaded before control is passed to userland.
900 */ 902 */
901 void __init load_default_modules(void) 903 void __init load_default_modules(void)
902 { 904 {
903 load_default_elevator_module(); 905 load_default_elevator_module();
904 } 906 }
905 907
906 static int run_init_process(const char *init_filename) 908 static int run_init_process(const char *init_filename)
907 { 909 {
908 argv_init[0] = init_filename; 910 argv_init[0] = init_filename;
909 return do_execve(getname_kernel(init_filename), 911 return do_execve(getname_kernel(init_filename),
910 (const char __user *const __user *)argv_init, 912 (const char __user *const __user *)argv_init,
911 (const char __user *const __user *)envp_init); 913 (const char __user *const __user *)envp_init);
912 } 914 }
913 915
914 static int try_to_run_init_process(const char *init_filename) 916 static int try_to_run_init_process(const char *init_filename)
915 { 917 {
916 int ret; 918 int ret;
917 919
918 ret = run_init_process(init_filename); 920 ret = run_init_process(init_filename);
919 921
920 if (ret && ret != -ENOENT) { 922 if (ret && ret != -ENOENT) {
921 pr_err("Starting init: %s exists but couldn't execute it (error %d)\n", 923 pr_err("Starting init: %s exists but couldn't execute it (error %d)\n",
922 init_filename, ret); 924 init_filename, ret);
923 } 925 }
924 926
925 return ret; 927 return ret;
926 } 928 }
927 929
928 static noinline void __init kernel_init_freeable(void); 930 static noinline void __init kernel_init_freeable(void);
929 931
930 static int __ref kernel_init(void *unused) 932 static int __ref kernel_init(void *unused)
931 { 933 {
932 int ret; 934 int ret;
933 935
934 kernel_init_freeable(); 936 kernel_init_freeable();
935 /* need to finish all async __init code before freeing the memory */ 937 /* need to finish all async __init code before freeing the memory */
936 async_synchronize_full(); 938 async_synchronize_full();
937 free_initmem(); 939 free_initmem();
938 mark_rodata_ro(); 940 mark_rodata_ro();
939 system_state = SYSTEM_RUNNING; 941 system_state = SYSTEM_RUNNING;
940 numa_default_policy(); 942 numa_default_policy();
941 943
942 flush_delayed_fput(); 944 flush_delayed_fput();
943 945
944 if (ramdisk_execute_command) { 946 if (ramdisk_execute_command) {
945 ret = run_init_process(ramdisk_execute_command); 947 ret = run_init_process(ramdisk_execute_command);
946 if (!ret) 948 if (!ret)
947 return 0; 949 return 0;
948 pr_err("Failed to execute %s (error %d)\n", 950 pr_err("Failed to execute %s (error %d)\n",
949 ramdisk_execute_command, ret); 951 ramdisk_execute_command, ret);
950 } 952 }
951 953
952 /* 954 /*
953 * We try each of these until one succeeds. 955 * We try each of these until one succeeds.
954 * 956 *
955 * The Bourne shell can be used instead of init if we are 957 * The Bourne shell can be used instead of init if we are
956 * trying to recover a really broken machine. 958 * trying to recover a really broken machine.
957 */ 959 */
958 if (execute_command) { 960 if (execute_command) {
959 ret = run_init_process(execute_command); 961 ret = run_init_process(execute_command);
960 if (!ret) 962 if (!ret)
961 return 0; 963 return 0;
962 pr_err("Failed to execute %s (error %d). Attempting defaults...\n", 964 pr_err("Failed to execute %s (error %d). Attempting defaults...\n",
963 execute_command, ret); 965 execute_command, ret);
964 } 966 }
965 if (!try_to_run_init_process("/sbin/init") || 967 if (!try_to_run_init_process("/sbin/init") ||
966 !try_to_run_init_process("/etc/init") || 968 !try_to_run_init_process("/etc/init") ||
967 !try_to_run_init_process("/bin/init") || 969 !try_to_run_init_process("/bin/init") ||
968 !try_to_run_init_process("/bin/sh")) 970 !try_to_run_init_process("/bin/sh"))
969 return 0; 971 return 0;
970 972
971 panic("No working init found. Try passing init= option to kernel. " 973 panic("No working init found. Try passing init= option to kernel. "
972 "See Linux Documentation/init.txt for guidance."); 974 "See Linux Documentation/init.txt for guidance.");
973 } 975 }
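
The resulting search order is: the rdinit= target (ramdisk_execute_command) when an initramfs supplies one, then the init= target (execute_command), then the hard-wired /sbin/init, /etc/init, /bin/init, /bin/sh chain, and finally the panic. In practice that means a broken userland can usually be rescued from the boot loader with one of:

    init=/bin/sh        (root filesystem case)
    rdinit=/bin/sh      (initramfs case)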
974 976
975 static noinline void __init kernel_init_freeable(void) 977 static noinline void __init kernel_init_freeable(void)
976 { 978 {
977 /* 979 /*
978 * Wait until kthreadd is all set-up. 980 * Wait until kthreadd is all set-up.
979 */ 981 */
980 wait_for_completion(&kthreadd_done); 982 wait_for_completion(&kthreadd_done);
981 983
982 /* Now the scheduler is fully set up and can do blocking allocations */ 984 /* Now the scheduler is fully set up and can do blocking allocations */
983 gfp_allowed_mask = __GFP_BITS_MASK; 985 gfp_allowed_mask = __GFP_BITS_MASK;
984 986
985 /* 987 /*
986 * init can allocate pages on any node 988 * init can allocate pages on any node
987 */ 989 */
988 set_mems_allowed(node_states[N_MEMORY]); 990 set_mems_allowed(node_states[N_MEMORY]);
989 /* 991 /*
990 * init can run on any cpu. 992 * init can run on any cpu.
991 */ 993 */
992 set_cpus_allowed_ptr(current, cpu_all_mask); 994 set_cpus_allowed_ptr(current, cpu_all_mask);
993 995
994 cad_pid = task_pid(current); 996 cad_pid = task_pid(current);
995 997
996 smp_prepare_cpus(setup_max_cpus); 998 smp_prepare_cpus(setup_max_cpus);
997 999
998 do_pre_smp_initcalls(); 1000 do_pre_smp_initcalls();
999 lockup_detector_init(); 1001 lockup_detector_init();
1000 1002
1001 smp_init(); 1003 smp_init();
1002 sched_init_smp(); 1004 sched_init_smp();
1003 1005
1004 do_basic_setup(); 1006 do_basic_setup();
1005 1007
1006 /* Open the /dev/console on the rootfs, this should never fail */ 1008 /* Open the /dev/console on the rootfs, this should never fail */
1007 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) 1009 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
1008 pr_err("Warning: unable to open an initial console.\n"); 1010 pr_err("Warning: unable to open an initial console.\n");
1009 1011
1010 (void) sys_dup(0); 1012 (void) sys_dup(0);
1011 (void) sys_dup(0); 1013 (void) sys_dup(0);
1012 /* 1014 /*
1013 * check if there is an early userspace init. If yes, let it do all 1015 * check if there is an early userspace init. If yes, let it do all
1014 * the work 1016 * the work
1015 */ 1017 */
1016 1018
1017 if (!ramdisk_execute_command) 1019 if (!ramdisk_execute_command)
1018 ramdisk_execute_command = "/init"; 1020 ramdisk_execute_command = "/init";
1019 1021
1020 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { 1022 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
1021 ramdisk_execute_command = NULL; 1023 ramdisk_execute_command = NULL;
1022 prepare_namespace(); 1024 prepare_namespace();
1023 } 1025 }
1024 1026
1025 /* 1027 /*
1026 * Ok, we have completed the initial bootup, and 1028 * Ok, we have completed the initial bootup, and
1027 * we're essentially up and running. Get rid of the 1029 * we're essentially up and running. Get rid of the
1028 * initmem segments and start the user-mode stuff.. 1030 * initmem segments and start the user-mode stuff..
1029 */ 1031 */
1030 1032
1031 /* rootfs is available now, try loading default modules */ 1033 /* rootfs is available now, try loading default modules */
1032 load_default_modules(); 1034 load_default_modules();
1033 } 1035 }
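
One detail worth calling out near the end of kernel_init_freeable(): the sys_open() of /dev/console lands on fd 0, and the two sys_dup(0) calls clone it onto fds 1 and 2, which is how PID 1 (and everything it spawns) comes to own a conventional stdin/stdout/stderr triple before any init binary is exec'd.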
1034 1036