Commit e149ed2b805fefdccf7ccdfc19eca22fdd4514ac
Parent: f77c80142e
Exists in ti-lsk-linux-4.1.y and in 10 other branches.
take the targets of /proc/*/ns/* symlinks to separate fs
New pseudo-filesystem: nsfs. Targets of /proc/*/ns/* live there now. It's not
mountable (not even registered, so it's not in /proc/filesystems, etc.).
Files on it *are* bindable - we explicitly permit that in do_loopback().

This stuff lives in fs/nsfs.c now; proc_ns_fget() moved there as well.
get_proc_ns() is a macro now (it's simply returning ->i_private; would have
been an inline, if not for header ordering headache). proc_ns_inode() is an
ex-parrot.

The interface used in procfs is ns_get_path(path, task, ops) and
ns_get_name(buf, size, task, ops).

Dentries and inodes are never hashed; a non-counting reference to dentry is
stashed in ns_common (removed by ->d_prune()) and reused by ns_get_path() if
present. See ns_get_path()/ns_prune_dentry/nsfs_evict() for details of that
mechanism.

As a result, proc_ns_follow_link() has stopped poking in nd->path.mnt; it
does nd_jump_link() on a consistent <vfsmount,dentry> pair it gets from
ns_get_path().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
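Because files on nsfs stay bindable, the familiar trick of pinning a namespace
by bind-mounting its /proc/*/ns/* entry keeps working against the new
filesystem. A minimal userspace sketch, not part of this commit: the /tmp
path and the choice of the net namespace are illustrative assumptions, and it
needs CAP_SYS_ADMIN to run.

/*
 * Pin the caller's network namespace by bind-mounting its nsfs inode
 * onto a plain file, the same mechanism "ip netns add" relies on.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;

	/* Create an empty file to serve as the bind-mount target. */
	int fd = open("/tmp/netns-ref", O_CREAT | O_RDONLY, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	close(fd);

	/* Bind the symlink's target (an nsfs inode) onto the file; the
	 * namespace stays alive until this mount is removed. */
	if (mount("/proc/self/ns/net", "/tmp/netns-ref", NULL, MS_BIND, NULL)) {
		perror("mount");
		return 1;
	}

	/* st_dev/st_ino now identify the inode on the unmountable nsfs. */
	if (stat("/tmp/netns-ref", &st) == 0)
		printf("pinned nsfs inode %lu\n", (unsigned long)st.st_ino);
	return 0;
}

Unmounting /tmp/netns-ref drops that reference and lets the namespace be
freed, just as it did when the targets lived on procfs.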
Showing 10 changed files with 208 additions and 161 deletions.
fs/Makefile
 #
 # Makefile for the Linux filesystems.
 #
 # 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
 # Rewritten to use lists instead of if-statements.
 #

 obj-y :=	open.o read_write.o file_table.o super.o \
		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
		ioctl.o readdir.o select.o dcache.o inode.o \
		attr.o bad_inode.o file.o filesystems.o namespace.o \
		seq_file.o xattr.o libfs.o fs-writeback.o \
		pnode.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o statfs.o fs_pin.o
+		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o

 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
 else
 obj-y +=	no-block.o
 endif

 obj-$(CONFIG_PROC_FS) += proc_namespace.o

 obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)		+= aio.o
 obj-$(CONFIG_FILE_LOCKING)	+= locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86)	+= binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
 obj-$(CONFIG_BINFMT_SCRIPT)	+= binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_SOM)	+= binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o

 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o

 obj-$(CONFIG_FHANDLE)		+= fhandle.o

 obj-y				+= quota/

 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-$(CONFIG_KERNFS)		+= kernfs/
 obj-$(CONFIG_SYSFS)		+= sysfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/

 obj-$(CONFIG_PROFILING)	+= dcookies.o
 obj-$(CONFIG_DLM)		+= dlm/

 # Do not add any filesystems before this line
 obj-$(CONFIG_FSCACHE)		+= fscache/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
 # unless explicitly requested by rootfstype
 obj-$(CONFIG_EXT4_FS)		+= ext4/
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_SQUASHFS)		+= squashfs/
 obj-y				+= ramfs/
 obj-$(CONFIG_HUGETLBFS)	+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
 obj-$(CONFIG_MINIX_FS)		+= minix/
 obj-$(CONFIG_FAT_FS)		+= fat/
 obj-$(CONFIG_BFS_FS)		+= bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS)		+= hfs/
 obj-$(CONFIG_ECRYPT_FS)	+= ecryptfs/
 obj-$(CONFIG_VXFS_FS)		+= freevxfs/
 obj-$(CONFIG_NFS_FS)		+= nfs/
 obj-$(CONFIG_EXPORTFS)		+= exportfs/
 obj-$(CONFIG_NFSD)		+= nfsd/
 obj-$(CONFIG_LOCKD)		+= lockd/
 obj-$(CONFIG_NLS)		+= nls/
 obj-$(CONFIG_SYSV_FS)		+= sysv/
 obj-$(CONFIG_CIFS)		+= cifs/
 obj-$(CONFIG_NCP_FS)		+= ncpfs/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
 obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
 obj-$(CONFIG_JFFS2_FS)		+= jffs2/
 obj-$(CONFIG_LOGFS)		+= logfs/
 obj-$(CONFIG_UBIFS_FS)		+= ubifs/
 obj-$(CONFIG_AFFS_FS)		+= affs/
 obj-$(CONFIG_ROMFS_FS)		+= romfs/
 obj-$(CONFIG_QNX4FS_FS)	+= qnx4/
 obj-$(CONFIG_QNX6FS_FS)	+= qnx6/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_OVERLAYFS_FS)	+= overlayfs/
 obj-$(CONFIG_UDF_FS)		+= udf/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_OMFS_FS)		+= omfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
 obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
 obj-$(CONFIG_NILFS2_FS)	+= nilfs2/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_CACHEFILES)	+= cachefiles/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
 obj-$(CONFIG_F2FS_FS)		+= f2fs/
 obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)	+= efivarfs/
fs/internal.h
 /* fs/ internal definitions
  *
  * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */

 struct super_block;
 struct file_system_type;
 struct linux_binprm;
 struct path;
 struct mount;

 /*
  * block_dev.c
  */
 #ifdef CONFIG_BLOCK
 extern void __init bdev_cache_init(void);

 extern int __sync_blockdev(struct block_device *bdev, int wait);

 #else
 static inline void bdev_cache_init(void)
 {
 }

 static inline int __sync_blockdev(struct block_device *bdev, int wait)
 {
 	return 0;
 }
 #endif

 /*
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);

 /*
  * char_dev.c
  */
 extern void __init chrdev_init(void);

 /*
  * namei.c
  */
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);

 /*
  * namespace.c
  */
 extern int copy_mount_options(const void __user *, unsigned long *);
 extern char *copy_mount_string(const void __user *);

 extern struct vfsmount *lookup_mnt(struct path *);
 extern int finish_automount(struct vfsmount *, struct path *);

 extern int sb_prepare_remount_readonly(struct super_block *);

 extern void __init mnt_init(void);

 extern int __mnt_want_write(struct vfsmount *);
 extern int __mnt_want_write_file(struct file *);
 extern void __mnt_drop_write(struct vfsmount *);
 extern void __mnt_drop_write_file(struct file *);

 /*
  * fs_struct.c
  */
 extern void chroot_fs_refs(const struct path *, const struct path *);

 /*
  * file_table.c
  */
 extern struct file *get_empty_filp(void);

 /*
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
 extern bool grab_super_passive(struct super_block *sb);
 extern struct dentry *mount_fs(struct file_system_type *,
 			       int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);

 /*
  * open.c
  */
 struct open_flags {
 	int open_flag;
 	umode_t mode;
 	int acc_mode;
 	int intent;
 	int lookup_flags;
 };
 extern struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op);
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *);

 extern long do_handle_open(int mountdirfd,
 			   struct file_handle __user *ufh, int open_flag);
 extern int open_check_o_direct(struct file *f);

 /*
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
 extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
 			    int nid);
 extern void inode_add_lru(struct inode *inode);

 /*
  * fs-writeback.c
  */
 extern void inode_wb_list_del(struct inode *inode);

 extern long get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);

 /*
  * dcache.c
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
 extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
 			    int nid);

 /*
  * read_write.c
  */
 extern int rw_verify_area(int, struct file *, const loff_t *, size_t);

 /*
  * pipe.c
  */
 extern const struct file_operations pipefifo_fops;

 /*
  * fs_pin.c
  */
 extern void sb_pin_kill(struct super_block *sb);
 extern void mnt_pin_kill(struct mount *m);
+
+/*
+ * fs/nsfs.c
+ */
+extern struct dentry_operations ns_dentry_operations;
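The ns_dentry_operations declared above is where the commit message's
stashing scheme is wired up: a non-counting dentry pointer kept in ns_common
and cleared by ->d_prune() when the dentry is discarded. A rough sketch of
that shape follows; field and helper names are taken from the commit
message's description, not quoted verbatim from fs/nsfs.c.

#include <linux/dcache.h>
#include <linux/ns_common.h>
#include <linux/proc_ns.h>

/* Sketch only: assumes ns_common keeps the non-counting dentry reference
 * in an atomic_long_t ("stashed"), as the commit message describes. */
static void ns_prune_dentry(struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	if (inode) {
		struct ns_common *ns = inode->i_private; /* get_proc_ns() */
		/* Dentry is on its way out: forget the stash so
		 * ns_get_path() allocates a fresh one next time. */
		atomic_long_set(&ns->stashed, 0);
	}
}

/* Dentries are never hashed, so no d_hash/d_compare are needed; names
 * such as "net:[4026531956]" are generated on demand by ->d_dname(). */
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;

	return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
			     ns_ops->name, dentry->d_inode->i_ino);
}

struct dentry_operations ns_dentry_operations = {
	.d_prune = ns_prune_dentry,
	.d_dname = ns_dname,
};

Per the commit message, ns_get_path() reuses the stashed dentry when it is
still live and falls back to allocating a new dentry/inode pair otherwise,
which is why the stashed reference can stay non-counting.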
fs/namespace.c
 /*
  * linux/fs/namespace.c
  *
  * (C) Copyright Al Viro 2000, 2001
  *	Released under GPL v2.
  *
  * Based on code from fs/super.c, copyright Linus Torvalds and others.
  * Heavily rewritten.
  */

 #include <linux/syscalls.h>
 #include <linux/export.h>
 #include <linux/capability.h>
 #include <linux/mnt_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/idr.h>
 #include <linux/init.h>		/* init_rootfs */
 #include <linux/fs_struct.h>	/* get_fs_root et.al. */
 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
 #include <linux/uaccess.h>
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
 #include <linux/bootmem.h>
 #include <linux/task_work.h>
 #include "pnode.h"
 #include "internal.h"

 static unsigned int m_hash_mask __read_mostly;
 static unsigned int m_hash_shift __read_mostly;
 static unsigned int mp_hash_mask __read_mostly;
 static unsigned int mp_hash_shift __read_mostly;

 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
 {
 	if (!str)
 		return 0;
 	mhash_entries = simple_strtoul(str, &str, 0);
 	return 1;
 }
 __setup("mhash_entries=", set_mhash_entries);

 static __initdata unsigned long mphash_entries;
 static int __init set_mphash_entries(char *str)
 {
 	if (!str)
 		return 0;
 	mphash_entries = simple_strtoul(str, &str, 0);
 	return 1;
 }
 __setup("mphash_entries=", set_mphash_entries);

 static u64 event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;

 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
 static DECLARE_RWSEM(namespace_sem);

 /* /sys/fs */
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);

 /*
  * vfsmount lock may be taken for read to prevent changes to the
  * vfsmount hash, ie. during mountpoint lookups or walking back
  * up the tree.
  *
  * It should be taken for write in all cases where the vfsmount
  * tree or hash is modified or when a vfsmount structure is modified.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
 	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
 	tmp = tmp + (tmp >> m_hash_shift);
 	return &mount_hashtable[tmp & m_hash_mask];
 }

 static inline struct hlist_head *mp_hash(struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
 	tmp = tmp + (tmp >> mp_hash_shift);
 	return &mountpoint_hashtable[tmp & mp_hash_mask];
 }

 /*
  * allocation is serialized by namespace_sem, but we need the spinlock to
  * serialize with freeing.
  */
 static int mnt_alloc_id(struct mount *mnt)
 {
 	int res;

 retry:
 	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
 	spin_lock(&mnt_id_lock);
 	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
 	if (!res)
 		mnt_id_start = mnt->mnt_id + 1;
 	spin_unlock(&mnt_id_lock);
 	if (res == -EAGAIN)
 		goto retry;

 	return res;
 }

 static void mnt_free_id(struct mount *mnt)
 {
 	int id = mnt->mnt_id;
 	spin_lock(&mnt_id_lock);
 	ida_remove(&mnt_id_ida, id);
 	if (mnt_id_start > id)
 		mnt_id_start = id;
 	spin_unlock(&mnt_id_lock);
 }

 /*
  * Allocate a new peer group ID
  *
  * mnt_group_ida is protected by namespace_sem
  */
 static int mnt_alloc_group_id(struct mount *mnt)
 {
 	int res;

 	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
 		return -ENOMEM;

 	res = ida_get_new_above(&mnt_group_ida,
 				mnt_group_start,
 				&mnt->mnt_group_id);
 	if (!res)
 		mnt_group_start = mnt->mnt_group_id + 1;

 	return res;
 }

 /*
  * Release a peer group ID
  */
 void mnt_release_group_id(struct mount *mnt)
 {
 	int id = mnt->mnt_group_id;
 	ida_remove(&mnt_group_ida, id);
 	if (mnt_group_start > id)
 		mnt_group_start = id;
 	mnt->mnt_group_id = 0;
 }

 /*
  * vfsmount lock must be held for read
  */
 static inline void mnt_add_count(struct mount *mnt, int n)
 {
 #ifdef CONFIG_SMP
 	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
 #else
 	preempt_disable();
 	mnt->mnt_count += n;
 	preempt_enable();
 #endif
 }

 /*
  * vfsmount lock must be held for write
  */
 unsigned int mnt_get_count(struct mount *mnt)
 {
 #ifdef CONFIG_SMP
 	unsigned int count = 0;
 	int cpu;

 	for_each_possible_cpu(cpu) {
 		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
 	}

 	return count;
 #else
 	return mnt->mnt_count;
 #endif
 }

 static struct mount *alloc_vfsmnt(const char *name)
 {
 	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
 		int err;

 		err = mnt_alloc_id(mnt);
 		if (err)
 			goto out_free_cache;

 		if (name) {
 			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
 			if (!mnt->mnt_devname)
 				goto out_free_id;
 		}

 #ifdef CONFIG_SMP
 		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
 		if (!mnt->mnt_pcp)
 			goto out_free_devname;

 		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
 #else
 		mnt->mnt_count = 1;
 		mnt->mnt_writers = 0;
 #endif

 		INIT_HLIST_NODE(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
 		INIT_LIST_HEAD(&mnt->mnt_list);
 		INIT_LIST_HEAD(&mnt->mnt_expire);
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 #ifdef CONFIG_FSNOTIFY
 		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
 	}
 	return mnt;

 #ifdef CONFIG_SMP
 out_free_devname:
 	kfree(mnt->mnt_devname);
 #endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
 	kmem_cache_free(mnt_cache, mnt);
 	return NULL;
 }

 /*
  * Most r/o checks on a fs are for operations that take
  * discrete amounts of time, like a write() or unlink().
  * We must keep track of when those operations start
  * (for permission checks) and when they end, so that
  * we can determine when writes are able to occur to
  * a filesystem.
  */
 /*
  * __mnt_is_readonly: check whether a mount is read-only
  * @mnt: the mount to check for its write status
  *
  * This shouldn't be used directly ouside of the VFS.
  * It does not guarantee that the filesystem will stay
  * r/w, just that it is right *now*. This can not and
  * should not be used in place of IS_RDONLY(inode).
  * mnt_want/drop_write() will _keep_ the filesystem
  * r/w.
  */
 int __mnt_is_readonly(struct vfsmount *mnt)
 {
 	if (mnt->mnt_flags & MNT_READONLY)
 		return 1;
 	if (mnt->mnt_sb->s_flags & MS_RDONLY)
 		return 1;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);

 static inline void mnt_inc_writers(struct mount *mnt)
 {
 #ifdef CONFIG_SMP
 	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
 #else
 	mnt->mnt_writers++;
 #endif
 }

 static inline void mnt_dec_writers(struct mount *mnt)
 {
 #ifdef CONFIG_SMP
 	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
 #else
 	mnt->mnt_writers--;
 #endif
 }

 static unsigned int mnt_get_writers(struct mount *mnt)
 {
 #ifdef CONFIG_SMP
 	unsigned int count = 0;
 	int cpu;

 	for_each_possible_cpu(cpu) {
 		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
 	}

 	return count;
 #else
 	return mnt->mnt_writers;
 #endif
 }

 static int mnt_is_readonly(struct vfsmount *mnt)
 {
 	if (mnt->mnt_sb->s_readonly_remount)
 		return 1;
 	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
 	smp_rmb();
 	return __mnt_is_readonly(mnt);
 }

 /*
  * Most r/o & frozen checks on a fs are for operations that take discrete
  * amounts of time, like a write() or unlink(). We must keep track of when
  * those operations start (for permission checks) and when they end, so that we
  * can determine when writes are able to occur to a filesystem.
  */
 /**
  * __mnt_want_write - get write access to a mount without freeze protection
  * @m: the mount on which to take a write
  *
  * This tells the low-level filesystem that a write is about to be performed to
  * it, and makes sure that writes are allowed (mnt it read-write) before
  * returning success. This operation does not protect against filesystem being
  * frozen. When the write operation is finished, __mnt_drop_write() must be
  * called. This is effectively a refcount.
  */
 int __mnt_want_write(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
 	int ret = 0;

 	preempt_disable();
 	mnt_inc_writers(mnt);
 	/*
 	 * The store to mnt_inc_writers must be visible before we pass
 	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
 	 * incremented count after it has set MNT_WRITE_HOLD.
 	 */
 	smp_mb();
 	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
 		cpu_relax();
 	/*
 	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
 	 * be set to match its requirements. So we must not load that until
 	 * MNT_WRITE_HOLD is cleared.
 	 */
 	smp_rmb();
 	if (mnt_is_readonly(m)) {
 		mnt_dec_writers(mnt);
 		ret = -EROFS;
 	}
 	preempt_enable();

 	return ret;
 }

 /**
  * mnt_want_write - get write access to a mount
  * @m: the mount on which to take a write
  *
  * This tells the low-level filesystem that a write is about to be performed to
  * it, and makes sure that writes are allowed (mount is read-write, filesystem
  * is not frozen) before returning success. When the write operation is
  * finished, mnt_drop_write() must be called. This is effectively a refcount.
  */
 int mnt_want_write(struct vfsmount *m)
 {
 	int ret;

 	sb_start_write(m->mnt_sb);
 	ret = __mnt_want_write(m);
 	if (ret)
 		sb_end_write(m->mnt_sb);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);

 /**
  * mnt_clone_write - get write access to a mount
  * @mnt: the mount on which to take a write
  *
  * This is effectively like mnt_want_write, except
  * it must only be used to take an extra write reference
  * on a mountpoint that we already know has a write reference
  * on it. This allows some optimisation.
  *
  * After finished, mnt_drop_write must be called as usual to
  * drop the reference.
  */
 int mnt_clone_write(struct vfsmount *mnt)
 {
 	/* superblock may be r/o */
 	if (__mnt_is_readonly(mnt))
 		return -EROFS;
 	preempt_disable();
 	mnt_inc_writers(real_mount(mnt));
 	preempt_enable();
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mnt_clone_write);

 /**
  * __mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
  * This is like __mnt_want_write, but it takes a file and can
  * do some optimisations if the file is open for write already
  */
 int __mnt_want_write_file(struct file *file)
 {
 	if (!(file->f_mode & FMODE_WRITER))
 		return __mnt_want_write(file->f_path.mnt);
 	else
 		return mnt_clone_write(file->f_path.mnt);
 }

 /**
  * mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
  * This is like mnt_want_write, but it takes a file and can
  * do some optimisations if the file is open for write already
  */
 int mnt_want_write_file(struct file *file)
 {
 	int ret;

 	sb_start_write(file->f_path.mnt->mnt_sb);
 	ret = __mnt_want_write_file(file);
 	if (ret)
 		sb_end_write(file->f_path.mnt->mnt_sb);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write_file);

 /**
  * __mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
  * Tells the low-level filesystem that we are done
  * performing writes to it.  Must be matched with
  * __mnt_want_write() call above.
  */
 void __mnt_drop_write(struct vfsmount *mnt)
 {
 	preempt_disable();
 	mnt_dec_writers(real_mount(mnt));
 	preempt_enable();
 }

 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
  * Tells the low-level filesystem that we are done performing writes to it and
  * also allows filesystem to be frozen again.  Must be matched with
  * mnt_want_write() call above.
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
 	__mnt_drop_write(mnt);
 	sb_end_write(mnt->mnt_sb);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);

 void __mnt_drop_write_file(struct file *file)
 {
 	__mnt_drop_write(file->f_path.mnt);
 }

 void mnt_drop_write_file(struct file *file)
 {
 	mnt_drop_write(file->f_path.mnt);
 }
 EXPORT_SYMBOL(mnt_drop_write_file);

 static int mnt_make_readonly(struct mount *mnt)
 {
 	int ret = 0;

 	lock_mount_hash();
 	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
 	 * should be visible before we do.
 	 */
 	smp_mb();

 	/*
 	 * With writers on hold, if this value is zero, then there are
 	 * definitely no active writers (although held writers may subsequently
 	 * increment the count, they'll have to wait, and decrement it after
 	 * seeing MNT_READONLY).
 	 *
 	 * It is OK to have counter incremented on one CPU and decremented on
 	 * another: the sum will add up correctly. The danger would be when we
 	 * sum up each counter, if we read a counter before it is incremented,
 	 * but then read another CPU's count which it has been subsequently
 	 * decremented from -- we would see more decrements than we should.
 	 * MNT_WRITE_HOLD protects against this scenario, because
 	 * mnt_want_write first increments count, then smp_mb, then spins on
 	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
 	 * we're counting up here.
 	 */
 	if (mnt_get_writers(mnt) > 0)
 		ret = -EBUSY;
 	else
 		mnt->mnt.mnt_flags |= MNT_READONLY;
 	/*
 	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
 	 * that become unheld will see MNT_READONLY.
 	 */
 	smp_wmb();
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	unlock_mount_hash();
 	return ret;
 }

 static void __mnt_unmake_readonly(struct mount *mnt)
 {
 	lock_mount_hash();
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
 	unlock_mount_hash();
 }

 int sb_prepare_remount_readonly(struct super_block *sb)
 {
 	struct mount *mnt;
 	int err = 0;

 	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;

 	lock_mount_hash();
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
 			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 			smp_mb();
 			if (mnt_get_writers(mnt) > 0) {
 				err = -EBUSY;
 				break;
 			}
 		}
 	}
 	if (!err && atomic_long_read(&sb->s_remove_count))
 		err = -EBUSY;

 	if (!err) {
 		sb->s_readonly_remount = 1;
 		smp_wmb();
 	}
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
 			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	}
 	unlock_mount_hash();

 	return err;
 }

 static void free_vfsmnt(struct mount *mnt)
 {
 	kfree(mnt->mnt_devname);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
 #endif
 	kmem_cache_free(mnt_cache, mnt);
 }

 static void delayed_free_vfsmnt(struct rcu_head *head)
 {
 	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
 }

 /* call under rcu_read_lock */
 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 {
 	struct mount *mnt;
 	if (read_seqretry(&mount_lock, seq))
 		return false;
 	if (bastard == NULL)
 		return true;
 	mnt = real_mount(bastard);
 	mnt_add_count(mnt, 1);
 	if (likely(!read_seqretry(&mount_lock, seq)))
 		return true;
 	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
 		mnt_add_count(mnt, -1);
 		return false;
 	}
 	rcu_read_unlock();
 	mntput(bastard);
 	rcu_read_lock();
 	return false;
 }

 /*
  * find the first mount at @dentry on vfsmount @mnt.
  * call under rcu_read_lock()
  */
 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct hlist_head *head = m_hash(mnt, dentry);
 	struct mount *p;

 	hlist_for_each_entry_rcu(p, head, mnt_hash)
 		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
 			return p;
 	return NULL;
 }

 /*
  * find the last mount at @dentry on vfsmount @mnt.
  * mount_lock must be held.
  */
 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct mount *p, *res;
 	res = p = __lookup_mnt(mnt, dentry);
 	if (!p)
 		goto out;
 	hlist_for_each_entry_continue(p, mnt_hash) {
631 | if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) | 631 | if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) |
632 | break; | 632 | break; |
633 | res = p; | 633 | res = p; |
634 | } | 634 | } |
635 | out: | 635 | out: |
636 | return res; | 636 | return res; |
637 | } | 637 | } |
638 | 638 | ||
639 | /* | 639 | /* |
640 | * lookup_mnt - Return the first child mount mounted at path | 640 | * lookup_mnt - Return the first child mount mounted at path |
641 | * | 641 | * |
642 | * "First" means first mounted chronologically. If you create the | 642 | * "First" means first mounted chronologically. If you create the |
643 | * following mounts: | 643 | * following mounts: |
644 | * | 644 | * |
645 | * mount /dev/sda1 /mnt | 645 | * mount /dev/sda1 /mnt |
646 | * mount /dev/sda2 /mnt | 646 | * mount /dev/sda2 /mnt |
647 | * mount /dev/sda3 /mnt | 647 | * mount /dev/sda3 /mnt |
648 | * | 648 | * |
649 | * Then lookup_mnt() on the base /mnt dentry in the root mount will | 649 | * Then lookup_mnt() on the base /mnt dentry in the root mount will |
650 | * return successively the root dentry and vfsmount of /dev/sda1, then | 650 | * return successively the root dentry and vfsmount of /dev/sda1, then |
651 | * /dev/sda2, then /dev/sda3, then NULL. | 651 | * /dev/sda2, then /dev/sda3, then NULL. |
652 | * | 652 | * |
653 | * lookup_mnt takes a reference to the found vfsmount. | 653 | * lookup_mnt takes a reference to the found vfsmount. |
654 | */ | 654 | */ |
655 | struct vfsmount *lookup_mnt(struct path *path) | 655 | struct vfsmount *lookup_mnt(struct path *path) |
656 | { | 656 | { |
657 | struct mount *child_mnt; | 657 | struct mount *child_mnt; |
658 | struct vfsmount *m; | 658 | struct vfsmount *m; |
659 | unsigned seq; | 659 | unsigned seq; |
660 | 660 | ||
661 | rcu_read_lock(); | 661 | rcu_read_lock(); |
662 | do { | 662 | do { |
663 | seq = read_seqbegin(&mount_lock); | 663 | seq = read_seqbegin(&mount_lock); |
664 | child_mnt = __lookup_mnt(path->mnt, path->dentry); | 664 | child_mnt = __lookup_mnt(path->mnt, path->dentry); |
665 | m = child_mnt ? &child_mnt->mnt : NULL; | 665 | m = child_mnt ? &child_mnt->mnt : NULL; |
666 | } while (!legitimize_mnt(m, seq)); | 666 | } while (!legitimize_mnt(m, seq)); |
667 | rcu_read_unlock(); | 667 | rcu_read_unlock(); |
668 | return m; | 668 | return m; |
669 | } | 669 | } |
670 | 670 | ||
671 | /* | 671 | /* |
672 | * __is_local_mountpoint - Test to see if dentry is a mountpoint in the | 672 | * __is_local_mountpoint - Test to see if dentry is a mountpoint in the |
673 | * current mount namespace. | 673 | * current mount namespace. |
674 | * | 674 | * |
675 | * The common case is that dentries are not mountpoints at all, and that | 675 | * The common case is that dentries are not mountpoints at all, and that |
676 | * test is handled inline. For the slow case when we are actually | 676 | * test is handled inline. For the slow case when we are actually |
677 | * dealing with a mountpoint of some kind, walk through all of the | 677 | * dealing with a mountpoint of some kind, walk through all of the |
678 | * mounts in the current mount namespace and test to see if the dentry | 678 | * mounts in the current mount namespace and test to see if the dentry |
679 | * is a mountpoint. | 679 | * is a mountpoint. |
680 | * | 680 | * |
681 | * The mount_hashtable is not usable in this context because we | 681 | * The mount_hashtable is not usable in this context because we |
682 | * need to identify all mounts that may be in the current mount | 682 | * need to identify all mounts that may be in the current mount |
683 | * namespace, not just a mount that happens to have some specified | 683 | * namespace, not just a mount that happens to have some specified |
684 | * parent mount. | 684 | * parent mount. |
685 | */ | 685 | */ |
686 | bool __is_local_mountpoint(struct dentry *dentry) | 686 | bool __is_local_mountpoint(struct dentry *dentry) |
687 | { | 687 | { |
688 | struct mnt_namespace *ns = current->nsproxy->mnt_ns; | 688 | struct mnt_namespace *ns = current->nsproxy->mnt_ns; |
689 | struct mount *mnt; | 689 | struct mount *mnt; |
690 | bool is_covered = false; | 690 | bool is_covered = false; |
691 | 691 | ||
692 | if (!d_mountpoint(dentry)) | 692 | if (!d_mountpoint(dentry)) |
693 | goto out; | 693 | goto out; |
694 | 694 | ||
695 | down_read(&namespace_sem); | 695 | down_read(&namespace_sem); |
696 | list_for_each_entry(mnt, &ns->list, mnt_list) { | 696 | list_for_each_entry(mnt, &ns->list, mnt_list) { |
697 | is_covered = (mnt->mnt_mountpoint == dentry); | 697 | is_covered = (mnt->mnt_mountpoint == dentry); |
698 | if (is_covered) | 698 | if (is_covered) |
699 | break; | 699 | break; |
700 | } | 700 | } |
701 | up_read(&namespace_sem); | 701 | up_read(&namespace_sem); |
702 | out: | 702 | out: |
703 | return is_covered; | 703 | return is_covered; |
704 | } | 704 | } |
705 | 705 | ||
706 | static struct mountpoint *lookup_mountpoint(struct dentry *dentry) | 706 | static struct mountpoint *lookup_mountpoint(struct dentry *dentry) |
707 | { | 707 | { |
708 | struct hlist_head *chain = mp_hash(dentry); | 708 | struct hlist_head *chain = mp_hash(dentry); |
709 | struct mountpoint *mp; | 709 | struct mountpoint *mp; |
710 | 710 | ||
711 | hlist_for_each_entry(mp, chain, m_hash) { | 711 | hlist_for_each_entry(mp, chain, m_hash) { |
712 | if (mp->m_dentry == dentry) { | 712 | if (mp->m_dentry == dentry) { |
713 | /* might be worth a WARN_ON() */ | 713 | /* might be worth a WARN_ON() */ |
714 | if (d_unlinked(dentry)) | 714 | if (d_unlinked(dentry)) |
715 | return ERR_PTR(-ENOENT); | 715 | return ERR_PTR(-ENOENT); |
716 | mp->m_count++; | 716 | mp->m_count++; |
717 | return mp; | 717 | return mp; |
718 | } | 718 | } |
719 | } | 719 | } |
720 | return NULL; | 720 | return NULL; |
721 | } | 721 | } |
722 | 722 | ||
723 | static struct mountpoint *new_mountpoint(struct dentry *dentry) | 723 | static struct mountpoint *new_mountpoint(struct dentry *dentry) |
724 | { | 724 | { |
725 | struct hlist_head *chain = mp_hash(dentry); | 725 | struct hlist_head *chain = mp_hash(dentry); |
726 | struct mountpoint *mp; | 726 | struct mountpoint *mp; |
727 | int ret; | 727 | int ret; |
728 | 728 | ||
729 | mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); | 729 | mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); |
730 | if (!mp) | 730 | if (!mp) |
731 | return ERR_PTR(-ENOMEM); | 731 | return ERR_PTR(-ENOMEM); |
732 | 732 | ||
733 | ret = d_set_mounted(dentry); | 733 | ret = d_set_mounted(dentry); |
734 | if (ret) { | 734 | if (ret) { |
735 | kfree(mp); | 735 | kfree(mp); |
736 | return ERR_PTR(ret); | 736 | return ERR_PTR(ret); |
737 | } | 737 | } |
738 | 738 | ||
739 | mp->m_dentry = dentry; | 739 | mp->m_dentry = dentry; |
740 | mp->m_count = 1; | 740 | mp->m_count = 1; |
741 | hlist_add_head(&mp->m_hash, chain); | 741 | hlist_add_head(&mp->m_hash, chain); |
742 | INIT_HLIST_HEAD(&mp->m_list); | 742 | INIT_HLIST_HEAD(&mp->m_list); |
743 | return mp; | 743 | return mp; |
744 | } | 744 | } |
745 | 745 | ||
746 | static void put_mountpoint(struct mountpoint *mp) | 746 | static void put_mountpoint(struct mountpoint *mp) |
747 | { | 747 | { |
748 | if (!--mp->m_count) { | 748 | if (!--mp->m_count) { |
749 | struct dentry *dentry = mp->m_dentry; | 749 | struct dentry *dentry = mp->m_dentry; |
750 | BUG_ON(!hlist_empty(&mp->m_list)); | 750 | BUG_ON(!hlist_empty(&mp->m_list)); |
751 | spin_lock(&dentry->d_lock); | 751 | spin_lock(&dentry->d_lock); |
752 | dentry->d_flags &= ~DCACHE_MOUNTED; | 752 | dentry->d_flags &= ~DCACHE_MOUNTED; |
753 | spin_unlock(&dentry->d_lock); | 753 | spin_unlock(&dentry->d_lock); |
754 | hlist_del(&mp->m_hash); | 754 | hlist_del(&mp->m_hash); |
755 | kfree(mp); | 755 | kfree(mp); |
756 | } | 756 | } |
757 | } | 757 | } |
758 | 758 | ||
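lookup_mountpoint(), new_mountpoint() and put_mountpoint() together form a small refcounted registry keyed by dentry. A sketch of the find-or-create pattern the mount paths build on, loosely after lock_mount() (the real code also deals with races against concurrent mounts):

        /* sketch: find-or-create a mountpoint, loosely after lock_mount() */
        static struct mountpoint *get_mountpoint_sketch(struct dentry *dentry)
        {
                struct mountpoint *mp;

                mp = lookup_mountpoint(dentry);         /* bumps m_count on success */
                if (IS_ERR(mp))
                        return mp;                      /* dentry was unlinked: -ENOENT */
                if (!mp)
                        mp = new_mountpoint(dentry);    /* m_count starts at 1 */
                return mp;                              /* balance with put_mountpoint() */
        }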
759 | static inline int check_mnt(struct mount *mnt) | 759 | static inline int check_mnt(struct mount *mnt) |
760 | { | 760 | { |
761 | return mnt->mnt_ns == current->nsproxy->mnt_ns; | 761 | return mnt->mnt_ns == current->nsproxy->mnt_ns; |
762 | } | 762 | } |
763 | 763 | ||
764 | /* | 764 | /* |
765 | * vfsmount lock must be held for write | 765 | * vfsmount lock must be held for write |
766 | */ | 766 | */ |
767 | static void touch_mnt_namespace(struct mnt_namespace *ns) | 767 | static void touch_mnt_namespace(struct mnt_namespace *ns) |
768 | { | 768 | { |
769 | if (ns) { | 769 | if (ns) { |
770 | ns->event = ++event; | 770 | ns->event = ++event; |
771 | wake_up_interruptible(&ns->poll); | 771 | wake_up_interruptible(&ns->poll); |
772 | } | 772 | } |
773 | } | 773 | } |
774 | 774 | ||
775 | /* | 775 | /* |
776 | * vfsmount lock must be held for write | 776 | * vfsmount lock must be held for write |
777 | */ | 777 | */ |
778 | static void __touch_mnt_namespace(struct mnt_namespace *ns) | 778 | static void __touch_mnt_namespace(struct mnt_namespace *ns) |
779 | { | 779 | { |
780 | if (ns && ns->event != event) { | 780 | if (ns && ns->event != event) { |
781 | ns->event = event; | 781 | ns->event = event; |
782 | wake_up_interruptible(&ns->poll); | 782 | wake_up_interruptible(&ns->poll); |
783 | } | 783 | } |
784 | } | 784 | } |
785 | 785 | ||
786 | /* | 786 | /* |
787 | * vfsmount lock must be held for write | 787 | * vfsmount lock must be held for write |
788 | */ | 788 | */ |
789 | static void detach_mnt(struct mount *mnt, struct path *old_path) | 789 | static void detach_mnt(struct mount *mnt, struct path *old_path) |
790 | { | 790 | { |
791 | old_path->dentry = mnt->mnt_mountpoint; | 791 | old_path->dentry = mnt->mnt_mountpoint; |
792 | old_path->mnt = &mnt->mnt_parent->mnt; | 792 | old_path->mnt = &mnt->mnt_parent->mnt; |
793 | mnt->mnt_parent = mnt; | 793 | mnt->mnt_parent = mnt; |
794 | mnt->mnt_mountpoint = mnt->mnt.mnt_root; | 794 | mnt->mnt_mountpoint = mnt->mnt.mnt_root; |
795 | list_del_init(&mnt->mnt_child); | 795 | list_del_init(&mnt->mnt_child); |
796 | hlist_del_init_rcu(&mnt->mnt_hash); | 796 | hlist_del_init_rcu(&mnt->mnt_hash); |
797 | hlist_del_init(&mnt->mnt_mp_list); | 797 | hlist_del_init(&mnt->mnt_mp_list); |
798 | put_mountpoint(mnt->mnt_mp); | 798 | put_mountpoint(mnt->mnt_mp); |
799 | mnt->mnt_mp = NULL; | 799 | mnt->mnt_mp = NULL; |
800 | } | 800 | } |
801 | 801 | ||
802 | /* | 802 | /* |
803 | * vfsmount lock must be held for write | 803 | * vfsmount lock must be held for write |
804 | */ | 804 | */ |
805 | void mnt_set_mountpoint(struct mount *mnt, | 805 | void mnt_set_mountpoint(struct mount *mnt, |
806 | struct mountpoint *mp, | 806 | struct mountpoint *mp, |
807 | struct mount *child_mnt) | 807 | struct mount *child_mnt) |
808 | { | 808 | { |
809 | mp->m_count++; | 809 | mp->m_count++; |
810 | mnt_add_count(mnt, 1); /* essentially, that's mntget */ | 810 | mnt_add_count(mnt, 1); /* essentially, that's mntget */ |
811 | child_mnt->mnt_mountpoint = dget(mp->m_dentry); | 811 | child_mnt->mnt_mountpoint = dget(mp->m_dentry); |
812 | child_mnt->mnt_parent = mnt; | 812 | child_mnt->mnt_parent = mnt; |
813 | child_mnt->mnt_mp = mp; | 813 | child_mnt->mnt_mp = mp; |
814 | hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); | 814 | hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); |
815 | } | 815 | } |
816 | 816 | ||
817 | /* | 817 | /* |
818 | * vfsmount lock must be held for write | 818 | * vfsmount lock must be held for write |
819 | */ | 819 | */ |
820 | static void attach_mnt(struct mount *mnt, | 820 | static void attach_mnt(struct mount *mnt, |
821 | struct mount *parent, | 821 | struct mount *parent, |
822 | struct mountpoint *mp) | 822 | struct mountpoint *mp) |
823 | { | 823 | { |
824 | mnt_set_mountpoint(parent, mp, mnt); | 824 | mnt_set_mountpoint(parent, mp, mnt); |
825 | hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); | 825 | hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); |
826 | list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); | 826 | list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); |
827 | } | 827 | } |
828 | 828 | ||
829 | static void attach_shadowed(struct mount *mnt, | 829 | static void attach_shadowed(struct mount *mnt, |
830 | struct mount *parent, | 830 | struct mount *parent, |
831 | struct mount *shadows) | 831 | struct mount *shadows) |
832 | { | 832 | { |
833 | if (shadows) { | 833 | if (shadows) { |
834 | hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); | 834 | hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); |
835 | list_add(&mnt->mnt_child, &shadows->mnt_child); | 835 | list_add(&mnt->mnt_child, &shadows->mnt_child); |
836 | } else { | 836 | } else { |
837 | hlist_add_head_rcu(&mnt->mnt_hash, | 837 | hlist_add_head_rcu(&mnt->mnt_hash, |
838 | m_hash(&parent->mnt, mnt->mnt_mountpoint)); | 838 | m_hash(&parent->mnt, mnt->mnt_mountpoint)); |
839 | list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); | 839 | list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); |
840 | } | 840 | } |
841 | } | 841 | } |
842 | 842 | ||
843 | /* | 843 | /* |
844 | * vfsmount lock must be held for write | 844 | * vfsmount lock must be held for write |
845 | */ | 845 | */ |
846 | static void commit_tree(struct mount *mnt, struct mount *shadows) | 846 | static void commit_tree(struct mount *mnt, struct mount *shadows) |
847 | { | 847 | { |
848 | struct mount *parent = mnt->mnt_parent; | 848 | struct mount *parent = mnt->mnt_parent; |
849 | struct mount *m; | 849 | struct mount *m; |
850 | LIST_HEAD(head); | 850 | LIST_HEAD(head); |
851 | struct mnt_namespace *n = parent->mnt_ns; | 851 | struct mnt_namespace *n = parent->mnt_ns; |
852 | 852 | ||
853 | BUG_ON(parent == mnt); | 853 | BUG_ON(parent == mnt); |
854 | 854 | ||
855 | list_add_tail(&head, &mnt->mnt_list); | 855 | list_add_tail(&head, &mnt->mnt_list); |
856 | list_for_each_entry(m, &head, mnt_list) | 856 | list_for_each_entry(m, &head, mnt_list) |
857 | m->mnt_ns = n; | 857 | m->mnt_ns = n; |
858 | 858 | ||
859 | list_splice(&head, n->list.prev); | 859 | list_splice(&head, n->list.prev); |
860 | 860 | ||
861 | attach_shadowed(mnt, parent, shadows); | 861 | attach_shadowed(mnt, parent, shadows); |
862 | touch_mnt_namespace(n); | 862 | touch_mnt_namespace(n); |
863 | } | 863 | } |
864 | 864 | ||
865 | static struct mount *next_mnt(struct mount *p, struct mount *root) | 865 | static struct mount *next_mnt(struct mount *p, struct mount *root) |
866 | { | 866 | { |
867 | struct list_head *next = p->mnt_mounts.next; | 867 | struct list_head *next = p->mnt_mounts.next; |
868 | if (next == &p->mnt_mounts) { | 868 | if (next == &p->mnt_mounts) { |
869 | while (1) { | 869 | while (1) { |
870 | if (p == root) | 870 | if (p == root) |
871 | return NULL; | 871 | return NULL; |
872 | next = p->mnt_child.next; | 872 | next = p->mnt_child.next; |
873 | if (next != &p->mnt_parent->mnt_mounts) | 873 | if (next != &p->mnt_parent->mnt_mounts) |
874 | break; | 874 | break; |
875 | p = p->mnt_parent; | 875 | p = p->mnt_parent; |
876 | } | 876 | } |
877 | } | 877 | } |
878 | return list_entry(next, struct mount, mnt_child); | 878 | return list_entry(next, struct mount, mnt_child); |
879 | } | 879 | } |
880 | 880 | ||
881 | static struct mount *skip_mnt_tree(struct mount *p) | 881 | static struct mount *skip_mnt_tree(struct mount *p) |
882 | { | 882 | { |
883 | struct list_head *prev = p->mnt_mounts.prev; | 883 | struct list_head *prev = p->mnt_mounts.prev; |
884 | while (prev != &p->mnt_mounts) { | 884 | while (prev != &p->mnt_mounts) { |
885 | p = list_entry(prev, struct mount, mnt_child); | 885 | p = list_entry(prev, struct mount, mnt_child); |
886 | prev = p->mnt_mounts.prev; | 886 | prev = p->mnt_mounts.prev; |
887 | } | 887 | } |
888 | return p; | 888 | return p; |
889 | } | 889 | } |
890 | 890 | ||
891 | struct vfsmount * | 891 | struct vfsmount * |
892 | vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) | 892 | vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) |
893 | { | 893 | { |
894 | struct mount *mnt; | 894 | struct mount *mnt; |
895 | struct dentry *root; | 895 | struct dentry *root; |
896 | 896 | ||
897 | if (!type) | 897 | if (!type) |
898 | return ERR_PTR(-ENODEV); | 898 | return ERR_PTR(-ENODEV); |
899 | 899 | ||
900 | mnt = alloc_vfsmnt(name); | 900 | mnt = alloc_vfsmnt(name); |
901 | if (!mnt) | 901 | if (!mnt) |
902 | return ERR_PTR(-ENOMEM); | 902 | return ERR_PTR(-ENOMEM); |
903 | 903 | ||
904 | if (flags & MS_KERNMOUNT) | 904 | if (flags & MS_KERNMOUNT) |
905 | mnt->mnt.mnt_flags = MNT_INTERNAL; | 905 | mnt->mnt.mnt_flags = MNT_INTERNAL; |
906 | 906 | ||
907 | root = mount_fs(type, flags, name, data); | 907 | root = mount_fs(type, flags, name, data); |
908 | if (IS_ERR(root)) { | 908 | if (IS_ERR(root)) { |
909 | mnt_free_id(mnt); | 909 | mnt_free_id(mnt); |
910 | free_vfsmnt(mnt); | 910 | free_vfsmnt(mnt); |
911 | return ERR_CAST(root); | 911 | return ERR_CAST(root); |
912 | } | 912 | } |
913 | 913 | ||
914 | mnt->mnt.mnt_root = root; | 914 | mnt->mnt.mnt_root = root; |
915 | mnt->mnt.mnt_sb = root->d_sb; | 915 | mnt->mnt.mnt_sb = root->d_sb; |
916 | mnt->mnt_mountpoint = mnt->mnt.mnt_root; | 916 | mnt->mnt_mountpoint = mnt->mnt.mnt_root; |
917 | mnt->mnt_parent = mnt; | 917 | mnt->mnt_parent = mnt; |
918 | lock_mount_hash(); | 918 | lock_mount_hash(); |
919 | list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); | 919 | list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); |
920 | unlock_mount_hash(); | 920 | unlock_mount_hash(); |
921 | return &mnt->mnt; | 921 | return &mnt->mnt; |
922 | } | 922 | } |
923 | EXPORT_SYMBOL_GPL(vfs_kern_mount); | 923 | EXPORT_SYMBOL_GPL(vfs_kern_mount); |
924 | 924 | ||
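MS_KERNMOUNT (and the MNT_INTERNAL flag it sets above) is how in-kernel users such as the new nsfs get a private, unmountable instance; kern_mount(type) expands to vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL). Roughly what fs/nsfs.c does at boot per this commit (&nsfs being its static file_system_type):

        static struct vfsmount *nsfs_mnt;

        void __init nsfs_init(void)
        {
                nsfs_mnt = kern_mount(&nsfs);           /* internal mount, never user-visible */
                if (IS_ERR(nsfs_mnt))
                        panic("can't set nsfs up\n");
                nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
        }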
925 | static struct mount *clone_mnt(struct mount *old, struct dentry *root, | 925 | static struct mount *clone_mnt(struct mount *old, struct dentry *root, |
926 | int flag) | 926 | int flag) |
927 | { | 927 | { |
928 | struct super_block *sb = old->mnt.mnt_sb; | 928 | struct super_block *sb = old->mnt.mnt_sb; |
929 | struct mount *mnt; | 929 | struct mount *mnt; |
930 | int err; | 930 | int err; |
931 | 931 | ||
932 | mnt = alloc_vfsmnt(old->mnt_devname); | 932 | mnt = alloc_vfsmnt(old->mnt_devname); |
933 | if (!mnt) | 933 | if (!mnt) |
934 | return ERR_PTR(-ENOMEM); | 934 | return ERR_PTR(-ENOMEM); |
935 | 935 | ||
936 | if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) | 936 | if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) |
937 | mnt->mnt_group_id = 0; /* not a peer of original */ | 937 | mnt->mnt_group_id = 0; /* not a peer of original */ |
938 | else | 938 | else |
939 | mnt->mnt_group_id = old->mnt_group_id; | 939 | mnt->mnt_group_id = old->mnt_group_id; |
940 | 940 | ||
941 | if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { | 941 | if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { |
942 | err = mnt_alloc_group_id(mnt); | 942 | err = mnt_alloc_group_id(mnt); |
943 | if (err) | 943 | if (err) |
944 | goto out_free; | 944 | goto out_free; |
945 | } | 945 | } |
946 | 946 | ||
947 | mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); | 947 | mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); |
948 | /* Don't allow unprivileged users to change mount flags */ | 948 | /* Don't allow unprivileged users to change mount flags */ |
949 | if (flag & CL_UNPRIVILEGED) { | 949 | if (flag & CL_UNPRIVILEGED) { |
950 | mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; | 950 | mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; |
951 | 951 | ||
952 | if (mnt->mnt.mnt_flags & MNT_READONLY) | 952 | if (mnt->mnt.mnt_flags & MNT_READONLY) |
953 | mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; | 953 | mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; |
954 | 954 | ||
955 | if (mnt->mnt.mnt_flags & MNT_NODEV) | 955 | if (mnt->mnt.mnt_flags & MNT_NODEV) |
956 | mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; | 956 | mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; |
957 | 957 | ||
958 | if (mnt->mnt.mnt_flags & MNT_NOSUID) | 958 | if (mnt->mnt.mnt_flags & MNT_NOSUID) |
959 | mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; | 959 | mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; |
960 | 960 | ||
961 | if (mnt->mnt.mnt_flags & MNT_NOEXEC) | 961 | if (mnt->mnt.mnt_flags & MNT_NOEXEC) |
962 | mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; | 962 | mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; |
963 | } | 963 | } |
964 | 964 | ||
965 | /* Don't allow unprivileged users to reveal what is under a mount */ | 965 | /* Don't allow unprivileged users to reveal what is under a mount */ |
966 | if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) | 966 | if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) |
967 | mnt->mnt.mnt_flags |= MNT_LOCKED; | 967 | mnt->mnt.mnt_flags |= MNT_LOCKED; |
968 | 968 | ||
969 | atomic_inc(&sb->s_active); | 969 | atomic_inc(&sb->s_active); |
970 | mnt->mnt.mnt_sb = sb; | 970 | mnt->mnt.mnt_sb = sb; |
971 | mnt->mnt.mnt_root = dget(root); | 971 | mnt->mnt.mnt_root = dget(root); |
972 | mnt->mnt_mountpoint = mnt->mnt.mnt_root; | 972 | mnt->mnt_mountpoint = mnt->mnt.mnt_root; |
973 | mnt->mnt_parent = mnt; | 973 | mnt->mnt_parent = mnt; |
974 | lock_mount_hash(); | 974 | lock_mount_hash(); |
975 | list_add_tail(&mnt->mnt_instance, &sb->s_mounts); | 975 | list_add_tail(&mnt->mnt_instance, &sb->s_mounts); |
976 | unlock_mount_hash(); | 976 | unlock_mount_hash(); |
977 | 977 | ||
978 | if ((flag & CL_SLAVE) || | 978 | if ((flag & CL_SLAVE) || |
979 | ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { | 979 | ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { |
980 | list_add(&mnt->mnt_slave, &old->mnt_slave_list); | 980 | list_add(&mnt->mnt_slave, &old->mnt_slave_list); |
981 | mnt->mnt_master = old; | 981 | mnt->mnt_master = old; |
982 | CLEAR_MNT_SHARED(mnt); | 982 | CLEAR_MNT_SHARED(mnt); |
983 | } else if (!(flag & CL_PRIVATE)) { | 983 | } else if (!(flag & CL_PRIVATE)) { |
984 | if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) | 984 | if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) |
985 | list_add(&mnt->mnt_share, &old->mnt_share); | 985 | list_add(&mnt->mnt_share, &old->mnt_share); |
986 | if (IS_MNT_SLAVE(old)) | 986 | if (IS_MNT_SLAVE(old)) |
987 | list_add(&mnt->mnt_slave, &old->mnt_slave); | 987 | list_add(&mnt->mnt_slave, &old->mnt_slave); |
988 | mnt->mnt_master = old->mnt_master; | 988 | mnt->mnt_master = old->mnt_master; |
989 | } | 989 | } |
990 | if (flag & CL_MAKE_SHARED) | 990 | if (flag & CL_MAKE_SHARED) |
991 | set_mnt_shared(mnt); | 991 | set_mnt_shared(mnt); |
992 | 992 | ||
993 | /* stick the duplicate mount on the same expiry list | 993 | /* stick the duplicate mount on the same expiry list |
994 | * as the original if that was on one */ | 994 | * as the original if that was on one */ |
995 | if (flag & CL_EXPIRE) { | 995 | if (flag & CL_EXPIRE) { |
996 | if (!list_empty(&old->mnt_expire)) | 996 | if (!list_empty(&old->mnt_expire)) |
997 | list_add(&mnt->mnt_expire, &old->mnt_expire); | 997 | list_add(&mnt->mnt_expire, &old->mnt_expire); |
998 | } | 998 | } |
999 | 999 | ||
1000 | return mnt; | 1000 | return mnt; |
1001 | 1001 | ||
1002 | out_free: | 1002 | out_free: |
1003 | mnt_free_id(mnt); | 1003 | mnt_free_id(mnt); |
1004 | free_vfsmnt(mnt); | 1004 | free_vfsmnt(mnt); |
1005 | return ERR_PTR(err); | 1005 | return ERR_PTR(err); |
1006 | } | 1006 | } |
1007 | 1007 | ||
1008 | static void cleanup_mnt(struct mount *mnt) | 1008 | static void cleanup_mnt(struct mount *mnt) |
1009 | { | 1009 | { |
1010 | /* | 1010 | /* |
1011 | * This probably indicates that somebody messed | 1011 | * This probably indicates that somebody messed |
1012 | * up a mnt_want/drop_write() pair. If this | 1012 | * up a mnt_want/drop_write() pair. If this |
1013 | * happens, the filesystem was probably unable | 1013 | * happens, the filesystem was probably unable |
1014 | * to make r/w->r/o transitions. | 1014 | * to make r/w->r/o transitions. |
1015 | */ | 1015 | */ |
1016 | /* | 1016 | /* |
1017 | * The locking used to deal with mnt_count decrement provides barriers, | 1017 | * The locking used to deal with mnt_count decrement provides barriers, |
1018 | * so mnt_get_writers() below is safe. | 1018 | * so mnt_get_writers() below is safe. |
1019 | */ | 1019 | */ |
1020 | WARN_ON(mnt_get_writers(mnt)); | 1020 | WARN_ON(mnt_get_writers(mnt)); |
1021 | if (unlikely(mnt->mnt_pins.first)) | 1021 | if (unlikely(mnt->mnt_pins.first)) |
1022 | mnt_pin_kill(mnt); | 1022 | mnt_pin_kill(mnt); |
1023 | fsnotify_vfsmount_delete(&mnt->mnt); | 1023 | fsnotify_vfsmount_delete(&mnt->mnt); |
1024 | dput(mnt->mnt.mnt_root); | 1024 | dput(mnt->mnt.mnt_root); |
1025 | deactivate_super(mnt->mnt.mnt_sb); | 1025 | deactivate_super(mnt->mnt.mnt_sb); |
1026 | mnt_free_id(mnt); | 1026 | mnt_free_id(mnt); |
1027 | call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); | 1027 | call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); |
1028 | } | 1028 | } |
1029 | 1029 | ||
1030 | static void __cleanup_mnt(struct rcu_head *head) | 1030 | static void __cleanup_mnt(struct rcu_head *head) |
1031 | { | 1031 | { |
1032 | cleanup_mnt(container_of(head, struct mount, mnt_rcu)); | 1032 | cleanup_mnt(container_of(head, struct mount, mnt_rcu)); |
1033 | } | 1033 | } |
1034 | 1034 | ||
1035 | static LLIST_HEAD(delayed_mntput_list); | 1035 | static LLIST_HEAD(delayed_mntput_list); |
1036 | static void delayed_mntput(struct work_struct *unused) | 1036 | static void delayed_mntput(struct work_struct *unused) |
1037 | { | 1037 | { |
1038 | struct llist_node *node = llist_del_all(&delayed_mntput_list); | 1038 | struct llist_node *node = llist_del_all(&delayed_mntput_list); |
1039 | struct llist_node *next; | 1039 | struct llist_node *next; |
1040 | 1040 | ||
1041 | for (; node; node = next) { | 1041 | for (; node; node = next) { |
1042 | next = llist_next(node); | 1042 | next = llist_next(node); |
1043 | cleanup_mnt(llist_entry(node, struct mount, mnt_llist)); | 1043 | cleanup_mnt(llist_entry(node, struct mount, mnt_llist)); |
1044 | } | 1044 | } |
1045 | } | 1045 | } |
1046 | static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); | 1046 | static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); |
1047 | 1047 | ||
1048 | static void mntput_no_expire(struct mount *mnt) | 1048 | static void mntput_no_expire(struct mount *mnt) |
1049 | { | 1049 | { |
1050 | rcu_read_lock(); | 1050 | rcu_read_lock(); |
1051 | mnt_add_count(mnt, -1); | 1051 | mnt_add_count(mnt, -1); |
1052 | if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ | 1052 | if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ |
1053 | rcu_read_unlock(); | 1053 | rcu_read_unlock(); |
1054 | return; | 1054 | return; |
1055 | } | 1055 | } |
1056 | lock_mount_hash(); | 1056 | lock_mount_hash(); |
1057 | if (mnt_get_count(mnt)) { | 1057 | if (mnt_get_count(mnt)) { |
1058 | rcu_read_unlock(); | 1058 | rcu_read_unlock(); |
1059 | unlock_mount_hash(); | 1059 | unlock_mount_hash(); |
1060 | return; | 1060 | return; |
1061 | } | 1061 | } |
1062 | if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { | 1062 | if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { |
1063 | rcu_read_unlock(); | 1063 | rcu_read_unlock(); |
1064 | unlock_mount_hash(); | 1064 | unlock_mount_hash(); |
1065 | return; | 1065 | return; |
1066 | } | 1066 | } |
1067 | mnt->mnt.mnt_flags |= MNT_DOOMED; | 1067 | mnt->mnt.mnt_flags |= MNT_DOOMED; |
1068 | rcu_read_unlock(); | 1068 | rcu_read_unlock(); |
1069 | 1069 | ||
1070 | list_del(&mnt->mnt_instance); | 1070 | list_del(&mnt->mnt_instance); |
1071 | unlock_mount_hash(); | 1071 | unlock_mount_hash(); |
1072 | 1072 | ||
1073 | if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { | 1073 | if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { |
1074 | struct task_struct *task = current; | 1074 | struct task_struct *task = current; |
1075 | if (likely(!(task->flags & PF_KTHREAD))) { | 1075 | if (likely(!(task->flags & PF_KTHREAD))) { |
1076 | init_task_work(&mnt->mnt_rcu, __cleanup_mnt); | 1076 | init_task_work(&mnt->mnt_rcu, __cleanup_mnt); |
1077 | if (!task_work_add(task, &mnt->mnt_rcu, true)) | 1077 | if (!task_work_add(task, &mnt->mnt_rcu, true)) |
1078 | return; | 1078 | return; |
1079 | } | 1079 | } |
1080 | if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) | 1080 | if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) |
1081 | schedule_delayed_work(&delayed_mntput_work, 1); | 1081 | schedule_delayed_work(&delayed_mntput_work, 1); |
1082 | return; | 1082 | return; |
1083 | } | 1083 | } |
1084 | cleanup_mnt(mnt); | 1084 | cleanup_mnt(mnt); |
1085 | } | 1085 | } |
1086 | 1086 | ||
1087 | void mntput(struct vfsmount *mnt) | 1087 | void mntput(struct vfsmount *mnt) |
1088 | { | 1088 | { |
1089 | if (mnt) { | 1089 | if (mnt) { |
1090 | struct mount *m = real_mount(mnt); | 1090 | struct mount *m = real_mount(mnt); |
1091 | /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ | 1091 | /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ |
1092 | if (unlikely(m->mnt_expiry_mark)) | 1092 | if (unlikely(m->mnt_expiry_mark)) |
1093 | m->mnt_expiry_mark = 0; | 1093 | m->mnt_expiry_mark = 0; |
1094 | mntput_no_expire(m); | 1094 | mntput_no_expire(m); |
1095 | } | 1095 | } |
1096 | } | 1096 | } |
1097 | EXPORT_SYMBOL(mntput); | 1097 | EXPORT_SYMBOL(mntput); |
1098 | 1098 | ||
1099 | struct vfsmount *mntget(struct vfsmount *mnt) | 1099 | struct vfsmount *mntget(struct vfsmount *mnt) |
1100 | { | 1100 | { |
1101 | if (mnt) | 1101 | if (mnt) |
1102 | mnt_add_count(real_mount(mnt), 1); | 1102 | mnt_add_count(real_mount(mnt), 1); |
1103 | return mnt; | 1103 | return mnt; |
1104 | } | 1104 | } |
1105 | EXPORT_SYMBOL(mntget); | 1105 | EXPORT_SYMBOL(mntget); |
1106 | 1106 | ||
1107 | struct vfsmount *mnt_clone_internal(struct path *path) | 1107 | struct vfsmount *mnt_clone_internal(struct path *path) |
1108 | { | 1108 | { |
1109 | struct mount *p; | 1109 | struct mount *p; |
1110 | p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); | 1110 | p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); |
1111 | if (IS_ERR(p)) | 1111 | if (IS_ERR(p)) |
1112 | return ERR_CAST(p); | 1112 | return ERR_CAST(p); |
1113 | p->mnt.mnt_flags |= MNT_INTERNAL; | 1113 | p->mnt.mnt_flags |= MNT_INTERNAL; |
1114 | return &p->mnt; | 1114 | return &p->mnt; |
1115 | } | 1115 | } |
1116 | 1116 | ||
1117 | static inline void mangle(struct seq_file *m, const char *s) | 1117 | static inline void mangle(struct seq_file *m, const char *s) |
1118 | { | 1118 | { |
1119 | seq_escape(m, s, " \t\n\\"); | 1119 | seq_escape(m, s, " \t\n\\"); |
1120 | } | 1120 | } |
1121 | 1121 | ||
1122 | /* | 1122 | /* |
1123 | * Simple .show_options callback for filesystems which don't want to | 1123 | * Simple .show_options callback for filesystems which don't want to |
1124 | * implement more complex mount option showing. | 1124 | * implement more complex mount option showing. |
1125 | * | 1125 | * |
1126 | * See also save_mount_options(). | 1126 | * See also save_mount_options(). |
1127 | */ | 1127 | */ |
1128 | int generic_show_options(struct seq_file *m, struct dentry *root) | 1128 | int generic_show_options(struct seq_file *m, struct dentry *root) |
1129 | { | 1129 | { |
1130 | const char *options; | 1130 | const char *options; |
1131 | 1131 | ||
1132 | rcu_read_lock(); | 1132 | rcu_read_lock(); |
1133 | options = rcu_dereference(root->d_sb->s_options); | 1133 | options = rcu_dereference(root->d_sb->s_options); |
1134 | 1134 | ||
1135 | if (options != NULL && options[0]) { | 1135 | if (options != NULL && options[0]) { |
1136 | seq_putc(m, ','); | 1136 | seq_putc(m, ','); |
1137 | mangle(m, options); | 1137 | mangle(m, options); |
1138 | } | 1138 | } |
1139 | rcu_read_unlock(); | 1139 | rcu_read_unlock(); |
1140 | 1140 | ||
1141 | return 0; | 1141 | return 0; |
1142 | } | 1142 | } |
1143 | EXPORT_SYMBOL(generic_show_options); | 1143 | EXPORT_SYMBOL(generic_show_options); |
1144 | 1144 | ||
1145 | /* | 1145 | /* |
1146 | * If a filesystem uses generic_show_options(), this function should be | 1146 | * If a filesystem uses generic_show_options(), this function should be |
1147 | * called from the fill_super() callback. | 1147 | * called from the fill_super() callback. |
1148 | * | 1148 | * |
1149 | * The .remount_fs callback usually needs to be handled in a special | 1149 | * The .remount_fs callback usually needs to be handled in a special |
1150 | * way, to make sure that previous options are not overwritten if the | 1150 | * way, to make sure that previous options are not overwritten if the |
1151 | * remount fails. | 1151 | * remount fails. |
1152 | * | 1152 | * |
1153 | * Also note that if the filesystem's .remount_fs function doesn't | 1153 | * Also note that if the filesystem's .remount_fs function doesn't |
1154 | * reset all options to their default value, but changes only newly | 1154 | * reset all options to their default value, but changes only newly |
1155 | * given options, then the displayed options will not reflect reality | 1155 | * given options, then the displayed options will not reflect reality |
1156 | * any more. | 1156 | * any more. |
1157 | */ | 1157 | */ |
1158 | void save_mount_options(struct super_block *sb, char *options) | 1158 | void save_mount_options(struct super_block *sb, char *options) |
1159 | { | 1159 | { |
1160 | BUG_ON(sb->s_options); | 1160 | BUG_ON(sb->s_options); |
1161 | rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); | 1161 | rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); |
1162 | } | 1162 | } |
1163 | EXPORT_SYMBOL(save_mount_options); | 1163 | EXPORT_SYMBOL(save_mount_options); |
1164 | 1164 | ||
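A sketch of how a simple filesystem wires the two helpers together (the foofs_* names are hypothetical):

        /* sketch: pairing save_mount_options() with generic_show_options() */
        static const struct super_operations foofs_sops = {
                .show_options   = generic_show_options,
        };

        static int foofs_fill_super(struct super_block *sb, void *data, int silent)
        {
                save_mount_options(sb, data);           /* stash the raw option string once */
                sb->s_op = &foofs_sops;                 /* /proc/mounts now echoes it back */
                /* ... rest of the superblock setup ... */
                return 0;
        }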
1165 | void replace_mount_options(struct super_block *sb, char *options) | 1165 | void replace_mount_options(struct super_block *sb, char *options) |
1166 | { | 1166 | { |
1167 | char *old = sb->s_options; | 1167 | char *old = sb->s_options; |
1168 | rcu_assign_pointer(sb->s_options, options); | 1168 | rcu_assign_pointer(sb->s_options, options); |
1169 | if (old) { | 1169 | if (old) { |
1170 | synchronize_rcu(); | 1170 | synchronize_rcu(); |
1171 | kfree(old); | 1171 | kfree(old); |
1172 | } | 1172 | } |
1173 | } | 1173 | } |
1174 | EXPORT_SYMBOL(replace_mount_options); | 1174 | EXPORT_SYMBOL(replace_mount_options); |
1175 | 1175 | ||
1176 | #ifdef CONFIG_PROC_FS | 1176 | #ifdef CONFIG_PROC_FS |
1177 | /* iterator; we want it to have access to namespace_sem, thus here... */ | 1177 | /* iterator; we want it to have access to namespace_sem, thus here... */ |
1178 | static void *m_start(struct seq_file *m, loff_t *pos) | 1178 | static void *m_start(struct seq_file *m, loff_t *pos) |
1179 | { | 1179 | { |
1180 | struct proc_mounts *p = proc_mounts(m); | 1180 | struct proc_mounts *p = proc_mounts(m); |
1181 | 1181 | ||
1182 | down_read(&namespace_sem); | 1182 | down_read(&namespace_sem); |
1183 | if (p->cached_event == p->ns->event) { | 1183 | if (p->cached_event == p->ns->event) { |
1184 | void *v = p->cached_mount; | 1184 | void *v = p->cached_mount; |
1185 | if (*pos == p->cached_index) | 1185 | if (*pos == p->cached_index) |
1186 | return v; | 1186 | return v; |
1187 | if (*pos == p->cached_index + 1) { | 1187 | if (*pos == p->cached_index + 1) { |
1188 | v = seq_list_next(v, &p->ns->list, &p->cached_index); | 1188 | v = seq_list_next(v, &p->ns->list, &p->cached_index); |
1189 | return p->cached_mount = v; | 1189 | return p->cached_mount = v; |
1190 | } | 1190 | } |
1191 | } | 1191 | } |
1192 | 1192 | ||
1193 | p->cached_event = p->ns->event; | 1193 | p->cached_event = p->ns->event; |
1194 | p->cached_mount = seq_list_start(&p->ns->list, *pos); | 1194 | p->cached_mount = seq_list_start(&p->ns->list, *pos); |
1195 | p->cached_index = *pos; | 1195 | p->cached_index = *pos; |
1196 | return p->cached_mount; | 1196 | return p->cached_mount; |
1197 | } | 1197 | } |
1198 | 1198 | ||
1199 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) | 1199 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) |
1200 | { | 1200 | { |
1201 | struct proc_mounts *p = proc_mounts(m); | 1201 | struct proc_mounts *p = proc_mounts(m); |
1202 | 1202 | ||
1203 | p->cached_mount = seq_list_next(v, &p->ns->list, pos); | 1203 | p->cached_mount = seq_list_next(v, &p->ns->list, pos); |
1204 | p->cached_index = *pos; | 1204 | p->cached_index = *pos; |
1205 | return p->cached_mount; | 1205 | return p->cached_mount; |
1206 | } | 1206 | } |
1207 | 1207 | ||
1208 | static void m_stop(struct seq_file *m, void *v) | 1208 | static void m_stop(struct seq_file *m, void *v) |
1209 | { | 1209 | { |
1210 | up_read(&namespace_sem); | 1210 | up_read(&namespace_sem); |
1211 | } | 1211 | } |
1212 | 1212 | ||
1213 | static int m_show(struct seq_file *m, void *v) | 1213 | static int m_show(struct seq_file *m, void *v) |
1214 | { | 1214 | { |
1215 | struct proc_mounts *p = proc_mounts(m); | 1215 | struct proc_mounts *p = proc_mounts(m); |
1216 | struct mount *r = list_entry(v, struct mount, mnt_list); | 1216 | struct mount *r = list_entry(v, struct mount, mnt_list); |
1217 | return p->show(m, &r->mnt); | 1217 | return p->show(m, &r->mnt); |
1218 | } | 1218 | } |
1219 | 1219 | ||
1220 | const struct seq_operations mounts_op = { | 1220 | const struct seq_operations mounts_op = { |
1221 | .start = m_start, | 1221 | .start = m_start, |
1222 | .next = m_next, | 1222 | .next = m_next, |
1223 | .stop = m_stop, | 1223 | .stop = m_stop, |
1224 | .show = m_show, | 1224 | .show = m_show, |
1225 | }; | 1225 | }; |
1226 | #endif /* CONFIG_PROC_FS */ | 1226 | #endif /* CONFIG_PROC_FS */ |
1227 | 1227 | ||
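mounts_op follows the standard seq_file contract. A hypothetical sketch of how such ops usually reach userspace; note that the real /proc/[pid]/mounts open additionally allocates the proc_mounts state that m_start() recovers via proc_mounts(m), so this minimal form would not work for mounts_op as-is:

        /* hypothetical sketch: the usual way seq_operations are exposed */
        static int mounts_open_sketch(struct inode *inode, struct file *file)
        {
                return seq_open(file, &mounts_op);
        }

        static const struct file_operations mounts_fops_sketch = {
                .open    = mounts_open_sketch,
                .read    = seq_read,
                .llseek  = seq_lseek,
                .release = seq_release,
        };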
1228 | /** | 1228 | /** |
1229 | * may_umount_tree - check if a mount tree is busy | 1229 | * may_umount_tree - check if a mount tree is busy |
1230 | * @mnt: root of mount tree | 1230 | * @mnt: root of mount tree |
1231 | * | 1231 | * |
1232 | * This is called to check if a tree of mounts has any | 1232 | * This is called to check if a tree of mounts has any |
1233 | * open files, pwds, chroots or sub mounts that are | 1233 | * open files, pwds, chroots or sub mounts that are |
1234 | * busy. | 1234 | * busy. |
1235 | */ | 1235 | */ |
1236 | int may_umount_tree(struct vfsmount *m) | 1236 | int may_umount_tree(struct vfsmount *m) |
1237 | { | 1237 | { |
1238 | struct mount *mnt = real_mount(m); | 1238 | struct mount *mnt = real_mount(m); |
1239 | int actual_refs = 0; | 1239 | int actual_refs = 0; |
1240 | int minimum_refs = 0; | 1240 | int minimum_refs = 0; |
1241 | struct mount *p; | 1241 | struct mount *p; |
1242 | BUG_ON(!m); | 1242 | BUG_ON(!m); |
1243 | 1243 | ||
1244 | /* write lock needed for mnt_get_count */ | 1244 | /* write lock needed for mnt_get_count */ |
1245 | lock_mount_hash(); | 1245 | lock_mount_hash(); |
1246 | for (p = mnt; p; p = next_mnt(p, mnt)) { | 1246 | for (p = mnt; p; p = next_mnt(p, mnt)) { |
1247 | actual_refs += mnt_get_count(p); | 1247 | actual_refs += mnt_get_count(p); |
1248 | minimum_refs += 2; | 1248 | minimum_refs += 2; |
1249 | } | 1249 | } |
1250 | unlock_mount_hash(); | 1250 | unlock_mount_hash(); |
1251 | 1251 | ||
1252 | if (actual_refs > minimum_refs) | 1252 | if (actual_refs > minimum_refs) |
1253 | return 0; | 1253 | return 0; |
1254 | 1254 | ||
1255 | return 1; | 1255 | return 1; |
1256 | } | 1256 | } |
1257 | 1257 | ||
1258 | EXPORT_SYMBOL(may_umount_tree); | 1258 | EXPORT_SYMBOL(may_umount_tree); |
1259 | 1259 | ||
1260 | /** | 1260 | /** |
1261 | * may_umount - check if a mount point is busy | 1261 | * may_umount - check if a mount point is busy |
1262 | * @mnt: root of mount | 1262 | * @mnt: root of mount |
1263 | * | 1263 | * |
1264 | * This is called to check if a mount point has any | 1264 | * This is called to check if a mount point has any |
1265 | * open files, pwds, chroots or sub mounts. If the | 1265 | * open files, pwds, chroots or sub mounts. If the |
1266 | * mount has sub mounts this will return busy | 1266 | * mount has sub mounts this will return busy |
1267 | * regardless of whether the sub mounts are busy. | 1267 | * regardless of whether the sub mounts are busy. |
1268 | * | 1268 | * |
1269 | * Doesn't take quota and stuff into account. IOW, in some cases it will | 1269 | * Doesn't take quota and stuff into account. IOW, in some cases it will |
1270 | * give false negatives. The main reason why it's here is that we need | 1270 | * give false negatives. The main reason why it's here is that we need |
1271 | * a non-destructive way to look for easily umountable filesystems. | 1271 | * a non-destructive way to look for easily umountable filesystems. |
1272 | */ | 1272 | */ |
1273 | int may_umount(struct vfsmount *mnt) | 1273 | int may_umount(struct vfsmount *mnt) |
1274 | { | 1274 | { |
1275 | int ret = 1; | 1275 | int ret = 1; |
1276 | down_read(&namespace_sem); | 1276 | down_read(&namespace_sem); |
1277 | lock_mount_hash(); | 1277 | lock_mount_hash(); |
1278 | if (propagate_mount_busy(real_mount(mnt), 2)) | 1278 | if (propagate_mount_busy(real_mount(mnt), 2)) |
1279 | ret = 0; | 1279 | ret = 0; |
1280 | unlock_mount_hash(); | 1280 | unlock_mount_hash(); |
1281 | up_read(&namespace_sem); | 1281 | up_read(&namespace_sem); |
1282 | return ret; | 1282 | return ret; |
1283 | } | 1283 | } |
1284 | 1284 | ||
1285 | EXPORT_SYMBOL(may_umount); | 1285 | EXPORT_SYMBOL(may_umount); |
1286 | 1286 | ||
1287 | static HLIST_HEAD(unmounted); /* protected by namespace_sem */ | 1287 | static HLIST_HEAD(unmounted); /* protected by namespace_sem */ |
1288 | 1288 | ||
1289 | static void namespace_unlock(void) | 1289 | static void namespace_unlock(void) |
1290 | { | 1290 | { |
1291 | struct mount *mnt; | 1291 | struct mount *mnt; |
1292 | struct hlist_head head = unmounted; | 1292 | struct hlist_head head = unmounted; |
1293 | 1293 | ||
1294 | if (likely(hlist_empty(&head))) { | 1294 | if (likely(hlist_empty(&head))) { |
1295 | up_write(&namespace_sem); | 1295 | up_write(&namespace_sem); |
1296 | return; | 1296 | return; |
1297 | } | 1297 | } |
1298 | 1298 | ||
1299 | head.first->pprev = &head.first; | 1299 | head.first->pprev = &head.first; |
1300 | INIT_HLIST_HEAD(&unmounted); | 1300 | INIT_HLIST_HEAD(&unmounted); |
1301 | 1301 | ||
1302 | /* undo decrements we'd done in umount_tree() */ | 1302 | /* undo decrements we'd done in umount_tree() */ |
1303 | hlist_for_each_entry(mnt, &head, mnt_hash) | 1303 | hlist_for_each_entry(mnt, &head, mnt_hash) |
1304 | if (mnt->mnt_ex_mountpoint.mnt) | 1304 | if (mnt->mnt_ex_mountpoint.mnt) |
1305 | mntget(mnt->mnt_ex_mountpoint.mnt); | 1305 | mntget(mnt->mnt_ex_mountpoint.mnt); |
1306 | 1306 | ||
1307 | up_write(&namespace_sem); | 1307 | up_write(&namespace_sem); |
1308 | 1308 | ||
1309 | synchronize_rcu(); | 1309 | synchronize_rcu(); |
1310 | 1310 | ||
1311 | while (!hlist_empty(&head)) { | 1311 | while (!hlist_empty(&head)) { |
1312 | mnt = hlist_entry(head.first, struct mount, mnt_hash); | 1312 | mnt = hlist_entry(head.first, struct mount, mnt_hash); |
1313 | hlist_del_init(&mnt->mnt_hash); | 1313 | hlist_del_init(&mnt->mnt_hash); |
1314 | if (mnt->mnt_ex_mountpoint.mnt) | 1314 | if (mnt->mnt_ex_mountpoint.mnt) |
1315 | path_put(&mnt->mnt_ex_mountpoint); | 1315 | path_put(&mnt->mnt_ex_mountpoint); |
1316 | mntput(&mnt->mnt); | 1316 | mntput(&mnt->mnt); |
1317 | } | 1317 | } |
1318 | } | 1318 | } |
1319 | 1319 | ||
1320 | static inline void namespace_lock(void) | 1320 | static inline void namespace_lock(void) |
1321 | { | 1321 | { |
1322 | down_write(&namespace_sem); | 1322 | down_write(&namespace_sem); |
1323 | } | 1323 | } |
1324 | 1324 | ||
1325 | /* | 1325 | /* |
1326 | * mount_lock must be held | 1326 | * mount_lock must be held |
1327 | * namespace_sem must be held for write | 1327 | * namespace_sem must be held for write |
1328 | * how = 0 => just this tree, don't propagate | 1328 | * how = 0 => just this tree, don't propagate |
1329 | * how = 1 => propagate; we know that nobody else has reference to any victims | 1329 | * how = 1 => propagate; we know that nobody else has reference to any victims |
1330 | * how = 2 => lazy umount | 1330 | * how = 2 => lazy umount |
1331 | */ | 1331 | */ |
1332 | void umount_tree(struct mount *mnt, int how) | 1332 | void umount_tree(struct mount *mnt, int how) |
1333 | { | 1333 | { |
1334 | HLIST_HEAD(tmp_list); | 1334 | HLIST_HEAD(tmp_list); |
1335 | struct mount *p; | 1335 | struct mount *p; |
1336 | struct mount *last = NULL; | 1336 | struct mount *last = NULL; |
1337 | 1337 | ||
1338 | for (p = mnt; p; p = next_mnt(p, mnt)) { | 1338 | for (p = mnt; p; p = next_mnt(p, mnt)) { |
1339 | hlist_del_init_rcu(&p->mnt_hash); | 1339 | hlist_del_init_rcu(&p->mnt_hash); |
1340 | hlist_add_head(&p->mnt_hash, &tmp_list); | 1340 | hlist_add_head(&p->mnt_hash, &tmp_list); |
1341 | } | 1341 | } |
1342 | 1342 | ||
1343 | hlist_for_each_entry(p, &tmp_list, mnt_hash) | 1343 | hlist_for_each_entry(p, &tmp_list, mnt_hash) |
1344 | list_del_init(&p->mnt_child); | 1344 | list_del_init(&p->mnt_child); |
1345 | 1345 | ||
1346 | if (how) | 1346 | if (how) |
1347 | propagate_umount(&tmp_list); | 1347 | propagate_umount(&tmp_list); |
1348 | 1348 | ||
1349 | hlist_for_each_entry(p, &tmp_list, mnt_hash) { | 1349 | hlist_for_each_entry(p, &tmp_list, mnt_hash) { |
1350 | list_del_init(&p->mnt_expire); | 1350 | list_del_init(&p->mnt_expire); |
1351 | list_del_init(&p->mnt_list); | 1351 | list_del_init(&p->mnt_list); |
1352 | __touch_mnt_namespace(p->mnt_ns); | 1352 | __touch_mnt_namespace(p->mnt_ns); |
1353 | p->mnt_ns = NULL; | 1353 | p->mnt_ns = NULL; |
1354 | if (how < 2) | 1354 | if (how < 2) |
1355 | p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; | 1355 | p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; |
1356 | if (mnt_has_parent(p)) { | 1356 | if (mnt_has_parent(p)) { |
1357 | hlist_del_init(&p->mnt_mp_list); | 1357 | hlist_del_init(&p->mnt_mp_list); |
1358 | put_mountpoint(p->mnt_mp); | 1358 | put_mountpoint(p->mnt_mp); |
1359 | mnt_add_count(p->mnt_parent, -1); | 1359 | mnt_add_count(p->mnt_parent, -1); |
1360 | /* move the reference to mountpoint into ->mnt_ex_mountpoint */ | 1360 | /* move the reference to mountpoint into ->mnt_ex_mountpoint */ |
1361 | p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; | 1361 | p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; |
1362 | p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; | 1362 | p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; |
1363 | p->mnt_mountpoint = p->mnt.mnt_root; | 1363 | p->mnt_mountpoint = p->mnt.mnt_root; |
1364 | p->mnt_parent = p; | 1364 | p->mnt_parent = p; |
1365 | p->mnt_mp = NULL; | 1365 | p->mnt_mp = NULL; |
1366 | } | 1366 | } |
1367 | change_mnt_propagation(p, MS_PRIVATE); | 1367 | change_mnt_propagation(p, MS_PRIVATE); |
1368 | last = p; | 1368 | last = p; |
1369 | } | 1369 | } |
1370 | if (last) { | 1370 | if (last) { |
1371 | last->mnt_hash.next = unmounted.first; | 1371 | last->mnt_hash.next = unmounted.first; |
1372 | unmounted.first = tmp_list.first; | 1372 | unmounted.first = tmp_list.first; |
1373 | unmounted.first->pprev = &unmounted.first; | 1373 | unmounted.first->pprev = &unmounted.first; |
1374 | } | 1374 | } |
1375 | } | 1375 | } |
1376 | 1376 | ||
1377 | static void shrink_submounts(struct mount *mnt); | 1377 | static void shrink_submounts(struct mount *mnt); |
1378 | 1378 | ||
1379 | static int do_umount(struct mount *mnt, int flags) | 1379 | static int do_umount(struct mount *mnt, int flags) |
1380 | { | 1380 | { |
1381 | struct super_block *sb = mnt->mnt.mnt_sb; | 1381 | struct super_block *sb = mnt->mnt.mnt_sb; |
1382 | int retval; | 1382 | int retval; |
1383 | 1383 | ||
1384 | retval = security_sb_umount(&mnt->mnt, flags); | 1384 | retval = security_sb_umount(&mnt->mnt, flags); |
1385 | if (retval) | 1385 | if (retval) |
1386 | return retval; | 1386 | return retval; |
1387 | 1387 | ||
1388 | /* | 1388 | /* |
1389 | * Allow userspace to request a mountpoint be expired rather than | 1389 | * Allow userspace to request a mountpoint be expired rather than |
1390 | * unmounting unconditionally. Unmount only happens if: | 1390 | * unmounting unconditionally. Unmount only happens if: |
1391 | * (1) the mark is already set (the mark is cleared by mntput()) | 1391 | * (1) the mark is already set (the mark is cleared by mntput()) |
1392 | * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] | 1392 | * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] |
1393 | */ | 1393 | */ |
1394 | if (flags & MNT_EXPIRE) { | 1394 | if (flags & MNT_EXPIRE) { |
1395 | if (&mnt->mnt == current->fs->root.mnt || | 1395 | if (&mnt->mnt == current->fs->root.mnt || |
1396 | flags & (MNT_FORCE | MNT_DETACH)) | 1396 | flags & (MNT_FORCE | MNT_DETACH)) |
1397 | return -EINVAL; | 1397 | return -EINVAL; |
1398 | 1398 | ||
1399 | /* | 1399 | /* |
1400 | * probably don't strictly need the lock here if we examined | 1400 | * probably don't strictly need the lock here if we examined |
1401 | * all race cases, but it's a slowpath. | 1401 | * all race cases, but it's a slowpath. |
1402 | */ | 1402 | */ |
1403 | lock_mount_hash(); | 1403 | lock_mount_hash(); |
1404 | if (mnt_get_count(mnt) != 2) { | 1404 | if (mnt_get_count(mnt) != 2) { |
1405 | unlock_mount_hash(); | 1405 | unlock_mount_hash(); |
1406 | return -EBUSY; | 1406 | return -EBUSY; |
1407 | } | 1407 | } |
1408 | unlock_mount_hash(); | 1408 | unlock_mount_hash(); |
1409 | 1409 | ||
1410 | if (!xchg(&mnt->mnt_expiry_mark, 1)) | 1410 | if (!xchg(&mnt->mnt_expiry_mark, 1)) |
1411 | return -EAGAIN; | 1411 | return -EAGAIN; |
1412 | } | 1412 | } |
1413 | 1413 | ||
1414 | /* | 1414 | /* |
1415 | * If we may have to abort operations to get out of this | 1415 | * If we may have to abort operations to get out of this |
1416 | * mount, and they will themselves hold resources we must | 1416 | * mount, and they will themselves hold resources we must |
1417 | * allow the fs to do things. In the Unix tradition of | 1417 | * allow the fs to do things. In the Unix tradition of |
1418 | * 'Gee, that's tricky, let's do it in userspace' the umount_begin | 1418 | * 'Gee, that's tricky, let's do it in userspace' the umount_begin |
1419 | * might fail to complete on the first run through as other tasks | 1419 | * might fail to complete on the first run through as other tasks |
1420 | * must return, and the like. That's for the mount program to worry | 1420 | * must return, and the like. That's for the mount program to worry |
1421 | * about for the moment. | 1421 | * about for the moment. |
1422 | */ | 1422 | */ |
1423 | 1423 | ||
1424 | if (flags & MNT_FORCE && sb->s_op->umount_begin) { | 1424 | if (flags & MNT_FORCE && sb->s_op->umount_begin) { |
1425 | sb->s_op->umount_begin(sb); | 1425 | sb->s_op->umount_begin(sb); |
1426 | } | 1426 | } |
1427 | 1427 | ||
1428 | /* | 1428 | /* |
1429 | * No sense in grabbing the lock for this test, but the test itself looks | 1429 | * No sense in grabbing the lock for this test, but the test itself looks |
1430 | * somewhat bogus. Suggestions for better replacement? | 1430 | * somewhat bogus. Suggestions for better replacement? |
1431 | * Ho-hum... In principle, we might treat that as umount + switch | 1431 | * Ho-hum... In principle, we might treat that as umount + switch |
1432 | * to rootfs. GC would eventually take care of the old vfsmount. | 1432 | * to rootfs. GC would eventually take care of the old vfsmount. |
1433 | * Actually it makes sense, especially if rootfs would contain a | 1433 | * Actually it makes sense, especially if rootfs would contain a |
1434 | * /reboot - static binary that would close all descriptors and | 1434 | * /reboot - static binary that would close all descriptors and |
1435 | * call reboot(9). Then init(8) could umount root and exec /reboot. | 1435 | * call reboot(9). Then init(8) could umount root and exec /reboot. |
1436 | */ | 1436 | */ |
1437 | if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { | 1437 | if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { |
1438 | /* | 1438 | /* |
1439 | * Special case for "unmounting" root ... | 1439 | * Special case for "unmounting" root ... |
1440 | * we just try to remount it readonly. | 1440 | * we just try to remount it readonly. |
1441 | */ | 1441 | */ |
1442 | if (!capable(CAP_SYS_ADMIN)) | 1442 | if (!capable(CAP_SYS_ADMIN)) |
1443 | return -EPERM; | 1443 | return -EPERM; |
1444 | down_write(&sb->s_umount); | 1444 | down_write(&sb->s_umount); |
1445 | if (!(sb->s_flags & MS_RDONLY)) | 1445 | if (!(sb->s_flags & MS_RDONLY)) |
1446 | retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); | 1446 | retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); |
1447 | up_write(&sb->s_umount); | 1447 | up_write(&sb->s_umount); |
1448 | return retval; | 1448 | return retval; |
1449 | } | 1449 | } |
1450 | 1450 | ||
1451 | namespace_lock(); | 1451 | namespace_lock(); |
1452 | lock_mount_hash(); | 1452 | lock_mount_hash(); |
1453 | event++; | 1453 | event++; |
1454 | 1454 | ||
1455 | if (flags & MNT_DETACH) { | 1455 | if (flags & MNT_DETACH) { |
1456 | if (!list_empty(&mnt->mnt_list)) | 1456 | if (!list_empty(&mnt->mnt_list)) |
1457 | umount_tree(mnt, 2); | 1457 | umount_tree(mnt, 2); |
1458 | retval = 0; | 1458 | retval = 0; |
1459 | } else { | 1459 | } else { |
1460 | shrink_submounts(mnt); | 1460 | shrink_submounts(mnt); |
1461 | retval = -EBUSY; | 1461 | retval = -EBUSY; |
1462 | if (!propagate_mount_busy(mnt, 2)) { | 1462 | if (!propagate_mount_busy(mnt, 2)) { |
1463 | if (!list_empty(&mnt->mnt_list)) | 1463 | if (!list_empty(&mnt->mnt_list)) |
1464 | umount_tree(mnt, 1); | 1464 | umount_tree(mnt, 1); |
1465 | retval = 0; | 1465 | retval = 0; |
1466 | } | 1466 | } |
1467 | } | 1467 | } |
1468 | unlock_mount_hash(); | 1468 | unlock_mount_hash(); |
1469 | namespace_unlock(); | 1469 | namespace_unlock(); |
1470 | return retval; | 1470 | return retval; |
1471 | } | 1471 | } |
1472 | 1472 | ||
1473 | /* | 1473 | /* |
1474 | * __detach_mounts - lazily unmount all mounts on the specified dentry | 1474 | * __detach_mounts - lazily unmount all mounts on the specified dentry |
1475 | * | 1475 | * |
1476 | * During unlink, rmdir, and d_drop it is possible to lose the path | 1476 | * During unlink, rmdir, and d_drop it is possible to lose the path |
1477 | * to an existing mountpoint, and wind up leaking the mount. | 1477 | * to an existing mountpoint, and wind up leaking the mount. |
1478 | * detach_mounts allows lazily unmounting those mounts instead of | 1478 | * detach_mounts allows lazily unmounting those mounts instead of |
1479 | * leaking them. | 1479 | * leaking them. |
1480 | * | 1480 | * |
1481 | * The caller may hold dentry->d_inode->i_mutex. | 1481 | * The caller may hold dentry->d_inode->i_mutex. |
1482 | */ | 1482 | */ |
1483 | void __detach_mounts(struct dentry *dentry) | 1483 | void __detach_mounts(struct dentry *dentry) |
1484 | { | 1484 | { |
1485 | struct mountpoint *mp; | 1485 | struct mountpoint *mp; |
1486 | struct mount *mnt; | 1486 | struct mount *mnt; |
1487 | 1487 | ||
1488 | namespace_lock(); | 1488 | namespace_lock(); |
1489 | mp = lookup_mountpoint(dentry); | 1489 | mp = lookup_mountpoint(dentry); |
1490 | if (!mp) | 1490 | if (!mp) |
1491 | goto out_unlock; | 1491 | goto out_unlock; |
1492 | 1492 | ||
1493 | lock_mount_hash(); | 1493 | lock_mount_hash(); |
1494 | while (!hlist_empty(&mp->m_list)) { | 1494 | while (!hlist_empty(&mp->m_list)) { |
1495 | mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); | 1495 | mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); |
1496 | umount_tree(mnt, 2); | 1496 | umount_tree(mnt, 2); |
1497 | } | 1497 | } |
1498 | unlock_mount_hash(); | 1498 | unlock_mount_hash(); |
1499 | put_mountpoint(mp); | 1499 | put_mountpoint(mp); |
1500 | out_unlock: | 1500 | out_unlock: |
1501 | namespace_unlock(); | 1501 | namespace_unlock(); |
1502 | } | 1502 | } |
1503 | 1503 | ||
1504 | /* | 1504 | /* |
1505 | * Is the caller allowed to modify his namespace? | 1505 | * Is the caller allowed to modify his namespace? |
1506 | */ | 1506 | */ |
1507 | static inline bool may_mount(void) | 1507 | static inline bool may_mount(void) |
1508 | { | 1508 | { |
1509 | return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); | 1509 | return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); |
1510 | } | 1510 | } |
1511 | 1511 | ||
1512 | /* | 1512 | /* |
1513 | * Now umount can handle mount points as well as block devices. | 1513 | * Now umount can handle mount points as well as block devices. |
1514 | * This is important for filesystems which use unnamed block devices. | 1514 | * This is important for filesystems which use unnamed block devices. |
1515 | * | 1515 | * |
1516 | * We now support a flag for forced unmount like the other 'big iron' | 1516 | * We now support a flag for forced unmount like the other 'big iron' |
1517 | * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD | 1517 | * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD |
1518 | */ | 1518 | */ |
1519 | 1519 | ||
1520 | SYSCALL_DEFINE2(umount, char __user *, name, int, flags) | 1520 | SYSCALL_DEFINE2(umount, char __user *, name, int, flags) |
1521 | { | 1521 | { |
1522 | struct path path; | 1522 | struct path path; |
1523 | struct mount *mnt; | 1523 | struct mount *mnt; |
1524 | int retval; | 1524 | int retval; |
1525 | int lookup_flags = 0; | 1525 | int lookup_flags = 0; |
1526 | 1526 | ||
1527 | if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) | 1527 | if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) |
1528 | return -EINVAL; | 1528 | return -EINVAL; |
1529 | 1529 | ||
1530 | if (!may_mount()) | 1530 | if (!may_mount()) |
1531 | return -EPERM; | 1531 | return -EPERM; |
1532 | 1532 | ||
1533 | if (!(flags & UMOUNT_NOFOLLOW)) | 1533 | if (!(flags & UMOUNT_NOFOLLOW)) |
1534 | lookup_flags |= LOOKUP_FOLLOW; | 1534 | lookup_flags |= LOOKUP_FOLLOW; |
1535 | 1535 | ||
1536 | retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); | 1536 | retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); |
1537 | if (retval) | 1537 | if (retval) |
1538 | goto out; | 1538 | goto out; |
1539 | mnt = real_mount(path.mnt); | 1539 | mnt = real_mount(path.mnt); |
1540 | retval = -EINVAL; | 1540 | retval = -EINVAL; |
1541 | if (path.dentry != path.mnt->mnt_root) | 1541 | if (path.dentry != path.mnt->mnt_root) |
1542 | goto dput_and_out; | 1542 | goto dput_and_out; |
1543 | if (!check_mnt(mnt)) | 1543 | if (!check_mnt(mnt)) |
1544 | goto dput_and_out; | 1544 | goto dput_and_out; |
1545 | if (mnt->mnt.mnt_flags & MNT_LOCKED) | 1545 | if (mnt->mnt.mnt_flags & MNT_LOCKED) |
1546 | goto dput_and_out; | 1546 | goto dput_and_out; |
1547 | 1547 | ||
1548 | retval = do_umount(mnt, flags); | 1548 | retval = do_umount(mnt, flags); |
1549 | dput_and_out: | 1549 | dput_and_out: |
1550 | /* we mustn't call path_put() as that would clear mnt_expiry_mark */ | 1550 | /* we mustn't call path_put() as that would clear mnt_expiry_mark */ |
1551 | dput(path.dentry); | 1551 | dput(path.dentry); |
1552 | mntput_no_expire(mnt); | 1552 | mntput_no_expire(mnt); |
1553 | out: | 1553 | out: |
1554 | return retval; | 1554 | return retval; |
1555 | } | 1555 | } |
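
The flag handling above maps directly onto the userspace umount2(2) wrapper. A minimal sketch (the mount point /mnt/data is illustrative):

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* plain unmount: fails with EBUSY while the mount is in use */
            if (umount2("/mnt/data", 0) == 0)
                    return 0;
            if (errno != EBUSY) {
                    fprintf(stderr, "umount2: %s\n", strerror(errno));
                    return 1;
            }
            /* MNT_DETACH: detach from the tree now (the umount_tree(mnt, 2)
             * branch above), actual teardown once the last user drops it */
            if (umount2("/mnt/data", MNT_DETACH) != 0) {
                    fprintf(stderr, "umount2(MNT_DETACH): %s\n", strerror(errno));
                    return 1;
            }
            return 0;
    }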
1556 | 1556 | ||
1557 | #ifdef __ARCH_WANT_SYS_OLDUMOUNT | 1557 | #ifdef __ARCH_WANT_SYS_OLDUMOUNT |
1558 | 1558 | ||
1559 | /* | 1559 | /* |
1560 | * The 2.0 compatible umount. No flags. | 1560 | * The 2.0 compatible umount. No flags. |
1561 | */ | 1561 | */ |
1562 | SYSCALL_DEFINE1(oldumount, char __user *, name) | 1562 | SYSCALL_DEFINE1(oldumount, char __user *, name) |
1563 | { | 1563 | { |
1564 | return sys_umount(name, 0); | 1564 | return sys_umount(name, 0); |
1565 | } | 1565 | } |
1566 | 1566 | ||
1567 | #endif | 1567 | #endif |
1568 | 1568 | ||
1569 | static bool is_mnt_ns_file(struct dentry *dentry) | 1569 | static bool is_mnt_ns_file(struct dentry *dentry) |
1570 | { | 1570 | { |
1571 | /* Is this a proxy for a mount namespace? */ | 1571 | /* Is this a proxy for a mount namespace? */ |
1572 | struct inode *inode = dentry->d_inode; | 1572 | return dentry->d_op == &ns_dentry_operations && |
1573 | return proc_ns_inode(inode) && dentry->d_fsdata == &mntns_operations; | 1573 | dentry->d_fsdata == &mntns_operations; |
1574 | } | 1574 | } |
1575 | 1575 | ||
1576 | struct mnt_namespace *to_mnt_ns(struct ns_common *ns) | 1576 | struct mnt_namespace *to_mnt_ns(struct ns_common *ns) |
1577 | { | 1577 | { |
1578 | return container_of(ns, struct mnt_namespace, ns); | 1578 | return container_of(ns, struct mnt_namespace, ns); |
1579 | } | 1579 | } |
1580 | 1580 | ||
1581 | static bool mnt_ns_loop(struct dentry *dentry) | 1581 | static bool mnt_ns_loop(struct dentry *dentry) |
1582 | { | 1582 | { |
1583 | /* Could bind mounting the mount namespace inode cause a | 1583 | /* Could bind mounting the mount namespace inode cause a |
1584 | * mount namespace loop? | 1584 | * mount namespace loop? |
1585 | */ | 1585 | */ |
1586 | struct mnt_namespace *mnt_ns; | 1586 | struct mnt_namespace *mnt_ns; |
1587 | if (!is_mnt_ns_file(dentry)) | 1587 | if (!is_mnt_ns_file(dentry)) |
1588 | return false; | 1588 | return false; |
1589 | 1589 | ||
1590 | mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); | 1590 | mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); |
1591 | return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; | 1591 | return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; |
1592 | } | 1592 | } |
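
The seq comparison only permits binding namespace files of mount namespaces created after the binder's own; binding your own namespace file would let the namespace pin itself. A hedged userspace sketch of the rejected case (paths illustrative):

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* /tmp/mntns: an empty regular file created beforehand */
            if (mount("/proc/self/ns/mnt", "/tmp/mntns", NULL, MS_BIND, NULL)) {
                    /* expected: EINVAL -- current mnt_ns->seq >= target's seq */
                    fprintf(stderr, "mount: %s\n", strerror(errno));
                    return 1;
            }
            return 0;
    }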
1593 | 1593 | ||
1594 | struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, | 1594 | struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, |
1595 | int flag) | 1595 | int flag) |
1596 | { | 1596 | { |
1597 | struct mount *res, *p, *q, *r, *parent; | 1597 | struct mount *res, *p, *q, *r, *parent; |
1598 | 1598 | ||
1599 | if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) | 1599 | if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) |
1600 | return ERR_PTR(-EINVAL); | 1600 | return ERR_PTR(-EINVAL); |
1601 | 1601 | ||
1602 | if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) | 1602 | if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) |
1603 | return ERR_PTR(-EINVAL); | 1603 | return ERR_PTR(-EINVAL); |
1604 | 1604 | ||
1605 | res = q = clone_mnt(mnt, dentry, flag); | 1605 | res = q = clone_mnt(mnt, dentry, flag); |
1606 | if (IS_ERR(q)) | 1606 | if (IS_ERR(q)) |
1607 | return q; | 1607 | return q; |
1608 | 1608 | ||
1609 | q->mnt.mnt_flags &= ~MNT_LOCKED; | 1609 | q->mnt.mnt_flags &= ~MNT_LOCKED; |
1610 | q->mnt_mountpoint = mnt->mnt_mountpoint; | 1610 | q->mnt_mountpoint = mnt->mnt_mountpoint; |
1611 | 1611 | ||
1612 | p = mnt; | 1612 | p = mnt; |
1613 | list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { | 1613 | list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { |
1614 | struct mount *s; | 1614 | struct mount *s; |
1615 | if (!is_subdir(r->mnt_mountpoint, dentry)) | 1615 | if (!is_subdir(r->mnt_mountpoint, dentry)) |
1616 | continue; | 1616 | continue; |
1617 | 1617 | ||
1618 | for (s = r; s; s = next_mnt(s, r)) { | 1618 | for (s = r; s; s = next_mnt(s, r)) { |
1619 | struct mount *t = NULL; | 1619 | struct mount *t = NULL; |
1620 | if (!(flag & CL_COPY_UNBINDABLE) && | 1620 | if (!(flag & CL_COPY_UNBINDABLE) && |
1621 | IS_MNT_UNBINDABLE(s)) { | 1621 | IS_MNT_UNBINDABLE(s)) { |
1622 | s = skip_mnt_tree(s); | 1622 | s = skip_mnt_tree(s); |
1623 | continue; | 1623 | continue; |
1624 | } | 1624 | } |
1625 | if (!(flag & CL_COPY_MNT_NS_FILE) && | 1625 | if (!(flag & CL_COPY_MNT_NS_FILE) && |
1626 | is_mnt_ns_file(s->mnt.mnt_root)) { | 1626 | is_mnt_ns_file(s->mnt.mnt_root)) { |
1627 | s = skip_mnt_tree(s); | 1627 | s = skip_mnt_tree(s); |
1628 | continue; | 1628 | continue; |
1629 | } | 1629 | } |
1630 | while (p != s->mnt_parent) { | 1630 | while (p != s->mnt_parent) { |
1631 | p = p->mnt_parent; | 1631 | p = p->mnt_parent; |
1632 | q = q->mnt_parent; | 1632 | q = q->mnt_parent; |
1633 | } | 1633 | } |
1634 | p = s; | 1634 | p = s; |
1635 | parent = q; | 1635 | parent = q; |
1636 | q = clone_mnt(p, p->mnt.mnt_root, flag); | 1636 | q = clone_mnt(p, p->mnt.mnt_root, flag); |
1637 | if (IS_ERR(q)) | 1637 | if (IS_ERR(q)) |
1638 | goto out; | 1638 | goto out; |
1639 | lock_mount_hash(); | 1639 | lock_mount_hash(); |
1640 | list_add_tail(&q->mnt_list, &res->mnt_list); | 1640 | list_add_tail(&q->mnt_list, &res->mnt_list); |
1641 | mnt_set_mountpoint(parent, p->mnt_mp, q); | 1641 | mnt_set_mountpoint(parent, p->mnt_mp, q); |
1642 | if (!list_empty(&parent->mnt_mounts)) { | 1642 | if (!list_empty(&parent->mnt_mounts)) { |
1643 | t = list_last_entry(&parent->mnt_mounts, | 1643 | t = list_last_entry(&parent->mnt_mounts, |
1644 | struct mount, mnt_child); | 1644 | struct mount, mnt_child); |
1645 | if (t->mnt_mp != p->mnt_mp) | 1645 | if (t->mnt_mp != p->mnt_mp) |
1646 | t = NULL; | 1646 | t = NULL; |
1647 | } | 1647 | } |
1648 | attach_shadowed(q, parent, t); | 1648 | attach_shadowed(q, parent, t); |
1649 | unlock_mount_hash(); | 1649 | unlock_mount_hash(); |
1650 | } | 1650 | } |
1651 | } | 1651 | } |
1652 | return res; | 1652 | return res; |
1653 | out: | 1653 | out: |
1654 | if (res) { | 1654 | if (res) { |
1655 | lock_mount_hash(); | 1655 | lock_mount_hash(); |
1656 | umount_tree(res, 0); | 1656 | umount_tree(res, 0); |
1657 | unlock_mount_hash(); | 1657 | unlock_mount_hash(); |
1658 | } | 1658 | } |
1659 | return q; | 1659 | return q; |
1660 | } | 1660 | } |
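
The walk above maintains the invariant that q is always the copy of p; whenever the pre-order iterator leaves a subtree, both pointers climb in lockstep. A toy, non-kernel sketch of the same invariant (hypothetical node type, allocation checks omitted, sibling order not preserved):

    #include <stdlib.h>

    struct tnode {
            struct tnode *parent, *child, *next;    /* first-child / next-sibling */
    };

    /* pre-order successor of n within the subtree rooted at root */
    static struct tnode *next_node(struct tnode *n, struct tnode *root)
    {
            if (n->child)
                    return n->child;
            while (n != root) {
                    if (n->next)
                            return n->next;
                    n = n->parent;
            }
            return NULL;
    }

    static struct tnode *new_node(struct tnode *parent)
    {
            struct tnode *n = calloc(1, sizeof(*n));

            n->parent = parent;
            if (parent) {                   /* prepend under parent */
                    n->next = parent->child;
                    parent->child = n;
            }
            return n;
    }

    struct tnode *toy_copy_tree(struct tnode *root)
    {
            struct tnode *res = new_node(NULL);
            struct tnode *p = root, *q = res, *s;

            for (s = next_node(root, root); s; s = next_node(s, root)) {
                    while (p != s->parent) {        /* left a subtree: climb both */
                            p = p->parent;
                            q = q->parent;
                    }
                    q = new_node(q);                /* q is now the copy of s */
                    p = s;
            }
            return res;
    }

    int main(void)
    {
            struct tnode *root = new_node(NULL);
            new_node(new_node(root));       /* root -> a -> b */
            new_node(root);                 /* root -> c */
            return toy_copy_tree(root) ? 0 : 1;
    }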
1661 | 1661 | ||
1662 | /* Caller should check returned pointer for errors */ | 1662 | /* Caller should check returned pointer for errors */ |
1663 | 1663 | ||
1664 | struct vfsmount *collect_mounts(struct path *path) | 1664 | struct vfsmount *collect_mounts(struct path *path) |
1665 | { | 1665 | { |
1666 | struct mount *tree; | 1666 | struct mount *tree; |
1667 | namespace_lock(); | 1667 | namespace_lock(); |
1668 | tree = copy_tree(real_mount(path->mnt), path->dentry, | 1668 | tree = copy_tree(real_mount(path->mnt), path->dentry, |
1669 | CL_COPY_ALL | CL_PRIVATE); | 1669 | CL_COPY_ALL | CL_PRIVATE); |
1670 | namespace_unlock(); | 1670 | namespace_unlock(); |
1671 | if (IS_ERR(tree)) | 1671 | if (IS_ERR(tree)) |
1672 | return ERR_CAST(tree); | 1672 | return ERR_CAST(tree); |
1673 | return &tree->mnt; | 1673 | return &tree->mnt; |
1674 | } | 1674 | } |
1675 | 1675 | ||
1676 | void drop_collected_mounts(struct vfsmount *mnt) | 1676 | void drop_collected_mounts(struct vfsmount *mnt) |
1677 | { | 1677 | { |
1678 | namespace_lock(); | 1678 | namespace_lock(); |
1679 | lock_mount_hash(); | 1679 | lock_mount_hash(); |
1680 | umount_tree(real_mount(mnt), 0); | 1680 | umount_tree(real_mount(mnt), 0); |
1681 | unlock_mount_hash(); | 1681 | unlock_mount_hash(); |
1682 | namespace_unlock(); | 1682 | namespace_unlock(); |
1683 | } | 1683 | } |
1684 | 1684 | ||
1685 | /** | 1685 | /** |
1686 | * clone_private_mount - create a private clone of a path | 1686 | * clone_private_mount - create a private clone of a path |
1687 | * | 1687 | * |
1688 | * This creates a new vfsmount, which will be the clone of @path. The new will | 1688 | * This creates a new vfsmount, which will be the clone of @path. The new will |
1689 | * not be attached anywhere in the namespace and will be private (i.e. changes | 1689 | * not be attached anywhere in the namespace and will be private (i.e. changes |
1690 | * to the originating mount won't be propagated into this). | 1690 | * to the originating mount won't be propagated into this). |
1691 | * | 1691 | * |
1692 | * Release with mntput(). | 1692 | * Release with mntput(). |
1693 | */ | 1693 | */ |
1694 | struct vfsmount *clone_private_mount(struct path *path) | 1694 | struct vfsmount *clone_private_mount(struct path *path) |
1695 | { | 1695 | { |
1696 | struct mount *old_mnt = real_mount(path->mnt); | 1696 | struct mount *old_mnt = real_mount(path->mnt); |
1697 | struct mount *new_mnt; | 1697 | struct mount *new_mnt; |
1698 | 1698 | ||
1699 | if (IS_MNT_UNBINDABLE(old_mnt)) | 1699 | if (IS_MNT_UNBINDABLE(old_mnt)) |
1700 | return ERR_PTR(-EINVAL); | 1700 | return ERR_PTR(-EINVAL); |
1701 | 1701 | ||
1702 | down_read(&namespace_sem); | 1702 | down_read(&namespace_sem); |
1703 | new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); | 1703 | new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); |
1704 | up_read(&namespace_sem); | 1704 | up_read(&namespace_sem); |
1705 | if (IS_ERR(new_mnt)) | 1705 | if (IS_ERR(new_mnt)) |
1706 | return ERR_CAST(new_mnt); | 1706 | return ERR_CAST(new_mnt); |
1707 | 1707 | ||
1708 | return &new_mnt->mnt; | 1708 | return &new_mnt->mnt; |
1709 | } | 1709 | } |
1710 | EXPORT_SYMBOL_GPL(clone_private_mount); | 1710 | EXPORT_SYMBOL_GPL(clone_private_mount); |
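
A hedged kernel-side sketch of using the exported helper; some_path is an assumed, already-resolved struct path and the surrounding module context is not shown:

    struct vfsmount *m = clone_private_mount(&some_path);

    if (IS_ERR(m))
            return PTR_ERR(m);
    /* ... private lookups under m; later mount activity on the
     * originating tree no longer propagates here ... */
    mntput(m);      /* release, as documented above */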
1711 | 1711 | ||
1712 | int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, | 1712 | int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, |
1713 | struct vfsmount *root) | 1713 | struct vfsmount *root) |
1714 | { | 1714 | { |
1715 | struct mount *mnt; | 1715 | struct mount *mnt; |
1716 | int res = f(root, arg); | 1716 | int res = f(root, arg); |
1717 | if (res) | 1717 | if (res) |
1718 | return res; | 1718 | return res; |
1719 | list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { | 1719 | list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { |
1720 | res = f(&mnt->mnt, arg); | 1720 | res = f(&mnt->mnt, arg); |
1721 | if (res) | 1721 | if (res) |
1722 | return res; | 1722 | return res; |
1723 | } | 1723 | } |
1724 | return 0; | 1724 | return 0; |
1725 | } | 1725 | } |
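
A hedged sketch of how an in-kernel user (the audit subsystem is the existing one) might pair collect_mounts()/iterate_mounts()/drop_collected_mounts(); count_one() and count_mounts() are hypothetical names:

    static int count_one(struct vfsmount *mnt, void *arg)
    {
            (*(int *)arg)++;
            return 0;               /* non-zero would stop the walk */
    }

    static int count_mounts(struct path *path)
    {
            struct vfsmount *tree = collect_mounts(path);
            int n = 0;

            if (IS_ERR(tree))
                    return PTR_ERR(tree);
            iterate_mounts(count_one, &n, tree);
            drop_collected_mounts(tree);
            return n;
    }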
1726 | 1726 | ||
1727 | static void cleanup_group_ids(struct mount *mnt, struct mount *end) | 1727 | static void cleanup_group_ids(struct mount *mnt, struct mount *end) |
1728 | { | 1728 | { |
1729 | struct mount *p; | 1729 | struct mount *p; |
1730 | 1730 | ||
1731 | for (p = mnt; p != end; p = next_mnt(p, mnt)) { | 1731 | for (p = mnt; p != end; p = next_mnt(p, mnt)) { |
1732 | if (p->mnt_group_id && !IS_MNT_SHARED(p)) | 1732 | if (p->mnt_group_id && !IS_MNT_SHARED(p)) |
1733 | mnt_release_group_id(p); | 1733 | mnt_release_group_id(p); |
1734 | } | 1734 | } |
1735 | } | 1735 | } |
1736 | 1736 | ||
1737 | static int invent_group_ids(struct mount *mnt, bool recurse) | 1737 | static int invent_group_ids(struct mount *mnt, bool recurse) |
1738 | { | 1738 | { |
1739 | struct mount *p; | 1739 | struct mount *p; |
1740 | 1740 | ||
1741 | for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { | 1741 | for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { |
1742 | if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { | 1742 | if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { |
1743 | int err = mnt_alloc_group_id(p); | 1743 | int err = mnt_alloc_group_id(p); |
1744 | if (err) { | 1744 | if (err) { |
1745 | cleanup_group_ids(mnt, p); | 1745 | cleanup_group_ids(mnt, p); |
1746 | return err; | 1746 | return err; |
1747 | } | 1747 | } |
1748 | } | 1748 | } |
1749 | } | 1749 | } |
1750 | 1750 | ||
1751 | return 0; | 1751 | return 0; |
1752 | } | 1752 | } |
1753 | 1753 | ||
1754 | /* | 1754 | /* |
1755 | * @source_mnt : mount tree to be attached | 1755 | * @source_mnt : mount tree to be attached |
1756 | * @nd : place where the mount tree @source_mnt is attached | 1756 | * @nd : place where the mount tree @source_mnt is attached |
1757 | * @parent_nd : if non-null, detach the source_mnt from its parent and | 1757 | * @parent_nd : if non-null, detach the source_mnt from its parent and |
1758 | * store the parent mount and mountpoint dentry. | 1758 | * store the parent mount and mountpoint dentry. |
1759 | * (done when source_mnt is moved) | 1759 | * (done when source_mnt is moved) |
1760 | * | 1760 | * |
1761 | * NOTE: the table below explains the semantics when a source mount | 1761 | * NOTE: the table below explains the semantics when a source mount |
1762 | * of a given type is attached to a destination mount of a given type. | 1762 | * of a given type is attached to a destination mount of a given type. |
1763 | * --------------------------------------------------------------------------- | 1763 | * --------------------------------------------------------------------------- |
1764 | * | BIND MOUNT OPERATION | | 1764 | * | BIND MOUNT OPERATION | |
1765 | * |************************************************************************** | 1765 | * |************************************************************************** |
1766 | * | source-->| shared | private | slave | unbindable | | 1766 | * | source-->| shared | private | slave | unbindable | |
1767 | * | dest | | | | | | 1767 | * | dest | | | | | |
1768 | * | | | | | | | | 1768 | * | | | | | | | |
1769 | * | v | | | | | | 1769 | * | v | | | | | |
1770 | * |************************************************************************** | 1770 | * |************************************************************************** |
1771 | * | shared | shared (++) | shared (+) | shared(+++)| invalid | | 1771 | * | shared | shared (++) | shared (+) | shared(+++)| invalid | |
1772 | * | | | | | | | 1772 | * | | | | | | |
1773 | * |non-shared| shared (+) | private | slave (*) | invalid | | 1773 | * |non-shared| shared (+) | private | slave (*) | invalid | |
1774 | * *************************************************************************** | 1774 | * *************************************************************************** |
1775 | * A bind operation clones the source mount and mounts the clone on the | 1775 | * A bind operation clones the source mount and mounts the clone on the |
1776 | * destination mount. | 1776 | * destination mount. |
1777 | * | 1777 | * |
1778 | * (++) the cloned mount is propagated to all the mounts in the propagation | 1778 | * (++) the cloned mount is propagated to all the mounts in the propagation |
1779 | * tree of the destination mount and the cloned mount is added to | 1779 | * tree of the destination mount and the cloned mount is added to |
1780 | * the peer group of the source mount. | 1780 | * the peer group of the source mount. |
1781 | * (+) the cloned mount is created under the destination mount and is marked | 1781 | * (+) the cloned mount is created under the destination mount and is marked |
1782 | * as shared. The cloned mount is added to the peer group of the source | 1782 | * as shared. The cloned mount is added to the peer group of the source |
1783 | * mount. | 1783 | * mount. |
1784 | * (+++) the mount is propagated to all the mounts in the propagation tree | 1784 | * (+++) the mount is propagated to all the mounts in the propagation tree |
1785 | * of the destination mount and the cloned mount is made slave | 1785 | * of the destination mount and the cloned mount is made slave |
1786 | * of the same master as that of the source mount. The cloned mount | 1786 | * of the same master as that of the source mount. The cloned mount |
1787 | * is marked as 'shared and slave'. | 1787 | * is marked as 'shared and slave'. |
1788 | * (*) the cloned mount is made a slave of the same master as that of the | 1788 | * (*) the cloned mount is made a slave of the same master as that of the |
1789 | * source mount. | 1789 | * source mount. |
1790 | * | 1790 | * |
1791 | * --------------------------------------------------------------------------- | 1791 | * --------------------------------------------------------------------------- |
1792 | * | MOVE MOUNT OPERATION | | 1792 | * | MOVE MOUNT OPERATION | |
1793 | * |************************************************************************** | 1793 | * |************************************************************************** |
1794 | * | source-->| shared | private | slave | unbindable | | 1794 | * | source-->| shared | private | slave | unbindable | |
1795 | * | dest | | | | | | 1795 | * | dest | | | | | |
1796 | * | | | | | | | | 1796 | * | | | | | | | |
1797 | * | v | | | | | | 1797 | * | v | | | | | |
1798 | * |************************************************************************** | 1798 | * |************************************************************************** |
1799 | * | shared | shared (+) | shared (+) | shared(+++) | invalid | | 1799 | * | shared | shared (+) | shared (+) | shared(+++) | invalid | |
1800 | * | | | | | | | 1800 | * | | | | | | |
1801 | * |non-shared| shared (+*) | private | slave (*) | unbindable | | 1801 | * |non-shared| shared (+*) | private | slave (*) | unbindable | |
1802 | * *************************************************************************** | 1802 | * *************************************************************************** |
1803 | * | 1803 | * |
1804 | * (+) the mount is moved to the destination. And is then propagated to | 1804 | * (+) the mount is moved to the destination. And is then propagated to |
1805 | * all the mounts in the propagation tree of the destination mount. | 1805 | * all the mounts in the propagation tree of the destination mount. |
1806 | * (+*) the mount is moved to the destination. | 1806 | * (+*) the mount is moved to the destination. |
1807 | * (+++) the mount is moved to the destination and is then propagated to | 1807 | * (+++) the mount is moved to the destination and is then propagated to |
1808 | * all the mounts belonging to the destination mount's propagation tree. | 1808 | * all the mounts belonging to the destination mount's propagation tree. |
1809 | * the mount is marked as 'shared and slave'. | 1809 | * the mount is marked as 'shared and slave'. |
1810 | * (*) the mount continues to be a slave at the new location. | 1810 | * (*) the mount continues to be a slave at the new location. |
1811 | * | 1811 | * |
1812 | * if the source mount is a tree, the operations explained above are | 1812 | * if the source mount is a tree, the operations explained above are |
1813 | * applied to each mount in the tree. | 1813 | * applied to each mount in the tree. |
1814 | * Must be called without spinlocks held, since this function can sleep | 1814 | * Must be called without spinlocks held, since this function can sleep |
1815 | * in allocations. | 1815 | * in allocations. |
1816 | */ | 1816 | */ |
1817 | static int attach_recursive_mnt(struct mount *source_mnt, | 1817 | static int attach_recursive_mnt(struct mount *source_mnt, |
1818 | struct mount *dest_mnt, | 1818 | struct mount *dest_mnt, |
1819 | struct mountpoint *dest_mp, | 1819 | struct mountpoint *dest_mp, |
1820 | struct path *parent_path) | 1820 | struct path *parent_path) |
1821 | { | 1821 | { |
1822 | HLIST_HEAD(tree_list); | 1822 | HLIST_HEAD(tree_list); |
1823 | struct mount *child, *p; | 1823 | struct mount *child, *p; |
1824 | struct hlist_node *n; | 1824 | struct hlist_node *n; |
1825 | int err; | 1825 | int err; |
1826 | 1826 | ||
1827 | if (IS_MNT_SHARED(dest_mnt)) { | 1827 | if (IS_MNT_SHARED(dest_mnt)) { |
1828 | err = invent_group_ids(source_mnt, true); | 1828 | err = invent_group_ids(source_mnt, true); |
1829 | if (err) | 1829 | if (err) |
1830 | goto out; | 1830 | goto out; |
1831 | err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); | 1831 | err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); |
1832 | lock_mount_hash(); | 1832 | lock_mount_hash(); |
1833 | if (err) | 1833 | if (err) |
1834 | goto out_cleanup_ids; | 1834 | goto out_cleanup_ids; |
1835 | for (p = source_mnt; p; p = next_mnt(p, source_mnt)) | 1835 | for (p = source_mnt; p; p = next_mnt(p, source_mnt)) |
1836 | set_mnt_shared(p); | 1836 | set_mnt_shared(p); |
1837 | } else { | 1837 | } else { |
1838 | lock_mount_hash(); | 1838 | lock_mount_hash(); |
1839 | } | 1839 | } |
1840 | if (parent_path) { | 1840 | if (parent_path) { |
1841 | detach_mnt(source_mnt, parent_path); | 1841 | detach_mnt(source_mnt, parent_path); |
1842 | attach_mnt(source_mnt, dest_mnt, dest_mp); | 1842 | attach_mnt(source_mnt, dest_mnt, dest_mp); |
1843 | touch_mnt_namespace(source_mnt->mnt_ns); | 1843 | touch_mnt_namespace(source_mnt->mnt_ns); |
1844 | } else { | 1844 | } else { |
1845 | mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); | 1845 | mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); |
1846 | commit_tree(source_mnt, NULL); | 1846 | commit_tree(source_mnt, NULL); |
1847 | } | 1847 | } |
1848 | 1848 | ||
1849 | hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { | 1849 | hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { |
1850 | struct mount *q; | 1850 | struct mount *q; |
1851 | hlist_del_init(&child->mnt_hash); | 1851 | hlist_del_init(&child->mnt_hash); |
1852 | q = __lookup_mnt_last(&child->mnt_parent->mnt, | 1852 | q = __lookup_mnt_last(&child->mnt_parent->mnt, |
1853 | child->mnt_mountpoint); | 1853 | child->mnt_mountpoint); |
1854 | commit_tree(child, q); | 1854 | commit_tree(child, q); |
1855 | } | 1855 | } |
1856 | unlock_mount_hash(); | 1856 | unlock_mount_hash(); |
1857 | 1857 | ||
1858 | return 0; | 1858 | return 0; |
1859 | 1859 | ||
1860 | out_cleanup_ids: | 1860 | out_cleanup_ids: |
1861 | while (!hlist_empty(&tree_list)) { | 1861 | while (!hlist_empty(&tree_list)) { |
1862 | child = hlist_entry(tree_list.first, struct mount, mnt_hash); | 1862 | child = hlist_entry(tree_list.first, struct mount, mnt_hash); |
1863 | umount_tree(child, 0); | 1863 | umount_tree(child, 0); |
1864 | } | 1864 | } |
1865 | unlock_mount_hash(); | 1865 | unlock_mount_hash(); |
1866 | cleanup_group_ids(source_mnt, NULL); | 1866 | cleanup_group_ids(source_mnt, NULL); |
1867 | out: | 1867 | out: |
1868 | return err; | 1868 | return err; |
1869 | } | 1869 | } |
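
A userspace sketch of the "(++)" case in the bind table above: with a shared destination, a new bind propagates to every peer. Paths are illustrative, /mnt/a is assumed to already be a mount point, and error checks are omitted:

    #include <sys/mount.h>

    int main(void)
    {
            mount(NULL, "/mnt/a", NULL, MS_SHARED, NULL);    /* dest becomes shared */
            mount("/mnt/a", "/mnt/b", NULL, MS_BIND, NULL);  /* b joins a's peer group */
            mount("/srv/data", "/mnt/a/sub", NULL, MS_BIND, NULL);
            /* the bind at /mnt/a/sub now also shows up at /mnt/b/sub */
            return 0;
    }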
1870 | 1870 | ||
1871 | static struct mountpoint *lock_mount(struct path *path) | 1871 | static struct mountpoint *lock_mount(struct path *path) |
1872 | { | 1872 | { |
1873 | struct vfsmount *mnt; | 1873 | struct vfsmount *mnt; |
1874 | struct dentry *dentry = path->dentry; | 1874 | struct dentry *dentry = path->dentry; |
1875 | retry: | 1875 | retry: |
1876 | mutex_lock(&dentry->d_inode->i_mutex); | 1876 | mutex_lock(&dentry->d_inode->i_mutex); |
1877 | if (unlikely(cant_mount(dentry))) { | 1877 | if (unlikely(cant_mount(dentry))) { |
1878 | mutex_unlock(&dentry->d_inode->i_mutex); | 1878 | mutex_unlock(&dentry->d_inode->i_mutex); |
1879 | return ERR_PTR(-ENOENT); | 1879 | return ERR_PTR(-ENOENT); |
1880 | } | 1880 | } |
1881 | namespace_lock(); | 1881 | namespace_lock(); |
1882 | mnt = lookup_mnt(path); | 1882 | mnt = lookup_mnt(path); |
1883 | if (likely(!mnt)) { | 1883 | if (likely(!mnt)) { |
1884 | struct mountpoint *mp = lookup_mountpoint(dentry); | 1884 | struct mountpoint *mp = lookup_mountpoint(dentry); |
1885 | if (!mp) | 1885 | if (!mp) |
1886 | mp = new_mountpoint(dentry); | 1886 | mp = new_mountpoint(dentry); |
1887 | if (IS_ERR(mp)) { | 1887 | if (IS_ERR(mp)) { |
1888 | namespace_unlock(); | 1888 | namespace_unlock(); |
1889 | mutex_unlock(&dentry->d_inode->i_mutex); | 1889 | mutex_unlock(&dentry->d_inode->i_mutex); |
1890 | return mp; | 1890 | return mp; |
1891 | } | 1891 | } |
1892 | return mp; | 1892 | return mp; |
1893 | } | 1893 | } |
1894 | namespace_unlock(); | 1894 | namespace_unlock(); |
1895 | mutex_unlock(&path->dentry->d_inode->i_mutex); | 1895 | mutex_unlock(&path->dentry->d_inode->i_mutex); |
1896 | path_put(path); | 1896 | path_put(path); |
1897 | path->mnt = mnt; | 1897 | path->mnt = mnt; |
1898 | dentry = path->dentry = dget(mnt->mnt_root); | 1898 | dentry = path->dentry = dget(mnt->mnt_root); |
1899 | goto retry; | 1899 | goto retry; |
1900 | } | 1900 | } |
1901 | 1901 | ||
1902 | static void unlock_mount(struct mountpoint *where) | 1902 | static void unlock_mount(struct mountpoint *where) |
1903 | { | 1903 | { |
1904 | struct dentry *dentry = where->m_dentry; | 1904 | struct dentry *dentry = where->m_dentry; |
1905 | put_mountpoint(where); | 1905 | put_mountpoint(where); |
1906 | namespace_unlock(); | 1906 | namespace_unlock(); |
1907 | mutex_unlock(&dentry->d_inode->i_mutex); | 1907 | mutex_unlock(&dentry->d_inode->i_mutex); |
1908 | } | 1908 | } |
1909 | 1909 | ||
1910 | static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) | 1910 | static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) |
1911 | { | 1911 | { |
1912 | if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) | 1912 | if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) |
1913 | return -EINVAL; | 1913 | return -EINVAL; |
1914 | 1914 | ||
1915 | if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != | 1915 | if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != |
1916 | S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) | 1916 | S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) |
1917 | return -ENOTDIR; | 1917 | return -ENOTDIR; |
1918 | 1918 | ||
1919 | return attach_recursive_mnt(mnt, p, mp, NULL); | 1919 | return attach_recursive_mnt(mnt, p, mp, NULL); |
1920 | } | 1920 | } |
1921 | 1921 | ||
1922 | /* | 1922 | /* |
1923 | * Sanity check the flags to change_mnt_propagation. | 1923 | * Sanity check the flags to change_mnt_propagation. |
1924 | */ | 1924 | */ |
1925 | 1925 | ||
1926 | static int flags_to_propagation_type(int flags) | 1926 | static int flags_to_propagation_type(int flags) |
1927 | { | 1927 | { |
1928 | int type = flags & ~(MS_REC | MS_SILENT); | 1928 | int type = flags & ~(MS_REC | MS_SILENT); |
1929 | 1929 | ||
1930 | /* Fail if any non-propagation flags are set */ | 1930 | /* Fail if any non-propagation flags are set */ |
1931 | if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) | 1931 | if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) |
1932 | return 0; | 1932 | return 0; |
1933 | /* Only one propagation flag should be set */ | 1933 | /* Only one propagation flag should be set */ |
1934 | if (!is_power_of_2(type)) | 1934 | if (!is_power_of_2(type)) |
1935 | return 0; | 1935 | return 0; |
1936 | return type; | 1936 | return type; |
1937 | } | 1937 | } |
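
The is_power_of_2() call above is the usual "exactly one bit set" test; an equivalent spelling:

    /* nonzero, and clearing the lowest set bit leaves nothing */
    static inline int exactly_one_flag(unsigned int type)
    {
            return type != 0 && (type & (type - 1)) == 0;
    }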
1938 | 1938 | ||
1939 | /* | 1939 | /* |
1940 | * recursively change the type of the mountpoint. | 1940 | * recursively change the type of the mountpoint. |
1941 | */ | 1941 | */ |
1942 | static int do_change_type(struct path *path, int flag) | 1942 | static int do_change_type(struct path *path, int flag) |
1943 | { | 1943 | { |
1944 | struct mount *m; | 1944 | struct mount *m; |
1945 | struct mount *mnt = real_mount(path->mnt); | 1945 | struct mount *mnt = real_mount(path->mnt); |
1946 | int recurse = flag & MS_REC; | 1946 | int recurse = flag & MS_REC; |
1947 | int type; | 1947 | int type; |
1948 | int err = 0; | 1948 | int err = 0; |
1949 | 1949 | ||
1950 | if (path->dentry != path->mnt->mnt_root) | 1950 | if (path->dentry != path->mnt->mnt_root) |
1951 | return -EINVAL; | 1951 | return -EINVAL; |
1952 | 1952 | ||
1953 | type = flags_to_propagation_type(flag); | 1953 | type = flags_to_propagation_type(flag); |
1954 | if (!type) | 1954 | if (!type) |
1955 | return -EINVAL; | 1955 | return -EINVAL; |
1956 | 1956 | ||
1957 | namespace_lock(); | 1957 | namespace_lock(); |
1958 | if (type == MS_SHARED) { | 1958 | if (type == MS_SHARED) { |
1959 | err = invent_group_ids(mnt, recurse); | 1959 | err = invent_group_ids(mnt, recurse); |
1960 | if (err) | 1960 | if (err) |
1961 | goto out_unlock; | 1961 | goto out_unlock; |
1962 | } | 1962 | } |
1963 | 1963 | ||
1964 | lock_mount_hash(); | 1964 | lock_mount_hash(); |
1965 | for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) | 1965 | for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) |
1966 | change_mnt_propagation(m, type); | 1966 | change_mnt_propagation(m, type); |
1967 | unlock_mount_hash(); | 1967 | unlock_mount_hash(); |
1968 | 1968 | ||
1969 | out_unlock: | 1969 | out_unlock: |
1970 | namespace_unlock(); | 1970 | namespace_unlock(); |
1971 | return err; | 1971 | return err; |
1972 | } | 1972 | } |
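
Userspace reaches do_change_type() via mount(2) with NULL source and fstype and only a propagation flag (optionally plus MS_REC). A sketch, path illustrative:

    #include <sys/mount.h>

    int main(void)
    {
            /* the equivalent of "mount --make-rprivate /mnt" */
            return mount(NULL, "/mnt", NULL, MS_REC | MS_PRIVATE, NULL);
    }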
1973 | 1973 | ||
1974 | static bool has_locked_children(struct mount *mnt, struct dentry *dentry) | 1974 | static bool has_locked_children(struct mount *mnt, struct dentry *dentry) |
1975 | { | 1975 | { |
1976 | struct mount *child; | 1976 | struct mount *child; |
1977 | list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { | 1977 | list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { |
1978 | if (!is_subdir(child->mnt_mountpoint, dentry)) | 1978 | if (!is_subdir(child->mnt_mountpoint, dentry)) |
1979 | continue; | 1979 | continue; |
1980 | 1980 | ||
1981 | if (child->mnt.mnt_flags & MNT_LOCKED) | 1981 | if (child->mnt.mnt_flags & MNT_LOCKED) |
1982 | return true; | 1982 | return true; |
1983 | } | 1983 | } |
1984 | return false; | 1984 | return false; |
1985 | } | 1985 | } |
1986 | 1986 | ||
1987 | /* | 1987 | /* |
1988 | * do loopback mount. | 1988 | * do loopback mount. |
1989 | */ | 1989 | */ |
1990 | static int do_loopback(struct path *path, const char *old_name, | 1990 | static int do_loopback(struct path *path, const char *old_name, |
1991 | int recurse) | 1991 | int recurse) |
1992 | { | 1992 | { |
1993 | struct path old_path; | 1993 | struct path old_path; |
1994 | struct mount *mnt = NULL, *old, *parent; | 1994 | struct mount *mnt = NULL, *old, *parent; |
1995 | struct mountpoint *mp; | 1995 | struct mountpoint *mp; |
1996 | int err; | 1996 | int err; |
1997 | if (!old_name || !*old_name) | 1997 | if (!old_name || !*old_name) |
1998 | return -EINVAL; | 1998 | return -EINVAL; |
1999 | err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); | 1999 | err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); |
2000 | if (err) | 2000 | if (err) |
2001 | return err; | 2001 | return err; |
2002 | 2002 | ||
2003 | err = -EINVAL; | 2003 | err = -EINVAL; |
2004 | if (mnt_ns_loop(old_path.dentry)) | 2004 | if (mnt_ns_loop(old_path.dentry)) |
2005 | goto out; | 2005 | goto out; |
2006 | 2006 | ||
2007 | mp = lock_mount(path); | 2007 | mp = lock_mount(path); |
2008 | err = PTR_ERR(mp); | 2008 | err = PTR_ERR(mp); |
2009 | if (IS_ERR(mp)) | 2009 | if (IS_ERR(mp)) |
2010 | goto out; | 2010 | goto out; |
2011 | 2011 | ||
2012 | old = real_mount(old_path.mnt); | 2012 | old = real_mount(old_path.mnt); |
2013 | parent = real_mount(path->mnt); | 2013 | parent = real_mount(path->mnt); |
2014 | 2014 | ||
2015 | err = -EINVAL; | 2015 | err = -EINVAL; |
2016 | if (IS_MNT_UNBINDABLE(old)) | 2016 | if (IS_MNT_UNBINDABLE(old)) |
2017 | goto out2; | 2017 | goto out2; |
2018 | 2018 | ||
2019 | if (!check_mnt(parent) || !check_mnt(old)) | 2019 | if (!check_mnt(parent)) |
| | 2020 | goto out2; |
| | 2021 | |
| | 2022 | if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) |
2020 | goto out2; | 2023 | goto out2; |
2021 | 2024 | ||
2022 | if (!recurse && has_locked_children(old, old_path.dentry)) | 2025 | if (!recurse && has_locked_children(old, old_path.dentry)) |
2023 | goto out2; | 2026 | goto out2; |
2024 | 2027 | ||
2025 | if (recurse) | 2028 | if (recurse) |
2026 | mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); | 2029 | mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); |
2027 | else | 2030 | else |
2028 | mnt = clone_mnt(old, old_path.dentry, 0); | 2031 | mnt = clone_mnt(old, old_path.dentry, 0); |
2029 | 2032 | ||
2030 | if (IS_ERR(mnt)) { | 2033 | if (IS_ERR(mnt)) { |
2031 | err = PTR_ERR(mnt); | 2034 | err = PTR_ERR(mnt); |
2032 | goto out2; | 2035 | goto out2; |
2033 | } | 2036 | } |
2034 | 2037 | ||
2035 | mnt->mnt.mnt_flags &= ~MNT_LOCKED; | 2038 | mnt->mnt.mnt_flags &= ~MNT_LOCKED; |
2036 | 2039 | ||
2037 | err = graft_tree(mnt, parent, mp); | 2040 | err = graft_tree(mnt, parent, mp); |
2038 | if (err) { | 2041 | if (err) { |
2039 | lock_mount_hash(); | 2042 | lock_mount_hash(); |
2040 | umount_tree(mnt, 0); | 2043 | umount_tree(mnt, 0); |
2041 | unlock_mount_hash(); | 2044 | unlock_mount_hash(); |
2042 | } | 2045 | } |
2043 | out2: | 2046 | out2: |
2044 | unlock_mount(mp); | 2047 | unlock_mount(mp); |
2045 | out: | 2048 | out: |
2046 | path_put(&old_path); | 2049 | path_put(&old_path); |
2047 | return err; | 2050 | return err; |
2048 | } | 2051 | } |
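
The relaxed check above (accepting a dentry with ns_dentry_operations even though nsfs fails check_mnt()) is what lets namespace files be bind-mounted, e.g. to keep a network namespace alive the way "ip netns add" does. A hedged sketch; the pid and paths are illustrative, and the target must be an existing empty file:

    #include <sys/mount.h>

    int main(void)
    {
            return mount("/proc/1234/ns/net", "/var/run/netns/example",
                         NULL, MS_BIND, NULL);
    }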
2049 | 2052 | ||
2050 | static int change_mount_flags(struct vfsmount *mnt, int ms_flags) | 2053 | static int change_mount_flags(struct vfsmount *mnt, int ms_flags) |
2051 | { | 2054 | { |
2052 | int error = 0; | 2055 | int error = 0; |
2053 | int readonly_request = 0; | 2056 | int readonly_request = 0; |
2054 | 2057 | ||
2055 | if (ms_flags & MS_RDONLY) | 2058 | if (ms_flags & MS_RDONLY) |
2056 | readonly_request = 1; | 2059 | readonly_request = 1; |
2057 | if (readonly_request == __mnt_is_readonly(mnt)) | 2060 | if (readonly_request == __mnt_is_readonly(mnt)) |
2058 | return 0; | 2061 | return 0; |
2059 | 2062 | ||
2060 | if (readonly_request) | 2063 | if (readonly_request) |
2061 | error = mnt_make_readonly(real_mount(mnt)); | 2064 | error = mnt_make_readonly(real_mount(mnt)); |
2062 | else | 2065 | else |
2063 | __mnt_unmake_readonly(real_mount(mnt)); | 2066 | __mnt_unmake_readonly(real_mount(mnt)); |
2064 | return error; | 2067 | return error; |
2065 | } | 2068 | } |
2066 | 2069 | ||
2067 | /* | 2070 | /* |
2068 | * change filesystem flags. dir should be a physical root of filesystem. | 2071 | * change filesystem flags. dir should be a physical root of filesystem. |
2069 | * If you've mounted a non-root directory somewhere and want to do remount | 2072 | * If you've mounted a non-root directory somewhere and want to do remount |
2070 | * on it - tough luck. | 2073 | * on it - tough luck. |
2071 | */ | 2074 | */ |
2072 | static int do_remount(struct path *path, int flags, int mnt_flags, | 2075 | static int do_remount(struct path *path, int flags, int mnt_flags, |
2073 | void *data) | 2076 | void *data) |
2074 | { | 2077 | { |
2075 | int err; | 2078 | int err; |
2076 | struct super_block *sb = path->mnt->mnt_sb; | 2079 | struct super_block *sb = path->mnt->mnt_sb; |
2077 | struct mount *mnt = real_mount(path->mnt); | 2080 | struct mount *mnt = real_mount(path->mnt); |
2078 | 2081 | ||
2079 | if (!check_mnt(mnt)) | 2082 | if (!check_mnt(mnt)) |
2080 | return -EINVAL; | 2083 | return -EINVAL; |
2081 | 2084 | ||
2082 | if (path->dentry != path->mnt->mnt_root) | 2085 | if (path->dentry != path->mnt->mnt_root) |
2083 | return -EINVAL; | 2086 | return -EINVAL; |
2084 | 2087 | ||
2085 | /* Don't allow changing of locked mnt flags. | 2088 | /* Don't allow changing of locked mnt flags. |
2086 | * | 2089 | * |
2087 | * No locks need to be held here while testing the various | 2090 | * No locks need to be held here while testing the various |
2088 | * MNT_LOCK flags because those flags can never be cleared | 2091 | * MNT_LOCK flags because those flags can never be cleared |
2089 | * once they are set. | 2092 | * once they are set. |
2090 | */ | 2093 | */ |
2091 | if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && | 2094 | if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && |
2092 | !(mnt_flags & MNT_READONLY)) { | 2095 | !(mnt_flags & MNT_READONLY)) { |
2093 | return -EPERM; | 2096 | return -EPERM; |
2094 | } | 2097 | } |
2095 | if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && | 2098 | if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && |
2096 | !(mnt_flags & MNT_NODEV)) { | 2099 | !(mnt_flags & MNT_NODEV)) { |
2097 | return -EPERM; | 2100 | return -EPERM; |
2098 | } | 2101 | } |
2099 | if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && | 2102 | if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && |
2100 | !(mnt_flags & MNT_NOSUID)) { | 2103 | !(mnt_flags & MNT_NOSUID)) { |
2101 | return -EPERM; | 2104 | return -EPERM; |
2102 | } | 2105 | } |
2103 | if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && | 2106 | if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && |
2104 | !(mnt_flags & MNT_NOEXEC)) { | 2107 | !(mnt_flags & MNT_NOEXEC)) { |
2105 | return -EPERM; | 2108 | return -EPERM; |
2106 | } | 2109 | } |
2107 | if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && | 2110 | if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && |
2108 | ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { | 2111 | ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { |
2109 | return -EPERM; | 2112 | return -EPERM; |
2110 | } | 2113 | } |
2111 | 2114 | ||
2112 | err = security_sb_remount(sb, data); | 2115 | err = security_sb_remount(sb, data); |
2113 | if (err) | 2116 | if (err) |
2114 | return err; | 2117 | return err; |
2115 | 2118 | ||
2116 | down_write(&sb->s_umount); | 2119 | down_write(&sb->s_umount); |
2117 | if (flags & MS_BIND) | 2120 | if (flags & MS_BIND) |
2118 | err = change_mount_flags(path->mnt, flags); | 2121 | err = change_mount_flags(path->mnt, flags); |
2119 | else if (!capable(CAP_SYS_ADMIN)) | 2122 | else if (!capable(CAP_SYS_ADMIN)) |
2120 | err = -EPERM; | 2123 | err = -EPERM; |
2121 | else | 2124 | else |
2122 | err = do_remount_sb(sb, flags, data, 0); | 2125 | err = do_remount_sb(sb, flags, data, 0); |
2123 | if (!err) { | 2126 | if (!err) { |
2124 | lock_mount_hash(); | 2127 | lock_mount_hash(); |
2125 | mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; | 2128 | mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; |
2126 | mnt->mnt.mnt_flags = mnt_flags; | 2129 | mnt->mnt.mnt_flags = mnt_flags; |
2127 | touch_mnt_namespace(mnt->mnt_ns); | 2130 | touch_mnt_namespace(mnt->mnt_ns); |
2128 | unlock_mount_hash(); | 2131 | unlock_mount_hash(); |
2129 | } | 2132 | } |
2130 | up_write(&sb->s_umount); | 2133 | up_write(&sb->s_umount); |
2131 | return err; | 2134 | return err; |
2132 | } | 2135 | } |
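
The MS_BIND branch above gives userspace two distinct remounts: with MS_REMOUNT|MS_BIND only the per-mount flags change (the change_mount_flags() path); without MS_BIND the superblock itself is remounted (do_remount_sb()). A sketch, /mnt illustrative and error checks omitted:

    #include <sys/mount.h>

    int main(void)
    {
            /* per-mount read-only: this vfsmount only */
            mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
            /* superblock read-only: every mount of this filesystem */
            mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL);
            return 0;
    }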
2133 | 2136 | ||
2134 | static inline int tree_contains_unbindable(struct mount *mnt) | 2137 | static inline int tree_contains_unbindable(struct mount *mnt) |
2135 | { | 2138 | { |
2136 | struct mount *p; | 2139 | struct mount *p; |
2137 | for (p = mnt; p; p = next_mnt(p, mnt)) { | 2140 | for (p = mnt; p; p = next_mnt(p, mnt)) { |
2138 | if (IS_MNT_UNBINDABLE(p)) | 2141 | if (IS_MNT_UNBINDABLE(p)) |
2139 | return 1; | 2142 | return 1; |
2140 | } | 2143 | } |
2141 | return 0; | 2144 | return 0; |
2142 | } | 2145 | } |
2143 | 2146 | ||
2144 | static int do_move_mount(struct path *path, const char *old_name) | 2147 | static int do_move_mount(struct path *path, const char *old_name) |
2145 | { | 2148 | { |
2146 | struct path old_path, parent_path; | 2149 | struct path old_path, parent_path; |
2147 | struct mount *p; | 2150 | struct mount *p; |
2148 | struct mount *old; | 2151 | struct mount *old; |
2149 | struct mountpoint *mp; | 2152 | struct mountpoint *mp; |
2150 | int err; | 2153 | int err; |
2151 | if (!old_name || !*old_name) | 2154 | if (!old_name || !*old_name) |
2152 | return -EINVAL; | 2155 | return -EINVAL; |
2153 | err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); | 2156 | err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); |
2154 | if (err) | 2157 | if (err) |
2155 | return err; | 2158 | return err; |
2156 | 2159 | ||
2157 | mp = lock_mount(path); | 2160 | mp = lock_mount(path); |
2158 | err = PTR_ERR(mp); | 2161 | err = PTR_ERR(mp); |
2159 | if (IS_ERR(mp)) | 2162 | if (IS_ERR(mp)) |
2160 | goto out; | 2163 | goto out; |
2161 | 2164 | ||
2162 | old = real_mount(old_path.mnt); | 2165 | old = real_mount(old_path.mnt); |
2163 | p = real_mount(path->mnt); | 2166 | p = real_mount(path->mnt); |
2164 | 2167 | ||
2165 | err = -EINVAL; | 2168 | err = -EINVAL; |
2166 | if (!check_mnt(p) || !check_mnt(old)) | 2169 | if (!check_mnt(p) || !check_mnt(old)) |
2167 | goto out1; | 2170 | goto out1; |
2168 | 2171 | ||
2169 | if (old->mnt.mnt_flags & MNT_LOCKED) | 2172 | if (old->mnt.mnt_flags & MNT_LOCKED) |
2170 | goto out1; | 2173 | goto out1; |
2171 | 2174 | ||
2172 | err = -EINVAL; | 2175 | err = -EINVAL; |
2173 | if (old_path.dentry != old_path.mnt->mnt_root) | 2176 | if (old_path.dentry != old_path.mnt->mnt_root) |
2174 | goto out1; | 2177 | goto out1; |
2175 | 2178 | ||
2176 | if (!mnt_has_parent(old)) | 2179 | if (!mnt_has_parent(old)) |
2177 | goto out1; | 2180 | goto out1; |
2178 | 2181 | ||
2179 | if (S_ISDIR(path->dentry->d_inode->i_mode) != | 2182 | if (S_ISDIR(path->dentry->d_inode->i_mode) != |
2180 | S_ISDIR(old_path.dentry->d_inode->i_mode)) | 2183 | S_ISDIR(old_path.dentry->d_inode->i_mode)) |
2181 | goto out1; | 2184 | goto out1; |
2182 | /* | 2185 | /* |
2183 | * Don't move a mount residing in a shared parent. | 2186 | * Don't move a mount residing in a shared parent. |
2184 | */ | 2187 | */ |
2185 | if (IS_MNT_SHARED(old->mnt_parent)) | 2188 | if (IS_MNT_SHARED(old->mnt_parent)) |
2186 | goto out1; | 2189 | goto out1; |
2187 | /* | 2190 | /* |
2188 | * Don't move a mount tree containing unbindable mounts to a destination | 2191 | * Don't move a mount tree containing unbindable mounts to a destination |
2189 | * mount which is shared. | 2192 | * mount which is shared. |
2190 | */ | 2193 | */ |
2191 | if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) | 2194 | if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) |
2192 | goto out1; | 2195 | goto out1; |
2193 | err = -ELOOP; | 2196 | err = -ELOOP; |
2194 | for (; mnt_has_parent(p); p = p->mnt_parent) | 2197 | for (; mnt_has_parent(p); p = p->mnt_parent) |
2195 | if (p == old) | 2198 | if (p == old) |
2196 | goto out1; | 2199 | goto out1; |
2197 | 2200 | ||
2198 | err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); | 2201 | err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); |
2199 | if (err) | 2202 | if (err) |
2200 | goto out1; | 2203 | goto out1; |
2201 | 2204 | ||
2202 | /* if the mount is moved, it should no longer be expired | 2205 | /* if the mount is moved, it should no longer be expired |
2203 | * automatically */ | 2206 | * automatically */ |
2204 | list_del_init(&old->mnt_expire); | 2207 | list_del_init(&old->mnt_expire); |
2205 | out1: | 2208 | out1: |
2206 | unlock_mount(mp); | 2209 | unlock_mount(mp); |
2207 | out: | 2210 | out: |
2208 | if (!err) | 2211 | if (!err) |
2209 | path_put(&parent_path); | 2212 | path_put(&parent_path); |
2210 | path_put(&old_path); | 2213 | path_put(&old_path); |
2211 | return err; | 2214 | return err; |
2212 | } | 2215 | } |
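
The userspace counterpart of do_move_mount() is mount(2) with MS_MOVE; per the checks above, the source must be the root of a mount whose parent is not shared. Paths illustrative:

    #include <sys/mount.h>

    int main(void)
    {
            return mount("/mnt/old", "/mnt/new", NULL, MS_MOVE, NULL);
    }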
2213 | 2216 | ||
2214 | static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) | 2217 | static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) |
2215 | { | 2218 | { |
2216 | int err; | 2219 | int err; |
2217 | const char *subtype = strchr(fstype, '.'); | 2220 | const char *subtype = strchr(fstype, '.'); |
2218 | if (subtype) { | 2221 | if (subtype) { |
2219 | subtype++; | 2222 | subtype++; |
2220 | err = -EINVAL; | 2223 | err = -EINVAL; |
2221 | if (!subtype[0]) | 2224 | if (!subtype[0]) |
2222 | goto err; | 2225 | goto err; |
2223 | } else | 2226 | } else |
2224 | subtype = ""; | 2227 | subtype = ""; |
2225 | 2228 | ||
2226 | mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); | 2229 | mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); |
2227 | err = -ENOMEM; | 2230 | err = -ENOMEM; |
2228 | if (!mnt->mnt_sb->s_subtype) | 2231 | if (!mnt->mnt_sb->s_subtype) |
2229 | goto err; | 2232 | goto err; |
2230 | return mnt; | 2233 | return mnt; |
2231 | 2234 | ||
2232 | err: | 2235 | err: |
2233 | mntput(mnt); | 2236 | mntput(mnt); |
2234 | return ERR_PTR(err); | 2237 | return ERR_PTR(err); |
2235 | } | 2238 | } |
2236 | 2239 | ||
2237 | /* | 2240 | /* |
2238 | * add a mount into a namespace's mount tree | 2241 | * add a mount into a namespace's mount tree |
2239 | */ | 2242 | */ |
2240 | static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) | 2243 | static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) |
2241 | { | 2244 | { |
2242 | struct mountpoint *mp; | 2245 | struct mountpoint *mp; |
2243 | struct mount *parent; | 2246 | struct mount *parent; |
2244 | int err; | 2247 | int err; |
2245 | 2248 | ||
2246 | mnt_flags &= ~MNT_INTERNAL_FLAGS; | 2249 | mnt_flags &= ~MNT_INTERNAL_FLAGS; |
2247 | 2250 | ||
2248 | mp = lock_mount(path); | 2251 | mp = lock_mount(path); |
2249 | if (IS_ERR(mp)) | 2252 | if (IS_ERR(mp)) |
2250 | return PTR_ERR(mp); | 2253 | return PTR_ERR(mp); |
2251 | 2254 | ||
2252 | parent = real_mount(path->mnt); | 2255 | parent = real_mount(path->mnt); |
2253 | err = -EINVAL; | 2256 | err = -EINVAL; |
2254 | if (unlikely(!check_mnt(parent))) { | 2257 | if (unlikely(!check_mnt(parent))) { |
2255 | /* that's acceptable only for automounts done in private ns */ | 2258 | /* that's acceptable only for automounts done in private ns */ |
2256 | if (!(mnt_flags & MNT_SHRINKABLE)) | 2259 | if (!(mnt_flags & MNT_SHRINKABLE)) |
2257 | goto unlock; | 2260 | goto unlock; |
2258 | /* ... and for those we'd better have mountpoint still alive */ | 2261 | /* ... and for those we'd better have mountpoint still alive */ |
2259 | if (!parent->mnt_ns) | 2262 | if (!parent->mnt_ns) |
2260 | goto unlock; | 2263 | goto unlock; |
2261 | } | 2264 | } |
2262 | 2265 | ||
2263 | /* Refuse the same filesystem on the same mount point */ | 2266 | /* Refuse the same filesystem on the same mount point */ |
2264 | err = -EBUSY; | 2267 | err = -EBUSY; |
2265 | if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && | 2268 | if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && |
2266 | path->mnt->mnt_root == path->dentry) | 2269 | path->mnt->mnt_root == path->dentry) |
2267 | goto unlock; | 2270 | goto unlock; |
2268 | 2271 | ||
2269 | err = -EINVAL; | 2272 | err = -EINVAL; |
2270 | if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) | 2273 | if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) |
2271 | goto unlock; | 2274 | goto unlock; |
2272 | 2275 | ||
2273 | newmnt->mnt.mnt_flags = mnt_flags; | 2276 | newmnt->mnt.mnt_flags = mnt_flags; |
2274 | err = graft_tree(newmnt, parent, mp); | 2277 | err = graft_tree(newmnt, parent, mp); |
2275 | 2278 | ||
2276 | unlock: | 2279 | unlock: |
2277 | unlock_mount(mp); | 2280 | unlock_mount(mp); |
2278 | return err; | 2281 | return err; |
2279 | } | 2282 | } |
2280 | 2283 | ||
2281 | /* | 2284 | /* |
2282 | * create a new mount for userspace and request it to be added into the | 2285 | * create a new mount for userspace and request it to be added into the |
2283 | * namespace's tree | 2286 | * namespace's tree |
2284 | */ | 2287 | */ |
2285 | static int do_new_mount(struct path *path, const char *fstype, int flags, | 2288 | static int do_new_mount(struct path *path, const char *fstype, int flags, |
2286 | int mnt_flags, const char *name, void *data) | 2289 | int mnt_flags, const char *name, void *data) |
2287 | { | 2290 | { |
2288 | struct file_system_type *type; | 2291 | struct file_system_type *type; |
2289 | struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; | 2292 | struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; |
2290 | struct vfsmount *mnt; | 2293 | struct vfsmount *mnt; |
2291 | int err; | 2294 | int err; |
2292 | 2295 | ||
2293 | if (!fstype) | 2296 | if (!fstype) |
2294 | return -EINVAL; | 2297 | return -EINVAL; |
2295 | 2298 | ||
2296 | type = get_fs_type(fstype); | 2299 | type = get_fs_type(fstype); |
2297 | if (!type) | 2300 | if (!type) |
2298 | return -ENODEV; | 2301 | return -ENODEV; |
2299 | 2302 | ||
2300 | if (user_ns != &init_user_ns) { | 2303 | if (user_ns != &init_user_ns) { |
2301 | if (!(type->fs_flags & FS_USERNS_MOUNT)) { | 2304 | if (!(type->fs_flags & FS_USERNS_MOUNT)) { |
2302 | put_filesystem(type); | 2305 | put_filesystem(type); |
2303 | return -EPERM; | 2306 | return -EPERM; |
2304 | } | 2307 | } |
2305 | /* Only in special cases allow devices from mounts | 2308 | /* Only in special cases allow devices from mounts |
2306 | * created outside the initial user namespace. | 2309 | * created outside the initial user namespace. |
2307 | */ | 2310 | */ |
2308 | if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { | 2311 | if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { |
2309 | flags |= MS_NODEV; | 2312 | flags |= MS_NODEV; |
2310 | mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; | 2313 | mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; |
2311 | } | 2314 | } |
2312 | } | 2315 | } |
2313 | 2316 | ||
2314 | mnt = vfs_kern_mount(type, flags, name, data); | 2317 | mnt = vfs_kern_mount(type, flags, name, data); |
2315 | if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && | 2318 | if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && |
2316 | !mnt->mnt_sb->s_subtype) | 2319 | !mnt->mnt_sb->s_subtype) |
2317 | mnt = fs_set_subtype(mnt, fstype); | 2320 | mnt = fs_set_subtype(mnt, fstype); |
2318 | 2321 | ||
2319 | put_filesystem(type); | 2322 | put_filesystem(type); |
2320 | if (IS_ERR(mnt)) | 2323 | if (IS_ERR(mnt)) |
2321 | return PTR_ERR(mnt); | 2324 | return PTR_ERR(mnt); |
2322 | 2325 | ||
2323 | err = do_add_mount(real_mount(mnt), path, mnt_flags); | 2326 | err = do_add_mount(real_mount(mnt), path, mnt_flags); |
2324 | if (err) | 2327 | if (err) |
2325 | mntput(mnt); | 2328 | mntput(mnt); |
2326 | return err; | 2329 | return err; |
2327 | } | 2330 | } |
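
A sketch of the FS_USERNS_DEV_MOUNT handling above: in a non-initial user namespace a tmpfs mount is permitted (tmpfs sets FS_USERNS_MOUNT), but MNT_NODEV is forced and locked. Illustrative only, error checks omitted:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <sys/mount.h>

    int main(void)
    {
            unshare(CLONE_NEWUSER | CLONE_NEWNS);   /* own user + mount ns */
            /* allowed, but the kernel adds MNT_NODEV | MNT_LOCK_NODEV */
            mount("tmpfs", "/tmp", "tmpfs", 0, "size=1m");
            return 0;
    }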
2328 | 2331 | ||
2329 | int finish_automount(struct vfsmount *m, struct path *path) | 2332 | int finish_automount(struct vfsmount *m, struct path *path) |
2330 | { | 2333 | { |
2331 | struct mount *mnt = real_mount(m); | 2334 | struct mount *mnt = real_mount(m); |
2332 | int err; | 2335 | int err; |
2333 | /* The new mount record should have at least 2 refs to prevent it being | 2336 | /* The new mount record should have at least 2 refs to prevent it being |
2334 | * expired before we get a chance to add it | 2337 | * expired before we get a chance to add it |
2335 | */ | 2338 | */ |
2336 | BUG_ON(mnt_get_count(mnt) < 2); | 2339 | BUG_ON(mnt_get_count(mnt) < 2); |
2337 | 2340 | ||
2338 | if (m->mnt_sb == path->mnt->mnt_sb && | 2341 | if (m->mnt_sb == path->mnt->mnt_sb && |
2339 | m->mnt_root == path->dentry) { | 2342 | m->mnt_root == path->dentry) { |
2340 | err = -ELOOP; | 2343 | err = -ELOOP; |
2341 | goto fail; | 2344 | goto fail; |
2342 | } | 2345 | } |
2343 | 2346 | ||
2344 | err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); | 2347 | err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); |
2345 | if (!err) | 2348 | if (!err) |
2346 | return 0; | 2349 | return 0; |
2347 | fail: | 2350 | fail: |
2348 | /* remove m from any expiration list it may be on */ | 2351 | /* remove m from any expiration list it may be on */ |
2349 | if (!list_empty(&mnt->mnt_expire)) { | 2352 | if (!list_empty(&mnt->mnt_expire)) { |
2350 | namespace_lock(); | 2353 | namespace_lock(); |
2351 | list_del_init(&mnt->mnt_expire); | 2354 | list_del_init(&mnt->mnt_expire); |
2352 | namespace_unlock(); | 2355 | namespace_unlock(); |
2353 | } | 2356 | } |
2354 | mntput(m); | 2357 | mntput(m); |
2355 | mntput(m); | 2358 | mntput(m); |
2356 | return err; | 2359 | return err; |
2357 | } | 2360 | } |
2358 | 2361 | ||
2359 | /** | 2362 | /** |
2360 | * mnt_set_expiry - Put a mount on an expiration list | 2363 | * mnt_set_expiry - Put a mount on an expiration list |
2361 | * @mnt: The mount to list. | 2364 | * @mnt: The mount to list. |
2362 | * @expiry_list: The list to add the mount to. | 2365 | * @expiry_list: The list to add the mount to. |
2363 | */ | 2366 | */ |
2364 | void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) | 2367 | void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) |
2365 | { | 2368 | { |
2366 | namespace_lock(); | 2369 | namespace_lock(); |
2367 | 2370 | ||
2368 | list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); | 2371 | list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); |
2369 | 2372 | ||
2370 | namespace_unlock(); | 2373 | namespace_unlock(); |
2371 | } | 2374 | } |
2372 | EXPORT_SYMBOL(mnt_set_expiry); | 2375 | EXPORT_SYMBOL(mnt_set_expiry); |
2373 | 2376 | ||
2374 | /* | 2377 | /* |
2375 | * process a list of expirable mountpoints with the intent of discarding any | 2378 | * process a list of expirable mountpoints with the intent of discarding any |
2376 | * mountpoints that aren't in use and haven't been touched since last we came | 2379 | * mountpoints that aren't in use and haven't been touched since last we came |
2377 | * here | 2380 | * here |
2378 | */ | 2381 | */ |
2379 | void mark_mounts_for_expiry(struct list_head *mounts) | 2382 | void mark_mounts_for_expiry(struct list_head *mounts) |
2380 | { | 2383 | { |
2381 | struct mount *mnt, *next; | 2384 | struct mount *mnt, *next; |
2382 | LIST_HEAD(graveyard); | 2385 | LIST_HEAD(graveyard); |
2383 | 2386 | ||
2384 | if (list_empty(mounts)) | 2387 | if (list_empty(mounts)) |
2385 | return; | 2388 | return; |
2386 | 2389 | ||
2387 | namespace_lock(); | 2390 | namespace_lock(); |
2388 | lock_mount_hash(); | 2391 | lock_mount_hash(); |
2389 | 2392 | ||
2390 | /* extract from the expiration list every vfsmount that matches the | 2393 | /* extract from the expiration list every vfsmount that matches the |
2391 | * following criteria: | 2394 | * following criteria: |
2392 | * - only referenced by its parent vfsmount | 2395 | * - only referenced by its parent vfsmount |
2393 | * - still marked for expiry (marked on the last call here; marks are | 2396 | * - still marked for expiry (marked on the last call here; marks are |
2394 | * cleared by mntput()) | 2397 | * cleared by mntput()) |
2395 | */ | 2398 | */ |
2396 | list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { | 2399 | list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { |
2397 | if (!xchg(&mnt->mnt_expiry_mark, 1) || | 2400 | if (!xchg(&mnt->mnt_expiry_mark, 1) || |
2398 | propagate_mount_busy(mnt, 1)) | 2401 | propagate_mount_busy(mnt, 1)) |
2399 | continue; | 2402 | continue; |
2400 | list_move(&mnt->mnt_expire, &graveyard); | 2403 | list_move(&mnt->mnt_expire, &graveyard); |
2401 | } | 2404 | } |
2402 | while (!list_empty(&graveyard)) { | 2405 | while (!list_empty(&graveyard)) { |
2403 | mnt = list_first_entry(&graveyard, struct mount, mnt_expire); | 2406 | mnt = list_first_entry(&graveyard, struct mount, mnt_expire); |
2404 | touch_mnt_namespace(mnt->mnt_ns); | 2407 | touch_mnt_namespace(mnt->mnt_ns); |
2405 | umount_tree(mnt, 1); | 2408 | umount_tree(mnt, 1); |
2406 | } | 2409 | } |
2407 | unlock_mount_hash(); | 2410 | unlock_mount_hash(); |
2408 | namespace_unlock(); | 2411 | namespace_unlock(); |
2409 | } | 2412 | } |
2410 | 2413 | ||
2411 | EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); | 2414 | EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); |
2412 | 2415 | ||
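mark_mounts_for_expiry() above is a two-pass mark-and-sweep: the first pass over a list sets mnt_expiry_mark, and a mount that is still marked on the next pass (mntput() clears the mark on any use in between) and is otherwise unreferenced gets umount_tree()'d. A minimal sketch of how a filesystem might drive it from a periodic work item; the my_fs_* names are illustrative, not part of this patch:

/* Sketch only; assumes linux/list.h, linux/workqueue.h, linux/mount.h.
 * my_fs_automount_list and my_fs_expiry_work are hypothetical names. */
static LIST_HEAD(my_fs_automount_list);
static void my_fs_expire(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_fs_expiry_work, my_fs_expire);

static void my_fs_expire(struct work_struct *work)
{
	/* first call marks; a later call reaps whatever stayed marked */
	mark_mounts_for_expiry(&my_fs_automount_list);
	if (!list_empty(&my_fs_automount_list))
		schedule_delayed_work(&my_fs_expiry_work, 30 * HZ);
}

/* each automounted mount would be queued with:
 *	mnt_set_expiry(mnt, &my_fs_automount_list);
 */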
2413 | /* | 2416 | /* |
2414 | * Ripoff of 'select_parent()' | 2417 | * Ripoff of 'select_parent()' |
2415 | * | 2418 | * |
2416 | * search the list of submounts for a given mountpoint, and move any | 2419 | * search the list of submounts for a given mountpoint, and move any |
2417 | * shrinkable submounts to the 'graveyard' list. | 2420 | * shrinkable submounts to the 'graveyard' list. |
2418 | */ | 2421 | */ |
2419 | static int select_submounts(struct mount *parent, struct list_head *graveyard) | 2422 | static int select_submounts(struct mount *parent, struct list_head *graveyard) |
2420 | { | 2423 | { |
2421 | struct mount *this_parent = parent; | 2424 | struct mount *this_parent = parent; |
2422 | struct list_head *next; | 2425 | struct list_head *next; |
2423 | int found = 0; | 2426 | int found = 0; |
2424 | 2427 | ||
2425 | repeat: | 2428 | repeat: |
2426 | next = this_parent->mnt_mounts.next; | 2429 | next = this_parent->mnt_mounts.next; |
2427 | resume: | 2430 | resume: |
2428 | while (next != &this_parent->mnt_mounts) { | 2431 | while (next != &this_parent->mnt_mounts) { |
2429 | struct list_head *tmp = next; | 2432 | struct list_head *tmp = next; |
2430 | struct mount *mnt = list_entry(tmp, struct mount, mnt_child); | 2433 | struct mount *mnt = list_entry(tmp, struct mount, mnt_child); |
2431 | 2434 | ||
2432 | next = tmp->next; | 2435 | next = tmp->next; |
2433 | if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) | 2436 | if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) |
2434 | continue; | 2437 | continue; |
2435 | /* | 2438 | /* |
2436 | * Descend a level if the mnt_mounts list is non-empty. | 2439 | * Descend a level if the mnt_mounts list is non-empty. |
2437 | */ | 2440 | */ |
2438 | if (!list_empty(&mnt->mnt_mounts)) { | 2441 | if (!list_empty(&mnt->mnt_mounts)) { |
2439 | this_parent = mnt; | 2442 | this_parent = mnt; |
2440 | goto repeat; | 2443 | goto repeat; |
2441 | } | 2444 | } |
2442 | 2445 | ||
2443 | if (!propagate_mount_busy(mnt, 1)) { | 2446 | if (!propagate_mount_busy(mnt, 1)) { |
2444 | list_move_tail(&mnt->mnt_expire, graveyard); | 2447 | list_move_tail(&mnt->mnt_expire, graveyard); |
2445 | found++; | 2448 | found++; |
2446 | } | 2449 | } |
2447 | } | 2450 | } |
2448 | /* | 2451 | /* |
2449 | * All done at this level ... ascend and resume the search | 2452 | * All done at this level ... ascend and resume the search |
2450 | */ | 2453 | */ |
2451 | if (this_parent != parent) { | 2454 | if (this_parent != parent) { |
2452 | next = this_parent->mnt_child.next; | 2455 | next = this_parent->mnt_child.next; |
2453 | this_parent = this_parent->mnt_parent; | 2456 | this_parent = this_parent->mnt_parent; |
2454 | goto resume; | 2457 | goto resume; |
2455 | } | 2458 | } |
2456 | return found; | 2459 | return found; |
2457 | } | 2460 | } |
2458 | 2461 | ||
2459 | /* | 2462 | /* |
2460 | * process a list of expirable mountpoints with the intent of discarding any | 2463 | * process a list of expirable mountpoints with the intent of discarding any |
2461 | * submounts of a specific parent mountpoint | 2464 | * submounts of a specific parent mountpoint |
2462 | * | 2465 | * |
2463 | * mount_lock must be held for write | 2466 | * mount_lock must be held for write |
2464 | */ | 2467 | */ |
2465 | static void shrink_submounts(struct mount *mnt) | 2468 | static void shrink_submounts(struct mount *mnt) |
2466 | { | 2469 | { |
2467 | LIST_HEAD(graveyard); | 2470 | LIST_HEAD(graveyard); |
2468 | struct mount *m; | 2471 | struct mount *m; |
2469 | 2472 | ||
2470 | /* extract submounts of 'mountpoint' from the expiration list */ | 2473 | /* extract submounts of 'mountpoint' from the expiration list */ |
2471 | while (select_submounts(mnt, &graveyard)) { | 2474 | while (select_submounts(mnt, &graveyard)) { |
2472 | while (!list_empty(&graveyard)) { | 2475 | while (!list_empty(&graveyard)) { |
2473 | m = list_first_entry(&graveyard, struct mount, | 2476 | m = list_first_entry(&graveyard, struct mount, |
2474 | mnt_expire); | 2477 | mnt_expire); |
2475 | touch_mnt_namespace(m->mnt_ns); | 2478 | touch_mnt_namespace(m->mnt_ns); |
2476 | umount_tree(m, 1); | 2479 | umount_tree(m, 1); |
2477 | } | 2480 | } |
2478 | } | 2481 | } |
2479 | } | 2482 | } |
2480 | 2483 | ||
2481 | /* | 2484 | /* |
2482 | * Some copy_from_user() implementations do not return the exact number of | 2485 | * Some copy_from_user() implementations do not return the exact number of |
2483 | * bytes remaining to copy on a fault. But copy_mount_options() requires that. | 2486 | * bytes remaining to copy on a fault. But copy_mount_options() requires that. |
2484 | * Note that this function differs from copy_from_user() in that it will oops | 2487 | * Note that this function differs from copy_from_user() in that it will oops |
2485 | * on bad values of `to', rather than returning a short copy. | 2488 | * on bad values of `to', rather than returning a short copy. |
2486 | */ | 2489 | */ |
2487 | static long exact_copy_from_user(void *to, const void __user * from, | 2490 | static long exact_copy_from_user(void *to, const void __user * from, |
2488 | unsigned long n) | 2491 | unsigned long n) |
2489 | { | 2492 | { |
2490 | char *t = to; | 2493 | char *t = to; |
2491 | const char __user *f = from; | 2494 | const char __user *f = from; |
2492 | char c; | 2495 | char c; |
2493 | 2496 | ||
2494 | if (!access_ok(VERIFY_READ, from, n)) | 2497 | if (!access_ok(VERIFY_READ, from, n)) |
2495 | return n; | 2498 | return n; |
2496 | 2499 | ||
2497 | while (n) { | 2500 | while (n) { |
2498 | if (__get_user(c, f)) { | 2501 | if (__get_user(c, f)) { |
2499 | memset(t, 0, n); | 2502 | memset(t, 0, n); |
2500 | break; | 2503 | break; |
2501 | } | 2504 | } |
2502 | *t++ = c; | 2505 | *t++ = c; |
2503 | f++; | 2506 | f++; |
2504 | n--; | 2507 | n--; |
2505 | } | 2508 | } |
2506 | return n; | 2509 | return n; |
2507 | } | 2510 | } |
2508 | 2511 | ||
2509 | int copy_mount_options(const void __user * data, unsigned long *where) | 2512 | int copy_mount_options(const void __user * data, unsigned long *where) |
2510 | { | 2513 | { |
2511 | int i; | 2514 | int i; |
2512 | unsigned long page; | 2515 | unsigned long page; |
2513 | unsigned long size; | 2516 | unsigned long size; |
2514 | 2517 | ||
2515 | *where = 0; | 2518 | *where = 0; |
2516 | if (!data) | 2519 | if (!data) |
2517 | return 0; | 2520 | return 0; |
2518 | 2521 | ||
2519 | if (!(page = __get_free_page(GFP_KERNEL))) | 2522 | if (!(page = __get_free_page(GFP_KERNEL))) |
2520 | return -ENOMEM; | 2523 | return -ENOMEM; |
2521 | 2524 | ||
2522 | /* We only care that *some* data at the address the user | 2525 | /* We only care that *some* data at the address the user |
2523 | * gave us is valid. Just in case, we'll zero | 2526 | * gave us is valid. Just in case, we'll zero |
2524 | * the remainder of the page. | 2527 | * the remainder of the page. |
2525 | */ | 2528 | */ |
2526 | /* copy_from_user cannot cross TASK_SIZE ! */ | 2529 | /* copy_from_user cannot cross TASK_SIZE ! */ |
2527 | size = TASK_SIZE - (unsigned long)data; | 2530 | size = TASK_SIZE - (unsigned long)data; |
2528 | if (size > PAGE_SIZE) | 2531 | if (size > PAGE_SIZE) |
2529 | size = PAGE_SIZE; | 2532 | size = PAGE_SIZE; |
2530 | 2533 | ||
2531 | i = size - exact_copy_from_user((void *)page, data, size); | 2534 | i = size - exact_copy_from_user((void *)page, data, size); |
2532 | if (!i) { | 2535 | if (!i) { |
2533 | free_page(page); | 2536 | free_page(page); |
2534 | return -EFAULT; | 2537 | return -EFAULT; |
2535 | } | 2538 | } |
2536 | if (i != PAGE_SIZE) | 2539 | if (i != PAGE_SIZE) |
2537 | memset((char *)page + i, 0, PAGE_SIZE - i); | 2540 | memset((char *)page + i, 0, PAGE_SIZE - i); |
2538 | *where = page; | 2541 | *where = page; |
2539 | return 0; | 2542 | return 0; |
2540 | } | 2543 | } |
2541 | 2544 | ||
2542 | char *copy_mount_string(const void __user *data) | 2545 | char *copy_mount_string(const void __user *data) |
2543 | { | 2546 | { |
2544 | return data ? strndup_user(data, PAGE_SIZE) : NULL; | 2547 | return data ? strndup_user(data, PAGE_SIZE) : NULL; |
2545 | } | 2548 | } |
2546 | 2549 | ||
2547 | /* | 2550 | /* |
2548 | * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to | 2551 | * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to |
2549 | * be given to the mount() call (i.e. read-only, no-dev, no-suid, etc). | 2552 | * be given to the mount() call (i.e. read-only, no-dev, no-suid, etc). |
2550 | * | 2553 | * |
2551 | * data is a (void *) that can point to any structure up to | 2554 | * data is a (void *) that can point to any structure up to |
2552 | * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent | 2555 | * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent |
2553 | * information (or be NULL). | 2556 | * information (or be NULL). |
2554 | * | 2557 | * |
2555 | * Pre-0.97 versions of mount() didn't have a flags word. | 2558 | * Pre-0.97 versions of mount() didn't have a flags word. |
2556 | * When the flags word was introduced its top half was required | 2559 | * When the flags word was introduced its top half was required |
2557 | * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. | 2560 | * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. |
2558 | * Therefore, if this magic number is present, it carries no information | 2561 | * Therefore, if this magic number is present, it carries no information |
2559 | * and must be discarded. | 2562 | * and must be discarded. |
2560 | */ | 2563 | */ |
2561 | long do_mount(const char *dev_name, const char __user *dir_name, | 2564 | long do_mount(const char *dev_name, const char __user *dir_name, |
2562 | const char *type_page, unsigned long flags, void *data_page) | 2565 | const char *type_page, unsigned long flags, void *data_page) |
2563 | { | 2566 | { |
2564 | struct path path; | 2567 | struct path path; |
2565 | int retval = 0; | 2568 | int retval = 0; |
2566 | int mnt_flags = 0; | 2569 | int mnt_flags = 0; |
2567 | 2570 | ||
2568 | /* Discard magic */ | 2571 | /* Discard magic */ |
2569 | if ((flags & MS_MGC_MSK) == MS_MGC_VAL) | 2572 | if ((flags & MS_MGC_MSK) == MS_MGC_VAL) |
2570 | flags &= ~MS_MGC_MSK; | 2573 | flags &= ~MS_MGC_MSK; |
2571 | 2574 | ||
2572 | /* Basic sanity checks */ | 2575 | /* Basic sanity checks */ |
2573 | if (data_page) | 2576 | if (data_page) |
2574 | ((char *)data_page)[PAGE_SIZE - 1] = 0; | 2577 | ((char *)data_page)[PAGE_SIZE - 1] = 0; |
2575 | 2578 | ||
2576 | /* ... and get the mountpoint */ | 2579 | /* ... and get the mountpoint */ |
2577 | retval = user_path(dir_name, &path); | 2580 | retval = user_path(dir_name, &path); |
2578 | if (retval) | 2581 | if (retval) |
2579 | return retval; | 2582 | return retval; |
2580 | 2583 | ||
2581 | retval = security_sb_mount(dev_name, &path, | 2584 | retval = security_sb_mount(dev_name, &path, |
2582 | type_page, flags, data_page); | 2585 | type_page, flags, data_page); |
2583 | if (!retval && !may_mount()) | 2586 | if (!retval && !may_mount()) |
2584 | retval = -EPERM; | 2587 | retval = -EPERM; |
2585 | if (retval) | 2588 | if (retval) |
2586 | goto dput_out; | 2589 | goto dput_out; |
2587 | 2590 | ||
2588 | /* Default to relatime unless overridden */ | 2591 | /* Default to relatime unless overridden */ |
2589 | if (!(flags & MS_NOATIME)) | 2592 | if (!(flags & MS_NOATIME)) |
2590 | mnt_flags |= MNT_RELATIME; | 2593 | mnt_flags |= MNT_RELATIME; |
2591 | 2594 | ||
2592 | /* Separate the per-mountpoint flags */ | 2595 | /* Separate the per-mountpoint flags */ |
2593 | if (flags & MS_NOSUID) | 2596 | if (flags & MS_NOSUID) |
2594 | mnt_flags |= MNT_NOSUID; | 2597 | mnt_flags |= MNT_NOSUID; |
2595 | if (flags & MS_NODEV) | 2598 | if (flags & MS_NODEV) |
2596 | mnt_flags |= MNT_NODEV; | 2599 | mnt_flags |= MNT_NODEV; |
2597 | if (flags & MS_NOEXEC) | 2600 | if (flags & MS_NOEXEC) |
2598 | mnt_flags |= MNT_NOEXEC; | 2601 | mnt_flags |= MNT_NOEXEC; |
2599 | if (flags & MS_NOATIME) | 2602 | if (flags & MS_NOATIME) |
2600 | mnt_flags |= MNT_NOATIME; | 2603 | mnt_flags |= MNT_NOATIME; |
2601 | if (flags & MS_NODIRATIME) | 2604 | if (flags & MS_NODIRATIME) |
2602 | mnt_flags |= MNT_NODIRATIME; | 2605 | mnt_flags |= MNT_NODIRATIME; |
2603 | if (flags & MS_STRICTATIME) | 2606 | if (flags & MS_STRICTATIME) |
2604 | mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); | 2607 | mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); |
2605 | if (flags & MS_RDONLY) | 2608 | if (flags & MS_RDONLY) |
2606 | mnt_flags |= MNT_READONLY; | 2609 | mnt_flags |= MNT_READONLY; |
2607 | 2610 | ||
2608 | /* The default atime for remount is preservation */ | 2611 | /* The default atime for remount is preservation */ |
2609 | if ((flags & MS_REMOUNT) && | 2612 | if ((flags & MS_REMOUNT) && |
2610 | ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | | 2613 | ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | |
2611 | MS_STRICTATIME)) == 0)) { | 2614 | MS_STRICTATIME)) == 0)) { |
2612 | mnt_flags &= ~MNT_ATIME_MASK; | 2615 | mnt_flags &= ~MNT_ATIME_MASK; |
2613 | mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; | 2616 | mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; |
2614 | } | 2617 | } |
2615 | 2618 | ||
2616 | flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | | 2619 | flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | |
2617 | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | | 2620 | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | |
2618 | MS_STRICTATIME); | 2621 | MS_STRICTATIME); |
2619 | 2622 | ||
2620 | if (flags & MS_REMOUNT) | 2623 | if (flags & MS_REMOUNT) |
2621 | retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, | 2624 | retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, |
2622 | data_page); | 2625 | data_page); |
2623 | else if (flags & MS_BIND) | 2626 | else if (flags & MS_BIND) |
2624 | retval = do_loopback(&path, dev_name, flags & MS_REC); | 2627 | retval = do_loopback(&path, dev_name, flags & MS_REC); |
2625 | else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) | 2628 | else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) |
2626 | retval = do_change_type(&path, flags); | 2629 | retval = do_change_type(&path, flags); |
2627 | else if (flags & MS_MOVE) | 2630 | else if (flags & MS_MOVE) |
2628 | retval = do_move_mount(&path, dev_name); | 2631 | retval = do_move_mount(&path, dev_name); |
2629 | else | 2632 | else |
2630 | retval = do_new_mount(&path, type_page, flags, mnt_flags, | 2633 | retval = do_new_mount(&path, type_page, flags, mnt_flags, |
2631 | dev_name, data_page); | 2634 | dev_name, data_page); |
2632 | dput_out: | 2635 | dput_out: |
2633 | path_put(&path); | 2636 | path_put(&path); |
2634 | return retval; | 2637 | return retval; |
2635 | } | 2638 | } |
2636 | 2639 | ||
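A hedged userspace illustration of the dispatch at the bottom of do_mount(): a plain MS_BIND takes the do_loopback() branch, which does not apply the per-mountpoint flags, so the usual way to get a read-only bind mount is a second MS_REMOUNT|MS_BIND call, which takes the do_remount() branch and does apply them. Paths are made up:

/* Userspace sketch; /srv/data and /mnt/ro are hypothetical paths. */
#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* step 1: MS_BIND branch - creates the bind mount */
	if (mount("/srv/data", "/mnt/ro", NULL, MS_BIND, NULL) == -1)
		return perror("bind"), 1;
	/* step 2: MS_REMOUNT branch - MS_RDONLY now lands in mnt_flags */
	if (mount(NULL, "/mnt/ro", NULL,
		  MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) == -1)
		return perror("remount"), 1;
	return 0;
}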
2637 | static void free_mnt_ns(struct mnt_namespace *ns) | 2640 | static void free_mnt_ns(struct mnt_namespace *ns) |
2638 | { | 2641 | { |
2639 | ns_free_inum(&ns->ns); | 2642 | ns_free_inum(&ns->ns); |
2640 | put_user_ns(ns->user_ns); | 2643 | put_user_ns(ns->user_ns); |
2641 | kfree(ns); | 2644 | kfree(ns); |
2642 | } | 2645 | } |
2643 | 2646 | ||
2644 | /* | 2647 | /* |
2645 | * Assign a sequence number so we can detect when we attempt to bind | 2648 | * Assign a sequence number so we can detect when we attempt to bind |
2646 | * mount a reference to an older mount namespace into the current | 2649 | * mount a reference to an older mount namespace into the current |
2647 | * mount namespace, preventing reference counting loops. A 64-bit | 2650 | * mount namespace, preventing reference counting loops. A 64-bit |
2648 | * number incrementing at 10GHz would take over 58 years to wrap; | 2651 | * number incrementing at 10GHz would take over 58 years to wrap; |
2649 | * real creation rates are far lower, so we can ignore the possibility. | 2652 | * real creation rates are far lower, so we can ignore the possibility. |
2650 | */ | 2653 | */ |
2651 | static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); | 2654 | static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); |
2652 | 2655 | ||
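For reference, the arithmetic behind the wrap-time bound above:

\( 2^{64} / (10^{10}\,\mathrm{s^{-1}}) \approx 1.84 \times 10^{9}\,\mathrm{s} \approx 58.5\ \mathrm{years} \)

and since mount namespaces are created at nowhere near 10GHz, the practical wrap time is longer by many orders of magnitude.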
2653 | static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) | 2656 | static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) |
2654 | { | 2657 | { |
2655 | struct mnt_namespace *new_ns; | 2658 | struct mnt_namespace *new_ns; |
2656 | int ret; | 2659 | int ret; |
2657 | 2660 | ||
2658 | new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); | 2661 | new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); |
2659 | if (!new_ns) | 2662 | if (!new_ns) |
2660 | return ERR_PTR(-ENOMEM); | 2663 | return ERR_PTR(-ENOMEM); |
2661 | ret = ns_alloc_inum(&new_ns->ns); | 2664 | ret = ns_alloc_inum(&new_ns->ns); |
2662 | if (ret) { | 2665 | if (ret) { |
2663 | kfree(new_ns); | 2666 | kfree(new_ns); |
2664 | return ERR_PTR(ret); | 2667 | return ERR_PTR(ret); |
2665 | } | 2668 | } |
2666 | new_ns->ns.ops = &mntns_operations; | 2669 | new_ns->ns.ops = &mntns_operations; |
2667 | new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); | 2670 | new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); |
2668 | atomic_set(&new_ns->count, 1); | 2671 | atomic_set(&new_ns->count, 1); |
2669 | new_ns->root = NULL; | 2672 | new_ns->root = NULL; |
2670 | INIT_LIST_HEAD(&new_ns->list); | 2673 | INIT_LIST_HEAD(&new_ns->list); |
2671 | init_waitqueue_head(&new_ns->poll); | 2674 | init_waitqueue_head(&new_ns->poll); |
2672 | new_ns->event = 0; | 2675 | new_ns->event = 0; |
2673 | new_ns->user_ns = get_user_ns(user_ns); | 2676 | new_ns->user_ns = get_user_ns(user_ns); |
2674 | return new_ns; | 2677 | return new_ns; |
2675 | } | 2678 | } |
2676 | 2679 | ||
2677 | struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, | 2680 | struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, |
2678 | struct user_namespace *user_ns, struct fs_struct *new_fs) | 2681 | struct user_namespace *user_ns, struct fs_struct *new_fs) |
2679 | { | 2682 | { |
2680 | struct mnt_namespace *new_ns; | 2683 | struct mnt_namespace *new_ns; |
2681 | struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; | 2684 | struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; |
2682 | struct mount *p, *q; | 2685 | struct mount *p, *q; |
2683 | struct mount *old; | 2686 | struct mount *old; |
2684 | struct mount *new; | 2687 | struct mount *new; |
2685 | int copy_flags; | 2688 | int copy_flags; |
2686 | 2689 | ||
2687 | BUG_ON(!ns); | 2690 | BUG_ON(!ns); |
2688 | 2691 | ||
2689 | if (likely(!(flags & CLONE_NEWNS))) { | 2692 | if (likely(!(flags & CLONE_NEWNS))) { |
2690 | get_mnt_ns(ns); | 2693 | get_mnt_ns(ns); |
2691 | return ns; | 2694 | return ns; |
2692 | } | 2695 | } |
2693 | 2696 | ||
2694 | old = ns->root; | 2697 | old = ns->root; |
2695 | 2698 | ||
2696 | new_ns = alloc_mnt_ns(user_ns); | 2699 | new_ns = alloc_mnt_ns(user_ns); |
2697 | if (IS_ERR(new_ns)) | 2700 | if (IS_ERR(new_ns)) |
2698 | return new_ns; | 2701 | return new_ns; |
2699 | 2702 | ||
2700 | namespace_lock(); | 2703 | namespace_lock(); |
2701 | /* First pass: copy the tree topology */ | 2704 | /* First pass: copy the tree topology */ |
2702 | copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; | 2705 | copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; |
2703 | if (user_ns != ns->user_ns) | 2706 | if (user_ns != ns->user_ns) |
2704 | copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; | 2707 | copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; |
2705 | new = copy_tree(old, old->mnt.mnt_root, copy_flags); | 2708 | new = copy_tree(old, old->mnt.mnt_root, copy_flags); |
2706 | if (IS_ERR(new)) { | 2709 | if (IS_ERR(new)) { |
2707 | namespace_unlock(); | 2710 | namespace_unlock(); |
2708 | free_mnt_ns(new_ns); | 2711 | free_mnt_ns(new_ns); |
2709 | return ERR_CAST(new); | 2712 | return ERR_CAST(new); |
2710 | } | 2713 | } |
2711 | new_ns->root = new; | 2714 | new_ns->root = new; |
2712 | list_add_tail(&new_ns->list, &new->mnt_list); | 2715 | list_add_tail(&new_ns->list, &new->mnt_list); |
2713 | 2716 | ||
2714 | /* | 2717 | /* |
2715 | * Second pass: switch the tsk->fs->* elements and mark new vfsmounts | 2718 | * Second pass: switch the tsk->fs->* elements and mark new vfsmounts |
2716 | * as belonging to new namespace. We have already acquired a private | 2719 | * as belonging to new namespace. We have already acquired a private |
2717 | * fs_struct, so tsk->fs->lock is not needed. | 2720 | * fs_struct, so tsk->fs->lock is not needed. |
2718 | */ | 2721 | */ |
2719 | p = old; | 2722 | p = old; |
2720 | q = new; | 2723 | q = new; |
2721 | while (p) { | 2724 | while (p) { |
2722 | q->mnt_ns = new_ns; | 2725 | q->mnt_ns = new_ns; |
2723 | if (new_fs) { | 2726 | if (new_fs) { |
2724 | if (&p->mnt == new_fs->root.mnt) { | 2727 | if (&p->mnt == new_fs->root.mnt) { |
2725 | new_fs->root.mnt = mntget(&q->mnt); | 2728 | new_fs->root.mnt = mntget(&q->mnt); |
2726 | rootmnt = &p->mnt; | 2729 | rootmnt = &p->mnt; |
2727 | } | 2730 | } |
2728 | if (&p->mnt == new_fs->pwd.mnt) { | 2731 | if (&p->mnt == new_fs->pwd.mnt) { |
2729 | new_fs->pwd.mnt = mntget(&q->mnt); | 2732 | new_fs->pwd.mnt = mntget(&q->mnt); |
2730 | pwdmnt = &p->mnt; | 2733 | pwdmnt = &p->mnt; |
2731 | } | 2734 | } |
2732 | } | 2735 | } |
2733 | p = next_mnt(p, old); | 2736 | p = next_mnt(p, old); |
2734 | q = next_mnt(q, new); | 2737 | q = next_mnt(q, new); |
2735 | if (!q) | 2738 | if (!q) |
2736 | break; | 2739 | break; |
2737 | while (p->mnt.mnt_root != q->mnt.mnt_root) | 2740 | while (p->mnt.mnt_root != q->mnt.mnt_root) |
2738 | p = next_mnt(p, old); | 2741 | p = next_mnt(p, old); |
2739 | } | 2742 | } |
2740 | namespace_unlock(); | 2743 | namespace_unlock(); |
2741 | 2744 | ||
2742 | if (rootmnt) | 2745 | if (rootmnt) |
2743 | mntput(rootmnt); | 2746 | mntput(rootmnt); |
2744 | if (pwdmnt) | 2747 | if (pwdmnt) |
2745 | mntput(pwdmnt); | 2748 | mntput(pwdmnt); |
2746 | 2749 | ||
2747 | return new_ns; | 2750 | return new_ns; |
2748 | } | 2751 | } |
2749 | 2752 | ||
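copy_mnt_ns() is what a CLONE_NEWNS unshare reaches; a minimal userspace sketch of triggering it and making the copied tree genuinely private (requires CAP_SYS_ADMIN, or a prior user namespace):

/* Userspace sketch: unshare(CLONE_NEWNS) -> copy_mnt_ns() above. */
#define _GNU_SOURCE
#include <sched.h>
#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	if (unshare(CLONE_NEWNS) == -1)
		return perror("unshare"), 1;
	/* the copied tree may still share propagation with the parent
	 * namespace; make it private so our mounts don't leak back */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1)
		return perror("MS_PRIVATE"), 1;
	return 0;
}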
2750 | /** | 2753 | /** |
2751 | * create_mnt_ns - creates a private namespace and adds a root filesystem | 2754 | * create_mnt_ns - creates a private namespace and adds a root filesystem |
2752 | * @mnt: pointer to the new root filesystem mountpoint | 2755 | * @mnt: pointer to the new root filesystem mountpoint |
2753 | */ | 2756 | */ |
2754 | static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) | 2757 | static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) |
2755 | { | 2758 | { |
2756 | struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); | 2759 | struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); |
2757 | if (!IS_ERR(new_ns)) { | 2760 | if (!IS_ERR(new_ns)) { |
2758 | struct mount *mnt = real_mount(m); | 2761 | struct mount *mnt = real_mount(m); |
2759 | mnt->mnt_ns = new_ns; | 2762 | mnt->mnt_ns = new_ns; |
2760 | new_ns->root = mnt; | 2763 | new_ns->root = mnt; |
2761 | list_add(&mnt->mnt_list, &new_ns->list); | 2764 | list_add(&mnt->mnt_list, &new_ns->list); |
2762 | } else { | 2765 | } else { |
2763 | mntput(m); | 2766 | mntput(m); |
2764 | } | 2767 | } |
2765 | return new_ns; | 2768 | return new_ns; |
2766 | } | 2769 | } |
2767 | 2770 | ||
2768 | struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) | 2771 | struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) |
2769 | { | 2772 | { |
2770 | struct mnt_namespace *ns; | 2773 | struct mnt_namespace *ns; |
2771 | struct super_block *s; | 2774 | struct super_block *s; |
2772 | struct path path; | 2775 | struct path path; |
2773 | int err; | 2776 | int err; |
2774 | 2777 | ||
2775 | ns = create_mnt_ns(mnt); | 2778 | ns = create_mnt_ns(mnt); |
2776 | if (IS_ERR(ns)) | 2779 | if (IS_ERR(ns)) |
2777 | return ERR_CAST(ns); | 2780 | return ERR_CAST(ns); |
2778 | 2781 | ||
2779 | err = vfs_path_lookup(mnt->mnt_root, mnt, | 2782 | err = vfs_path_lookup(mnt->mnt_root, mnt, |
2780 | name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); | 2783 | name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); |
2781 | 2784 | ||
2782 | put_mnt_ns(ns); | 2785 | put_mnt_ns(ns); |
2783 | 2786 | ||
2784 | if (err) | 2787 | if (err) |
2785 | return ERR_PTR(err); | 2788 | return ERR_PTR(err); |
2786 | 2789 | ||
2787 | /* trade a vfsmount reference for active sb one */ | 2790 | /* trade a vfsmount reference for active sb one */ |
2788 | s = path.mnt->mnt_sb; | 2791 | s = path.mnt->mnt_sb; |
2789 | atomic_inc(&s->s_active); | 2792 | atomic_inc(&s->s_active); |
2790 | mntput(path.mnt); | 2793 | mntput(path.mnt); |
2791 | /* lock the sucker */ | 2794 | /* lock the sucker */ |
2792 | down_write(&s->s_umount); | 2795 | down_write(&s->s_umount); |
2793 | /* ... and return the root of (sub)tree on it */ | 2796 | /* ... and return the root of (sub)tree on it */ |
2794 | return path.dentry; | 2797 | return path.dentry; |
2795 | } | 2798 | } |
2796 | EXPORT_SYMBOL(mount_subtree); | 2799 | EXPORT_SYMBOL(mount_subtree); |
2797 | 2800 | ||
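mount_subtree() consumes the vfsmount reference it is given and hands back the dentry of the looked-up subtree with s_active raised and s_umount held for write, which matches the calling convention of a ->mount method. A loose sketch of that pattern, assuming a filesystem that internally mounts one type and exposes a subtree of it (my_internal_type and the path are hypothetical; the NFS referral code is the in-tree user):

/* Sketch only: my_internal_type and "/export/home" are made up. */
static struct file_system_type my_internal_type;

static struct dentry *my_mount(struct file_system_type *fs_type,
			       int flags, const char *dev_name, void *data)
{
	struct vfsmount *mnt = vfs_kern_mount(&my_internal_type, flags,
					      dev_name, data);
	if (IS_ERR(mnt))
		return ERR_CAST(mnt);
	/* consumes mnt; returns the subtree root with an active sb
	 * reference and ->s_umount held, as ->mount must */
	return mount_subtree(mnt, "/export/home");
}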
2798 | SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, | 2801 | SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, |
2799 | char __user *, type, unsigned long, flags, void __user *, data) | 2802 | char __user *, type, unsigned long, flags, void __user *, data) |
2800 | { | 2803 | { |
2801 | int ret; | 2804 | int ret; |
2802 | char *kernel_type; | 2805 | char *kernel_type; |
2803 | char *kernel_dev; | 2806 | char *kernel_dev; |
2804 | unsigned long data_page; | 2807 | unsigned long data_page; |
2805 | 2808 | ||
2806 | kernel_type = copy_mount_string(type); | 2809 | kernel_type = copy_mount_string(type); |
2807 | ret = PTR_ERR(kernel_type); | 2810 | ret = PTR_ERR(kernel_type); |
2808 | if (IS_ERR(kernel_type)) | 2811 | if (IS_ERR(kernel_type)) |
2809 | goto out_type; | 2812 | goto out_type; |
2810 | 2813 | ||
2811 | kernel_dev = copy_mount_string(dev_name); | 2814 | kernel_dev = copy_mount_string(dev_name); |
2812 | ret = PTR_ERR(kernel_dev); | 2815 | ret = PTR_ERR(kernel_dev); |
2813 | if (IS_ERR(kernel_dev)) | 2816 | if (IS_ERR(kernel_dev)) |
2814 | goto out_dev; | 2817 | goto out_dev; |
2815 | 2818 | ||
2816 | ret = copy_mount_options(data, &data_page); | 2819 | ret = copy_mount_options(data, &data_page); |
2817 | if (ret < 0) | 2820 | if (ret < 0) |
2818 | goto out_data; | 2821 | goto out_data; |
2819 | 2822 | ||
2820 | ret = do_mount(kernel_dev, dir_name, kernel_type, flags, | 2823 | ret = do_mount(kernel_dev, dir_name, kernel_type, flags, |
2821 | (void *) data_page); | 2824 | (void *) data_page); |
2822 | 2825 | ||
2823 | free_page(data_page); | 2826 | free_page(data_page); |
2824 | out_data: | 2827 | out_data: |
2825 | kfree(kernel_dev); | 2828 | kfree(kernel_dev); |
2826 | out_dev: | 2829 | out_dev: |
2827 | kfree(kernel_type); | 2830 | kfree(kernel_type); |
2828 | out_type: | 2831 | out_type: |
2829 | return ret; | 2832 | return ret; |
2830 | } | 2833 | } |
2831 | 2834 | ||
2832 | /* | 2835 | /* |
2833 | * Return true if path is reachable from root | 2836 | * Return true if path is reachable from root |
2834 | * | 2837 | * |
2835 | * namespace_sem or mount_lock is held | 2838 | * namespace_sem or mount_lock is held |
2836 | */ | 2839 | */ |
2837 | bool is_path_reachable(struct mount *mnt, struct dentry *dentry, | 2840 | bool is_path_reachable(struct mount *mnt, struct dentry *dentry, |
2838 | const struct path *root) | 2841 | const struct path *root) |
2839 | { | 2842 | { |
2840 | while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { | 2843 | while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { |
2841 | dentry = mnt->mnt_mountpoint; | 2844 | dentry = mnt->mnt_mountpoint; |
2842 | mnt = mnt->mnt_parent; | 2845 | mnt = mnt->mnt_parent; |
2843 | } | 2846 | } |
2844 | return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); | 2847 | return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); |
2845 | } | 2848 | } |
2846 | 2849 | ||
2847 | int path_is_under(struct path *path1, struct path *path2) | 2850 | int path_is_under(struct path *path1, struct path *path2) |
2848 | { | 2851 | { |
2849 | int res; | 2852 | int res; |
2850 | read_seqlock_excl(&mount_lock); | 2853 | read_seqlock_excl(&mount_lock); |
2851 | res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); | 2854 | res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); |
2852 | read_sequnlock_excl(&mount_lock); | 2855 | read_sequnlock_excl(&mount_lock); |
2853 | return res; | 2856 | return res; |
2854 | } | 2857 | } |
2855 | EXPORT_SYMBOL(path_is_under); | 2858 | EXPORT_SYMBOL(path_is_under); |
2856 | 2859 | ||
2857 | /* | 2860 | /* |
2858 | * pivot_root Semantics: | 2861 | * pivot_root Semantics: |
2859 | * Moves the root file system of the current process to the directory put_old, | 2862 | * Moves the root file system of the current process to the directory put_old, |
2860 | * makes new_root the new root file system of the current process, and sets | 2863 | * makes new_root the new root file system of the current process, and sets |
2861 | * root/cwd of all processes which had them on the current root to new_root. | 2864 | * root/cwd of all processes which had them on the current root to new_root. |
2862 | * | 2865 | * |
2863 | * Restrictions: | 2866 | * Restrictions: |
2864 | * The new_root and put_old must be directories, and must not be on the | 2867 | * The new_root and put_old must be directories, and must not be on the |
2865 | * same file system as the current process root. The put_old must be | 2868 | * same file system as the current process root. The put_old must be |
2866 | * underneath new_root, i.e. adding a non-zero number of /.. to the string | 2869 | * underneath new_root, i.e. adding a non-zero number of /.. to the string |
2867 | * pointed to by put_old must yield the same directory as new_root. No other | 2870 | * pointed to by put_old must yield the same directory as new_root. No other |
2868 | * file system may be mounted on put_old. After all, new_root is a mountpoint. | 2871 | * file system may be mounted on put_old. After all, new_root is a mountpoint. |
2869 | * | 2872 | * |
2870 | * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. | 2873 | * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. |
2871 | * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives | 2874 | * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives |
2872 | * in this situation. | 2875 | * in this situation. |
2873 | * | 2876 | * |
2874 | * Notes: | 2877 | * Notes: |
2875 | * - we don't move root/cwd if they are not at the root (reason: if something | 2878 | * - we don't move root/cwd if they are not at the root (reason: if something |
2876 | * cared enough to change them, it's probably wrong to force them elsewhere) | 2879 | * cared enough to change them, it's probably wrong to force them elsewhere) |
2877 | * - it's okay to pick a root that isn't the root of a file system, e.g. | 2880 | * - it's okay to pick a root that isn't the root of a file system, e.g. |
2878 | * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, | 2881 | * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, |
2879 | * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root | 2882 | * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root |
2880 | * first. | 2883 | * first. |
2881 | */ | 2884 | */ |
2882 | SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, | 2885 | SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, |
2883 | const char __user *, put_old) | 2886 | const char __user *, put_old) |
2884 | { | 2887 | { |
2885 | struct path new, old, parent_path, root_parent, root; | 2888 | struct path new, old, parent_path, root_parent, root; |
2886 | struct mount *new_mnt, *root_mnt, *old_mnt; | 2889 | struct mount *new_mnt, *root_mnt, *old_mnt; |
2887 | struct mountpoint *old_mp, *root_mp; | 2890 | struct mountpoint *old_mp, *root_mp; |
2888 | int error; | 2891 | int error; |
2889 | 2892 | ||
2890 | if (!may_mount()) | 2893 | if (!may_mount()) |
2891 | return -EPERM; | 2894 | return -EPERM; |
2892 | 2895 | ||
2893 | error = user_path_dir(new_root, &new); | 2896 | error = user_path_dir(new_root, &new); |
2894 | if (error) | 2897 | if (error) |
2895 | goto out0; | 2898 | goto out0; |
2896 | 2899 | ||
2897 | error = user_path_dir(put_old, &old); | 2900 | error = user_path_dir(put_old, &old); |
2898 | if (error) | 2901 | if (error) |
2899 | goto out1; | 2902 | goto out1; |
2900 | 2903 | ||
2901 | error = security_sb_pivotroot(&old, &new); | 2904 | error = security_sb_pivotroot(&old, &new); |
2902 | if (error) | 2905 | if (error) |
2903 | goto out2; | 2906 | goto out2; |
2904 | 2907 | ||
2905 | get_fs_root(current->fs, &root); | 2908 | get_fs_root(current->fs, &root); |
2906 | old_mp = lock_mount(&old); | 2909 | old_mp = lock_mount(&old); |
2907 | error = PTR_ERR(old_mp); | 2910 | error = PTR_ERR(old_mp); |
2908 | if (IS_ERR(old_mp)) | 2911 | if (IS_ERR(old_mp)) |
2909 | goto out3; | 2912 | goto out3; |
2910 | 2913 | ||
2911 | error = -EINVAL; | 2914 | error = -EINVAL; |
2912 | new_mnt = real_mount(new.mnt); | 2915 | new_mnt = real_mount(new.mnt); |
2913 | root_mnt = real_mount(root.mnt); | 2916 | root_mnt = real_mount(root.mnt); |
2914 | old_mnt = real_mount(old.mnt); | 2917 | old_mnt = real_mount(old.mnt); |
2915 | if (IS_MNT_SHARED(old_mnt) || | 2918 | if (IS_MNT_SHARED(old_mnt) || |
2916 | IS_MNT_SHARED(new_mnt->mnt_parent) || | 2919 | IS_MNT_SHARED(new_mnt->mnt_parent) || |
2917 | IS_MNT_SHARED(root_mnt->mnt_parent)) | 2920 | IS_MNT_SHARED(root_mnt->mnt_parent)) |
2918 | goto out4; | 2921 | goto out4; |
2919 | if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) | 2922 | if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) |
2920 | goto out4; | 2923 | goto out4; |
2921 | if (new_mnt->mnt.mnt_flags & MNT_LOCKED) | 2924 | if (new_mnt->mnt.mnt_flags & MNT_LOCKED) |
2922 | goto out4; | 2925 | goto out4; |
2923 | error = -ENOENT; | 2926 | error = -ENOENT; |
2924 | if (d_unlinked(new.dentry)) | 2927 | if (d_unlinked(new.dentry)) |
2925 | goto out4; | 2928 | goto out4; |
2926 | error = -EBUSY; | 2929 | error = -EBUSY; |
2927 | if (new_mnt == root_mnt || old_mnt == root_mnt) | 2930 | if (new_mnt == root_mnt || old_mnt == root_mnt) |
2928 | goto out4; /* loop, on the same file system */ | 2931 | goto out4; /* loop, on the same file system */ |
2929 | error = -EINVAL; | 2932 | error = -EINVAL; |
2930 | if (root.mnt->mnt_root != root.dentry) | 2933 | if (root.mnt->mnt_root != root.dentry) |
2931 | goto out4; /* not a mountpoint */ | 2934 | goto out4; /* not a mountpoint */ |
2932 | if (!mnt_has_parent(root_mnt)) | 2935 | if (!mnt_has_parent(root_mnt)) |
2933 | goto out4; /* not attached */ | 2936 | goto out4; /* not attached */ |
2934 | root_mp = root_mnt->mnt_mp; | 2937 | root_mp = root_mnt->mnt_mp; |
2935 | if (new.mnt->mnt_root != new.dentry) | 2938 | if (new.mnt->mnt_root != new.dentry) |
2936 | goto out4; /* not a mountpoint */ | 2939 | goto out4; /* not a mountpoint */ |
2937 | if (!mnt_has_parent(new_mnt)) | 2940 | if (!mnt_has_parent(new_mnt)) |
2938 | goto out4; /* not attached */ | 2941 | goto out4; /* not attached */ |
2939 | /* make sure we can reach put_old from new_root */ | 2942 | /* make sure we can reach put_old from new_root */ |
2940 | if (!is_path_reachable(old_mnt, old.dentry, &new)) | 2943 | if (!is_path_reachable(old_mnt, old.dentry, &new)) |
2941 | goto out4; | 2944 | goto out4; |
2942 | /* make certain new is below the root */ | 2945 | /* make certain new is below the root */ |
2943 | if (!is_path_reachable(new_mnt, new.dentry, &root)) | 2946 | if (!is_path_reachable(new_mnt, new.dentry, &root)) |
2944 | goto out4; | 2947 | goto out4; |
2945 | root_mp->m_count++; /* pin it so it won't go away */ | 2948 | root_mp->m_count++; /* pin it so it won't go away */ |
2946 | lock_mount_hash(); | 2949 | lock_mount_hash(); |
2947 | detach_mnt(new_mnt, &parent_path); | 2950 | detach_mnt(new_mnt, &parent_path); |
2948 | detach_mnt(root_mnt, &root_parent); | 2951 | detach_mnt(root_mnt, &root_parent); |
2949 | if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { | 2952 | if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { |
2950 | new_mnt->mnt.mnt_flags |= MNT_LOCKED; | 2953 | new_mnt->mnt.mnt_flags |= MNT_LOCKED; |
2951 | root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; | 2954 | root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; |
2952 | } | 2955 | } |
2953 | /* mount old root on put_old */ | 2956 | /* mount old root on put_old */ |
2954 | attach_mnt(root_mnt, old_mnt, old_mp); | 2957 | attach_mnt(root_mnt, old_mnt, old_mp); |
2955 | /* mount new_root on / */ | 2958 | /* mount new_root on / */ |
2956 | attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); | 2959 | attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); |
2957 | touch_mnt_namespace(current->nsproxy->mnt_ns); | 2960 | touch_mnt_namespace(current->nsproxy->mnt_ns); |
2958 | unlock_mount_hash(); | 2961 | unlock_mount_hash(); |
2959 | chroot_fs_refs(&root, &new); | 2962 | chroot_fs_refs(&root, &new); |
2960 | put_mountpoint(root_mp); | 2963 | put_mountpoint(root_mp); |
2961 | error = 0; | 2964 | error = 0; |
2962 | out4: | 2965 | out4: |
2963 | unlock_mount(old_mp); | 2966 | unlock_mount(old_mp); |
2964 | if (!error) { | 2967 | if (!error) { |
2965 | path_put(&root_parent); | 2968 | path_put(&root_parent); |
2966 | path_put(&parent_path); | 2969 | path_put(&parent_path); |
2967 | } | 2970 | } |
2968 | out3: | 2971 | out3: |
2969 | path_put(&root); | 2972 | path_put(&root); |
2970 | out2: | 2973 | out2: |
2971 | path_put(&old); | 2974 | path_put(&old); |
2972 | out1: | 2975 | out1: |
2973 | path_put(&new); | 2976 | path_put(&new); |
2974 | out0: | 2977 | out0: |
2975 | return error; | 2978 | return error; |
2976 | } | 2979 | } |
2977 | 2980 | ||
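A userspace sketch of the restrictions documented above: new_root must be a mount point (hence the bind onto itself), put_old must be underneath it, and neither parent may be shared. /newroot and /newroot/old are hypothetical and must already exist:

/* Userspace sketch; no glibc wrapper exists, so use syscall(2). */
#define _GNU_SOURCE
#include <sched.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	if (unshare(CLONE_NEWNS) == -1)
		return perror("unshare"), 1;
	/* avoid the IS_MNT_SHARED() checks and propagation back out */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1)
		return perror("private"), 1;
	/* new_root must be a mount point: bind it onto itself */
	if (mount("/newroot", "/newroot", NULL, MS_BIND | MS_REC, NULL) == -1)
		return perror("bind"), 1;
	/* put_old is underneath new_root, as required */
	if (syscall(SYS_pivot_root, "/newroot", "/newroot/old") == -1)
		return perror("pivot_root"), 1;
	if (chdir("/") == -1)
		return perror("chdir"), 1;
	umount2("/old", MNT_DETACH);	/* detach the old root */
	return 0;
}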
2978 | static void __init init_mount_tree(void) | 2981 | static void __init init_mount_tree(void) |
2979 | { | 2982 | { |
2980 | struct vfsmount *mnt; | 2983 | struct vfsmount *mnt; |
2981 | struct mnt_namespace *ns; | 2984 | struct mnt_namespace *ns; |
2982 | struct path root; | 2985 | struct path root; |
2983 | struct file_system_type *type; | 2986 | struct file_system_type *type; |
2984 | 2987 | ||
2985 | type = get_fs_type("rootfs"); | 2988 | type = get_fs_type("rootfs"); |
2986 | if (!type) | 2989 | if (!type) |
2987 | panic("Can't find rootfs type"); | 2990 | panic("Can't find rootfs type"); |
2988 | mnt = vfs_kern_mount(type, 0, "rootfs", NULL); | 2991 | mnt = vfs_kern_mount(type, 0, "rootfs", NULL); |
2989 | put_filesystem(type); | 2992 | put_filesystem(type); |
2990 | if (IS_ERR(mnt)) | 2993 | if (IS_ERR(mnt)) |
2991 | panic("Can't create rootfs"); | 2994 | panic("Can't create rootfs"); |
2992 | 2995 | ||
2993 | ns = create_mnt_ns(mnt); | 2996 | ns = create_mnt_ns(mnt); |
2994 | if (IS_ERR(ns)) | 2997 | if (IS_ERR(ns)) |
2995 | panic("Can't allocate initial namespace"); | 2998 | panic("Can't allocate initial namespace"); |
2996 | 2999 | ||
2997 | init_task.nsproxy->mnt_ns = ns; | 3000 | init_task.nsproxy->mnt_ns = ns; |
2998 | get_mnt_ns(ns); | 3001 | get_mnt_ns(ns); |
2999 | 3002 | ||
3000 | root.mnt = mnt; | 3003 | root.mnt = mnt; |
3001 | root.dentry = mnt->mnt_root; | 3004 | root.dentry = mnt->mnt_root; |
3002 | 3005 | ||
3003 | set_fs_pwd(current->fs, &root); | 3006 | set_fs_pwd(current->fs, &root); |
3004 | set_fs_root(current->fs, &root); | 3007 | set_fs_root(current->fs, &root); |
3005 | } | 3008 | } |
3006 | 3009 | ||
3007 | void __init mnt_init(void) | 3010 | void __init mnt_init(void) |
3008 | { | 3011 | { |
3009 | unsigned u; | 3012 | unsigned u; |
3010 | int err; | 3013 | int err; |
3011 | 3014 | ||
3012 | mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), | 3015 | mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), |
3013 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); | 3016 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); |
3014 | 3017 | ||
3015 | mount_hashtable = alloc_large_system_hash("Mount-cache", | 3018 | mount_hashtable = alloc_large_system_hash("Mount-cache", |
3016 | sizeof(struct hlist_head), | 3019 | sizeof(struct hlist_head), |
3017 | mhash_entries, 19, | 3020 | mhash_entries, 19, |
3018 | 0, | 3021 | 0, |
3019 | &m_hash_shift, &m_hash_mask, 0, 0); | 3022 | &m_hash_shift, &m_hash_mask, 0, 0); |
3020 | mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", | 3023 | mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", |
3021 | sizeof(struct hlist_head), | 3024 | sizeof(struct hlist_head), |
3022 | mphash_entries, 19, | 3025 | mphash_entries, 19, |
3023 | 0, | 3026 | 0, |
3024 | &mp_hash_shift, &mp_hash_mask, 0, 0); | 3027 | &mp_hash_shift, &mp_hash_mask, 0, 0); |
3025 | 3028 | ||
3026 | if (!mount_hashtable || !mountpoint_hashtable) | 3029 | if (!mount_hashtable || !mountpoint_hashtable) |
3027 | panic("Failed to allocate mount hash table\n"); | 3030 | panic("Failed to allocate mount hash table\n"); |
3028 | 3031 | ||
3029 | for (u = 0; u <= m_hash_mask; u++) | 3032 | for (u = 0; u <= m_hash_mask; u++) |
3030 | INIT_HLIST_HEAD(&mount_hashtable[u]); | 3033 | INIT_HLIST_HEAD(&mount_hashtable[u]); |
3031 | for (u = 0; u <= mp_hash_mask; u++) | 3034 | for (u = 0; u <= mp_hash_mask; u++) |
3032 | INIT_HLIST_HEAD(&mountpoint_hashtable[u]); | 3035 | INIT_HLIST_HEAD(&mountpoint_hashtable[u]); |
3033 | 3036 | ||
3034 | kernfs_init(); | 3037 | kernfs_init(); |
3035 | 3038 | ||
3036 | err = sysfs_init(); | 3039 | err = sysfs_init(); |
3037 | if (err) | 3040 | if (err) |
3038 | printk(KERN_WARNING "%s: sysfs_init error: %d\n", | 3041 | printk(KERN_WARNING "%s: sysfs_init error: %d\n", |
3039 | __func__, err); | 3042 | __func__, err); |
3040 | fs_kobj = kobject_create_and_add("fs", NULL); | 3043 | fs_kobj = kobject_create_and_add("fs", NULL); |
3041 | if (!fs_kobj) | 3044 | if (!fs_kobj) |
3042 | printk(KERN_WARNING "%s: kobj create error\n", __func__); | 3045 | printk(KERN_WARNING "%s: kobj create error\n", __func__); |
3043 | init_rootfs(); | 3046 | init_rootfs(); |
3044 | init_mount_tree(); | 3047 | init_mount_tree(); |
3045 | } | 3048 | } |
3046 | 3049 | ||
3047 | void put_mnt_ns(struct mnt_namespace *ns) | 3050 | void put_mnt_ns(struct mnt_namespace *ns) |
3048 | { | 3051 | { |
3049 | if (!atomic_dec_and_test(&ns->count)) | 3052 | if (!atomic_dec_and_test(&ns->count)) |
3050 | return; | 3053 | return; |
3051 | drop_collected_mounts(&ns->root->mnt); | 3054 | drop_collected_mounts(&ns->root->mnt); |
3052 | free_mnt_ns(ns); | 3055 | free_mnt_ns(ns); |
3053 | } | 3056 | } |
3054 | 3057 | ||
3055 | struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) | 3058 | struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) |
3056 | { | 3059 | { |
3057 | struct vfsmount *mnt; | 3060 | struct vfsmount *mnt; |
3058 | mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); | 3061 | mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); |
3059 | if (!IS_ERR(mnt)) { | 3062 | if (!IS_ERR(mnt)) { |
3060 | /* | 3063 | /* |
3061 | * this is a long-term mount; don't release mnt until | 3064 | * this is a long-term mount; don't release mnt until |
3062 | * kern_unmount(), before the filesystem is unregistered | 3065 | * kern_unmount(), before the filesystem is unregistered |
3063 | */ | 3066 | */ |
3064 | real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; | 3067 | real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; |
3065 | } | 3068 | } |
3066 | return mnt; | 3069 | return mnt; |
3067 | } | 3070 | } |
3068 | EXPORT_SYMBOL_GPL(kern_mount_data); | 3071 | EXPORT_SYMBOL_GPL(kern_mount_data); |
3069 | 3072 | ||
3070 | void kern_unmount(struct vfsmount *mnt) | 3073 | void kern_unmount(struct vfsmount *mnt) |
3071 | { | 3074 | { |
3072 | /* release long term mount so mount point can be released */ | 3075 | /* release long term mount so mount point can be released */ |
3073 | if (!IS_ERR_OR_NULL(mnt)) { | 3076 | if (!IS_ERR_OR_NULL(mnt)) { |
3074 | real_mount(mnt)->mnt_ns = NULL; | 3077 | real_mount(mnt)->mnt_ns = NULL; |
3075 | synchronize_rcu(); /* yecchhh... */ | 3078 | synchronize_rcu(); /* yecchhh... */ |
3076 | mntput(mnt); | 3079 | mntput(mnt); |
3077 | } | 3080 | } |
3078 | } | 3081 | } |
3079 | EXPORT_SYMBOL(kern_unmount); | 3082 | EXPORT_SYMBOL(kern_unmount); |
3080 | 3083 | ||
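A sketch of the kern_mount_data()/kern_unmount() pairing: an internal, userspace-invisible mount pinned for a module's lifetime and released, in this order, at exit. my_fs_type and my_mnt are hypothetical names:

/* Sketch only; assumes linux/fs.h and linux/err.h. */
static struct file_system_type my_fs_type;
static struct vfsmount *my_mnt;

static int __init my_init(void)
{
	int err = register_filesystem(&my_fs_type);
	if (err)
		return err;
	my_mnt = kern_mount_data(&my_fs_type, NULL);
	if (IS_ERR(my_mnt)) {
		unregister_filesystem(&my_fs_type);
		return PTR_ERR(my_mnt);
	}
	return 0;
}

static void __exit my_exit(void)
{
	kern_unmount(my_mnt);			/* drop the mount first */
	unregister_filesystem(&my_fs_type);	/* then unregister */
}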
3081 | bool our_mnt(struct vfsmount *mnt) | 3084 | bool our_mnt(struct vfsmount *mnt) |
3082 | { | 3085 | { |
3083 | return check_mnt(real_mount(mnt)); | 3086 | return check_mnt(real_mount(mnt)); |
3084 | } | 3087 | } |
3085 | 3088 | ||
3086 | bool current_chrooted(void) | 3089 | bool current_chrooted(void) |
3087 | { | 3090 | { |
3088 | /* Does the current process have a non-standard root */ | 3091 | /* Does the current process have a non-standard root */ |
3089 | struct path ns_root; | 3092 | struct path ns_root; |
3090 | struct path fs_root; | 3093 | struct path fs_root; |
3091 | bool chrooted; | 3094 | bool chrooted; |
3092 | 3095 | ||
3093 | /* Find the namespace root */ | 3096 | /* Find the namespace root */ |
3094 | ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; | 3097 | ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; |
3095 | ns_root.dentry = ns_root.mnt->mnt_root; | 3098 | ns_root.dentry = ns_root.mnt->mnt_root; |
3096 | path_get(&ns_root); | 3099 | path_get(&ns_root); |
3097 | while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) | 3100 | while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) |
3098 | ; | 3101 | ; |
3099 | 3102 | ||
3100 | get_fs_root(current->fs, &fs_root); | 3103 | get_fs_root(current->fs, &fs_root); |
3101 | 3104 | ||
3102 | chrooted = !path_equal(&fs_root, &ns_root); | 3105 | chrooted = !path_equal(&fs_root, &ns_root); |
3103 | 3106 | ||
3104 | path_put(&fs_root); | 3107 | path_put(&fs_root); |
3105 | path_put(&ns_root); | 3108 | path_put(&ns_root); |
3106 | 3109 | ||
3107 | return chrooted; | 3110 | return chrooted; |
3108 | } | 3111 | } |
3109 | 3112 | ||
3110 | bool fs_fully_visible(struct file_system_type *type) | 3113 | bool fs_fully_visible(struct file_system_type *type) |
3111 | { | 3114 | { |
3112 | struct mnt_namespace *ns = current->nsproxy->mnt_ns; | 3115 | struct mnt_namespace *ns = current->nsproxy->mnt_ns; |
3113 | struct mount *mnt; | 3116 | struct mount *mnt; |
3114 | bool visible = false; | 3117 | bool visible = false; |
3115 | 3118 | ||
3116 | if (unlikely(!ns)) | 3119 | if (unlikely(!ns)) |
3117 | return false; | 3120 | return false; |
3118 | 3121 | ||
3119 | down_read(&namespace_sem); | 3122 | down_read(&namespace_sem); |
3120 | list_for_each_entry(mnt, &ns->list, mnt_list) { | 3123 | list_for_each_entry(mnt, &ns->list, mnt_list) { |
3121 | struct mount *child; | 3124 | struct mount *child; |
3122 | if (mnt->mnt.mnt_sb->s_type != type) | 3125 | if (mnt->mnt.mnt_sb->s_type != type) |
3123 | continue; | 3126 | continue; |
3124 | 3127 | ||
3125 | /* This mount is not fully visible if there are any child mounts | 3128 | /* This mount is not fully visible if there are any child mounts |
3126 | * that cover anything except for empty directories. | 3129 | * that cover anything except for empty directories. |
3127 | */ | 3130 | */ |
3128 | list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { | 3131 | list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { |
3129 | struct inode *inode = child->mnt_mountpoint->d_inode; | 3132 | struct inode *inode = child->mnt_mountpoint->d_inode; |
3130 | if (!S_ISDIR(inode->i_mode)) | 3133 | if (!S_ISDIR(inode->i_mode)) |
3131 | goto next; | 3134 | goto next; |
3132 | if (inode->i_nlink > 2) | 3135 | if (inode->i_nlink > 2) |
3133 | goto next; | 3136 | goto next; |
3134 | } | 3137 | } |
3135 | visible = true; | 3138 | visible = true; |
3136 | goto found; | 3139 | goto found; |
3137 | next: ; | 3140 | next: ; |
3138 | } | 3141 | } |
3139 | found: | 3142 | found: |
3140 | up_read(&namespace_sem); | 3143 | up_read(&namespace_sem); |
3141 | return visible; | 3144 | return visible; |
3142 | } | 3145 | } |
3143 | 3146 | ||
3144 | static struct ns_common *mntns_get(struct task_struct *task) | 3147 | static struct ns_common *mntns_get(struct task_struct *task) |
3145 | { | 3148 | { |
3146 | struct ns_common *ns = NULL; | 3149 | struct ns_common *ns = NULL; |
3147 | struct nsproxy *nsproxy; | 3150 | struct nsproxy *nsproxy; |
3148 | 3151 | ||
3149 | task_lock(task); | 3152 | task_lock(task); |
3150 | nsproxy = task->nsproxy; | 3153 | nsproxy = task->nsproxy; |
3151 | if (nsproxy) { | 3154 | if (nsproxy) { |
3152 | ns = &nsproxy->mnt_ns->ns; | 3155 | ns = &nsproxy->mnt_ns->ns; |
3153 | get_mnt_ns(to_mnt_ns(ns)); | 3156 | get_mnt_ns(to_mnt_ns(ns)); |
3154 | } | 3157 | } |
3155 | task_unlock(task); | 3158 | task_unlock(task); |
3156 | 3159 | ||
3157 | return ns; | 3160 | return ns; |
3158 | } | 3161 | } |
3159 | 3162 | ||
3160 | static void mntns_put(struct ns_common *ns) | 3163 | static void mntns_put(struct ns_common *ns) |
3161 | { | 3164 | { |
3162 | put_mnt_ns(to_mnt_ns(ns)); | 3165 | put_mnt_ns(to_mnt_ns(ns)); |
3163 | } | 3166 | } |
3164 | 3167 | ||
3165 | static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) | 3168 | static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) |
3166 | { | 3169 | { |
3167 | struct fs_struct *fs = current->fs; | 3170 | struct fs_struct *fs = current->fs; |
3168 | struct mnt_namespace *mnt_ns = to_mnt_ns(ns); | 3171 | struct mnt_namespace *mnt_ns = to_mnt_ns(ns); |
3169 | struct path root; | 3172 | struct path root; |
3170 | 3173 | ||
3171 | if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || | 3174 | if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || |
3172 | !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || | 3175 | !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || |
3173 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | 3176 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
3174 | return -EPERM; | 3177 | return -EPERM; |
3175 | 3178 | ||
3176 | if (fs->users != 1) | 3179 | if (fs->users != 1) |
3177 | return -EINVAL; | 3180 | return -EINVAL; |
3178 | 3181 | ||
3179 | get_mnt_ns(mnt_ns); | 3182 | get_mnt_ns(mnt_ns); |
3180 | put_mnt_ns(nsproxy->mnt_ns); | 3183 | put_mnt_ns(nsproxy->mnt_ns); |
3181 | nsproxy->mnt_ns = mnt_ns; | 3184 | nsproxy->mnt_ns = mnt_ns; |
3182 | 3185 | ||
3183 | /* Find the root */ | 3186 | /* Find the root */ |
3184 | root.mnt = &mnt_ns->root->mnt; | 3187 | root.mnt = &mnt_ns->root->mnt; |
3185 | root.dentry = mnt_ns->root->mnt.mnt_root; | 3188 | root.dentry = mnt_ns->root->mnt.mnt_root; |
3186 | path_get(&root); | 3189 | path_get(&root); |
3187 | while(d_mountpoint(root.dentry) && follow_down_one(&root)) | 3190 | while(d_mountpoint(root.dentry) && follow_down_one(&root)) |
3188 | ; | 3191 | ; |
3189 | 3192 | ||
3190 | /* Update the pwd and root */ | 3193 | /* Update the pwd and root */ |
3191 | set_fs_pwd(fs, &root); | 3194 | set_fs_pwd(fs, &root); |
3192 | set_fs_root(fs, &root); | 3195 | set_fs_root(fs, &root); |
3193 | 3196 | ||
3194 | path_put(&root); | 3197 | path_put(&root); |
3195 | return 0; | 3198 | return 0; |
3196 | } | 3199 | } |
3197 | 3200 | ||
3198 | const struct proc_ns_operations mntns_operations = { | 3201 | const struct proc_ns_operations mntns_operations = { |
3199 | .name = "mnt", | 3202 | .name = "mnt", |
3200 | .type = CLONE_NEWNS, | 3203 | .type = CLONE_NEWNS, |
3201 | .get = mntns_get, | 3204 | .get = mntns_get, |
3202 | .put = mntns_put, | 3205 | .put = mntns_put, |
3203 | .install = mntns_install, | 3206 | .install = mntns_install, |
3204 | }; | 3207 | }; |
3205 | 3208 |
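setns(2) on an open /proc/PID/ns/mnt file descriptor is what lands in mntns_install() above; note the capability checks and the fs->users == 1 requirement (no threads sharing the fs_struct). A userspace sketch, with the target PID path as an example:

/* Userspace sketch: join another task's mount namespace. */
#define _GNU_SOURCE
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "/proc/1/ns/mnt", O_RDONLY);
	if (fd == -1)
		return perror("open"), 1;
	if (setns(fd, CLONE_NEWNS) == -1)	/* -> mntns_install() */
		return perror("setns"), 1;
	close(fd);
	execl("/bin/sh", "sh", (char *)NULL);
	return perror("execl"), 1;
}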
fs/nsfs.c
File was created | 1 | #include <linux/mount.h> | |
2 | #include <linux/file.h> | ||
3 | #include <linux/fs.h> | ||
4 | #include <linux/proc_ns.h> | ||
5 | #include <linux/magic.h> | ||
6 | #include <linux/ktime.h> | ||
7 | |||
8 | static struct vfsmount *nsfs_mnt; | ||
9 | |||
10 | static const struct file_operations ns_file_operations = { | ||
11 | .llseek = no_llseek, | ||
12 | }; | ||
13 | |||
14 | static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) | ||
15 | { | ||
16 | struct inode *inode = dentry->d_inode; | ||
17 | const struct proc_ns_operations *ns_ops = dentry->d_fsdata; | ||
18 | |||
19 | return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", | ||
20 | ns_ops->name, inode->i_ino); | ||
21 | } | ||
22 | |||
23 | static void ns_prune_dentry(struct dentry *dentry) | ||
24 | { | ||
25 | struct inode *inode = dentry->d_inode; | ||
26 | if (inode) { | ||
27 | struct ns_common *ns = inode->i_private; | ||
28 | atomic_long_set(&ns->stashed, 0); | ||
29 | } | ||
30 | } | ||
31 | |||
32 | const struct dentry_operations ns_dentry_operations = | ||
33 | { | ||
34 | .d_prune = ns_prune_dentry, | ||
35 | .d_delete = always_delete_dentry, | ||
36 | .d_dname = ns_dname, | ||
37 | }; | ||
38 | |||
39 | static void nsfs_evict(struct inode *inode) | ||
40 | { | ||
41 | struct ns_common *ns = inode->i_private; | ||
42 | clear_inode(inode); | ||
43 | ns->ops->put(ns); | ||
44 | } | ||
45 | |||
46 | void *ns_get_path(struct path *path, struct task_struct *task, | ||
47 | const struct proc_ns_operations *ns_ops) | ||
48 | { | ||
49 | struct vfsmount *mnt = mntget(nsfs_mnt); | ||
50 | struct qstr qname = { .name = "", }; | ||
51 | struct dentry *dentry; | ||
52 | struct inode *inode; | ||
53 | struct ns_common *ns; | ||
54 | unsigned long d; | ||
55 | |||
56 | again: | ||
57 | ns = ns_ops->get(task); | ||
58 | if (!ns) { | ||
59 | mntput(mnt); | ||
60 | return ERR_PTR(-ENOENT); | ||
61 | } | ||
62 | rcu_read_lock(); | ||
63 | d = atomic_long_read(&ns->stashed); | ||
64 | if (!d) | ||
65 | goto slow; | ||
66 | dentry = (struct dentry *)d; | ||
67 | if (!lockref_get_not_dead(&dentry->d_lockref)) | ||
68 | goto slow; | ||
69 | rcu_read_unlock(); | ||
70 | ns_ops->put(ns); | ||
71 | got_it: | ||
72 | path->mnt = mnt; | ||
73 | path->dentry = dentry; | ||
74 | return NULL; | ||
75 | slow: | ||
76 | rcu_read_unlock(); | ||
77 | inode = new_inode_pseudo(mnt->mnt_sb); | ||
78 | if (!inode) { | ||
79 | ns_ops->put(ns); | ||
80 | mntput(mnt); | ||
81 | return ERR_PTR(-ENOMEM); | ||
82 | } | ||
83 | inode->i_ino = ns->inum; | ||
84 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | ||
85 | inode->i_flags |= S_IMMUTABLE; | ||
86 | inode->i_mode = S_IFREG | S_IRUGO; | ||
87 | inode->i_fop = &ns_file_operations; | ||
88 | inode->i_private = ns; | ||
89 | |||
90 | dentry = d_alloc_pseudo(mnt->mnt_sb, &qname); | ||
91 | if (!dentry) { | ||
92 | iput(inode); | ||
93 | mntput(mnt); | ||
94 | return ERR_PTR(-ENOMEM); | ||
95 | } | ||
96 | d_instantiate(dentry, inode); | ||
97 | dentry->d_fsdata = (void *)ns_ops; | ||
98 | d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); | ||
99 | if (d) { | ||
100 | d_delete(dentry); /* make sure ->d_prune() does nothing */ | ||
101 | dput(dentry); | ||
102 | cpu_relax(); | ||
103 | goto again; | ||
104 | } | ||
105 | goto got_it; | ||
106 | } | ||
107 | |||
108 | int ns_get_name(char *buf, size_t size, struct task_struct *task, | ||
109 | const struct proc_ns_operations *ns_ops) | ||
110 | { | ||
111 | struct ns_common *ns; | ||
112 | int res = -ENOENT; | ||
113 | ns = ns_ops->get(task); | ||
114 | if (ns) { | ||
115 | res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); | ||
116 | ns_ops->put(ns); | ||
117 | } | ||
118 | return res; | ||
119 | } | ||
120 | |||
121 | struct file *proc_ns_fget(int fd) | ||
122 | { | ||
123 | struct file *file; | ||
124 | |||
125 | file = fget(fd); | ||
126 | if (!file) | ||
127 | return ERR_PTR(-EBADF); | ||
128 | |||
129 | if (file->f_op != &ns_file_operations) | ||
130 | goto out_invalid; | ||
131 | |||
132 | return file; | ||
133 | |||
134 | out_invalid: | ||
135 | fput(file); | ||
136 | return ERR_PTR(-EINVAL); | ||
137 | } | ||
138 | |||
139 | static const struct super_operations nsfs_ops = { | ||
140 | .statfs = simple_statfs, | ||
141 | .evict_inode = nsfs_evict, | ||
142 | }; | ||
143 | static struct dentry *nsfs_mount(struct file_system_type *fs_type, | ||
144 | int flags, const char *dev_name, void *data) | ||
145 | { | ||
146 | return mount_pseudo(fs_type, "nsfs:", &nsfs_ops, | ||
147 | &ns_dentry_operations, NSFS_MAGIC); | ||
148 | } | ||
149 | static struct file_system_type nsfs = { | ||
150 | .name = "nsfs", | ||
151 | .mount = nsfs_mount, | ||
152 | .kill_sb = kill_anon_super, | ||
153 | }; | ||
154 | |||
155 | void __init nsfs_init(void) | ||
156 | { | ||
157 | nsfs_mnt = kern_mount(&nsfs); | ||
158 | if (IS_ERR(nsfs_mnt)) | ||
159 | panic("can't set nsfs up\n"); | ||
160 | nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; | ||
161 | } | ||
162 |
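ns_get_path() above resolves the stash race in two halves: the fast path reads ns->stashed under RCU and tries lockref_get_not_dead() on the dentry; the slow path builds a fresh inode/dentry pair and publishes it with atomic_long_cmpxchg(), and a loser deletes its own dentry (so ->d_prune() stays a no-op), drops it, and retries. A toy C11 model of just that publish/reuse pattern, with the VFS specifics stripped out and no kernel API used:

#include <stdatomic.h>
#include <stddef.h>

/* "slot" plays the role of ns_common.stashed; the object is the dentry */
struct stash { _Atomic(void *) slot; };

static void *stash_reuse(struct stash *s, int (*try_grab)(void *))
{
	void *obj = atomic_load(&s->slot);

	if (obj && try_grab(obj))	/* lockref_get_not_dead() analogue */
		return obj;		/* fast path: reuse the stashed object */
	return NULL;			/* caller falls through to the slow path */
}

static int stash_publish(struct stash *s, void *obj)
{
	void *expected = NULL;

	/* atomic_long_cmpxchg() analogue: only one builder wins; a loser
	 * drops its object and retries the fast path ("goto again") */
	return atomic_compare_exchange_strong(&s->slot, &expected, obj);
}

Because the stash is a non-counting pointer, ns_prune_dentry() only has to zero the slot when the dentry goes away; nothing pins the dentry, so eviction never waits on the stash.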
fs/proc/inode.c
1 | /* | 1 | /* |
2 | * linux/fs/proc/inode.c | 2 | * linux/fs/proc/inode.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/time.h> | 7 | #include <linux/time.h> |
8 | #include <linux/proc_fs.h> | 8 | #include <linux/proc_fs.h> |
9 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
10 | #include <linux/pid_namespace.h> | 10 | #include <linux/pid_namespace.h> |
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/string.h> | 12 | #include <linux/string.h> |
13 | #include <linux/stat.h> | 13 | #include <linux/stat.h> |
14 | #include <linux/completion.h> | 14 | #include <linux/completion.h> |
15 | #include <linux/poll.h> | 15 | #include <linux/poll.h> |
16 | #include <linux/printk.h> | 16 | #include <linux/printk.h> |
17 | #include <linux/file.h> | 17 | #include <linux/file.h> |
18 | #include <linux/limits.h> | 18 | #include <linux/limits.h> |
19 | #include <linux/init.h> | 19 | #include <linux/init.h> |
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <linux/sysctl.h> | 21 | #include <linux/sysctl.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
25 | #include <linux/magic.h> | 25 | #include <linux/magic.h> |
26 | 26 | ||
27 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
28 | 28 | ||
29 | #include "internal.h" | 29 | #include "internal.h" |
30 | 30 | ||
31 | static void proc_evict_inode(struct inode *inode) | 31 | static void proc_evict_inode(struct inode *inode) |
32 | { | 32 | { |
33 | struct proc_dir_entry *de; | 33 | struct proc_dir_entry *de; |
34 | struct ctl_table_header *head; | 34 | struct ctl_table_header *head; |
35 | struct ns_common *ns; | ||
36 | 35 | ||
37 | truncate_inode_pages_final(&inode->i_data); | 36 | truncate_inode_pages_final(&inode->i_data); |
38 | clear_inode(inode); | 37 | clear_inode(inode); |
39 | 38 | ||
40 | /* Stop tracking associated processes */ | 39 | /* Stop tracking associated processes */ |
41 | put_pid(PROC_I(inode)->pid); | 40 | put_pid(PROC_I(inode)->pid); |
42 | 41 | ||
43 | /* Let go of any associated proc directory entry */ | 42 | /* Let go of any associated proc directory entry */ |
44 | de = PROC_I(inode)->pde; | 43 | de = PROC_I(inode)->pde; |
45 | if (de) | 44 | if (de) |
46 | pde_put(de); | 45 | pde_put(de); |
47 | head = PROC_I(inode)->sysctl; | 46 | head = PROC_I(inode)->sysctl; |
48 | if (head) { | 47 | if (head) { |
49 | RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); | 48 | RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); |
50 | sysctl_head_put(head); | 49 | sysctl_head_put(head); |
51 | } | 50 | } |
52 | /* Release any associated namespace */ | ||
53 | ns = PROC_I(inode)->ns.ns; | ||
54 | if (ns && ns->ops) | ||
55 | ns->ops->put(ns); | ||
56 | } | 51 | } |
57 | 52 | ||
58 | static struct kmem_cache * proc_inode_cachep; | 53 | static struct kmem_cache * proc_inode_cachep; |
59 | 54 | ||
60 | static struct inode *proc_alloc_inode(struct super_block *sb) | 55 | static struct inode *proc_alloc_inode(struct super_block *sb) |
61 | { | 56 | { |
62 | struct proc_inode *ei; | 57 | struct proc_inode *ei; |
63 | struct inode *inode; | 58 | struct inode *inode; |
64 | 59 | ||
65 | ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); | 60 | ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); |
66 | if (!ei) | 61 | if (!ei) |
67 | return NULL; | 62 | return NULL; |
68 | ei->pid = NULL; | 63 | ei->pid = NULL; |
69 | ei->fd = 0; | 64 | ei->fd = 0; |
70 | ei->op.proc_get_link = NULL; | 65 | ei->op.proc_get_link = NULL; |
71 | ei->pde = NULL; | 66 | ei->pde = NULL; |
72 | ei->sysctl = NULL; | 67 | ei->sysctl = NULL; |
73 | ei->sysctl_entry = NULL; | 68 | ei->sysctl_entry = NULL; |
74 | ei->ns.ns = NULL; | 69 | ei->ns.ns = NULL; |
75 | ei->ns.ns_ops = NULL; | 70 | ei->ns.ns_ops = NULL; |
76 | inode = &ei->vfs_inode; | 71 | inode = &ei->vfs_inode; |
77 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 72 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
78 | return inode; | 73 | return inode; |
79 | } | 74 | } |
80 | 75 | ||
81 | static void proc_i_callback(struct rcu_head *head) | 76 | static void proc_i_callback(struct rcu_head *head) |
82 | { | 77 | { |
83 | struct inode *inode = container_of(head, struct inode, i_rcu); | 78 | struct inode *inode = container_of(head, struct inode, i_rcu); |
84 | kmem_cache_free(proc_inode_cachep, PROC_I(inode)); | 79 | kmem_cache_free(proc_inode_cachep, PROC_I(inode)); |
85 | } | 80 | } |
86 | 81 | ||
87 | static void proc_destroy_inode(struct inode *inode) | 82 | static void proc_destroy_inode(struct inode *inode) |
88 | { | 83 | { |
89 | call_rcu(&inode->i_rcu, proc_i_callback); | 84 | call_rcu(&inode->i_rcu, proc_i_callback); |
90 | } | 85 | } |
91 | 86 | ||
92 | static void init_once(void *foo) | 87 | static void init_once(void *foo) |
93 | { | 88 | { |
94 | struct proc_inode *ei = (struct proc_inode *) foo; | 89 | struct proc_inode *ei = (struct proc_inode *) foo; |
95 | 90 | ||
96 | inode_init_once(&ei->vfs_inode); | 91 | inode_init_once(&ei->vfs_inode); |
97 | } | 92 | } |
98 | 93 | ||
99 | void __init proc_init_inodecache(void) | 94 | void __init proc_init_inodecache(void) |
100 | { | 95 | { |
101 | proc_inode_cachep = kmem_cache_create("proc_inode_cache", | 96 | proc_inode_cachep = kmem_cache_create("proc_inode_cache", |
102 | sizeof(struct proc_inode), | 97 | sizeof(struct proc_inode), |
103 | 0, (SLAB_RECLAIM_ACCOUNT| | 98 | 0, (SLAB_RECLAIM_ACCOUNT| |
104 | SLAB_MEM_SPREAD|SLAB_PANIC), | 99 | SLAB_MEM_SPREAD|SLAB_PANIC), |
105 | init_once); | 100 | init_once); |
106 | } | 101 | } |
107 | 102 | ||
108 | static int proc_show_options(struct seq_file *seq, struct dentry *root) | 103 | static int proc_show_options(struct seq_file *seq, struct dentry *root) |
109 | { | 104 | { |
110 | struct super_block *sb = root->d_sb; | 105 | struct super_block *sb = root->d_sb; |
111 | struct pid_namespace *pid = sb->s_fs_info; | 106 | struct pid_namespace *pid = sb->s_fs_info; |
112 | 107 | ||
113 | if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) | 108 | if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) |
114 | seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); | 109 | seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); |
115 | if (pid->hide_pid != 0) | 110 | if (pid->hide_pid != 0) |
116 | seq_printf(seq, ",hidepid=%u", pid->hide_pid); | 111 | seq_printf(seq, ",hidepid=%u", pid->hide_pid); |
117 | 112 | ||
118 | return 0; | 113 | return 0; |
119 | } | 114 | } |
120 | 115 | ||
121 | static const struct super_operations proc_sops = { | 116 | static const struct super_operations proc_sops = { |
122 | .alloc_inode = proc_alloc_inode, | 117 | .alloc_inode = proc_alloc_inode, |
123 | .destroy_inode = proc_destroy_inode, | 118 | .destroy_inode = proc_destroy_inode, |
124 | .drop_inode = generic_delete_inode, | 119 | .drop_inode = generic_delete_inode, |
125 | .evict_inode = proc_evict_inode, | 120 | .evict_inode = proc_evict_inode, |
126 | .statfs = simple_statfs, | 121 | .statfs = simple_statfs, |
127 | .remount_fs = proc_remount, | 122 | .remount_fs = proc_remount, |
128 | .show_options = proc_show_options, | 123 | .show_options = proc_show_options, |
129 | }; | 124 | }; |
130 | 125 | ||
131 | enum {BIAS = -1U<<31}; | 126 | enum {BIAS = -1U<<31}; |
132 | 127 | ||
133 | static inline int use_pde(struct proc_dir_entry *pde) | 128 | static inline int use_pde(struct proc_dir_entry *pde) |
134 | { | 129 | { |
135 | return atomic_inc_unless_negative(&pde->in_use); | 130 | return atomic_inc_unless_negative(&pde->in_use); |
136 | } | 131 | } |
137 | 132 | ||
138 | static void unuse_pde(struct proc_dir_entry *pde) | 133 | static void unuse_pde(struct proc_dir_entry *pde) |
139 | { | 134 | { |
140 | if (atomic_dec_return(&pde->in_use) == BIAS) | 135 | if (atomic_dec_return(&pde->in_use) == BIAS) |
141 | complete(pde->pde_unload_completion); | 136 | complete(pde->pde_unload_completion); |
142 | } | 137 | } |
143 | 138 | ||
144 | /* pde is locked */ | 139 | /* pde is locked */ |
145 | static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) | 140 | static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) |
146 | { | 141 | { |
147 | if (pdeo->closing) { | 142 | if (pdeo->closing) { |
148 | /* somebody else is doing that, just wait */ | 143 | /* somebody else is doing that, just wait */ |
149 | DECLARE_COMPLETION_ONSTACK(c); | 144 | DECLARE_COMPLETION_ONSTACK(c); |
150 | pdeo->c = &c; | 145 | pdeo->c = &c; |
151 | spin_unlock(&pde->pde_unload_lock); | 146 | spin_unlock(&pde->pde_unload_lock); |
152 | wait_for_completion(&c); | 147 | wait_for_completion(&c); |
153 | spin_lock(&pde->pde_unload_lock); | 148 | spin_lock(&pde->pde_unload_lock); |
154 | } else { | 149 | } else { |
155 | struct file *file; | 150 | struct file *file; |
156 | pdeo->closing = 1; | 151 | pdeo->closing = 1; |
157 | spin_unlock(&pde->pde_unload_lock); | 152 | spin_unlock(&pde->pde_unload_lock); |
158 | file = pdeo->file; | 153 | file = pdeo->file; |
159 | pde->proc_fops->release(file_inode(file), file); | 154 | pde->proc_fops->release(file_inode(file), file); |
160 | spin_lock(&pde->pde_unload_lock); | 155 | spin_lock(&pde->pde_unload_lock); |
161 | list_del_init(&pdeo->lh); | 156 | list_del_init(&pdeo->lh); |
162 | if (pdeo->c) | 157 | if (pdeo->c) |
163 | complete(pdeo->c); | 158 | complete(pdeo->c); |
164 | kfree(pdeo); | 159 | kfree(pdeo); |
165 | } | 160 | } |
166 | } | 161 | } |
167 | 162 | ||
168 | void proc_entry_rundown(struct proc_dir_entry *de) | 163 | void proc_entry_rundown(struct proc_dir_entry *de) |
169 | { | 164 | { |
170 | DECLARE_COMPLETION_ONSTACK(c); | 165 | DECLARE_COMPLETION_ONSTACK(c); |
171 | /* Wait until all existing callers into module are done. */ | 166 | /* Wait until all existing callers into module are done. */ |
172 | de->pde_unload_completion = &c; | 167 | de->pde_unload_completion = &c; |
173 | if (atomic_add_return(BIAS, &de->in_use) != BIAS) | 168 | if (atomic_add_return(BIAS, &de->in_use) != BIAS) |
174 | wait_for_completion(&c); | 169 | wait_for_completion(&c); |
175 | 170 | ||
176 | spin_lock(&de->pde_unload_lock); | 171 | spin_lock(&de->pde_unload_lock); |
177 | while (!list_empty(&de->pde_openers)) { | 172 | while (!list_empty(&de->pde_openers)) { |
178 | struct pde_opener *pdeo; | 173 | struct pde_opener *pdeo; |
179 | pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); | 174 | pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); |
180 | close_pdeo(de, pdeo); | 175 | close_pdeo(de, pdeo); |
181 | } | 176 | } |
182 | spin_unlock(&de->pde_unload_lock); | 177 | spin_unlock(&de->pde_unload_lock); |
183 | } | 178 | } |
184 | 179 | ||
185 | static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) | 180 | static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) |
186 | { | 181 | { |
187 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 182 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
188 | loff_t rv = -EINVAL; | 183 | loff_t rv = -EINVAL; |
189 | if (use_pde(pde)) { | 184 | if (use_pde(pde)) { |
190 | loff_t (*llseek)(struct file *, loff_t, int); | 185 | loff_t (*llseek)(struct file *, loff_t, int); |
191 | llseek = pde->proc_fops->llseek; | 186 | llseek = pde->proc_fops->llseek; |
192 | if (!llseek) | 187 | if (!llseek) |
193 | llseek = default_llseek; | 188 | llseek = default_llseek; |
194 | rv = llseek(file, offset, whence); | 189 | rv = llseek(file, offset, whence); |
195 | unuse_pde(pde); | 190 | unuse_pde(pde); |
196 | } | 191 | } |
197 | return rv; | 192 | return rv; |
198 | } | 193 | } |
199 | 194 | ||
200 | static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | 195 | static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) |
201 | { | 196 | { |
202 | ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); | 197 | ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); |
203 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 198 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
204 | ssize_t rv = -EIO; | 199 | ssize_t rv = -EIO; |
205 | if (use_pde(pde)) { | 200 | if (use_pde(pde)) { |
206 | read = pde->proc_fops->read; | 201 | read = pde->proc_fops->read; |
207 | if (read) | 202 | if (read) |
208 | rv = read(file, buf, count, ppos); | 203 | rv = read(file, buf, count, ppos); |
209 | unuse_pde(pde); | 204 | unuse_pde(pde); |
210 | } | 205 | } |
211 | return rv; | 206 | return rv; |
212 | } | 207 | } |
213 | 208 | ||
214 | static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | 209 | static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) |
215 | { | 210 | { |
216 | ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); | 211 | ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); |
217 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 212 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
218 | ssize_t rv = -EIO; | 213 | ssize_t rv = -EIO; |
219 | if (use_pde(pde)) { | 214 | if (use_pde(pde)) { |
220 | write = pde->proc_fops->write; | 215 | write = pde->proc_fops->write; |
221 | if (write) | 216 | if (write) |
222 | rv = write(file, buf, count, ppos); | 217 | rv = write(file, buf, count, ppos); |
223 | unuse_pde(pde); | 218 | unuse_pde(pde); |
224 | } | 219 | } |
225 | return rv; | 220 | return rv; |
226 | } | 221 | } |
227 | 222 | ||
228 | static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) | 223 | static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) |
229 | { | 224 | { |
230 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 225 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
231 | unsigned int rv = DEFAULT_POLLMASK; | 226 | unsigned int rv = DEFAULT_POLLMASK; |
232 | unsigned int (*poll)(struct file *, struct poll_table_struct *); | 227 | unsigned int (*poll)(struct file *, struct poll_table_struct *); |
233 | if (use_pde(pde)) { | 228 | if (use_pde(pde)) { |
234 | poll = pde->proc_fops->poll; | 229 | poll = pde->proc_fops->poll; |
235 | if (poll) | 230 | if (poll) |
236 | rv = poll(file, pts); | 231 | rv = poll(file, pts); |
237 | unuse_pde(pde); | 232 | unuse_pde(pde); |
238 | } | 233 | } |
239 | return rv; | 234 | return rv; |
240 | } | 235 | } |
241 | 236 | ||
242 | static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | 237 | static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
243 | { | 238 | { |
244 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 239 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
245 | long rv = -ENOTTY; | 240 | long rv = -ENOTTY; |
246 | long (*ioctl)(struct file *, unsigned int, unsigned long); | 241 | long (*ioctl)(struct file *, unsigned int, unsigned long); |
247 | if (use_pde(pde)) { | 242 | if (use_pde(pde)) { |
248 | ioctl = pde->proc_fops->unlocked_ioctl; | 243 | ioctl = pde->proc_fops->unlocked_ioctl; |
249 | if (ioctl) | 244 | if (ioctl) |
250 | rv = ioctl(file, cmd, arg); | 245 | rv = ioctl(file, cmd, arg); |
251 | unuse_pde(pde); | 246 | unuse_pde(pde); |
252 | } | 247 | } |
253 | return rv; | 248 | return rv; |
254 | } | 249 | } |
255 | 250 | ||
256 | #ifdef CONFIG_COMPAT | 251 | #ifdef CONFIG_COMPAT |
257 | static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | 252 | static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
258 | { | 253 | { |
259 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 254 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
260 | long rv = -ENOTTY; | 255 | long rv = -ENOTTY; |
261 | long (*compat_ioctl)(struct file *, unsigned int, unsigned long); | 256 | long (*compat_ioctl)(struct file *, unsigned int, unsigned long); |
262 | if (use_pde(pde)) { | 257 | if (use_pde(pde)) { |
263 | compat_ioctl = pde->proc_fops->compat_ioctl; | 258 | compat_ioctl = pde->proc_fops->compat_ioctl; |
264 | if (compat_ioctl) | 259 | if (compat_ioctl) |
265 | rv = compat_ioctl(file, cmd, arg); | 260 | rv = compat_ioctl(file, cmd, arg); |
266 | unuse_pde(pde); | 261 | unuse_pde(pde); |
267 | } | 262 | } |
268 | return rv; | 263 | return rv; |
269 | } | 264 | } |
270 | #endif | 265 | #endif |
271 | 266 | ||
272 | static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) | 267 | static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) |
273 | { | 268 | { |
274 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 269 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
275 | int rv = -EIO; | 270 | int rv = -EIO; |
276 | int (*mmap)(struct file *, struct vm_area_struct *); | 271 | int (*mmap)(struct file *, struct vm_area_struct *); |
277 | if (use_pde(pde)) { | 272 | if (use_pde(pde)) { |
278 | mmap = pde->proc_fops->mmap; | 273 | mmap = pde->proc_fops->mmap; |
279 | if (mmap) | 274 | if (mmap) |
280 | rv = mmap(file, vma); | 275 | rv = mmap(file, vma); |
281 | unuse_pde(pde); | 276 | unuse_pde(pde); |
282 | } | 277 | } |
283 | return rv; | 278 | return rv; |
284 | } | 279 | } |
285 | 280 | ||
286 | static unsigned long | 281 | static unsigned long |
287 | proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, | 282 | proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, |
288 | unsigned long len, unsigned long pgoff, | 283 | unsigned long len, unsigned long pgoff, |
289 | unsigned long flags) | 284 | unsigned long flags) |
290 | { | 285 | { |
291 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 286 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
292 | unsigned long rv = -EIO; | 287 | unsigned long rv = -EIO; |
293 | 288 | ||
294 | if (use_pde(pde)) { | 289 | if (use_pde(pde)) { |
295 | typeof(proc_reg_get_unmapped_area) *get_area; | 290 | typeof(proc_reg_get_unmapped_area) *get_area; |
296 | 291 | ||
297 | get_area = pde->proc_fops->get_unmapped_area; | 292 | get_area = pde->proc_fops->get_unmapped_area; |
298 | #ifdef CONFIG_MMU | 293 | #ifdef CONFIG_MMU |
299 | if (!get_area) | 294 | if (!get_area) |
300 | get_area = current->mm->get_unmapped_area; | 295 | get_area = current->mm->get_unmapped_area; |
301 | #endif | 296 | #endif |
302 | 297 | ||
303 | if (get_area) | 298 | if (get_area) |
304 | rv = get_area(file, orig_addr, len, pgoff, flags); | 299 | rv = get_area(file, orig_addr, len, pgoff, flags); |
305 | else | 300 | else |
306 | rv = orig_addr; | 301 | rv = orig_addr; |
307 | unuse_pde(pde); | 302 | unuse_pde(pde); |
308 | } | 303 | } |
309 | return rv; | 304 | return rv; |
310 | } | 305 | } |
311 | 306 | ||
312 | static int proc_reg_open(struct inode *inode, struct file *file) | 307 | static int proc_reg_open(struct inode *inode, struct file *file) |
313 | { | 308 | { |
314 | struct proc_dir_entry *pde = PDE(inode); | 309 | struct proc_dir_entry *pde = PDE(inode); |
315 | int rv = 0; | 310 | int rv = 0; |
316 | int (*open)(struct inode *, struct file *); | 311 | int (*open)(struct inode *, struct file *); |
317 | int (*release)(struct inode *, struct file *); | 312 | int (*release)(struct inode *, struct file *); |
318 | struct pde_opener *pdeo; | 313 | struct pde_opener *pdeo; |
319 | 314 | ||
320 | /* | 315 | /* |
321 | * What for, you ask? Well, we can have open, rmmod, remove_proc_entry | 316 | * What for, you ask? Well, we can have open, rmmod, remove_proc_entry |
322 | * sequence. ->release won't be called because ->proc_fops will be | 317 | * sequence. ->release won't be called because ->proc_fops will be |
323 | * cleared. Depending on complexity of ->release, consequences vary. | 318 | * cleared. Depending on complexity of ->release, consequences vary. |
324 | * | 319 | * |
325 | * We can't wait for mercy when close will be done for real, it's | 320 | * We can't wait for mercy when close will be done for real, it's |
326 | * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release | 321 | * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release |
327 | * by hand in remove_proc_entry(). For this, save opener's credentials | 322 | * by hand in remove_proc_entry(). For this, save opener's credentials |
328 | * for later. | 323 | * for later. |
329 | */ | 324 | */ |
330 | pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); | 325 | pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); |
331 | if (!pdeo) | 326 | if (!pdeo) |
332 | return -ENOMEM; | 327 | return -ENOMEM; |
333 | 328 | ||
334 | if (!use_pde(pde)) { | 329 | if (!use_pde(pde)) { |
335 | kfree(pdeo); | 330 | kfree(pdeo); |
336 | return -ENOENT; | 331 | return -ENOENT; |
337 | } | 332 | } |
338 | open = pde->proc_fops->open; | 333 | open = pde->proc_fops->open; |
339 | release = pde->proc_fops->release; | 334 | release = pde->proc_fops->release; |
340 | 335 | ||
341 | if (open) | 336 | if (open) |
342 | rv = open(inode, file); | 337 | rv = open(inode, file); |
343 | 338 | ||
344 | if (rv == 0 && release) { | 339 | if (rv == 0 && release) { |
345 | /* To know what to release. */ | 340 | /* To know what to release. */ |
346 | pdeo->file = file; | 341 | pdeo->file = file; |
347 | /* Strictly for "too late" ->release in proc_reg_release(). */ | 342 | /* Strictly for "too late" ->release in proc_reg_release(). */ |
348 | spin_lock(&pde->pde_unload_lock); | 343 | spin_lock(&pde->pde_unload_lock); |
349 | list_add(&pdeo->lh, &pde->pde_openers); | 344 | list_add(&pdeo->lh, &pde->pde_openers); |
350 | spin_unlock(&pde->pde_unload_lock); | 345 | spin_unlock(&pde->pde_unload_lock); |
351 | } else | 346 | } else |
352 | kfree(pdeo); | 347 | kfree(pdeo); |
353 | 348 | ||
354 | unuse_pde(pde); | 349 | unuse_pde(pde); |
355 | return rv; | 350 | return rv; |
356 | } | 351 | } |
357 | 352 | ||
358 | static int proc_reg_release(struct inode *inode, struct file *file) | 353 | static int proc_reg_release(struct inode *inode, struct file *file) |
359 | { | 354 | { |
360 | struct proc_dir_entry *pde = PDE(inode); | 355 | struct proc_dir_entry *pde = PDE(inode); |
361 | struct pde_opener *pdeo; | 356 | struct pde_opener *pdeo; |
362 | spin_lock(&pde->pde_unload_lock); | 357 | spin_lock(&pde->pde_unload_lock); |
363 | list_for_each_entry(pdeo, &pde->pde_openers, lh) { | 358 | list_for_each_entry(pdeo, &pde->pde_openers, lh) { |
364 | if (pdeo->file == file) { | 359 | if (pdeo->file == file) { |
365 | close_pdeo(pde, pdeo); | 360 | close_pdeo(pde, pdeo); |
366 | break; | 361 | break; |
367 | } | 362 | } |
368 | } | 363 | } |
369 | spin_unlock(&pde->pde_unload_lock); | 364 | spin_unlock(&pde->pde_unload_lock); |
370 | return 0; | 365 | return 0; |
371 | } | 366 | } |
372 | 367 | ||
373 | static const struct file_operations proc_reg_file_ops = { | 368 | static const struct file_operations proc_reg_file_ops = { |
374 | .llseek = proc_reg_llseek, | 369 | .llseek = proc_reg_llseek, |
375 | .read = proc_reg_read, | 370 | .read = proc_reg_read, |
376 | .write = proc_reg_write, | 371 | .write = proc_reg_write, |
377 | .poll = proc_reg_poll, | 372 | .poll = proc_reg_poll, |
378 | .unlocked_ioctl = proc_reg_unlocked_ioctl, | 373 | .unlocked_ioctl = proc_reg_unlocked_ioctl, |
379 | #ifdef CONFIG_COMPAT | 374 | #ifdef CONFIG_COMPAT |
380 | .compat_ioctl = proc_reg_compat_ioctl, | 375 | .compat_ioctl = proc_reg_compat_ioctl, |
381 | #endif | 376 | #endif |
382 | .mmap = proc_reg_mmap, | 377 | .mmap = proc_reg_mmap, |
383 | .get_unmapped_area = proc_reg_get_unmapped_area, | 378 | .get_unmapped_area = proc_reg_get_unmapped_area, |
384 | .open = proc_reg_open, | 379 | .open = proc_reg_open, |
385 | .release = proc_reg_release, | 380 | .release = proc_reg_release, |
386 | }; | 381 | }; |
387 | 382 | ||
388 | #ifdef CONFIG_COMPAT | 383 | #ifdef CONFIG_COMPAT |
389 | static const struct file_operations proc_reg_file_ops_no_compat = { | 384 | static const struct file_operations proc_reg_file_ops_no_compat = { |
390 | .llseek = proc_reg_llseek, | 385 | .llseek = proc_reg_llseek, |
391 | .read = proc_reg_read, | 386 | .read = proc_reg_read, |
392 | .write = proc_reg_write, | 387 | .write = proc_reg_write, |
393 | .poll = proc_reg_poll, | 388 | .poll = proc_reg_poll, |
394 | .unlocked_ioctl = proc_reg_unlocked_ioctl, | 389 | .unlocked_ioctl = proc_reg_unlocked_ioctl, |
395 | .mmap = proc_reg_mmap, | 390 | .mmap = proc_reg_mmap, |
396 | .get_unmapped_area = proc_reg_get_unmapped_area, | 391 | .get_unmapped_area = proc_reg_get_unmapped_area, |
397 | .open = proc_reg_open, | 392 | .open = proc_reg_open, |
398 | .release = proc_reg_release, | 393 | .release = proc_reg_release, |
399 | }; | 394 | }; |
400 | #endif | 395 | #endif |
401 | 396 | ||
402 | struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) | 397 | struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) |
403 | { | 398 | { |
404 | struct inode *inode = new_inode_pseudo(sb); | 399 | struct inode *inode = new_inode_pseudo(sb); |
405 | 400 | ||
406 | if (inode) { | 401 | if (inode) { |
407 | inode->i_ino = de->low_ino; | 402 | inode->i_ino = de->low_ino; |
408 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 403 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
409 | PROC_I(inode)->pde = de; | 404 | PROC_I(inode)->pde = de; |
410 | 405 | ||
411 | if (de->mode) { | 406 | if (de->mode) { |
412 | inode->i_mode = de->mode; | 407 | inode->i_mode = de->mode; |
413 | inode->i_uid = de->uid; | 408 | inode->i_uid = de->uid; |
414 | inode->i_gid = de->gid; | 409 | inode->i_gid = de->gid; |
415 | } | 410 | } |
416 | if (de->size) | 411 | if (de->size) |
417 | inode->i_size = de->size; | 412 | inode->i_size = de->size; |
418 | if (de->nlink) | 413 | if (de->nlink) |
419 | set_nlink(inode, de->nlink); | 414 | set_nlink(inode, de->nlink); |
420 | WARN_ON(!de->proc_iops); | 415 | WARN_ON(!de->proc_iops); |
421 | inode->i_op = de->proc_iops; | 416 | inode->i_op = de->proc_iops; |
422 | if (de->proc_fops) { | 417 | if (de->proc_fops) { |
423 | if (S_ISREG(inode->i_mode)) { | 418 | if (S_ISREG(inode->i_mode)) { |
424 | #ifdef CONFIG_COMPAT | 419 | #ifdef CONFIG_COMPAT |
425 | if (!de->proc_fops->compat_ioctl) | 420 | if (!de->proc_fops->compat_ioctl) |
426 | inode->i_fop = | 421 | inode->i_fop = |
427 | &proc_reg_file_ops_no_compat; | 422 | &proc_reg_file_ops_no_compat; |
428 | else | 423 | else |
429 | #endif | 424 | #endif |
430 | inode->i_fop = &proc_reg_file_ops; | 425 | inode->i_fop = &proc_reg_file_ops; |
431 | } else { | 426 | } else { |
432 | inode->i_fop = de->proc_fops; | 427 | inode->i_fop = de->proc_fops; |
433 | } | 428 | } |
434 | } | 429 | } |
435 | } else | 430 | } else |
436 | pde_put(de); | 431 | pde_put(de); |
437 | return inode; | 432 | return inode; |
438 | } | 433 | } |
439 | 434 | ||
440 | int proc_fill_super(struct super_block *s) | 435 | int proc_fill_super(struct super_block *s) |
441 | { | 436 | { |
442 | struct inode *root_inode; | 437 | struct inode *root_inode; |
443 | int ret; | 438 | int ret; |
444 | 439 | ||
445 | s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; | 440 | s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; |
446 | s->s_blocksize = 1024; | 441 | s->s_blocksize = 1024; |
447 | s->s_blocksize_bits = 10; | 442 | s->s_blocksize_bits = 10; |
448 | s->s_magic = PROC_SUPER_MAGIC; | 443 | s->s_magic = PROC_SUPER_MAGIC; |
449 | s->s_op = &proc_sops; | 444 | s->s_op = &proc_sops; |
450 | s->s_time_gran = 1; | 445 | s->s_time_gran = 1; |
451 | 446 | ||
452 | pde_get(&proc_root); | 447 | pde_get(&proc_root); |
453 | root_inode = proc_get_inode(s, &proc_root); | 448 | root_inode = proc_get_inode(s, &proc_root); |
454 | if (!root_inode) { | 449 | if (!root_inode) { |
455 | pr_err("proc_fill_super: get root inode failed\n"); | 450 | pr_err("proc_fill_super: get root inode failed\n"); |
456 | return -ENOMEM; | 451 | return -ENOMEM; |
457 | } | 452 | } |
458 | 453 | ||
459 | s->s_root = d_make_root(root_inode); | 454 | s->s_root = d_make_root(root_inode); |
460 | if (!s->s_root) { | 455 | if (!s->s_root) { |
461 | pr_err("proc_fill_super: allocate dentry failed\n"); | 456 | pr_err("proc_fill_super: allocate dentry failed\n"); |
462 | return -ENOMEM; | 457 | return -ENOMEM; |
463 | } | 458 | } |
464 | 459 | ||
465 | ret = proc_setup_self(s); | 460 | ret = proc_setup_self(s); |
466 | if (ret) { | 461 | if (ret) { |
467 | return ret; | 462 | return ret; |
468 | } | 463 | } |
469 | return proc_setup_thread_self(s); | 464 | return proc_setup_thread_self(s); |
470 | } | 465 | } |
471 | 466 |
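All the proc_reg_* wrappers above funnel through the same use_pde()/unuse_pde() gate, and proc_entry_rundown() biases the counter to shut new users out before openers are closed by hand. A toy C11 model of that gate, with a flag standing in for the completion (illustrative only, not kernel code):

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

#define GATE_BIAS INT_MIN	/* the kernel spells this -1U<<31 */

struct gate {
	_Atomic int in_use;
	_Atomic bool done;	/* stands in for the completion */
};

static bool gate_enter(struct gate *g)		/* use_pde() analogue */
{
	int v = atomic_load(&g->in_use);

	while (v >= 0)		/* refuse once rundown has biased the count */
		if (atomic_compare_exchange_weak(&g->in_use, &v, v + 1))
			return true;
	return false;
}

static void gate_exit(struct gate *g)		/* unuse_pde() analogue */
{
	if (atomic_fetch_sub(&g->in_use, 1) - 1 == GATE_BIAS)
		atomic_store(&g->done, true);	/* complete() analogue */
}

static void gate_rundown(struct gate *g)	/* proc_entry_rundown() analogue */
{
	if (atomic_fetch_add(&g->in_use, GATE_BIAS) != 0)	/* users inside */
		while (!atomic_load(&g->done))	/* wait_for_completion() */
			;
}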
fs/proc/namespaces.c
1 | #include <linux/proc_fs.h> | 1 | #include <linux/proc_fs.h> |
2 | #include <linux/nsproxy.h> | 2 | #include <linux/nsproxy.h> |
3 | #include <linux/sched.h> | ||
4 | #include <linux/ptrace.h> | 3 | #include <linux/ptrace.h> |
5 | #include <linux/fs_struct.h> | ||
6 | #include <linux/mount.h> | ||
7 | #include <linux/path.h> | ||
8 | #include <linux/namei.h> | 4 | #include <linux/namei.h> |
9 | #include <linux/file.h> | 5 | #include <linux/file.h> |
10 | #include <linux/utsname.h> | 6 | #include <linux/utsname.h> |
11 | #include <net/net_namespace.h> | 7 | #include <net/net_namespace.h> |
12 | #include <linux/ipc_namespace.h> | 8 | #include <linux/ipc_namespace.h> |
13 | #include <linux/pid_namespace.h> | 9 | #include <linux/pid_namespace.h> |
14 | #include <linux/user_namespace.h> | 10 | #include <linux/user_namespace.h> |
15 | #include "internal.h" | 11 | #include "internal.h" |
16 | 12 | ||
17 | 13 | ||
18 | static const struct proc_ns_operations *ns_entries[] = { | 14 | static const struct proc_ns_operations *ns_entries[] = { |
19 | #ifdef CONFIG_NET_NS | 15 | #ifdef CONFIG_NET_NS |
20 | &netns_operations, | 16 | &netns_operations, |
21 | #endif | 17 | #endif |
22 | #ifdef CONFIG_UTS_NS | 18 | #ifdef CONFIG_UTS_NS |
23 | &utsns_operations, | 19 | &utsns_operations, |
24 | #endif | 20 | #endif |
25 | #ifdef CONFIG_IPC_NS | 21 | #ifdef CONFIG_IPC_NS |
26 | &ipcns_operations, | 22 | &ipcns_operations, |
27 | #endif | 23 | #endif |
28 | #ifdef CONFIG_PID_NS | 24 | #ifdef CONFIG_PID_NS |
29 | &pidns_operations, | 25 | &pidns_operations, |
30 | #endif | 26 | #endif |
31 | #ifdef CONFIG_USER_NS | 27 | #ifdef CONFIG_USER_NS |
32 | &userns_operations, | 28 | &userns_operations, |
33 | #endif | 29 | #endif |
34 | &mntns_operations, | 30 | &mntns_operations, |
35 | }; | 31 | }; |
36 | 32 | ||
37 | static const struct file_operations ns_file_operations = { | ||
38 | .llseek = no_llseek, | ||
39 | }; | ||
40 | |||
41 | static const struct inode_operations ns_inode_operations = { | ||
42 | .setattr = proc_setattr, | ||
43 | }; | ||
44 | |||
45 | static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) | ||
46 | { | ||
47 | struct inode *inode = dentry->d_inode; | ||
48 | const struct proc_ns_operations *ns_ops = dentry->d_fsdata; | ||
49 | |||
50 | return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", | ||
51 | ns_ops->name, inode->i_ino); | ||
52 | } | ||
53 | |||
54 | const struct dentry_operations ns_dentry_operations = | ||
55 | { | ||
56 | .d_delete = always_delete_dentry, | ||
57 | .d_dname = ns_dname, | ||
58 | }; | ||
59 | |||
60 | static struct dentry *proc_ns_get_dentry(struct super_block *sb, | ||
61 | struct task_struct *task, const struct proc_ns_operations *ns_ops) | ||
62 | { | ||
63 | struct dentry *dentry, *result; | ||
64 | struct inode *inode; | ||
65 | struct proc_inode *ei; | ||
66 | struct qstr qname = { .name = "", }; | ||
67 | struct ns_common *ns; | ||
68 | |||
69 | ns = ns_ops->get(task); | ||
70 | if (!ns) | ||
71 | return ERR_PTR(-ENOENT); | ||
72 | |||
73 | dentry = d_alloc_pseudo(sb, &qname); | ||
74 | if (!dentry) { | ||
75 | ns_ops->put(ns); | ||
76 | return ERR_PTR(-ENOMEM); | ||
77 | } | ||
78 | dentry->d_fsdata = (void *)ns_ops; | ||
79 | |||
80 | inode = iget_locked(sb, ns->inum); | ||
81 | if (!inode) { | ||
82 | dput(dentry); | ||
83 | ns_ops->put(ns); | ||
84 | return ERR_PTR(-ENOMEM); | ||
85 | } | ||
86 | |||
87 | ei = PROC_I(inode); | ||
88 | if (inode->i_state & I_NEW) { | ||
89 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | ||
90 | inode->i_op = &ns_inode_operations; | ||
91 | inode->i_mode = S_IFREG | S_IRUGO; | ||
92 | inode->i_fop = &ns_file_operations; | ||
93 | ei->ns.ns_ops = ns_ops; | ||
94 | ei->ns.ns = ns; | ||
95 | unlock_new_inode(inode); | ||
96 | } else { | ||
97 | ns_ops->put(ns); | ||
98 | } | ||
99 | |||
100 | d_set_d_op(dentry, &ns_dentry_operations); | ||
101 | result = d_instantiate_unique(dentry, inode); | ||
102 | if (result) { | ||
103 | dput(dentry); | ||
104 | dentry = result; | ||
105 | } | ||
106 | |||
107 | return dentry; | ||
108 | } | ||
109 | |||
110 | static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) | 33 | static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) |
111 | { | 34 | { |
112 | struct inode *inode = dentry->d_inode; | 35 | struct inode *inode = dentry->d_inode; |
113 | struct super_block *sb = inode->i_sb; | 36 | const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; |
114 | struct proc_inode *ei = PROC_I(inode); | ||
115 | struct task_struct *task; | 37 | struct task_struct *task; |
116 | struct path ns_path; | 38 | struct path ns_path; |
117 | void *error = ERR_PTR(-EACCES); | 39 | void *error = ERR_PTR(-EACCES); |
118 | 40 | ||
119 | task = get_proc_task(inode); | 41 | task = get_proc_task(inode); |
120 | if (!task) | 42 | if (!task) |
121 | goto out; | 43 | return error; |
122 | 44 | ||
123 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 45 | if (ptrace_may_access(task, PTRACE_MODE_READ)) { |
124 | goto out_put_task; | 46 | error = ns_get_path(&ns_path, task, ns_ops); |
125 | 47 | if (!error) | |
126 | ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); | 48 | nd_jump_link(nd, &ns_path); |
127 | if (IS_ERR(ns_path.dentry)) { | ||
128 | error = ERR_CAST(ns_path.dentry); | ||
129 | goto out_put_task; | ||
130 | } | 49 | } |
131 | |||
132 | ns_path.mnt = mntget(nd->path.mnt); | ||
133 | nd_jump_link(nd, &ns_path); | ||
134 | error = NULL; | ||
135 | |||
136 | out_put_task: | ||
137 | put_task_struct(task); | 50 | put_task_struct(task); |
138 | out: | ||
139 | return error; | 51 | return error; |
140 | } | 52 | } |
141 | 53 | ||
142 | static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) | 54 | static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) |
143 | { | 55 | { |
144 | struct inode *inode = dentry->d_inode; | 56 | struct inode *inode = dentry->d_inode; |
145 | struct proc_inode *ei = PROC_I(inode); | 57 | const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; |
146 | const struct proc_ns_operations *ns_ops = ei->ns.ns_ops; | ||
147 | struct task_struct *task; | 58 | struct task_struct *task; |
148 | struct ns_common *ns; | ||
149 | char name[50]; | 59 | char name[50]; |
150 | int res = -EACCES; | 60 | int res = -EACCES; |
151 | 61 | ||
152 | task = get_proc_task(inode); | 62 | task = get_proc_task(inode); |
153 | if (!task) | 63 | if (!task) |
154 | goto out; | 64 | return res; |
155 | 65 | ||
156 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 66 | if (ptrace_may_access(task, PTRACE_MODE_READ)) { |
157 | goto out_put_task; | 67 | res = ns_get_name(name, sizeof(name), task, ns_ops); |
158 | 68 | if (res >= 0) | |
159 | res = -ENOENT; | 69 | res = readlink_copy(buffer, buflen, name); |
160 | ns = ns_ops->get(task); | 70 | } |
161 | if (!ns) | ||
162 | goto out_put_task; | ||
163 | |||
164 | snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns->inum); | ||
165 | res = readlink_copy(buffer, buflen, name); | ||
166 | ns_ops->put(ns); | ||
167 | out_put_task: | ||
168 | put_task_struct(task); | 71 | put_task_struct(task); |
169 | out: | ||
170 | return res; | 72 | return res; |
171 | } | 73 | } |
172 | 74 | ||
173 | static const struct inode_operations proc_ns_link_inode_operations = { | 75 | static const struct inode_operations proc_ns_link_inode_operations = { |
174 | .readlink = proc_ns_readlink, | 76 | .readlink = proc_ns_readlink, |
175 | .follow_link = proc_ns_follow_link, | 77 | .follow_link = proc_ns_follow_link, |
176 | .setattr = proc_setattr, | 78 | .setattr = proc_setattr, |
177 | }; | 79 | }; |
178 | 80 | ||
179 | static int proc_ns_instantiate(struct inode *dir, | 81 | static int proc_ns_instantiate(struct inode *dir, |
180 | struct dentry *dentry, struct task_struct *task, const void *ptr) | 82 | struct dentry *dentry, struct task_struct *task, const void *ptr) |
181 | { | 83 | { |
182 | const struct proc_ns_operations *ns_ops = ptr; | 84 | const struct proc_ns_operations *ns_ops = ptr; |
183 | struct inode *inode; | 85 | struct inode *inode; |
184 | struct proc_inode *ei; | 86 | struct proc_inode *ei; |
185 | 87 | ||
186 | inode = proc_pid_make_inode(dir->i_sb, task); | 88 | inode = proc_pid_make_inode(dir->i_sb, task); |
187 | if (!inode) | 89 | if (!inode) |
188 | goto out; | 90 | goto out; |
189 | 91 | ||
190 | ei = PROC_I(inode); | 92 | ei = PROC_I(inode); |
191 | inode->i_mode = S_IFLNK|S_IRWXUGO; | 93 | inode->i_mode = S_IFLNK|S_IRWXUGO; |
192 | inode->i_op = &proc_ns_link_inode_operations; | 94 | inode->i_op = &proc_ns_link_inode_operations; |
193 | ei->ns.ns_ops = ns_ops; | 95 | ei->ns.ns_ops = ns_ops; |
194 | 96 | ||
195 | d_set_d_op(dentry, &pid_dentry_operations); | 97 | d_set_d_op(dentry, &pid_dentry_operations); |
196 | d_add(dentry, inode); | 98 | d_add(dentry, inode); |
197 | /* Close the race of the process dying before we return the dentry */ | 99 | /* Close the race of the process dying before we return the dentry */ |
198 | if (pid_revalidate(dentry, 0)) | 100 | if (pid_revalidate(dentry, 0)) |
199 | return 0; | 101 | return 0; |
200 | out: | 102 | out: |
201 | return -ENOENT; | 103 | return -ENOENT; |
202 | } | 104 | } |
203 | 105 | ||
204 | static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) | 106 | static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) |
205 | { | 107 | { |
206 | struct task_struct *task = get_proc_task(file_inode(file)); | 108 | struct task_struct *task = get_proc_task(file_inode(file)); |
207 | const struct proc_ns_operations **entry, **last; | 109 | const struct proc_ns_operations **entry, **last; |
208 | 110 | ||
209 | if (!task) | 111 | if (!task) |
210 | return -ENOENT; | 112 | return -ENOENT; |
211 | 113 | ||
212 | if (!dir_emit_dots(file, ctx)) | 114 | if (!dir_emit_dots(file, ctx)) |
213 | goto out; | 115 | goto out; |
214 | if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) | 116 | if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) |
215 | goto out; | 117 | goto out; |
216 | entry = ns_entries + (ctx->pos - 2); | 118 | entry = ns_entries + (ctx->pos - 2); |
217 | last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; | 119 | last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; |
218 | while (entry <= last) { | 120 | while (entry <= last) { |
219 | const struct proc_ns_operations *ops = *entry; | 121 | const struct proc_ns_operations *ops = *entry; |
220 | if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), | 122 | if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), |
221 | proc_ns_instantiate, task, ops)) | 123 | proc_ns_instantiate, task, ops)) |
222 | break; | 124 | break; |
223 | ctx->pos++; | 125 | ctx->pos++; |
224 | entry++; | 126 | entry++; |
225 | } | 127 | } |
226 | out: | 128 | out: |
227 | put_task_struct(task); | 129 | put_task_struct(task); |
228 | return 0; | 130 | return 0; |
229 | } | 131 | } |
230 | 132 | ||
231 | const struct file_operations proc_ns_dir_operations = { | 133 | const struct file_operations proc_ns_dir_operations = { |
232 | .read = generic_read_dir, | 134 | .read = generic_read_dir, |
233 | .iterate = proc_ns_dir_readdir, | 135 | .iterate = proc_ns_dir_readdir, |
234 | }; | 136 | }; |
235 | 137 | ||
236 | static struct dentry *proc_ns_dir_lookup(struct inode *dir, | 138 | static struct dentry *proc_ns_dir_lookup(struct inode *dir, |
237 | struct dentry *dentry, unsigned int flags) | 139 | struct dentry *dentry, unsigned int flags) |
238 | { | 140 | { |
239 | int error; | 141 | int error; |
240 | struct task_struct *task = get_proc_task(dir); | 142 | struct task_struct *task = get_proc_task(dir); |
241 | const struct proc_ns_operations **entry, **last; | 143 | const struct proc_ns_operations **entry, **last; |
242 | unsigned int len = dentry->d_name.len; | 144 | unsigned int len = dentry->d_name.len; |
243 | 145 | ||
244 | error = -ENOENT; | 146 | error = -ENOENT; |
245 | 147 | ||
246 | if (!task) | 148 | if (!task) |
247 | goto out_no_task; | 149 | goto out_no_task; |
248 | 150 | ||
249 | last = &ns_entries[ARRAY_SIZE(ns_entries)]; | 151 | last = &ns_entries[ARRAY_SIZE(ns_entries)]; |
250 | for (entry = ns_entries; entry < last; entry++) { | 152 | for (entry = ns_entries; entry < last; entry++) { |
251 | if (strlen((*entry)->name) != len) | 153 | if (strlen((*entry)->name) != len) |
252 | continue; | 154 | continue; |
253 | if (!memcmp(dentry->d_name.name, (*entry)->name, len)) | 155 | if (!memcmp(dentry->d_name.name, (*entry)->name, len)) |
254 | break; | 156 | break; |
255 | } | 157 | } |
256 | if (entry == last) | 158 | if (entry == last) |
257 | goto out; | 159 | goto out; |
258 | 160 | ||
259 | error = proc_ns_instantiate(dir, dentry, task, *entry); | 161 | error = proc_ns_instantiate(dir, dentry, task, *entry); |
260 | out: | 162 | out: |
261 | put_task_struct(task); | 163 | put_task_struct(task); |
262 | out_no_task: | 164 | out_no_task: |
263 | return ERR_PTR(error); | 165 | return ERR_PTR(error); |
264 | } | 166 | } |
265 | 167 | ||
266 | const struct inode_operations proc_ns_dir_inode_operations = { | 168 | const struct inode_operations proc_ns_dir_inode_operations = { |
267 | .lookup = proc_ns_dir_lookup, | 169 | .lookup = proc_ns_dir_lookup, |
268 | .getattr = pid_getattr, | 170 | .getattr = pid_getattr, |
269 | .setattr = proc_setattr, | 171 | .setattr = proc_setattr, |
270 | }; | 172 | }; |
271 | |||
272 | struct file *proc_ns_fget(int fd) | ||
273 | { | ||
274 | struct file *file; | ||
275 | |||
276 | file = fget(fd); | ||
277 | if (!file) | ||
278 | return ERR_PTR(-EBADF); | ||
279 | |||
280 | if (file->f_op != &ns_file_operations) | ||
281 | goto out_invalid; | ||
282 | |||
283 | return file; | ||
284 | |||
285 | out_invalid: | ||
286 | fput(file); | ||
287 | return ERR_PTR(-EINVAL); | ||
288 | } | ||
289 | |||
290 | struct ns_common *get_proc_ns(struct inode *inode) | ||
291 | { | ||
292 | return PROC_I(inode)->ns.ns; | ||
293 | } | ||
294 | |||
295 | bool proc_ns_inode(struct inode *inode) | ||
296 | { | ||
297 | return inode->i_fop == &ns_file_operations; | ||
298 | } | ||
299 | 173 |
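After this change proc_ns_readlink() is a thin wrapper around ns_get_name(), so the link text is exactly the "name:[inum]" string built there, and proc_ns_follow_link() jumps to a consistent <vfsmount, dentry> pair from ns_get_path() instead of borrowing nd->path.mnt. A quick userspace check of the link text (the printed inum is an example, not a fixed value):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n = readlink("/proc/self/ns/mnt", buf, sizeof(buf) - 1);

	if (n < 0) {
		perror("readlink");
		return 1;
	}
	buf[n] = '\0';
	printf("%s\n", buf);	/* prints e.g. "mnt:[4026531840]" */
	return 0;
}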
include/linux/ns_common.h
1 | #ifndef _LINUX_NS_COMMON_H | 1 | #ifndef _LINUX_NS_COMMON_H |
2 | #define _LINUX_NS_COMMON_H | 2 | #define _LINUX_NS_COMMON_H |
3 | 3 | ||
4 | struct proc_ns_operations; | 4 | struct proc_ns_operations; |
5 | 5 | ||
6 | struct ns_common { | 6 | struct ns_common { |
7 | atomic_long_t stashed; | ||
7 | const struct proc_ns_operations *ops; | 8 | const struct proc_ns_operations *ops; |
8 | unsigned int inum; | 9 | unsigned int inum; |
9 | }; | 10 | }; |
10 | 11 | ||
11 | #endif | 12 | #endif |
12 | 13 |
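The new stashed slot is the field fs/nsfs.c above trades dentry pointers through. For orientation, concrete namespaces embed ns_common and convert back with container_of(); foo_namespace below is a hypothetical illustration, not a real kernel type:

#include <linux/kernel.h>
#include <linux/ns_common.h>

struct foo_namespace {			/* hypothetical example namespace */
	struct ns_common ns;
	/* namespace-private state ... */
};

static inline struct foo_namespace *to_foo_ns(struct ns_common *ns)
{
	return container_of(ns, struct foo_namespace, ns);
}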
include/linux/proc_ns.h
1 | /* | 1 | /* |
2 | * procfs namespace bits | 2 | * procfs namespace bits |
3 | */ | 3 | */ |
4 | #ifndef _LINUX_PROC_NS_H | 4 | #ifndef _LINUX_PROC_NS_H |
5 | #define _LINUX_PROC_NS_H | 5 | #define _LINUX_PROC_NS_H |
6 | 6 | ||
7 | #include <linux/ns_common.h> | ||
8 | |||
7 | struct pid_namespace; | 9 | struct pid_namespace; |
8 | struct nsproxy; | 10 | struct nsproxy; |
9 | struct ns_common; | 11 | struct path; |
10 | 12 | ||
11 | struct proc_ns_operations { | 13 | struct proc_ns_operations { |
12 | const char *name; | 14 | const char *name; |
13 | int type; | 15 | int type; |
14 | struct ns_common *(*get)(struct task_struct *task); | 16 | struct ns_common *(*get)(struct task_struct *task); |
15 | void (*put)(struct ns_common *ns); | 17 | void (*put)(struct ns_common *ns); |
16 | int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); | 18 | int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); |
17 | }; | 19 | }; |
18 | 20 | ||
19 | extern const struct proc_ns_operations netns_operations; | 21 | extern const struct proc_ns_operations netns_operations; |
20 | extern const struct proc_ns_operations utsns_operations; | 22 | extern const struct proc_ns_operations utsns_operations; |
21 | extern const struct proc_ns_operations ipcns_operations; | 23 | extern const struct proc_ns_operations ipcns_operations; |
22 | extern const struct proc_ns_operations pidns_operations; | 24 | extern const struct proc_ns_operations pidns_operations; |
23 | extern const struct proc_ns_operations userns_operations; | 25 | extern const struct proc_ns_operations userns_operations; |
24 | extern const struct proc_ns_operations mntns_operations; | 26 | extern const struct proc_ns_operations mntns_operations; |
25 | 27 | ||
26 | /* | 28 | /* |
27 | * We always define these enumerators | 29 | * We always define these enumerators |
28 | */ | 30 | */ |
29 | enum { | 31 | enum { |
30 | PROC_ROOT_INO = 1, | 32 | PROC_ROOT_INO = 1, |
31 | PROC_IPC_INIT_INO = 0xEFFFFFFFU, | 33 | PROC_IPC_INIT_INO = 0xEFFFFFFFU, |
32 | PROC_UTS_INIT_INO = 0xEFFFFFFEU, | 34 | PROC_UTS_INIT_INO = 0xEFFFFFFEU, |
33 | PROC_USER_INIT_INO = 0xEFFFFFFDU, | 35 | PROC_USER_INIT_INO = 0xEFFFFFFDU, |
34 | PROC_PID_INIT_INO = 0xEFFFFFFCU, | 36 | PROC_PID_INIT_INO = 0xEFFFFFFCU, |
35 | }; | 37 | }; |
36 | 38 | ||
37 | #ifdef CONFIG_PROC_FS | 39 | #ifdef CONFIG_PROC_FS |
38 | 40 | ||
39 | extern int pid_ns_prepare_proc(struct pid_namespace *ns); | 41 | extern int pid_ns_prepare_proc(struct pid_namespace *ns); |
40 | extern void pid_ns_release_proc(struct pid_namespace *ns); | 42 | extern void pid_ns_release_proc(struct pid_namespace *ns); |
41 | extern struct file *proc_ns_fget(int fd); | ||
42 | extern struct ns_common *get_proc_ns(struct inode *); | ||
43 | extern int proc_alloc_inum(unsigned int *pino); | 43 | extern int proc_alloc_inum(unsigned int *pino); |
44 | extern void proc_free_inum(unsigned int inum); | 44 | extern void proc_free_inum(unsigned int inum); |
45 | extern bool proc_ns_inode(struct inode *inode); | ||
46 | 45 | ||
47 | #else /* CONFIG_PROC_FS */ | 46 | #else /* CONFIG_PROC_FS */ |
48 | 47 | ||
49 | static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } | 48 | static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } |
50 | static inline void pid_ns_release_proc(struct pid_namespace *ns) {} | 49 | static inline void pid_ns_release_proc(struct pid_namespace *ns) {} |
51 | 50 | ||
52 | static inline struct file *proc_ns_fget(int fd) | ||
53 | { | ||
54 | return ERR_PTR(-EINVAL); | ||
55 | } | ||
56 | |||
57 | static inline struct ns_common *get_proc_ns(struct inode *inode) { return NULL; } | ||
58 | |||
59 | static inline int proc_alloc_inum(unsigned int *inum) | 51 | static inline int proc_alloc_inum(unsigned int *inum) |
60 | { | 52 | { |
61 | *inum = 1; | 53 | *inum = 1; |
62 | return 0; | 54 | return 0; |
63 | } | 55 | } |
64 | static inline void proc_free_inum(unsigned int inum) {} | 56 | static inline void proc_free_inum(unsigned int inum) {} |
65 | static inline bool proc_ns_inode(struct inode *inode) { return false; } | ||
66 | 57 | ||
67 | #endif /* CONFIG_PROC_FS */ | 58 | #endif /* CONFIG_PROC_FS */ |
68 | 59 | ||
69 | #define ns_alloc_inum(ns) proc_alloc_inum(&(ns)->inum) | 60 | static inline int ns_alloc_inum(struct ns_common *ns) |
61 | { | ||
62 | atomic_long_set(&ns->stashed, 0); | ||
63 | return proc_alloc_inum(&ns->inum); | ||
64 | } | ||
65 | |||
70 | #define ns_free_inum(ns) proc_free_inum((ns)->inum) | 66 | #define ns_free_inum(ns) proc_free_inum((ns)->inum) |
67 |
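ns_alloc_inum() becomes an inline here precisely so it can zero ->stashed before the inum is handed out. To round out the hypothetical foo namespace from the previous sketch, a stub proc_ns_operations against the interface declared above (reference handling elided, .type left as a placeholder; none of this is real kernel code):

#include <linux/errno.h>
#include <linux/proc_ns.h>
#include <linux/sched.h>

static struct ns_common *foons_get(struct task_struct *task)
{
	struct foo_namespace *f = NULL;	/* take a counted ref from task (elided) */

	return f ? &f->ns : NULL;
}

static void foons_put(struct ns_common *ns)
{
	/* drop the reference taken in foons_get() (elided) */
}

static int foons_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
	return -EINVAL;			/* stub; real code switches namespaces */
}

const struct proc_ns_operations foons_operations = {
	.name		= "foo",
	.type		= 0,		/* a real entry uses its CLONE_NEW* flag */
	.get		= foons_get,
	.put		= foons_put,
	.install	= foons_install,
};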
include/uapi/linux/magic.h
1 | #ifndef __LINUX_MAGIC_H__ | 1 | #ifndef __LINUX_MAGIC_H__ |
2 | #define __LINUX_MAGIC_H__ | 2 | #define __LINUX_MAGIC_H__ |
3 | 3 | ||
4 | #define ADFS_SUPER_MAGIC 0xadf5 | 4 | #define ADFS_SUPER_MAGIC 0xadf5 |
5 | #define AFFS_SUPER_MAGIC 0xadff | 5 | #define AFFS_SUPER_MAGIC 0xadff |
6 | #define AFS_SUPER_MAGIC 0x5346414F | 6 | #define AFS_SUPER_MAGIC 0x5346414F |
7 | #define AUTOFS_SUPER_MAGIC 0x0187 | 7 | #define AUTOFS_SUPER_MAGIC 0x0187 |
8 | #define CODA_SUPER_MAGIC 0x73757245 | 8 | #define CODA_SUPER_MAGIC 0x73757245 |
9 | #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ | 9 | #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ |
10 | #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ | 10 | #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ |
11 | #define DEBUGFS_MAGIC 0x64626720 | 11 | #define DEBUGFS_MAGIC 0x64626720 |
12 | #define SECURITYFS_MAGIC 0x73636673 | 12 | #define SECURITYFS_MAGIC 0x73636673 |
13 | #define SELINUX_MAGIC 0xf97cff8c | 13 | #define SELINUX_MAGIC 0xf97cff8c |
14 | #define SMACK_MAGIC 0x43415d53 /* "SMAC" */ | 14 | #define SMACK_MAGIC 0x43415d53 /* "SMAC" */ |
15 | #define RAMFS_MAGIC 0x858458f6 /* some random number */ | 15 | #define RAMFS_MAGIC 0x858458f6 /* some random number */ |
16 | #define TMPFS_MAGIC 0x01021994 | 16 | #define TMPFS_MAGIC 0x01021994 |
17 | #define HUGETLBFS_MAGIC 0x958458f6 /* some random number */ | 17 | #define HUGETLBFS_MAGIC 0x958458f6 /* some random number */ |
18 | #define SQUASHFS_MAGIC 0x73717368 | 18 | #define SQUASHFS_MAGIC 0x73717368 |
19 | #define ECRYPTFS_SUPER_MAGIC 0xf15f | 19 | #define ECRYPTFS_SUPER_MAGIC 0xf15f |
20 | #define EFS_SUPER_MAGIC 0x414A53 | 20 | #define EFS_SUPER_MAGIC 0x414A53 |
21 | #define EXT2_SUPER_MAGIC 0xEF53 | 21 | #define EXT2_SUPER_MAGIC 0xEF53 |
22 | #define EXT3_SUPER_MAGIC 0xEF53 | 22 | #define EXT3_SUPER_MAGIC 0xEF53 |
23 | #define XENFS_SUPER_MAGIC 0xabba1974 | 23 | #define XENFS_SUPER_MAGIC 0xabba1974 |
24 | #define EXT4_SUPER_MAGIC 0xEF53 | 24 | #define EXT4_SUPER_MAGIC 0xEF53 |
25 | #define BTRFS_SUPER_MAGIC 0x9123683E | 25 | #define BTRFS_SUPER_MAGIC 0x9123683E |
26 | #define NILFS_SUPER_MAGIC 0x3434 | 26 | #define NILFS_SUPER_MAGIC 0x3434 |
27 | #define F2FS_SUPER_MAGIC 0xF2F52010 | 27 | #define F2FS_SUPER_MAGIC 0xF2F52010 |
28 | #define HPFS_SUPER_MAGIC 0xf995e849 | 28 | #define HPFS_SUPER_MAGIC 0xf995e849 |
29 | #define ISOFS_SUPER_MAGIC 0x9660 | 29 | #define ISOFS_SUPER_MAGIC 0x9660 |
30 | #define JFFS2_SUPER_MAGIC 0x72b6 | 30 | #define JFFS2_SUPER_MAGIC 0x72b6 |
31 | #define PSTOREFS_MAGIC 0x6165676C | 31 | #define PSTOREFS_MAGIC 0x6165676C |
32 | #define EFIVARFS_MAGIC 0xde5e81e4 | 32 | #define EFIVARFS_MAGIC 0xde5e81e4 |
33 | #define HOSTFS_SUPER_MAGIC 0x00c0ffee | 33 | #define HOSTFS_SUPER_MAGIC 0x00c0ffee |
34 | 34 | ||
35 | #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ | 35 | #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ |
36 | #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ | 36 | #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ |
37 | #define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */ | 37 | #define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */ |
38 | #define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */ | 38 | #define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */ |
39 | #define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */ | 39 | #define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */ |
40 | 40 | ||
41 | #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ | 41 | #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ |
42 | #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ | 42 | #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ |
43 | #define NFS_SUPER_MAGIC 0x6969 | 43 | #define NFS_SUPER_MAGIC 0x6969 |
44 | #define OPENPROM_SUPER_MAGIC 0x9fa1 | 44 | #define OPENPROM_SUPER_MAGIC 0x9fa1 |
45 | #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ | 45 | #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ |
46 | #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */ | 46 | #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */ |
47 | 47 | ||
48 | #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */ | 48 | #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */ |
49 | /* used by file system utilities that | 49 | /* used by file system utilities that |
50 | look at the superblock, etc. */ | 50 | look at the superblock, etc. */ |
51 | #define REISERFS_SUPER_MAGIC_STRING "ReIsErFs" | 51 | #define REISERFS_SUPER_MAGIC_STRING "ReIsErFs" |
52 | #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" | 52 | #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" |
53 | #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs" | 53 | #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs" |
54 | 54 | ||
55 | #define SMB_SUPER_MAGIC 0x517B | 55 | #define SMB_SUPER_MAGIC 0x517B |
56 | #define CGROUP_SUPER_MAGIC 0x27e0eb | 56 | #define CGROUP_SUPER_MAGIC 0x27e0eb |
57 | 57 | ||
58 | 58 | ||
59 | #define STACK_END_MAGIC 0x57AC6E9D | 59 | #define STACK_END_MAGIC 0x57AC6E9D |
60 | 60 | ||
61 | #define V9FS_MAGIC 0x01021997 | 61 | #define V9FS_MAGIC 0x01021997 |
62 | 62 | ||
63 | #define BDEVFS_MAGIC 0x62646576 | 63 | #define BDEVFS_MAGIC 0x62646576 |
64 | #define BINFMTFS_MAGIC 0x42494e4d | 64 | #define BINFMTFS_MAGIC 0x42494e4d |
65 | #define DEVPTS_SUPER_MAGIC 0x1cd1 | 65 | #define DEVPTS_SUPER_MAGIC 0x1cd1 |
66 | #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA | 66 | #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA |
67 | #define PIPEFS_MAGIC 0x50495045 | 67 | #define PIPEFS_MAGIC 0x50495045 |
68 | #define PROC_SUPER_MAGIC 0x9fa0 | 68 | #define PROC_SUPER_MAGIC 0x9fa0 |
69 | #define SOCKFS_MAGIC 0x534F434B | 69 | #define SOCKFS_MAGIC 0x534F434B |
70 | #define SYSFS_MAGIC 0x62656572 | 70 | #define SYSFS_MAGIC 0x62656572 |
71 | #define USBDEVICE_SUPER_MAGIC 0x9fa2 | 71 | #define USBDEVICE_SUPER_MAGIC 0x9fa2 |
72 | #define MTD_INODE_FS_MAGIC 0x11307854 | 72 | #define MTD_INODE_FS_MAGIC 0x11307854 |
73 | #define ANON_INODE_FS_MAGIC 0x09041934 | 73 | #define ANON_INODE_FS_MAGIC 0x09041934 |
74 | #define BTRFS_TEST_MAGIC 0x73727279 | 74 | #define BTRFS_TEST_MAGIC 0x73727279 |
75 | #define NSFS_MAGIC 0x6e736673 | ||
75 | 76 | ||
76 | #endif /* __LINUX_MAGIC_H__ */ | 77 | #endif /* __LINUX_MAGIC_H__ */ |
77 | 78 |
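
The new NSFS_MAGIC value (0x6e736673, ASCII "nsfs") gives userspace a way to tell that a path now resolves to an nsfs inode rather than a procfs one. Below is a minimal userspace sketch, not part of this commit, that compares the f_type reported by statfs(2) against the constant added above:

/* Hedged example: on kernels with this change, statfs() on a
 * /proc/<pid>/ns/* target should report NSFS_MAGIC; on older kernels
 * the same path reports PROC_SUPER_MAGIC (0x9fa0) instead. */
#include <stdio.h>
#include <sys/vfs.h>

#define NSFS_MAGIC 0x6e736673	/* matches the definition above */

int main(void)
{
	struct statfs st;

	if (statfs("/proc/self/ns/net", &st) != 0) {
		perror("statfs");
		return 1;
	}
	printf("f_type = 0x%lx (%s)\n", (unsigned long)st.f_type,
	       st.f_type == NSFS_MAGIC ? "nsfs" : "not nsfs");
	return 0;
}
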
init/main.c
1 | /* | 1 | /* |
2 | * linux/init/main.c | 2 | * linux/init/main.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | * | 5 | * |
6 | * GK 2/5/95 - Changed to support mounting root fs via NFS | 6 | * GK 2/5/95 - Changed to support mounting root fs via NFS |
7 | * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 | 7 | * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 |
8 | * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 | 8 | * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 |
9 | * Simplified starting of init: Michael A. Griffith <grif@acm.org> | 9 | * Simplified starting of init: Michael A. Griffith <grif@acm.org> |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #define DEBUG /* Enable initcall_debug */ | 12 | #define DEBUG /* Enable initcall_debug */ |
13 | 13 | ||
14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/proc_fs.h> | 16 | #include <linux/proc_fs.h> |
17 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
18 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
19 | #include <linux/stackprotector.h> | 19 | #include <linux/stackprotector.h> |
20 | #include <linux/string.h> | 20 | #include <linux/string.h> |
21 | #include <linux/ctype.h> | 21 | #include <linux/ctype.h> |
22 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
23 | #include <linux/ioport.h> | 23 | #include <linux/ioport.h> |
24 | #include <linux/init.h> | 24 | #include <linux/init.h> |
25 | #include <linux/initrd.h> | 25 | #include <linux/initrd.h> |
26 | #include <linux/bootmem.h> | 26 | #include <linux/bootmem.h> |
27 | #include <linux/acpi.h> | 27 | #include <linux/acpi.h> |
28 | #include <linux/tty.h> | 28 | #include <linux/tty.h> |
29 | #include <linux/percpu.h> | 29 | #include <linux/percpu.h> |
30 | #include <linux/kmod.h> | 30 | #include <linux/kmod.h> |
31 | #include <linux/vmalloc.h> | 31 | #include <linux/vmalloc.h> |
32 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
33 | #include <linux/start_kernel.h> | 33 | #include <linux/start_kernel.h> |
34 | #include <linux/security.h> | 34 | #include <linux/security.h> |
35 | #include <linux/smp.h> | 35 | #include <linux/smp.h> |
36 | #include <linux/profile.h> | 36 | #include <linux/profile.h> |
37 | #include <linux/rcupdate.h> | 37 | #include <linux/rcupdate.h> |
38 | #include <linux/moduleparam.h> | 38 | #include <linux/moduleparam.h> |
39 | #include <linux/kallsyms.h> | 39 | #include <linux/kallsyms.h> |
40 | #include <linux/writeback.h> | 40 | #include <linux/writeback.h> |
41 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
42 | #include <linux/cpuset.h> | 42 | #include <linux/cpuset.h> |
43 | #include <linux/cgroup.h> | 43 | #include <linux/cgroup.h> |
44 | #include <linux/efi.h> | 44 | #include <linux/efi.h> |
45 | #include <linux/tick.h> | 45 | #include <linux/tick.h> |
46 | #include <linux/interrupt.h> | 46 | #include <linux/interrupt.h> |
47 | #include <linux/taskstats_kern.h> | 47 | #include <linux/taskstats_kern.h> |
48 | #include <linux/delayacct.h> | 48 | #include <linux/delayacct.h> |
49 | #include <linux/unistd.h> | 49 | #include <linux/unistd.h> |
50 | #include <linux/rmap.h> | 50 | #include <linux/rmap.h> |
51 | #include <linux/mempolicy.h> | 51 | #include <linux/mempolicy.h> |
52 | #include <linux/key.h> | 52 | #include <linux/key.h> |
53 | #include <linux/buffer_head.h> | 53 | #include <linux/buffer_head.h> |
54 | #include <linux/page_cgroup.h> | 54 | #include <linux/page_cgroup.h> |
55 | #include <linux/debug_locks.h> | 55 | #include <linux/debug_locks.h> |
56 | #include <linux/debugobjects.h> | 56 | #include <linux/debugobjects.h> |
57 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
58 | #include <linux/kmemleak.h> | 58 | #include <linux/kmemleak.h> |
59 | #include <linux/pid_namespace.h> | 59 | #include <linux/pid_namespace.h> |
60 | #include <linux/device.h> | 60 | #include <linux/device.h> |
61 | #include <linux/kthread.h> | 61 | #include <linux/kthread.h> |
62 | #include <linux/sched.h> | 62 | #include <linux/sched.h> |
63 | #include <linux/signal.h> | 63 | #include <linux/signal.h> |
64 | #include <linux/idr.h> | 64 | #include <linux/idr.h> |
65 | #include <linux/kgdb.h> | 65 | #include <linux/kgdb.h> |
66 | #include <linux/ftrace.h> | 66 | #include <linux/ftrace.h> |
67 | #include <linux/async.h> | 67 | #include <linux/async.h> |
68 | #include <linux/kmemcheck.h> | 68 | #include <linux/kmemcheck.h> |
69 | #include <linux/sfi.h> | 69 | #include <linux/sfi.h> |
70 | #include <linux/shmem_fs.h> | 70 | #include <linux/shmem_fs.h> |
71 | #include <linux/slab.h> | 71 | #include <linux/slab.h> |
72 | #include <linux/perf_event.h> | 72 | #include <linux/perf_event.h> |
73 | #include <linux/file.h> | 73 | #include <linux/file.h> |
74 | #include <linux/ptrace.h> | 74 | #include <linux/ptrace.h> |
75 | #include <linux/blkdev.h> | 75 | #include <linux/blkdev.h> |
76 | #include <linux/elevator.h> | 76 | #include <linux/elevator.h> |
77 | #include <linux/sched_clock.h> | 77 | #include <linux/sched_clock.h> |
78 | #include <linux/context_tracking.h> | 78 | #include <linux/context_tracking.h> |
79 | #include <linux/random.h> | 79 | #include <linux/random.h> |
80 | #include <linux/list.h> | 80 | #include <linux/list.h> |
81 | #include <linux/proc_ns.h> | ||
81 | 82 | ||
82 | #include <asm/io.h> | 83 | #include <asm/io.h> |
83 | #include <asm/bugs.h> | 84 | #include <asm/bugs.h> |
84 | #include <asm/setup.h> | 85 | #include <asm/setup.h> |
85 | #include <asm/sections.h> | 86 | #include <asm/sections.h> |
86 | #include <asm/cacheflush.h> | 87 | #include <asm/cacheflush.h> |
87 | 88 | ||
88 | #ifdef CONFIG_X86_LOCAL_APIC | 89 | #ifdef CONFIG_X86_LOCAL_APIC |
89 | #include <asm/smp.h> | 90 | #include <asm/smp.h> |
90 | #endif | 91 | #endif |
91 | 92 | ||
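
The lone addition to this include block is <linux/proc_ns.h>, needed for the nsfs_init() call that this patch adds to start_kernel() further down. A sketch of the declaration the header is assumed to provide (the header change itself is in another hunk of this commit):

/* Assumed declaration from <linux/proc_ns.h>; not shown in this hunk. */
extern void nsfs_init(void);
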
92 | static int kernel_init(void *); | 93 | static int kernel_init(void *); |
93 | 94 | ||
94 | extern void init_IRQ(void); | 95 | extern void init_IRQ(void); |
95 | extern void fork_init(unsigned long); | 96 | extern void fork_init(unsigned long); |
96 | extern void radix_tree_init(void); | 97 | extern void radix_tree_init(void); |
97 | #ifndef CONFIG_DEBUG_RODATA | 98 | #ifndef CONFIG_DEBUG_RODATA |
98 | static inline void mark_rodata_ro(void) { } | 99 | static inline void mark_rodata_ro(void) { } |
99 | #endif | 100 | #endif |
100 | 101 | ||
101 | /* | 102 | /* |
102 | * Debug helper: via this flag we know that we are in 'early bootup code' | 103 | * Debug helper: via this flag we know that we are in 'early bootup code' |
103 | * where only the boot processor is running with IRQ disabled. This means | 104 | * where only the boot processor is running with IRQ disabled. This means |
104 | * two things - IRQ must not be enabled before the flag is cleared and some | 105 | * two things - IRQ must not be enabled before the flag is cleared and some |
105 | * operations which are not allowed with IRQ disabled are allowed while the | 106 | * operations which are not allowed with IRQ disabled are allowed while the |
106 | * flag is set. | 107 | * flag is set. |
107 | */ | 108 | */ |
108 | bool early_boot_irqs_disabled __read_mostly; | 109 | bool early_boot_irqs_disabled __read_mostly; |
109 | 110 | ||
110 | enum system_states system_state __read_mostly; | 111 | enum system_states system_state __read_mostly; |
111 | EXPORT_SYMBOL(system_state); | 112 | EXPORT_SYMBOL(system_state); |
112 | 113 | ||
113 | /* | 114 | /* |
114 | * Boot command-line arguments | 115 | * Boot command-line arguments |
115 | */ | 116 | */ |
116 | #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT | 117 | #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT |
117 | #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT | 118 | #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT |
118 | 119 | ||
119 | extern void time_init(void); | 120 | extern void time_init(void); |
120 | /* Default late time init is NULL. archs can override this later. */ | 121 | /* Default late time init is NULL. archs can override this later. */ |
121 | void (*__initdata late_time_init)(void); | 122 | void (*__initdata late_time_init)(void); |
122 | 123 | ||
123 | /* Untouched command line saved by arch-specific code. */ | 124 | /* Untouched command line saved by arch-specific code. */ |
124 | char __initdata boot_command_line[COMMAND_LINE_SIZE]; | 125 | char __initdata boot_command_line[COMMAND_LINE_SIZE]; |
125 | /* Untouched saved command line (eg. for /proc) */ | 126 | /* Untouched saved command line (eg. for /proc) */ |
126 | char *saved_command_line; | 127 | char *saved_command_line; |
127 | /* Command line for parameter parsing */ | 128 | /* Command line for parameter parsing */ |
128 | static char *static_command_line; | 129 | static char *static_command_line; |
129 | /* Command line for per-initcall parameter parsing */ | 130 | /* Command line for per-initcall parameter parsing */ |
130 | static char *initcall_command_line; | 131 | static char *initcall_command_line; |
131 | 132 | ||
132 | static char *execute_command; | 133 | static char *execute_command; |
133 | static char *ramdisk_execute_command; | 134 | static char *ramdisk_execute_command; |
134 | 135 | ||
135 | /* | 136 | /* |
136 | * Used to generate warnings if static_key manipulation functions are used | 137 | * Used to generate warnings if static_key manipulation functions are used |
137 | * before jump_label_init is called. | 138 | * before jump_label_init is called. |
138 | */ | 139 | */ |
139 | bool static_key_initialized __read_mostly; | 140 | bool static_key_initialized __read_mostly; |
140 | EXPORT_SYMBOL_GPL(static_key_initialized); | 141 | EXPORT_SYMBOL_GPL(static_key_initialized); |
141 | 142 | ||
142 | /* | 143 | /* |
143 | * If set, this is an indication to the drivers to reset the underlying | 144 | * If set, this is an indication to the drivers to reset the underlying |
144 | * device before going ahead with the initialization; otherwise the driver | 145 | * device before going ahead with the initialization; otherwise the driver |
145 | * might rely on the BIOS and skip the reset operation. | 146 | * might rely on the BIOS and skip the reset operation. |
146 | * | 147 | * |
147 | * This is useful if kernel is booting in an unreliable environment. | 148 | * This is useful if kernel is booting in an unreliable environment. |
148 | * For ex. kdump situation where previous kernel has crashed, BIOS has been | 149 | * For ex. kdump situation where previous kernel has crashed, BIOS has been |
149 | * skipped and devices will be in unknown state. | 150 | * skipped and devices will be in unknown state. |
150 | */ | 151 | */ |
151 | unsigned int reset_devices; | 152 | unsigned int reset_devices; |
152 | EXPORT_SYMBOL(reset_devices); | 153 | EXPORT_SYMBOL(reset_devices); |
153 | 154 | ||
154 | static int __init set_reset_devices(char *str) | 155 | static int __init set_reset_devices(char *str) |
155 | { | 156 | { |
156 | reset_devices = 1; | 157 | reset_devices = 1; |
157 | return 1; | 158 | return 1; |
158 | } | 159 | } |
159 | 160 | ||
160 | __setup("reset_devices", set_reset_devices); | 161 | __setup("reset_devices", set_reset_devices); |
161 | 162 | ||
162 | static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; | 163 | static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; |
163 | const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; | 164 | const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; |
164 | static const char *panic_later, *panic_param; | 165 | static const char *panic_later, *panic_param; |
165 | 166 | ||
166 | extern const struct obs_kernel_param __setup_start[], __setup_end[]; | 167 | extern const struct obs_kernel_param __setup_start[], __setup_end[]; |
167 | 168 | ||
168 | static int __init obsolete_checksetup(char *line) | 169 | static int __init obsolete_checksetup(char *line) |
169 | { | 170 | { |
170 | const struct obs_kernel_param *p; | 171 | const struct obs_kernel_param *p; |
171 | int had_early_param = 0; | 172 | int had_early_param = 0; |
172 | 173 | ||
173 | p = __setup_start; | 174 | p = __setup_start; |
174 | do { | 175 | do { |
175 | int n = strlen(p->str); | 176 | int n = strlen(p->str); |
176 | if (parameqn(line, p->str, n)) { | 177 | if (parameqn(line, p->str, n)) { |
177 | if (p->early) { | 178 | if (p->early) { |
178 | /* Already done in parse_early_param? | 179 | /* Already done in parse_early_param? |
179 | * (Needs exact match on param part). | 180 | * (Needs exact match on param part). |
180 | * Keep iterating, as we can have early | 181 | * Keep iterating, as we can have early |
181 | * params and __setups of same names 8( */ | 182 | * params and __setups of same names 8( */ |
182 | if (line[n] == '\0' || line[n] == '=') | 183 | if (line[n] == '\0' || line[n] == '=') |
183 | had_early_param = 1; | 184 | had_early_param = 1; |
184 | } else if (!p->setup_func) { | 185 | } else if (!p->setup_func) { |
185 | pr_warn("Parameter %s is obsolete, ignored\n", | 186 | pr_warn("Parameter %s is obsolete, ignored\n", |
186 | p->str); | 187 | p->str); |
187 | return 1; | 188 | return 1; |
188 | } else if (p->setup_func(line + n)) | 189 | } else if (p->setup_func(line + n)) |
189 | return 1; | 190 | return 1; |
190 | } | 191 | } |
191 | p++; | 192 | p++; |
192 | } while (p < __setup_end); | 193 | } while (p < __setup_end); |
193 | 194 | ||
194 | return had_early_param; | 195 | return had_early_param; |
195 | } | 196 | } |
196 | 197 | ||
197 | /* | 198 | /* |
198 | * This should be approx 2 Bo*oMips to start (note initial shift), and will | 199 | * This should be approx 2 Bo*oMips to start (note initial shift), and will |
199 | * still work even if initially too large, it will just take slightly longer | 200 | * still work even if initially too large, it will just take slightly longer |
200 | */ | 201 | */ |
201 | unsigned long loops_per_jiffy = (1<<12); | 202 | unsigned long loops_per_jiffy = (1<<12); |
202 | EXPORT_SYMBOL(loops_per_jiffy); | 203 | EXPORT_SYMBOL(loops_per_jiffy); |
203 | 204 | ||
204 | static int __init debug_kernel(char *str) | 205 | static int __init debug_kernel(char *str) |
205 | { | 206 | { |
206 | console_loglevel = CONSOLE_LOGLEVEL_DEBUG; | 207 | console_loglevel = CONSOLE_LOGLEVEL_DEBUG; |
207 | return 0; | 208 | return 0; |
208 | } | 209 | } |
209 | 210 | ||
210 | static int __init quiet_kernel(char *str) | 211 | static int __init quiet_kernel(char *str) |
211 | { | 212 | { |
212 | console_loglevel = CONSOLE_LOGLEVEL_QUIET; | 213 | console_loglevel = CONSOLE_LOGLEVEL_QUIET; |
213 | return 0; | 214 | return 0; |
214 | } | 215 | } |
215 | 216 | ||
216 | early_param("debug", debug_kernel); | 217 | early_param("debug", debug_kernel); |
217 | early_param("quiet", quiet_kernel); | 218 | early_param("quiet", quiet_kernel); |
218 | 219 | ||
219 | static int __init loglevel(char *str) | 220 | static int __init loglevel(char *str) |
220 | { | 221 | { |
221 | int newlevel; | 222 | int newlevel; |
222 | 223 | ||
223 | /* | 224 | /* |
224 | * Only update loglevel value when a correct setting was passed, | 225 | * Only update loglevel value when a correct setting was passed, |
225 | * to prevent blind crashes (when loglevel being set to 0) that | 226 | * to prevent blind crashes (when loglevel being set to 0) that |
226 | * are quite hard to debug | 227 | * are quite hard to debug |
227 | */ | 228 | */ |
228 | if (get_option(&str, &newlevel)) { | 229 | if (get_option(&str, &newlevel)) { |
229 | console_loglevel = newlevel; | 230 | console_loglevel = newlevel; |
230 | return 0; | 231 | return 0; |
231 | } | 232 | } |
232 | 233 | ||
233 | return -EINVAL; | 234 | return -EINVAL; |
234 | } | 235 | } |
235 | 236 | ||
236 | early_param("loglevel", loglevel); | 237 | early_param("loglevel", loglevel); |
237 | 238 | ||
238 | /* Change NUL term back to "=", to make "param" the whole string. */ | 239 | /* Change NUL term back to "=", to make "param" the whole string. */ |
239 | static int __init repair_env_string(char *param, char *val, const char *unused) | 240 | static int __init repair_env_string(char *param, char *val, const char *unused) |
240 | { | 241 | { |
241 | if (val) { | 242 | if (val) { |
242 | /* param=val or param="val"? */ | 243 | /* param=val or param="val"? */ |
243 | if (val == param+strlen(param)+1) | 244 | if (val == param+strlen(param)+1) |
244 | val[-1] = '='; | 245 | val[-1] = '='; |
245 | else if (val == param+strlen(param)+2) { | 246 | else if (val == param+strlen(param)+2) { |
246 | val[-2] = '='; | 247 | val[-2] = '='; |
247 | memmove(val-1, val, strlen(val)+1); | 248 | memmove(val-1, val, strlen(val)+1); |
248 | val--; | 249 | val--; |
249 | } else | 250 | } else |
250 | BUG(); | 251 | BUG(); |
251 | } | 252 | } |
252 | return 0; | 253 | return 0; |
253 | } | 254 | } |
254 | 255 | ||
255 | /* Anything after -- gets handed straight to init. */ | 256 | /* Anything after -- gets handed straight to init. */ |
256 | static int __init set_init_arg(char *param, char *val, const char *unused) | 257 | static int __init set_init_arg(char *param, char *val, const char *unused) |
257 | { | 258 | { |
258 | unsigned int i; | 259 | unsigned int i; |
259 | 260 | ||
260 | if (panic_later) | 261 | if (panic_later) |
261 | return 0; | 262 | return 0; |
262 | 263 | ||
263 | repair_env_string(param, val, unused); | 264 | repair_env_string(param, val, unused); |
264 | 265 | ||
265 | for (i = 0; argv_init[i]; i++) { | 266 | for (i = 0; argv_init[i]; i++) { |
266 | if (i == MAX_INIT_ARGS) { | 267 | if (i == MAX_INIT_ARGS) { |
267 | panic_later = "init"; | 268 | panic_later = "init"; |
268 | panic_param = param; | 269 | panic_param = param; |
269 | return 0; | 270 | return 0; |
270 | } | 271 | } |
271 | } | 272 | } |
272 | argv_init[i] = param; | 273 | argv_init[i] = param; |
273 | return 0; | 274 | return 0; |
274 | } | 275 | } |
275 | 276 | ||
276 | /* | 277 | /* |
277 | * Unknown boot options get handed to init, unless they look like | 278 | * Unknown boot options get handed to init, unless they look like |
278 | * unused parameters (modprobe will find them in /proc/cmdline). | 279 | * unused parameters (modprobe will find them in /proc/cmdline). |
279 | */ | 280 | */ |
280 | static int __init unknown_bootoption(char *param, char *val, const char *unused) | 281 | static int __init unknown_bootoption(char *param, char *val, const char *unused) |
281 | { | 282 | { |
282 | repair_env_string(param, val, unused); | 283 | repair_env_string(param, val, unused); |
283 | 284 | ||
284 | /* Handle obsolete-style parameters */ | 285 | /* Handle obsolete-style parameters */ |
285 | if (obsolete_checksetup(param)) | 286 | if (obsolete_checksetup(param)) |
286 | return 0; | 287 | return 0; |
287 | 288 | ||
288 | /* Unused module parameter. */ | 289 | /* Unused module parameter. */ |
289 | if (strchr(param, '.') && (!val || strchr(param, '.') < val)) | 290 | if (strchr(param, '.') && (!val || strchr(param, '.') < val)) |
290 | return 0; | 291 | return 0; |
291 | 292 | ||
292 | if (panic_later) | 293 | if (panic_later) |
293 | return 0; | 294 | return 0; |
294 | 295 | ||
295 | if (val) { | 296 | if (val) { |
296 | /* Environment option */ | 297 | /* Environment option */ |
297 | unsigned int i; | 298 | unsigned int i; |
298 | for (i = 0; envp_init[i]; i++) { | 299 | for (i = 0; envp_init[i]; i++) { |
299 | if (i == MAX_INIT_ENVS) { | 300 | if (i == MAX_INIT_ENVS) { |
300 | panic_later = "env"; | 301 | panic_later = "env"; |
301 | panic_param = param; | 302 | panic_param = param; |
302 | } | 303 | } |
303 | if (!strncmp(param, envp_init[i], val - param)) | 304 | if (!strncmp(param, envp_init[i], val - param)) |
304 | break; | 305 | break; |
305 | } | 306 | } |
306 | envp_init[i] = param; | 307 | envp_init[i] = param; |
307 | } else { | 308 | } else { |
308 | /* Command line option */ | 309 | /* Command line option */ |
309 | unsigned int i; | 310 | unsigned int i; |
310 | for (i = 0; argv_init[i]; i++) { | 311 | for (i = 0; argv_init[i]; i++) { |
311 | if (i == MAX_INIT_ARGS) { | 312 | if (i == MAX_INIT_ARGS) { |
312 | panic_later = "init"; | 313 | panic_later = "init"; |
313 | panic_param = param; | 314 | panic_param = param; |
314 | } | 315 | } |
315 | } | 316 | } |
316 | argv_init[i] = param; | 317 | argv_init[i] = param; |
317 | } | 318 | } |
318 | return 0; | 319 | return 0; |
319 | } | 320 | } |
320 | 321 | ||
321 | static int __init init_setup(char *str) | 322 | static int __init init_setup(char *str) |
322 | { | 323 | { |
323 | unsigned int i; | 324 | unsigned int i; |
324 | 325 | ||
325 | execute_command = str; | 326 | execute_command = str; |
326 | /* | 327 | /* |
327 | * In case LILO is going to boot us with default command line, | 328 | * In case LILO is going to boot us with default command line, |
328 | * it prepends "auto" before the whole cmdline which makes | 329 | * it prepends "auto" before the whole cmdline which makes |
329 | * the shell think it should execute a script with such a name. | 330 | * the shell think it should execute a script with such a name. |
330 | * So we ignore all arguments entered _before_ init=... [MJ] | 331 | * So we ignore all arguments entered _before_ init=... [MJ] |
331 | */ | 332 | */ |
332 | for (i = 1; i < MAX_INIT_ARGS; i++) | 333 | for (i = 1; i < MAX_INIT_ARGS; i++) |
333 | argv_init[i] = NULL; | 334 | argv_init[i] = NULL; |
334 | return 1; | 335 | return 1; |
335 | } | 336 | } |
336 | __setup("init=", init_setup); | 337 | __setup("init=", init_setup); |
337 | 338 | ||
338 | static int __init rdinit_setup(char *str) | 339 | static int __init rdinit_setup(char *str) |
339 | { | 340 | { |
340 | unsigned int i; | 341 | unsigned int i; |
341 | 342 | ||
342 | ramdisk_execute_command = str; | 343 | ramdisk_execute_command = str; |
343 | /* See "auto" comment in init_setup */ | 344 | /* See "auto" comment in init_setup */ |
344 | for (i = 1; i < MAX_INIT_ARGS; i++) | 345 | for (i = 1; i < MAX_INIT_ARGS; i++) |
345 | argv_init[i] = NULL; | 346 | argv_init[i] = NULL; |
346 | return 1; | 347 | return 1; |
347 | } | 348 | } |
348 | __setup("rdinit=", rdinit_setup); | 349 | __setup("rdinit=", rdinit_setup); |
349 | 350 | ||
350 | #ifndef CONFIG_SMP | 351 | #ifndef CONFIG_SMP |
351 | static const unsigned int setup_max_cpus = NR_CPUS; | 352 | static const unsigned int setup_max_cpus = NR_CPUS; |
352 | #ifdef CONFIG_X86_LOCAL_APIC | 353 | #ifdef CONFIG_X86_LOCAL_APIC |
353 | static void __init smp_init(void) | 354 | static void __init smp_init(void) |
354 | { | 355 | { |
355 | APIC_init_uniprocessor(); | 356 | APIC_init_uniprocessor(); |
356 | } | 357 | } |
357 | #else | 358 | #else |
358 | #define smp_init() do { } while (0) | 359 | #define smp_init() do { } while (0) |
359 | #endif | 360 | #endif |
360 | 361 | ||
361 | static inline void setup_nr_cpu_ids(void) { } | 362 | static inline void setup_nr_cpu_ids(void) { } |
362 | static inline void smp_prepare_cpus(unsigned int maxcpus) { } | 363 | static inline void smp_prepare_cpus(unsigned int maxcpus) { } |
363 | #endif | 364 | #endif |
364 | 365 | ||
365 | /* | 366 | /* |
366 | * We need to store the untouched command line for future reference. | 367 | * We need to store the untouched command line for future reference. |
367 | * We also need to store the touched command line since the parameter | 368 | * We also need to store the touched command line since the parameter |
368 | * parsing is performed in place, and we should allow a component to | 369 | * parsing is performed in place, and we should allow a component to |
369 | * store a reference to a name/value pair for future use. | 370 | * store a reference to a name/value pair for future use. |
370 | */ | 371 | */ |
371 | static void __init setup_command_line(char *command_line) | 372 | static void __init setup_command_line(char *command_line) |
372 | { | 373 | { |
373 | saved_command_line = | 374 | saved_command_line = |
374 | memblock_virt_alloc(strlen(boot_command_line) + 1, 0); | 375 | memblock_virt_alloc(strlen(boot_command_line) + 1, 0); |
375 | initcall_command_line = | 376 | initcall_command_line = |
376 | memblock_virt_alloc(strlen(boot_command_line) + 1, 0); | 377 | memblock_virt_alloc(strlen(boot_command_line) + 1, 0); |
377 | static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); | 378 | static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); |
378 | strcpy(saved_command_line, boot_command_line); | 379 | strcpy(saved_command_line, boot_command_line); |
379 | strcpy(static_command_line, command_line); | 380 | strcpy(static_command_line, command_line); |
380 | } | 381 | } |
381 | 382 | ||
382 | /* | 383 | /* |
383 | * We need to finalize in a non-__init function or else race conditions | 384 | * We need to finalize in a non-__init function or else race conditions |
384 | * between the root thread and the init thread may cause start_kernel to | 385 | * between the root thread and the init thread may cause start_kernel to |
385 | * be reaped by free_initmem before the root thread has proceeded to | 386 | * be reaped by free_initmem before the root thread has proceeded to |
386 | * cpu_idle. | 387 | * cpu_idle. |
387 | * | 388 | * |
388 | * gcc-3.4 accidentally inlines this function, so use noinline. | 389 | * gcc-3.4 accidentally inlines this function, so use noinline. |
389 | */ | 390 | */ |
390 | 391 | ||
391 | static __initdata DECLARE_COMPLETION(kthreadd_done); | 392 | static __initdata DECLARE_COMPLETION(kthreadd_done); |
392 | 393 | ||
393 | static noinline void __init_refok rest_init(void) | 394 | static noinline void __init_refok rest_init(void) |
394 | { | 395 | { |
395 | int pid; | 396 | int pid; |
396 | 397 | ||
397 | rcu_scheduler_starting(); | 398 | rcu_scheduler_starting(); |
398 | /* | 399 | /* |
399 | * We need to spawn init first so that it obtains pid 1; however, | 400 | * We need to spawn init first so that it obtains pid 1; however, |
400 | * the init task will end up wanting to create kthreads, which, if | 401 | * the init task will end up wanting to create kthreads, which, if |
401 | * we schedule it before we create kthreadd, will OOPS. | 402 | * we schedule it before we create kthreadd, will OOPS. |
402 | */ | 403 | */ |
403 | kernel_thread(kernel_init, NULL, CLONE_FS); | 404 | kernel_thread(kernel_init, NULL, CLONE_FS); |
404 | numa_default_policy(); | 405 | numa_default_policy(); |
405 | pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); | 406 | pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); |
406 | rcu_read_lock(); | 407 | rcu_read_lock(); |
407 | kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); | 408 | kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); |
408 | rcu_read_unlock(); | 409 | rcu_read_unlock(); |
409 | complete(&kthreadd_done); | 410 | complete(&kthreadd_done); |
410 | 411 | ||
411 | /* | 412 | /* |
412 | * The boot idle thread must execute schedule() | 413 | * The boot idle thread must execute schedule() |
413 | * at least once to get things moving: | 414 | * at least once to get things moving: |
414 | */ | 415 | */ |
415 | init_idle_bootup_task(current); | 416 | init_idle_bootup_task(current); |
416 | schedule_preempt_disabled(); | 417 | schedule_preempt_disabled(); |
417 | /* Call into cpu_idle with preempt disabled */ | 418 | /* Call into cpu_idle with preempt disabled */ |
418 | cpu_startup_entry(CPUHP_ONLINE); | 419 | cpu_startup_entry(CPUHP_ONLINE); |
419 | } | 420 | } |
420 | 421 | ||
421 | /* Check for early params. */ | 422 | /* Check for early params. */ |
422 | static int __init do_early_param(char *param, char *val, const char *unused) | 423 | static int __init do_early_param(char *param, char *val, const char *unused) |
423 | { | 424 | { |
424 | const struct obs_kernel_param *p; | 425 | const struct obs_kernel_param *p; |
425 | 426 | ||
426 | for (p = __setup_start; p < __setup_end; p++) { | 427 | for (p = __setup_start; p < __setup_end; p++) { |
427 | if ((p->early && parameq(param, p->str)) || | 428 | if ((p->early && parameq(param, p->str)) || |
428 | (strcmp(param, "console") == 0 && | 429 | (strcmp(param, "console") == 0 && |
429 | strcmp(p->str, "earlycon") == 0) | 430 | strcmp(p->str, "earlycon") == 0) |
430 | ) { | 431 | ) { |
431 | if (p->setup_func(val) != 0) | 432 | if (p->setup_func(val) != 0) |
432 | pr_warn("Malformed early option '%s'\n", param); | 433 | pr_warn("Malformed early option '%s'\n", param); |
433 | } | 434 | } |
434 | } | 435 | } |
435 | /* We accept everything at this stage. */ | 436 | /* We accept everything at this stage. */ |
436 | return 0; | 437 | return 0; |
437 | } | 438 | } |
438 | 439 | ||
439 | void __init parse_early_options(char *cmdline) | 440 | void __init parse_early_options(char *cmdline) |
440 | { | 441 | { |
441 | parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param); | 442 | parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param); |
442 | } | 443 | } |
443 | 444 | ||
444 | /* Arch code calls this early on, or if not, just before other parsing. */ | 445 | /* Arch code calls this early on, or if not, just before other parsing. */ |
445 | void __init parse_early_param(void) | 446 | void __init parse_early_param(void) |
446 | { | 447 | { |
447 | static int done __initdata; | 448 | static int done __initdata; |
448 | static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; | 449 | static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; |
449 | 450 | ||
450 | if (done) | 451 | if (done) |
451 | return; | 452 | return; |
452 | 453 | ||
453 | /* All fall through to do_early_param. */ | 454 | /* All fall through to do_early_param. */ |
454 | strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); | 455 | strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); |
455 | parse_early_options(tmp_cmdline); | 456 | parse_early_options(tmp_cmdline); |
456 | done = 1; | 457 | done = 1; |
457 | } | 458 | } |
458 | 459 | ||
459 | /* | 460 | /* |
460 | * Activate the first processor. | 461 | * Activate the first processor. |
461 | */ | 462 | */ |
462 | 463 | ||
463 | static void __init boot_cpu_init(void) | 464 | static void __init boot_cpu_init(void) |
464 | { | 465 | { |
465 | int cpu = smp_processor_id(); | 466 | int cpu = smp_processor_id(); |
466 | /* Mark the boot cpu "present", "online" etc for SMP and UP case */ | 467 | /* Mark the boot cpu "present", "online" etc for SMP and UP case */ |
467 | set_cpu_online(cpu, true); | 468 | set_cpu_online(cpu, true); |
468 | set_cpu_active(cpu, true); | 469 | set_cpu_active(cpu, true); |
469 | set_cpu_present(cpu, true); | 470 | set_cpu_present(cpu, true); |
470 | set_cpu_possible(cpu, true); | 471 | set_cpu_possible(cpu, true); |
471 | } | 472 | } |
472 | 473 | ||
473 | void __init __weak smp_setup_processor_id(void) | 474 | void __init __weak smp_setup_processor_id(void) |
474 | { | 475 | { |
475 | } | 476 | } |
476 | 477 | ||
477 | # if THREAD_SIZE >= PAGE_SIZE | 478 | # if THREAD_SIZE >= PAGE_SIZE |
478 | void __init __weak thread_info_cache_init(void) | 479 | void __init __weak thread_info_cache_init(void) |
479 | { | 480 | { |
480 | } | 481 | } |
481 | #endif | 482 | #endif |
482 | 483 | ||
483 | /* | 484 | /* |
484 | * Set up kernel memory allocators | 485 | * Set up kernel memory allocators |
485 | */ | 486 | */ |
486 | static void __init mm_init(void) | 487 | static void __init mm_init(void) |
487 | { | 488 | { |
488 | /* | 489 | /* |
489 | * page_cgroup requires contiguous pages, | 490 | * page_cgroup requires contiguous pages, |
490 | * bigger than MAX_ORDER unless SPARSEMEM. | 491 | * bigger than MAX_ORDER unless SPARSEMEM. |
491 | */ | 492 | */ |
492 | page_cgroup_init_flatmem(); | 493 | page_cgroup_init_flatmem(); |
493 | mem_init(); | 494 | mem_init(); |
494 | kmem_cache_init(); | 495 | kmem_cache_init(); |
495 | percpu_init_late(); | 496 | percpu_init_late(); |
496 | pgtable_init(); | 497 | pgtable_init(); |
497 | vmalloc_init(); | 498 | vmalloc_init(); |
498 | } | 499 | } |
499 | 500 | ||
500 | asmlinkage __visible void __init start_kernel(void) | 501 | asmlinkage __visible void __init start_kernel(void) |
501 | { | 502 | { |
502 | char *command_line; | 503 | char *command_line; |
503 | char *after_dashes; | 504 | char *after_dashes; |
504 | 505 | ||
505 | /* | 506 | /* |
506 | * Need to run as early as possible, to initialize the | 507 | * Need to run as early as possible, to initialize the |
507 | * lockdep hash: | 508 | * lockdep hash: |
508 | */ | 509 | */ |
509 | lockdep_init(); | 510 | lockdep_init(); |
510 | set_task_stack_end_magic(&init_task); | 511 | set_task_stack_end_magic(&init_task); |
511 | smp_setup_processor_id(); | 512 | smp_setup_processor_id(); |
512 | debug_objects_early_init(); | 513 | debug_objects_early_init(); |
513 | 514 | ||
514 | /* | 515 | /* |
515 | * Set up the initial canary ASAP: | 516 | * Set up the initial canary ASAP: |
516 | */ | 517 | */ |
517 | boot_init_stack_canary(); | 518 | boot_init_stack_canary(); |
518 | 519 | ||
519 | cgroup_init_early(); | 520 | cgroup_init_early(); |
520 | 521 | ||
521 | local_irq_disable(); | 522 | local_irq_disable(); |
522 | early_boot_irqs_disabled = true; | 523 | early_boot_irqs_disabled = true; |
523 | 524 | ||
524 | /* | 525 | /* |
525 | * Interrupts are still disabled. Do necessary setups, then | 526 | * Interrupts are still disabled. Do necessary setups, then |
526 | * enable them | 527 | * enable them |
527 | */ | 528 | */ |
528 | boot_cpu_init(); | 529 | boot_cpu_init(); |
529 | page_address_init(); | 530 | page_address_init(); |
530 | pr_notice("%s", linux_banner); | 531 | pr_notice("%s", linux_banner); |
531 | setup_arch(&command_line); | 532 | setup_arch(&command_line); |
532 | mm_init_cpumask(&init_mm); | 533 | mm_init_cpumask(&init_mm); |
533 | setup_command_line(command_line); | 534 | setup_command_line(command_line); |
534 | setup_nr_cpu_ids(); | 535 | setup_nr_cpu_ids(); |
535 | setup_per_cpu_areas(); | 536 | setup_per_cpu_areas(); |
536 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ | 537 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ |
537 | 538 | ||
538 | build_all_zonelists(NULL, NULL); | 539 | build_all_zonelists(NULL, NULL); |
539 | page_alloc_init(); | 540 | page_alloc_init(); |
540 | 541 | ||
541 | pr_notice("Kernel command line: %s\n", boot_command_line); | 542 | pr_notice("Kernel command line: %s\n", boot_command_line); |
542 | parse_early_param(); | 543 | parse_early_param(); |
543 | after_dashes = parse_args("Booting kernel", | 544 | after_dashes = parse_args("Booting kernel", |
544 | static_command_line, __start___param, | 545 | static_command_line, __start___param, |
545 | __stop___param - __start___param, | 546 | __stop___param - __start___param, |
546 | -1, -1, &unknown_bootoption); | 547 | -1, -1, &unknown_bootoption); |
547 | if (after_dashes) | 548 | if (after_dashes) |
548 | parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, | 549 | parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, |
549 | set_init_arg); | 550 | set_init_arg); |
550 | 551 | ||
551 | jump_label_init(); | 552 | jump_label_init(); |
552 | 553 | ||
553 | /* | 554 | /* |
554 | * These use large bootmem allocations and must precede | 555 | * These use large bootmem allocations and must precede |
555 | * kmem_cache_init() | 556 | * kmem_cache_init() |
556 | */ | 557 | */ |
557 | setup_log_buf(0); | 558 | setup_log_buf(0); |
558 | pidhash_init(); | 559 | pidhash_init(); |
559 | vfs_caches_init_early(); | 560 | vfs_caches_init_early(); |
560 | sort_main_extable(); | 561 | sort_main_extable(); |
561 | trap_init(); | 562 | trap_init(); |
562 | mm_init(); | 563 | mm_init(); |
563 | 564 | ||
564 | /* | 565 | /* |
565 | * Set up the scheduler prior starting any interrupts (such as the | 566 | * Set up the scheduler prior starting any interrupts (such as the |
566 | * timer interrupt). Full topology setup happens at smp_init() | 567 | * timer interrupt). Full topology setup happens at smp_init() |
567 | * time - but meanwhile we still have a functioning scheduler. | 568 | * time - but meanwhile we still have a functioning scheduler. |
568 | */ | 569 | */ |
569 | sched_init(); | 570 | sched_init(); |
570 | /* | 571 | /* |
571 | * Disable preemption - early bootup scheduling is extremely | 572 | * Disable preemption - early bootup scheduling is extremely |
572 | * fragile until we cpu_idle() for the first time. | 573 | * fragile until we cpu_idle() for the first time. |
573 | */ | 574 | */ |
574 | preempt_disable(); | 575 | preempt_disable(); |
575 | if (WARN(!irqs_disabled(), | 576 | if (WARN(!irqs_disabled(), |
576 | "Interrupts were enabled *very* early, fixing it\n")) | 577 | "Interrupts were enabled *very* early, fixing it\n")) |
577 | local_irq_disable(); | 578 | local_irq_disable(); |
578 | idr_init_cache(); | 579 | idr_init_cache(); |
579 | rcu_init(); | 580 | rcu_init(); |
580 | context_tracking_init(); | 581 | context_tracking_init(); |
581 | radix_tree_init(); | 582 | radix_tree_init(); |
582 | /* init some links before init_ISA_irqs() */ | 583 | /* init some links before init_ISA_irqs() */ |
583 | early_irq_init(); | 584 | early_irq_init(); |
584 | init_IRQ(); | 585 | init_IRQ(); |
585 | tick_init(); | 586 | tick_init(); |
586 | rcu_init_nohz(); | 587 | rcu_init_nohz(); |
587 | init_timers(); | 588 | init_timers(); |
588 | hrtimers_init(); | 589 | hrtimers_init(); |
589 | softirq_init(); | 590 | softirq_init(); |
590 | timekeeping_init(); | 591 | timekeeping_init(); |
591 | time_init(); | 592 | time_init(); |
592 | sched_clock_postinit(); | 593 | sched_clock_postinit(); |
593 | perf_event_init(); | 594 | perf_event_init(); |
594 | profile_init(); | 595 | profile_init(); |
595 | call_function_init(); | 596 | call_function_init(); |
596 | WARN(!irqs_disabled(), "Interrupts were enabled early\n"); | 597 | WARN(!irqs_disabled(), "Interrupts were enabled early\n"); |
597 | early_boot_irqs_disabled = false; | 598 | early_boot_irqs_disabled = false; |
598 | local_irq_enable(); | 599 | local_irq_enable(); |
599 | 600 | ||
600 | kmem_cache_init_late(); | 601 | kmem_cache_init_late(); |
601 | 602 | ||
602 | /* | 603 | /* |
603 | * HACK ALERT! This is early. We're enabling the console before | 604 | * HACK ALERT! This is early. We're enabling the console before |
604 | * we've done PCI setups etc, and console_init() must be aware of | 605 | * we've done PCI setups etc, and console_init() must be aware of |
605 | * this. But we do want output early, in case something goes wrong. | 606 | * this. But we do want output early, in case something goes wrong. |
606 | */ | 607 | */ |
607 | console_init(); | 608 | console_init(); |
608 | if (panic_later) | 609 | if (panic_later) |
609 | panic("Too many boot %s vars at `%s'", panic_later, | 610 | panic("Too many boot %s vars at `%s'", panic_later, |
610 | panic_param); | 611 | panic_param); |
611 | 612 | ||
612 | lockdep_info(); | 613 | lockdep_info(); |
613 | 614 | ||
614 | /* | 615 | /* |
615 | * Need to run this when irqs are enabled, because it wants | 616 | * Need to run this when irqs are enabled, because it wants |
616 | * to self-test [hard/soft]-irqs on/off lock inversion bugs | 617 | * to self-test [hard/soft]-irqs on/off lock inversion bugs |
617 | * too: | 618 | * too: |
618 | */ | 619 | */ |
619 | locking_selftest(); | 620 | locking_selftest(); |
620 | 621 | ||
621 | #ifdef CONFIG_BLK_DEV_INITRD | 622 | #ifdef CONFIG_BLK_DEV_INITRD |
622 | if (initrd_start && !initrd_below_start_ok && | 623 | if (initrd_start && !initrd_below_start_ok && |
623 | page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { | 624 | page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { |
624 | pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n", | 625 | pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n", |
625 | page_to_pfn(virt_to_page((void *)initrd_start)), | 626 | page_to_pfn(virt_to_page((void *)initrd_start)), |
626 | min_low_pfn); | 627 | min_low_pfn); |
627 | initrd_start = 0; | 628 | initrd_start = 0; |
628 | } | 629 | } |
629 | #endif | 630 | #endif |
630 | page_cgroup_init(); | 631 | page_cgroup_init(); |
631 | debug_objects_mem_init(); | 632 | debug_objects_mem_init(); |
632 | kmemleak_init(); | 633 | kmemleak_init(); |
633 | setup_per_cpu_pageset(); | 634 | setup_per_cpu_pageset(); |
634 | numa_policy_init(); | 635 | numa_policy_init(); |
635 | if (late_time_init) | 636 | if (late_time_init) |
636 | late_time_init(); | 637 | late_time_init(); |
637 | sched_clock_init(); | 638 | sched_clock_init(); |
638 | calibrate_delay(); | 639 | calibrate_delay(); |
639 | pidmap_init(); | 640 | pidmap_init(); |
640 | anon_vma_init(); | 641 | anon_vma_init(); |
641 | acpi_early_init(); | 642 | acpi_early_init(); |
642 | #ifdef CONFIG_X86 | 643 | #ifdef CONFIG_X86 |
643 | if (efi_enabled(EFI_RUNTIME_SERVICES)) | 644 | if (efi_enabled(EFI_RUNTIME_SERVICES)) |
644 | efi_enter_virtual_mode(); | 645 | efi_enter_virtual_mode(); |
645 | #endif | 646 | #endif |
646 | #ifdef CONFIG_X86_ESPFIX64 | 647 | #ifdef CONFIG_X86_ESPFIX64 |
647 | /* Should be run before the first non-init thread is created */ | 648 | /* Should be run before the first non-init thread is created */ |
648 | init_espfix_bsp(); | 649 | init_espfix_bsp(); |
649 | #endif | 650 | #endif |
650 | thread_info_cache_init(); | 651 | thread_info_cache_init(); |
651 | cred_init(); | 652 | cred_init(); |
652 | fork_init(totalram_pages); | 653 | fork_init(totalram_pages); |
653 | proc_caches_init(); | 654 | proc_caches_init(); |
654 | buffer_init(); | 655 | buffer_init(); |
655 | key_init(); | 656 | key_init(); |
656 | security_init(); | 657 | security_init(); |
657 | dbg_late_init(); | 658 | dbg_late_init(); |
658 | vfs_caches_init(totalram_pages); | 659 | vfs_caches_init(totalram_pages); |
659 | signals_init(); | 660 | signals_init(); |
660 | /* rootfs populating might need page-writeback */ | 661 | /* rootfs populating might need page-writeback */ |
661 | page_writeback_init(); | 662 | page_writeback_init(); |
662 | proc_root_init(); | 663 | proc_root_init(); |
664 | nsfs_init(); | ||
663 | cgroup_init(); | 665 | cgroup_init(); |
664 | cpuset_init(); | 666 | cpuset_init(); |
665 | taskstats_init_early(); | 667 | taskstats_init_early(); |
666 | delayacct_init(); | 668 | delayacct_init(); |
667 | 669 | ||
668 | check_bugs(); | 670 | check_bugs(); |
669 | 671 | ||
670 | sfi_init_late(); | 672 | sfi_init_late(); |
671 | 673 | ||
672 | if (efi_enabled(EFI_RUNTIME_SERVICES)) { | 674 | if (efi_enabled(EFI_RUNTIME_SERVICES)) { |
673 | efi_late_init(); | 675 | efi_late_init(); |
674 | efi_free_boot_services(); | 676 | efi_free_boot_services(); |
675 | } | 677 | } |
676 | 678 | ||
677 | ftrace_init(); | 679 | ftrace_init(); |
678 | 680 | ||
679 | /* Do the rest non-__init'ed, we're now alive */ | 681 | /* Do the rest non-__init'ed, we're now alive */ |
680 | rest_init(); | 682 | rest_init(); |
681 | } | 683 | } |
682 | 684 | ||
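
Note the ordering of the new call above: nsfs_init() runs right after proc_root_init(), so the nsfs superblock is in place before any /proc/*/ns/* symlink can be followed. A rough sketch of what such an init routine looks like, assuming the usual kern_mount() pattern for a pseudo-filesystem that is deliberately never registered (the real body lives in fs/nsfs.c, in another hunk of this commit):

/* Sketch only, under the assumptions above; nsfs_mount is a
 * hypothetical mount helper, not a name taken from this commit. */
static struct file_system_type nsfs = {
	.name	= "nsfs",
	.mount	= nsfs_mount,
};

static struct vfsmount *nsfs_mnt;

void __init nsfs_init(void)
{
	/* kern_mount() only -- the type is never passed to
	 * register_filesystem(), so userspace cannot mount it directly. */
	nsfs_mnt = kern_mount(&nsfs);
	if (IS_ERR(nsfs_mnt))
		panic("can't set nsfs up\n");
}
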
683 | /* Call all constructor functions linked into the kernel. */ | 685 | /* Call all constructor functions linked into the kernel. */ |
684 | static void __init do_ctors(void) | 686 | static void __init do_ctors(void) |
685 | { | 687 | { |
686 | #ifdef CONFIG_CONSTRUCTORS | 688 | #ifdef CONFIG_CONSTRUCTORS |
687 | ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; | 689 | ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; |
688 | 690 | ||
689 | for (; fn < (ctor_fn_t *) __ctors_end; fn++) | 691 | for (; fn < (ctor_fn_t *) __ctors_end; fn++) |
690 | (*fn)(); | 692 | (*fn)(); |
691 | #endif | 693 | #endif |
692 | } | 694 | } |
693 | 695 | ||
694 | bool initcall_debug; | 696 | bool initcall_debug; |
695 | core_param(initcall_debug, initcall_debug, bool, 0644); | 697 | core_param(initcall_debug, initcall_debug, bool, 0644); |
696 | 698 | ||
697 | #ifdef CONFIG_KALLSYMS | 699 | #ifdef CONFIG_KALLSYMS |
698 | struct blacklist_entry { | 700 | struct blacklist_entry { |
699 | struct list_head next; | 701 | struct list_head next; |
700 | char *buf; | 702 | char *buf; |
701 | }; | 703 | }; |
702 | 704 | ||
703 | static __initdata_or_module LIST_HEAD(blacklisted_initcalls); | 705 | static __initdata_or_module LIST_HEAD(blacklisted_initcalls); |
704 | 706 | ||
705 | static int __init initcall_blacklist(char *str) | 707 | static int __init initcall_blacklist(char *str) |
706 | { | 708 | { |
707 | char *str_entry; | 709 | char *str_entry; |
708 | struct blacklist_entry *entry; | 710 | struct blacklist_entry *entry; |
709 | 711 | ||
710 | /* str argument is a comma-separated list of functions */ | 712 | /* str argument is a comma-separated list of functions */ |
711 | do { | 713 | do { |
712 | str_entry = strsep(&str, ","); | 714 | str_entry = strsep(&str, ","); |
713 | if (str_entry) { | 715 | if (str_entry) { |
714 | pr_debug("blacklisting initcall %s\n", str_entry); | 716 | pr_debug("blacklisting initcall %s\n", str_entry); |
715 | entry = alloc_bootmem(sizeof(*entry)); | 717 | entry = alloc_bootmem(sizeof(*entry)); |
716 | entry->buf = alloc_bootmem(strlen(str_entry) + 1); | 718 | entry->buf = alloc_bootmem(strlen(str_entry) + 1); |
717 | strcpy(entry->buf, str_entry); | 719 | strcpy(entry->buf, str_entry); |
718 | list_add(&entry->next, &blacklisted_initcalls); | 720 | list_add(&entry->next, &blacklisted_initcalls); |
719 | } | 721 | } |
720 | } while (str_entry); | 722 | } while (str_entry); |
721 | 723 | ||
722 | return 0; | 724 | return 0; |
723 | } | 725 | } |
724 | 726 | ||
725 | static bool __init_or_module initcall_blacklisted(initcall_t fn) | 727 | static bool __init_or_module initcall_blacklisted(initcall_t fn) |
726 | { | 728 | { |
727 | struct list_head *tmp; | 729 | struct list_head *tmp; |
728 | struct blacklist_entry *entry; | 730 | struct blacklist_entry *entry; |
729 | char *fn_name; | 731 | char *fn_name; |
730 | 732 | ||
731 | fn_name = kasprintf(GFP_KERNEL, "%pf", fn); | 733 | fn_name = kasprintf(GFP_KERNEL, "%pf", fn); |
732 | if (!fn_name) | 734 | if (!fn_name) |
733 | return false; | 735 | return false; |
734 | 736 | ||
735 | list_for_each(tmp, &blacklisted_initcalls) { | 737 | list_for_each(tmp, &blacklisted_initcalls) { |
736 | entry = list_entry(tmp, struct blacklist_entry, next); | 738 | entry = list_entry(tmp, struct blacklist_entry, next); |
737 | if (!strcmp(fn_name, entry->buf)) { | 739 | if (!strcmp(fn_name, entry->buf)) { |
738 | pr_debug("initcall %s blacklisted\n", fn_name); | 740 | pr_debug("initcall %s blacklisted\n", fn_name); |
739 | kfree(fn_name); | 741 | kfree(fn_name); |
740 | return true; | 742 | return true; |
741 | } | 743 | } |
742 | } | 744 | } |
743 | 745 | ||
744 | kfree(fn_name); | 746 | kfree(fn_name); |
745 | return false; | 747 | return false; |
746 | } | 748 | } |
747 | #else | 749 | #else |
748 | static int __init initcall_blacklist(char *str) | 750 | static int __init initcall_blacklist(char *str) |
749 | { | 751 | { |
750 | pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); | 752 | pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); |
751 | return 0; | 753 | return 0; |
752 | } | 754 | } |
753 | 755 | ||
754 | static bool __init_or_module initcall_blacklisted(initcall_t fn) | 756 | static bool __init_or_module initcall_blacklisted(initcall_t fn) |
755 | { | 757 | { |
756 | return false; | 758 | return false; |
757 | } | 759 | } |
758 | #endif | 760 | #endif |
759 | __setup("initcall_blacklist=", initcall_blacklist); | 761 | __setup("initcall_blacklist=", initcall_blacklist); |
760 | 762 | ||
761 | static int __init_or_module do_one_initcall_debug(initcall_t fn) | 763 | static int __init_or_module do_one_initcall_debug(initcall_t fn) |
762 | { | 764 | { |
763 | ktime_t calltime, delta, rettime; | 765 | ktime_t calltime, delta, rettime; |
764 | unsigned long long duration; | 766 | unsigned long long duration; |
765 | int ret; | 767 | int ret; |
766 | 768 | ||
767 | printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); | 769 | printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); |
768 | calltime = ktime_get(); | 770 | calltime = ktime_get(); |
769 | ret = fn(); | 771 | ret = fn(); |
770 | rettime = ktime_get(); | 772 | rettime = ktime_get(); |
771 | delta = ktime_sub(rettime, calltime); | 773 | delta = ktime_sub(rettime, calltime); |
772 | duration = (unsigned long long) ktime_to_ns(delta) >> 10; | 774 | duration = (unsigned long long) ktime_to_ns(delta) >> 10; |
773 | printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", | 775 | printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", |
774 | fn, ret, duration); | 776 | fn, ret, duration); |
775 | 777 | ||
776 | return ret; | 778 | return ret; |
777 | } | 779 | } |
778 | 780 | ||
779 | int __init_or_module do_one_initcall(initcall_t fn) | 781 | int __init_or_module do_one_initcall(initcall_t fn) |
780 | { | 782 | { |
781 | int count = preempt_count(); | 783 | int count = preempt_count(); |
782 | int ret; | 784 | int ret; |
783 | char msgbuf[64]; | 785 | char msgbuf[64]; |
784 | 786 | ||
785 | if (initcall_blacklisted(fn)) | 787 | if (initcall_blacklisted(fn)) |
786 | return -EPERM; | 788 | return -EPERM; |
787 | 789 | ||
788 | if (initcall_debug) | 790 | if (initcall_debug) |
789 | ret = do_one_initcall_debug(fn); | 791 | ret = do_one_initcall_debug(fn); |
790 | else | 792 | else |
791 | ret = fn(); | 793 | ret = fn(); |
792 | 794 | ||
793 | msgbuf[0] = 0; | 795 | msgbuf[0] = 0; |
794 | 796 | ||
795 | if (preempt_count() != count) { | 797 | if (preempt_count() != count) { |
796 | sprintf(msgbuf, "preemption imbalance "); | 798 | sprintf(msgbuf, "preemption imbalance "); |
797 | preempt_count_set(count); | 799 | preempt_count_set(count); |
798 | } | 800 | } |
799 | if (irqs_disabled()) { | 801 | if (irqs_disabled()) { |
800 | strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); | 802 | strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); |
801 | local_irq_enable(); | 803 | local_irq_enable(); |
802 | } | 804 | } |
803 | WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); | 805 | WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); |
804 | 806 | ||
805 | return ret; | 807 | return ret; |
806 | } | 808 | } |
807 | 809 | ||
808 | 810 | ||
809 | extern initcall_t __initcall_start[]; | 811 | extern initcall_t __initcall_start[]; |
810 | extern initcall_t __initcall0_start[]; | 812 | extern initcall_t __initcall0_start[]; |
811 | extern initcall_t __initcall1_start[]; | 813 | extern initcall_t __initcall1_start[]; |
812 | extern initcall_t __initcall2_start[]; | 814 | extern initcall_t __initcall2_start[]; |
813 | extern initcall_t __initcall3_start[]; | 815 | extern initcall_t __initcall3_start[]; |
814 | extern initcall_t __initcall4_start[]; | 816 | extern initcall_t __initcall4_start[]; |
815 | extern initcall_t __initcall5_start[]; | 817 | extern initcall_t __initcall5_start[]; |
816 | extern initcall_t __initcall6_start[]; | 818 | extern initcall_t __initcall6_start[]; |
817 | extern initcall_t __initcall7_start[]; | 819 | extern initcall_t __initcall7_start[]; |
818 | extern initcall_t __initcall_end[]; | 820 | extern initcall_t __initcall_end[]; |
819 | 821 | ||
820 | static initcall_t *initcall_levels[] __initdata = { | 822 | static initcall_t *initcall_levels[] __initdata = { |
821 | __initcall0_start, | 823 | __initcall0_start, |
822 | __initcall1_start, | 824 | __initcall1_start, |
823 | __initcall2_start, | 825 | __initcall2_start, |
824 | __initcall3_start, | 826 | __initcall3_start, |
825 | __initcall4_start, | 827 | __initcall4_start, |
826 | __initcall5_start, | 828 | __initcall5_start, |
827 | __initcall6_start, | 829 | __initcall6_start, |
828 | __initcall7_start, | 830 | __initcall7_start, |
829 | __initcall_end, | 831 | __initcall_end, |
830 | }; | 832 | }; |
831 | 833 | ||
832 | /* Keep these in sync with initcalls in include/linux/init.h */ | 834 | /* Keep these in sync with initcalls in include/linux/init.h */ |
833 | static char *initcall_level_names[] __initdata = { | 835 | static char *initcall_level_names[] __initdata = { |
834 | "early", | 836 | "early", |
835 | "core", | 837 | "core", |
836 | "postcore", | 838 | "postcore", |
837 | "arch", | 839 | "arch", |
838 | "subsys", | 840 | "subsys", |
839 | "fs", | 841 | "fs", |
840 | "device", | 842 | "device", |
841 | "late", | 843 | "late", |
842 | }; | 844 | }; |
843 | 845 | ||
844 | static void __init do_initcall_level(int level) | 846 | static void __init do_initcall_level(int level) |
845 | { | 847 | { |
846 | initcall_t *fn; | 848 | initcall_t *fn; |
847 | 849 | ||
848 | strcpy(initcall_command_line, saved_command_line); | 850 | strcpy(initcall_command_line, saved_command_line); |
849 | parse_args(initcall_level_names[level], | 851 | parse_args(initcall_level_names[level], |
850 | initcall_command_line, __start___param, | 852 | initcall_command_line, __start___param, |
851 | __stop___param - __start___param, | 853 | __stop___param - __start___param, |
852 | level, level, | 854 | level, level, |
853 | &repair_env_string); | 855 | &repair_env_string); |
854 | 856 | ||
855 | for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) | 857 | for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) |
856 | do_one_initcall(*fn); | 858 | do_one_initcall(*fn); |
857 | } | 859 | } |
858 | 860 | ||
859 | static void __init do_initcalls(void) | 861 | static void __init do_initcalls(void) |
860 | { | 862 | { |
861 | int level; | 863 | int level; |
862 | 864 | ||
863 | for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) | 865 | for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) |
864 | do_initcall_level(level); | 866 | do_initcall_level(level); |
865 | } | 867 | } |
866 | 868 | ||
867 | /* | 869 | /* |
868 | * Ok, the machine is now initialized. None of the devices | 870 | * Ok, the machine is now initialized. None of the devices |
869 | * have been touched yet, but the CPU subsystem is up and | 871 | * have been touched yet, but the CPU subsystem is up and |
870 | * running, and memory and process management works. | 872 | * running, and memory and process management works. |
871 | * | 873 | * |
872 | * Now we can finally start doing some real work.. | 874 | * Now we can finally start doing some real work.. |
873 | */ | 875 | */ |
874 | static void __init do_basic_setup(void) | 876 | static void __init do_basic_setup(void) |
875 | { | 877 | { |
876 | cpuset_init_smp(); | 878 | cpuset_init_smp(); |
877 | usermodehelper_init(); | 879 | usermodehelper_init(); |
878 | shmem_init(); | 880 | shmem_init(); |
879 | driver_init(); | 881 | driver_init(); |
880 | init_irq_proc(); | 882 | init_irq_proc(); |
881 | do_ctors(); | 883 | do_ctors(); |
882 | usermodehelper_enable(); | 884 | usermodehelper_enable(); |
883 | do_initcalls(); | 885 | do_initcalls(); |
884 | random_int_secret_init(); | 886 | random_int_secret_init(); |
885 | } | 887 | } |
886 | 888 | ||
887 | static void __init do_pre_smp_initcalls(void) | 889 | static void __init do_pre_smp_initcalls(void) |
888 | { | 890 | { |
889 | initcall_t *fn; | 891 | initcall_t *fn; |
890 | 892 | ||
891 | for (fn = __initcall_start; fn < __initcall0_start; fn++) | 893 | for (fn = __initcall_start; fn < __initcall0_start; fn++) |
892 | do_one_initcall(*fn); | 894 | do_one_initcall(*fn); |
893 | } | 895 | } |
894 | 896 | ||
895 | /* | 897 | /* |
896 | * This function requests modules which should be loaded by default and is | 898 | * This function requests modules which should be loaded by default and is |
897 | * called twice: right after initrd is mounted, and right before init is | 899 | * called twice: right after initrd is mounted, and right before init is |
898 | * exec'd. If such modules are on either initrd or rootfs, they will be | 900 | * exec'd. If such modules are on either initrd or rootfs, they will be |
899 | * loaded before control is passed to userland. | 901 | * loaded before control is passed to userland. |
900 | */ | 902 | */ |
901 | void __init load_default_modules(void) | 903 | void __init load_default_modules(void) |
902 | { | 904 | { |
903 | load_default_elevator_module(); | 905 | load_default_elevator_module(); |
904 | } | 906 | } |
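At this point the only default request is the I/O scheduler named by elevator= on the command line. For a feel of what such a hook does, here is a hedged sketch loosely modelled on load_default_elevator_module() in block/elevator.c (chosen_elevator, elv_list_lock and elevator_find() are assumed from that file):

void __init load_default_elevator_module(void)
{
	struct elevator_type *e;

	if (!chosen_elevator[0])	/* no elevator= given */
		return;

	spin_lock(&elv_list_lock);
	e = elevator_find(chosen_elevator);
	spin_unlock(&elv_list_lock);

	/* Not built in: ask kmod to load e.g. "cfq-iosched" from initrd/rootfs. */
	if (!e)
		request_module("%s-iosched", chosen_elevator);
}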
905 | 907 | ||
906 | static int run_init_process(const char *init_filename) | 908 | static int run_init_process(const char *init_filename) |
907 | { | 909 | { |
908 | argv_init[0] = init_filename; | 910 | argv_init[0] = init_filename; |
909 | return do_execve(getname_kernel(init_filename), | 911 | return do_execve(getname_kernel(init_filename), |
910 | (const char __user *const __user *)argv_init, | 912 | (const char __user *const __user *)argv_init, |
911 | (const char __user *const __user *)envp_init); | 913 | (const char __user *const __user *)envp_init); |
912 | } | 914 | } |
913 | 915 | ||
914 | static int try_to_run_init_process(const char *init_filename) | 916 | static int try_to_run_init_process(const char *init_filename) |
915 | { | 917 | { |
916 | int ret; | 918 | int ret; |
917 | 919 | ||
918 | ret = run_init_process(init_filename); | 920 | ret = run_init_process(init_filename); |
919 | 921 | ||
920 | if (ret && ret != -ENOENT) { | 922 | if (ret && ret != -ENOENT) { |
921 | pr_err("Starting init: %s exists but couldn't execute it (error %d)\n", | 923 | pr_err("Starting init: %s exists but couldn't execute it (error %d)\n", |
922 | init_filename, ret); | 924 | init_filename, ret); |
923 | } | 925 | } |
924 | 926 | ||
925 | return ret; | 927 | return ret; |
926 | } | 928 | } |
927 | 929 | ||
928 | static noinline void __init kernel_init_freeable(void); | 930 | static noinline void __init kernel_init_freeable(void); |
929 | 931 | ||
930 | static int __ref kernel_init(void *unused) | 932 | static int __ref kernel_init(void *unused) |
931 | { | 933 | { |
932 | int ret; | 934 | int ret; |
933 | 935 | ||
934 | kernel_init_freeable(); | 936 | kernel_init_freeable(); |
935 | /* need to finish all async __init code before freeing the memory */ | 937 | /* need to finish all async __init code before freeing the memory */ |
936 | async_synchronize_full(); | 938 | async_synchronize_full(); |
937 | free_initmem(); | 939 | free_initmem(); |
938 | mark_rodata_ro(); | 940 | mark_rodata_ro(); |
939 | system_state = SYSTEM_RUNNING; | 941 | system_state = SYSTEM_RUNNING; |
940 | numa_default_policy(); | 942 | numa_default_policy(); |
941 | 943 | ||
942 | flush_delayed_fput(); | 944 | flush_delayed_fput(); |
943 | 945 | ||
944 | if (ramdisk_execute_command) { | 946 | if (ramdisk_execute_command) { |
945 | ret = run_init_process(ramdisk_execute_command); | 947 | ret = run_init_process(ramdisk_execute_command); |
946 | if (!ret) | 948 | if (!ret) |
947 | return 0; | 949 | return 0; |
948 | pr_err("Failed to execute %s (error %d)\n", | 950 | pr_err("Failed to execute %s (error %d)\n", |
949 | ramdisk_execute_command, ret); | 951 | ramdisk_execute_command, ret); |
950 | } | 952 | } |
951 | 953 | ||
952 | /* | 954 | /* |
953 | * We try each of these until one succeeds. | 955 | * We try each of these until one succeeds. |
954 | * | 956 | * |
955 | * The Bourne shell can be used instead of init if we are | 957 | * The Bourne shell can be used instead of init if we are |
956 | * trying to recover a really broken machine. | 958 | * trying to recover a really broken machine. |
957 | */ | 959 | */ |
958 | if (execute_command) { | 960 | if (execute_command) { |
959 | ret = run_init_process(execute_command); | 961 | ret = run_init_process(execute_command); |
960 | if (!ret) | 962 | if (!ret) |
961 | return 0; | 963 | return 0; |
962 | pr_err("Failed to execute %s (error %d). Attempting defaults...\n", | 964 | pr_err("Failed to execute %s (error %d). Attempting defaults...\n", |
963 | execute_command, ret); | 965 | execute_command, ret); |
964 | } | 966 | } |
965 | if (!try_to_run_init_process("/sbin/init") || | 967 | if (!try_to_run_init_process("/sbin/init") || |
966 | !try_to_run_init_process("/etc/init") || | 968 | !try_to_run_init_process("/etc/init") || |
967 | !try_to_run_init_process("/bin/init") || | 969 | !try_to_run_init_process("/bin/init") || |
968 | !try_to_run_init_process("/bin/sh")) | 970 | !try_to_run_init_process("/bin/sh")) |
969 | return 0; | 971 | return 0; |
970 | 972 | ||
971 | panic("No working init found. Try passing init= option to kernel. " | 973 | panic("No working init found. Try passing init= option to kernel. " |
972 | "See Linux Documentation/init.txt for guidance."); | 974 | "See Linux Documentation/init.txt for guidance."); |
973 | } | 975 | } |
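Both execute_command and ramdisk_execute_command are filled in from the kernel command line well before this point (init= and rdinit= respectively). A condensed sketch of the init= handler, based on init_setup() earlier in init/main.c:

static int __init init_setup(char *str)
{
	unsigned int i;

	execute_command = str;
	/* Drop arguments seen before init= so that a word a bootloader
	 * prepends to the command line (e.g. LILO's "auto") is not
	 * handed to the new init as argv[1]. */
	for (i = 1; i < MAX_INIT_ARGS; i++)
		argv_init[i] = NULL;
	return 1;
}
__setup("init=", init_setup);

So booting with init=/bin/sh points execute_command at "/bin/sh", and the /sbin/init fallback chain above is consulted only if that exec fails.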
974 | 976 | ||
975 | static noinline void __init kernel_init_freeable(void) | 977 | static noinline void __init kernel_init_freeable(void) |
976 | { | 978 | { |
977 | /* | 979 | /* |
979 | * Wait until kthreadd is fully set up. | 981 | * Wait until kthreadd is fully set up. |
979 | */ | 981 | */ |
980 | wait_for_completion(&kthreadd_done); | 982 | wait_for_completion(&kthreadd_done); |
981 | 983 | ||
982 | /* Now the scheduler is fully set up and can do blocking allocations */ | 984 | /* Now the scheduler is fully set up and can do blocking allocations */ |
983 | gfp_allowed_mask = __GFP_BITS_MASK; | 985 | gfp_allowed_mask = __GFP_BITS_MASK; |
984 | 986 | ||
985 | /* | 987 | /* |
986 | * init can allocate pages on any node | 988 | * init can allocate pages on any node |
987 | */ | 989 | */ |
988 | set_mems_allowed(node_states[N_MEMORY]); | 990 | set_mems_allowed(node_states[N_MEMORY]); |
989 | /* | 991 | /* |
990 | * init can run on any cpu. | 992 | * init can run on any cpu. |
991 | */ | 993 | */ |
992 | set_cpus_allowed_ptr(current, cpu_all_mask); | 994 | set_cpus_allowed_ptr(current, cpu_all_mask); |
993 | 995 | ||
994 | cad_pid = task_pid(current); | 996 | cad_pid = task_pid(current); |
995 | 997 | ||
996 | smp_prepare_cpus(setup_max_cpus); | 998 | smp_prepare_cpus(setup_max_cpus); |
997 | 999 | ||
998 | do_pre_smp_initcalls(); | 1000 | do_pre_smp_initcalls(); |
999 | lockup_detector_init(); | 1001 | lockup_detector_init(); |
1000 | 1002 | ||
1001 | smp_init(); | 1003 | smp_init(); |
1002 | sched_init_smp(); | 1004 | sched_init_smp(); |
1003 | 1005 | ||
1004 | do_basic_setup(); | 1006 | do_basic_setup(); |
1005 | 1007 | ||
1007 | /* Open /dev/console on the rootfs; this should never fail */ | 1009 | /* Open /dev/console on the rootfs; this should never fail */ |
1007 | if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) | 1009 | if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) |
1008 | pr_err("Warning: unable to open an initial console.\n"); | 1010 | pr_err("Warning: unable to open an initial console.\n"); |
1009 | 1011 | ||
1010 | (void) sys_dup(0); | 1012 | (void) sys_dup(0); |
1011 | (void) sys_dup(0); | 1013 | (void) sys_dup(0); |
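Because the freshly forked init task starts with no open files, the sys_open() above lands on descriptor 0 and the two sys_dup(0) calls fill descriptors 1 and 2, giving init the conventional stdin/stdout/stderr on the console. The userspace analogue (illustrative only; it assumes fds 0-2 start out closed, as they do here):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/console", O_RDWR);	/* takes slot 0 */
	if (fd < 0)
		return 1;
	dup(fd);	/* next free slot: 1 (stdout) */
	dup(fd);	/* next free slot: 2 (stderr) */
	return 0;
}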
1012 | /* | 1014 | /* |
1013 | * Check if there is an early userspace init. If yes, let it do all | 1015 | * Check if there is an early userspace init. If yes, let it do all |
1014 | * the work. | 1016 | * the work. |
1015 | */ | 1017 | */ |
1016 | 1018 | ||
1017 | if (!ramdisk_execute_command) | 1019 | if (!ramdisk_execute_command) |
1018 | ramdisk_execute_command = "/init"; | 1020 | ramdisk_execute_command = "/init"; |
1019 | 1021 | ||
1020 | if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { | 1022 | if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { |
1021 | ramdisk_execute_command = NULL; | 1023 | ramdisk_execute_command = NULL; |
1022 | prepare_namespace(); | 1024 | prepare_namespace(); |
1023 | } | 1025 | } |
1024 | 1026 | ||
1025 | /* | 1027 | /* |
1026 | * Ok, we have completed the initial bootup, and | 1028 | * Ok, we have completed the initial bootup, and |
1027 | * we're essentially up and running. Get rid of the | 1029 | * we're essentially up and running. Get rid of the |
1028 | * initmem segments and start the user-mode stuff... | 1030 | * initmem segments and start the user-mode stuff... |
1029 | */ | 1031 | */ |
1030 | 1032 | ||
1031 | /* rootfs is available now, try loading default modules */ | 1033 | /* rootfs is available now, try loading default modules */ |
1032 | load_default_modules(); | 1034 | load_default_modules(); |
1033 | } | 1035 | } |
1034 | 1036 |