Commit 7eafd7c74c3f2e67c27621b987b28397110d643f

Authored by Serge E. Hallyn
Committed by Linus Torvalds
1 parent 614b84cf4e

namespaces: ipc namespaces: implement support for posix msqueues

Implement multiple mounts of the mqueue file system, and link it to usage
of CLONE_NEWIPC.

Each ipc ns has a corresponding mqueuefs superblock.  When a user does
clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the unshare will cause an
internal mount of a new mqueuefs sb linked to the new ipc ns.

When a user does 'mount -t mqueue mqueue /dev/mqueue', he mounts the
mqueuefs superblock belonging to his current ipc namespace.

POSIX message queues can be worked with both through the mq_* system calls
(see mq_overview(7)), and through the VFS via the mqueue mount.  Any
usage of mq_open() and friends will work with the acting task's ipc
namespace.  Any actions through the VFS will work with the mqueuefs in
which the file was created.  So if a user doesn't remount mqueuefs after
unshare(CLONE_NEWIPC), mq_open("/ab") will not be reflected in "ls
/dev/mqueue".

If task a mounts mqueue for ipc_ns:1, then clones task b with a new ipcns,
ipcns:2, and then task a is the last task in ipc_ns:1 to exit, then (1)
ipc_ns:1 will be freed, (2) its superblock will live on until task b
umounts the corresponding mqueuefs, and vfs actions will continue to
succeed, but (3) sb->s_fs_info will be NULL for the sb corresponding to
the deceased ipc_ns:1.

To make this happen, we must protect the ipc reference count when

a) a task exits and drops its ipcns->count, since it might be dropping
   it to 0 and freeing the ipcns

b) a task accesses the ipcns through its mqueuefs interface, since it
   bumps the ipcns refcount and might race with the last task in the ipcns
   exiting.

So the kref is changed to an atomic_t so we can use
atomic_dec_and_lock(&ns->count, &mq_lock), and every access to the ipcns
through ns = mqueuefs_sb->s_fs_info is protected by the same lock.

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 131 additions and 52 deletions Side-by-side Diff

include/linux/ipc_namespace.h
... ... @@ -25,7 +25,7 @@
25 25 };
26 26  
27 27 struct ipc_namespace {
28   - struct kref kref;
  28 + atomic_t count;
29 29 struct ipc_ids ids[3];
30 30  
31 31 int sem_ctls[4];
... ... @@ -61,6 +61,7 @@
61 61 extern struct ipc_namespace init_ipc_ns;
62 62 extern atomic_t nr_ipc_ns;
63 63  
  64 +extern spinlock_t mq_lock;
64 65 #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
65 66 #define INIT_IPC_NS(ns) .ns = &init_ipc_ns,
66 67 #else
67 68  
68 69  
... ... @@ -82,18 +83,18 @@
82 83 #endif /* CONFIG_SYSVIPC */
83 84  
84 85 #ifdef CONFIG_POSIX_MQUEUE
85   -extern void mq_init_ns(struct ipc_namespace *ns);
  86 +extern int mq_init_ns(struct ipc_namespace *ns);
86 87 /* default values */
87 88 #define DFLT_QUEUESMAX 256 /* max number of message queues */
88 89 #define DFLT_MSGMAX 10 /* max number of messages in each queue */
89 90 #define HARD_MSGMAX (131072/sizeof(void *))
90 91 #define DFLT_MSGSIZEMAX 8192 /* max message size */
91 92 #else
92   -#define mq_init_ns(ns) ((void) 0)
  93 +static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
93 94 #endif
94 95  
95 96 #if defined(CONFIG_IPC_NS)
96   -extern void free_ipc_ns(struct kref *kref);
  97 +extern void free_ipc_ns(struct ipc_namespace *ns);
97 98 extern struct ipc_namespace *copy_ipcs(unsigned long flags,
98 99 struct ipc_namespace *ns);
99 100 extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
100 101  
... ... @@ -103,14 +104,11 @@
103 104 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
104 105 {
105 106 if (ns)
106   - kref_get(&ns->kref);
  107 + atomic_inc(&ns->count);
107 108 return ns;
108 109 }
109 110  
110   -static inline void put_ipc_ns(struct ipc_namespace *ns)
111   -{
112   - kref_put(&ns->kref, free_ipc_ns);
113   -}
  111 +extern void put_ipc_ns(struct ipc_namespace *ns);
114 112 #else
115 113 static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
116 114 struct ipc_namespace *ns)
... ... @@ -88,7 +88,6 @@
88 88 static struct super_operations mqueue_super_ops;
89 89 static void remove_notification(struct mqueue_inode_info *info);
90 90  
91   -static spinlock_t mq_lock;
92 91 static struct kmem_cache *mqueue_inode_cachep;
93 92  
94 93 static struct ctl_table_header * mq_sysctl_table;
95 94  
96 95  
97 96  
98 97  
99 98  
... ... @@ -98,27 +97,30 @@
98 97 return container_of(inode, struct mqueue_inode_info, vfs_inode);
99 98 }
100 99  
101   -void mq_init_ns(struct ipc_namespace *ns)
  100 +/*
  101 + * This routine should be called with the mq_lock held.
  102 + */
  103 +static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
102 104 {
103   - ns->mq_queues_count = 0;
104   - ns->mq_queues_max = DFLT_QUEUESMAX;
105   - ns->mq_msg_max = DFLT_MSGMAX;
106   - ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
107   - ns->mq_mnt = mntget(init_ipc_ns.mq_mnt);
  105 + return get_ipc_ns(inode->i_sb->s_fs_info);
108 106 }
109 107  
110   -void mq_exit_ns(struct ipc_namespace *ns)
  108 +static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
111 109 {
112   - /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */
113   - mntput(ns->mq_mnt);
  110 + struct ipc_namespace *ns;
  111 +
  112 + spin_lock(&mq_lock);
  113 + ns = __get_ns_from_inode(inode);
  114 + spin_unlock(&mq_lock);
  115 + return ns;
114 116 }
115 117  
116   -static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
117   - struct mq_attr *attr)
  118 +static struct inode *mqueue_get_inode(struct super_block *sb,
  119 + struct ipc_namespace *ipc_ns, int mode,
  120 + struct mq_attr *attr)
118 121 {
119 122 struct user_struct *u = current_user();
120 123 struct inode *inode;
121   - struct ipc_namespace *ipc_ns = &init_ipc_ns;
122 124  
123 125 inode = new_inode(sb);
124 126 if (inode) {
125 127  
126 128  
127 129  
128 130  
... ... @@ -193,30 +195,38 @@
193 195 static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
194 196 {
195 197 struct inode *inode;
  198 + struct ipc_namespace *ns = data;
  199 + int error = 0;
196 200  
197 201 sb->s_blocksize = PAGE_CACHE_SIZE;
198 202 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
199 203 sb->s_magic = MQUEUE_MAGIC;
200 204 sb->s_op = &mqueue_super_ops;
201 205  
202   - inode = mqueue_get_inode(sb, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
203   - if (!inode)
204   - return -ENOMEM;
  206 + inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO,
  207 + NULL);
  208 + if (!inode) {
  209 + error = -ENOMEM;
  210 + goto out;
  211 + }
205 212  
206 213 sb->s_root = d_alloc_root(inode);
207 214 if (!sb->s_root) {
208 215 iput(inode);
209   - return -ENOMEM;
  216 + error = -ENOMEM;
210 217 }
211 218  
212   - return 0;
  219 +out:
  220 + return error;
213 221 }
214 222  
215 223 static int mqueue_get_sb(struct file_system_type *fs_type,
216 224 int flags, const char *dev_name,
217 225 void *data, struct vfsmount *mnt)
218 226 {
219   - return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
  227 + if (!(flags & MS_KERNMOUNT))
  228 + data = current->nsproxy->ipc_ns;
  229 + return get_sb_ns(fs_type, flags, data, mqueue_fill_super, mnt);
220 230 }
221 231  
222 232 static void init_once(void *foo)
223 233  
... ... @@ -247,12 +257,13 @@
247 257 struct user_struct *user;
248 258 unsigned long mq_bytes;
249 259 int i;
250   - struct ipc_namespace *ipc_ns = &init_ipc_ns;
  260 + struct ipc_namespace *ipc_ns;
251 261  
252 262 if (S_ISDIR(inode->i_mode)) {
253 263 clear_inode(inode);
254 264 return;
255 265 }
  266 + ipc_ns = get_ns_from_inode(inode);
256 267 info = MQUEUE_I(inode);
257 268 spin_lock(&info->lock);
258 269 for (i = 0; i < info->attr.mq_curmsgs; i++)
259 270  
... ... @@ -268,10 +279,19 @@
268 279 if (user) {
269 280 spin_lock(&mq_lock);
270 281 user->mq_bytes -= mq_bytes;
271   - ipc_ns->mq_queues_count--;
  282 + /*
  283 + * get_ns_from_inode() ensures that the
  284 + * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
  285 + * to which we now hold a reference, or it is NULL.
  286 + * We can't put it here under mq_lock, though.
  287 + */
  288 + if (ipc_ns)
  289 + ipc_ns->mq_queues_count--;
272 290 spin_unlock(&mq_lock);
273 291 free_uid(user);
274 292 }
  293 + if (ipc_ns)
  294 + put_ipc_ns(ipc_ns);
275 295 }
276 296  
277 297 static int mqueue_create(struct inode *dir, struct dentry *dentry,
278 298  
... ... @@ -280,9 +300,14 @@
280 300 struct inode *inode;
281 301 struct mq_attr *attr = dentry->d_fsdata;
282 302 int error;
283   - struct ipc_namespace *ipc_ns = &init_ipc_ns;
  303 + struct ipc_namespace *ipc_ns;
284 304  
285 305 spin_lock(&mq_lock);
  306 + ipc_ns = __get_ns_from_inode(dir);
  307 + if (!ipc_ns) {
  308 + error = -EACCES;
  309 + goto out_unlock;
  310 + }
286 311 if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
287 312 !capable(CAP_SYS_RESOURCE)) {
288 313 error = -ENOSPC;
... ... @@ -291,7 +316,7 @@
291 316 ipc_ns->mq_queues_count++;
292 317 spin_unlock(&mq_lock);
293 318  
294   - inode = mqueue_get_inode(dir->i_sb, mode, attr);
  319 + inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
295 320 if (!inode) {
296 321 error = -ENOMEM;
297 322 spin_lock(&mq_lock);
... ... @@ -299,6 +324,7 @@
299 324 goto out_unlock;
300 325 }
301 326  
  327 + put_ipc_ns(ipc_ns);
302 328 dir->i_size += DIRENT_SIZE;
303 329 dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
304 330  
... ... @@ -307,6 +333,8 @@
307 333 return 0;
308 334 out_unlock:
309 335 spin_unlock(&mq_lock);
  336 + if (ipc_ns)
  337 + put_ipc_ns(ipc_ns);
310 338 return error;
311 339 }
312 340  
... ... @@ -668,7 +696,7 @@
668 696 char *name;
669 697 struct mq_attr attr;
670 698 int fd, error;
671   - struct ipc_namespace *ipc_ns = &init_ipc_ns;
  699 + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
672 700  
673 701 if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
674 702 return -EFAULT;
... ... @@ -738,7 +766,7 @@
738 766 char *name;
739 767 struct dentry *dentry;
740 768 struct inode *inode = NULL;
741   - struct ipc_namespace *ipc_ns = &init_ipc_ns;
  769 + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
742 770  
743 771 name = getname(u_name);
744 772 if (IS_ERR(name))
... ... @@ -1217,6 +1245,32 @@
1217 1245 .kill_sb = kill_litter_super,
1218 1246 };
1219 1247  
  1248 +int mq_init_ns(struct ipc_namespace *ns)
  1249 +{
  1250 + ns->mq_queues_count = 0;
  1251 + ns->mq_queues_max = DFLT_QUEUESMAX;
  1252 + ns->mq_msg_max = DFLT_MSGMAX;
  1253 + ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
  1254 +
  1255 + ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
  1256 + if (IS_ERR(ns->mq_mnt)) {
  1257 + int err = PTR_ERR(ns->mq_mnt);
  1258 + ns->mq_mnt = NULL;
  1259 + return err;
  1260 + }
  1261 + return 0;
  1262 +}
  1263 +
  1264 +void mq_clear_sbinfo(struct ipc_namespace *ns)
  1265 +{
  1266 + ns->mq_mnt->mnt_sb->s_fs_info = NULL;
  1267 +}
  1268 +
  1269 +void mq_put_mnt(struct ipc_namespace *ns)
  1270 +{
  1271 + mntput(ns->mq_mnt);
  1272 +}
  1273 +
1220 1274 static int msg_max_limit_min = MIN_MSGMAX;
1221 1275 static int msg_max_limit_max = MAX_MSGMAX;
1222 1276  
1223 1277  
... ... @@ -1288,14 +1342,13 @@
1288 1342 if (error)
1289 1343 goto out_sysctl;
1290 1344  
1291   - init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type);
  1345 + spin_lock_init(&mq_lock);
  1346 +
  1347 + init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
1292 1348 if (IS_ERR(init_ipc_ns.mq_mnt)) {
1293 1349 error = PTR_ERR(init_ipc_ns.mq_mnt);
1294 1350 goto out_filesystem;
1295 1351 }
1296   -
1297   - /* internal initialization - not common for vfs */
1298   - spin_lock_init(&mq_lock);
1299 1352  
1300 1353 return 0;
1301 1354  
... ... @@ -18,19 +18,16 @@
18 18  
19 19 #include "util.h"
20 20  
  21 +DEFINE_SPINLOCK(mq_lock);
  22 +
21 23 /*
22 24 * The next 2 defines are here bc this is the only file
23 25 * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE
24 26 * and not CONFIG_IPC_NS.
25 27 */
26 28 struct ipc_namespace init_ipc_ns = {
27   - .kref = {
28   - /* It's not for this patch to change, but should this be 1? */
29   - .refcount = ATOMIC_INIT(2),
30   - },
  29 + .count = ATOMIC_INIT(1),
31 30 #ifdef CONFIG_POSIX_MQUEUE
32   - .mq_mnt = NULL,
33   - .mq_queues_count = 0,
34 31 .mq_queues_max = DFLT_QUEUESMAX,
35 32 .mq_msg_max = DFLT_MSGMAX,
36 33 .mq_msgsize_max = DFLT_MSGSIZEMAX,
... ... @@ -9,23 +9,31 @@
9 9 #include <linux/rcupdate.h>
10 10 #include <linux/nsproxy.h>
11 11 #include <linux/slab.h>
  12 +#include <linux/fs.h>
  13 +#include <linux/mount.h>
12 14  
13 15 #include "util.h"
14 16  
15 17 static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
16 18 {
17 19 struct ipc_namespace *ns;
  20 + int err;
18 21  
19 22 ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
20 23 if (ns == NULL)
21 24 return ERR_PTR(-ENOMEM);
22 25  
  26 + atomic_set(&ns->count, 1);
  27 + err = mq_init_ns(ns);
  28 + if (err) {
  29 + kfree(ns);
  30 + return ERR_PTR(err);
  31 + }
23 32 atomic_inc(&nr_ipc_ns);
24 33  
25 34 sem_init_ns(ns);
26 35 msg_init_ns(ns);
27 36 shm_init_ns(ns);
28   - mq_init_ns(ns);
29 37  
30 38 /*
31 39 * msgmni has already been computed for the new ipc ns.
... ... @@ -35,7 +43,6 @@
35 43 ipcns_notify(IPCNS_CREATED);
36 44 register_ipcns_notifier(ns);
37 45  
38   - kref_init(&ns->kref);
39 46 return ns;
40 47 }
41 48  
42 49  
43 50  
... ... @@ -85,11 +92,34 @@
85 92 up_write(&ids->rw_mutex);
86 93 }
87 94  
88   -void free_ipc_ns(struct kref *kref)
  95 +/*
  96 + * put_ipc_ns - drop a reference to an ipc namespace.
  97 + * @ns: the namespace to put
  98 + *
  99 + * If this is the last task in the namespace exiting, and
  100 + * it is dropping the refcount to 0, then it can race with
  101 + * a task in another ipc namespace but in a mounts namespace
  102 + * which has this ipcns's mqueuefs mounted, doing some action
  103 + * with one of the mqueuefs files. That can raise the refcount.
  104 + * So dropping the refcount, and raising the refcount when
  105 + * accessing it through the VFS, are protected with mq_lock.
  106 + *
  107 + * (Clearly, a task raising the refcount on its own ipc_ns
  108 + * needn't take mq_lock since it can't race with the last task
  109 + * in the ipcns exiting).
  110 + */
  111 +void put_ipc_ns(struct ipc_namespace *ns)
89 112 {
90   - struct ipc_namespace *ns;
  113 + if (atomic_dec_and_lock(&ns->count, &mq_lock)) {
  114 + mq_clear_sbinfo(ns);
  115 + spin_unlock(&mq_lock);
  116 + mq_put_mnt(ns);
  117 + free_ipc_ns(ns);
  118 + }
  119 +}
91 120  
92   - ns = container_of(kref, struct ipc_namespace, kref);
  121 +void free_ipc_ns(struct ipc_namespace *ns)
  122 +{
93 123 /*
94 124 * Unregistering the hotplug notifier at the beginning guarantees
95 125 * that the ipc namespace won't be freed while we are inside the
... ... @@ -102,7 +132,6 @@
102 132 sem_exit_ns(ns);
103 133 msg_exit_ns(ns);
104 134 shm_exit_ns(ns);
105   - mq_exit_ns(ns);
106 135 kfree(ns);
107 136 atomic_dec(&nr_ipc_ns);
108 137  
... ... @@ -21,9 +21,11 @@
21 21 struct ipc_namespace;
22 22  
23 23 #ifdef CONFIG_POSIX_MQUEUE
24   -void mq_exit_ns(struct ipc_namespace *ns);
  24 +extern void mq_clear_sbinfo(struct ipc_namespace *ns);
  25 +extern void mq_put_mnt(struct ipc_namespace *ns);
25 26 #else
26   -static inline void mq_exit_ns(struct ipc_namespace *ns) { }
  27 +static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
  28 +static inline void mq_put_mnt(struct ipc_namespace *ns) { }
27 29 #endif
28 30  
29 31 #ifdef CONFIG_SYSVIPC