Commit 30e49c263e36341b60b735cbef5ca37912549264

Authored by Pavel Emelyanov
Committed by Linus Torvalds
1 parent b461cc0382

pid namespaces: allow cloning of new namespace

When clone() is invoked with CLONE_NEWPID, create a new pid namespace and then
create a new struct pid for the new process.  Allocate pid_t's for the new
process in the new pid namespace and all ancestor pid namespaces.  Make the
newly cloned process the session and process group leader.

Since the active pid namespace is special and expected to be the first entry
in pid->upid_list, preserve the order of pid namespaces.

The size of 'struct pid' depends on the number of pid namespaces the
process exists in, so we use multiple pid caches.  Only one pid cache is
created during system startup, and it is used by processes that exist only in
init_pid_ns.

When a process clones its pid namespace, we create additional pid caches as
necessary and use the pid cache to allocate 'struct pids' for that depth.

Note that with this patch the newly created namespace won't work, since the
rest of the kernel still uses global pids, but this is to be fixed soon.  The
init pid namespace still works.

[oleg@tv-sign.ru: merge fix]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 113 additions and 22 deletions Side-by-side Diff

include/linux/sched.h
... ... @@ -25,6 +25,7 @@
25 25 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */
26 26 #define CLONE_NEWIPC 0x08000000 /* New ipcs */
27 27 #define CLONE_NEWUSER 0x10000000 /* New user namespace */
  28 +#define CLONE_NEWPID 0x20000000 /* New pid namespace */
28 29 #define CLONE_NEWNET 0x40000000 /* New network namespace */
29 30  
30 31 /*
... ... @@ -973,7 +973,6 @@
973 973 unsigned long stack_start,
974 974 struct pt_regs *regs,
975 975 unsigned long stack_size,
976   - int __user *parent_tidptr,
977 976 int __user *child_tidptr,
978 977 struct pid *pid)
979 978 {
... ... @@ -1043,11 +1042,6 @@
1043 1042 p->did_exec = 0;
1044 1043 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1045 1044 copy_flags(clone_flags, p);
1046   - retval = -EFAULT;
1047   - if (clone_flags & CLONE_PARENT_SETTID)
1048   - if (put_user(p->pid, parent_tidptr))
1049   - goto bad_fork_cleanup_delays_binfmt;
1050   -
1051 1045 INIT_LIST_HEAD(&p->children);
1052 1046 INIT_LIST_HEAD(&p->sibling);
1053 1047 p->vfork_done = NULL;
... ... @@ -1289,11 +1283,22 @@
1289 1283 __ptrace_link(p, current->parent);
1290 1284  
1291 1285 if (thread_group_leader(p)) {
1292   - p->signal->tty = current->signal->tty;
1293   - p->signal->pgrp = task_pgrp_nr(current);
1294   - set_task_session(p, task_session_nr(current));
1295   - attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1296   - attach_pid(p, PIDTYPE_SID, task_session(current));
  1286 + if (clone_flags & CLONE_NEWPID) {
  1287 + p->nsproxy->pid_ns->child_reaper = p;
  1288 + p->signal->tty = NULL;
  1289 + p->signal->pgrp = p->pid;
  1290 + set_task_session(p, p->pid);
  1291 + attach_pid(p, PIDTYPE_PGID, pid);
  1292 + attach_pid(p, PIDTYPE_SID, pid);
  1293 + } else {
  1294 + p->signal->tty = current->signal->tty;
  1295 + p->signal->pgrp = task_pgrp_nr(current);
  1296 + set_task_session(p, task_session_nr(current));
  1297 + attach_pid(p, PIDTYPE_PGID,
  1298 + task_pgrp(current));
  1299 + attach_pid(p, PIDTYPE_SID,
  1300 + task_session(current));
  1301 + }
1297 1302  
1298 1303 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1299 1304 __get_cpu_var(process_counts)++;
... ... @@ -1339,7 +1344,6 @@
1339 1344 bad_fork_cleanup_cgroup:
1340 1345 #endif
1341 1346 cgroup_exit(p, cgroup_callbacks_done);
1342   -bad_fork_cleanup_delays_binfmt:
1343 1347 delayacct_tsk_free(p);
1344 1348 if (p->binfmt)
1345 1349 module_put(p->binfmt->module);
... ... @@ -1366,7 +1370,7 @@
1366 1370 struct task_struct *task;
1367 1371 struct pt_regs regs;
1368 1372  
1369   - task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL,
  1373 + task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1370 1374 &init_struct_pid);
1371 1375 if (!IS_ERR(task))
1372 1376 init_idle(task, cpu);
... ... @@ -1414,7 +1418,7 @@
1414 1418 }
1415 1419  
1416 1420 p = copy_process(clone_flags, stack_start, regs, stack_size,
1417   - parent_tidptr, child_tidptr, NULL);
  1421 + child_tidptr, NULL);
1418 1422 /*
1419 1423 * Do this prior waking up the new thread - the thread pointer
1420 1424 * might get invalid after that point, if the thread exits quickly.
... ... @@ -1422,7 +1426,16 @@
1422 1426 if (!IS_ERR(p)) {
1423 1427 struct completion vfork;
1424 1428  
1425   - nr = pid_nr(task_pid(p));
  1429 + /*
  1430 + * this is enough to call pid_nr_ns here, but this if
  1431 + * improves optimisation of regular fork()
  1432 + */
  1433 + nr = (clone_flags & CLONE_NEWPID) ?
  1434 + task_pid_nr_ns(p, current->nsproxy->pid_ns) :
  1435 + task_pid_vnr(p);
  1436 +
  1437 + if (clone_flags & CLONE_PARENT_SETTID)
  1438 + put_user(nr, parent_tidptr);
1426 1439  
1427 1440 if (clone_flags & CLONE_VFORK) {
1428 1441 p->vfork_done = &vfork;
... ... @@ -129,7 +129,8 @@
129 129  
130 130 get_nsproxy(old_ns);
131 131  
132   - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET)))
  132 + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
  133 + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
133 134 return 0;
134 135  
135 136 if (!capable(CAP_SYS_ADMIN)) {
... ... @@ -18,6 +18,12 @@
18 18 * allocation scenario when all but one out of 1 million PIDs possible are
19 19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
20 20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
  21 + *
  22 + * Pid namespaces:
  23 + * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
  24 + * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
  25 + * Many thanks to Oleg Nesterov for comments and help
  26 + *
21 27 */
22 28  
23 29 #include <linux/mm.h>
... ... @@ -456,8 +462,8 @@
456 462  
457 463 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
458 464 cachep = kmem_cache_create(pcache->name,
459   - /* FIXME add numerical ids here */
460   - sizeof(struct pid), 0, SLAB_HWCACHE_ALIGN, NULL);
  465 + sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
  466 + 0, SLAB_HWCACHE_ALIGN, NULL);
461 467 if (cachep == NULL)
462 468 goto err_cachep;
463 469  
464 470  
465 471  
466 472  
467 473  
... ... @@ -475,19 +481,89 @@
475 481 return NULL;
476 482 }
477 483  
  484 +static struct pid_namespace *create_pid_namespace(int level)
  485 +{
  486 + struct pid_namespace *ns;
  487 + int i;
  488 +
  489 + ns = kmalloc(sizeof(struct pid_namespace), GFP_KERNEL);
  490 + if (ns == NULL)
  491 + goto out;
  492 +
  493 + ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
  494 + if (!ns->pidmap[0].page)
  495 + goto out_free;
  496 +
  497 + ns->pid_cachep = create_pid_cachep(level + 1);
  498 + if (ns->pid_cachep == NULL)
  499 + goto out_free_map;
  500 +
  501 + kref_init(&ns->kref);
  502 + ns->last_pid = 0;
  503 + ns->child_reaper = NULL;
  504 + ns->level = level;
  505 +
  506 + set_bit(0, ns->pidmap[0].page);
  507 + atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
  508 +
  509 + for (i = 1; i < PIDMAP_ENTRIES; i++) {
  510 + ns->pidmap[i].page = 0;
  511 + atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
  512 + }
  513 +
  514 + return ns;
  515 +
  516 +out_free_map:
  517 + kfree(ns->pidmap[0].page);
  518 +out_free:
  519 + kfree(ns);
  520 +out:
  521 + return ERR_PTR(-ENOMEM);
  522 +}
  523 +
  524 +static void destroy_pid_namespace(struct pid_namespace *ns)
  525 +{
  526 + int i;
  527 +
  528 + for (i = 0; i < PIDMAP_ENTRIES; i++)
  529 + kfree(ns->pidmap[i].page);
  530 + kfree(ns);
  531 +}
  532 +
478 533 struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
479 534 {
  535 + struct pid_namespace *new_ns;
  536 +
480 537 BUG_ON(!old_ns);
481   - get_pid_ns(old_ns);
482   - return old_ns;
  538 + new_ns = get_pid_ns(old_ns);
  539 + if (!(flags & CLONE_NEWPID))
  540 + goto out;
  541 +
  542 + new_ns = ERR_PTR(-EINVAL);
  543 + if (flags & CLONE_THREAD)
  544 + goto out_put;
  545 +
  546 + new_ns = create_pid_namespace(old_ns->level + 1);
  547 + if (!IS_ERR(new_ns))
  548 + new_ns->parent = get_pid_ns(old_ns);
  549 +
  550 +out_put:
  551 + put_pid_ns(old_ns);
  552 +out:
  553 + return new_ns;
483 554 }
484 555  
485 556 void free_pid_ns(struct kref *kref)
486 557 {
487   - struct pid_namespace *ns;
  558 + struct pid_namespace *ns, *parent;
488 559  
489 560 ns = container_of(kref, struct pid_namespace, kref);
490   - kfree(ns);
  561 +
  562 + parent = ns->parent;
  563 + destroy_pid_namespace(ns);
  564 +
  565 + if (parent != NULL)
  566 + put_pid_ns(parent);
491 567 }
492 568  
493 569 /*