Commit cf7b708c8d1d7a27736771bcf4c457b332b0f818

Authored by Pavel Emelyanov
Committed by Linus Torvalds
1 parent a6f5e06378

Make access to task's nsproxy lighter

When someone wants to deal with some other taks's namespaces it has to lock
the task and then to get the desired namespace if the one exists.  This is
slow on read-only paths and may be impossible in some cases.

E.g.  Oleg recently noticed a race between unshare() and the (sent for
review in cgroups) pid namespaces - when the task notifies the parent it
has to know the parent's namespace, but taking the task_lock() is
impossible there - the code is under write locked tasklist lock.

On the other hand switching the namespace on task (daemonize) and releasing
the namespace (after the last task exit) is rather rare operation and we
can sacrifice its speed to solve the issues above.

The access to other task namespaces is proposed to be performed
like this:

     rcu_read_lock();
     nsproxy = task_nsproxy(tsk);
     if (nsproxy != NULL) {
             / *
               * work with the namespaces here
               * e.g. get the reference on one of them
               * /
     } / *
         * NULL task_nsproxy() means that this task is
         * almost dead (zombie)
         * /
     rcu_read_unlock();

This patch has passed the review by Eric and Oleg :) and,
of course, tested.

[clg@fr.ibm.com: fix unshare()]
[ebiederm@xmission.com: Update get_net_ns_by_pid]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 6 changed files with 91 additions and 45 deletions Side-by-side Diff

... ... @@ -350,18 +350,21 @@
350 350 static int mounts_open(struct inode *inode, struct file *file)
351 351 {
352 352 struct task_struct *task = get_proc_task(inode);
  353 + struct nsproxy *nsp;
353 354 struct mnt_namespace *ns = NULL;
354 355 struct proc_mounts *p;
355 356 int ret = -EINVAL;
356 357  
357 358 if (task) {
358   - task_lock(task);
359   - if (task->nsproxy) {
360   - ns = task->nsproxy->mnt_ns;
  359 + rcu_read_lock();
  360 + nsp = task_nsproxy(task);
  361 + if (nsp) {
  362 + ns = nsp->mnt_ns;
361 363 if (ns)
362 364 get_mnt_ns(ns);
363 365 }
364   - task_unlock(task);
  366 + rcu_read_unlock();
  367 +
365 368 put_task_struct(task);
366 369 }
367 370  
368 371  
... ... @@ -424,16 +427,20 @@
424 427  
425 428 if (!ret) {
426 429 struct seq_file *m = file->private_data;
  430 + struct nsproxy *nsp;
427 431 struct mnt_namespace *mnt_ns = NULL;
428 432 struct task_struct *task = get_proc_task(inode);
429 433  
430 434 if (task) {
431   - task_lock(task);
432   - if (task->nsproxy)
433   - mnt_ns = task->nsproxy->mnt_ns;
434   - if (mnt_ns)
435   - get_mnt_ns(mnt_ns);
436   - task_unlock(task);
  435 + rcu_read_lock();
  436 + nsp = task_nsproxy(task);
  437 + if (nsp) {
  438 + mnt_ns = nsp->mnt_ns;
  439 + if (mnt_ns)
  440 + get_mnt_ns(mnt_ns);
  441 + }
  442 + rcu_read_unlock();
  443 +
437 444 put_task_struct(task);
438 445 }
439 446  
include/linux/nsproxy.h
... ... @@ -32,8 +32,39 @@
32 32 };
33 33 extern struct nsproxy init_nsproxy;
34 34  
  35 +/*
  36 + * the namespaces access rules are:
  37 + *
  38 + * 1. only current task is allowed to change tsk->nsproxy pointer or
  39 + * any pointer on the nsproxy itself
  40 + *
  41 + * 2. when accessing (i.e. reading) current task's namespaces - no
  42 + * precautions should be taken - just dereference the pointers
  43 + *
  44 + * 3. the access to other task namespaces is performed like this
  45 + * rcu_read_lock();
  46 + * nsproxy = task_nsproxy(tsk);
  47 + * if (nsproxy != NULL) {
  48 + * / *
  49 + * * work with the namespaces here
  50 + * * e.g. get the reference on one of them
  51 + * * /
  52 + * } / *
  53 + * * NULL task_nsproxy() means that this task is
  54 + * * almost dead (zombie)
  55 + * * /
  56 + * rcu_read_unlock();
  57 + *
  58 + */
  59 +
  60 +static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
  61 +{
  62 + return rcu_dereference(tsk->nsproxy);
  63 +}
  64 +
35 65 int copy_namespaces(unsigned long flags, struct task_struct *tsk);
36   -void get_task_namespaces(struct task_struct *tsk);
  66 +void exit_task_namespaces(struct task_struct *tsk);
  67 +void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
37 68 void free_nsproxy(struct nsproxy *ns);
38 69 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
39 70 struct fs_struct *);
40 71  
... ... @@ -45,15 +76,9 @@
45 76 }
46 77 }
47 78  
48   -static inline void exit_task_namespaces(struct task_struct *p)
  79 +static inline void get_nsproxy(struct nsproxy *ns)
49 80 {
50   - struct nsproxy *ns = p->nsproxy;
51   - if (ns) {
52   - task_lock(p);
53   - p->nsproxy = NULL;
54   - task_unlock(p);
55   - put_nsproxy(ns);
56   - }
  81 + atomic_inc(&ns->count);
57 82 }
58 83  
59 84 #ifdef CONFIG_CGROUP_NS
... ... @@ -400,9 +400,10 @@
400 400 current->fs = fs;
401 401 atomic_inc(&fs->count);
402 402  
403   - exit_task_namespaces(current);
404   - current->nsproxy = init_task.nsproxy;
405   - get_task_namespaces(current);
  403 + if (current->nsproxy != init_task.nsproxy) {
  404 + get_nsproxy(init_task.nsproxy);
  405 + switch_task_namespaces(current, init_task.nsproxy);
  406 + }
406 407  
407 408 exit_files(current);
408 409 current->files = init_task.files;
... ... @@ -1632,7 +1632,7 @@
1632 1632 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1633 1633 struct files_struct *fd, *new_fd = NULL;
1634 1634 struct sem_undo_list *new_ulist = NULL;
1635   - struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
  1635 + struct nsproxy *new_nsproxy = NULL;
1636 1636  
1637 1637 check_unshare_flags(&unshare_flags);
1638 1638  
1639 1639  
1640 1640  
... ... @@ -1662,13 +1662,12 @@
1662 1662  
1663 1663 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
1664 1664  
1665   - task_lock(current);
1666   -
1667 1665 if (new_nsproxy) {
1668   - old_nsproxy = current->nsproxy;
1669   - current->nsproxy = new_nsproxy;
1670   - new_nsproxy = old_nsproxy;
  1666 + switch_task_namespaces(current, new_nsproxy);
  1667 + new_nsproxy = NULL;
1671 1668 }
  1669 +
  1670 + task_lock(current);
1672 1671  
1673 1672 if (new_fs) {
1674 1673 fs = current->fs;
... ... @@ -26,19 +26,6 @@
26 26  
27 27 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28  
29   -static inline void get_nsproxy(struct nsproxy *ns)
30   -{
31   - atomic_inc(&ns->count);
32   -}
33   -
34   -void get_task_namespaces(struct task_struct *tsk)
35   -{
36   - struct nsproxy *ns = tsk->nsproxy;
37   - if (ns) {
38   - get_nsproxy(ns);
39   - }
40   -}
41   -
42 29 /*
43 30 * creates a copy of "orig" with refcount 1.
44 31 */
... ... @@ -214,6 +201,33 @@
214 201  
215 202 out:
216 203 return err;
  204 +}
  205 +
  206 +void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
  207 +{
  208 + struct nsproxy *ns;
  209 +
  210 + might_sleep();
  211 +
  212 + ns = p->nsproxy;
  213 +
  214 + rcu_assign_pointer(p->nsproxy, new);
  215 +
  216 + if (ns && atomic_dec_and_test(&ns->count)) {
  217 + /*
  218 + * wait for others to get what they want from this nsproxy.
  219 + *
  220 + * cannot release this nsproxy via the call_rcu() since
  221 + * put_mnt_ns() will want to sleep
  222 + */
  223 + synchronize_rcu();
  224 + free_nsproxy(ns);
  225 + }
  226 +}
  227 +
  228 +void exit_task_namespaces(struct task_struct *p)
  229 +{
  230 + switch_task_namespaces(p, NULL);
217 231 }
218 232  
219 233 static int __init nsproxy_cache_init(void)
net/core/rtnetlink.c
... ... @@ -744,10 +744,10 @@
744 744 rcu_read_lock();
745 745 tsk = find_task_by_pid(pid);
746 746 if (tsk) {
747   - task_lock(tsk);
748   - if (tsk->nsproxy)
749   - net = get_net(tsk->nsproxy->net_ns);
750   - task_unlock(tsk);
  747 + struct nsproxy *nsproxy;
  748 + nsproxy = task_nsproxy(tsk);
  749 + if (nsproxy)
  750 + net = get_net(nsproxy->net_ns);
751 751 }
752 752 rcu_read_unlock();
753 753 return net;