Commit b460cbc581a53cc088ceba80608021dd49c63c43

Authored by Serge E. Hallyn
Committed by Linus Torvalds
1 parent 3743ca05ff

pid namespaces: define is_global_init() and is_container_init()

is_init() is an ambiguous name for the pid==1 check.  Split it into
is_global_init() and is_container_init().

A cgroup init has its tsk->pid == 1.

A global init also has its tsk->pid == 1 and its active pid namespace
is the init_pid_ns.  But rather than check the active pid namespace,
compare the task structure with 'init_pid_ns.child_reaper', which is
initialized during boot to the /sbin/init process and never changes.

Changelog:

	2.6.22-rc4-mm2-pidns1:
	- Use 'init_pid_ns.child_reaper' to determine if a given task is the
	  global init (/sbin/init) process. This would improve performance
	  and remove dependence on the task_pid().

	2.6.21-mm2-pidns2:

	- [Sukadev Bhattiprolu] Changed is_container_init() calls in {powerpc,
	  ppc,avr32}/traps.c for the _exception() call to is_global_init().
	  This way, we kill only the cgroup if the cgroup's init has a
	  bug rather than force a kernel panic.

[akpm@linux-foundation.org: fix comment]
[sukadev@us.ibm.com: Use is_global_init() in arch/m32r/mm/fault.c]
[bunk@stusta.de: kernel/pid.c: remove unused exports]
[sukadev@us.ibm.com: Fix capability.c to work with threaded init]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Acked-by: Pavel Emelianov <xemul@openvz.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Herbert Poetzel <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 32 changed files with 52 additions and 37 deletions Side-by-side Diff

arch/alpha/mm/fault.c
... ... @@ -188,7 +188,7 @@
188 188 /* We ran out of memory, or some other thing happened to us that
189 189 made us unable to handle the page fault gracefully. */
190 190 out_of_memory:
191   - if (is_init(current)) {
  191 + if (is_global_init(current)) {
192 192 yield();
193 193 down_read(&mm->mmap_sem);
194 194 goto survive;
... ... @@ -197,7 +197,7 @@
197 197 return fault;
198 198  
199 199 out_of_memory:
200   - if (!is_init(tsk))
  200 + if (!is_global_init(tsk))
201 201 goto out;
202 202  
203 203 /*
arch/avr32/kernel/traps.c
... ... @@ -89,7 +89,7 @@
89 89 * generate the same exception over and over again and we get
90 90 * nowhere. Better to kill it and let the kernel panic.
91 91 */
92   - if (is_init(current)) {
  92 + if (is_global_init(current)) {
93 93 __sighandler_t handler;
94 94  
95 95 spin_lock_irq(&current->sighand->siglock);
arch/avr32/mm/fault.c
... ... @@ -160,7 +160,7 @@
160 160 if (exception_trace && printk_ratelimit())
161 161 printk("%s%s[%d]: segfault at %08lx pc %08lx "
162 162 "sp %08lx ecr %lu\n",
163   - is_init(tsk) ? KERN_EMERG : KERN_INFO,
  163 + is_global_init(tsk) ? KERN_EMERG : KERN_INFO,
164 164 tsk->comm, tsk->pid, address, regs->pc,
165 165 regs->sp, ecr);
166 166 _exception(SIGSEGV, regs, code, address);
... ... @@ -209,7 +209,7 @@
209 209 */
210 210 out_of_memory:
211 211 up_read(&mm->mmap_sem);
212   - if (is_init(current)) {
  212 + if (is_global_init(current)) {
213 213 yield();
214 214 down_read(&mm->mmap_sem);
215 215 goto survive;
... ... @@ -231,7 +231,7 @@
231 231 if (exception_trace)
232 232 printk("%s%s[%d]: bus error at %08lx pc %08lx "
233 233 "sp %08lx ecr %lu\n",
234   - is_init(tsk) ? KERN_EMERG : KERN_INFO,
  234 + is_global_init(tsk) ? KERN_EMERG : KERN_INFO,
235 235 tsk->comm, tsk->pid, address, regs->pc,
236 236 regs->sp, ecr);
237 237  
arch/ia64/mm/fault.c
... ... @@ -274,7 +274,7 @@
274 274  
275 275 out_of_memory:
276 276 up_read(&mm->mmap_sem);
277   - if (is_init(current)) {
  277 + if (is_global_init(current)) {
278 278 yield();
279 279 down_read(&mm->mmap_sem);
280 280 goto survive;
arch/m32r/mm/fault.c
... ... @@ -271,7 +271,7 @@
271 271 */
272 272 out_of_memory:
273 273 up_read(&mm->mmap_sem);
274   - if (is_init(tsk)) {
  274 + if (is_global_init(tsk)) {
275 275 yield();
276 276 down_read(&mm->mmap_sem);
277 277 goto survive;
arch/m68k/mm/fault.c
... ... @@ -180,7 +180,7 @@
180 180 */
181 181 out_of_memory:
182 182 up_read(&mm->mmap_sem);
183   - if (is_init(current)) {
  183 + if (is_global_init(current)) {
184 184 yield();
185 185 down_read(&mm->mmap_sem);
186 186 goto survive;
arch/mips/mm/fault.c
... ... @@ -173,7 +173,7 @@
173 173 */
174 174 out_of_memory:
175 175 up_read(&mm->mmap_sem);
176   - if (is_init(tsk)) {
  176 + if (is_global_init(tsk)) {
177 177 yield();
178 178 down_read(&mm->mmap_sem);
179 179 goto survive;
arch/powerpc/kernel/traps.c
... ... @@ -201,7 +201,7 @@
201 201 * generate the same exception over and over again and we get
202 202 * nowhere. Better to kill it and let the kernel panic.
203 203 */
204   - if (is_init(current)) {
  204 + if (is_global_init(current)) {
205 205 __sighandler_t handler;
206 206  
207 207 spin_lock_irq(&current->sighand->siglock);
arch/powerpc/mm/fault.c
... ... @@ -375,7 +375,7 @@
375 375 */
376 376 out_of_memory:
377 377 up_read(&mm->mmap_sem);
378   - if (is_init(current)) {
  378 + if (is_global_init(current)) {
379 379 yield();
380 380 down_read(&mm->mmap_sem);
381 381 goto survive;
arch/powerpc/platforms/pseries/ras.c
... ... @@ -332,7 +332,7 @@
332 332 err->disposition == RTAS_DISP_NOT_RECOVERED &&
333 333 err->target == RTAS_TARGET_MEMORY &&
334 334 err->type == RTAS_TYPE_ECC_UNCORR &&
335   - !(current->pid == 0 || is_init(current))) {
  335 + !(current->pid == 0 || is_global_init(current))) {
336 336 /* Kill off a user process with an ECC error */
337 337 printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
338 338 current->pid);
arch/ppc/kernel/traps.c
... ... @@ -121,7 +121,7 @@
121 121 * generate the same exception over and over again and we get
122 122 * nowhere. Better to kill it and let the kernel panic.
123 123 */
124   - if (is_init(current)) {
  124 + if (is_global_init(current)) {
125 125 __sighandler_t handler;
126 126  
127 127 spin_lock_irq(&current->sighand->siglock);
... ... @@ -290,7 +290,7 @@
290 290 */
291 291 out_of_memory:
292 292 up_read(&mm->mmap_sem);
293   - if (is_init(current)) {
  293 + if (is_global_init(current)) {
294 294 yield();
295 295 down_read(&mm->mmap_sem);
296 296 goto survive;
arch/s390/lib/uaccess_pt.c
... ... @@ -64,7 +64,7 @@
64 64  
65 65 out_of_memory:
66 66 up_read(&mm->mmap_sem);
67   - if (is_init(current)) {
  67 + if (is_global_init(current)) {
68 68 yield();
69 69 down_read(&mm->mmap_sem);
70 70 goto survive;
arch/s390/mm/fault.c
... ... @@ -211,7 +211,7 @@
211 211 struct mm_struct *mm = tsk->mm;
212 212  
213 213 up_read(&mm->mmap_sem);
214   - if (is_init(tsk)) {
  214 + if (is_global_init(tsk)) {
215 215 yield();
216 216 down_read(&mm->mmap_sem);
217 217 return 1;
... ... @@ -207,7 +207,7 @@
207 207 */
208 208 out_of_memory:
209 209 up_read(&mm->mmap_sem);
210   - if (is_init(current)) {
  210 + if (is_global_init(current)) {
211 211 yield();
212 212 down_read(&mm->mmap_sem);
213 213 goto survive;
arch/sh64/mm/fault.c
... ... @@ -278,7 +278,7 @@
278 278 show_regs(regs);
279 279 #endif
280 280 }
281   - if (is_init(tsk)) {
  281 + if (is_global_init(tsk)) {
282 282 panic("INIT had user mode bad_area\n");
283 283 }
284 284 tsk->thread.address = address;
285 285  
... ... @@ -320,14 +320,14 @@
320 320 * us unable to handle the page fault gracefully.
321 321 */
322 322 out_of_memory:
323   - if (is_init(current)) {
  323 + if (is_global_init(current)) {
324 324 panic("INIT out of memory\n");
325 325 yield();
326 326 goto survive;
327 327 }
328 328 printk("fault:Out of memory\n");
329 329 up_read(&mm->mmap_sem);
330   - if (is_init(current)) {
  330 + if (is_global_init(current)) {
331 331 yield();
332 332 down_read(&mm->mmap_sem);
333 333 goto survive;
arch/um/kernel/trap.c
... ... @@ -108,7 +108,7 @@
108 108 * us unable to handle the page fault gracefully.
109 109 */
110 110 out_of_memory:
111   - if (is_init(current)) {
  111 + if (is_global_init(current)) {
112 112 up_read(&mm->mmap_sem);
113 113 yield();
114 114 down_read(&mm->mmap_sem);
arch/x86/lib/usercopy_32.c
... ... @@ -748,7 +748,7 @@
748 748 retval = get_user_pages(current, current->mm,
749 749 (unsigned long )to, 1, 1, 0, &pg, NULL);
750 750  
751   - if (retval == -ENOMEM && is_init(current)) {
  751 + if (retval == -ENOMEM && is_global_init(current)) {
752 752 up_read(&current->mm->mmap_sem);
753 753 congestion_wait(WRITE, HZ/50);
754 754 goto survive;
arch/x86/mm/fault_32.c
... ... @@ -587,7 +587,7 @@
587 587 */
588 588 out_of_memory:
589 589 up_read(&mm->mmap_sem);
590   - if (is_init(tsk)) {
  590 + if (is_global_init(tsk)) {
591 591 yield();
592 592 down_read(&mm->mmap_sem);
593 593 goto survive;
arch/x86/mm/fault_64.c
... ... @@ -554,7 +554,7 @@
554 554 */
555 555 out_of_memory:
556 556 up_read(&mm->mmap_sem);
557   - if (is_init(current)) {
  557 + if (is_global_init(current)) {
558 558 yield();
559 559 goto again;
560 560 }
arch/xtensa/mm/fault.c
... ... @@ -145,7 +145,7 @@
145 145 */
146 146 out_of_memory:
147 147 up_read(&mm->mmap_sem);
148   - if (is_init(current)) {
  148 + if (is_global_init(current)) {
149 149 yield();
150 150 down_read(&mm->mmap_sem);
151 151 goto survive;
drivers/char/sysrq.c
... ... @@ -251,7 +251,7 @@
251 251 struct task_struct *p;
252 252  
253 253 for_each_process(p) {
254   - if (p->mm && !is_init(p))
  254 + if (p->mm && !is_global_init(p))
255 255 /* Not swapper, init nor kernel thread */
256 256 force_sig(sig, p);
257 257 }
include/linux/sched.h
... ... @@ -1237,12 +1237,20 @@
1237 1237 }
1238 1238  
1239 1239 /**
1240   - * is_init - check if a task structure is init
  1240 + * is_global_init - check if a task structure is init
1241 1241 * @tsk: Task structure to be checked.
1242 1242 *
1243 1243 * Check if a task structure is the first user space task the kernel created.
  1244 + *
  1245 + * TODO: We should inline this function after some cleanups in pid_namespace.h
1244 1246 */
1245   -static inline int is_init(struct task_struct *tsk)
  1247 +extern int is_global_init(struct task_struct *tsk);
  1248 +
  1249 +/*
  1250 + * is_container_init:
  1251 + * check whether the task is init in its own pid namespace.
  1252 + */
  1253 +static inline int is_container_init(struct task_struct *tsk)
1246 1254 {
1247 1255 return tsk->pid == 1;
1248 1256 }
... ... @@ -12,6 +12,7 @@
12 12 #include <linux/module.h>
13 13 #include <linux/security.h>
14 14 #include <linux/syscalls.h>
  15 +#include <linux/pid_namespace.h>
15 16 #include <asm/uaccess.h>
16 17  
17 18 /*
... ... @@ -129,7 +130,7 @@
129 130 int found = 0;
130 131  
131 132 do_each_thread(g, target) {
132   - if (target == current || is_init(target))
  133 + if (target == current || is_container_init(target->group_leader))
133 134 continue;
134 135 found = 1;
135 136 if (security_capset_check(target, effective, inheritable,
... ... @@ -221,7 +221,7 @@
221 221 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
222 222 if (p == ignored_task
223 223 || p->exit_state
224   - || is_init(p->real_parent))
  224 + || is_global_init(p->real_parent))
225 225 continue;
226 226 if (task_pgrp(p->real_parent) != pgrp &&
227 227 task_session(p->real_parent) == task_session(p)) {
... ... @@ -51,7 +51,7 @@
51 51  
52 52 int kexec_should_crash(struct task_struct *p)
53 53 {
54   - if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
  54 + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
55 55 return 1;
56 56 return 0;
57 57 }
... ... @@ -70,6 +70,11 @@
70 70 .child_reaper = &init_task
71 71 };
72 72  
  73 +int is_global_init(struct task_struct *tsk)
  74 +{
  75 + return tsk == init_pid_ns.child_reaper;
  76 +}
  77 +
73 78 /*
74 79 * Note: disable interrupts while the pidmap_lock is held as an
75 80 * interrupt might come in and do read_lock(&tasklist_lock).
... ... @@ -256,7 +256,7 @@
256 256  
257 257 int unhandled_signal(struct task_struct *tsk, int sig)
258 258 {
259   - if (is_init(tsk))
  259 + if (is_global_init(tsk))
260 260 return 1;
261 261 if (tsk->ptrace & PT_PTRACED)
262 262 return 0;
... ... @@ -1888,7 +1888,7 @@
1888 1888 return -EPERM;
1889 1889 }
1890 1890  
1891   - op = is_init(current) ? OP_SET : OP_AND;
  1891 + op = is_global_init(current) ? OP_SET : OP_AND;
1892 1892 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1893 1893 do_proc_dointvec_bset_conv,&op);
1894 1894 }
... ... @@ -212,7 +212,7 @@
212 212 if (!p->mm)
213 213 continue;
214 214 /* skip the init task */
215   - if (is_init(p))
  215 + if (is_global_init(p))
216 216 continue;
217 217  
218 218 /*
... ... @@ -265,7 +265,7 @@
265 265 */
266 266 static void __oom_kill_task(struct task_struct *p, int verbose)
267 267 {
268   - if (is_init(p)) {
  268 + if (is_global_init(p)) {
269 269 WARN_ON(1);
270 270 printk(KERN_WARNING "tried to kill init!\n");
271 271 return;
security/commoncap.c
... ... @@ -23,6 +23,7 @@
23 23 #include <linux/xattr.h>
24 24 #include <linux/hugetlb.h>
25 25 #include <linux/mount.h>
  26 +#include <linux/sched.h>
26 27  
27 28 #ifdef CONFIG_SECURITY_FILE_CAPABILITIES
28 29 /*
... ... @@ -334,7 +335,7 @@
334 335 /* For init, we want to retain the capabilities set
335 336 * in the init_task struct. Thus we skip the usual
336 337 * capability rules */
337   - if (!is_init(current)) {
  338 + if (!is_global_init(current)) {
338 339 current->cap_permitted = new_permitted;
339 340 current->cap_effective = bprm->cap_effective ?
340 341 new_permitted : 0;