Commit b460cbc581a53cc088ceba80608021dd49c63c43
Committed by
Linus Torvalds
1 parent
3743ca05ff
Exists in
master
and in
7 other branches
pid namespaces: define is_global_init() and is_container_init()
is_init() is an ambiguous name for the pid==1 check. Split it into is_global_init() and is_container_init(). A cgroup init has its tsk->pid == 1. A global init also has its tsk->pid == 1 and its active pid namespace is the init_pid_ns. But rather than check the active pid namespace, compare the task structure with 'init_pid_ns.child_reaper', which is initialized during boot to the /sbin/init process and never changes. Changelog: 2.6.22-rc4-mm2-pidns1: - Use 'init_pid_ns.child_reaper' to determine if a given task is the global init (/sbin/init) process. This would improve performance and remove dependence on the task_pid(). 2.6.21-mm2-pidns2: - [Sukadev Bhattiprolu] Changed is_container_init() calls in {powerpc, ppc,avr32}/traps.c for the _exception() call to is_global_init(). This way, we kill only the cgroup if the cgroup's init has a bug rather than force a kernel panic. [akpm@linux-foundation.org: fix comment] [sukadev@us.ibm.com: Use is_global_init() in arch/m32r/mm/fault.c] [bunk@stusta.de: kernel/pid.c: remove unused exports] [sukadev@us.ibm.com: Fix capability.c to work with threaded init] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com> Acked-by: Pavel Emelianov <xemul@openvz.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Herbert Poetzel <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 32 changed files with 52 additions and 37 deletions Side-by-side Diff
- arch/alpha/mm/fault.c
- arch/arm/mm/fault.c
- arch/avr32/kernel/traps.c
- arch/avr32/mm/fault.c
- arch/ia64/mm/fault.c
- arch/m32r/mm/fault.c
- arch/m68k/mm/fault.c
- arch/mips/mm/fault.c
- arch/powerpc/kernel/traps.c
- arch/powerpc/mm/fault.c
- arch/powerpc/platforms/pseries/ras.c
- arch/ppc/kernel/traps.c
- arch/ppc/mm/fault.c
- arch/s390/lib/uaccess_pt.c
- arch/s390/mm/fault.c
- arch/sh/mm/fault.c
- arch/sh64/mm/fault.c
- arch/um/kernel/trap.c
- arch/x86/lib/usercopy_32.c
- arch/x86/mm/fault_32.c
- arch/x86/mm/fault_64.c
- arch/xtensa/mm/fault.c
- drivers/char/sysrq.c
- include/linux/sched.h
- kernel/capability.c
- kernel/exit.c
- kernel/kexec.c
- kernel/pid.c
- kernel/signal.c
- kernel/sysctl.c
- mm/oom_kill.c
- security/commoncap.c
arch/alpha/mm/fault.c
... | ... | @@ -188,7 +188,7 @@ |
188 | 188 | /* We ran out of memory, or some other thing happened to us that |
189 | 189 | made us unable to handle the page fault gracefully. */ |
190 | 190 | out_of_memory: |
191 | - if (is_init(current)) { | |
191 | + if (is_global_init(current)) { | |
192 | 192 | yield(); |
193 | 193 | down_read(&mm->mmap_sem); |
194 | 194 | goto survive; |
arch/arm/mm/fault.c
arch/avr32/kernel/traps.c
... | ... | @@ -89,7 +89,7 @@ |
89 | 89 | * generate the same exception over and over again and we get |
90 | 90 | * nowhere. Better to kill it and let the kernel panic. |
91 | 91 | */ |
92 | - if (is_init(current)) { | |
92 | + if (is_global_init(current)) { | |
93 | 93 | __sighandler_t handler; |
94 | 94 | |
95 | 95 | spin_lock_irq(&current->sighand->siglock); |
arch/avr32/mm/fault.c
... | ... | @@ -160,7 +160,7 @@ |
160 | 160 | if (exception_trace && printk_ratelimit()) |
161 | 161 | printk("%s%s[%d]: segfault at %08lx pc %08lx " |
162 | 162 | "sp %08lx ecr %lu\n", |
163 | - is_init(tsk) ? KERN_EMERG : KERN_INFO, | |
163 | + is_global_init(tsk) ? KERN_EMERG : KERN_INFO, | |
164 | 164 | tsk->comm, tsk->pid, address, regs->pc, |
165 | 165 | regs->sp, ecr); |
166 | 166 | _exception(SIGSEGV, regs, code, address); |
... | ... | @@ -209,7 +209,7 @@ |
209 | 209 | */ |
210 | 210 | out_of_memory: |
211 | 211 | up_read(&mm->mmap_sem); |
212 | - if (is_init(current)) { | |
212 | + if (is_global_init(current)) { | |
213 | 213 | yield(); |
214 | 214 | down_read(&mm->mmap_sem); |
215 | 215 | goto survive; |
... | ... | @@ -231,7 +231,7 @@ |
231 | 231 | if (exception_trace) |
232 | 232 | printk("%s%s[%d]: bus error at %08lx pc %08lx " |
233 | 233 | "sp %08lx ecr %lu\n", |
234 | - is_init(tsk) ? KERN_EMERG : KERN_INFO, | |
234 | + is_global_init(tsk) ? KERN_EMERG : KERN_INFO, | |
235 | 235 | tsk->comm, tsk->pid, address, regs->pc, |
236 | 236 | regs->sp, ecr); |
237 | 237 |
arch/ia64/mm/fault.c
arch/m32r/mm/fault.c
arch/m68k/mm/fault.c
arch/mips/mm/fault.c
arch/powerpc/kernel/traps.c
... | ... | @@ -201,7 +201,7 @@ |
201 | 201 | * generate the same exception over and over again and we get |
202 | 202 | * nowhere. Better to kill it and let the kernel panic. |
203 | 203 | */ |
204 | - if (is_init(current)) { | |
204 | + if (is_global_init(current)) { | |
205 | 205 | __sighandler_t handler; |
206 | 206 | |
207 | 207 | spin_lock_irq(&current->sighand->siglock); |
arch/powerpc/mm/fault.c
arch/powerpc/platforms/pseries/ras.c
... | ... | @@ -332,7 +332,7 @@ |
332 | 332 | err->disposition == RTAS_DISP_NOT_RECOVERED && |
333 | 333 | err->target == RTAS_TARGET_MEMORY && |
334 | 334 | err->type == RTAS_TYPE_ECC_UNCORR && |
335 | - !(current->pid == 0 || is_init(current))) { | |
335 | + !(current->pid == 0 || is_global_init(current))) { | |
336 | 336 | /* Kill off a user process with an ECC error */ |
337 | 337 | printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n", |
338 | 338 | current->pid); |
arch/ppc/kernel/traps.c
... | ... | @@ -121,7 +121,7 @@ |
121 | 121 | * generate the same exception over and over again and we get |
122 | 122 | * nowhere. Better to kill it and let the kernel panic. |
123 | 123 | */ |
124 | - if (is_init(current)) { | |
124 | + if (is_global_init(current)) { | |
125 | 125 | __sighandler_t handler; |
126 | 126 | |
127 | 127 | spin_lock_irq(&current->sighand->siglock); |
arch/ppc/mm/fault.c
arch/s390/lib/uaccess_pt.c
arch/s390/mm/fault.c
arch/sh/mm/fault.c
arch/sh64/mm/fault.c
... | ... | @@ -278,7 +278,7 @@ |
278 | 278 | show_regs(regs); |
279 | 279 | #endif |
280 | 280 | } |
281 | - if (is_init(tsk)) { | |
281 | + if (is_global_init(tsk)) { | |
282 | 282 | panic("INIT had user mode bad_area\n"); |
283 | 283 | } |
284 | 284 | tsk->thread.address = address; |
285 | 285 | |
... | ... | @@ -320,14 +320,14 @@ |
320 | 320 | * us unable to handle the page fault gracefully. |
321 | 321 | */ |
322 | 322 | out_of_memory: |
323 | - if (is_init(current)) { | |
323 | + if (is_global_init(current)) { | |
324 | 324 | panic("INIT out of memory\n"); |
325 | 325 | yield(); |
326 | 326 | goto survive; |
327 | 327 | } |
328 | 328 | printk("fault:Out of memory\n"); |
329 | 329 | up_read(&mm->mmap_sem); |
330 | - if (is_init(current)) { | |
330 | + if (is_global_init(current)) { | |
331 | 331 | yield(); |
332 | 332 | down_read(&mm->mmap_sem); |
333 | 333 | goto survive; |
arch/um/kernel/trap.c
arch/x86/lib/usercopy_32.c
... | ... | @@ -748,7 +748,7 @@ |
748 | 748 | retval = get_user_pages(current, current->mm, |
749 | 749 | (unsigned long )to, 1, 1, 0, &pg, NULL); |
750 | 750 | |
751 | - if (retval == -ENOMEM && is_init(current)) { | |
751 | + if (retval == -ENOMEM && is_global_init(current)) { | |
752 | 752 | up_read(&current->mm->mmap_sem); |
753 | 753 | congestion_wait(WRITE, HZ/50); |
754 | 754 | goto survive; |
arch/x86/mm/fault_32.c
arch/x86/mm/fault_64.c
arch/xtensa/mm/fault.c
drivers/char/sysrq.c
include/linux/sched.h
... | ... | @@ -1237,12 +1237,20 @@ |
1237 | 1237 | } |
1238 | 1238 | |
1239 | 1239 | /** |
1240 | - * is_init - check if a task structure is init | |
1240 | + * is_global_init - check if a task structure is init | |
1241 | 1241 | * @tsk: Task structure to be checked. |
1242 | 1242 | * |
1243 | 1243 | * Check if a task structure is the first user space task the kernel created. |
1244 | + * | |
1245 | + * TODO: We should inline this function after some cleanups in pid_namespace.h | |
1244 | 1246 | */ |
1245 | -static inline int is_init(struct task_struct *tsk) | |
1247 | +extern int is_global_init(struct task_struct *tsk); | |
1248 | + | |
1249 | +/* | |
1250 | + * is_container_init: | |
1251 | + * check whether the task is init in its own pid namespace. | |
1252 | + */ | |
1253 | +static inline int is_container_init(struct task_struct *tsk) | |
1246 | 1254 | { |
1247 | 1255 | return tsk->pid == 1; |
1248 | 1256 | } |
kernel/capability.c
... | ... | @@ -12,6 +12,7 @@ |
12 | 12 | #include <linux/module.h> |
13 | 13 | #include <linux/security.h> |
14 | 14 | #include <linux/syscalls.h> |
15 | +#include <linux/pid_namespace.h> | |
15 | 16 | #include <asm/uaccess.h> |
16 | 17 | |
17 | 18 | /* |
... | ... | @@ -129,7 +130,7 @@ |
129 | 130 | int found = 0; |
130 | 131 | |
131 | 132 | do_each_thread(g, target) { |
132 | - if (target == current || is_init(target)) | |
133 | + if (target == current || is_container_init(target->group_leader)) | |
133 | 134 | continue; |
134 | 135 | found = 1; |
135 | 136 | if (security_capset_check(target, effective, inheritable, |
kernel/exit.c
... | ... | @@ -221,7 +221,7 @@ |
221 | 221 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
222 | 222 | if (p == ignored_task |
223 | 223 | || p->exit_state |
224 | - || is_init(p->real_parent)) | |
224 | + || is_global_init(p->real_parent)) | |
225 | 225 | continue; |
226 | 226 | if (task_pgrp(p->real_parent) != pgrp && |
227 | 227 | task_session(p->real_parent) == task_session(p)) { |
kernel/kexec.c
kernel/pid.c
... | ... | @@ -70,6 +70,11 @@ |
70 | 70 | .child_reaper = &init_task |
71 | 71 | }; |
72 | 72 | |
73 | +int is_global_init(struct task_struct *tsk) | |
74 | +{ | |
75 | + return tsk == init_pid_ns.child_reaper; | |
76 | +} | |
77 | + | |
73 | 78 | /* |
74 | 79 | * Note: disable interrupts while the pidmap_lock is held as an |
75 | 80 | * interrupt might come in and do read_lock(&tasklist_lock). |
kernel/signal.c
kernel/sysctl.c
... | ... | @@ -1888,7 +1888,7 @@ |
1888 | 1888 | return -EPERM; |
1889 | 1889 | } |
1890 | 1890 | |
1891 | - op = is_init(current) ? OP_SET : OP_AND; | |
1891 | + op = is_global_init(current) ? OP_SET : OP_AND; | |
1892 | 1892 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, |
1893 | 1893 | do_proc_dointvec_bset_conv,&op); |
1894 | 1894 | } |
mm/oom_kill.c
... | ... | @@ -212,7 +212,7 @@ |
212 | 212 | if (!p->mm) |
213 | 213 | continue; |
214 | 214 | /* skip the init task */ |
215 | - if (is_init(p)) | |
215 | + if (is_global_init(p)) | |
216 | 216 | continue; |
217 | 217 | |
218 | 218 | /* |
... | ... | @@ -265,7 +265,7 @@ |
265 | 265 | */ |
266 | 266 | static void __oom_kill_task(struct task_struct *p, int verbose) |
267 | 267 | { |
268 | - if (is_init(p)) { | |
268 | + if (is_global_init(p)) { | |
269 | 269 | WARN_ON(1); |
270 | 270 | printk(KERN_WARNING "tried to kill init!\n"); |
271 | 271 | return; |
security/commoncap.c
... | ... | @@ -23,6 +23,7 @@ |
23 | 23 | #include <linux/xattr.h> |
24 | 24 | #include <linux/hugetlb.h> |
25 | 25 | #include <linux/mount.h> |
26 | +#include <linux/sched.h> | |
26 | 27 | |
27 | 28 | #ifdef CONFIG_SECURITY_FILE_CAPABILITIES |
28 | 29 | /* |
... | ... | @@ -334,7 +335,7 @@ |
334 | 335 | /* For init, we want to retain the capabilities set |
335 | 336 | * in the init_task struct. Thus we skip the usual |
336 | 337 | * capability rules */ |
337 | - if (!is_init(current)) { | |
338 | + if (!is_global_init(current)) { | |
338 | 339 | current->cap_permitted = new_permitted; |
339 | 340 | current->cap_effective = bprm->cap_effective ? |
340 | 341 | new_permitted : 0; |