Commit e80d0a1ae8bb8fee0edd37427836f108b30f596b

Author: Frederic Weisbecker
Parent: a634f93335

cputime: Rename thread_group_times to thread_group_cputime_adjusted

We have thread_group_cputime() and thread_group_times(). The naming
doesn't provide enough information about the difference between
these two APIs.

To reduce the confusion, rename thread_group_times() to
thread_group_cputime_adjusted(). This name better suggests that
it is a version of thread_group_cputime() that stabilizes the raw
cputime values, i.e. here: scaling them on top of the CFS runtime
stats and bounding them from below for monotonicity.
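
To make the distinction concrete, here is a minimal C sketch of what
"adjusted" means in this context. It is only an illustration: the names
prev_cputime_sketch and cputime_adjust_sketch are invented for the
example and the arithmetic is simplified; this is not the kernel's
actual implementation.

#include <stdint.h>

/* Hypothetical bookkeeping: the values most recently reported to user space. */
struct prev_cputime_sketch {
	uint64_t utime;
	uint64_t stime;
};

/*
 * Rescale the tick-sampled utime/stime so that their sum matches the
 * precise scheduler runtime (rtime, i.e. the CFS sum_exec_runtime), then
 * bound the results from below by the previously reported values so
 * successive reads never appear to go backwards.
 */
static void cputime_adjust_sketch(uint64_t raw_utime, uint64_t raw_stime,
				  uint64_t rtime,
				  struct prev_cputime_sketch *prev,
				  uint64_t *ut, uint64_t *st)
{
	uint64_t total = raw_utime + raw_stime;
	uint64_t utime, stime;

	if (total) {
		/* Split rtime in the same user/system ratio as the raw
		 * samples (64-bit overflow ignored for brevity). */
		utime = raw_utime * rtime / total;
		stime = rtime - utime;
	} else {
		utime = 0;
		stime = rtime;
	}

	/* Monotonicity: never report less than the previous snapshot. */
	if (utime < prev->utime)
		utime = prev->utime;
	if (stime < prev->stime)
		stime = prev->stime;

	prev->utime = *ut = utime;
	prev->stime = *st = stime;
}

With this framing, thread_group_cputime_adjusted() reads naturally as the
adjusted counterpart of thread_group_cputime(), which is exactly what the
rename is meant to convey; the same convention applies to the per-task
task_cputime_adjusted() seen in the diff below.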

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>

Showing 5 changed files with 13 additions and 13 deletions

fs/proc/array.c

1 /* 1 /*
2 * linux/fs/proc/array.c 2 * linux/fs/proc/array.c
3 * 3 *
4 * Copyright (C) 1992 by Linus Torvalds 4 * Copyright (C) 1992 by Linus Torvalds
5 * based on ideas by Darren Senn 5 * based on ideas by Darren Senn
6 * 6 *
7 * Fixes: 7 * Fixes:
8 * Michael. K. Johnson: stat,statm extensions. 8 * Michael. K. Johnson: stat,statm extensions.
9 * <johnsonm@stolaf.edu> 9 * <johnsonm@stolaf.edu>
10 * 10 *
11 * Pauline Middelink : Made cmdline,envline only break at '\0's, to 11 * Pauline Middelink : Made cmdline,envline only break at '\0's, to
12 * make sure SET_PROCTITLE works. Also removed 12 * make sure SET_PROCTITLE works. Also removed
13 * bad '!' which forced address recalculation for 13 * bad '!' which forced address recalculation for
14 * EVERY character on the current page. 14 * EVERY character on the current page.
15 * <middelin@polyware.iaf.nl> 15 * <middelin@polyware.iaf.nl>
16 * 16 *
17 * Danny ter Haar : added cpuinfo 17 * Danny ter Haar : added cpuinfo
18 * <dth@cistron.nl> 18 * <dth@cistron.nl>
19 * 19 *
20 * Alessandro Rubini : profile extension. 20 * Alessandro Rubini : profile extension.
21 * <rubini@ipvvis.unipv.it> 21 * <rubini@ipvvis.unipv.it>
22 * 22 *
23 * Jeff Tranter : added BogoMips field to cpuinfo 23 * Jeff Tranter : added BogoMips field to cpuinfo
24 * <Jeff_Tranter@Mitel.COM> 24 * <Jeff_Tranter@Mitel.COM>
25 * 25 *
26 * Bruno Haible : remove 4K limit for the maps file 26 * Bruno Haible : remove 4K limit for the maps file
27 * <haible@ma2s2.mathematik.uni-karlsruhe.de> 27 * <haible@ma2s2.mathematik.uni-karlsruhe.de>
28 * 28 *
29 * Yves Arrouye : remove removal of trailing spaces in get_array. 29 * Yves Arrouye : remove removal of trailing spaces in get_array.
30 * <Yves.Arrouye@marin.fdn.fr> 30 * <Yves.Arrouye@marin.fdn.fr>
31 * 31 *
32 * Jerome Forissier : added per-CPU time information to /proc/stat 32 * Jerome Forissier : added per-CPU time information to /proc/stat
33 * and /proc/<pid>/cpu extension 33 * and /proc/<pid>/cpu extension
34 * <forissier@isia.cma.fr> 34 * <forissier@isia.cma.fr>
35 * - Incorporation and non-SMP safe operation 35 * - Incorporation and non-SMP safe operation
36 * of forissier patch in 2.1.78 by 36 * of forissier patch in 2.1.78 by
37 * Hans Marcus <crowbar@concepts.nl> 37 * Hans Marcus <crowbar@concepts.nl>
38 * 38 *
39 * aeb@cwi.nl : /proc/partitions 39 * aeb@cwi.nl : /proc/partitions
40 * 40 *
41 * 41 *
42 * Alan Cox : security fixes. 42 * Alan Cox : security fixes.
43 * <alan@lxorguk.ukuu.org.uk> 43 * <alan@lxorguk.ukuu.org.uk>
44 * 44 *
45 * Al Viro : safe handling of mm_struct 45 * Al Viro : safe handling of mm_struct
46 * 46 *
47 * Gerhard Wichert : added BIGMEM support 47 * Gerhard Wichert : added BIGMEM support
48 * Siemens AG <Gerhard.Wichert@pdb.siemens.de> 48 * Siemens AG <Gerhard.Wichert@pdb.siemens.de>
49 * 49 *
50 * Al Viro & Jeff Garzik : moved most of the thing into base.c and 50 * Al Viro & Jeff Garzik : moved most of the thing into base.c and
51 * : proc_misc.c. The rest may eventually go into 51 * : proc_misc.c. The rest may eventually go into
52 * : base.c too. 52 * : base.c too.
53 */ 53 */
54 54
55 #include <linux/types.h> 55 #include <linux/types.h>
56 #include <linux/errno.h> 56 #include <linux/errno.h>
57 #include <linux/time.h> 57 #include <linux/time.h>
58 #include <linux/kernel.h> 58 #include <linux/kernel.h>
59 #include <linux/kernel_stat.h> 59 #include <linux/kernel_stat.h>
60 #include <linux/tty.h> 60 #include <linux/tty.h>
61 #include <linux/string.h> 61 #include <linux/string.h>
62 #include <linux/mman.h> 62 #include <linux/mman.h>
63 #include <linux/proc_fs.h> 63 #include <linux/proc_fs.h>
64 #include <linux/ioport.h> 64 #include <linux/ioport.h>
65 #include <linux/uaccess.h> 65 #include <linux/uaccess.h>
66 #include <linux/io.h> 66 #include <linux/io.h>
67 #include <linux/mm.h> 67 #include <linux/mm.h>
68 #include <linux/hugetlb.h> 68 #include <linux/hugetlb.h>
69 #include <linux/pagemap.h> 69 #include <linux/pagemap.h>
70 #include <linux/swap.h> 70 #include <linux/swap.h>
71 #include <linux/smp.h> 71 #include <linux/smp.h>
72 #include <linux/signal.h> 72 #include <linux/signal.h>
73 #include <linux/highmem.h> 73 #include <linux/highmem.h>
74 #include <linux/file.h> 74 #include <linux/file.h>
75 #include <linux/fdtable.h> 75 #include <linux/fdtable.h>
76 #include <linux/times.h> 76 #include <linux/times.h>
77 #include <linux/cpuset.h> 77 #include <linux/cpuset.h>
78 #include <linux/rcupdate.h> 78 #include <linux/rcupdate.h>
79 #include <linux/delayacct.h> 79 #include <linux/delayacct.h>
80 #include <linux/seq_file.h> 80 #include <linux/seq_file.h>
81 #include <linux/pid_namespace.h> 81 #include <linux/pid_namespace.h>
82 #include <linux/ptrace.h> 82 #include <linux/ptrace.h>
83 #include <linux/tracehook.h> 83 #include <linux/tracehook.h>
84 #include <linux/user_namespace.h> 84 #include <linux/user_namespace.h>
85 85
86 #include <asm/pgtable.h> 86 #include <asm/pgtable.h>
87 #include <asm/processor.h> 87 #include <asm/processor.h>
88 #include "internal.h" 88 #include "internal.h"
89 89
90 static inline void task_name(struct seq_file *m, struct task_struct *p) 90 static inline void task_name(struct seq_file *m, struct task_struct *p)
91 { 91 {
92 int i; 92 int i;
93 char *buf, *end; 93 char *buf, *end;
94 char *name; 94 char *name;
95 char tcomm[sizeof(p->comm)]; 95 char tcomm[sizeof(p->comm)];
96 96
97 get_task_comm(tcomm, p); 97 get_task_comm(tcomm, p);
98 98
99 seq_puts(m, "Name:\t"); 99 seq_puts(m, "Name:\t");
100 end = m->buf + m->size; 100 end = m->buf + m->size;
101 buf = m->buf + m->count; 101 buf = m->buf + m->count;
102 name = tcomm; 102 name = tcomm;
103 i = sizeof(tcomm); 103 i = sizeof(tcomm);
104 while (i && (buf < end)) { 104 while (i && (buf < end)) {
105 unsigned char c = *name; 105 unsigned char c = *name;
106 name++; 106 name++;
107 i--; 107 i--;
108 *buf = c; 108 *buf = c;
109 if (!c) 109 if (!c)
110 break; 110 break;
111 if (c == '\\') { 111 if (c == '\\') {
112 buf++; 112 buf++;
113 if (buf < end) 113 if (buf < end)
114 *buf++ = c; 114 *buf++ = c;
115 continue; 115 continue;
116 } 116 }
117 if (c == '\n') { 117 if (c == '\n') {
118 *buf++ = '\\'; 118 *buf++ = '\\';
119 if (buf < end) 119 if (buf < end)
120 *buf++ = 'n'; 120 *buf++ = 'n';
121 continue; 121 continue;
122 } 122 }
123 buf++; 123 buf++;
124 } 124 }
125 m->count = buf - m->buf; 125 m->count = buf - m->buf;
126 seq_putc(m, '\n'); 126 seq_putc(m, '\n');
127 } 127 }
128 128
129 /* 129 /*
130 * The task state array is a strange "bitmap" of 130 * The task state array is a strange "bitmap" of
131 * reasons to sleep. Thus "running" is zero, and 131 * reasons to sleep. Thus "running" is zero, and
132 * you can test for combinations of others with 132 * you can test for combinations of others with
133 * simple bit tests. 133 * simple bit tests.
134 */ 134 */
135 static const char * const task_state_array[] = { 135 static const char * const task_state_array[] = {
136 "R (running)", /* 0 */ 136 "R (running)", /* 0 */
137 "S (sleeping)", /* 1 */ 137 "S (sleeping)", /* 1 */
138 "D (disk sleep)", /* 2 */ 138 "D (disk sleep)", /* 2 */
139 "T (stopped)", /* 4 */ 139 "T (stopped)", /* 4 */
140 "t (tracing stop)", /* 8 */ 140 "t (tracing stop)", /* 8 */
141 "Z (zombie)", /* 16 */ 141 "Z (zombie)", /* 16 */
142 "X (dead)", /* 32 */ 142 "X (dead)", /* 32 */
143 "x (dead)", /* 64 */ 143 "x (dead)", /* 64 */
144 "K (wakekill)", /* 128 */ 144 "K (wakekill)", /* 128 */
145 "W (waking)", /* 256 */ 145 "W (waking)", /* 256 */
146 }; 146 };
147 147
148 static inline const char *get_task_state(struct task_struct *tsk) 148 static inline const char *get_task_state(struct task_struct *tsk)
149 { 149 {
150 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; 150 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
151 const char * const *p = &task_state_array[0]; 151 const char * const *p = &task_state_array[0];
152 152
153 BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); 153 BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
154 154
155 while (state) { 155 while (state) {
156 p++; 156 p++;
157 state >>= 1; 157 state >>= 1;
158 } 158 }
159 return *p; 159 return *p;
160 } 160 }
161 161
162 static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 162 static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
163 struct pid *pid, struct task_struct *p) 163 struct pid *pid, struct task_struct *p)
164 { 164 {
165 struct user_namespace *user_ns = current_user_ns(); 165 struct user_namespace *user_ns = current_user_ns();
166 struct group_info *group_info; 166 struct group_info *group_info;
167 int g; 167 int g;
168 struct fdtable *fdt = NULL; 168 struct fdtable *fdt = NULL;
169 const struct cred *cred; 169 const struct cred *cred;
170 pid_t ppid, tpid; 170 pid_t ppid, tpid;
171 171
172 rcu_read_lock(); 172 rcu_read_lock();
173 ppid = pid_alive(p) ? 173 ppid = pid_alive(p) ?
174 task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; 174 task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
175 tpid = 0; 175 tpid = 0;
176 if (pid_alive(p)) { 176 if (pid_alive(p)) {
177 struct task_struct *tracer = ptrace_parent(p); 177 struct task_struct *tracer = ptrace_parent(p);
178 if (tracer) 178 if (tracer)
179 tpid = task_pid_nr_ns(tracer, ns); 179 tpid = task_pid_nr_ns(tracer, ns);
180 } 180 }
181 cred = get_task_cred(p); 181 cred = get_task_cred(p);
182 seq_printf(m, 182 seq_printf(m,
183 "State:\t%s\n" 183 "State:\t%s\n"
184 "Tgid:\t%d\n" 184 "Tgid:\t%d\n"
185 "Pid:\t%d\n" 185 "Pid:\t%d\n"
186 "PPid:\t%d\n" 186 "PPid:\t%d\n"
187 "TracerPid:\t%d\n" 187 "TracerPid:\t%d\n"
188 "Uid:\t%d\t%d\t%d\t%d\n" 188 "Uid:\t%d\t%d\t%d\t%d\n"
189 "Gid:\t%d\t%d\t%d\t%d\n", 189 "Gid:\t%d\t%d\t%d\t%d\n",
190 get_task_state(p), 190 get_task_state(p),
191 task_tgid_nr_ns(p, ns), 191 task_tgid_nr_ns(p, ns),
192 pid_nr_ns(pid, ns), 192 pid_nr_ns(pid, ns),
193 ppid, tpid, 193 ppid, tpid,
194 from_kuid_munged(user_ns, cred->uid), 194 from_kuid_munged(user_ns, cred->uid),
195 from_kuid_munged(user_ns, cred->euid), 195 from_kuid_munged(user_ns, cred->euid),
196 from_kuid_munged(user_ns, cred->suid), 196 from_kuid_munged(user_ns, cred->suid),
197 from_kuid_munged(user_ns, cred->fsuid), 197 from_kuid_munged(user_ns, cred->fsuid),
198 from_kgid_munged(user_ns, cred->gid), 198 from_kgid_munged(user_ns, cred->gid),
199 from_kgid_munged(user_ns, cred->egid), 199 from_kgid_munged(user_ns, cred->egid),
200 from_kgid_munged(user_ns, cred->sgid), 200 from_kgid_munged(user_ns, cred->sgid),
201 from_kgid_munged(user_ns, cred->fsgid)); 201 from_kgid_munged(user_ns, cred->fsgid));
202 202
203 task_lock(p); 203 task_lock(p);
204 if (p->files) 204 if (p->files)
205 fdt = files_fdtable(p->files); 205 fdt = files_fdtable(p->files);
206 seq_printf(m, 206 seq_printf(m,
207 "FDSize:\t%d\n" 207 "FDSize:\t%d\n"
208 "Groups:\t", 208 "Groups:\t",
209 fdt ? fdt->max_fds : 0); 209 fdt ? fdt->max_fds : 0);
210 rcu_read_unlock(); 210 rcu_read_unlock();
211 211
212 group_info = cred->group_info; 212 group_info = cred->group_info;
213 task_unlock(p); 213 task_unlock(p);
214 214
215 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) 215 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
216 seq_printf(m, "%d ", 216 seq_printf(m, "%d ",
217 from_kgid_munged(user_ns, GROUP_AT(group_info, g))); 217 from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
218 put_cred(cred); 218 put_cred(cred);
219 219
220 seq_putc(m, '\n'); 220 seq_putc(m, '\n');
221 } 221 }
222 222
223 static void render_sigset_t(struct seq_file *m, const char *header, 223 static void render_sigset_t(struct seq_file *m, const char *header,
224 sigset_t *set) 224 sigset_t *set)
225 { 225 {
226 int i; 226 int i;
227 227
228 seq_puts(m, header); 228 seq_puts(m, header);
229 229
230 i = _NSIG; 230 i = _NSIG;
231 do { 231 do {
232 int x = 0; 232 int x = 0;
233 233
234 i -= 4; 234 i -= 4;
235 if (sigismember(set, i+1)) x |= 1; 235 if (sigismember(set, i+1)) x |= 1;
236 if (sigismember(set, i+2)) x |= 2; 236 if (sigismember(set, i+2)) x |= 2;
237 if (sigismember(set, i+3)) x |= 4; 237 if (sigismember(set, i+3)) x |= 4;
238 if (sigismember(set, i+4)) x |= 8; 238 if (sigismember(set, i+4)) x |= 8;
239 seq_printf(m, "%x", x); 239 seq_printf(m, "%x", x);
240 } while (i >= 4); 240 } while (i >= 4);
241 241
242 seq_putc(m, '\n'); 242 seq_putc(m, '\n');
243 } 243 }
244 244
245 static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, 245 static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
246 sigset_t *catch) 246 sigset_t *catch)
247 { 247 {
248 struct k_sigaction *k; 248 struct k_sigaction *k;
249 int i; 249 int i;
250 250
251 k = p->sighand->action; 251 k = p->sighand->action;
252 for (i = 1; i <= _NSIG; ++i, ++k) { 252 for (i = 1; i <= _NSIG; ++i, ++k) {
253 if (k->sa.sa_handler == SIG_IGN) 253 if (k->sa.sa_handler == SIG_IGN)
254 sigaddset(ign, i); 254 sigaddset(ign, i);
255 else if (k->sa.sa_handler != SIG_DFL) 255 else if (k->sa.sa_handler != SIG_DFL)
256 sigaddset(catch, i); 256 sigaddset(catch, i);
257 } 257 }
258 } 258 }
259 259
260 static inline void task_sig(struct seq_file *m, struct task_struct *p) 260 static inline void task_sig(struct seq_file *m, struct task_struct *p)
261 { 261 {
262 unsigned long flags; 262 unsigned long flags;
263 sigset_t pending, shpending, blocked, ignored, caught; 263 sigset_t pending, shpending, blocked, ignored, caught;
264 int num_threads = 0; 264 int num_threads = 0;
265 unsigned long qsize = 0; 265 unsigned long qsize = 0;
266 unsigned long qlim = 0; 266 unsigned long qlim = 0;
267 267
268 sigemptyset(&pending); 268 sigemptyset(&pending);
269 sigemptyset(&shpending); 269 sigemptyset(&shpending);
270 sigemptyset(&blocked); 270 sigemptyset(&blocked);
271 sigemptyset(&ignored); 271 sigemptyset(&ignored);
272 sigemptyset(&caught); 272 sigemptyset(&caught);
273 273
274 if (lock_task_sighand(p, &flags)) { 274 if (lock_task_sighand(p, &flags)) {
275 pending = p->pending.signal; 275 pending = p->pending.signal;
276 shpending = p->signal->shared_pending.signal; 276 shpending = p->signal->shared_pending.signal;
277 blocked = p->blocked; 277 blocked = p->blocked;
278 collect_sigign_sigcatch(p, &ignored, &caught); 278 collect_sigign_sigcatch(p, &ignored, &caught);
279 num_threads = get_nr_threads(p); 279 num_threads = get_nr_threads(p);
280 rcu_read_lock(); /* FIXME: is this correct? */ 280 rcu_read_lock(); /* FIXME: is this correct? */
281 qsize = atomic_read(&__task_cred(p)->user->sigpending); 281 qsize = atomic_read(&__task_cred(p)->user->sigpending);
282 rcu_read_unlock(); 282 rcu_read_unlock();
283 qlim = task_rlimit(p, RLIMIT_SIGPENDING); 283 qlim = task_rlimit(p, RLIMIT_SIGPENDING);
284 unlock_task_sighand(p, &flags); 284 unlock_task_sighand(p, &flags);
285 } 285 }
286 286
287 seq_printf(m, "Threads:\t%d\n", num_threads); 287 seq_printf(m, "Threads:\t%d\n", num_threads);
288 seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); 288 seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
289 289
290 /* render them all */ 290 /* render them all */
291 render_sigset_t(m, "SigPnd:\t", &pending); 291 render_sigset_t(m, "SigPnd:\t", &pending);
292 render_sigset_t(m, "ShdPnd:\t", &shpending); 292 render_sigset_t(m, "ShdPnd:\t", &shpending);
293 render_sigset_t(m, "SigBlk:\t", &blocked); 293 render_sigset_t(m, "SigBlk:\t", &blocked);
294 render_sigset_t(m, "SigIgn:\t", &ignored); 294 render_sigset_t(m, "SigIgn:\t", &ignored);
295 render_sigset_t(m, "SigCgt:\t", &caught); 295 render_sigset_t(m, "SigCgt:\t", &caught);
296 } 296 }
297 297
298 static void render_cap_t(struct seq_file *m, const char *header, 298 static void render_cap_t(struct seq_file *m, const char *header,
299 kernel_cap_t *a) 299 kernel_cap_t *a)
300 { 300 {
301 unsigned __capi; 301 unsigned __capi;
302 302
303 seq_puts(m, header); 303 seq_puts(m, header);
304 CAP_FOR_EACH_U32(__capi) { 304 CAP_FOR_EACH_U32(__capi) {
305 seq_printf(m, "%08x", 305 seq_printf(m, "%08x",
306 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); 306 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
307 } 307 }
308 seq_putc(m, '\n'); 308 seq_putc(m, '\n');
309 } 309 }
310 310
311 static inline void task_cap(struct seq_file *m, struct task_struct *p) 311 static inline void task_cap(struct seq_file *m, struct task_struct *p)
312 { 312 {
313 const struct cred *cred; 313 const struct cred *cred;
314 kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; 314 kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
315 315
316 rcu_read_lock(); 316 rcu_read_lock();
317 cred = __task_cred(p); 317 cred = __task_cred(p);
318 cap_inheritable = cred->cap_inheritable; 318 cap_inheritable = cred->cap_inheritable;
319 cap_permitted = cred->cap_permitted; 319 cap_permitted = cred->cap_permitted;
320 cap_effective = cred->cap_effective; 320 cap_effective = cred->cap_effective;
321 cap_bset = cred->cap_bset; 321 cap_bset = cred->cap_bset;
322 rcu_read_unlock(); 322 rcu_read_unlock();
323 323
324 render_cap_t(m, "CapInh:\t", &cap_inheritable); 324 render_cap_t(m, "CapInh:\t", &cap_inheritable);
325 render_cap_t(m, "CapPrm:\t", &cap_permitted); 325 render_cap_t(m, "CapPrm:\t", &cap_permitted);
326 render_cap_t(m, "CapEff:\t", &cap_effective); 326 render_cap_t(m, "CapEff:\t", &cap_effective);
327 render_cap_t(m, "CapBnd:\t", &cap_bset); 327 render_cap_t(m, "CapBnd:\t", &cap_bset);
328 } 328 }
329 329
330 static inline void task_context_switch_counts(struct seq_file *m, 330 static inline void task_context_switch_counts(struct seq_file *m,
331 struct task_struct *p) 331 struct task_struct *p)
332 { 332 {
333 seq_printf(m, "voluntary_ctxt_switches:\t%lu\n" 333 seq_printf(m, "voluntary_ctxt_switches:\t%lu\n"
334 "nonvoluntary_ctxt_switches:\t%lu\n", 334 "nonvoluntary_ctxt_switches:\t%lu\n",
335 p->nvcsw, 335 p->nvcsw,
336 p->nivcsw); 336 p->nivcsw);
337 } 337 }
338 338
339 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 339 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
340 { 340 {
341 seq_puts(m, "Cpus_allowed:\t"); 341 seq_puts(m, "Cpus_allowed:\t");
342 seq_cpumask(m, &task->cpus_allowed); 342 seq_cpumask(m, &task->cpus_allowed);
343 seq_putc(m, '\n'); 343 seq_putc(m, '\n');
344 seq_puts(m, "Cpus_allowed_list:\t"); 344 seq_puts(m, "Cpus_allowed_list:\t");
345 seq_cpumask_list(m, &task->cpus_allowed); 345 seq_cpumask_list(m, &task->cpus_allowed);
346 seq_putc(m, '\n'); 346 seq_putc(m, '\n');
347 } 347 }
348 348
349 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 349 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
350 struct pid *pid, struct task_struct *task) 350 struct pid *pid, struct task_struct *task)
351 { 351 {
352 struct mm_struct *mm = get_task_mm(task); 352 struct mm_struct *mm = get_task_mm(task);
353 353
354 task_name(m, task); 354 task_name(m, task);
355 task_state(m, ns, pid, task); 355 task_state(m, ns, pid, task);
356 356
357 if (mm) { 357 if (mm) {
358 task_mem(m, mm); 358 task_mem(m, mm);
359 mmput(mm); 359 mmput(mm);
360 } 360 }
361 task_sig(m, task); 361 task_sig(m, task);
362 task_cap(m, task); 362 task_cap(m, task);
363 task_cpus_allowed(m, task); 363 task_cpus_allowed(m, task);
364 cpuset_task_status_allowed(m, task); 364 cpuset_task_status_allowed(m, task);
365 task_context_switch_counts(m, task); 365 task_context_switch_counts(m, task);
366 return 0; 366 return 0;
367 } 367 }
368 368
369 static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, 369 static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
370 struct pid *pid, struct task_struct *task, int whole) 370 struct pid *pid, struct task_struct *task, int whole)
371 { 371 {
372 unsigned long vsize, eip, esp, wchan = ~0UL; 372 unsigned long vsize, eip, esp, wchan = ~0UL;
373 int priority, nice; 373 int priority, nice;
374 int tty_pgrp = -1, tty_nr = 0; 374 int tty_pgrp = -1, tty_nr = 0;
375 sigset_t sigign, sigcatch; 375 sigset_t sigign, sigcatch;
376 char state; 376 char state;
377 pid_t ppid = 0, pgid = -1, sid = -1; 377 pid_t ppid = 0, pgid = -1, sid = -1;
378 int num_threads = 0; 378 int num_threads = 0;
379 int permitted; 379 int permitted;
380 struct mm_struct *mm; 380 struct mm_struct *mm;
381 unsigned long long start_time; 381 unsigned long long start_time;
382 unsigned long cmin_flt = 0, cmaj_flt = 0; 382 unsigned long cmin_flt = 0, cmaj_flt = 0;
383 unsigned long min_flt = 0, maj_flt = 0; 383 unsigned long min_flt = 0, maj_flt = 0;
384 cputime_t cutime, cstime, utime, stime; 384 cputime_t cutime, cstime, utime, stime;
385 cputime_t cgtime, gtime; 385 cputime_t cgtime, gtime;
386 unsigned long rsslim = 0; 386 unsigned long rsslim = 0;
387 char tcomm[sizeof(task->comm)]; 387 char tcomm[sizeof(task->comm)];
388 unsigned long flags; 388 unsigned long flags;
389 389
390 state = *get_task_state(task); 390 state = *get_task_state(task);
391 vsize = eip = esp = 0; 391 vsize = eip = esp = 0;
392 permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); 392 permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
393 mm = get_task_mm(task); 393 mm = get_task_mm(task);
394 if (mm) { 394 if (mm) {
395 vsize = task_vsize(mm); 395 vsize = task_vsize(mm);
396 if (permitted) { 396 if (permitted) {
397 eip = KSTK_EIP(task); 397 eip = KSTK_EIP(task);
398 esp = KSTK_ESP(task); 398 esp = KSTK_ESP(task);
399 } 399 }
400 } 400 }
401 401
402 get_task_comm(tcomm, task); 402 get_task_comm(tcomm, task);
403 403
404 sigemptyset(&sigign); 404 sigemptyset(&sigign);
405 sigemptyset(&sigcatch); 405 sigemptyset(&sigcatch);
406 cutime = cstime = utime = stime = 0; 406 cutime = cstime = utime = stime = 0;
407 cgtime = gtime = 0; 407 cgtime = gtime = 0;
408 408
409 if (lock_task_sighand(task, &flags)) { 409 if (lock_task_sighand(task, &flags)) {
410 struct signal_struct *sig = task->signal; 410 struct signal_struct *sig = task->signal;
411 411
412 if (sig->tty) { 412 if (sig->tty) {
413 struct pid *pgrp = tty_get_pgrp(sig->tty); 413 struct pid *pgrp = tty_get_pgrp(sig->tty);
414 tty_pgrp = pid_nr_ns(pgrp, ns); 414 tty_pgrp = pid_nr_ns(pgrp, ns);
415 put_pid(pgrp); 415 put_pid(pgrp);
416 tty_nr = new_encode_dev(tty_devnum(sig->tty)); 416 tty_nr = new_encode_dev(tty_devnum(sig->tty));
417 } 417 }
418 418
419 num_threads = get_nr_threads(task); 419 num_threads = get_nr_threads(task);
420 collect_sigign_sigcatch(task, &sigign, &sigcatch); 420 collect_sigign_sigcatch(task, &sigign, &sigcatch);
421 421
422 cmin_flt = sig->cmin_flt; 422 cmin_flt = sig->cmin_flt;
423 cmaj_flt = sig->cmaj_flt; 423 cmaj_flt = sig->cmaj_flt;
424 cutime = sig->cutime; 424 cutime = sig->cutime;
425 cstime = sig->cstime; 425 cstime = sig->cstime;
426 cgtime = sig->cgtime; 426 cgtime = sig->cgtime;
427 rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); 427 rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
428 428
429 /* add up live thread stats at the group level */ 429 /* add up live thread stats at the group level */
430 if (whole) { 430 if (whole) {
431 struct task_struct *t = task; 431 struct task_struct *t = task;
432 do { 432 do {
433 min_flt += t->min_flt; 433 min_flt += t->min_flt;
434 maj_flt += t->maj_flt; 434 maj_flt += t->maj_flt;
435 gtime += t->gtime; 435 gtime += t->gtime;
436 t = next_thread(t); 436 t = next_thread(t);
437 } while (t != task); 437 } while (t != task);
438 438
439 min_flt += sig->min_flt; 439 min_flt += sig->min_flt;
440 maj_flt += sig->maj_flt; 440 maj_flt += sig->maj_flt;
441 thread_group_times(task, &utime, &stime); 441 thread_group_cputime_adjusted(task, &utime, &stime);
442 gtime += sig->gtime; 442 gtime += sig->gtime;
443 } 443 }
444 444
445 sid = task_session_nr_ns(task, ns); 445 sid = task_session_nr_ns(task, ns);
446 ppid = task_tgid_nr_ns(task->real_parent, ns); 446 ppid = task_tgid_nr_ns(task->real_parent, ns);
447 pgid = task_pgrp_nr_ns(task, ns); 447 pgid = task_pgrp_nr_ns(task, ns);
448 448
449 unlock_task_sighand(task, &flags); 449 unlock_task_sighand(task, &flags);
450 } 450 }
451 451
452 if (permitted && (!whole || num_threads < 2)) 452 if (permitted && (!whole || num_threads < 2))
453 wchan = get_wchan(task); 453 wchan = get_wchan(task);
454 if (!whole) { 454 if (!whole) {
455 min_flt = task->min_flt; 455 min_flt = task->min_flt;
456 maj_flt = task->maj_flt; 456 maj_flt = task->maj_flt;
457 task_times(task, &utime, &stime); 457 task_cputime_adjusted(task, &utime, &stime);
458 gtime = task->gtime; 458 gtime = task->gtime;
459 } 459 }
460 460
461 /* scale priority and nice values from timeslices to -20..20 */ 461 /* scale priority and nice values from timeslices to -20..20 */
462 /* to make it look like a "normal" Unix priority/nice value */ 462 /* to make it look like a "normal" Unix priority/nice value */
463 priority = task_prio(task); 463 priority = task_prio(task);
464 nice = task_nice(task); 464 nice = task_nice(task);
465 465
466 /* Temporary variable needed for gcc-2.96 */ 466 /* Temporary variable needed for gcc-2.96 */
467 /* convert timespec -> nsec*/ 467 /* convert timespec -> nsec*/
468 start_time = 468 start_time =
469 (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC 469 (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
470 + task->real_start_time.tv_nsec; 470 + task->real_start_time.tv_nsec;
471 /* convert nsec -> ticks */ 471 /* convert nsec -> ticks */
472 start_time = nsec_to_clock_t(start_time); 472 start_time = nsec_to_clock_t(start_time);
473 473
474 seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); 474 seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
475 seq_put_decimal_ll(m, ' ', ppid); 475 seq_put_decimal_ll(m, ' ', ppid);
476 seq_put_decimal_ll(m, ' ', pgid); 476 seq_put_decimal_ll(m, ' ', pgid);
477 seq_put_decimal_ll(m, ' ', sid); 477 seq_put_decimal_ll(m, ' ', sid);
478 seq_put_decimal_ll(m, ' ', tty_nr); 478 seq_put_decimal_ll(m, ' ', tty_nr);
479 seq_put_decimal_ll(m, ' ', tty_pgrp); 479 seq_put_decimal_ll(m, ' ', tty_pgrp);
480 seq_put_decimal_ull(m, ' ', task->flags); 480 seq_put_decimal_ull(m, ' ', task->flags);
481 seq_put_decimal_ull(m, ' ', min_flt); 481 seq_put_decimal_ull(m, ' ', min_flt);
482 seq_put_decimal_ull(m, ' ', cmin_flt); 482 seq_put_decimal_ull(m, ' ', cmin_flt);
483 seq_put_decimal_ull(m, ' ', maj_flt); 483 seq_put_decimal_ull(m, ' ', maj_flt);
484 seq_put_decimal_ull(m, ' ', cmaj_flt); 484 seq_put_decimal_ull(m, ' ', cmaj_flt);
485 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); 485 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
486 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); 486 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
487 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); 487 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
488 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); 488 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
489 seq_put_decimal_ll(m, ' ', priority); 489 seq_put_decimal_ll(m, ' ', priority);
490 seq_put_decimal_ll(m, ' ', nice); 490 seq_put_decimal_ll(m, ' ', nice);
491 seq_put_decimal_ll(m, ' ', num_threads); 491 seq_put_decimal_ll(m, ' ', num_threads);
492 seq_put_decimal_ull(m, ' ', 0); 492 seq_put_decimal_ull(m, ' ', 0);
493 seq_put_decimal_ull(m, ' ', start_time); 493 seq_put_decimal_ull(m, ' ', start_time);
494 seq_put_decimal_ull(m, ' ', vsize); 494 seq_put_decimal_ull(m, ' ', vsize);
495 seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0); 495 seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
496 seq_put_decimal_ull(m, ' ', rsslim); 496 seq_put_decimal_ull(m, ' ', rsslim);
497 seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); 497 seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
498 seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); 498 seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
499 seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); 499 seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
500 seq_put_decimal_ull(m, ' ', esp); 500 seq_put_decimal_ull(m, ' ', esp);
501 seq_put_decimal_ull(m, ' ', eip); 501 seq_put_decimal_ull(m, ' ', eip);
502 /* The signal information here is obsolete. 502 /* The signal information here is obsolete.
503 * It must be decimal for Linux 2.0 compatibility. 503 * It must be decimal for Linux 2.0 compatibility.
504 * Use /proc/#/status for real-time signals. 504 * Use /proc/#/status for real-time signals.
505 */ 505 */
506 seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); 506 seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
507 seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); 507 seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
508 seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); 508 seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
509 seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); 509 seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
510 seq_put_decimal_ull(m, ' ', wchan); 510 seq_put_decimal_ull(m, ' ', wchan);
511 seq_put_decimal_ull(m, ' ', 0); 511 seq_put_decimal_ull(m, ' ', 0);
512 seq_put_decimal_ull(m, ' ', 0); 512 seq_put_decimal_ull(m, ' ', 0);
513 seq_put_decimal_ll(m, ' ', task->exit_signal); 513 seq_put_decimal_ll(m, ' ', task->exit_signal);
514 seq_put_decimal_ll(m, ' ', task_cpu(task)); 514 seq_put_decimal_ll(m, ' ', task_cpu(task));
515 seq_put_decimal_ull(m, ' ', task->rt_priority); 515 seq_put_decimal_ull(m, ' ', task->rt_priority);
516 seq_put_decimal_ull(m, ' ', task->policy); 516 seq_put_decimal_ull(m, ' ', task->policy);
517 seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); 517 seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
518 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); 518 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
519 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); 519 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
520 520
521 if (mm && permitted) { 521 if (mm && permitted) {
522 seq_put_decimal_ull(m, ' ', mm->start_data); 522 seq_put_decimal_ull(m, ' ', mm->start_data);
523 seq_put_decimal_ull(m, ' ', mm->end_data); 523 seq_put_decimal_ull(m, ' ', mm->end_data);
524 seq_put_decimal_ull(m, ' ', mm->start_brk); 524 seq_put_decimal_ull(m, ' ', mm->start_brk);
525 seq_put_decimal_ull(m, ' ', mm->arg_start); 525 seq_put_decimal_ull(m, ' ', mm->arg_start);
526 seq_put_decimal_ull(m, ' ', mm->arg_end); 526 seq_put_decimal_ull(m, ' ', mm->arg_end);
527 seq_put_decimal_ull(m, ' ', mm->env_start); 527 seq_put_decimal_ull(m, ' ', mm->env_start);
528 seq_put_decimal_ull(m, ' ', mm->env_end); 528 seq_put_decimal_ull(m, ' ', mm->env_end);
529 } else 529 } else
530 seq_printf(m, " 0 0 0 0 0 0 0"); 530 seq_printf(m, " 0 0 0 0 0 0 0");
531 531
532 if (permitted) 532 if (permitted)
533 seq_put_decimal_ll(m, ' ', task->exit_code); 533 seq_put_decimal_ll(m, ' ', task->exit_code);
534 else 534 else
535 seq_put_decimal_ll(m, ' ', 0); 535 seq_put_decimal_ll(m, ' ', 0);
536 536
537 seq_putc(m, '\n'); 537 seq_putc(m, '\n');
538 if (mm) 538 if (mm)
539 mmput(mm); 539 mmput(mm);
540 return 0; 540 return 0;
541 } 541 }
542 542
543 int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, 543 int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
544 struct pid *pid, struct task_struct *task) 544 struct pid *pid, struct task_struct *task)
545 { 545 {
546 return do_task_stat(m, ns, pid, task, 0); 546 return do_task_stat(m, ns, pid, task, 0);
547 } 547 }
548 548
549 int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, 549 int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
550 struct pid *pid, struct task_struct *task) 550 struct pid *pid, struct task_struct *task)
551 { 551 {
552 return do_task_stat(m, ns, pid, task, 1); 552 return do_task_stat(m, ns, pid, task, 1);
553 } 553 }
554 554
555 int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, 555 int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
556 struct pid *pid, struct task_struct *task) 556 struct pid *pid, struct task_struct *task)
557 { 557 {
558 unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; 558 unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
559 struct mm_struct *mm = get_task_mm(task); 559 struct mm_struct *mm = get_task_mm(task);
560 560
561 if (mm) { 561 if (mm) {
562 size = task_statm(mm, &shared, &text, &data, &resident); 562 size = task_statm(mm, &shared, &text, &data, &resident);
563 mmput(mm); 563 mmput(mm);
564 } 564 }
565 /* 565 /*
566 * For quick read, open code by putting numbers directly 566 * For quick read, open code by putting numbers directly
567 * expected format is 567 * expected format is
568 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", 568 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
569 * size, resident, shared, text, data); 569 * size, resident, shared, text, data);
570 */ 570 */
571 seq_put_decimal_ull(m, 0, size); 571 seq_put_decimal_ull(m, 0, size);
572 seq_put_decimal_ull(m, ' ', resident); 572 seq_put_decimal_ull(m, ' ', resident);
573 seq_put_decimal_ull(m, ' ', shared); 573 seq_put_decimal_ull(m, ' ', shared);
574 seq_put_decimal_ull(m, ' ', text); 574 seq_put_decimal_ull(m, ' ', text);
575 seq_put_decimal_ull(m, ' ', 0); 575 seq_put_decimal_ull(m, ' ', 0);
576 seq_put_decimal_ull(m, ' ', data); 576 seq_put_decimal_ull(m, ' ', data);
577 seq_put_decimal_ull(m, ' ', 0); 577 seq_put_decimal_ull(m, ' ', 0);
578 seq_putc(m, '\n'); 578 seq_putc(m, '\n');
579 579
580 return 0; 580 return 0;
581 } 581 }
582 582
583 #ifdef CONFIG_CHECKPOINT_RESTORE 583 #ifdef CONFIG_CHECKPOINT_RESTORE
584 static struct pid * 584 static struct pid *
585 get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) 585 get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
586 { 586 {
587 struct task_struct *start, *task; 587 struct task_struct *start, *task;
588 struct pid *pid = NULL; 588 struct pid *pid = NULL;
589 589
590 read_lock(&tasklist_lock); 590 read_lock(&tasklist_lock);
591 591
592 start = pid_task(proc_pid(inode), PIDTYPE_PID); 592 start = pid_task(proc_pid(inode), PIDTYPE_PID);
593 if (!start) 593 if (!start)
594 goto out; 594 goto out;
595 595
596 /* 596 /*
597 * Lets try to continue searching first, this gives 597 * Lets try to continue searching first, this gives
598 * us significant speedup on children-rich processes. 598 * us significant speedup on children-rich processes.
599 */ 599 */
600 if (pid_prev) { 600 if (pid_prev) {
601 task = pid_task(pid_prev, PIDTYPE_PID); 601 task = pid_task(pid_prev, PIDTYPE_PID);
602 if (task && task->real_parent == start && 602 if (task && task->real_parent == start &&
603 !(list_empty(&task->sibling))) { 603 !(list_empty(&task->sibling))) {
604 if (list_is_last(&task->sibling, &start->children)) 604 if (list_is_last(&task->sibling, &start->children))
605 goto out; 605 goto out;
606 task = list_first_entry(&task->sibling, 606 task = list_first_entry(&task->sibling,
607 struct task_struct, sibling); 607 struct task_struct, sibling);
608 pid = get_pid(task_pid(task)); 608 pid = get_pid(task_pid(task));
609 goto out; 609 goto out;
610 } 610 }
611 } 611 }
612 612
613 /* 613 /*
614 * Slow search case. 614 * Slow search case.
615 * 615 *
616 * We might miss some children here if children 616 * We might miss some children here if children
617 * are exited while we were not holding the lock, 617 * are exited while we were not holding the lock,
618 * but it was never promised to be accurate that 618 * but it was never promised to be accurate that
619 * much. 619 * much.
620 * 620 *
621 * "Just suppose that the parent sleeps, but N children 621 * "Just suppose that the parent sleeps, but N children
622 * exit after we printed their tids. Now the slow paths 622 * exit after we printed their tids. Now the slow paths
623 * skips N extra children, we miss N tasks." (c) 623 * skips N extra children, we miss N tasks." (c)
624 * 624 *
625 * So one need to stop or freeze the leader and all 625 * So one need to stop or freeze the leader and all
626 * its children to get a precise result. 626 * its children to get a precise result.
627 */ 627 */
628 list_for_each_entry(task, &start->children, sibling) { 628 list_for_each_entry(task, &start->children, sibling) {
629 if (pos-- == 0) { 629 if (pos-- == 0) {
630 pid = get_pid(task_pid(task)); 630 pid = get_pid(task_pid(task));
631 break; 631 break;
632 } 632 }
633 } 633 }
634 634
635 out: 635 out:
636 read_unlock(&tasklist_lock); 636 read_unlock(&tasklist_lock);
637 return pid; 637 return pid;
638 } 638 }
639 639
640 static int children_seq_show(struct seq_file *seq, void *v) 640 static int children_seq_show(struct seq_file *seq, void *v)
641 { 641 {
642 struct inode *inode = seq->private; 642 struct inode *inode = seq->private;
643 pid_t pid; 643 pid_t pid;
644 644
645 pid = pid_nr_ns(v, inode->i_sb->s_fs_info); 645 pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
646 return seq_printf(seq, "%d ", pid); 646 return seq_printf(seq, "%d ", pid);
647 } 647 }
648 648
649 static void *children_seq_start(struct seq_file *seq, loff_t *pos) 649 static void *children_seq_start(struct seq_file *seq, loff_t *pos)
650 { 650 {
651 return get_children_pid(seq->private, NULL, *pos); 651 return get_children_pid(seq->private, NULL, *pos);
652 } 652 }
653 653
654 static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) 654 static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
655 { 655 {
656 struct pid *pid; 656 struct pid *pid;
657 657
658 pid = get_children_pid(seq->private, v, *pos + 1); 658 pid = get_children_pid(seq->private, v, *pos + 1);
659 put_pid(v); 659 put_pid(v);
660 660
661 ++*pos; 661 ++*pos;
662 return pid; 662 return pid;
663 } 663 }
664 664
665 static void children_seq_stop(struct seq_file *seq, void *v) 665 static void children_seq_stop(struct seq_file *seq, void *v)
666 { 666 {
667 put_pid(v); 667 put_pid(v);
668 } 668 }
669 669
670 static const struct seq_operations children_seq_ops = { 670 static const struct seq_operations children_seq_ops = {
671 .start = children_seq_start, 671 .start = children_seq_start,
672 .next = children_seq_next, 672 .next = children_seq_next,
673 .stop = children_seq_stop, 673 .stop = children_seq_stop,
674 .show = children_seq_show, 674 .show = children_seq_show,
675 }; 675 };
676 676
677 static int children_seq_open(struct inode *inode, struct file *file) 677 static int children_seq_open(struct inode *inode, struct file *file)
678 { 678 {
679 struct seq_file *m; 679 struct seq_file *m;
680 int ret; 680 int ret;
681 681
682 ret = seq_open(file, &children_seq_ops); 682 ret = seq_open(file, &children_seq_ops);
683 if (ret) 683 if (ret)
684 return ret; 684 return ret;
685 685
686 m = file->private_data; 686 m = file->private_data;
687 m->private = inode; 687 m->private = inode;
688 688
689 return ret; 689 return ret;
690 } 690 }
691 691
692 int children_seq_release(struct inode *inode, struct file *file) 692 int children_seq_release(struct inode *inode, struct file *file)
693 { 693 {
694 seq_release(inode, file); 694 seq_release(inode, file);
695 return 0; 695 return 0;
696 } 696 }
697 697
698 const struct file_operations proc_tid_children_operations = { 698 const struct file_operations proc_tid_children_operations = {
699 .open = children_seq_open, 699 .open = children_seq_open,
700 .read = seq_read, 700 .read = seq_read,
701 .llseek = seq_lseek, 701 .llseek = seq_lseek,
702 .release = children_seq_release, 702 .release = children_seq_release,
703 }; 703 };
704 #endif /* CONFIG_CHECKPOINT_RESTORE */ 704 #endif /* CONFIG_CHECKPOINT_RESTORE */
705 705
include/linux/sched.h
1 #ifndef _LINUX_SCHED_H 1 #ifndef _LINUX_SCHED_H
2 #define _LINUX_SCHED_H 2 #define _LINUX_SCHED_H
3 3
4 #include <uapi/linux/sched.h> 4 #include <uapi/linux/sched.h>
5 5
6 6
7 struct sched_param { 7 struct sched_param {
8 int sched_priority; 8 int sched_priority;
9 }; 9 };
10 10
11 #include <asm/param.h> /* for HZ */ 11 #include <asm/param.h> /* for HZ */
12 12
13 #include <linux/capability.h> 13 #include <linux/capability.h>
14 #include <linux/threads.h> 14 #include <linux/threads.h>
15 #include <linux/kernel.h> 15 #include <linux/kernel.h>
16 #include <linux/types.h> 16 #include <linux/types.h>
17 #include <linux/timex.h> 17 #include <linux/timex.h>
18 #include <linux/jiffies.h> 18 #include <linux/jiffies.h>
19 #include <linux/rbtree.h> 19 #include <linux/rbtree.h>
20 #include <linux/thread_info.h> 20 #include <linux/thread_info.h>
21 #include <linux/cpumask.h> 21 #include <linux/cpumask.h>
22 #include <linux/errno.h> 22 #include <linux/errno.h>
23 #include <linux/nodemask.h> 23 #include <linux/nodemask.h>
24 #include <linux/mm_types.h> 24 #include <linux/mm_types.h>
25 25
26 #include <asm/page.h> 26 #include <asm/page.h>
27 #include <asm/ptrace.h> 27 #include <asm/ptrace.h>
28 #include <asm/cputime.h> 28 #include <asm/cputime.h>
29 29
30 #include <linux/smp.h> 30 #include <linux/smp.h>
31 #include <linux/sem.h> 31 #include <linux/sem.h>
32 #include <linux/signal.h> 32 #include <linux/signal.h>
33 #include <linux/compiler.h> 33 #include <linux/compiler.h>
34 #include <linux/completion.h> 34 #include <linux/completion.h>
35 #include <linux/pid.h> 35 #include <linux/pid.h>
36 #include <linux/percpu.h> 36 #include <linux/percpu.h>
37 #include <linux/topology.h> 37 #include <linux/topology.h>
38 #include <linux/proportions.h> 38 #include <linux/proportions.h>
39 #include <linux/seccomp.h> 39 #include <linux/seccomp.h>
40 #include <linux/rcupdate.h> 40 #include <linux/rcupdate.h>
41 #include <linux/rculist.h> 41 #include <linux/rculist.h>
42 #include <linux/rtmutex.h> 42 #include <linux/rtmutex.h>
43 43
44 #include <linux/time.h> 44 #include <linux/time.h>
45 #include <linux/param.h> 45 #include <linux/param.h>
46 #include <linux/resource.h> 46 #include <linux/resource.h>
47 #include <linux/timer.h> 47 #include <linux/timer.h>
48 #include <linux/hrtimer.h> 48 #include <linux/hrtimer.h>
49 #include <linux/task_io_accounting.h> 49 #include <linux/task_io_accounting.h>
50 #include <linux/latencytop.h> 50 #include <linux/latencytop.h>
51 #include <linux/cred.h> 51 #include <linux/cred.h>
52 #include <linux/llist.h> 52 #include <linux/llist.h>
53 #include <linux/uidgid.h> 53 #include <linux/uidgid.h>
54 54
55 #include <asm/processor.h> 55 #include <asm/processor.h>
56 56
57 struct exec_domain; 57 struct exec_domain;
58 struct futex_pi_state; 58 struct futex_pi_state;
59 struct robust_list_head; 59 struct robust_list_head;
60 struct bio_list; 60 struct bio_list;
61 struct fs_struct; 61 struct fs_struct;
62 struct perf_event_context; 62 struct perf_event_context;
63 struct blk_plug; 63 struct blk_plug;
64 64
65 /* 65 /*
66 * List of flags we want to share for kernel threads, 66 * List of flags we want to share for kernel threads,
67 * if only because they are not used by them anyway. 67 * if only because they are not used by them anyway.
68 */ 68 */
69 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) 69 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
70 70
71 /* 71 /*
72 * These are the constant used to fake the fixed-point load-average 72 * These are the constant used to fake the fixed-point load-average
73 * counting. Some notes: 73 * counting. Some notes:
74 * - 11 bit fractions expand to 22 bits by the multiplies: this gives 74 * - 11 bit fractions expand to 22 bits by the multiplies: this gives
75 * a load-average precision of 10 bits integer + 11 bits fractional 75 * a load-average precision of 10 bits integer + 11 bits fractional
76 * - if you want to count load-averages more often, you need more 76 * - if you want to count load-averages more often, you need more
77 * precision, or rounding will get you. With 2-second counting freq, 77 * precision, or rounding will get you. With 2-second counting freq,
78 * the EXP_n values would be 1981, 2034 and 2043 if still using only 78 * the EXP_n values would be 1981, 2034 and 2043 if still using only
79 * 11 bit fractions. 79 * 11 bit fractions.
80 */ 80 */
81 extern unsigned long avenrun[]; /* Load averages */ 81 extern unsigned long avenrun[]; /* Load averages */
82 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); 82 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
83 83
84 #define FSHIFT 11 /* nr of bits of precision */ 84 #define FSHIFT 11 /* nr of bits of precision */
85 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ 85 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
86 #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ 86 #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
87 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ 87 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
88 #define EXP_5 2014 /* 1/exp(5sec/5min) */ 88 #define EXP_5 2014 /* 1/exp(5sec/5min) */
89 #define EXP_15 2037 /* 1/exp(5sec/15min) */ 89 #define EXP_15 2037 /* 1/exp(5sec/15min) */
90 90
91 #define CALC_LOAD(load,exp,n) \ 91 #define CALC_LOAD(load,exp,n) \
92 load *= exp; \ 92 load *= exp; \
93 load += n*(FIXED_1-exp); \ 93 load += n*(FIXED_1-exp); \
94 load >>= FSHIFT; 94 load >>= FSHIFT;
95 95
96 extern unsigned long total_forks; 96 extern unsigned long total_forks;
97 extern int nr_threads; 97 extern int nr_threads;
98 DECLARE_PER_CPU(unsigned long, process_counts); 98 DECLARE_PER_CPU(unsigned long, process_counts);
99 extern int nr_processes(void); 99 extern int nr_processes(void);
100 extern unsigned long nr_running(void); 100 extern unsigned long nr_running(void);
101 extern unsigned long nr_uninterruptible(void); 101 extern unsigned long nr_uninterruptible(void);
102 extern unsigned long nr_iowait(void); 102 extern unsigned long nr_iowait(void);
103 extern unsigned long nr_iowait_cpu(int cpu); 103 extern unsigned long nr_iowait_cpu(int cpu);
104 extern unsigned long this_cpu_load(void); 104 extern unsigned long this_cpu_load(void);
105 105
106 106
107 extern void calc_global_load(unsigned long ticks); 107 extern void calc_global_load(unsigned long ticks);
108 extern void update_cpu_load_nohz(void); 108 extern void update_cpu_load_nohz(void);
109 109
110 extern unsigned long get_parent_ip(unsigned long addr); 110 extern unsigned long get_parent_ip(unsigned long addr);
111 111
112 struct seq_file; 112 struct seq_file;
113 struct cfs_rq; 113 struct cfs_rq;
114 struct task_group; 114 struct task_group;
115 #ifdef CONFIG_SCHED_DEBUG 115 #ifdef CONFIG_SCHED_DEBUG
116 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); 116 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
117 extern void proc_sched_set_task(struct task_struct *p); 117 extern void proc_sched_set_task(struct task_struct *p);
118 extern void 118 extern void
119 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); 119 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
120 #else 120 #else
121 static inline void 121 static inline void
122 proc_sched_show_task(struct task_struct *p, struct seq_file *m) 122 proc_sched_show_task(struct task_struct *p, struct seq_file *m)
123 { 123 {
124 } 124 }
125 static inline void proc_sched_set_task(struct task_struct *p) 125 static inline void proc_sched_set_task(struct task_struct *p)
126 { 126 {
127 } 127 }
128 static inline void 128 static inline void
129 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 129 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
130 { 130 {
131 } 131 }
132 #endif 132 #endif
133 133
134 /* 134 /*
135 * Task state bitmask. NOTE! These bits are also 135 * Task state bitmask. NOTE! These bits are also
136 * encoded in fs/proc/array.c: get_task_state(). 136 * encoded in fs/proc/array.c: get_task_state().
137 * 137 *
138 * We have two separate sets of flags: task->state 138 * We have two separate sets of flags: task->state
139 * is about runnability, while task->exit_state are 139 * is about runnability, while task->exit_state are
140 * about the task exiting. Confusing, but this way 140 * about the task exiting. Confusing, but this way
141 * modifying one set can't modify the other one by 141 * modifying one set can't modify the other one by
142 * mistake. 142 * mistake.
143 */ 143 */
144 #define TASK_RUNNING 0 144 #define TASK_RUNNING 0
145 #define TASK_INTERRUPTIBLE 1 145 #define TASK_INTERRUPTIBLE 1
146 #define TASK_UNINTERRUPTIBLE 2 146 #define TASK_UNINTERRUPTIBLE 2
147 #define __TASK_STOPPED 4 147 #define __TASK_STOPPED 4
148 #define __TASK_TRACED 8 148 #define __TASK_TRACED 8
149 /* in tsk->exit_state */ 149 /* in tsk->exit_state */
150 #define EXIT_ZOMBIE 16 150 #define EXIT_ZOMBIE 16
151 #define EXIT_DEAD 32 151 #define EXIT_DEAD 32
152 /* in tsk->state again */ 152 /* in tsk->state again */
153 #define TASK_DEAD 64 153 #define TASK_DEAD 64
154 #define TASK_WAKEKILL 128 154 #define TASK_WAKEKILL 128
155 #define TASK_WAKING 256 155 #define TASK_WAKING 256
156 #define TASK_STATE_MAX 512 156 #define TASK_STATE_MAX 512
157 157
158 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" 158 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
159 159
160 extern char ___assert_task_state[1 - 2*!!( 160 extern char ___assert_task_state[1 - 2*!!(
161 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; 161 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
162 162
163 /* Convenience macros for the sake of set_task_state */ 163 /* Convenience macros for the sake of set_task_state */
164 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) 164 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
165 #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) 165 #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
166 #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) 166 #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
167 167
168 /* Convenience macros for the sake of wake_up */ 168 /* Convenience macros for the sake of wake_up */
169 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) 169 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
170 #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) 170 #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
171 171
172 /* get_task_state() */ 172 /* get_task_state() */
173 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ 173 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
174 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ 174 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
175 __TASK_TRACED) 175 __TASK_TRACED)
176 176
177 #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) 177 #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
178 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) 178 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
179 #define task_is_dead(task) ((task)->exit_state != 0) 179 #define task_is_dead(task) ((task)->exit_state != 0)
180 #define task_is_stopped_or_traced(task) \ 180 #define task_is_stopped_or_traced(task) \
181 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) 181 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
182 #define task_contributes_to_load(task) \ 182 #define task_contributes_to_load(task) \
183 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ 183 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
184 (task->flags & PF_FROZEN) == 0) 184 (task->flags & PF_FROZEN) == 0)
185 185
186 #define __set_task_state(tsk, state_value) \ 186 #define __set_task_state(tsk, state_value) \
187 do { (tsk)->state = (state_value); } while (0) 187 do { (tsk)->state = (state_value); } while (0)
188 #define set_task_state(tsk, state_value) \ 188 #define set_task_state(tsk, state_value) \
189 set_mb((tsk)->state, (state_value)) 189 set_mb((tsk)->state, (state_value))
190 190
191 /* 191 /*
192 * set_current_state() includes a barrier so that the write of current->state 192 * set_current_state() includes a barrier so that the write of current->state
193 * is correctly serialised wrt the caller's subsequent test of whether to 193 * is correctly serialised wrt the caller's subsequent test of whether to
194 * actually sleep: 194 * actually sleep:
195 * 195 *
196 * set_current_state(TASK_UNINTERRUPTIBLE); 196 * set_current_state(TASK_UNINTERRUPTIBLE);
197 * if (do_i_need_to_sleep()) 197 * if (do_i_need_to_sleep())
198 * schedule(); 198 * schedule();
199 * 199 *
200 * If the caller does not need such serialisation then use __set_current_state() 200 * If the caller does not need such serialisation then use __set_current_state()
201 */ 201 */
202 #define __set_current_state(state_value) \ 202 #define __set_current_state(state_value) \
203 do { current->state = (state_value); } while (0) 203 do { current->state = (state_value); } while (0)
204 #define set_current_state(state_value) \ 204 #define set_current_state(state_value) \
205 set_mb(current->state, (state_value)) 205 set_mb(current->state, (state_value))
206 206
207 /* Task command name length */ 207 /* Task command name length */
208 #define TASK_COMM_LEN 16 208 #define TASK_COMM_LEN 16
209 209
210 #include <linux/spinlock.h> 210 #include <linux/spinlock.h>
211 211
212 /* 212 /*
213 * This serializes "schedule()" and also protects 213 * This serializes "schedule()" and also protects
214 * the run-queue from deletions/modifications (but 214 * the run-queue from deletions/modifications (but
215 * _adding_ to the beginning of the run-queue has 215 * _adding_ to the beginning of the run-queue has
216 * a separate lock). 216 * a separate lock).
217 */ 217 */
218 extern rwlock_t tasklist_lock; 218 extern rwlock_t tasklist_lock;
219 extern spinlock_t mmlist_lock; 219 extern spinlock_t mmlist_lock;
220 220
221 struct task_struct; 221 struct task_struct;
222 222
223 #ifdef CONFIG_PROVE_RCU 223 #ifdef CONFIG_PROVE_RCU
224 extern int lockdep_tasklist_lock_is_held(void); 224 extern int lockdep_tasklist_lock_is_held(void);
225 #endif /* #ifdef CONFIG_PROVE_RCU */ 225 #endif /* #ifdef CONFIG_PROVE_RCU */
226 226
227 extern void sched_init(void); 227 extern void sched_init(void);
228 extern void sched_init_smp(void); 228 extern void sched_init_smp(void);
229 extern asmlinkage void schedule_tail(struct task_struct *prev); 229 extern asmlinkage void schedule_tail(struct task_struct *prev);
230 extern void init_idle(struct task_struct *idle, int cpu); 230 extern void init_idle(struct task_struct *idle, int cpu);
231 extern void init_idle_bootup_task(struct task_struct *idle); 231 extern void init_idle_bootup_task(struct task_struct *idle);
232 232
233 extern int runqueue_is_locked(int cpu); 233 extern int runqueue_is_locked(int cpu);
234 234
235 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 235 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
236 extern void nohz_balance_enter_idle(int cpu); 236 extern void nohz_balance_enter_idle(int cpu);
237 extern void set_cpu_sd_state_idle(void); 237 extern void set_cpu_sd_state_idle(void);
238 extern int get_nohz_timer_target(void); 238 extern int get_nohz_timer_target(void);
239 #else 239 #else
240 static inline void nohz_balance_enter_idle(int cpu) { } 240 static inline void nohz_balance_enter_idle(int cpu) { }
241 static inline void set_cpu_sd_state_idle(void) { } 241 static inline void set_cpu_sd_state_idle(void) { }
242 #endif 242 #endif
243 243
244 /* 244 /*
245 * Only dump TASK_* tasks. (0 for all tasks) 245 * Only dump TASK_* tasks. (0 for all tasks)
246 */ 246 */
247 extern void show_state_filter(unsigned long state_filter); 247 extern void show_state_filter(unsigned long state_filter);
248 248
249 static inline void show_state(void) 249 static inline void show_state(void)
250 { 250 {
251 show_state_filter(0); 251 show_state_filter(0);
252 } 252 }
253 253
254 extern void show_regs(struct pt_regs *); 254 extern void show_regs(struct pt_regs *);
255 255
256 /* 256 /*
257 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current 257 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
258 * task), SP is the stack pointer of the first frame that should be shown in the back 258 * task), SP is the stack pointer of the first frame that should be shown in the back
259 * trace (or NULL if the entire call-chain of the task should be shown). 259 * trace (or NULL if the entire call-chain of the task should be shown).
260 */ 260 */
261 extern void show_stack(struct task_struct *task, unsigned long *sp); 261 extern void show_stack(struct task_struct *task, unsigned long *sp);
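Following the convention above, a call such as the one below (a usage sketch, not taken from this file) dumps the full call-chain of the current task:

	show_stack(NULL, NULL);		/* current task, entire call-chain */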
262 262
263 void io_schedule(void); 263 void io_schedule(void);
264 long io_schedule_timeout(long timeout); 264 long io_schedule_timeout(long timeout);
265 265
266 extern void cpu_init (void); 266 extern void cpu_init (void);
267 extern void trap_init(void); 267 extern void trap_init(void);
268 extern void update_process_times(int user); 268 extern void update_process_times(int user);
269 extern void scheduler_tick(void); 269 extern void scheduler_tick(void);
270 270
271 extern void sched_show_task(struct task_struct *p); 271 extern void sched_show_task(struct task_struct *p);
272 272
273 #ifdef CONFIG_LOCKUP_DETECTOR 273 #ifdef CONFIG_LOCKUP_DETECTOR
274 extern void touch_softlockup_watchdog(void); 274 extern void touch_softlockup_watchdog(void);
275 extern void touch_softlockup_watchdog_sync(void); 275 extern void touch_softlockup_watchdog_sync(void);
276 extern void touch_all_softlockup_watchdogs(void); 276 extern void touch_all_softlockup_watchdogs(void);
277 extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, 277 extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
278 void __user *buffer, 278 void __user *buffer,
279 size_t *lenp, loff_t *ppos); 279 size_t *lenp, loff_t *ppos);
280 extern unsigned int softlockup_panic; 280 extern unsigned int softlockup_panic;
281 void lockup_detector_init(void); 281 void lockup_detector_init(void);
282 #else 282 #else
283 static inline void touch_softlockup_watchdog(void) 283 static inline void touch_softlockup_watchdog(void)
284 { 284 {
285 } 285 }
286 static inline void touch_softlockup_watchdog_sync(void) 286 static inline void touch_softlockup_watchdog_sync(void)
287 { 287 {
288 } 288 }
289 static inline void touch_all_softlockup_watchdogs(void) 289 static inline void touch_all_softlockup_watchdogs(void)
290 { 290 {
291 } 291 }
292 static inline void lockup_detector_init(void) 292 static inline void lockup_detector_init(void)
293 { 293 {
294 } 294 }
295 #endif 295 #endif
296 296
297 #ifdef CONFIG_DETECT_HUNG_TASK 297 #ifdef CONFIG_DETECT_HUNG_TASK
298 extern unsigned int sysctl_hung_task_panic; 298 extern unsigned int sysctl_hung_task_panic;
299 extern unsigned long sysctl_hung_task_check_count; 299 extern unsigned long sysctl_hung_task_check_count;
300 extern unsigned long sysctl_hung_task_timeout_secs; 300 extern unsigned long sysctl_hung_task_timeout_secs;
301 extern unsigned long sysctl_hung_task_warnings; 301 extern unsigned long sysctl_hung_task_warnings;
302 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 302 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
303 void __user *buffer, 303 void __user *buffer,
304 size_t *lenp, loff_t *ppos); 304 size_t *lenp, loff_t *ppos);
305 #else 305 #else
306 /* Avoid need for ifdefs elsewhere in the code */ 306 /* Avoid need for ifdefs elsewhere in the code */
307 enum { sysctl_hung_task_timeout_secs = 0 }; 307 enum { sysctl_hung_task_timeout_secs = 0 };
308 #endif 308 #endif
309 309
310 /* Attach to any functions which should be ignored in wchan output. */ 310 /* Attach to any functions which should be ignored in wchan output. */
311 #define __sched __attribute__((__section__(".sched.text"))) 311 #define __sched __attribute__((__section__(".sched.text")))
312 312
313 /* Linker adds these: start and end of __sched functions */ 313 /* Linker adds these: start and end of __sched functions */
314 extern char __sched_text_start[], __sched_text_end[]; 314 extern char __sched_text_start[], __sched_text_end[];
315 315
316 /* Is this address in the __sched functions? */ 316 /* Is this address in the __sched functions? */
317 extern int in_sched_functions(unsigned long addr); 317 extern int in_sched_functions(unsigned long addr);
318 318
319 #define MAX_SCHEDULE_TIMEOUT LONG_MAX 319 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
320 extern signed long schedule_timeout(signed long timeout); 320 extern signed long schedule_timeout(signed long timeout);
321 extern signed long schedule_timeout_interruptible(signed long timeout); 321 extern signed long schedule_timeout_interruptible(signed long timeout);
322 extern signed long schedule_timeout_killable(signed long timeout); 322 extern signed long schedule_timeout_killable(signed long timeout);
323 extern signed long schedule_timeout_uninterruptible(signed long timeout); 323 extern signed long schedule_timeout_uninterruptible(signed long timeout);
324 asmlinkage void schedule(void); 324 asmlinkage void schedule(void);
325 extern void schedule_preempt_disabled(void); 325 extern void schedule_preempt_disabled(void);
326 extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); 326 extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
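A minimal sketch of the usual schedule_timeout() pattern (illustrative only; the caller is expected to set the task state first, otherwise the routine does not actually sleep):

	signed long remaining;

	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(msecs_to_jiffies(100));	/* sleep for up to ~100ms */
	if (remaining)
		pr_debug("woken %ld jiffies early\n", remaining);	/* e.g. by a signal */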
327 327
328 struct nsproxy; 328 struct nsproxy;
329 struct user_namespace; 329 struct user_namespace;
330 330
331 /* 331 /*
332 * Default maximum number of active map areas, this limits the number of vmas 332 * Default maximum number of active map areas, this limits the number of vmas
333 * per mm struct. Users can overwrite this number by sysctl but there is a 333 * per mm struct. Users can overwrite this number by sysctl but there is a
334 * problem. 334 * problem.
335 * 335 *
336 * When a program's coredump is generated in ELF format, a section is created 336 * When a program's coredump is generated in ELF format, a section is created
337 * per vma. In ELF, the number of sections is represented by an unsigned short. 337 * per vma. In ELF, the number of sections is represented by an unsigned short.
338 * This means the number of sections must be smaller than 65535 at coredump time. 338 * This means the number of sections must be smaller than 65535 at coredump time.
339 * Because the kernel adds some informative sections to the image of a program when 339 * Because the kernel adds some informative sections to the image of a program when
340 * generating a coredump, we need some margin. The number of extra sections is 340 * generating a coredump, we need some margin. The number of extra sections is
341 * currently 1-3 and depends on the arch. We use "5" as a safe margin here. 341 * currently 1-3 and depends on the arch. We use "5" as a safe margin here.
342 */ 342 */
343 #define MAPCOUNT_ELF_CORE_MARGIN (5) 343 #define MAPCOUNT_ELF_CORE_MARGIN (5)
344 #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) 344 #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
345 345
346 extern int sysctl_max_map_count; 346 extern int sysctl_max_map_count;
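A quick worked value for the defines above (assuming the usual USHRT_MAX of 65535):

	DEFAULT_MAX_MAP_COUNT = USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN
	                      = 65535 - 5
	                      = 65530		/* overridable through sysctl_max_map_count */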
347 347
348 #include <linux/aio.h> 348 #include <linux/aio.h>
349 349
350 #ifdef CONFIG_MMU 350 #ifdef CONFIG_MMU
351 extern void arch_pick_mmap_layout(struct mm_struct *mm); 351 extern void arch_pick_mmap_layout(struct mm_struct *mm);
352 extern unsigned long 352 extern unsigned long
353 arch_get_unmapped_area(struct file *, unsigned long, unsigned long, 353 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
354 unsigned long, unsigned long); 354 unsigned long, unsigned long);
355 extern unsigned long 355 extern unsigned long
356 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 356 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
357 unsigned long len, unsigned long pgoff, 357 unsigned long len, unsigned long pgoff,
358 unsigned long flags); 358 unsigned long flags);
359 extern void arch_unmap_area(struct mm_struct *, unsigned long); 359 extern void arch_unmap_area(struct mm_struct *, unsigned long);
360 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 360 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
361 #else 361 #else
362 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} 362 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
363 #endif 363 #endif
364 364
365 365
366 extern void set_dumpable(struct mm_struct *mm, int value); 366 extern void set_dumpable(struct mm_struct *mm, int value);
367 extern int get_dumpable(struct mm_struct *mm); 367 extern int get_dumpable(struct mm_struct *mm);
368 368
369 /* get/set_dumpable() values */ 369 /* get/set_dumpable() values */
370 #define SUID_DUMPABLE_DISABLED 0 370 #define SUID_DUMPABLE_DISABLED 0
371 #define SUID_DUMPABLE_ENABLED 1 371 #define SUID_DUMPABLE_ENABLED 1
372 #define SUID_DUMPABLE_SAFE 2 372 #define SUID_DUMPABLE_SAFE 2
373 373
374 /* mm flags */ 374 /* mm flags */
375 /* dumpable bits */ 375 /* dumpable bits */
376 #define MMF_DUMPABLE 0 /* core dump is permitted */ 376 #define MMF_DUMPABLE 0 /* core dump is permitted */
377 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ 377 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
378 378
379 #define MMF_DUMPABLE_BITS 2 379 #define MMF_DUMPABLE_BITS 2
380 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) 380 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
381 381
382 /* coredump filter bits */ 382 /* coredump filter bits */
383 #define MMF_DUMP_ANON_PRIVATE 2 383 #define MMF_DUMP_ANON_PRIVATE 2
384 #define MMF_DUMP_ANON_SHARED 3 384 #define MMF_DUMP_ANON_SHARED 3
385 #define MMF_DUMP_MAPPED_PRIVATE 4 385 #define MMF_DUMP_MAPPED_PRIVATE 4
386 #define MMF_DUMP_MAPPED_SHARED 5 386 #define MMF_DUMP_MAPPED_SHARED 5
387 #define MMF_DUMP_ELF_HEADERS 6 387 #define MMF_DUMP_ELF_HEADERS 6
388 #define MMF_DUMP_HUGETLB_PRIVATE 7 388 #define MMF_DUMP_HUGETLB_PRIVATE 7
389 #define MMF_DUMP_HUGETLB_SHARED 8 389 #define MMF_DUMP_HUGETLB_SHARED 8
390 390
391 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS 391 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
392 #define MMF_DUMP_FILTER_BITS 7 392 #define MMF_DUMP_FILTER_BITS 7
393 #define MMF_DUMP_FILTER_MASK \ 393 #define MMF_DUMP_FILTER_MASK \
394 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) 394 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
395 #define MMF_DUMP_FILTER_DEFAULT \ 395 #define MMF_DUMP_FILTER_DEFAULT \
396 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ 396 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
397 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) 397 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
398 398
399 #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS 399 #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
400 # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) 400 # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
401 #else 401 #else
402 # define MMF_DUMP_MASK_DEFAULT_ELF 0 402 # define MMF_DUMP_MASK_DEFAULT_ELF 0
403 #endif 403 #endif
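Putting the filter bits together (a worked example, assuming CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y; the value user space reads from /proc/<pid>/coredump_filter is this mm-flags value shifted down by MMF_DUMP_FILTER_SHIFT):

	MMF_DUMP_FILTER_DEFAULT = (1 << 2) | (1 << 3) | (1 << 7) | (1 << 6) = 0xcc
	/* >> MMF_DUMP_FILTER_SHIFT (2)  =>  0x33 as seen in /proc/<pid>/coredump_filter */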
404 /* leave room for more dump flags */ 404 /* leave room for more dump flags */
405 #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ 405 #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
406 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ 406 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
407 #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ 407 #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
408 408
409 #define MMF_HAS_UPROBES 19 /* has uprobes */ 409 #define MMF_HAS_UPROBES 19 /* has uprobes */
410 #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ 410 #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
411 411
412 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) 412 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
413 413
414 struct sighand_struct { 414 struct sighand_struct {
415 atomic_t count; 415 atomic_t count;
416 struct k_sigaction action[_NSIG]; 416 struct k_sigaction action[_NSIG];
417 spinlock_t siglock; 417 spinlock_t siglock;
418 wait_queue_head_t signalfd_wqh; 418 wait_queue_head_t signalfd_wqh;
419 }; 419 };
420 420
421 struct pacct_struct { 421 struct pacct_struct {
422 int ac_flag; 422 int ac_flag;
423 long ac_exitcode; 423 long ac_exitcode;
424 unsigned long ac_mem; 424 unsigned long ac_mem;
425 cputime_t ac_utime, ac_stime; 425 cputime_t ac_utime, ac_stime;
426 unsigned long ac_minflt, ac_majflt; 426 unsigned long ac_minflt, ac_majflt;
427 }; 427 };
428 428
429 struct cpu_itimer { 429 struct cpu_itimer {
430 cputime_t expires; 430 cputime_t expires;
431 cputime_t incr; 431 cputime_t incr;
432 u32 error; 432 u32 error;
433 u32 incr_error; 433 u32 incr_error;
434 }; 434 };
435 435
436 /** 436 /**
437 * struct task_cputime - collected CPU time counts 437 * struct task_cputime - collected CPU time counts
438 * @utime: time spent in user mode, in &cputime_t units 438 * @utime: time spent in user mode, in &cputime_t units
439 * @stime: time spent in kernel mode, in &cputime_t units 439 * @stime: time spent in kernel mode, in &cputime_t units
440 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds 440 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
441 * 441 *
442 * This structure groups together three kinds of CPU time that are 442 * This structure groups together three kinds of CPU time that are
443 * tracked for threads and thread groups. Most things considering 443 * tracked for threads and thread groups. Most things considering
444 * CPU time want to group these counts together and treat all three 444 * CPU time want to group these counts together and treat all three
445 * of them in parallel. 445 * of them in parallel.
446 */ 446 */
447 struct task_cputime { 447 struct task_cputime {
448 cputime_t utime; 448 cputime_t utime;
449 cputime_t stime; 449 cputime_t stime;
450 unsigned long long sum_exec_runtime; 450 unsigned long long sum_exec_runtime;
451 }; 451 };
452 /* Alternate field names when used to cache expirations. */ 452 /* Alternate field names when used to cache expirations. */
453 #define prof_exp stime 453 #define prof_exp stime
454 #define virt_exp utime 454 #define virt_exp utime
455 #define sched_exp sum_exec_runtime 455 #define sched_exp sum_exec_runtime
456 456
457 #define INIT_CPUTIME \ 457 #define INIT_CPUTIME \
458 (struct task_cputime) { \ 458 (struct task_cputime) { \
459 .utime = 0, \ 459 .utime = 0, \
460 .stime = 0, \ 460 .stime = 0, \
461 .sum_exec_runtime = 0, \ 461 .sum_exec_runtime = 0, \
462 } 462 }
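For context, a hedged sketch of how these totals are typically consumed ('tsk' is assumed to point at the thread-group leader of interest; thread_group_cputime_adjusted() is the API renamed by this commit and returns the scaled, monotonic view of the same data):

	struct task_cputime times;
	cputime_t utime, stime;

	thread_group_cputime(tsk, &times);			/* raw group totals */
	thread_group_cputime_adjusted(tsk, &utime, &stime);	/* scaled and bounded */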
463 463
464 /* 464 /*
465 * Disable preemption until the scheduler is running. 465 * Disable preemption until the scheduler is running.
466 * Reset by start_kernel()->sched_init()->init_idle(). 466 * Reset by start_kernel()->sched_init()->init_idle().
467 * 467 *
468 * We include PREEMPT_ACTIVE to avoid cond_resched() from working 468 * We include PREEMPT_ACTIVE to avoid cond_resched() from working
469 * before the scheduler is active -- see should_resched(). 469 * before the scheduler is active -- see should_resched().
470 */ 470 */
471 #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) 471 #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE)
472 472
473 /** 473 /**
474 * struct thread_group_cputimer - thread group interval timer counts 474 * struct thread_group_cputimer - thread group interval timer counts
475 * @cputime: thread group interval timers. 475 * @cputime: thread group interval timers.
476 * @running: non-zero when there are timers running and 476 * @running: non-zero when there are timers running and
477 * @cputime receives updates. 477 * @cputime receives updates.
478 * @lock: lock for fields in this struct. 478 * @lock: lock for fields in this struct.
479 * 479 *
480 * This structure contains the version of task_cputime, above, that is 480 * This structure contains the version of task_cputime, above, that is
481 * used for thread group CPU timer calculations. 481 * used for thread group CPU timer calculations.
482 */ 482 */
483 struct thread_group_cputimer { 483 struct thread_group_cputimer {
484 struct task_cputime cputime; 484 struct task_cputime cputime;
485 int running; 485 int running;
486 raw_spinlock_t lock; 486 raw_spinlock_t lock;
487 }; 487 };
488 488
489 #include <linux/rwsem.h> 489 #include <linux/rwsem.h>
490 struct autogroup; 490 struct autogroup;
491 491
492 /* 492 /*
493 * NOTE! "signal_struct" does not have its own 493 * NOTE! "signal_struct" does not have its own
494 * locking, because a shared signal_struct always 494 * locking, because a shared signal_struct always
495 * implies a shared sighand_struct, so locking 495 * implies a shared sighand_struct, so locking
496 * sighand_struct is always a proper superset of 496 * sighand_struct is always a proper superset of
497 * the locking of signal_struct. 497 * the locking of signal_struct.
498 */ 498 */
499 struct signal_struct { 499 struct signal_struct {
500 atomic_t sigcnt; 500 atomic_t sigcnt;
501 atomic_t live; 501 atomic_t live;
502 int nr_threads; 502 int nr_threads;
503 503
504 wait_queue_head_t wait_chldexit; /* for wait4() */ 504 wait_queue_head_t wait_chldexit; /* for wait4() */
505 505
506 /* current thread group signal load-balancing target: */ 506 /* current thread group signal load-balancing target: */
507 struct task_struct *curr_target; 507 struct task_struct *curr_target;
508 508
509 /* shared signal handling: */ 509 /* shared signal handling: */
510 struct sigpending shared_pending; 510 struct sigpending shared_pending;
511 511
512 /* thread group exit support */ 512 /* thread group exit support */
513 int group_exit_code; 513 int group_exit_code;
514 /* overloaded: 514 /* overloaded:
515 * - notify group_exit_task when ->count is equal to notify_count 515 * - notify group_exit_task when ->count is equal to notify_count
516 * - everyone except group_exit_task is stopped during signal delivery 516 * - everyone except group_exit_task is stopped during signal delivery
517 * of fatal signals, group_exit_task processes the signal. 517 * of fatal signals, group_exit_task processes the signal.
518 */ 518 */
519 int notify_count; 519 int notify_count;
520 struct task_struct *group_exit_task; 520 struct task_struct *group_exit_task;
521 521
522 /* thread group stop support, overloads group_exit_code too */ 522 /* thread group stop support, overloads group_exit_code too */
523 int group_stop_count; 523 int group_stop_count;
524 unsigned int flags; /* see SIGNAL_* flags below */ 524 unsigned int flags; /* see SIGNAL_* flags below */
525 525
526 /* 526 /*
527 * PR_SET_CHILD_SUBREAPER marks a process, like a service 527 * PR_SET_CHILD_SUBREAPER marks a process, like a service
528 * manager, to re-parent orphan (double-forking) child processes 528 * manager, to re-parent orphan (double-forking) child processes
529 * to this process instead of 'init'. The service manager is 529 * to this process instead of 'init'. The service manager is
530 * able to receive SIGCHLD signals and is able to investigate 530 * able to receive SIGCHLD signals and is able to investigate
531 * the process until it calls wait(). All children of this 531 * the process until it calls wait(). All children of this
532 * process will inherit a flag if they should look for a 532 * process will inherit a flag if they should look for a
533 * child_subreaper process at exit. 533 * child_subreaper process at exit.
534 */ 534 */
535 unsigned int is_child_subreaper:1; 535 unsigned int is_child_subreaper:1;
536 unsigned int has_child_subreaper:1; 536 unsigned int has_child_subreaper:1;
537 537
538 /* POSIX.1b Interval Timers */ 538 /* POSIX.1b Interval Timers */
539 struct list_head posix_timers; 539 struct list_head posix_timers;
540 540
541 /* ITIMER_REAL timer for the process */ 541 /* ITIMER_REAL timer for the process */
542 struct hrtimer real_timer; 542 struct hrtimer real_timer;
543 struct pid *leader_pid; 543 struct pid *leader_pid;
544 ktime_t it_real_incr; 544 ktime_t it_real_incr;
545 545
546 /* 546 /*
547 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use 547 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
548 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these 548 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
549 * values are defined to 0 and 1 respectively 549 * values are defined to 0 and 1 respectively
550 */ 550 */
551 struct cpu_itimer it[2]; 551 struct cpu_itimer it[2];
552 552
553 /* 553 /*
554 * Thread group totals for process CPU timers. 554 * Thread group totals for process CPU timers.
555 * See thread_group_cputimer(), et al, for details. 555 * See thread_group_cputimer(), et al, for details.
556 */ 556 */
557 struct thread_group_cputimer cputimer; 557 struct thread_group_cputimer cputimer;
558 558
559 /* Earliest-expiration cache. */ 559 /* Earliest-expiration cache. */
560 struct task_cputime cputime_expires; 560 struct task_cputime cputime_expires;
561 561
562 struct list_head cpu_timers[3]; 562 struct list_head cpu_timers[3];
563 563
564 struct pid *tty_old_pgrp; 564 struct pid *tty_old_pgrp;
565 565
566 /* boolean value for session group leader */ 566 /* boolean value for session group leader */
567 int leader; 567 int leader;
568 568
569 struct tty_struct *tty; /* NULL if no tty */ 569 struct tty_struct *tty; /* NULL if no tty */
570 570
571 #ifdef CONFIG_SCHED_AUTOGROUP 571 #ifdef CONFIG_SCHED_AUTOGROUP
572 struct autogroup *autogroup; 572 struct autogroup *autogroup;
573 #endif 573 #endif
574 /* 574 /*
575 * Cumulative resource counters for dead threads in the group, 575 * Cumulative resource counters for dead threads in the group,
576 * and for reaped dead child processes forked by this group. 576 * and for reaped dead child processes forked by this group.
577 * Live threads maintain their own counters and add to these 577 * Live threads maintain their own counters and add to these
578 * in __exit_signal, except for the group leader. 578 * in __exit_signal, except for the group leader.
579 */ 579 */
580 cputime_t utime, stime, cutime, cstime; 580 cputime_t utime, stime, cutime, cstime;
581 cputime_t gtime; 581 cputime_t gtime;
582 cputime_t cgtime; 582 cputime_t cgtime;
583 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 583 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
584 cputime_t prev_utime, prev_stime; 584 cputime_t prev_utime, prev_stime;
585 #endif 585 #endif
586 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 586 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
587 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; 587 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
588 unsigned long inblock, oublock, cinblock, coublock; 588 unsigned long inblock, oublock, cinblock, coublock;
589 unsigned long maxrss, cmaxrss; 589 unsigned long maxrss, cmaxrss;
590 struct task_io_accounting ioac; 590 struct task_io_accounting ioac;
591 591
592 /* 592 /*
593 * Cumulative ns of scheduled CPU time for dead threads in the 593 * Cumulative ns of scheduled CPU time for dead threads in the
594 * group, not including a zombie group leader. (This only differs 594 * group, not including a zombie group leader. (This only differs
595 * from jiffies_to_ns(utime + stime) if sched_clock uses something 595 * from jiffies_to_ns(utime + stime) if sched_clock uses something
596 * other than jiffies.) 596 * other than jiffies.)
597 */ 597 */
598 unsigned long long sum_sched_runtime; 598 unsigned long long sum_sched_runtime;
599 599
600 /* 600 /*
601 * We don't bother to synchronize most readers of this at all, 601 * We don't bother to synchronize most readers of this at all,
602 * because there is no reader checking a limit that actually needs 602 * because there is no reader checking a limit that actually needs
603 * to get both rlim_cur and rlim_max atomically, and either one 603 * to get both rlim_cur and rlim_max atomically, and either one
604 * alone is a single word that can safely be read normally. 604 * alone is a single word that can safely be read normally.
605 * getrlimit/setrlimit use task_lock(current->group_leader) to 605 * getrlimit/setrlimit use task_lock(current->group_leader) to
606 * protect this instead of the siglock, because they really 606 * protect this instead of the siglock, because they really
607 * have no need to disable irqs. 607 * have no need to disable irqs.
608 */ 608 */
609 struct rlimit rlim[RLIM_NLIMITS]; 609 struct rlimit rlim[RLIM_NLIMITS];
610 610
611 #ifdef CONFIG_BSD_PROCESS_ACCT 611 #ifdef CONFIG_BSD_PROCESS_ACCT
612 struct pacct_struct pacct; /* per-process accounting information */ 612 struct pacct_struct pacct; /* per-process accounting information */
613 #endif 613 #endif
614 #ifdef CONFIG_TASKSTATS 614 #ifdef CONFIG_TASKSTATS
615 struct taskstats *stats; 615 struct taskstats *stats;
616 #endif 616 #endif
617 #ifdef CONFIG_AUDIT 617 #ifdef CONFIG_AUDIT
618 unsigned audit_tty; 618 unsigned audit_tty;
619 struct tty_audit_buf *tty_audit_buf; 619 struct tty_audit_buf *tty_audit_buf;
620 #endif 620 #endif
621 #ifdef CONFIG_CGROUPS 621 #ifdef CONFIG_CGROUPS
622 /* 622 /*
623 * group_rwsem prevents new tasks from entering the threadgroup and 623 * group_rwsem prevents new tasks from entering the threadgroup and
624 * member tasks from exiting, more specifically, the setting of 624 * member tasks from exiting, more specifically, the setting of
625 * PF_EXITING. fork and exit paths are protected with this rwsem 625 * PF_EXITING. fork and exit paths are protected with this rwsem
626 * using threadgroup_change_begin/end(). Users which require 626 * using threadgroup_change_begin/end(). Users which require
627 * threadgroup to remain stable should use threadgroup_[un]lock() 627 * threadgroup to remain stable should use threadgroup_[un]lock()
628 * which also takes care of exec path. Currently, cgroup is the 628 * which also takes care of exec path. Currently, cgroup is the
629 * only user. 629 * only user.
630 */ 630 */
631 struct rw_semaphore group_rwsem; 631 struct rw_semaphore group_rwsem;
632 #endif 632 #endif
633 633
634 int oom_score_adj; /* OOM kill score adjustment */ 634 int oom_score_adj; /* OOM kill score adjustment */
635 int oom_score_adj_min; /* OOM kill score adjustment minimum value. 635 int oom_score_adj_min; /* OOM kill score adjustment minimum value.
636 * Only settable by CAP_SYS_RESOURCE. */ 636 * Only settable by CAP_SYS_RESOURCE. */
637 637
638 struct mutex cred_guard_mutex; /* guard against foreign influences on 638 struct mutex cred_guard_mutex; /* guard against foreign influences on
639 * credential calculations 639 * credential calculations
640 * (notably ptrace) */ 640 * (notably ptrace) */
641 }; 641 };
642 642
643 /* 643 /*
644 * Bits in flags field of signal_struct. 644 * Bits in flags field of signal_struct.
645 */ 645 */
646 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ 646 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
647 #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ 647 #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
648 #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ 648 #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
649 /* 649 /*
650 * Pending notifications to parent. 650 * Pending notifications to parent.
651 */ 651 */
652 #define SIGNAL_CLD_STOPPED 0x00000010 652 #define SIGNAL_CLD_STOPPED 0x00000010
653 #define SIGNAL_CLD_CONTINUED 0x00000020 653 #define SIGNAL_CLD_CONTINUED 0x00000020
654 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) 654 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
655 655
656 #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ 656 #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
657 657
658 /* If true, all threads except ->group_exit_task have pending SIGKILL */ 658 /* If true, all threads except ->group_exit_task have pending SIGKILL */
659 static inline int signal_group_exit(const struct signal_struct *sig) 659 static inline int signal_group_exit(const struct signal_struct *sig)
660 { 660 {
661 return (sig->flags & SIGNAL_GROUP_EXIT) || 661 return (sig->flags & SIGNAL_GROUP_EXIT) ||
662 (sig->group_exit_task != NULL); 662 (sig->group_exit_task != NULL);
663 } 663 }
664 664
665 /* 665 /*
666 * Some day this will be a full-fledged user tracking system.. 666 * Some day this will be a full-fledged user tracking system..
667 */ 667 */
668 struct user_struct { 668 struct user_struct {
669 atomic_t __count; /* reference count */ 669 atomic_t __count; /* reference count */
670 atomic_t processes; /* How many processes does this user have? */ 670 atomic_t processes; /* How many processes does this user have? */
671 atomic_t files; /* How many open files does this user have? */ 671 atomic_t files; /* How many open files does this user have? */
672 atomic_t sigpending; /* How many pending signals does this user have? */ 672 atomic_t sigpending; /* How many pending signals does this user have? */
673 #ifdef CONFIG_INOTIFY_USER 673 #ifdef CONFIG_INOTIFY_USER
674 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 674 atomic_t inotify_watches; /* How many inotify watches does this user have? */
675 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 675 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
676 #endif 676 #endif
677 #ifdef CONFIG_FANOTIFY 677 #ifdef CONFIG_FANOTIFY
678 atomic_t fanotify_listeners; 678 atomic_t fanotify_listeners;
679 #endif 679 #endif
680 #ifdef CONFIG_EPOLL 680 #ifdef CONFIG_EPOLL
681 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ 681 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
682 #endif 682 #endif
683 #ifdef CONFIG_POSIX_MQUEUE 683 #ifdef CONFIG_POSIX_MQUEUE
684 /* protected by mq_lock */ 684 /* protected by mq_lock */
685 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ 685 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
686 #endif 686 #endif
687 unsigned long locked_shm; /* How many pages of mlocked shm ? */ 687 unsigned long locked_shm; /* How many pages of mlocked shm ? */
688 688
689 #ifdef CONFIG_KEYS 689 #ifdef CONFIG_KEYS
690 struct key *uid_keyring; /* UID specific keyring */ 690 struct key *uid_keyring; /* UID specific keyring */
691 struct key *session_keyring; /* UID's default session keyring */ 691 struct key *session_keyring; /* UID's default session keyring */
692 #endif 692 #endif
693 693
694 /* Hash table maintenance information */ 694 /* Hash table maintenance information */
695 struct hlist_node uidhash_node; 695 struct hlist_node uidhash_node;
696 kuid_t uid; 696 kuid_t uid;
697 697
698 #ifdef CONFIG_PERF_EVENTS 698 #ifdef CONFIG_PERF_EVENTS
699 atomic_long_t locked_vm; 699 atomic_long_t locked_vm;
700 #endif 700 #endif
701 }; 701 };
702 702
703 extern int uids_sysfs_init(void); 703 extern int uids_sysfs_init(void);
704 704
705 extern struct user_struct *find_user(kuid_t); 705 extern struct user_struct *find_user(kuid_t);
706 706
707 extern struct user_struct root_user; 707 extern struct user_struct root_user;
708 #define INIT_USER (&root_user) 708 #define INIT_USER (&root_user)
709 709
710 710
711 struct backing_dev_info; 711 struct backing_dev_info;
712 struct reclaim_state; 712 struct reclaim_state;
713 713
714 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 714 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
715 struct sched_info { 715 struct sched_info {
716 /* cumulative counters */ 716 /* cumulative counters */
717 unsigned long pcount; /* # of times run on this cpu */ 717 unsigned long pcount; /* # of times run on this cpu */
718 unsigned long long run_delay; /* time spent waiting on a runqueue */ 718 unsigned long long run_delay; /* time spent waiting on a runqueue */
719 719
720 /* timestamps */ 720 /* timestamps */
721 unsigned long long last_arrival,/* when we last ran on a cpu */ 721 unsigned long long last_arrival,/* when we last ran on a cpu */
722 last_queued; /* when we were last queued to run */ 722 last_queued; /* when we were last queued to run */
723 }; 723 };
724 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 724 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
725 725
726 #ifdef CONFIG_TASK_DELAY_ACCT 726 #ifdef CONFIG_TASK_DELAY_ACCT
727 struct task_delay_info { 727 struct task_delay_info {
728 spinlock_t lock; 728 spinlock_t lock;
729 unsigned int flags; /* Private per-task flags */ 729 unsigned int flags; /* Private per-task flags */
730 730
731 /* For each stat XXX, add the following, aligned appropriately 731 /* For each stat XXX, add the following, aligned appropriately
732 * 732 *
733 * struct timespec XXX_start, XXX_end; 733 * struct timespec XXX_start, XXX_end;
734 * u64 XXX_delay; 734 * u64 XXX_delay;
735 * u32 XXX_count; 735 * u32 XXX_count;
736 * 736 *
737 * Atomicity of updates to XXX_delay, XXX_count protected by 737 * Atomicity of updates to XXX_delay, XXX_count protected by
738 * single lock above (split into XXX_lock if contention is an issue). 738 * single lock above (split into XXX_lock if contention is an issue).
739 */ 739 */
740 740
741 /* 741 /*
742 * XXX_count is incremented on every XXX operation, the delay 742 * XXX_count is incremented on every XXX operation, the delay
743 * associated with the operation is added to XXX_delay. 743 * associated with the operation is added to XXX_delay.
744 * XXX_delay contains the accumulated delay time in nanoseconds. 744 * XXX_delay contains the accumulated delay time in nanoseconds.
745 */ 745 */
746 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ 746 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
747 u64 blkio_delay; /* wait for sync block io completion */ 747 u64 blkio_delay; /* wait for sync block io completion */
748 u64 swapin_delay; /* wait for swapin block io completion */ 748 u64 swapin_delay; /* wait for swapin block io completion */
749 u32 blkio_count; /* total count of the number of sync block */ 749 u32 blkio_count; /* total count of the number of sync block */
750 /* io operations performed */ 750 /* io operations performed */
751 u32 swapin_count; /* total count of the number of swapin block */ 751 u32 swapin_count; /* total count of the number of swapin block */
752 /* io operations performed */ 752 /* io operations performed */
753 753
754 struct timespec freepages_start, freepages_end; 754 struct timespec freepages_start, freepages_end;
755 u64 freepages_delay; /* wait for memory reclaim */ 755 u64 freepages_delay; /* wait for memory reclaim */
756 u32 freepages_count; /* total count of memory reclaim */ 756 u32 freepages_count; /* total count of memory reclaim */
757 }; 757 };
758 #endif /* CONFIG_TASK_DELAY_ACCT */ 758 #endif /* CONFIG_TASK_DELAY_ACCT */
759 759
760 static inline int sched_info_on(void) 760 static inline int sched_info_on(void)
761 { 761 {
762 #ifdef CONFIG_SCHEDSTATS 762 #ifdef CONFIG_SCHEDSTATS
763 return 1; 763 return 1;
764 #elif defined(CONFIG_TASK_DELAY_ACCT) 764 #elif defined(CONFIG_TASK_DELAY_ACCT)
765 extern int delayacct_on; 765 extern int delayacct_on;
766 return delayacct_on; 766 return delayacct_on;
767 #else 767 #else
768 return 0; 768 return 0;
769 #endif 769 #endif
770 } 770 }
771 771
772 enum cpu_idle_type { 772 enum cpu_idle_type {
773 CPU_IDLE, 773 CPU_IDLE,
774 CPU_NOT_IDLE, 774 CPU_NOT_IDLE,
775 CPU_NEWLY_IDLE, 775 CPU_NEWLY_IDLE,
776 CPU_MAX_IDLE_TYPES 776 CPU_MAX_IDLE_TYPES
777 }; 777 };
778 778
779 /* 779 /*
780 * Increase resolution of nice-level calculations for 64-bit architectures. 780 * Increase resolution of nice-level calculations for 64-bit architectures.
781 * The extra resolution improves shares distribution and load balancing of 781 * The extra resolution improves shares distribution and load balancing of
782 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup 782 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
783 * hierarchies, especially on larger systems. This is not a user-visible change 783 * hierarchies, especially on larger systems. This is not a user-visible change
784 * and does not change the user-interface for setting shares/weights. 784 * and does not change the user-interface for setting shares/weights.
785 * 785 *
786 * We increase resolution only if we have enough bits to allow this increased 786 * We increase resolution only if we have enough bits to allow this increased
787 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution 787 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
788 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the 788 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
789 * increased costs. 789 * increased costs.
790 */ 790 */
791 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ 791 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
792 # define SCHED_LOAD_RESOLUTION 10 792 # define SCHED_LOAD_RESOLUTION 10
793 # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) 793 # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
794 # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) 794 # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
795 #else 795 #else
796 # define SCHED_LOAD_RESOLUTION 0 796 # define SCHED_LOAD_RESOLUTION 0
797 # define scale_load(w) (w) 797 # define scale_load(w) (w)
798 # define scale_load_down(w) (w) 798 # define scale_load_down(w) (w)
799 #endif 799 #endif
800 800
801 #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) 801 #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
802 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 802 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
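A worked example for the load-scaling macros (assuming the default weight table, where a nice-0 task weighs 1024; with SCHED_LOAD_RESOLUTION currently forced to 0, both macros are the identity):

	scale_load(1024)	== 1024 << SCHED_LOAD_RESOLUTION	/* 1024 today, 1048576 if the resolution were 10 */
	scale_load_down(w)	== w >> SCHED_LOAD_RESOLUTION
	SCHED_LOAD_SCALE	== 1L << (10 + SCHED_LOAD_RESOLUTION)	/* 1024 today */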
803 803
804 /* 804 /*
805 * Increase resolution of cpu_power calculations 805 * Increase resolution of cpu_power calculations
806 */ 806 */
807 #define SCHED_POWER_SHIFT 10 807 #define SCHED_POWER_SHIFT 10
808 #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) 808 #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT)
809 809
810 /* 810 /*
811 * sched-domains (multiprocessor balancing) declarations: 811 * sched-domains (multiprocessor balancing) declarations:
812 */ 812 */
813 #ifdef CONFIG_SMP 813 #ifdef CONFIG_SMP
814 #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ 814 #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
815 #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ 815 #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
816 #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ 816 #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
817 #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ 817 #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
818 #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ 818 #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
819 #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ 819 #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
820 #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ 820 #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
821 #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ 821 #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
822 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ 822 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
823 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ 823 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
824 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 824 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
825 #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 825 #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
826 826
827 extern int __weak arch_sd_sibiling_asym_packing(void); 827 extern int __weak arch_sd_sibiling_asym_packing(void);
828 828
829 struct sched_group_power { 829 struct sched_group_power {
830 atomic_t ref; 830 atomic_t ref;
831 /* 831 /*
832 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 832 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
833 * single CPU. 833 * single CPU.
834 */ 834 */
835 unsigned int power, power_orig; 835 unsigned int power, power_orig;
836 unsigned long next_update; 836 unsigned long next_update;
837 /* 837 /*
838 * Number of busy cpus in this group. 838 * Number of busy cpus in this group.
839 */ 839 */
840 atomic_t nr_busy_cpus; 840 atomic_t nr_busy_cpus;
841 841
842 unsigned long cpumask[0]; /* iteration mask */ 842 unsigned long cpumask[0]; /* iteration mask */
843 }; 843 };
844 844
845 struct sched_group { 845 struct sched_group {
846 struct sched_group *next; /* Must be a circular list */ 846 struct sched_group *next; /* Must be a circular list */
847 atomic_t ref; 847 atomic_t ref;
848 848
849 unsigned int group_weight; 849 unsigned int group_weight;
850 struct sched_group_power *sgp; 850 struct sched_group_power *sgp;
851 851
852 /* 852 /*
853 * The CPUs this group covers. 853 * The CPUs this group covers.
854 * 854 *
855 * NOTE: this field is variable length. (Allocated dynamically 855 * NOTE: this field is variable length. (Allocated dynamically
856 * by attaching extra space to the end of the structure, 856 * by attaching extra space to the end of the structure,
857 * depending on how many CPUs the kernel has booted up with) 857 * depending on how many CPUs the kernel has booted up with)
858 */ 858 */
859 unsigned long cpumask[0]; 859 unsigned long cpumask[0];
860 }; 860 };
861 861
862 static inline struct cpumask *sched_group_cpus(struct sched_group *sg) 862 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
863 { 863 {
864 return to_cpumask(sg->cpumask); 864 return to_cpumask(sg->cpumask);
865 } 865 }
866 866
867 /* 867 /*
868 * cpumask masking which cpus in the group are allowed to iterate up the domain 868 * cpumask masking which cpus in the group are allowed to iterate up the domain
869 * tree. 869 * tree.
870 */ 870 */
871 static inline struct cpumask *sched_group_mask(struct sched_group *sg) 871 static inline struct cpumask *sched_group_mask(struct sched_group *sg)
872 { 872 {
873 return to_cpumask(sg->sgp->cpumask); 873 return to_cpumask(sg->sgp->cpumask);
874 } 874 }
875 875
876 /** 876 /**
877 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 877 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
878 * @group: The group whose first cpu is to be returned. 878 * @group: The group whose first cpu is to be returned.
879 */ 879 */
880 static inline unsigned int group_first_cpu(struct sched_group *group) 880 static inline unsigned int group_first_cpu(struct sched_group *group)
881 { 881 {
882 return cpumask_first(sched_group_cpus(group)); 882 return cpumask_first(sched_group_cpus(group));
883 } 883 }
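Because ->next forms a circular list (see struct sched_group above), a walk over a domain's groups typically looks like this minimal sketch (kernel context assumed; 'sd' is some struct sched_domain * already in hand):

	struct sched_group *sg = sd->groups;

	do {
		int cpu = group_first_cpu(sg);	/* first CPU covered by this group */

		pr_debug("group starting at cpu %d, weight %u\n", cpu, sg->group_weight);
		sg = sg->next;
	} while (sg != sd->groups);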
884 884
885 struct sched_domain_attr { 885 struct sched_domain_attr {
886 int relax_domain_level; 886 int relax_domain_level;
887 }; 887 };
888 888
889 #define SD_ATTR_INIT (struct sched_domain_attr) { \ 889 #define SD_ATTR_INIT (struct sched_domain_attr) { \
890 .relax_domain_level = -1, \ 890 .relax_domain_level = -1, \
891 } 891 }
892 892
893 extern int sched_domain_level_max; 893 extern int sched_domain_level_max;
894 894
895 struct sched_domain { 895 struct sched_domain {
896 /* These fields must be setup */ 896 /* These fields must be setup */
897 struct sched_domain *parent; /* top domain must be null terminated */ 897 struct sched_domain *parent; /* top domain must be null terminated */
898 struct sched_domain *child; /* bottom domain must be null terminated */ 898 struct sched_domain *child; /* bottom domain must be null terminated */
899 struct sched_group *groups; /* the balancing groups of the domain */ 899 struct sched_group *groups; /* the balancing groups of the domain */
900 unsigned long min_interval; /* Minimum balance interval ms */ 900 unsigned long min_interval; /* Minimum balance interval ms */
901 unsigned long max_interval; /* Maximum balance interval ms */ 901 unsigned long max_interval; /* Maximum balance interval ms */
902 unsigned int busy_factor; /* less balancing by factor if busy */ 902 unsigned int busy_factor; /* less balancing by factor if busy */
903 unsigned int imbalance_pct; /* No balance until over watermark */ 903 unsigned int imbalance_pct; /* No balance until over watermark */
904 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 904 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
905 unsigned int busy_idx; 905 unsigned int busy_idx;
906 unsigned int idle_idx; 906 unsigned int idle_idx;
907 unsigned int newidle_idx; 907 unsigned int newidle_idx;
908 unsigned int wake_idx; 908 unsigned int wake_idx;
909 unsigned int forkexec_idx; 909 unsigned int forkexec_idx;
910 unsigned int smt_gain; 910 unsigned int smt_gain;
911 int flags; /* See SD_* */ 911 int flags; /* See SD_* */
912 int level; 912 int level;
913 913
914 /* Runtime fields. */ 914 /* Runtime fields. */
915 unsigned long last_balance; /* init to jiffies. units in jiffies */ 915 unsigned long last_balance; /* init to jiffies. units in jiffies */
916 unsigned int balance_interval; /* initialise to 1. units in ms. */ 916 unsigned int balance_interval; /* initialise to 1. units in ms. */
917 unsigned int nr_balance_failed; /* initialise to 0 */ 917 unsigned int nr_balance_failed; /* initialise to 0 */
918 918
919 u64 last_update; 919 u64 last_update;
920 920
921 #ifdef CONFIG_SCHEDSTATS 921 #ifdef CONFIG_SCHEDSTATS
922 /* load_balance() stats */ 922 /* load_balance() stats */
923 unsigned int lb_count[CPU_MAX_IDLE_TYPES]; 923 unsigned int lb_count[CPU_MAX_IDLE_TYPES];
924 unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; 924 unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
925 unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; 925 unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
926 unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; 926 unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
927 unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; 927 unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
928 unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; 928 unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
929 unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; 929 unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
930 unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; 930 unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
931 931
932 /* Active load balancing */ 932 /* Active load balancing */
933 unsigned int alb_count; 933 unsigned int alb_count;
934 unsigned int alb_failed; 934 unsigned int alb_failed;
935 unsigned int alb_pushed; 935 unsigned int alb_pushed;
936 936
937 /* SD_BALANCE_EXEC stats */ 937 /* SD_BALANCE_EXEC stats */
938 unsigned int sbe_count; 938 unsigned int sbe_count;
939 unsigned int sbe_balanced; 939 unsigned int sbe_balanced;
940 unsigned int sbe_pushed; 940 unsigned int sbe_pushed;
941 941
942 /* SD_BALANCE_FORK stats */ 942 /* SD_BALANCE_FORK stats */
943 unsigned int sbf_count; 943 unsigned int sbf_count;
944 unsigned int sbf_balanced; 944 unsigned int sbf_balanced;
945 unsigned int sbf_pushed; 945 unsigned int sbf_pushed;
946 946
947 /* try_to_wake_up() stats */ 947 /* try_to_wake_up() stats */
948 unsigned int ttwu_wake_remote; 948 unsigned int ttwu_wake_remote;
949 unsigned int ttwu_move_affine; 949 unsigned int ttwu_move_affine;
950 unsigned int ttwu_move_balance; 950 unsigned int ttwu_move_balance;
951 #endif 951 #endif
952 #ifdef CONFIG_SCHED_DEBUG 952 #ifdef CONFIG_SCHED_DEBUG
953 char *name; 953 char *name;
954 #endif 954 #endif
955 union { 955 union {
956 void *private; /* used during construction */ 956 void *private; /* used during construction */
957 struct rcu_head rcu; /* used during destruction */ 957 struct rcu_head rcu; /* used during destruction */
958 }; 958 };
959 959
960 unsigned int span_weight; 960 unsigned int span_weight;
961 /* 961 /*
962 * Span of all CPUs in this domain. 962 * Span of all CPUs in this domain.
963 * 963 *
964 * NOTE: this field is variable length. (Allocated dynamically 964 * NOTE: this field is variable length. (Allocated dynamically
965 * by attaching extra space to the end of the structure, 965 * by attaching extra space to the end of the structure,
966 * depending on how many CPUs the kernel has booted up with) 966 * depending on how many CPUs the kernel has booted up with)
967 */ 967 */
968 unsigned long span[0]; 968 unsigned long span[0];
969 }; 969 };
970 970
971 static inline struct cpumask *sched_domain_span(struct sched_domain *sd) 971 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
972 { 972 {
973 return to_cpumask(sd->span); 973 return to_cpumask(sd->span);
974 } 974 }
975 975
976 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 976 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
977 struct sched_domain_attr *dattr_new); 977 struct sched_domain_attr *dattr_new);
978 978
979 /* Allocate an array of sched domains, for partition_sched_domains(). */ 979 /* Allocate an array of sched domains, for partition_sched_domains(). */
980 cpumask_var_t *alloc_sched_domains(unsigned int ndoms); 980 cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
981 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); 981 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
982 982
983 /* Test a flag in parent sched domain */ 983 /* Test a flag in parent sched domain */
984 static inline int test_sd_parent(struct sched_domain *sd, int flag) 984 static inline int test_sd_parent(struct sched_domain *sd, int flag)
985 { 985 {
986 if (sd->parent && (sd->parent->flags & flag)) 986 if (sd->parent && (sd->parent->flags & flag))
987 return 1; 987 return 1;
988 988
989 return 0; 989 return 0;
990 } 990 }
991 991
992 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); 992 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
993 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); 993 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
994 994
995 bool cpus_share_cache(int this_cpu, int that_cpu); 995 bool cpus_share_cache(int this_cpu, int that_cpu);
996 996
997 #else /* CONFIG_SMP */ 997 #else /* CONFIG_SMP */
998 998
999 struct sched_domain_attr; 999 struct sched_domain_attr;
1000 1000
1001 static inline void 1001 static inline void
1002 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 1002 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1003 struct sched_domain_attr *dattr_new) 1003 struct sched_domain_attr *dattr_new)
1004 { 1004 {
1005 } 1005 }
1006 1006
1007 static inline bool cpus_share_cache(int this_cpu, int that_cpu) 1007 static inline bool cpus_share_cache(int this_cpu, int that_cpu)
1008 { 1008 {
1009 return true; 1009 return true;
1010 } 1010 }
1011 1011
1012 #endif /* !CONFIG_SMP */ 1012 #endif /* !CONFIG_SMP */
1013 1013
1014 1014
1015 struct io_context; /* See blkdev.h */ 1015 struct io_context; /* See blkdev.h */
1016 1016
1017 1017
1018 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK 1018 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
1019 extern void prefetch_stack(struct task_struct *t); 1019 extern void prefetch_stack(struct task_struct *t);
1020 #else 1020 #else
1021 static inline void prefetch_stack(struct task_struct *t) { } 1021 static inline void prefetch_stack(struct task_struct *t) { }
1022 #endif 1022 #endif
1023 1023
1024 struct audit_context; /* See audit.c */ 1024 struct audit_context; /* See audit.c */
1025 struct mempolicy; 1025 struct mempolicy;
1026 struct pipe_inode_info; 1026 struct pipe_inode_info;
1027 struct uts_namespace; 1027 struct uts_namespace;
1028 1028
1029 struct rq; 1029 struct rq;
1030 struct sched_domain; 1030 struct sched_domain;
1031 1031
1032 /* 1032 /*
1033 * wake flags 1033 * wake flags
1034 */ 1034 */
1035 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1035 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
1036 #define WF_FORK 0x02 /* child wakeup after fork */ 1036 #define WF_FORK 0x02 /* child wakeup after fork */
1037 #define WF_MIGRATED 0x04 /* internal use, task got migrated */ 1037 #define WF_MIGRATED 0x04 /* internal use, task got migrated */
1038 1038
1039 #define ENQUEUE_WAKEUP 1 1039 #define ENQUEUE_WAKEUP 1
1040 #define ENQUEUE_HEAD 2 1040 #define ENQUEUE_HEAD 2
1041 #ifdef CONFIG_SMP 1041 #ifdef CONFIG_SMP
1042 #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 1042 #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
1043 #else 1043 #else
1044 #define ENQUEUE_WAKING 0 1044 #define ENQUEUE_WAKING 0
1045 #endif 1045 #endif
1046 1046
1047 #define DEQUEUE_SLEEP 1 1047 #define DEQUEUE_SLEEP 1
1048 1048
1049 struct sched_class { 1049 struct sched_class {
1050 const struct sched_class *next; 1050 const struct sched_class *next;
1051 1051
1052 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1052 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1053 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1053 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1054 void (*yield_task) (struct rq *rq); 1054 void (*yield_task) (struct rq *rq);
1055 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1055 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
1056 1056
1057 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1057 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1058 1058
1059 struct task_struct * (*pick_next_task) (struct rq *rq); 1059 struct task_struct * (*pick_next_task) (struct rq *rq);
1060 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1060 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1061 1061
1062 #ifdef CONFIG_SMP 1062 #ifdef CONFIG_SMP
1063 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1063 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1064 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1064 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1065 1065
1066 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1066 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1067 void (*post_schedule) (struct rq *this_rq); 1067 void (*post_schedule) (struct rq *this_rq);
1068 void (*task_waking) (struct task_struct *task); 1068 void (*task_waking) (struct task_struct *task);
1069 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1069 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
1070 1070
1071 void (*set_cpus_allowed)(struct task_struct *p, 1071 void (*set_cpus_allowed)(struct task_struct *p,
1072 const struct cpumask *newmask); 1072 const struct cpumask *newmask);
1073 1073
1074 void (*rq_online)(struct rq *rq); 1074 void (*rq_online)(struct rq *rq);
1075 void (*rq_offline)(struct rq *rq); 1075 void (*rq_offline)(struct rq *rq);
1076 #endif 1076 #endif
1077 1077
1078 void (*set_curr_task) (struct rq *rq); 1078 void (*set_curr_task) (struct rq *rq);
1079 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1079 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1080 void (*task_fork) (struct task_struct *p); 1080 void (*task_fork) (struct task_struct *p);
1081 1081
1082 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1082 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1083 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1083 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1084 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1084 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1085 int oldprio); 1085 int oldprio);
1086 1086
1087 unsigned int (*get_rr_interval) (struct rq *rq, 1087 unsigned int (*get_rr_interval) (struct rq *rq,
1088 struct task_struct *task); 1088 struct task_struct *task);
1089 1089
1090 #ifdef CONFIG_FAIR_GROUP_SCHED 1090 #ifdef CONFIG_FAIR_GROUP_SCHED
1091 void (*task_move_group) (struct task_struct *p, int on_rq); 1091 void (*task_move_group) (struct task_struct *p, int on_rq);
1092 #endif 1092 #endif
1093 }; 1093 };
1094 1094
1095 struct load_weight { 1095 struct load_weight {
1096 unsigned long weight, inv_weight; 1096 unsigned long weight, inv_weight;
1097 }; 1097 };
1098 1098
1099 struct sched_avg { 1099 struct sched_avg {
1100 /* 1100 /*
1101 * These sums represent an infinite geometric series and so are bound 1101 * These sums represent an infinite geometric series and so are bound
1102 * above by 1024/(1-y). Thus we only need a u32 to store them for all 1102 * above by 1024/(1-y). Thus we only need a u32 to store them for all
1103 * choices of y < 1-2^(-32)*1024. 1103 * choices of y < 1-2^(-32)*1024.
1104 */ 1104 */
1105 u32 runnable_avg_sum, runnable_avg_period; 1105 u32 runnable_avg_sum, runnable_avg_period;
1106 u64 last_runnable_update; 1106 u64 last_runnable_update;
1107 s64 decay_count; 1107 s64 decay_count;
1108 unsigned long load_avg_contrib; 1108 unsigned long load_avg_contrib;
1109 }; 1109 };
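A quick standalone sketch (not part of this header) of why a u32 suffices: each ~1ms period contributes at most 1024, so the infinite series is bounded by 1024/(1-y). Assuming the decay used by the load-tracking code, where y^32 = 1/2, the bound works out to roughly 47742.

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* assumed decay: y^32 == 1/2 */
	double bound = 1024.0 / (1.0 - y);	/* limit of the geometric series */

	printf("y = %.6f, 1024/(1-y) ~= %.0f (fits easily in a u32)\n", y, bound);
	return 0;
}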
1110 1110
1111 #ifdef CONFIG_SCHEDSTATS 1111 #ifdef CONFIG_SCHEDSTATS
1112 struct sched_statistics { 1112 struct sched_statistics {
1113 u64 wait_start; 1113 u64 wait_start;
1114 u64 wait_max; 1114 u64 wait_max;
1115 u64 wait_count; 1115 u64 wait_count;
1116 u64 wait_sum; 1116 u64 wait_sum;
1117 u64 iowait_count; 1117 u64 iowait_count;
1118 u64 iowait_sum; 1118 u64 iowait_sum;
1119 1119
1120 u64 sleep_start; 1120 u64 sleep_start;
1121 u64 sleep_max; 1121 u64 sleep_max;
1122 s64 sum_sleep_runtime; 1122 s64 sum_sleep_runtime;
1123 1123
1124 u64 block_start; 1124 u64 block_start;
1125 u64 block_max; 1125 u64 block_max;
1126 u64 exec_max; 1126 u64 exec_max;
1127 u64 slice_max; 1127 u64 slice_max;
1128 1128
1129 u64 nr_migrations_cold; 1129 u64 nr_migrations_cold;
1130 u64 nr_failed_migrations_affine; 1130 u64 nr_failed_migrations_affine;
1131 u64 nr_failed_migrations_running; 1131 u64 nr_failed_migrations_running;
1132 u64 nr_failed_migrations_hot; 1132 u64 nr_failed_migrations_hot;
1133 u64 nr_forced_migrations; 1133 u64 nr_forced_migrations;
1134 1134
1135 u64 nr_wakeups; 1135 u64 nr_wakeups;
1136 u64 nr_wakeups_sync; 1136 u64 nr_wakeups_sync;
1137 u64 nr_wakeups_migrate; 1137 u64 nr_wakeups_migrate;
1138 u64 nr_wakeups_local; 1138 u64 nr_wakeups_local;
1139 u64 nr_wakeups_remote; 1139 u64 nr_wakeups_remote;
1140 u64 nr_wakeups_affine; 1140 u64 nr_wakeups_affine;
1141 u64 nr_wakeups_affine_attempts; 1141 u64 nr_wakeups_affine_attempts;
1142 u64 nr_wakeups_passive; 1142 u64 nr_wakeups_passive;
1143 u64 nr_wakeups_idle; 1143 u64 nr_wakeups_idle;
1144 }; 1144 };
1145 #endif 1145 #endif
1146 1146
1147 struct sched_entity { 1147 struct sched_entity {
1148 struct load_weight load; /* for load-balancing */ 1148 struct load_weight load; /* for load-balancing */
1149 struct rb_node run_node; 1149 struct rb_node run_node;
1150 struct list_head group_node; 1150 struct list_head group_node;
1151 unsigned int on_rq; 1151 unsigned int on_rq;
1152 1152
1153 u64 exec_start; 1153 u64 exec_start;
1154 u64 sum_exec_runtime; 1154 u64 sum_exec_runtime;
1155 u64 vruntime; 1155 u64 vruntime;
1156 u64 prev_sum_exec_runtime; 1156 u64 prev_sum_exec_runtime;
1157 1157
1158 u64 nr_migrations; 1158 u64 nr_migrations;
1159 1159
1160 #ifdef CONFIG_SCHEDSTATS 1160 #ifdef CONFIG_SCHEDSTATS
1161 struct sched_statistics statistics; 1161 struct sched_statistics statistics;
1162 #endif 1162 #endif
1163 1163
1164 #ifdef CONFIG_FAIR_GROUP_SCHED 1164 #ifdef CONFIG_FAIR_GROUP_SCHED
1165 struct sched_entity *parent; 1165 struct sched_entity *parent;
1166 /* rq on which this entity is (to be) queued: */ 1166 /* rq on which this entity is (to be) queued: */
1167 struct cfs_rq *cfs_rq; 1167 struct cfs_rq *cfs_rq;
1168 /* rq "owned" by this entity/group: */ 1168 /* rq "owned" by this entity/group: */
1169 struct cfs_rq *my_q; 1169 struct cfs_rq *my_q;
1170 #endif 1170 #endif
1171 /* 1171 /*
1172 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may be 1172 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may be
1173 * removed when it becomes useful for applications beyond shares distribution (e.g. 1173 * removed when it becomes useful for applications beyond shares distribution (e.g.
1174 * load-balance). 1174 * load-balance).
1175 */ 1175 */
1176 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) 1176 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1177 /* Per-entity load-tracking */ 1177 /* Per-entity load-tracking */
1178 struct sched_avg avg; 1178 struct sched_avg avg;
1179 #endif 1179 #endif
1180 }; 1180 };
1181 1181
1182 struct sched_rt_entity { 1182 struct sched_rt_entity {
1183 struct list_head run_list; 1183 struct list_head run_list;
1184 unsigned long timeout; 1184 unsigned long timeout;
1185 unsigned int time_slice; 1185 unsigned int time_slice;
1186 1186
1187 struct sched_rt_entity *back; 1187 struct sched_rt_entity *back;
1188 #ifdef CONFIG_RT_GROUP_SCHED 1188 #ifdef CONFIG_RT_GROUP_SCHED
1189 struct sched_rt_entity *parent; 1189 struct sched_rt_entity *parent;
1190 /* rq on which this entity is (to be) queued: */ 1190 /* rq on which this entity is (to be) queued: */
1191 struct rt_rq *rt_rq; 1191 struct rt_rq *rt_rq;
1192 /* rq "owned" by this entity/group: */ 1192 /* rq "owned" by this entity/group: */
1193 struct rt_rq *my_q; 1193 struct rt_rq *my_q;
1194 #endif 1194 #endif
1195 }; 1195 };
1196 1196
1197 /* 1197 /*
1198 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 1198 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
1199 * Timeslices get refilled after they expire. 1199 * Timeslices get refilled after they expire.
1200 */ 1200 */
1201 #define RR_TIMESLICE (100 * HZ / 1000) 1201 #define RR_TIMESLICE (100 * HZ / 1000)
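The macro above just converts 100 ms into jiffies; a small userspace sketch of the same arithmetic for a few common HZ values (the actual result depends on the kernel's CONFIG_HZ):

#include <stdio.h>

static unsigned long rr_timeslice(unsigned long hz)
{
	return 100 * hz / 1000;		/* same formula as RR_TIMESLICE */
}

int main(void)
{
	printf("HZ=100  -> %3lu jiffies\n", rr_timeslice(100));		/* 10  */
	printf("HZ=250  -> %3lu jiffies\n", rr_timeslice(250));		/* 25  */
	printf("HZ=1000 -> %3lu jiffies\n", rr_timeslice(1000));	/* 100 */
	return 0;
}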
1202 1202
1203 struct rcu_node; 1203 struct rcu_node;
1204 1204
1205 enum perf_event_task_context { 1205 enum perf_event_task_context {
1206 perf_invalid_context = -1, 1206 perf_invalid_context = -1,
1207 perf_hw_context = 0, 1207 perf_hw_context = 0,
1208 perf_sw_context, 1208 perf_sw_context,
1209 perf_nr_task_contexts, 1209 perf_nr_task_contexts,
1210 }; 1210 };
1211 1211
1212 struct task_struct { 1212 struct task_struct {
1213 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1213 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1214 void *stack; 1214 void *stack;
1215 atomic_t usage; 1215 atomic_t usage;
1216 unsigned int flags; /* per process flags, defined below */ 1216 unsigned int flags; /* per process flags, defined below */
1217 unsigned int ptrace; 1217 unsigned int ptrace;
1218 1218
1219 #ifdef CONFIG_SMP 1219 #ifdef CONFIG_SMP
1220 struct llist_node wake_entry; 1220 struct llist_node wake_entry;
1221 int on_cpu; 1221 int on_cpu;
1222 #endif 1222 #endif
1223 int on_rq; 1223 int on_rq;
1224 1224
1225 int prio, static_prio, normal_prio; 1225 int prio, static_prio, normal_prio;
1226 unsigned int rt_priority; 1226 unsigned int rt_priority;
1227 const struct sched_class *sched_class; 1227 const struct sched_class *sched_class;
1228 struct sched_entity se; 1228 struct sched_entity se;
1229 struct sched_rt_entity rt; 1229 struct sched_rt_entity rt;
1230 #ifdef CONFIG_CGROUP_SCHED 1230 #ifdef CONFIG_CGROUP_SCHED
1231 struct task_group *sched_task_group; 1231 struct task_group *sched_task_group;
1232 #endif 1232 #endif
1233 1233
1234 #ifdef CONFIG_PREEMPT_NOTIFIERS 1234 #ifdef CONFIG_PREEMPT_NOTIFIERS
1235 /* list of struct preempt_notifier: */ 1235 /* list of struct preempt_notifier: */
1236 struct hlist_head preempt_notifiers; 1236 struct hlist_head preempt_notifiers;
1237 #endif 1237 #endif
1238 1238
1239 /* 1239 /*
1240 * fpu_counter contains the number of consecutive context switches 1240 * fpu_counter contains the number of consecutive context switches
1241 * during which the FPU is used. If this is over a threshold, the lazy fpu 1241 * during which the FPU is used. If this is over a threshold, the lazy fpu
1242 * saving becomes unlazy to save the trap. This is an unsigned char 1242 * saving becomes unlazy to save the trap. This is an unsigned char
1243 * so that after 256 times the counter wraps and the behavior turns 1243 * so that after 256 times the counter wraps and the behavior turns
1244 * lazy again; this is to deal with bursty apps that only use the FPU for 1244 * lazy again; this is to deal with bursty apps that only use the FPU for
1245 * a short time. 1245 * a short time.
1246 */ 1246 */
1247 unsigned char fpu_counter; 1247 unsigned char fpu_counter;
1248 #ifdef CONFIG_BLK_DEV_IO_TRACE 1248 #ifdef CONFIG_BLK_DEV_IO_TRACE
1249 unsigned int btrace_seq; 1249 unsigned int btrace_seq;
1250 #endif 1250 #endif
1251 1251
1252 unsigned int policy; 1252 unsigned int policy;
1253 int nr_cpus_allowed; 1253 int nr_cpus_allowed;
1254 cpumask_t cpus_allowed; 1254 cpumask_t cpus_allowed;
1255 1255
1256 #ifdef CONFIG_PREEMPT_RCU 1256 #ifdef CONFIG_PREEMPT_RCU
1257 int rcu_read_lock_nesting; 1257 int rcu_read_lock_nesting;
1258 char rcu_read_unlock_special; 1258 char rcu_read_unlock_special;
1259 struct list_head rcu_node_entry; 1259 struct list_head rcu_node_entry;
1260 #endif /* #ifdef CONFIG_PREEMPT_RCU */ 1260 #endif /* #ifdef CONFIG_PREEMPT_RCU */
1261 #ifdef CONFIG_TREE_PREEMPT_RCU 1261 #ifdef CONFIG_TREE_PREEMPT_RCU
1262 struct rcu_node *rcu_blocked_node; 1262 struct rcu_node *rcu_blocked_node;
1263 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1263 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1264 #ifdef CONFIG_RCU_BOOST 1264 #ifdef CONFIG_RCU_BOOST
1265 struct rt_mutex *rcu_boost_mutex; 1265 struct rt_mutex *rcu_boost_mutex;
1266 #endif /* #ifdef CONFIG_RCU_BOOST */ 1266 #endif /* #ifdef CONFIG_RCU_BOOST */
1267 1267
1268 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1268 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1269 struct sched_info sched_info; 1269 struct sched_info sched_info;
1270 #endif 1270 #endif
1271 1271
1272 struct list_head tasks; 1272 struct list_head tasks;
1273 #ifdef CONFIG_SMP 1273 #ifdef CONFIG_SMP
1274 struct plist_node pushable_tasks; 1274 struct plist_node pushable_tasks;
1275 #endif 1275 #endif
1276 1276
1277 struct mm_struct *mm, *active_mm; 1277 struct mm_struct *mm, *active_mm;
1278 #ifdef CONFIG_COMPAT_BRK 1278 #ifdef CONFIG_COMPAT_BRK
1279 unsigned brk_randomized:1; 1279 unsigned brk_randomized:1;
1280 #endif 1280 #endif
1281 #if defined(SPLIT_RSS_COUNTING) 1281 #if defined(SPLIT_RSS_COUNTING)
1282 struct task_rss_stat rss_stat; 1282 struct task_rss_stat rss_stat;
1283 #endif 1283 #endif
1284 /* task state */ 1284 /* task state */
1285 int exit_state; 1285 int exit_state;
1286 int exit_code, exit_signal; 1286 int exit_code, exit_signal;
1287 int pdeath_signal; /* The signal sent when the parent dies */ 1287 int pdeath_signal; /* The signal sent when the parent dies */
1288 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1288 unsigned int jobctl; /* JOBCTL_*, siglock protected */
1289 /* ??? */ 1289 /* ??? */
1290 unsigned int personality; 1290 unsigned int personality;
1291 unsigned did_exec:1; 1291 unsigned did_exec:1;
1292 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1292 unsigned in_execve:1; /* Tell the LSMs that the process is doing an
1293 * execve */ 1293 * execve */
1294 unsigned in_iowait:1; 1294 unsigned in_iowait:1;
1295 1295
1296 /* task may not gain privileges */ 1296 /* task may not gain privileges */
1297 unsigned no_new_privs:1; 1297 unsigned no_new_privs:1;
1298 1298
1299 /* Revert to default priority/policy when forking */ 1299 /* Revert to default priority/policy when forking */
1300 unsigned sched_reset_on_fork:1; 1300 unsigned sched_reset_on_fork:1;
1301 unsigned sched_contributes_to_load:1; 1301 unsigned sched_contributes_to_load:1;
1302 1302
1303 pid_t pid; 1303 pid_t pid;
1304 pid_t tgid; 1304 pid_t tgid;
1305 1305
1306 #ifdef CONFIG_CC_STACKPROTECTOR 1306 #ifdef CONFIG_CC_STACKPROTECTOR
1307 /* Canary value for the -fstack-protector gcc feature */ 1307 /* Canary value for the -fstack-protector gcc feature */
1308 unsigned long stack_canary; 1308 unsigned long stack_canary;
1309 #endif 1309 #endif
1310 /* 1310 /*
1311 * pointers to (original) parent process, youngest child, younger sibling, 1311 * pointers to (original) parent process, youngest child, younger sibling,
1312 * older sibling, respectively. (p->father can be replaced with 1312 * older sibling, respectively. (p->father can be replaced with
1313 * p->real_parent->pid) 1313 * p->real_parent->pid)
1314 */ 1314 */
1315 struct task_struct __rcu *real_parent; /* real parent process */ 1315 struct task_struct __rcu *real_parent; /* real parent process */
1316 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ 1316 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1317 /* 1317 /*
1318 * children/sibling forms the list of my natural children 1318 * children/sibling forms the list of my natural children
1319 */ 1319 */
1320 struct list_head children; /* list of my children */ 1320 struct list_head children; /* list of my children */
1321 struct list_head sibling; /* linkage in my parent's children list */ 1321 struct list_head sibling; /* linkage in my parent's children list */
1322 struct task_struct *group_leader; /* threadgroup leader */ 1322 struct task_struct *group_leader; /* threadgroup leader */
1323 1323
1324 /* 1324 /*
1325 * ptraced is the list of tasks this task is using ptrace on. 1325 * ptraced is the list of tasks this task is using ptrace on.
1326 * This includes both natural children and PTRACE_ATTACH targets. 1326 * This includes both natural children and PTRACE_ATTACH targets.
1327 * p->ptrace_entry is p's link on the p->parent->ptraced list. 1327 * p->ptrace_entry is p's link on the p->parent->ptraced list.
1328 */ 1328 */
1329 struct list_head ptraced; 1329 struct list_head ptraced;
1330 struct list_head ptrace_entry; 1330 struct list_head ptrace_entry;
1331 1331
1332 /* PID/PID hash table linkage. */ 1332 /* PID/PID hash table linkage. */
1333 struct pid_link pids[PIDTYPE_MAX]; 1333 struct pid_link pids[PIDTYPE_MAX];
1334 struct list_head thread_group; 1334 struct list_head thread_group;
1335 1335
1336 struct completion *vfork_done; /* for vfork() */ 1336 struct completion *vfork_done; /* for vfork() */
1337 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1337 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1338 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1338 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1339 1339
1340 cputime_t utime, stime, utimescaled, stimescaled; 1340 cputime_t utime, stime, utimescaled, stimescaled;
1341 cputime_t gtime; 1341 cputime_t gtime;
1342 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 1342 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
1343 cputime_t prev_utime, prev_stime; 1343 cputime_t prev_utime, prev_stime;
1344 #endif 1344 #endif
1345 unsigned long nvcsw, nivcsw; /* context switch counts */ 1345 unsigned long nvcsw, nivcsw; /* context switch counts */
1346 struct timespec start_time; /* monotonic time */ 1346 struct timespec start_time; /* monotonic time */
1347 struct timespec real_start_time; /* boot based time */ 1347 struct timespec real_start_time; /* boot based time */
1348 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1348 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1349 unsigned long min_flt, maj_flt; 1349 unsigned long min_flt, maj_flt;
1350 1350
1351 struct task_cputime cputime_expires; 1351 struct task_cputime cputime_expires;
1352 struct list_head cpu_timers[3]; 1352 struct list_head cpu_timers[3];
1353 1353
1354 /* process credentials */ 1354 /* process credentials */
1355 const struct cred __rcu *real_cred; /* objective and real subjective task 1355 const struct cred __rcu *real_cred; /* objective and real subjective task
1356 * credentials (COW) */ 1356 * credentials (COW) */
1357 const struct cred __rcu *cred; /* effective (overridable) subjective task 1357 const struct cred __rcu *cred; /* effective (overridable) subjective task
1358 * credentials (COW) */ 1358 * credentials (COW) */
1359 char comm[TASK_COMM_LEN]; /* executable name excluding path 1359 char comm[TASK_COMM_LEN]; /* executable name excluding path
1360 - access with [gs]et_task_comm (which lock 1360 - access with [gs]et_task_comm (which lock
1361 it with task_lock()) 1361 it with task_lock())
1362 - initialized normally by setup_new_exec */ 1362 - initialized normally by setup_new_exec */
1363 /* file system info */ 1363 /* file system info */
1364 int link_count, total_link_count; 1364 int link_count, total_link_count;
1365 #ifdef CONFIG_SYSVIPC 1365 #ifdef CONFIG_SYSVIPC
1366 /* ipc stuff */ 1366 /* ipc stuff */
1367 struct sysv_sem sysvsem; 1367 struct sysv_sem sysvsem;
1368 #endif 1368 #endif
1369 #ifdef CONFIG_DETECT_HUNG_TASK 1369 #ifdef CONFIG_DETECT_HUNG_TASK
1370 /* hung task detection */ 1370 /* hung task detection */
1371 unsigned long last_switch_count; 1371 unsigned long last_switch_count;
1372 #endif 1372 #endif
1373 /* CPU-specific state of this task */ 1373 /* CPU-specific state of this task */
1374 struct thread_struct thread; 1374 struct thread_struct thread;
1375 /* filesystem information */ 1375 /* filesystem information */
1376 struct fs_struct *fs; 1376 struct fs_struct *fs;
1377 /* open file information */ 1377 /* open file information */
1378 struct files_struct *files; 1378 struct files_struct *files;
1379 /* namespaces */ 1379 /* namespaces */
1380 struct nsproxy *nsproxy; 1380 struct nsproxy *nsproxy;
1381 /* signal handlers */ 1381 /* signal handlers */
1382 struct signal_struct *signal; 1382 struct signal_struct *signal;
1383 struct sighand_struct *sighand; 1383 struct sighand_struct *sighand;
1384 1384
1385 sigset_t blocked, real_blocked; 1385 sigset_t blocked, real_blocked;
1386 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ 1386 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
1387 struct sigpending pending; 1387 struct sigpending pending;
1388 1388
1389 unsigned long sas_ss_sp; 1389 unsigned long sas_ss_sp;
1390 size_t sas_ss_size; 1390 size_t sas_ss_size;
1391 int (*notifier)(void *priv); 1391 int (*notifier)(void *priv);
1392 void *notifier_data; 1392 void *notifier_data;
1393 sigset_t *notifier_mask; 1393 sigset_t *notifier_mask;
1394 struct callback_head *task_works; 1394 struct callback_head *task_works;
1395 1395
1396 struct audit_context *audit_context; 1396 struct audit_context *audit_context;
1397 #ifdef CONFIG_AUDITSYSCALL 1397 #ifdef CONFIG_AUDITSYSCALL
1398 kuid_t loginuid; 1398 kuid_t loginuid;
1399 unsigned int sessionid; 1399 unsigned int sessionid;
1400 #endif 1400 #endif
1401 struct seccomp seccomp; 1401 struct seccomp seccomp;
1402 1402
1403 /* Thread group tracking */ 1403 /* Thread group tracking */
1404 u32 parent_exec_id; 1404 u32 parent_exec_id;
1405 u32 self_exec_id; 1405 u32 self_exec_id;
1406 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, 1406 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
1407 * mempolicy */ 1407 * mempolicy */
1408 spinlock_t alloc_lock; 1408 spinlock_t alloc_lock;
1409 1409
1410 /* Protection of the PI data structures: */ 1410 /* Protection of the PI data structures: */
1411 raw_spinlock_t pi_lock; 1411 raw_spinlock_t pi_lock;
1412 1412
1413 #ifdef CONFIG_RT_MUTEXES 1413 #ifdef CONFIG_RT_MUTEXES
1414 /* PI waiters blocked on a rt_mutex held by this task */ 1414 /* PI waiters blocked on a rt_mutex held by this task */
1415 struct plist_head pi_waiters; 1415 struct plist_head pi_waiters;
1416 /* Deadlock detection and priority inheritance handling */ 1416 /* Deadlock detection and priority inheritance handling */
1417 struct rt_mutex_waiter *pi_blocked_on; 1417 struct rt_mutex_waiter *pi_blocked_on;
1418 #endif 1418 #endif
1419 1419
1420 #ifdef CONFIG_DEBUG_MUTEXES 1420 #ifdef CONFIG_DEBUG_MUTEXES
1421 /* mutex deadlock detection */ 1421 /* mutex deadlock detection */
1422 struct mutex_waiter *blocked_on; 1422 struct mutex_waiter *blocked_on;
1423 #endif 1423 #endif
1424 #ifdef CONFIG_TRACE_IRQFLAGS 1424 #ifdef CONFIG_TRACE_IRQFLAGS
1425 unsigned int irq_events; 1425 unsigned int irq_events;
1426 unsigned long hardirq_enable_ip; 1426 unsigned long hardirq_enable_ip;
1427 unsigned long hardirq_disable_ip; 1427 unsigned long hardirq_disable_ip;
1428 unsigned int hardirq_enable_event; 1428 unsigned int hardirq_enable_event;
1429 unsigned int hardirq_disable_event; 1429 unsigned int hardirq_disable_event;
1430 int hardirqs_enabled; 1430 int hardirqs_enabled;
1431 int hardirq_context; 1431 int hardirq_context;
1432 unsigned long softirq_disable_ip; 1432 unsigned long softirq_disable_ip;
1433 unsigned long softirq_enable_ip; 1433 unsigned long softirq_enable_ip;
1434 unsigned int softirq_disable_event; 1434 unsigned int softirq_disable_event;
1435 unsigned int softirq_enable_event; 1435 unsigned int softirq_enable_event;
1436 int softirqs_enabled; 1436 int softirqs_enabled;
1437 int softirq_context; 1437 int softirq_context;
1438 #endif 1438 #endif
1439 #ifdef CONFIG_LOCKDEP 1439 #ifdef CONFIG_LOCKDEP
1440 # define MAX_LOCK_DEPTH 48UL 1440 # define MAX_LOCK_DEPTH 48UL
1441 u64 curr_chain_key; 1441 u64 curr_chain_key;
1442 int lockdep_depth; 1442 int lockdep_depth;
1443 unsigned int lockdep_recursion; 1443 unsigned int lockdep_recursion;
1444 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1444 struct held_lock held_locks[MAX_LOCK_DEPTH];
1445 gfp_t lockdep_reclaim_gfp; 1445 gfp_t lockdep_reclaim_gfp;
1446 #endif 1446 #endif
1447 1447
1448 /* journalling filesystem info */ 1448 /* journalling filesystem info */
1449 void *journal_info; 1449 void *journal_info;
1450 1450
1451 /* stacked block device info */ 1451 /* stacked block device info */
1452 struct bio_list *bio_list; 1452 struct bio_list *bio_list;
1453 1453
1454 #ifdef CONFIG_BLOCK 1454 #ifdef CONFIG_BLOCK
1455 /* stack plugging */ 1455 /* stack plugging */
1456 struct blk_plug *plug; 1456 struct blk_plug *plug;
1457 #endif 1457 #endif
1458 1458
1459 /* VM state */ 1459 /* VM state */
1460 struct reclaim_state *reclaim_state; 1460 struct reclaim_state *reclaim_state;
1461 1461
1462 struct backing_dev_info *backing_dev_info; 1462 struct backing_dev_info *backing_dev_info;
1463 1463
1464 struct io_context *io_context; 1464 struct io_context *io_context;
1465 1465
1466 unsigned long ptrace_message; 1466 unsigned long ptrace_message;
1467 siginfo_t *last_siginfo; /* For ptrace use. */ 1467 siginfo_t *last_siginfo; /* For ptrace use. */
1468 struct task_io_accounting ioac; 1468 struct task_io_accounting ioac;
1469 #if defined(CONFIG_TASK_XACCT) 1469 #if defined(CONFIG_TASK_XACCT)
1470 u64 acct_rss_mem1; /* accumulated rss usage */ 1470 u64 acct_rss_mem1; /* accumulated rss usage */
1471 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1471 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1472 cputime_t acct_timexpd; /* stime + utime since last update */ 1472 cputime_t acct_timexpd; /* stime + utime since last update */
1473 #endif 1473 #endif
1474 #ifdef CONFIG_CPUSETS 1474 #ifdef CONFIG_CPUSETS
1475 nodemask_t mems_allowed; /* Protected by alloc_lock */ 1475 nodemask_t mems_allowed; /* Protected by alloc_lock */
1476 seqcount_t mems_allowed_seq; /* Sequence number to catch updates */ 1476 seqcount_t mems_allowed_seq; /* Sequence number to catch updates */
1477 int cpuset_mem_spread_rotor; 1477 int cpuset_mem_spread_rotor;
1478 int cpuset_slab_spread_rotor; 1478 int cpuset_slab_spread_rotor;
1479 #endif 1479 #endif
1480 #ifdef CONFIG_CGROUPS 1480 #ifdef CONFIG_CGROUPS
1481 /* Control Group info protected by css_set_lock */ 1481 /* Control Group info protected by css_set_lock */
1482 struct css_set __rcu *cgroups; 1482 struct css_set __rcu *cgroups;
1483 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1483 /* cg_list protected by css_set_lock and tsk->alloc_lock */
1484 struct list_head cg_list; 1484 struct list_head cg_list;
1485 #endif 1485 #endif
1486 #ifdef CONFIG_FUTEX 1486 #ifdef CONFIG_FUTEX
1487 struct robust_list_head __user *robust_list; 1487 struct robust_list_head __user *robust_list;
1488 #ifdef CONFIG_COMPAT 1488 #ifdef CONFIG_COMPAT
1489 struct compat_robust_list_head __user *compat_robust_list; 1489 struct compat_robust_list_head __user *compat_robust_list;
1490 #endif 1490 #endif
1491 struct list_head pi_state_list; 1491 struct list_head pi_state_list;
1492 struct futex_pi_state *pi_state_cache; 1492 struct futex_pi_state *pi_state_cache;
1493 #endif 1493 #endif
1494 #ifdef CONFIG_PERF_EVENTS 1494 #ifdef CONFIG_PERF_EVENTS
1495 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; 1495 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
1496 struct mutex perf_event_mutex; 1496 struct mutex perf_event_mutex;
1497 struct list_head perf_event_list; 1497 struct list_head perf_event_list;
1498 #endif 1498 #endif
1499 #ifdef CONFIG_NUMA 1499 #ifdef CONFIG_NUMA
1500 struct mempolicy *mempolicy; /* Protected by alloc_lock */ 1500 struct mempolicy *mempolicy; /* Protected by alloc_lock */
1501 short il_next; 1501 short il_next;
1502 short pref_node_fork; 1502 short pref_node_fork;
1503 #endif 1503 #endif
1504 struct rcu_head rcu; 1504 struct rcu_head rcu;
1505 1505
1506 /* 1506 /*
1507 * cache last used pipe for splice 1507 * cache last used pipe for splice
1508 */ 1508 */
1509 struct pipe_inode_info *splice_pipe; 1509 struct pipe_inode_info *splice_pipe;
1510 1510
1511 struct page_frag task_frag; 1511 struct page_frag task_frag;
1512 1512
1513 #ifdef CONFIG_TASK_DELAY_ACCT 1513 #ifdef CONFIG_TASK_DELAY_ACCT
1514 struct task_delay_info *delays; 1514 struct task_delay_info *delays;
1515 #endif 1515 #endif
1516 #ifdef CONFIG_FAULT_INJECTION 1516 #ifdef CONFIG_FAULT_INJECTION
1517 int make_it_fail; 1517 int make_it_fail;
1518 #endif 1518 #endif
1519 /* 1519 /*
1520 * when (nr_dirtied >= nr_dirtied_pause), it's time to call 1520 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
1521 * balance_dirty_pages() for some dirty throttling pause 1521 * balance_dirty_pages() for some dirty throttling pause
1522 */ 1522 */
1523 int nr_dirtied; 1523 int nr_dirtied;
1524 int nr_dirtied_pause; 1524 int nr_dirtied_pause;
1525 unsigned long dirty_paused_when; /* start of a write-and-pause period */ 1525 unsigned long dirty_paused_when; /* start of a write-and-pause period */
1526 1526
1527 #ifdef CONFIG_LATENCYTOP 1527 #ifdef CONFIG_LATENCYTOP
1528 int latency_record_count; 1528 int latency_record_count;
1529 struct latency_record latency_record[LT_SAVECOUNT]; 1529 struct latency_record latency_record[LT_SAVECOUNT];
1530 #endif 1530 #endif
1531 /* 1531 /*
1532 * time slack values; these are used to round up poll() and 1532 * time slack values; these are used to round up poll() and
1533 * select() etc timeout values. These are in nanoseconds. 1533 * select() etc timeout values. These are in nanoseconds.
1534 */ 1534 */
1535 unsigned long timer_slack_ns; 1535 unsigned long timer_slack_ns;
1536 unsigned long default_timer_slack_ns; 1536 unsigned long default_timer_slack_ns;
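These two fields back the PR_SET_TIMERSLACK/PR_GET_TIMERSLACK prctl()s; a hedged userspace sketch of adjusting them (assumes <sys/prctl.h> exposes the constants):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* Widen the slack to 1 ms so poll()/select() timeouts can be coalesced. */
	if (prctl(PR_SET_TIMERSLACK, 1000000UL, 0, 0, 0))
		perror("PR_SET_TIMERSLACK");

	printf("timer slack is now %d ns\n", prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));
	return 0;
}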
1537 1537
1538 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1538 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
1539 /* Index of current stored address in ret_stack */ 1539 /* Index of current stored address in ret_stack */
1540 int curr_ret_stack; 1540 int curr_ret_stack;
1541 /* Stack of return addresses for return function tracing */ 1541 /* Stack of return addresses for return function tracing */
1542 struct ftrace_ret_stack *ret_stack; 1542 struct ftrace_ret_stack *ret_stack;
1543 /* time stamp for last schedule */ 1543 /* time stamp for last schedule */
1544 unsigned long long ftrace_timestamp; 1544 unsigned long long ftrace_timestamp;
1545 /* 1545 /*
1546 * Number of functions that haven't been traced 1546 * Number of functions that haven't been traced
1547 * because of depth overrun. 1547 * because of depth overrun.
1548 */ 1548 */
1549 atomic_t trace_overrun; 1549 atomic_t trace_overrun;
1550 /* Pause for the tracing */ 1550 /* Pause for the tracing */
1551 atomic_t tracing_graph_pause; 1551 atomic_t tracing_graph_pause;
1552 #endif 1552 #endif
1553 #ifdef CONFIG_TRACING 1553 #ifdef CONFIG_TRACING
1554 /* state flags for use by tracers */ 1554 /* state flags for use by tracers */
1555 unsigned long trace; 1555 unsigned long trace;
1556 /* bitmask and counter of trace recursion */ 1556 /* bitmask and counter of trace recursion */
1557 unsigned long trace_recursion; 1557 unsigned long trace_recursion;
1558 #endif /* CONFIG_TRACING */ 1558 #endif /* CONFIG_TRACING */
1559 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ 1559 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
1560 struct memcg_batch_info { 1560 struct memcg_batch_info {
1561 int do_batch; /* incremented when batch uncharge started */ 1561 int do_batch; /* incremented when batch uncharge started */
1562 struct mem_cgroup *memcg; /* target memcg of uncharge */ 1562 struct mem_cgroup *memcg; /* target memcg of uncharge */
1563 unsigned long nr_pages; /* uncharged usage */ 1563 unsigned long nr_pages; /* uncharged usage */
1564 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ 1564 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
1565 } memcg_batch; 1565 } memcg_batch;
1566 #endif 1566 #endif
1567 #ifdef CONFIG_HAVE_HW_BREAKPOINT 1567 #ifdef CONFIG_HAVE_HW_BREAKPOINT
1568 atomic_t ptrace_bp_refcnt; 1568 atomic_t ptrace_bp_refcnt;
1569 #endif 1569 #endif
1570 #ifdef CONFIG_UPROBES 1570 #ifdef CONFIG_UPROBES
1571 struct uprobe_task *utask; 1571 struct uprobe_task *utask;
1572 #endif 1572 #endif
1573 }; 1573 };
1574 1574
1575 /* Future-safe accessor for struct task_struct's cpus_allowed. */ 1575 /* Future-safe accessor for struct task_struct's cpus_allowed. */
1576 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) 1576 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
1577 1577
1578 /* 1578 /*
1579 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 1579 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
1580 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH 1580 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
1581 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority 1581 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
1582 * values are inverted: lower p->prio value means higher priority. 1582 * values are inverted: lower p->prio value means higher priority.
1583 * 1583 *
1584 * The MAX_USER_RT_PRIO value allows the actual maximum 1584 * The MAX_USER_RT_PRIO value allows the actual maximum
1585 * RT priority to be separate from the value exported to 1585 * RT priority to be separate from the value exported to
1586 * user-space. This allows kernel threads to set their 1586 * user-space. This allows kernel threads to set their
1587 * priority to a value higher than any user task. Note: 1587 * priority to a value higher than any user task. Note:
1588 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 1588 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
1589 */ 1589 */
1590 1590
1591 #define MAX_USER_RT_PRIO 100 1591 #define MAX_USER_RT_PRIO 100
1592 #define MAX_RT_PRIO MAX_USER_RT_PRIO 1592 #define MAX_RT_PRIO MAX_USER_RT_PRIO
1593 1593
1594 #define MAX_PRIO (MAX_RT_PRIO + 40) 1594 #define MAX_PRIO (MAX_RT_PRIO + 40)
1595 #define DEFAULT_PRIO (MAX_RT_PRIO + 20) 1595 #define DEFAULT_PRIO (MAX_RT_PRIO + 20)
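As a sanity check of these constants, a standalone sketch of how nice values land in the MAX_RT_PRIO..MAX_PRIO-1 band; the mapping mirrors the kernel's NICE_TO_PRIO() macro, and nice_to_prio() here is a made-up local helper:

#include <stdio.h>

#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO
#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)

static int nice_to_prio(int nice)	/* hypothetical local helper */
{
	return MAX_RT_PRIO + nice + 20;
}

int main(void)
{
	printf("nice -20 -> prio %d\n", nice_to_prio(-20));	/* 100 == MAX_RT_PRIO  */
	printf("nice   0 -> prio %d\n", nice_to_prio(0));	/* 120 == DEFAULT_PRIO */
	printf("nice  19 -> prio %d\n", nice_to_prio(19));	/* 139 == MAX_PRIO - 1 */
	return 0;
}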
1596 1596
1597 static inline int rt_prio(int prio) 1597 static inline int rt_prio(int prio)
1598 { 1598 {
1599 if (unlikely(prio < MAX_RT_PRIO)) 1599 if (unlikely(prio < MAX_RT_PRIO))
1600 return 1; 1600 return 1;
1601 return 0; 1601 return 0;
1602 } 1602 }
1603 1603
1604 static inline int rt_task(struct task_struct *p) 1604 static inline int rt_task(struct task_struct *p)
1605 { 1605 {
1606 return rt_prio(p->prio); 1606 return rt_prio(p->prio);
1607 } 1607 }
1608 1608
1609 static inline struct pid *task_pid(struct task_struct *task) 1609 static inline struct pid *task_pid(struct task_struct *task)
1610 { 1610 {
1611 return task->pids[PIDTYPE_PID].pid; 1611 return task->pids[PIDTYPE_PID].pid;
1612 } 1612 }
1613 1613
1614 static inline struct pid *task_tgid(struct task_struct *task) 1614 static inline struct pid *task_tgid(struct task_struct *task)
1615 { 1615 {
1616 return task->group_leader->pids[PIDTYPE_PID].pid; 1616 return task->group_leader->pids[PIDTYPE_PID].pid;
1617 } 1617 }
1618 1618
1619 /* 1619 /*
1620 * Without tasklist or rcu lock it is not safe to dereference 1620 * Without tasklist or rcu lock it is not safe to dereference
1621 * the result of task_pgrp/task_session even if task == current, 1621 * the result of task_pgrp/task_session even if task == current,
1622 * we can race with another thread doing sys_setsid/sys_setpgid. 1622 * we can race with another thread doing sys_setsid/sys_setpgid.
1623 */ 1623 */
1624 static inline struct pid *task_pgrp(struct task_struct *task) 1624 static inline struct pid *task_pgrp(struct task_struct *task)
1625 { 1625 {
1626 return task->group_leader->pids[PIDTYPE_PGID].pid; 1626 return task->group_leader->pids[PIDTYPE_PGID].pid;
1627 } 1627 }
1628 1628
1629 static inline struct pid *task_session(struct task_struct *task) 1629 static inline struct pid *task_session(struct task_struct *task)
1630 { 1630 {
1631 return task->group_leader->pids[PIDTYPE_SID].pid; 1631 return task->group_leader->pids[PIDTYPE_SID].pid;
1632 } 1632 }
1633 1633
1634 struct pid_namespace; 1634 struct pid_namespace;
1635 1635
1636 /* 1636 /*
1637 * the helpers to get the task's different pids as they are seen 1637 * the helpers to get the task's different pids as they are seen
1638 * from various namespaces 1638 * from various namespaces
1639 * 1639 *
1640 * task_xid_nr() : global id, i.e. the id seen from the init namespace; 1640 * task_xid_nr() : global id, i.e. the id seen from the init namespace;
1641 * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of 1641 * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
1642 * current. 1642 * current.
1643 * task_xid_nr_ns() : id seen from the ns specified; 1643 * task_xid_nr_ns() : id seen from the ns specified;
1644 * 1644 *
1645 * set_task_vxid() : assigns a virtual id to a task; 1645 * set_task_vxid() : assigns a virtual id to a task;
1646 * 1646 *
1647 * see also pid_nr() etc in include/linux/pid.h 1647 * see also pid_nr() etc in include/linux/pid.h
1648 */ 1648 */
1649 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, 1649 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
1650 struct pid_namespace *ns); 1650 struct pid_namespace *ns);
1651 1651
1652 static inline pid_t task_pid_nr(struct task_struct *tsk) 1652 static inline pid_t task_pid_nr(struct task_struct *tsk)
1653 { 1653 {
1654 return tsk->pid; 1654 return tsk->pid;
1655 } 1655 }
1656 1656
1657 static inline pid_t task_pid_nr_ns(struct task_struct *tsk, 1657 static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
1658 struct pid_namespace *ns) 1658 struct pid_namespace *ns)
1659 { 1659 {
1660 return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); 1660 return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
1661 } 1661 }
1662 1662
1663 static inline pid_t task_pid_vnr(struct task_struct *tsk) 1663 static inline pid_t task_pid_vnr(struct task_struct *tsk)
1664 { 1664 {
1665 return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); 1665 return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
1666 } 1666 }
1667 1667
1668 1668
1669 static inline pid_t task_tgid_nr(struct task_struct *tsk) 1669 static inline pid_t task_tgid_nr(struct task_struct *tsk)
1670 { 1670 {
1671 return tsk->tgid; 1671 return tsk->tgid;
1672 } 1672 }
1673 1673
1674 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); 1674 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
1675 1675
1676 static inline pid_t task_tgid_vnr(struct task_struct *tsk) 1676 static inline pid_t task_tgid_vnr(struct task_struct *tsk)
1677 { 1677 {
1678 return pid_vnr(task_tgid(tsk)); 1678 return pid_vnr(task_tgid(tsk));
1679 } 1679 }
1680 1680
1681 1681
1682 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, 1682 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
1683 struct pid_namespace *ns) 1683 struct pid_namespace *ns)
1684 { 1684 {
1685 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); 1685 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
1686 } 1686 }
1687 1687
1688 static inline pid_t task_pgrp_vnr(struct task_struct *tsk) 1688 static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
1689 { 1689 {
1690 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); 1690 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
1691 } 1691 }
1692 1692
1693 1693
1694 static inline pid_t task_session_nr_ns(struct task_struct *tsk, 1694 static inline pid_t task_session_nr_ns(struct task_struct *tsk,
1695 struct pid_namespace *ns) 1695 struct pid_namespace *ns)
1696 { 1696 {
1697 return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); 1697 return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
1698 } 1698 }
1699 1699
1700 static inline pid_t task_session_vnr(struct task_struct *tsk) 1700 static inline pid_t task_session_vnr(struct task_struct *tsk)
1701 { 1701 {
1702 return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); 1702 return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
1703 } 1703 }
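A hedged kernel-side sketch of choosing between these helpers; report_tgids() is hypothetical and assumes task_active_pid_ns() from <linux/pid_namespace.h> plus the usual printk helpers are available:

static void report_tgids(struct task_struct *tsk)
{
	pid_t global = task_tgid_nr(tsk);	/* id seen from the init namespace      */
	pid_t local  = task_tgid_vnr(tsk);	/* id seen from current's pid namespace */
	pid_t in_ns  = task_tgid_nr_ns(tsk, task_active_pid_ns(current));

	pr_info("tgid: global=%d local=%d ns=%d\n", global, local, in_ns);
}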
1704 1704
1705 /* obsolete, do not use */ 1705 /* obsolete, do not use */
1706 static inline pid_t task_pgrp_nr(struct task_struct *tsk) 1706 static inline pid_t task_pgrp_nr(struct task_struct *tsk)
1707 { 1707 {
1708 return task_pgrp_nr_ns(tsk, &init_pid_ns); 1708 return task_pgrp_nr_ns(tsk, &init_pid_ns);
1709 } 1709 }
1710 1710
1711 /** 1711 /**
1712 * pid_alive - check that a task structure is not stale 1712 * pid_alive - check that a task structure is not stale
1713 * @p: Task structure to be checked. 1713 * @p: Task structure to be checked.
1714 * 1714 *
1715 * Test if a process is not yet dead (at most zombie state). 1715 * Test if a process is not yet dead (at most zombie state).
1716 * If pid_alive fails, then pointers within the task structure 1716 * If pid_alive fails, then pointers within the task structure
1717 * can be stale and must not be dereferenced. 1717 * can be stale and must not be dereferenced.
1718 */ 1718 */
1719 static inline int pid_alive(struct task_struct *p) 1719 static inline int pid_alive(struct task_struct *p)
1720 { 1720 {
1721 return p->pids[PIDTYPE_PID].pid != NULL; 1721 return p->pids[PIDTYPE_PID].pid != NULL;
1722 } 1722 }
1723 1723
1724 /** 1724 /**
1725 * is_global_init - check if a task structure is init 1725 * is_global_init - check if a task structure is init
1726 * @tsk: Task structure to be checked. 1726 * @tsk: Task structure to be checked.
1727 * 1727 *
1728 * Check if a task structure is the first user space task the kernel created. 1728 * Check if a task structure is the first user space task the kernel created.
1729 */ 1729 */
1730 static inline int is_global_init(struct task_struct *tsk) 1730 static inline int is_global_init(struct task_struct *tsk)
1731 { 1731 {
1732 return tsk->pid == 1; 1732 return tsk->pid == 1;
1733 } 1733 }
1734 1734
1735 /* 1735 /*
1736 * is_container_init: 1736 * is_container_init:
1737 * check whether the task is init in its own pid namespace. 1737 * check whether the task is init in its own pid namespace.
1738 */ 1738 */
1739 extern int is_container_init(struct task_struct *tsk); 1739 extern int is_container_init(struct task_struct *tsk);
1740 1740
1741 extern struct pid *cad_pid; 1741 extern struct pid *cad_pid;
1742 1742
1743 extern void free_task(struct task_struct *tsk); 1743 extern void free_task(struct task_struct *tsk);
1744 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) 1744 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
1745 1745
1746 extern void __put_task_struct(struct task_struct *t); 1746 extern void __put_task_struct(struct task_struct *t);
1747 1747
1748 static inline void put_task_struct(struct task_struct *t) 1748 static inline void put_task_struct(struct task_struct *t)
1749 { 1749 {
1750 if (atomic_dec_and_test(&t->usage)) 1750 if (atomic_dec_and_test(&t->usage))
1751 __put_task_struct(t); 1751 __put_task_struct(t);
1752 } 1752 }
1753 1753
1754 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); 1754 extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
1755 extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); 1755 extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
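After the rename, callers that want the scaled, monotonicity-bounded per-group totals use the new name; a minimal sketch modelled loosely on do_sys_times() in kernel/sys.c (sample_group_cputime() itself is hypothetical):

static void sample_group_cputime(struct task_struct *p)
{
	cputime_t utime, stime;

	thread_group_cputime_adjusted(p, &utime, &stime);
	pr_debug("adjusted group cputime: utime=%llu stime=%llu\n",
		 (unsigned long long)utime, (unsigned long long)stime);
}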
1756 1756
1757 /* 1757 /*
1758 * Per process flags 1758 * Per process flags
1759 */ 1759 */
1760 #define PF_EXITING 0x00000004 /* getting shut down */ 1760 #define PF_EXITING 0x00000004 /* getting shut down */
1761 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1761 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1762 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ 1762 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
1763 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ 1763 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
1764 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1764 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1765 #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ 1765 #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
1766 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1766 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1767 #define PF_DUMPCORE 0x00000200 /* dumped core */ 1767 #define PF_DUMPCORE 0x00000200 /* dumped core */
1768 #define PF_SIGNALED 0x00000400 /* killed by a signal */ 1768 #define PF_SIGNALED 0x00000400 /* killed by a signal */
1769 #define PF_MEMALLOC 0x00000800 /* Allocating memory */ 1769 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
1770 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ 1770 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */
1771 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ 1771 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
1772 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ 1772 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
1773 #define PF_FROZEN 0x00010000 /* frozen for system suspend */ 1773 #define PF_FROZEN 0x00010000 /* frozen for system suspend */
1774 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ 1774 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
1775 #define PF_KSWAPD 0x00040000 /* I am kswapd */ 1775 #define PF_KSWAPD 0x00040000 /* I am kswapd */
1776 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ 1776 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
1777 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1777 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
1778 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ 1778 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
1779 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1779 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1780 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1780 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1781 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1781 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1782 #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ 1782 #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
1783 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1783 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1784 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1784 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1785 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1785 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1786 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1786 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
1787 1787
1788 /* 1788 /*
1789 * Only the _current_ task can read/write to tsk->flags, but other 1789 * Only the _current_ task can read/write to tsk->flags, but other
1790 * tasks can access tsk->flags in readonly mode for example 1790 * tasks can access tsk->flags in readonly mode for example
1791 * with tsk_used_math (like during threaded core dumping). 1791 * with tsk_used_math (like during threaded core dumping).
1792 * There is however an exception to this rule during ptrace 1792 * There is however an exception to this rule during ptrace
1793 * or during fork: the ptracer task is allowed to write to the 1793 * or during fork: the ptracer task is allowed to write to the
1794 * child->flags of its traced child (same goes for fork, the parent 1794 * child->flags of its traced child (same goes for fork, the parent
1795 * can write to the child->flags), because we're guaranteed the 1795 * can write to the child->flags), because we're guaranteed the
1796 * child is not running and in turn not changing child->flags 1796 * child is not running and in turn not changing child->flags
1797 * at the same time the parent does it. 1797 * at the same time the parent does it.
1798 */ 1798 */
1799 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) 1799 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
1800 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) 1800 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
1801 #define clear_used_math() clear_stopped_child_used_math(current) 1801 #define clear_used_math() clear_stopped_child_used_math(current)
1802 #define set_used_math() set_stopped_child_used_math(current) 1802 #define set_used_math() set_stopped_child_used_math(current)
1803 #define conditional_stopped_child_used_math(condition, child) \ 1803 #define conditional_stopped_child_used_math(condition, child) \
1804 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) 1804 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
1805 #define conditional_used_math(condition) \ 1805 #define conditional_used_math(condition) \
1806 conditional_stopped_child_used_math(condition, current) 1806 conditional_stopped_child_used_math(condition, current)
1807 #define copy_to_stopped_child_used_math(child) \ 1807 #define copy_to_stopped_child_used_math(child) \
1808 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) 1808 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
1809 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ 1809 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
1810 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1810 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1811 #define used_math() tsk_used_math(current) 1811 #define used_math() tsk_used_math(current)
1812 1812
1813 /* 1813 /*
1814 * task->jobctl flags 1814 * task->jobctl flags
1815 */ 1815 */
1816 #define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ 1816 #define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */
1817 1817
1818 #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ 1818 #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */
1819 #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ 1819 #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */
1820 #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ 1820 #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */
1821 #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ 1821 #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */
1822 #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ 1822 #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
1823 #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ 1823 #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
1824 #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ 1824 #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
1825 1825
1826 #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) 1826 #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT)
1827 #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) 1827 #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT)
1828 #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) 1828 #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT)
1829 #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) 1829 #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT)
1830 #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) 1830 #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT)
1831 #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) 1831 #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT)
1832 #define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) 1832 #define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT)
1833 1833
1834 #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) 1834 #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
1835 #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) 1835 #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
1836 1836
1837 extern bool task_set_jobctl_pending(struct task_struct *task, 1837 extern bool task_set_jobctl_pending(struct task_struct *task,
1838 unsigned int mask); 1838 unsigned int mask);
1839 extern void task_clear_jobctl_trapping(struct task_struct *task); 1839 extern void task_clear_jobctl_trapping(struct task_struct *task);
1840 extern void task_clear_jobctl_pending(struct task_struct *task, 1840 extern void task_clear_jobctl_pending(struct task_struct *task,
1841 unsigned int mask); 1841 unsigned int mask);
1842 1842
1843 #ifdef CONFIG_PREEMPT_RCU 1843 #ifdef CONFIG_PREEMPT_RCU
1844 1844
1845 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ 1845 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1846 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ 1846 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
1847 1847
1848 static inline void rcu_copy_process(struct task_struct *p) 1848 static inline void rcu_copy_process(struct task_struct *p)
1849 { 1849 {
1850 p->rcu_read_lock_nesting = 0; 1850 p->rcu_read_lock_nesting = 0;
1851 p->rcu_read_unlock_special = 0; 1851 p->rcu_read_unlock_special = 0;
1852 #ifdef CONFIG_TREE_PREEMPT_RCU 1852 #ifdef CONFIG_TREE_PREEMPT_RCU
1853 p->rcu_blocked_node = NULL; 1853 p->rcu_blocked_node = NULL;
1854 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1854 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1855 #ifdef CONFIG_RCU_BOOST 1855 #ifdef CONFIG_RCU_BOOST
1856 p->rcu_boost_mutex = NULL; 1856 p->rcu_boost_mutex = NULL;
1857 #endif /* #ifdef CONFIG_RCU_BOOST */ 1857 #endif /* #ifdef CONFIG_RCU_BOOST */
1858 INIT_LIST_HEAD(&p->rcu_node_entry); 1858 INIT_LIST_HEAD(&p->rcu_node_entry);
1859 } 1859 }
1860 1860
1861 #else 1861 #else
1862 1862
1863 static inline void rcu_copy_process(struct task_struct *p) 1863 static inline void rcu_copy_process(struct task_struct *p)
1864 { 1864 {
1865 } 1865 }
1866 1866
1867 #endif 1867 #endif
1868 1868
1869 static inline void rcu_switch(struct task_struct *prev, 1869 static inline void rcu_switch(struct task_struct *prev,
1870 struct task_struct *next) 1870 struct task_struct *next)
1871 { 1871 {
1872 #ifdef CONFIG_RCU_USER_QS 1872 #ifdef CONFIG_RCU_USER_QS
1873 rcu_user_hooks_switch(prev, next); 1873 rcu_user_hooks_switch(prev, next);
1874 #endif 1874 #endif
1875 } 1875 }
1876 1876
1877 static inline void tsk_restore_flags(struct task_struct *task, 1877 static inline void tsk_restore_flags(struct task_struct *task,
1878 unsigned long orig_flags, unsigned long flags) 1878 unsigned long orig_flags, unsigned long flags)
1879 { 1879 {
1880 task->flags &= ~flags; 1880 task->flags &= ~flags;
1881 task->flags |= orig_flags & flags; 1881 task->flags |= orig_flags & flags;
1882 } 1882 }
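tsk_restore_flags() exists so a flag can be set temporarily without clobbering a caller that already had it set; a hedged sketch of the usual PF_MEMALLOC pattern (do_emergency_alloc() is hypothetical):

static int with_memalloc(void)
{
	unsigned long pflags = current->flags;
	int ret;

	current->flags |= PF_MEMALLOC;		/* allow dipping into reserves */
	ret = do_emergency_alloc();		/* hypothetical allocation path */
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}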
1883 1883
1884 #ifdef CONFIG_SMP 1884 #ifdef CONFIG_SMP
1885 extern void do_set_cpus_allowed(struct task_struct *p, 1885 extern void do_set_cpus_allowed(struct task_struct *p,
1886 const struct cpumask *new_mask); 1886 const struct cpumask *new_mask);
1887 1887
1888 extern int set_cpus_allowed_ptr(struct task_struct *p, 1888 extern int set_cpus_allowed_ptr(struct task_struct *p,
1889 const struct cpumask *new_mask); 1889 const struct cpumask *new_mask);
1890 #else 1890 #else
1891 static inline void do_set_cpus_allowed(struct task_struct *p, 1891 static inline void do_set_cpus_allowed(struct task_struct *p,
1892 const struct cpumask *new_mask) 1892 const struct cpumask *new_mask)
1893 { 1893 {
1894 } 1894 }
1895 static inline int set_cpus_allowed_ptr(struct task_struct *p, 1895 static inline int set_cpus_allowed_ptr(struct task_struct *p,
1896 const struct cpumask *new_mask) 1896 const struct cpumask *new_mask)
1897 { 1897 {
1898 if (!cpumask_test_cpu(0, new_mask)) 1898 if (!cpumask_test_cpu(0, new_mask))
1899 return -EINVAL; 1899 return -EINVAL;
1900 return 0; 1900 return 0;
1901 } 1901 }
1902 #endif 1902 #endif
1903 1903
1904 #ifdef CONFIG_NO_HZ 1904 #ifdef CONFIG_NO_HZ
1905 void calc_load_enter_idle(void); 1905 void calc_load_enter_idle(void);
1906 void calc_load_exit_idle(void); 1906 void calc_load_exit_idle(void);
1907 #else 1907 #else
1908 static inline void calc_load_enter_idle(void) { } 1908 static inline void calc_load_enter_idle(void) { }
1909 static inline void calc_load_exit_idle(void) { } 1909 static inline void calc_load_exit_idle(void) { }
1910 #endif /* CONFIG_NO_HZ */ 1910 #endif /* CONFIG_NO_HZ */
1911 1911
1912 #ifndef CONFIG_CPUMASK_OFFSTACK 1912 #ifndef CONFIG_CPUMASK_OFFSTACK
1913 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 1913 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1914 { 1914 {
1915 return set_cpus_allowed_ptr(p, &new_mask); 1915 return set_cpus_allowed_ptr(p, &new_mask);
1916 } 1916 }
1917 #endif 1917 #endif
1918 1918
1919 /* 1919 /*
1920 * Do not use outside of architecture code which knows its limitations. 1920 * Do not use outside of architecture code which knows its limitations.
1921 * 1921 *
1922 * sched_clock() has no promise of monotonicity or bounded drift between 1922 * sched_clock() has no promise of monotonicity or bounded drift between
1923 * CPUs, and using it (which you should not) requires disabling IRQs. 1923 * CPUs, and using it (which you should not) requires disabling IRQs.
1924 * 1924 *
1925 * Please use one of the three interfaces below. 1925 * Please use one of the three interfaces below.
1926 */ 1926 */
1927 extern unsigned long long notrace sched_clock(void); 1927 extern unsigned long long notrace sched_clock(void);
1928 /* 1928 /*
1929 * See the comment in kernel/sched/clock.c 1929 * See the comment in kernel/sched/clock.c
1930 */ 1930 */
1931 extern u64 cpu_clock(int cpu); 1931 extern u64 cpu_clock(int cpu);
1932 extern u64 local_clock(void); 1932 extern u64 local_clock(void);
1933 extern u64 sched_clock_cpu(int cpu); 1933 extern u64 sched_clock_cpu(int cpu);
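Of the three interfaces above, local_clock() is the usual choice for cheap, CPU-local nanosecond timestamps; a hedged sketch (do_the_work() is hypothetical):

static void time_the_work(void)
{
	u64 start = local_clock();

	do_the_work();				/* hypothetical workload */

	pr_info("work took %llu ns\n",
		(unsigned long long)(local_clock() - start));
}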
1934 1934
1935 1935
1936 extern void sched_clock_init(void); 1936 extern void sched_clock_init(void);
1937 1937
1938 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 1938 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
1939 static inline void sched_clock_tick(void) 1939 static inline void sched_clock_tick(void)
1940 { 1940 {
1941 } 1941 }
1942 1942
1943 static inline void sched_clock_idle_sleep_event(void) 1943 static inline void sched_clock_idle_sleep_event(void)
1944 { 1944 {
1945 } 1945 }
1946 1946
1947 static inline void sched_clock_idle_wakeup_event(u64 delta_ns) 1947 static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
1948 { 1948 {
1949 } 1949 }
1950 #else 1950 #else
1951 /* 1951 /*
1952 * Architectures can set this to 1 if they have specified 1952 * Architectures can set this to 1 if they have specified
1953 * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, 1953 * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
1954 * but then during bootup it turns out that sched_clock() 1954 * but then during bootup it turns out that sched_clock()
1955 * is reliable after all: 1955 * is reliable after all:
1956 */ 1956 */
1957 extern int sched_clock_stable; 1957 extern int sched_clock_stable;
1958 1958
1959 extern void sched_clock_tick(void); 1959 extern void sched_clock_tick(void);
1960 extern void sched_clock_idle_sleep_event(void); 1960 extern void sched_clock_idle_sleep_event(void);
1961 extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1961 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1962 #endif 1962 #endif
1963 1963
1964 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1964 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1965 /* 1965 /*
1966 * An interface for runtime opt-in to IRQ time accounting based on sched_clock. 1966 * An interface for runtime opt-in to IRQ time accounting based on sched_clock.
1967 * The explicit opt-in avoids a performance penalty on systems with 1967 * The explicit opt-in avoids a performance penalty on systems with
1968 * slow sched_clock implementations. 1968 * slow sched_clock implementations.
1969 */ 1969 */
1970 extern void enable_sched_clock_irqtime(void); 1970 extern void enable_sched_clock_irqtime(void);
1971 extern void disable_sched_clock_irqtime(void); 1971 extern void disable_sched_clock_irqtime(void);
1972 #else 1972 #else
1973 static inline void enable_sched_clock_irqtime(void) {} 1973 static inline void enable_sched_clock_irqtime(void) {}
1974 static inline void disable_sched_clock_irqtime(void) {} 1974 static inline void disable_sched_clock_irqtime(void) {}
1975 #endif 1975 #endif
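As a sketch only (not part of this change): code that has established its sched_clock() is cheap could opt in during early boot roughly as below. The initcall name is hypothetical; in practice the call is made wherever the platform decides its clock is suitable.

static int __init example_irqtime_setup(void)	/* hypothetical */
{
	/* Opt in to IRQ time accounting; assumes sched_clock() is fast here. */
	enable_sched_clock_irqtime();
	return 0;
}
early_initcall(example_irqtime_setup);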
1976 1976
1977 extern unsigned long long 1977 extern unsigned long long
1978 task_sched_runtime(struct task_struct *task); 1978 task_sched_runtime(struct task_struct *task);
1979 1979
1980 /* sched_exec is called by processes performing an exec */ 1980 /* sched_exec is called by processes performing an exec */
1981 #ifdef CONFIG_SMP 1981 #ifdef CONFIG_SMP
1982 extern void sched_exec(void); 1982 extern void sched_exec(void);
1983 #else 1983 #else
1984 #define sched_exec() {} 1984 #define sched_exec() {}
1985 #endif 1985 #endif
1986 1986
1987 extern void sched_clock_idle_sleep_event(void); 1987 extern void sched_clock_idle_sleep_event(void);
1988 extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1988 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1989 1989
1990 #ifdef CONFIG_HOTPLUG_CPU 1990 #ifdef CONFIG_HOTPLUG_CPU
1991 extern void idle_task_exit(void); 1991 extern void idle_task_exit(void);
1992 #else 1992 #else
1993 static inline void idle_task_exit(void) {} 1993 static inline void idle_task_exit(void) {}
1994 #endif 1994 #endif
1995 1995
1996 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 1996 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
1997 extern void wake_up_idle_cpu(int cpu); 1997 extern void wake_up_idle_cpu(int cpu);
1998 #else 1998 #else
1999 static inline void wake_up_idle_cpu(int cpu) { } 1999 static inline void wake_up_idle_cpu(int cpu) { }
2000 #endif 2000 #endif
2001 2001
2002 extern unsigned int sysctl_sched_latency; 2002 extern unsigned int sysctl_sched_latency;
2003 extern unsigned int sysctl_sched_min_granularity; 2003 extern unsigned int sysctl_sched_min_granularity;
2004 extern unsigned int sysctl_sched_wakeup_granularity; 2004 extern unsigned int sysctl_sched_wakeup_granularity;
2005 extern unsigned int sysctl_sched_child_runs_first; 2005 extern unsigned int sysctl_sched_child_runs_first;
2006 2006
2007 enum sched_tunable_scaling { 2007 enum sched_tunable_scaling {
2008 SCHED_TUNABLESCALING_NONE, 2008 SCHED_TUNABLESCALING_NONE,
2009 SCHED_TUNABLESCALING_LOG, 2009 SCHED_TUNABLESCALING_LOG,
2010 SCHED_TUNABLESCALING_LINEAR, 2010 SCHED_TUNABLESCALING_LINEAR,
2011 SCHED_TUNABLESCALING_END, 2011 SCHED_TUNABLESCALING_END,
2012 }; 2012 };
2013 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; 2013 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
2014 2014
2015 #ifdef CONFIG_SCHED_DEBUG 2015 #ifdef CONFIG_SCHED_DEBUG
2016 extern unsigned int sysctl_sched_migration_cost; 2016 extern unsigned int sysctl_sched_migration_cost;
2017 extern unsigned int sysctl_sched_nr_migrate; 2017 extern unsigned int sysctl_sched_nr_migrate;
2018 extern unsigned int sysctl_sched_time_avg; 2018 extern unsigned int sysctl_sched_time_avg;
2019 extern unsigned int sysctl_timer_migration; 2019 extern unsigned int sysctl_timer_migration;
2020 extern unsigned int sysctl_sched_shares_window; 2020 extern unsigned int sysctl_sched_shares_window;
2021 2021
2022 int sched_proc_update_handler(struct ctl_table *table, int write, 2022 int sched_proc_update_handler(struct ctl_table *table, int write,
2023 void __user *buffer, size_t *length, 2023 void __user *buffer, size_t *length,
2024 loff_t *ppos); 2024 loff_t *ppos);
2025 #endif 2025 #endif
2026 #ifdef CONFIG_SCHED_DEBUG 2026 #ifdef CONFIG_SCHED_DEBUG
2027 static inline unsigned int get_sysctl_timer_migration(void) 2027 static inline unsigned int get_sysctl_timer_migration(void)
2028 { 2028 {
2029 return sysctl_timer_migration; 2029 return sysctl_timer_migration;
2030 } 2030 }
2031 #else 2031 #else
2032 static inline unsigned int get_sysctl_timer_migration(void) 2032 static inline unsigned int get_sysctl_timer_migration(void)
2033 { 2033 {
2034 return 1; 2034 return 1;
2035 } 2035 }
2036 #endif 2036 #endif
2037 extern unsigned int sysctl_sched_rt_period; 2037 extern unsigned int sysctl_sched_rt_period;
2038 extern int sysctl_sched_rt_runtime; 2038 extern int sysctl_sched_rt_runtime;
2039 2039
2040 int sched_rt_handler(struct ctl_table *table, int write, 2040 int sched_rt_handler(struct ctl_table *table, int write,
2041 void __user *buffer, size_t *lenp, 2041 void __user *buffer, size_t *lenp,
2042 loff_t *ppos); 2042 loff_t *ppos);
2043 2043
2044 #ifdef CONFIG_SCHED_AUTOGROUP 2044 #ifdef CONFIG_SCHED_AUTOGROUP
2045 extern unsigned int sysctl_sched_autogroup_enabled; 2045 extern unsigned int sysctl_sched_autogroup_enabled;
2046 2046
2047 extern void sched_autogroup_create_attach(struct task_struct *p); 2047 extern void sched_autogroup_create_attach(struct task_struct *p);
2048 extern void sched_autogroup_detach(struct task_struct *p); 2048 extern void sched_autogroup_detach(struct task_struct *p);
2049 extern void sched_autogroup_fork(struct signal_struct *sig); 2049 extern void sched_autogroup_fork(struct signal_struct *sig);
2050 extern void sched_autogroup_exit(struct signal_struct *sig); 2050 extern void sched_autogroup_exit(struct signal_struct *sig);
2051 #ifdef CONFIG_PROC_FS 2051 #ifdef CONFIG_PROC_FS
2052 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); 2052 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
2053 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); 2053 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
2054 #endif 2054 #endif
2055 #else 2055 #else
2056 static inline void sched_autogroup_create_attach(struct task_struct *p) { } 2056 static inline void sched_autogroup_create_attach(struct task_struct *p) { }
2057 static inline void sched_autogroup_detach(struct task_struct *p) { } 2057 static inline void sched_autogroup_detach(struct task_struct *p) { }
2058 static inline void sched_autogroup_fork(struct signal_struct *sig) { } 2058 static inline void sched_autogroup_fork(struct signal_struct *sig) { }
2059 static inline void sched_autogroup_exit(struct signal_struct *sig) { } 2059 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
2060 #endif 2060 #endif
2061 2061
2062 #ifdef CONFIG_CFS_BANDWIDTH 2062 #ifdef CONFIG_CFS_BANDWIDTH
2063 extern unsigned int sysctl_sched_cfs_bandwidth_slice; 2063 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
2064 #endif 2064 #endif
2065 2065
2066 #ifdef CONFIG_RT_MUTEXES 2066 #ifdef CONFIG_RT_MUTEXES
2067 extern int rt_mutex_getprio(struct task_struct *p); 2067 extern int rt_mutex_getprio(struct task_struct *p);
2068 extern void rt_mutex_setprio(struct task_struct *p, int prio); 2068 extern void rt_mutex_setprio(struct task_struct *p, int prio);
2069 extern void rt_mutex_adjust_pi(struct task_struct *p); 2069 extern void rt_mutex_adjust_pi(struct task_struct *p);
2070 static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 2070 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
2071 { 2071 {
2072 return tsk->pi_blocked_on != NULL; 2072 return tsk->pi_blocked_on != NULL;
2073 } 2073 }
2074 #else 2074 #else
2075 static inline int rt_mutex_getprio(struct task_struct *p) 2075 static inline int rt_mutex_getprio(struct task_struct *p)
2076 { 2076 {
2077 return p->normal_prio; 2077 return p->normal_prio;
2078 } 2078 }
2079 # define rt_mutex_adjust_pi(p) do { } while (0) 2079 # define rt_mutex_adjust_pi(p) do { } while (0)
2080 static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 2080 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
2081 { 2081 {
2082 return false; 2082 return false;
2083 } 2083 }
2084 #endif 2084 #endif
2085 2085
2086 extern bool yield_to(struct task_struct *p, bool preempt); 2086 extern bool yield_to(struct task_struct *p, bool preempt);
2087 extern void set_user_nice(struct task_struct *p, long nice); 2087 extern void set_user_nice(struct task_struct *p, long nice);
2088 extern int task_prio(const struct task_struct *p); 2088 extern int task_prio(const struct task_struct *p);
2089 extern int task_nice(const struct task_struct *p); 2089 extern int task_nice(const struct task_struct *p);
2090 extern int can_nice(const struct task_struct *p, const int nice); 2090 extern int can_nice(const struct task_struct *p, const int nice);
2091 extern int task_curr(const struct task_struct *p); 2091 extern int task_curr(const struct task_struct *p);
2092 extern int idle_cpu(int cpu); 2092 extern int idle_cpu(int cpu);
2093 extern int sched_setscheduler(struct task_struct *, int, 2093 extern int sched_setscheduler(struct task_struct *, int,
2094 const struct sched_param *); 2094 const struct sched_param *);
2095 extern int sched_setscheduler_nocheck(struct task_struct *, int, 2095 extern int sched_setscheduler_nocheck(struct task_struct *, int,
2096 const struct sched_param *); 2096 const struct sched_param *);
2097 extern struct task_struct *idle_task(int cpu); 2097 extern struct task_struct *idle_task(int cpu);
2098 /** 2098 /**
2099 * is_idle_task - is the specified task an idle task? 2099 * is_idle_task - is the specified task an idle task?
2100 * @p: the task in question. 2100 * @p: the task in question.
2101 */ 2101 */
2102 static inline bool is_idle_task(const struct task_struct *p) 2102 static inline bool is_idle_task(const struct task_struct *p)
2103 { 2103 {
2104 return p->pid == 0; 2104 return p->pid == 0;
2105 } 2105 }
2106 extern struct task_struct *curr_task(int cpu); 2106 extern struct task_struct *curr_task(int cpu);
2107 extern void set_curr_task(int cpu, struct task_struct *p); 2107 extern void set_curr_task(int cpu, struct task_struct *p);
2108 2108
2109 void yield(void); 2109 void yield(void);
2110 2110
2111 /* 2111 /*
2112 * The default (Linux) execution domain. 2112 * The default (Linux) execution domain.
2113 */ 2113 */
2114 extern struct exec_domain default_exec_domain; 2114 extern struct exec_domain default_exec_domain;
2115 2115
2116 union thread_union { 2116 union thread_union {
2117 struct thread_info thread_info; 2117 struct thread_info thread_info;
2118 unsigned long stack[THREAD_SIZE/sizeof(long)]; 2118 unsigned long stack[THREAD_SIZE/sizeof(long)];
2119 }; 2119 };
2120 2120
2121 #ifndef __HAVE_ARCH_KSTACK_END 2121 #ifndef __HAVE_ARCH_KSTACK_END
2122 static inline int kstack_end(void *addr) 2122 static inline int kstack_end(void *addr)
2123 { 2123 {
2124 /* Reliable end of stack detection: 2124 /* Reliable end of stack detection:
2125 * Some APM BIOS versions misalign the stack 2125 * Some APM BIOS versions misalign the stack
2126 */ 2126 */
2127 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); 2127 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
2128 } 2128 }
2129 #endif 2129 #endif
2130 2130
2131 extern union thread_union init_thread_union; 2131 extern union thread_union init_thread_union;
2132 extern struct task_struct init_task; 2132 extern struct task_struct init_task;
2133 2133
2134 extern struct mm_struct init_mm; 2134 extern struct mm_struct init_mm;
2135 2135
2136 extern struct pid_namespace init_pid_ns; 2136 extern struct pid_namespace init_pid_ns;
2137 2137
2138 /* 2138 /*
2139 * find a task by one of its numerical ids 2139 * find a task by one of its numerical ids
2140 * 2140 *
2141 * find_task_by_pid_ns(): 2141 * find_task_by_pid_ns():
2142 * finds a task by its pid in the specified namespace 2142 * finds a task by its pid in the specified namespace
2143 * find_task_by_vpid(): 2143 * find_task_by_vpid():
2144 * finds a task by its virtual pid 2144 * finds a task by its virtual pid
2145 * 2145 *
2146 * see also find_vpid() etc in include/linux/pid.h 2146 * see also find_vpid() etc in include/linux/pid.h
2147 */ 2147 */
2148 2148
2149 extern struct task_struct *find_task_by_vpid(pid_t nr); 2149 extern struct task_struct *find_task_by_vpid(pid_t nr);
2150 extern struct task_struct *find_task_by_pid_ns(pid_t nr, 2150 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
2151 struct pid_namespace *ns); 2151 struct pid_namespace *ns);
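A minimal sketch of the usual RCU pattern around find_task_by_vpid(), assuming the caller wants to keep the task pinned after the lookup; the wrapper name is hypothetical.

static struct task_struct *example_get_task_by_vpid(pid_t vpid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(vpid);
	if (p)
		get_task_struct(p);	/* pin it before leaving the RCU section */
	rcu_read_unlock();

	return p;	/* caller must put_task_struct() when done */
}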
2152 2152
2153 extern void __set_special_pids(struct pid *pid); 2153 extern void __set_special_pids(struct pid *pid);
2154 2154
2155 /* per-UID process charging. */ 2155 /* per-UID process charging. */
2156 extern struct user_struct * alloc_uid(kuid_t); 2156 extern struct user_struct * alloc_uid(kuid_t);
2157 static inline struct user_struct *get_uid(struct user_struct *u) 2157 static inline struct user_struct *get_uid(struct user_struct *u)
2158 { 2158 {
2159 atomic_inc(&u->__count); 2159 atomic_inc(&u->__count);
2160 return u; 2160 return u;
2161 } 2161 }
2162 extern void free_uid(struct user_struct *); 2162 extern void free_uid(struct user_struct *);
2163 2163
2164 #include <asm/current.h> 2164 #include <asm/current.h>
2165 2165
2166 extern void xtime_update(unsigned long ticks); 2166 extern void xtime_update(unsigned long ticks);
2167 2167
2168 extern int wake_up_state(struct task_struct *tsk, unsigned int state); 2168 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
2169 extern int wake_up_process(struct task_struct *tsk); 2169 extern int wake_up_process(struct task_struct *tsk);
2170 extern void wake_up_new_task(struct task_struct *tsk); 2170 extern void wake_up_new_task(struct task_struct *tsk);
2171 #ifdef CONFIG_SMP 2171 #ifdef CONFIG_SMP
2172 extern void kick_process(struct task_struct *tsk); 2172 extern void kick_process(struct task_struct *tsk);
2173 #else 2173 #else
2174 static inline void kick_process(struct task_struct *tsk) { } 2174 static inline void kick_process(struct task_struct *tsk) { }
2175 #endif 2175 #endif
2176 extern void sched_fork(struct task_struct *p); 2176 extern void sched_fork(struct task_struct *p);
2177 extern void sched_dead(struct task_struct *p); 2177 extern void sched_dead(struct task_struct *p);
2178 2178
2179 extern void proc_caches_init(void); 2179 extern void proc_caches_init(void);
2180 extern void flush_signals(struct task_struct *); 2180 extern void flush_signals(struct task_struct *);
2181 extern void __flush_signals(struct task_struct *); 2181 extern void __flush_signals(struct task_struct *);
2182 extern void ignore_signals(struct task_struct *); 2182 extern void ignore_signals(struct task_struct *);
2183 extern void flush_signal_handlers(struct task_struct *, int force_default); 2183 extern void flush_signal_handlers(struct task_struct *, int force_default);
2184 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); 2184 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
2185 2185
2186 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 2186 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
2187 { 2187 {
2188 unsigned long flags; 2188 unsigned long flags;
2189 int ret; 2189 int ret;
2190 2190
2191 spin_lock_irqsave(&tsk->sighand->siglock, flags); 2191 spin_lock_irqsave(&tsk->sighand->siglock, flags);
2192 ret = dequeue_signal(tsk, mask, info); 2192 ret = dequeue_signal(tsk, mask, info);
2193 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 2193 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
2194 2194
2195 return ret; 2195 return ret;
2196 } 2196 }
2197 2197
2198 extern void block_all_signals(int (*notifier)(void *priv), void *priv, 2198 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
2199 sigset_t *mask); 2199 sigset_t *mask);
2200 extern void unblock_all_signals(void); 2200 extern void unblock_all_signals(void);
2201 extern void release_task(struct task_struct * p); 2201 extern void release_task(struct task_struct * p);
2202 extern int send_sig_info(int, struct siginfo *, struct task_struct *); 2202 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
2203 extern int force_sigsegv(int, struct task_struct *); 2203 extern int force_sigsegv(int, struct task_struct *);
2204 extern int force_sig_info(int, struct siginfo *, struct task_struct *); 2204 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
2205 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 2205 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
2206 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); 2206 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
2207 extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, 2207 extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
2208 const struct cred *, u32); 2208 const struct cred *, u32);
2209 extern int kill_pgrp(struct pid *pid, int sig, int priv); 2209 extern int kill_pgrp(struct pid *pid, int sig, int priv);
2210 extern int kill_pid(struct pid *pid, int sig, int priv); 2210 extern int kill_pid(struct pid *pid, int sig, int priv);
2211 extern int kill_proc_info(int, struct siginfo *, pid_t); 2211 extern int kill_proc_info(int, struct siginfo *, pid_t);
2212 extern __must_check bool do_notify_parent(struct task_struct *, int); 2212 extern __must_check bool do_notify_parent(struct task_struct *, int);
2213 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); 2213 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
2214 extern void force_sig(int, struct task_struct *); 2214 extern void force_sig(int, struct task_struct *);
2215 extern int send_sig(int, struct task_struct *, int); 2215 extern int send_sig(int, struct task_struct *, int);
2216 extern int zap_other_threads(struct task_struct *p); 2216 extern int zap_other_threads(struct task_struct *p);
2217 extern struct sigqueue *sigqueue_alloc(void); 2217 extern struct sigqueue *sigqueue_alloc(void);
2218 extern void sigqueue_free(struct sigqueue *); 2218 extern void sigqueue_free(struct sigqueue *);
2219 extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); 2219 extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
2220 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); 2220 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
2221 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); 2221 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
2222 2222
2223 static inline void restore_saved_sigmask(void) 2223 static inline void restore_saved_sigmask(void)
2224 { 2224 {
2225 if (test_and_clear_restore_sigmask()) 2225 if (test_and_clear_restore_sigmask())
2226 __set_current_blocked(&current->saved_sigmask); 2226 __set_current_blocked(&current->saved_sigmask);
2227 } 2227 }
2228 2228
2229 static inline sigset_t *sigmask_to_save(void) 2229 static inline sigset_t *sigmask_to_save(void)
2230 { 2230 {
2231 sigset_t *res = &current->blocked; 2231 sigset_t *res = &current->blocked;
2232 if (unlikely(test_restore_sigmask())) 2232 if (unlikely(test_restore_sigmask()))
2233 res = &current->saved_sigmask; 2233 res = &current->saved_sigmask;
2234 return res; 2234 return res;
2235 } 2235 }
2236 2236
2237 static inline int kill_cad_pid(int sig, int priv) 2237 static inline int kill_cad_pid(int sig, int priv)
2238 { 2238 {
2239 return kill_pid(cad_pid, sig, priv); 2239 return kill_pid(cad_pid, sig, priv);
2240 } 2240 }
2241 2241
2242 /* These can be the second arg to send_sig_info/send_group_sig_info. */ 2242 /* These can be the second arg to send_sig_info/send_group_sig_info. */
2243 #define SEND_SIG_NOINFO ((struct siginfo *) 0) 2243 #define SEND_SIG_NOINFO ((struct siginfo *) 0)
2244 #define SEND_SIG_PRIV ((struct siginfo *) 1) 2244 #define SEND_SIG_PRIV ((struct siginfo *) 1)
2245 #define SEND_SIG_FORCED ((struct siginfo *) 2) 2245 #define SEND_SIG_FORCED ((struct siginfo *) 2)
2246 2246
2247 /* 2247 /*
2248 * True if we are on the alternate signal stack. 2248 * True if we are on the alternate signal stack.
2249 */ 2249 */
2250 static inline int on_sig_stack(unsigned long sp) 2250 static inline int on_sig_stack(unsigned long sp)
2251 { 2251 {
2252 #ifdef CONFIG_STACK_GROWSUP 2252 #ifdef CONFIG_STACK_GROWSUP
2253 return sp >= current->sas_ss_sp && 2253 return sp >= current->sas_ss_sp &&
2254 sp - current->sas_ss_sp < current->sas_ss_size; 2254 sp - current->sas_ss_sp < current->sas_ss_size;
2255 #else 2255 #else
2256 return sp > current->sas_ss_sp && 2256 return sp > current->sas_ss_sp &&
2257 sp - current->sas_ss_sp <= current->sas_ss_size; 2257 sp - current->sas_ss_sp <= current->sas_ss_size;
2258 #endif 2258 #endif
2259 } 2259 }
2260 2260
2261 static inline int sas_ss_flags(unsigned long sp) 2261 static inline int sas_ss_flags(unsigned long sp)
2262 { 2262 {
2263 return (current->sas_ss_size == 0 ? SS_DISABLE 2263 return (current->sas_ss_size == 0 ? SS_DISABLE
2264 : on_sig_stack(sp) ? SS_ONSTACK : 0); 2264 : on_sig_stack(sp) ? SS_ONSTACK : 0);
2265 } 2265 }
2266 2266
2267 /* 2267 /*
2268 * Routines for handling mm_structs 2268 * Routines for handling mm_structs
2269 */ 2269 */
2270 extern struct mm_struct * mm_alloc(void); 2270 extern struct mm_struct * mm_alloc(void);
2271 2271
2272 /* mmdrop drops the mm and the page tables */ 2272 /* mmdrop drops the mm and the page tables */
2273 extern void __mmdrop(struct mm_struct *); 2273 extern void __mmdrop(struct mm_struct *);
2274 static inline void mmdrop(struct mm_struct * mm) 2274 static inline void mmdrop(struct mm_struct * mm)
2275 { 2275 {
2276 if (unlikely(atomic_dec_and_test(&mm->mm_count))) 2276 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
2277 __mmdrop(mm); 2277 __mmdrop(mm);
2278 } 2278 }
2279 2279
2280 /* mmput gets rid of the mappings and all user-space */ 2280 /* mmput gets rid of the mappings and all user-space */
2281 extern void mmput(struct mm_struct *); 2281 extern void mmput(struct mm_struct *);
2282 /* Grab a reference to a task's mm, if it is not already going away */ 2282 /* Grab a reference to a task's mm, if it is not already going away */
2283 extern struct mm_struct *get_task_mm(struct task_struct *task); 2283 extern struct mm_struct *get_task_mm(struct task_struct *task);
2284 /* 2284 /*
2285 * Grab a reference to a task's mm, if it is not already going away 2285 * Grab a reference to a task's mm, if it is not already going away
2286 * and ptrace_may_access with the mode parameter passed to it 2286 * and ptrace_may_access with the mode parameter passed to it
2287 * succeeds. 2287 * succeeds.
2288 */ 2288 */
2289 extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); 2289 extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
2290 /* Remove the current task's stale references to the old mm_struct */ 2290 /* Remove the current task's stale references to the old mm_struct */
2291 extern void mm_release(struct task_struct *, struct mm_struct *); 2291 extern void mm_release(struct task_struct *, struct mm_struct *);
2292 /* Allocate a new mm structure and copy contents from tsk->mm */ 2292 /* Allocate a new mm structure and copy contents from tsk->mm */
2293 extern struct mm_struct *dup_mm(struct task_struct *tsk); 2293 extern struct mm_struct *dup_mm(struct task_struct *tsk);
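A short sketch of the get_task_mm()/mmput() pairing described above; the reporting helper is hypothetical, and kernel threads (which have no mm) are simply skipped.

static void example_report_mm(struct task_struct *task)	/* hypothetical */
{
	struct mm_struct *mm = get_task_mm(task);	/* NULL for kernel threads */

	if (!mm)
		return;

	pr_info("%s: %d VMAs\n", task->comm, mm->map_count);
	mmput(mm);					/* drop the reference we took */
}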
2294 2294
2295 extern int copy_thread(unsigned long, unsigned long, unsigned long, 2295 extern int copy_thread(unsigned long, unsigned long, unsigned long,
2296 struct task_struct *, struct pt_regs *); 2296 struct task_struct *, struct pt_regs *);
2297 extern void flush_thread(void); 2297 extern void flush_thread(void);
2298 extern void exit_thread(void); 2298 extern void exit_thread(void);
2299 2299
2300 extern void exit_files(struct task_struct *); 2300 extern void exit_files(struct task_struct *);
2301 extern void __cleanup_sighand(struct sighand_struct *); 2301 extern void __cleanup_sighand(struct sighand_struct *);
2302 2302
2303 extern void exit_itimers(struct signal_struct *); 2303 extern void exit_itimers(struct signal_struct *);
2304 extern void flush_itimer_signals(void); 2304 extern void flush_itimer_signals(void);
2305 2305
2306 extern void do_group_exit(int); 2306 extern void do_group_exit(int);
2307 2307
2308 extern void daemonize(const char *, ...); 2308 extern void daemonize(const char *, ...);
2309 extern int allow_signal(int); 2309 extern int allow_signal(int);
2310 extern int disallow_signal(int); 2310 extern int disallow_signal(int);
2311 2311
2312 extern int do_execve(const char *, 2312 extern int do_execve(const char *,
2313 const char __user * const __user *, 2313 const char __user * const __user *,
2314 const char __user * const __user *, struct pt_regs *); 2314 const char __user * const __user *, struct pt_regs *);
2315 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); 2315 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
2316 struct task_struct *fork_idle(int); 2316 struct task_struct *fork_idle(int);
2317 #ifdef CONFIG_GENERIC_KERNEL_THREAD 2317 #ifdef CONFIG_GENERIC_KERNEL_THREAD
2318 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); 2318 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
2319 #endif 2319 #endif
2320 2320
2321 extern void set_task_comm(struct task_struct *tsk, char *from); 2321 extern void set_task_comm(struct task_struct *tsk, char *from);
2322 extern char *get_task_comm(char *to, struct task_struct *tsk); 2322 extern char *get_task_comm(char *to, struct task_struct *tsk);
2323 2323
2324 #ifdef CONFIG_SMP 2324 #ifdef CONFIG_SMP
2325 void scheduler_ipi(void); 2325 void scheduler_ipi(void);
2326 extern unsigned long wait_task_inactive(struct task_struct *, long match_state); 2326 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
2327 #else 2327 #else
2328 static inline void scheduler_ipi(void) { } 2328 static inline void scheduler_ipi(void) { }
2329 static inline unsigned long wait_task_inactive(struct task_struct *p, 2329 static inline unsigned long wait_task_inactive(struct task_struct *p,
2330 long match_state) 2330 long match_state)
2331 { 2331 {
2332 return 1; 2332 return 1;
2333 } 2333 }
2334 #endif 2334 #endif
2335 2335
2336 #define next_task(p) \ 2336 #define next_task(p) \
2337 list_entry_rcu((p)->tasks.next, struct task_struct, tasks) 2337 list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
2338 2338
2339 #define for_each_process(p) \ 2339 #define for_each_process(p) \
2340 for (p = &init_task ; (p = next_task(p)) != &init_task ; ) 2340 for (p = &init_task ; (p = next_task(p)) != &init_task ; )
2341 2341
2342 extern bool current_is_single_threaded(void); 2342 extern bool current_is_single_threaded(void);
2343 2343
2344 /* 2344 /*
2345 * Careful: do_each_thread/while_each_thread is a double loop so 2345 * Careful: do_each_thread/while_each_thread is a double loop so
2346 * 'break' will not work as expected - use goto instead. 2346 * 'break' will not work as expected - use goto instead.
2347 */ 2347 */
2348 #define do_each_thread(g, t) \ 2348 #define do_each_thread(g, t) \
2349 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do 2349 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
2350 2350
2351 #define while_each_thread(g, t) \ 2351 #define while_each_thread(g, t) \
2352 while ((t = next_thread(t)) != g) 2352 while ((t = next_thread(t)) != g)
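A sketch of the goto-based early exit the comment above asks for. It assumes the caller already holds read_lock(&tasklist_lock) or rcu_read_lock(), and wanted_predicate() is a hypothetical test.

static struct task_struct *example_find_thread(void)
{
	struct task_struct *g, *t;

	do_each_thread(g, t) {
		if (wanted_predicate(t))	/* hypothetical test */
			goto found;	/* 'break' would only leave the inner loop */
	} while_each_thread(g, t);

	return NULL;
found:
	return t;
}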
2353 2353
2354 static inline int get_nr_threads(struct task_struct *tsk) 2354 static inline int get_nr_threads(struct task_struct *tsk)
2355 { 2355 {
2356 return tsk->signal->nr_threads; 2356 return tsk->signal->nr_threads;
2357 } 2357 }
2358 2358
2359 static inline bool thread_group_leader(struct task_struct *p) 2359 static inline bool thread_group_leader(struct task_struct *p)
2360 { 2360 {
2361 return p->exit_signal >= 0; 2361 return p->exit_signal >= 0;
2362 } 2362 }
2363 2363
2364 /* Due to the insanities of de_thread it is possible for a process 2364 /* Due to the insanities of de_thread it is possible for a process
2365 * to have the pid of the thread group leader without actually being 2365 * to have the pid of the thread group leader without actually being
2366 * the thread group leader. For iteration through the pids in proc 2366 * the thread group leader. For iteration through the pids in proc
2367 * all we care about is that we have a task with the appropriate 2367 * all we care about is that we have a task with the appropriate
2368 * pid; we don't actually care if we have the right task. 2368 * pid; we don't actually care if we have the right task.
2369 */ 2369 */
2370 static inline int has_group_leader_pid(struct task_struct *p) 2370 static inline int has_group_leader_pid(struct task_struct *p)
2371 { 2371 {
2372 return p->pid == p->tgid; 2372 return p->pid == p->tgid;
2373 } 2373 }
2374 2374
2375 static inline 2375 static inline
2376 int same_thread_group(struct task_struct *p1, struct task_struct *p2) 2376 int same_thread_group(struct task_struct *p1, struct task_struct *p2)
2377 { 2377 {
2378 return p1->tgid == p2->tgid; 2378 return p1->tgid == p2->tgid;
2379 } 2379 }
2380 2380
2381 static inline struct task_struct *next_thread(const struct task_struct *p) 2381 static inline struct task_struct *next_thread(const struct task_struct *p)
2382 { 2382 {
2383 return list_entry_rcu(p->thread_group.next, 2383 return list_entry_rcu(p->thread_group.next,
2384 struct task_struct, thread_group); 2384 struct task_struct, thread_group);
2385 } 2385 }
2386 2386
2387 static inline int thread_group_empty(struct task_struct *p) 2387 static inline int thread_group_empty(struct task_struct *p)
2388 { 2388 {
2389 return list_empty(&p->thread_group); 2389 return list_empty(&p->thread_group);
2390 } 2390 }
2391 2391
2392 #define delay_group_leader(p) \ 2392 #define delay_group_leader(p) \
2393 (thread_group_leader(p) && !thread_group_empty(p)) 2393 (thread_group_leader(p) && !thread_group_empty(p))
2394 2394
2395 /* 2395 /*
2396 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring 2396 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
2397 * subscriptions and synchronises with wait4(). Also used in procfs. Also 2397 * subscriptions and synchronises with wait4(). Also used in procfs. Also
2398 * pins the final release of task.io_context. Also protects ->cpuset and 2398 * pins the final release of task.io_context. Also protects ->cpuset and
2399 * ->cgroup.subsys[]. And ->vfork_done. 2399 * ->cgroup.subsys[]. And ->vfork_done.
2400 * 2400 *
2401 * Nests both inside and outside of read_lock(&tasklist_lock). 2401 * Nests both inside and outside of read_lock(&tasklist_lock).
2402 * It must not be nested with write_lock_irq(&tasklist_lock), 2402 * It must not be nested with write_lock_irq(&tasklist_lock),
2403 * neither inside nor outside. 2403 * neither inside nor outside.
2404 */ 2404 */
2405 static inline void task_lock(struct task_struct *p) 2405 static inline void task_lock(struct task_struct *p)
2406 { 2406 {
2407 spin_lock(&p->alloc_lock); 2407 spin_lock(&p->alloc_lock);
2408 } 2408 }
2409 2409
2410 static inline void task_unlock(struct task_struct *p) 2410 static inline void task_unlock(struct task_struct *p)
2411 { 2411 {
2412 spin_unlock(&p->alloc_lock); 2412 spin_unlock(&p->alloc_lock);
2413 } 2413 }
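For example, reading ->comm stably follows the pattern below; this is essentially what the get_task_comm() helper declared earlier does, and the buffer handling here is only a sketch.

static void example_copy_comm(struct task_struct *p, char buf[TASK_COMM_LEN])
{
	task_lock(p);		/* ->comm is covered by alloc_lock, see above */
	strncpy(buf, p->comm, TASK_COMM_LEN);
	task_unlock(p);
}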
2414 2414
2415 extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, 2415 extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
2416 unsigned long *flags); 2416 unsigned long *flags);
2417 2417
2418 static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, 2418 static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
2419 unsigned long *flags) 2419 unsigned long *flags)
2420 { 2420 {
2421 struct sighand_struct *ret; 2421 struct sighand_struct *ret;
2422 2422
2423 ret = __lock_task_sighand(tsk, flags); 2423 ret = __lock_task_sighand(tsk, flags);
2424 (void)__cond_lock(&tsk->sighand->siglock, ret); 2424 (void)__cond_lock(&tsk->sighand->siglock, ret);
2425 return ret; 2425 return ret;
2426 } 2426 }
2427 2427
2428 static inline void unlock_task_sighand(struct task_struct *tsk, 2428 static inline void unlock_task_sighand(struct task_struct *tsk,
2429 unsigned long *flags) 2429 unsigned long *flags)
2430 { 2430 {
2431 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); 2431 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
2432 } 2432 }
2433 2433
2434 #ifdef CONFIG_CGROUPS 2434 #ifdef CONFIG_CGROUPS
2435 static inline void threadgroup_change_begin(struct task_struct *tsk) 2435 static inline void threadgroup_change_begin(struct task_struct *tsk)
2436 { 2436 {
2437 down_read(&tsk->signal->group_rwsem); 2437 down_read(&tsk->signal->group_rwsem);
2438 } 2438 }
2439 static inline void threadgroup_change_end(struct task_struct *tsk) 2439 static inline void threadgroup_change_end(struct task_struct *tsk)
2440 { 2440 {
2441 up_read(&tsk->signal->group_rwsem); 2441 up_read(&tsk->signal->group_rwsem);
2442 } 2442 }
2443 2443
2444 /** 2444 /**
2445 * threadgroup_lock - lock threadgroup 2445 * threadgroup_lock - lock threadgroup
2446 * @tsk: member task of the threadgroup to lock 2446 * @tsk: member task of the threadgroup to lock
2447 * 2447 *
2448 * Lock the threadgroup @tsk belongs to. No new task is allowed to enter 2448 * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
2449 * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or 2449 * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
2450 * perform exec. This is useful for cases where the threadgroup needs to 2450 * perform exec. This is useful for cases where the threadgroup needs to
2451 * stay stable across blockable operations. 2451 * stay stable across blockable operations.
2452 * 2452 *
2453 * fork and exit paths explicitly call threadgroup_change_{begin|end}() for 2453 * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
2454 * synchronization. While held, no new task will be added to threadgroup 2454 * synchronization. While held, no new task will be added to threadgroup
2455 * and no existing live task will have its PF_EXITING set. 2455 * and no existing live task will have its PF_EXITING set.
2456 * 2456 *
2457 * During exec, a task goes and puts its thread group through unusual 2457 * During exec, a task goes and puts its thread group through unusual
2458 * changes. After de-threading, exclusive access is assumed to resources 2458 * changes. After de-threading, exclusive access is assumed to resources
2459 * which are usually shared by tasks in the same group - e.g. sighand may 2459 * which are usually shared by tasks in the same group - e.g. sighand may
2460 * be replaced with a new one. Also, the exec'ing task takes over group 2460 * be replaced with a new one. Also, the exec'ing task takes over group
2461 * leader role including its pid. Exclude these changes while locked by 2461 * leader role including its pid. Exclude these changes while locked by
2462 * grabbing cred_guard_mutex which is used to synchronize exec path. 2462 * grabbing cred_guard_mutex which is used to synchronize exec path.
2463 */ 2463 */
2464 static inline void threadgroup_lock(struct task_struct *tsk) 2464 static inline void threadgroup_lock(struct task_struct *tsk)
2465 { 2465 {
2466 /* 2466 /*
2467 * exec uses exit for de-threading nesting group_rwsem inside 2467 * exec uses exit for de-threading nesting group_rwsem inside
2468 * cred_guard_mutex. Grab cred_guard_mutex first. 2468 * cred_guard_mutex. Grab cred_guard_mutex first.
2469 */ 2469 */
2470 mutex_lock(&tsk->signal->cred_guard_mutex); 2470 mutex_lock(&tsk->signal->cred_guard_mutex);
2471 down_write(&tsk->signal->group_rwsem); 2471 down_write(&tsk->signal->group_rwsem);
2472 } 2472 }
2473 2473
2474 /** 2474 /**
2475 * threadgroup_unlock - unlock threadgroup 2475 * threadgroup_unlock - unlock threadgroup
2476 * @tsk: member task of the threadgroup to unlock 2476 * @tsk: member task of the threadgroup to unlock
2477 * 2477 *
2478 * Reverse threadgroup_lock(). 2478 * Reverse threadgroup_lock().
2479 */ 2479 */
2480 static inline void threadgroup_unlock(struct task_struct *tsk) 2480 static inline void threadgroup_unlock(struct task_struct *tsk)
2481 { 2481 {
2482 up_write(&tsk->signal->group_rwsem); 2482 up_write(&tsk->signal->group_rwsem);
2483 mutex_unlock(&tsk->signal->cred_guard_mutex); 2483 mutex_unlock(&tsk->signal->cred_guard_mutex);
2484 } 2484 }
2485 #else 2485 #else
2486 static inline void threadgroup_change_begin(struct task_struct *tsk) {} 2486 static inline void threadgroup_change_begin(struct task_struct *tsk) {}
2487 static inline void threadgroup_change_end(struct task_struct *tsk) {} 2487 static inline void threadgroup_change_end(struct task_struct *tsk) {}
2488 static inline void threadgroup_lock(struct task_struct *tsk) {} 2488 static inline void threadgroup_lock(struct task_struct *tsk) {}
2489 static inline void threadgroup_unlock(struct task_struct *tsk) {} 2489 static inline void threadgroup_unlock(struct task_struct *tsk) {}
2490 #endif 2490 #endif
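In rough outline, a caller that needs the thread group to stay stable brackets the whole operation as below; this is a sketch only, and the work in the middle stands for whatever must see a stable group (e.g. migrating every thread as a unit).

	threadgroup_lock(tsk);
	/*
	 * Per the comment above: no new task enters tsk's group, no member
	 * sets PF_EXITING, and no exec happens while this is held.
	 */
	/* ... operate on the whole thread group ... */
	threadgroup_unlock(tsk);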
2491 2491
2492 #ifndef __HAVE_THREAD_FUNCTIONS 2492 #ifndef __HAVE_THREAD_FUNCTIONS
2493 2493
2494 #define task_thread_info(task) ((struct thread_info *)(task)->stack) 2494 #define task_thread_info(task) ((struct thread_info *)(task)->stack)
2495 #define task_stack_page(task) ((task)->stack) 2495 #define task_stack_page(task) ((task)->stack)
2496 2496
2497 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) 2497 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
2498 { 2498 {
2499 *task_thread_info(p) = *task_thread_info(org); 2499 *task_thread_info(p) = *task_thread_info(org);
2500 task_thread_info(p)->task = p; 2500 task_thread_info(p)->task = p;
2501 } 2501 }
2502 2502
2503 static inline unsigned long *end_of_stack(struct task_struct *p) 2503 static inline unsigned long *end_of_stack(struct task_struct *p)
2504 { 2504 {
2505 return (unsigned long *)(task_thread_info(p) + 1); 2505 return (unsigned long *)(task_thread_info(p) + 1);
2506 } 2506 }
2507 2507
2508 #endif 2508 #endif
2509 2509
2510 static inline int object_is_on_stack(void *obj) 2510 static inline int object_is_on_stack(void *obj)
2511 { 2511 {
2512 void *stack = task_stack_page(current); 2512 void *stack = task_stack_page(current);
2513 2513
2514 return (obj >= stack) && (obj < (stack + THREAD_SIZE)); 2514 return (obj >= stack) && (obj < (stack + THREAD_SIZE));
2515 } 2515 }
2516 2516
2517 extern void thread_info_cache_init(void); 2517 extern void thread_info_cache_init(void);
2518 2518
2519 #ifdef CONFIG_DEBUG_STACK_USAGE 2519 #ifdef CONFIG_DEBUG_STACK_USAGE
2520 static inline unsigned long stack_not_used(struct task_struct *p) 2520 static inline unsigned long stack_not_used(struct task_struct *p)
2521 { 2521 {
2522 unsigned long *n = end_of_stack(p); 2522 unsigned long *n = end_of_stack(p);
2523 2523
2524 do { /* Skip over canary */ 2524 do { /* Skip over canary */
2525 n++; 2525 n++;
2526 } while (!*n); 2526 } while (!*n);
2527 2527
2528 return (unsigned long)n - (unsigned long)end_of_stack(p); 2528 return (unsigned long)n - (unsigned long)end_of_stack(p);
2529 } 2529 }
2530 #endif 2530 #endif
2531 2531
2532 /* set thread flags in other task's structures 2532 /* set thread flags in other task's structures
2533 * - see asm/thread_info.h for TIF_xxxx flags available 2533 * - see asm/thread_info.h for TIF_xxxx flags available
2534 */ 2534 */
2535 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) 2535 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
2536 { 2536 {
2537 set_ti_thread_flag(task_thread_info(tsk), flag); 2537 set_ti_thread_flag(task_thread_info(tsk), flag);
2538 } 2538 }
2539 2539
2540 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) 2540 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
2541 { 2541 {
2542 clear_ti_thread_flag(task_thread_info(tsk), flag); 2542 clear_ti_thread_flag(task_thread_info(tsk), flag);
2543 } 2543 }
2544 2544
2545 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) 2545 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
2546 { 2546 {
2547 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); 2547 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
2548 } 2548 }
2549 2549
2550 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) 2550 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
2551 { 2551 {
2552 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); 2552 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
2553 } 2553 }
2554 2554
2555 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) 2555 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
2556 { 2556 {
2557 return test_ti_thread_flag(task_thread_info(tsk), flag); 2557 return test_ti_thread_flag(task_thread_info(tsk), flag);
2558 } 2558 }
2559 2559
2560 static inline void set_tsk_need_resched(struct task_struct *tsk) 2560 static inline void set_tsk_need_resched(struct task_struct *tsk)
2561 { 2561 {
2562 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2562 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
2563 } 2563 }
2564 2564
2565 static inline void clear_tsk_need_resched(struct task_struct *tsk) 2565 static inline void clear_tsk_need_resched(struct task_struct *tsk)
2566 { 2566 {
2567 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2567 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
2568 } 2568 }
2569 2569
2570 static inline int test_tsk_need_resched(struct task_struct *tsk) 2570 static inline int test_tsk_need_resched(struct task_struct *tsk)
2571 { 2571 {
2572 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); 2572 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
2573 } 2573 }
2574 2574
2575 static inline int restart_syscall(void) 2575 static inline int restart_syscall(void)
2576 { 2576 {
2577 set_tsk_thread_flag(current, TIF_SIGPENDING); 2577 set_tsk_thread_flag(current, TIF_SIGPENDING);
2578 return -ERESTARTNOINTR; 2578 return -ERESTARTNOINTR;
2579 } 2579 }
2580 2580
2581 static inline int signal_pending(struct task_struct *p) 2581 static inline int signal_pending(struct task_struct *p)
2582 { 2582 {
2583 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); 2583 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
2584 } 2584 }
2585 2585
2586 static inline int __fatal_signal_pending(struct task_struct *p) 2586 static inline int __fatal_signal_pending(struct task_struct *p)
2587 { 2587 {
2588 return unlikely(sigismember(&p->pending.signal, SIGKILL)); 2588 return unlikely(sigismember(&p->pending.signal, SIGKILL));
2589 } 2589 }
2590 2590
2591 static inline int fatal_signal_pending(struct task_struct *p) 2591 static inline int fatal_signal_pending(struct task_struct *p)
2592 { 2592 {
2593 return signal_pending(p) && __fatal_signal_pending(p); 2593 return signal_pending(p) && __fatal_signal_pending(p);
2594 } 2594 }
2595 2595
2596 static inline int signal_pending_state(long state, struct task_struct *p) 2596 static inline int signal_pending_state(long state, struct task_struct *p)
2597 { 2597 {
2598 if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) 2598 if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
2599 return 0; 2599 return 0;
2600 if (!signal_pending(p)) 2600 if (!signal_pending(p))
2601 return 0; 2601 return 0;
2602 2602
2603 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); 2603 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
2604 } 2604 }
2605 2605
2606 static inline int need_resched(void) 2606 static inline int need_resched(void)
2607 { 2607 {
2608 return unlikely(test_thread_flag(TIF_NEED_RESCHED)); 2608 return unlikely(test_thread_flag(TIF_NEED_RESCHED));
2609 } 2609 }
2610 2610
2611 /* 2611 /*
2612 * cond_resched() and cond_resched_lock(): latency reduction via 2612 * cond_resched() and cond_resched_lock(): latency reduction via
2613 * explicit rescheduling in places that are safe. The return 2613 * explicit rescheduling in places that are safe. The return
2614 * value indicates whether a reschedule was done in fact. 2614 * value indicates whether a reschedule was done in fact.
2615 * cond_resched_lock() will drop the spinlock before scheduling, 2615 * cond_resched_lock() will drop the spinlock before scheduling,
2616 * cond_resched_softirq() will enable bhs before scheduling. 2616 * cond_resched_softirq() will enable bhs before scheduling.
2617 */ 2617 */
2618 extern int _cond_resched(void); 2618 extern int _cond_resched(void);
2619 2619
2620 #define cond_resched() ({ \ 2620 #define cond_resched() ({ \
2621 __might_sleep(__FILE__, __LINE__, 0); \ 2621 __might_sleep(__FILE__, __LINE__, 0); \
2622 _cond_resched(); \ 2622 _cond_resched(); \
2623 }) 2623 })
2624 2624
2625 extern int __cond_resched_lock(spinlock_t *lock); 2625 extern int __cond_resched_lock(spinlock_t *lock);
2626 2626
2627 #ifdef CONFIG_PREEMPT_COUNT 2627 #ifdef CONFIG_PREEMPT_COUNT
2628 #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET 2628 #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
2629 #else 2629 #else
2630 #define PREEMPT_LOCK_OFFSET 0 2630 #define PREEMPT_LOCK_OFFSET 0
2631 #endif 2631 #endif
2632 2632
2633 #define cond_resched_lock(lock) ({ \ 2633 #define cond_resched_lock(lock) ({ \
2634 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ 2634 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
2635 __cond_resched_lock(lock); \ 2635 __cond_resched_lock(lock); \
2636 }) 2636 })
2637 2637
2638 extern int __cond_resched_softirq(void); 2638 extern int __cond_resched_softirq(void);
2639 2639
2640 #define cond_resched_softirq() ({ \ 2640 #define cond_resched_softirq() ({ \
2641 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ 2641 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
2642 __cond_resched_softirq(); \ 2642 __cond_resched_softirq(); \
2643 }) 2643 })
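A sketch of the typical use: a long loop running in process context drops in cond_resched() at a safe point so it does not hog the CPU on non-preemptible kernels. do_one_unit_of_work() is hypothetical.

static void example_process_all(int nr_items)
{
	int i;

	for (i = 0; i < nr_items; i++) {
		do_one_unit_of_work(i);	/* hypothetical per-item work */
		cond_resched();		/* safe point: may schedule here */
	}
}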
2644 2644
2645 /* 2645 /*
2646 * Does a critical section need to be broken due to another 2646 * Does a critical section need to be broken due to another
2647 * task waiting? (This technically does not depend on CONFIG_PREEMPT, 2647 * task waiting? (This technically does not depend on CONFIG_PREEMPT,
2648 * but reflects a general need for low latency.) 2648 * but reflects a general need for low latency.)
2649 */ 2649 */
2650 static inline int spin_needbreak(spinlock_t *lock) 2650 static inline int spin_needbreak(spinlock_t *lock)
2651 { 2651 {
2652 #ifdef CONFIG_PREEMPT 2652 #ifdef CONFIG_PREEMPT
2653 return spin_is_contended(lock); 2653 return spin_is_contended(lock);
2654 #else 2654 #else
2655 return 0; 2655 return 0;
2656 #endif 2656 #endif
2657 } 2657 }
2658 2658
2659 /* 2659 /*
2660 * Thread group CPU time accounting. 2660 * Thread group CPU time accounting.
2661 */ 2661 */
2662 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); 2662 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
2663 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); 2663 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
2664 2664
2665 static inline void thread_group_cputime_init(struct signal_struct *sig) 2665 static inline void thread_group_cputime_init(struct signal_struct *sig)
2666 { 2666 {
2667 raw_spin_lock_init(&sig->cputimer.lock); 2667 raw_spin_lock_init(&sig->cputimer.lock);
2668 } 2668 }
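A minimal sketch of a consumer of thread_group_cputime(), which fills a task_cputime with the summed raw times for the whole thread group; the printing helper is hypothetical.

static void example_print_group_times(struct task_struct *tsk)	/* hypothetical */
{
	struct task_cputime times;

	thread_group_cputime(tsk, &times);
	pr_info("utime=%llu stime=%llu sum_exec_runtime=%llu\n",
		(unsigned long long)times.utime,
		(unsigned long long)times.stime,
		(unsigned long long)times.sum_exec_runtime);
}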
2669 2669
2670 /* 2670 /*
2671 * Reevaluate whether the task has signals pending delivery. 2671 * Reevaluate whether the task has signals pending delivery.
2672 * Wake the task if so. 2672 * Wake the task if so.
2673 * This is required every time the blocked sigset_t changes. 2673 * This is required every time the blocked sigset_t changes.
2674 * callers must hold sighand->siglock. 2674 * callers must hold sighand->siglock.
2675 */ 2675 */
2676 extern void recalc_sigpending_and_wake(struct task_struct *t); 2676 extern void recalc_sigpending_and_wake(struct task_struct *t);
2677 extern void recalc_sigpending(void); 2677 extern void recalc_sigpending(void);
2678 2678
2679 extern void signal_wake_up(struct task_struct *t, int resume_stopped); 2679 extern void signal_wake_up(struct task_struct *t, int resume_stopped);
2680 2680
2681 /* 2681 /*
2682 * Wrappers for p->thread_info->cpu access. No-op on UP. 2682 * Wrappers for p->thread_info->cpu access. No-op on UP.
2683 */ 2683 */
2684 #ifdef CONFIG_SMP 2684 #ifdef CONFIG_SMP
2685 2685
2686 static inline unsigned int task_cpu(const struct task_struct *p) 2686 static inline unsigned int task_cpu(const struct task_struct *p)
2687 { 2687 {
2688 return task_thread_info(p)->cpu; 2688 return task_thread_info(p)->cpu;
2689 } 2689 }
2690 2690
2691 extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 2691 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
2692 2692
2693 #else 2693 #else
2694 2694
2695 static inline unsigned int task_cpu(const struct task_struct *p) 2695 static inline unsigned int task_cpu(const struct task_struct *p)
2696 { 2696 {
2697 return 0; 2697 return 0;
2698 } 2698 }
2699 2699
2700 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 2700 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
2701 { 2701 {
2702 } 2702 }
2703 2703
2704 #endif /* CONFIG_SMP */ 2704 #endif /* CONFIG_SMP */
2705 2705
2706 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); 2706 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
2707 extern long sched_getaffinity(pid_t pid, struct cpumask *mask); 2707 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2708 2708
2709 extern void normalize_rt_tasks(void); 2709 extern void normalize_rt_tasks(void);
2710 2710
2711 #ifdef CONFIG_CGROUP_SCHED 2711 #ifdef CONFIG_CGROUP_SCHED
2712 2712
2713 extern struct task_group root_task_group; 2713 extern struct task_group root_task_group;
2714 2714
2715 extern struct task_group *sched_create_group(struct task_group *parent); 2715 extern struct task_group *sched_create_group(struct task_group *parent);
2716 extern void sched_destroy_group(struct task_group *tg); 2716 extern void sched_destroy_group(struct task_group *tg);
2717 extern void sched_move_task(struct task_struct *tsk); 2717 extern void sched_move_task(struct task_struct *tsk);
2718 #ifdef CONFIG_FAIR_GROUP_SCHED 2718 #ifdef CONFIG_FAIR_GROUP_SCHED
2719 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 2719 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
2720 extern unsigned long sched_group_shares(struct task_group *tg); 2720 extern unsigned long sched_group_shares(struct task_group *tg);
2721 #endif 2721 #endif
2722 #ifdef CONFIG_RT_GROUP_SCHED 2722 #ifdef CONFIG_RT_GROUP_SCHED
2723 extern int sched_group_set_rt_runtime(struct task_group *tg, 2723 extern int sched_group_set_rt_runtime(struct task_group *tg,
2724 long rt_runtime_us); 2724 long rt_runtime_us);
2725 extern long sched_group_rt_runtime(struct task_group *tg); 2725 extern long sched_group_rt_runtime(struct task_group *tg);
2726 extern int sched_group_set_rt_period(struct task_group *tg, 2726 extern int sched_group_set_rt_period(struct task_group *tg,
2727 long rt_period_us); 2727 long rt_period_us);
2728 extern long sched_group_rt_period(struct task_group *tg); 2728 extern long sched_group_rt_period(struct task_group *tg);
2729 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); 2729 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
2730 #endif 2730 #endif
2731 #endif /* CONFIG_CGROUP_SCHED */ 2731 #endif /* CONFIG_CGROUP_SCHED */
2732 2732
2733 extern int task_can_switch_user(struct user_struct *up, 2733 extern int task_can_switch_user(struct user_struct *up,
2734 struct task_struct *tsk); 2734 struct task_struct *tsk);
2735 2735
2736 #ifdef CONFIG_TASK_XACCT 2736 #ifdef CONFIG_TASK_XACCT
2737 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2737 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2738 { 2738 {
2739 tsk->ioac.rchar += amt; 2739 tsk->ioac.rchar += amt;
2740 } 2740 }
2741 2741
2742 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 2742 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
2743 { 2743 {
2744 tsk->ioac.wchar += amt; 2744 tsk->ioac.wchar += amt;
2745 } 2745 }
2746 2746
2747 static inline void inc_syscr(struct task_struct *tsk) 2747 static inline void inc_syscr(struct task_struct *tsk)
2748 { 2748 {
2749 tsk->ioac.syscr++; 2749 tsk->ioac.syscr++;
2750 } 2750 }
2751 2751
2752 static inline void inc_syscw(struct task_struct *tsk) 2752 static inline void inc_syscw(struct task_struct *tsk)
2753 { 2753 {
2754 tsk->ioac.syscw++; 2754 tsk->ioac.syscw++;
2755 } 2755 }
2756 #else 2756 #else
2757 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2757 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2758 { 2758 {
2759 } 2759 }
2760 2760
2761 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 2761 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
2762 { 2762 {
2763 } 2763 }
2764 2764
2765 static inline void inc_syscr(struct task_struct *tsk) 2765 static inline void inc_syscr(struct task_struct *tsk)
2766 { 2766 {
2767 } 2767 }
2768 2768
2769 static inline void inc_syscw(struct task_struct *tsk) 2769 static inline void inc_syscw(struct task_struct *tsk)
2770 { 2770 {
2771 } 2771 }
2772 #endif 2772 #endif
2773 2773
2774 #ifndef TASK_SIZE_OF 2774 #ifndef TASK_SIZE_OF
2775 #define TASK_SIZE_OF(tsk) TASK_SIZE 2775 #define TASK_SIZE_OF(tsk) TASK_SIZE
2776 #endif 2776 #endif
2777 2777
2778 #ifdef CONFIG_MM_OWNER 2778 #ifdef CONFIG_MM_OWNER
2779 extern void mm_update_next_owner(struct mm_struct *mm); 2779 extern void mm_update_next_owner(struct mm_struct *mm);
2780 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2780 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
2781 #else 2781 #else
2782 static inline void mm_update_next_owner(struct mm_struct *mm) 2782 static inline void mm_update_next_owner(struct mm_struct *mm)
2783 { 2783 {
2784 } 2784 }
2785 2785
2786 static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 2786 static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
2787 { 2787 {
2788 } 2788 }
2789 #endif /* CONFIG_MM_OWNER */ 2789 #endif /* CONFIG_MM_OWNER */
2790 2790
2791 static inline unsigned long task_rlimit(const struct task_struct *tsk, 2791 static inline unsigned long task_rlimit(const struct task_struct *tsk,
2792 unsigned int limit) 2792 unsigned int limit)
2793 { 2793 {
2794 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); 2794 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
2795 } 2795 }
2796 2796
2797 static inline unsigned long task_rlimit_max(const struct task_struct *tsk, 2797 static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
2798 unsigned int limit) 2798 unsigned int limit)
2799 { 2799 {
2800 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); 2800 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
2801 } 2801 }
2802 2802
2803 static inline unsigned long rlimit(unsigned int limit) 2803 static inline unsigned long rlimit(unsigned int limit)
2804 { 2804 {
2805 return task_rlimit(current, limit); 2805 return task_rlimit(current, limit);
2806 } 2806 }
2807 2807
2808 static inline unsigned long rlimit_max(unsigned int limit) 2808 static inline unsigned long rlimit_max(unsigned int limit)
2809 { 2809 {
2810 return task_rlimit_max(current, limit); 2810 return task_rlimit_max(current, limit);
2811 } 2811 }
2812 2812
2813 #endif 2813 #endif
2814 2814
1 /* 1 /*
2 * linux/kernel/exit.c 2 * linux/kernel/exit.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/mm.h> 7 #include <linux/mm.h>
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/interrupt.h> 9 #include <linux/interrupt.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/capability.h> 11 #include <linux/capability.h>
12 #include <linux/completion.h> 12 #include <linux/completion.h>
13 #include <linux/personality.h> 13 #include <linux/personality.h>
14 #include <linux/tty.h> 14 #include <linux/tty.h>
15 #include <linux/iocontext.h> 15 #include <linux/iocontext.h>
16 #include <linux/key.h> 16 #include <linux/key.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/cpu.h> 18 #include <linux/cpu.h>
19 #include <linux/acct.h> 19 #include <linux/acct.h>
20 #include <linux/tsacct_kern.h> 20 #include <linux/tsacct_kern.h>
21 #include <linux/file.h> 21 #include <linux/file.h>
22 #include <linux/fdtable.h> 22 #include <linux/fdtable.h>
23 #include <linux/binfmts.h> 23 #include <linux/binfmts.h>
24 #include <linux/nsproxy.h> 24 #include <linux/nsproxy.h>
25 #include <linux/pid_namespace.h> 25 #include <linux/pid_namespace.h>
26 #include <linux/ptrace.h> 26 #include <linux/ptrace.h>
27 #include <linux/profile.h> 27 #include <linux/profile.h>
28 #include <linux/mount.h> 28 #include <linux/mount.h>
29 #include <linux/proc_fs.h> 29 #include <linux/proc_fs.h>
30 #include <linux/kthread.h> 30 #include <linux/kthread.h>
31 #include <linux/mempolicy.h> 31 #include <linux/mempolicy.h>
32 #include <linux/taskstats_kern.h> 32 #include <linux/taskstats_kern.h>
33 #include <linux/delayacct.h> 33 #include <linux/delayacct.h>
34 #include <linux/freezer.h> 34 #include <linux/freezer.h>
35 #include <linux/cgroup.h> 35 #include <linux/cgroup.h>
36 #include <linux/syscalls.h> 36 #include <linux/syscalls.h>
37 #include <linux/signal.h> 37 #include <linux/signal.h>
38 #include <linux/posix-timers.h> 38 #include <linux/posix-timers.h>
39 #include <linux/cn_proc.h> 39 #include <linux/cn_proc.h>
40 #include <linux/mutex.h> 40 #include <linux/mutex.h>
41 #include <linux/futex.h> 41 #include <linux/futex.h>
42 #include <linux/pipe_fs_i.h> 42 #include <linux/pipe_fs_i.h>
43 #include <linux/audit.h> /* for audit_free() */ 43 #include <linux/audit.h> /* for audit_free() */
44 #include <linux/resource.h> 44 #include <linux/resource.h>
45 #include <linux/blkdev.h> 45 #include <linux/blkdev.h>
46 #include <linux/task_io_accounting_ops.h> 46 #include <linux/task_io_accounting_ops.h>
47 #include <linux/tracehook.h> 47 #include <linux/tracehook.h>
48 #include <linux/fs_struct.h> 48 #include <linux/fs_struct.h>
49 #include <linux/init_task.h> 49 #include <linux/init_task.h>
50 #include <linux/perf_event.h> 50 #include <linux/perf_event.h>
51 #include <trace/events/sched.h> 51 #include <trace/events/sched.h>
52 #include <linux/hw_breakpoint.h> 52 #include <linux/hw_breakpoint.h>
53 #include <linux/oom.h> 53 #include <linux/oom.h>
54 #include <linux/writeback.h> 54 #include <linux/writeback.h>
55 #include <linux/shm.h> 55 #include <linux/shm.h>
56 56
57 #include <asm/uaccess.h> 57 #include <asm/uaccess.h>
58 #include <asm/unistd.h> 58 #include <asm/unistd.h>
59 #include <asm/pgtable.h> 59 #include <asm/pgtable.h>
60 #include <asm/mmu_context.h> 60 #include <asm/mmu_context.h>
61 61
62 static void exit_mm(struct task_struct * tsk); 62 static void exit_mm(struct task_struct * tsk);
63 63
64 static void __unhash_process(struct task_struct *p, bool group_dead) 64 static void __unhash_process(struct task_struct *p, bool group_dead)
65 { 65 {
66 nr_threads--; 66 nr_threads--;
67 detach_pid(p, PIDTYPE_PID); 67 detach_pid(p, PIDTYPE_PID);
68 if (group_dead) { 68 if (group_dead) {
69 detach_pid(p, PIDTYPE_PGID); 69 detach_pid(p, PIDTYPE_PGID);
70 detach_pid(p, PIDTYPE_SID); 70 detach_pid(p, PIDTYPE_SID);
71 71
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /* 75 /*
76 * If we are the last child process in a pid namespace to be 76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping in zap_pid_ns_processes(). 77 * reaped, notify the reaper sleeping in zap_pid_ns_processes().
78 */ 78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) { 79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent; 80 struct task_struct *parent = p->real_parent;
81 81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) && 82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) && 83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING)) 84 (parent->flags & PF_EXITING))
85 wake_up_process(parent); 85 wake_up_process(parent);
86 } 86 }
87 } 87 }
88 list_del_rcu(&p->thread_group); 88 list_del_rcu(&p->thread_group);
89 } 89 }
90 90
91 /* 91 /*
92 * This function expects the tasklist_lock write-locked. 92 * This function expects the tasklist_lock write-locked.
93 */ 93 */
94 static void __exit_signal(struct task_struct *tsk) 94 static void __exit_signal(struct task_struct *tsk)
95 { 95 {
96 struct signal_struct *sig = tsk->signal; 96 struct signal_struct *sig = tsk->signal;
97 bool group_dead = thread_group_leader(tsk); 97 bool group_dead = thread_group_leader(tsk);
98 struct sighand_struct *sighand; 98 struct sighand_struct *sighand;
99 struct tty_struct *uninitialized_var(tty); 99 struct tty_struct *uninitialized_var(tty);
100 100
101 sighand = rcu_dereference_check(tsk->sighand, 101 sighand = rcu_dereference_check(tsk->sighand,
102 lockdep_tasklist_lock_is_held()); 102 lockdep_tasklist_lock_is_held());
103 spin_lock(&sighand->siglock); 103 spin_lock(&sighand->siglock);
104 104
105 posix_cpu_timers_exit(tsk); 105 posix_cpu_timers_exit(tsk);
106 if (group_dead) { 106 if (group_dead) {
107 posix_cpu_timers_exit_group(tsk); 107 posix_cpu_timers_exit_group(tsk);
108 tty = sig->tty; 108 tty = sig->tty;
109 sig->tty = NULL; 109 sig->tty = NULL;
110 } else { 110 } else {
111 /* 111 /*
112 * This can only happen if the caller is de_thread(). 112 * This can only happen if the caller is de_thread().
113 * FIXME: this is a temporary hack; we should teach 113 * FIXME: this is a temporary hack; we should teach
114 * posix-cpu-timers to handle this case correctly. 114 * posix-cpu-timers to handle this case correctly.
115 */ 115 */
116 if (unlikely(has_group_leader_pid(tsk))) 116 if (unlikely(has_group_leader_pid(tsk)))
117 posix_cpu_timers_exit_group(tsk); 117 posix_cpu_timers_exit_group(tsk);
118 118
119 /* 119 /*
120 * If there is any task waiting for the group exit 120 * If there is any task waiting for the group exit
121 * then notify it: 121 * then notify it:
122 */ 122 */
123 if (sig->notify_count > 0 && !--sig->notify_count) 123 if (sig->notify_count > 0 && !--sig->notify_count)
124 wake_up_process(sig->group_exit_task); 124 wake_up_process(sig->group_exit_task);
125 125
126 if (tsk == sig->curr_target) 126 if (tsk == sig->curr_target)
127 sig->curr_target = next_thread(tsk); 127 sig->curr_target = next_thread(tsk);
128 /* 128 /*
129 * Accumulate here the counters for all threads but the 129 * Accumulate here the counters for all threads but the
130 * group leader as they die, so they can be added into 130 * group leader as they die, so they can be added into
131 * the process-wide totals when those are taken. 131 * the process-wide totals when those are taken.
132 * The group leader stays around as a zombie as long 132 * The group leader stays around as a zombie as long
133 * as there are other threads. When it gets reaped, 133 * as there are other threads. When it gets reaped,
134 * the exit.c code will add its counts into these totals. 134 * the exit.c code will add its counts into these totals.
135 * We won't ever get here for the group leader, since it 135 * We won't ever get here for the group leader, since it
136 * will have been the last reference on the signal_struct. 136 * will have been the last reference on the signal_struct.
137 */ 137 */
138 sig->utime += tsk->utime; 138 sig->utime += tsk->utime;
139 sig->stime += tsk->stime; 139 sig->stime += tsk->stime;
140 sig->gtime += tsk->gtime; 140 sig->gtime += tsk->gtime;
141 sig->min_flt += tsk->min_flt; 141 sig->min_flt += tsk->min_flt;
142 sig->maj_flt += tsk->maj_flt; 142 sig->maj_flt += tsk->maj_flt;
143 sig->nvcsw += tsk->nvcsw; 143 sig->nvcsw += tsk->nvcsw;
144 sig->nivcsw += tsk->nivcsw; 144 sig->nivcsw += tsk->nivcsw;
145 sig->inblock += task_io_get_inblock(tsk); 145 sig->inblock += task_io_get_inblock(tsk);
146 sig->oublock += task_io_get_oublock(tsk); 146 sig->oublock += task_io_get_oublock(tsk);
147 task_io_accounting_add(&sig->ioac, &tsk->ioac); 147 task_io_accounting_add(&sig->ioac, &tsk->ioac);
148 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 148 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
149 } 149 }
150 150
151 sig->nr_threads--; 151 sig->nr_threads--;
152 __unhash_process(tsk, group_dead); 152 __unhash_process(tsk, group_dead);
153 153
154 /* 154 /*
155 * Do this under ->siglock, we can race with another thread 155 * Do this under ->siglock, we can race with another thread
156 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 156 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
157 */ 157 */
158 flush_sigqueue(&tsk->pending); 158 flush_sigqueue(&tsk->pending);
159 tsk->sighand = NULL; 159 tsk->sighand = NULL;
160 spin_unlock(&sighand->siglock); 160 spin_unlock(&sighand->siglock);
161 161
162 __cleanup_sighand(sighand); 162 __cleanup_sighand(sighand);
163 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 163 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
164 if (group_dead) { 164 if (group_dead) {
165 flush_sigqueue(&sig->shared_pending); 165 flush_sigqueue(&sig->shared_pending);
166 tty_kref_put(tty); 166 tty_kref_put(tty);
167 } 167 }
168 } 168 }
169 169
170 static void delayed_put_task_struct(struct rcu_head *rhp) 170 static void delayed_put_task_struct(struct rcu_head *rhp)
171 { 171 {
172 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 172 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
173 173
174 perf_event_delayed_put(tsk); 174 perf_event_delayed_put(tsk);
175 trace_sched_process_free(tsk); 175 trace_sched_process_free(tsk);
176 put_task_struct(tsk); 176 put_task_struct(tsk);
177 } 177 }
178 178
179 179
180 void release_task(struct task_struct * p) 180 void release_task(struct task_struct * p)
181 { 181 {
182 struct task_struct *leader; 182 struct task_struct *leader;
183 int zap_leader; 183 int zap_leader;
184 repeat: 184 repeat:
185 /* don't need to get the RCU readlock here - the process is dead and 185 /* don't need to get the RCU readlock here - the process is dead and
186 * can't be modifying its own credentials. But shut RCU-lockdep up */ 186 * can't be modifying its own credentials. But shut RCU-lockdep up */
187 rcu_read_lock(); 187 rcu_read_lock();
188 atomic_dec(&__task_cred(p)->user->processes); 188 atomic_dec(&__task_cred(p)->user->processes);
189 rcu_read_unlock(); 189 rcu_read_unlock();
190 190
191 proc_flush_task(p); 191 proc_flush_task(p);
192 192
193 write_lock_irq(&tasklist_lock); 193 write_lock_irq(&tasklist_lock);
194 ptrace_release_task(p); 194 ptrace_release_task(p);
195 __exit_signal(p); 195 __exit_signal(p);
196 196
197 /* 197 /*
198 * If we are the last non-leader member of the thread 198 * If we are the last non-leader member of the thread
199 * group, and the leader is a zombie, then notify the 199 * group, and the leader is a zombie, then notify the
200 * group leader's parent process (if it wants notification). 200 * group leader's parent process (if it wants notification).
201 */ 201 */
202 zap_leader = 0; 202 zap_leader = 0;
203 leader = p->group_leader; 203 leader = p->group_leader;
204 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 204 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
205 /* 205 /*
206 * If we were the last child thread and the leader has 206 * If we were the last child thread and the leader has
207 * exited already, and the leader's parent ignores SIGCHLD, 207 * exited already, and the leader's parent ignores SIGCHLD,
208 * then we are the one who should release the leader. 208 * then we are the one who should release the leader.
209 */ 209 */
210 zap_leader = do_notify_parent(leader, leader->exit_signal); 210 zap_leader = do_notify_parent(leader, leader->exit_signal);
211 if (zap_leader) 211 if (zap_leader)
212 leader->exit_state = EXIT_DEAD; 212 leader->exit_state = EXIT_DEAD;
213 } 213 }
214 214
215 write_unlock_irq(&tasklist_lock); 215 write_unlock_irq(&tasklist_lock);
216 release_thread(p); 216 release_thread(p);
217 call_rcu(&p->rcu, delayed_put_task_struct); 217 call_rcu(&p->rcu, delayed_put_task_struct);
218 218
219 p = leader; 219 p = leader;
220 if (unlikely(zap_leader)) 220 if (unlikely(zap_leader))
221 goto repeat; 221 goto repeat;
222 } 222 }
223 223
224 /* 224 /*
225 * This checks not only the pgrp, but falls back on the pid if no 225 * This checks not only the pgrp, but falls back on the pid if no
226 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 226 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
227 * without this... 227 * without this...
228 * 228 *
229 * The caller must hold rcu lock or the tasklist lock. 229 * The caller must hold rcu lock or the tasklist lock.
230 */ 230 */
231 struct pid *session_of_pgrp(struct pid *pgrp) 231 struct pid *session_of_pgrp(struct pid *pgrp)
232 { 232 {
233 struct task_struct *p; 233 struct task_struct *p;
234 struct pid *sid = NULL; 234 struct pid *sid = NULL;
235 235
236 p = pid_task(pgrp, PIDTYPE_PGID); 236 p = pid_task(pgrp, PIDTYPE_PGID);
237 if (p == NULL) 237 if (p == NULL)
238 p = pid_task(pgrp, PIDTYPE_PID); 238 p = pid_task(pgrp, PIDTYPE_PID);
239 if (p != NULL) 239 if (p != NULL)
240 sid = task_session(p); 240 sid = task_session(p);
241 241
242 return sid; 242 return sid;
243 } 243 }
244 244
245 /* 245 /*
246 * Determine if a process group is "orphaned", according to the POSIX 246 * Determine if a process group is "orphaned", according to the POSIX
247 * definition in 2.2.2.52. Orphaned process groups are not to be affected 247 * definition in 2.2.2.52. Orphaned process groups are not to be affected
248 * by terminal-generated stop signals. Newly orphaned process groups are 248 * by terminal-generated stop signals. Newly orphaned process groups are
249 * to receive a SIGHUP and a SIGCONT. 249 * to receive a SIGHUP and a SIGCONT.
250 * 250 *
251 * "I ask you, have you ever known what it is to be an orphan?" 251 * "I ask you, have you ever known what it is to be an orphan?"
252 */ 252 */
253 static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 253 static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
254 { 254 {
255 struct task_struct *p; 255 struct task_struct *p;
256 256
257 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 257 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
258 if ((p == ignored_task) || 258 if ((p == ignored_task) ||
259 (p->exit_state && thread_group_empty(p)) || 259 (p->exit_state && thread_group_empty(p)) ||
260 is_global_init(p->real_parent)) 260 is_global_init(p->real_parent))
261 continue; 261 continue;
262 262
263 if (task_pgrp(p->real_parent) != pgrp && 263 if (task_pgrp(p->real_parent) != pgrp &&
264 task_session(p->real_parent) == task_session(p)) 264 task_session(p->real_parent) == task_session(p))
265 return 0; 265 return 0;
266 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 266 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
267 267
268 return 1; 268 return 1;
269 } 269 }
270 270
271 int is_current_pgrp_orphaned(void) 271 int is_current_pgrp_orphaned(void)
272 { 272 {
273 int retval; 273 int retval;
274 274
275 read_lock(&tasklist_lock); 275 read_lock(&tasklist_lock);
276 retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); 276 retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
277 read_unlock(&tasklist_lock); 277 read_unlock(&tasklist_lock);
278 278
279 return retval; 279 return retval;
280 } 280 }
281 281
282 static bool has_stopped_jobs(struct pid *pgrp) 282 static bool has_stopped_jobs(struct pid *pgrp)
283 { 283 {
284 struct task_struct *p; 284 struct task_struct *p;
285 285
286 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 286 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
287 if (p->signal->flags & SIGNAL_STOP_STOPPED) 287 if (p->signal->flags & SIGNAL_STOP_STOPPED)
288 return true; 288 return true;
289 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 289 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
290 290
291 return false; 291 return false;
292 } 292 }
293 293
294 /* 294 /*
295 * Check to see if any process groups have become orphaned as 295 * Check to see if any process groups have become orphaned as
296 * a result of our exiting, and if they have any stopped jobs, 296 * a result of our exiting, and if they have any stopped jobs,
297 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 297 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
298 */ 298 */
299 static void 299 static void
300 kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) 300 kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
301 { 301 {
302 struct pid *pgrp = task_pgrp(tsk); 302 struct pid *pgrp = task_pgrp(tsk);
303 struct task_struct *ignored_task = tsk; 303 struct task_struct *ignored_task = tsk;
304 304
305 if (!parent) 305 if (!parent)
306 /* exit: our father is in a different pgrp than 306 /* exit: our father is in a different pgrp than
307 * we are and we were the only connection outside. 307 * we are and we were the only connection outside.
308 */ 308 */
309 parent = tsk->real_parent; 309 parent = tsk->real_parent;
310 else 310 else
311 /* reparent: our child is in a different pgrp than 311 /* reparent: our child is in a different pgrp than
312 * we are, and it was the only connection outside. 312 * we are, and it was the only connection outside.
313 */ 313 */
314 ignored_task = NULL; 314 ignored_task = NULL;
315 315
316 if (task_pgrp(parent) != pgrp && 316 if (task_pgrp(parent) != pgrp &&
317 task_session(parent) == task_session(tsk) && 317 task_session(parent) == task_session(tsk) &&
318 will_become_orphaned_pgrp(pgrp, ignored_task) && 318 will_become_orphaned_pgrp(pgrp, ignored_task) &&
319 has_stopped_jobs(pgrp)) { 319 has_stopped_jobs(pgrp)) {
320 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); 320 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
321 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); 321 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
322 } 322 }
323 } 323 }
324 324
325 /** 325 /**
326 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd 326 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
327 * 327 *
328 * If a kernel thread is launched as a result of a system call, or if 328 * If a kernel thread is launched as a result of a system call, or if
329 * it ever exits, it should generally reparent itself to kthreadd so it 329 * it ever exits, it should generally reparent itself to kthreadd so it
330 * isn't in the way of other processes and is correctly cleaned up on exit. 330 * isn't in the way of other processes and is correctly cleaned up on exit.
331 * 331 *
332 * The various task state such as scheduling policy and priority may have 332 * The various task state such as scheduling policy and priority may have
333 * been inherited from a user process, so we reset them to sane values here. 333 * been inherited from a user process, so we reset them to sane values here.
334 * 334 *
335 * NOTE that reparent_to_kthreadd() gives the caller full capabilities. 335 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
336 */ 336 */
337 static void reparent_to_kthreadd(void) 337 static void reparent_to_kthreadd(void)
338 { 338 {
339 write_lock_irq(&tasklist_lock); 339 write_lock_irq(&tasklist_lock);
340 340
341 ptrace_unlink(current); 341 ptrace_unlink(current);
342 /* Reparent to init */ 342 /* Reparent to init */
343 current->real_parent = current->parent = kthreadd_task; 343 current->real_parent = current->parent = kthreadd_task;
344 list_move_tail(&current->sibling, &current->real_parent->children); 344 list_move_tail(&current->sibling, &current->real_parent->children);
345 345
346 /* Set the exit signal to SIGCHLD so we signal init on exit */ 346 /* Set the exit signal to SIGCHLD so we signal init on exit */
347 current->exit_signal = SIGCHLD; 347 current->exit_signal = SIGCHLD;
348 348
349 if (task_nice(current) < 0) 349 if (task_nice(current) < 0)
350 set_user_nice(current, 0); 350 set_user_nice(current, 0);
351 /* cpus_allowed? */ 351 /* cpus_allowed? */
352 /* rt_priority? */ 352 /* rt_priority? */
353 /* signals? */ 353 /* signals? */
354 memcpy(current->signal->rlim, init_task.signal->rlim, 354 memcpy(current->signal->rlim, init_task.signal->rlim,
355 sizeof(current->signal->rlim)); 355 sizeof(current->signal->rlim));
356 356
357 atomic_inc(&init_cred.usage); 357 atomic_inc(&init_cred.usage);
358 commit_creds(&init_cred); 358 commit_creds(&init_cred);
359 write_unlock_irq(&tasklist_lock); 359 write_unlock_irq(&tasklist_lock);
360 } 360 }
361 361
362 void __set_special_pids(struct pid *pid) 362 void __set_special_pids(struct pid *pid)
363 { 363 {
364 struct task_struct *curr = current->group_leader; 364 struct task_struct *curr = current->group_leader;
365 365
366 if (task_session(curr) != pid) 366 if (task_session(curr) != pid)
367 change_pid(curr, PIDTYPE_SID, pid); 367 change_pid(curr, PIDTYPE_SID, pid);
368 368
369 if (task_pgrp(curr) != pid) 369 if (task_pgrp(curr) != pid)
370 change_pid(curr, PIDTYPE_PGID, pid); 370 change_pid(curr, PIDTYPE_PGID, pid);
371 } 371 }
372 372
373 static void set_special_pids(struct pid *pid) 373 static void set_special_pids(struct pid *pid)
374 { 374 {
375 write_lock_irq(&tasklist_lock); 375 write_lock_irq(&tasklist_lock);
376 __set_special_pids(pid); 376 __set_special_pids(pid);
377 write_unlock_irq(&tasklist_lock); 377 write_unlock_irq(&tasklist_lock);
378 } 378 }
379 379
380 /* 380 /*
381 * Let kernel threads use this to say that they allow a certain signal. 381 * Let kernel threads use this to say that they allow a certain signal.
382 * Must not be used if kthread was cloned with CLONE_SIGHAND. 382 * Must not be used if kthread was cloned with CLONE_SIGHAND.
383 */ 383 */
384 int allow_signal(int sig) 384 int allow_signal(int sig)
385 { 385 {
386 if (!valid_signal(sig) || sig < 1) 386 if (!valid_signal(sig) || sig < 1)
387 return -EINVAL; 387 return -EINVAL;
388 388
389 spin_lock_irq(&current->sighand->siglock); 389 spin_lock_irq(&current->sighand->siglock);
390 /* This is only needed for daemonize()'ed kthreads */ 390 /* This is only needed for daemonize()'ed kthreads */
391 sigdelset(&current->blocked, sig); 391 sigdelset(&current->blocked, sig);
392 /* 392 /*
393 * Kernel threads handle their own signals. Let the signal code 393 * Kernel threads handle their own signals. Let the signal code
394 * know it'll be handled, so that they don't get converted to 394 * know it'll be handled, so that they don't get converted to
395 * SIGKILL or just silently dropped. 395 * SIGKILL or just silently dropped.
396 */ 396 */
397 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 397 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
398 recalc_sigpending(); 398 recalc_sigpending();
399 spin_unlock_irq(&current->sighand->siglock); 399 spin_unlock_irq(&current->sighand->siglock);
400 return 0; 400 return 0;
401 } 401 }
402 402
403 EXPORT_SYMBOL(allow_signal); 403 EXPORT_SYMBOL(allow_signal);
404 404
405 int disallow_signal(int sig) 405 int disallow_signal(int sig)
406 { 406 {
407 if (!valid_signal(sig) || sig < 1) 407 if (!valid_signal(sig) || sig < 1)
408 return -EINVAL; 408 return -EINVAL;
409 409
410 spin_lock_irq(&current->sighand->siglock); 410 spin_lock_irq(&current->sighand->siglock);
411 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; 411 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
412 recalc_sigpending(); 412 recalc_sigpending();
413 spin_unlock_irq(&current->sighand->siglock); 413 spin_unlock_irq(&current->sighand->siglock);
414 return 0; 414 return 0;
415 } 415 }
416 416
417 EXPORT_SYMBOL(disallow_signal); 417 EXPORT_SYMBOL(disallow_signal);
418 418
419 /* 419 /*
420 * Put all the gunge required to become a kernel thread without 420 * Put all the gunge required to become a kernel thread without
421 * attached user resources in one place where it belongs. 421 * attached user resources in one place where it belongs.
422 */ 422 */
423 423
424 void daemonize(const char *name, ...) 424 void daemonize(const char *name, ...)
425 { 425 {
426 va_list args; 426 va_list args;
427 sigset_t blocked; 427 sigset_t blocked;
428 428
429 va_start(args, name); 429 va_start(args, name);
430 vsnprintf(current->comm, sizeof(current->comm), name, args); 430 vsnprintf(current->comm, sizeof(current->comm), name, args);
431 va_end(args); 431 va_end(args);
432 432
433 /* 433 /*
434 * If we were started as a result of loading a module, close all of the 434 * If we were started as a result of loading a module, close all of the
435 * user space pages. We don't need them, and if we didn't close them 435 * user space pages. We don't need them, and if we didn't close them
436 * they would be locked into memory. 436 * they would be locked into memory.
437 */ 437 */
438 exit_mm(current); 438 exit_mm(current);
439 /* 439 /*
440 * We don't want to get frozen, in case system-wide hibernation 440 * We don't want to get frozen, in case system-wide hibernation
441 * or suspend transition begins right now. 441 * or suspend transition begins right now.
442 */ 442 */
443 current->flags |= (PF_NOFREEZE | PF_KTHREAD); 443 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
444 444
445 if (current->nsproxy != &init_nsproxy) { 445 if (current->nsproxy != &init_nsproxy) {
446 get_nsproxy(&init_nsproxy); 446 get_nsproxy(&init_nsproxy);
447 switch_task_namespaces(current, &init_nsproxy); 447 switch_task_namespaces(current, &init_nsproxy);
448 } 448 }
449 set_special_pids(&init_struct_pid); 449 set_special_pids(&init_struct_pid);
450 proc_clear_tty(current); 450 proc_clear_tty(current);
451 451
452 /* Block and flush all signals */ 452 /* Block and flush all signals */
453 sigfillset(&blocked); 453 sigfillset(&blocked);
454 sigprocmask(SIG_BLOCK, &blocked, NULL); 454 sigprocmask(SIG_BLOCK, &blocked, NULL);
455 flush_signals(current); 455 flush_signals(current);
456 456
457 /* Become as one with the init task */ 457 /* Become as one with the init task */
458 458
459 daemonize_fs_struct(); 459 daemonize_fs_struct();
460 daemonize_descriptors(); 460 daemonize_descriptors();
461 461
462 reparent_to_kthreadd(); 462 reparent_to_kthreadd();
463 } 463 }
464 464
465 EXPORT_SYMBOL(daemonize); 465 EXPORT_SYMBOL(daemonize);
466 466
467 #ifdef CONFIG_MM_OWNER 467 #ifdef CONFIG_MM_OWNER
468 /* 468 /*
469 * A task is exiting. If it owned this mm, find a new owner for the mm. 469 * A task is exiting. If it owned this mm, find a new owner for the mm.
470 */ 470 */
471 void mm_update_next_owner(struct mm_struct *mm) 471 void mm_update_next_owner(struct mm_struct *mm)
472 { 472 {
473 struct task_struct *c, *g, *p = current; 473 struct task_struct *c, *g, *p = current;
474 474
475 retry: 475 retry:
476 /* 476 /*
477 * If the exiting or execing task is not the owner, it's 477 * If the exiting or execing task is not the owner, it's
478 * someone else's problem. 478 * someone else's problem.
479 */ 479 */
480 if (mm->owner != p) 480 if (mm->owner != p)
481 return; 481 return;
482 /* 482 /*
483 * The current owner is exiting/execing and there are no other 483 * The current owner is exiting/execing and there are no other
484 * candidates. Do not leave the mm pointing to a possibly 484 * candidates. Do not leave the mm pointing to a possibly
485 * freed task structure. 485 * freed task structure.
486 */ 486 */
487 if (atomic_read(&mm->mm_users) <= 1) { 487 if (atomic_read(&mm->mm_users) <= 1) {
488 mm->owner = NULL; 488 mm->owner = NULL;
489 return; 489 return;
490 } 490 }
491 491
492 read_lock(&tasklist_lock); 492 read_lock(&tasklist_lock);
493 /* 493 /*
494 * Search in the children 494 * Search in the children
495 */ 495 */
496 list_for_each_entry(c, &p->children, sibling) { 496 list_for_each_entry(c, &p->children, sibling) {
497 if (c->mm == mm) 497 if (c->mm == mm)
498 goto assign_new_owner; 498 goto assign_new_owner;
499 } 499 }
500 500
501 /* 501 /*
502 * Search in the siblings 502 * Search in the siblings
503 */ 503 */
504 list_for_each_entry(c, &p->real_parent->children, sibling) { 504 list_for_each_entry(c, &p->real_parent->children, sibling) {
505 if (c->mm == mm) 505 if (c->mm == mm)
506 goto assign_new_owner; 506 goto assign_new_owner;
507 } 507 }
508 508
509 /* 509 /*
510 * Search through everything else. We should not get 510 * Search through everything else. We should not get
511 * here often 511 * here often
512 */ 512 */
513 do_each_thread(g, c) { 513 do_each_thread(g, c) {
514 if (c->mm == mm) 514 if (c->mm == mm)
515 goto assign_new_owner; 515 goto assign_new_owner;
516 } while_each_thread(g, c); 516 } while_each_thread(g, c);
517 517
518 read_unlock(&tasklist_lock); 518 read_unlock(&tasklist_lock);
519 /* 519 /*
520 * We found no owner, yet mm_users > 1: this implies that we are 520 * We found no owner, yet mm_users > 1: this implies that we are
521 * most likely racing with swapoff (try_to_unuse()) or /proc or 521 * most likely racing with swapoff (try_to_unuse()) or /proc or
522 * ptrace or page migration (get_task_mm()). Mark owner as NULL. 522 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
523 */ 523 */
524 mm->owner = NULL; 524 mm->owner = NULL;
525 return; 525 return;
526 526
527 assign_new_owner: 527 assign_new_owner:
528 BUG_ON(c == p); 528 BUG_ON(c == p);
529 get_task_struct(c); 529 get_task_struct(c);
530 /* 530 /*
531 * The task_lock protects c->mm from changing. 531 * The task_lock protects c->mm from changing.
532 * We always want mm->owner->mm == mm 532 * We always want mm->owner->mm == mm
533 */ 533 */
534 task_lock(c); 534 task_lock(c);
535 /* 535 /*
536 * Delay read_unlock() till we have the task_lock() 536 * Delay read_unlock() till we have the task_lock()
537 * to ensure that c does not slip away underneath us 537 * to ensure that c does not slip away underneath us
538 */ 538 */
539 read_unlock(&tasklist_lock); 539 read_unlock(&tasklist_lock);
540 if (c->mm != mm) { 540 if (c->mm != mm) {
541 task_unlock(c); 541 task_unlock(c);
542 put_task_struct(c); 542 put_task_struct(c);
543 goto retry; 543 goto retry;
544 } 544 }
545 mm->owner = c; 545 mm->owner = c;
546 task_unlock(c); 546 task_unlock(c);
547 put_task_struct(c); 547 put_task_struct(c);
548 } 548 }
549 #endif /* CONFIG_MM_OWNER */ 549 #endif /* CONFIG_MM_OWNER */
550 550
551 /* 551 /*
552 * Turn us into a lazy TLB process if we 552 * Turn us into a lazy TLB process if we
553 * aren't already.. 553 * aren't already..
554 */ 554 */
555 static void exit_mm(struct task_struct * tsk) 555 static void exit_mm(struct task_struct * tsk)
556 { 556 {
557 struct mm_struct *mm = tsk->mm; 557 struct mm_struct *mm = tsk->mm;
558 struct core_state *core_state; 558 struct core_state *core_state;
559 559
560 mm_release(tsk, mm); 560 mm_release(tsk, mm);
561 if (!mm) 561 if (!mm)
562 return; 562 return;
563 sync_mm_rss(mm); 563 sync_mm_rss(mm);
564 /* 564 /*
565 * Serialize with any possible pending coredump. 565 * Serialize with any possible pending coredump.
566 * We must hold mmap_sem around checking core_state 566 * We must hold mmap_sem around checking core_state
567 * and clearing tsk->mm. The core-inducing thread 567 * and clearing tsk->mm. The core-inducing thread
568 * will increment ->nr_threads for each thread in the 568 * will increment ->nr_threads for each thread in the
569 * group with ->mm != NULL. 569 * group with ->mm != NULL.
570 */ 570 */
571 down_read(&mm->mmap_sem); 571 down_read(&mm->mmap_sem);
572 core_state = mm->core_state; 572 core_state = mm->core_state;
573 if (core_state) { 573 if (core_state) {
574 struct core_thread self; 574 struct core_thread self;
575 up_read(&mm->mmap_sem); 575 up_read(&mm->mmap_sem);
576 576
577 self.task = tsk; 577 self.task = tsk;
578 self.next = xchg(&core_state->dumper.next, &self); 578 self.next = xchg(&core_state->dumper.next, &self);
579 /* 579 /*
580 * Implies mb(), the result of xchg() must be visible 580 * Implies mb(), the result of xchg() must be visible
581 * to core_state->dumper. 581 * to core_state->dumper.
582 */ 582 */
583 if (atomic_dec_and_test(&core_state->nr_threads)) 583 if (atomic_dec_and_test(&core_state->nr_threads))
584 complete(&core_state->startup); 584 complete(&core_state->startup);
585 585
586 for (;;) { 586 for (;;) {
587 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 587 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
588 if (!self.task) /* see coredump_finish() */ 588 if (!self.task) /* see coredump_finish() */
589 break; 589 break;
590 schedule(); 590 schedule();
591 } 591 }
592 __set_task_state(tsk, TASK_RUNNING); 592 __set_task_state(tsk, TASK_RUNNING);
593 down_read(&mm->mmap_sem); 593 down_read(&mm->mmap_sem);
594 } 594 }
595 atomic_inc(&mm->mm_count); 595 atomic_inc(&mm->mm_count);
596 BUG_ON(mm != tsk->active_mm); 596 BUG_ON(mm != tsk->active_mm);
597 /* more a memory barrier than a real lock */ 597 /* more a memory barrier than a real lock */
598 task_lock(tsk); 598 task_lock(tsk);
599 tsk->mm = NULL; 599 tsk->mm = NULL;
600 up_read(&mm->mmap_sem); 600 up_read(&mm->mmap_sem);
601 enter_lazy_tlb(mm, current); 601 enter_lazy_tlb(mm, current);
602 task_unlock(tsk); 602 task_unlock(tsk);
603 mm_update_next_owner(mm); 603 mm_update_next_owner(mm);
604 mmput(mm); 604 mmput(mm);
605 } 605 }
606 606
607 /* 607 /*
608 * When we die, we re-parent all our children, and try to: 608 * When we die, we re-parent all our children, and try to:
609 * 1. give them to another thread in our thread group, if such a member exists 609 * 1. give them to another thread in our thread group, if such a member exists
610 * 2. give them to the first ancestor process which prctl'd itself as a 610 * 2. give them to the first ancestor process which prctl'd itself as a
611 * child_subreaper for its children (like a service manager) 611 * child_subreaper for its children (like a service manager)
612 * 3. give them to the init process (PID 1) in our pid namespace 612 * 3. give them to the init process (PID 1) in our pid namespace
613 */ 613 */
614 static struct task_struct *find_new_reaper(struct task_struct *father) 614 static struct task_struct *find_new_reaper(struct task_struct *father)
615 __releases(&tasklist_lock) 615 __releases(&tasklist_lock)
616 __acquires(&tasklist_lock) 616 __acquires(&tasklist_lock)
617 { 617 {
618 struct pid_namespace *pid_ns = task_active_pid_ns(father); 618 struct pid_namespace *pid_ns = task_active_pid_ns(father);
619 struct task_struct *thread; 619 struct task_struct *thread;
620 620
621 thread = father; 621 thread = father;
622 while_each_thread(father, thread) { 622 while_each_thread(father, thread) {
623 if (thread->flags & PF_EXITING) 623 if (thread->flags & PF_EXITING)
624 continue; 624 continue;
625 if (unlikely(pid_ns->child_reaper == father)) 625 if (unlikely(pid_ns->child_reaper == father))
626 pid_ns->child_reaper = thread; 626 pid_ns->child_reaper = thread;
627 return thread; 627 return thread;
628 } 628 }
629 629
630 if (unlikely(pid_ns->child_reaper == father)) { 630 if (unlikely(pid_ns->child_reaper == father)) {
631 write_unlock_irq(&tasklist_lock); 631 write_unlock_irq(&tasklist_lock);
632 if (unlikely(pid_ns == &init_pid_ns)) { 632 if (unlikely(pid_ns == &init_pid_ns)) {
633 panic("Attempted to kill init! exitcode=0x%08x\n", 633 panic("Attempted to kill init! exitcode=0x%08x\n",
634 father->signal->group_exit_code ?: 634 father->signal->group_exit_code ?:
635 father->exit_code); 635 father->exit_code);
636 } 636 }
637 637
638 zap_pid_ns_processes(pid_ns); 638 zap_pid_ns_processes(pid_ns);
639 write_lock_irq(&tasklist_lock); 639 write_lock_irq(&tasklist_lock);
640 } else if (father->signal->has_child_subreaper) { 640 } else if (father->signal->has_child_subreaper) {
641 struct task_struct *reaper; 641 struct task_struct *reaper;
642 642
643 /* 643 /*
644 * Find the first ancestor marked as child_subreaper. 644 * Find the first ancestor marked as child_subreaper.
645 * Note that the code below checks same_thread_group(reaper, 645 * Note that the code below checks same_thread_group(reaper,
646 * pid_ns->child_reaper). This is what we need to DTRT in a 646 * pid_ns->child_reaper). This is what we need to DTRT in a
647 * PID namespace. However we still need the check above, see 647 * PID namespace. However we still need the check above, see
648 * http://marc.info/?l=linux-kernel&m=131385460420380 648 * http://marc.info/?l=linux-kernel&m=131385460420380
649 */ 649 */
650 for (reaper = father->real_parent; 650 for (reaper = father->real_parent;
651 reaper != &init_task; 651 reaper != &init_task;
652 reaper = reaper->real_parent) { 652 reaper = reaper->real_parent) {
653 if (same_thread_group(reaper, pid_ns->child_reaper)) 653 if (same_thread_group(reaper, pid_ns->child_reaper))
654 break; 654 break;
655 if (!reaper->signal->is_child_subreaper) 655 if (!reaper->signal->is_child_subreaper)
656 continue; 656 continue;
657 thread = reaper; 657 thread = reaper;
658 do { 658 do {
659 if (!(thread->flags & PF_EXITING)) 659 if (!(thread->flags & PF_EXITING))
660 return reaper; 660 return reaper;
661 } while_each_thread(reaper, thread); 661 } while_each_thread(reaper, thread);
662 } 662 }
663 } 663 }
664 664
665 return pid_ns->child_reaper; 665 return pid_ns->child_reaper;
666 } 666 }
667 667
668 /* 668 /*
669 * Any that need to be release_task'd are put on the @dead list. 669 * Any that need to be release_task'd are put on the @dead list.
670 */ 670 */
671 static void reparent_leader(struct task_struct *father, struct task_struct *p, 671 static void reparent_leader(struct task_struct *father, struct task_struct *p,
672 struct list_head *dead) 672 struct list_head *dead)
673 { 673 {
674 list_move_tail(&p->sibling, &p->real_parent->children); 674 list_move_tail(&p->sibling, &p->real_parent->children);
675 675
676 if (p->exit_state == EXIT_DEAD) 676 if (p->exit_state == EXIT_DEAD)
677 return; 677 return;
678 /* 678 /*
679 * If this is a threaded reparent there is no need to 679 * If this is a threaded reparent there is no need to
680 * notify anyone anything has happened. 680 * notify anyone anything has happened.
681 */ 681 */
682 if (same_thread_group(p->real_parent, father)) 682 if (same_thread_group(p->real_parent, father))
683 return; 683 return;
684 684
685 /* We don't want people slaying init. */ 685 /* We don't want people slaying init. */
686 p->exit_signal = SIGCHLD; 686 p->exit_signal = SIGCHLD;
687 687
688 /* If it has exited notify the new parent about this child's death. */ 688 /* If it has exited notify the new parent about this child's death. */
689 if (!p->ptrace && 689 if (!p->ptrace &&
690 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 690 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
691 if (do_notify_parent(p, p->exit_signal)) { 691 if (do_notify_parent(p, p->exit_signal)) {
692 p->exit_state = EXIT_DEAD; 692 p->exit_state = EXIT_DEAD;
693 list_move_tail(&p->sibling, dead); 693 list_move_tail(&p->sibling, dead);
694 } 694 }
695 } 695 }
696 696
697 kill_orphaned_pgrp(p, father); 697 kill_orphaned_pgrp(p, father);
698 } 698 }
699 699
700 static void forget_original_parent(struct task_struct *father) 700 static void forget_original_parent(struct task_struct *father)
701 { 701 {
702 struct task_struct *p, *n, *reaper; 702 struct task_struct *p, *n, *reaper;
703 LIST_HEAD(dead_children); 703 LIST_HEAD(dead_children);
704 704
705 write_lock_irq(&tasklist_lock); 705 write_lock_irq(&tasklist_lock);
706 /* 706 /*
707 * Note that exit_ptrace() and find_new_reaper() might 707 * Note that exit_ptrace() and find_new_reaper() might
708 * drop tasklist_lock and reacquire it. 708 * drop tasklist_lock and reacquire it.
709 */ 709 */
710 exit_ptrace(father); 710 exit_ptrace(father);
711 reaper = find_new_reaper(father); 711 reaper = find_new_reaper(father);
712 712
713 list_for_each_entry_safe(p, n, &father->children, sibling) { 713 list_for_each_entry_safe(p, n, &father->children, sibling) {
714 struct task_struct *t = p; 714 struct task_struct *t = p;
715 do { 715 do {
716 t->real_parent = reaper; 716 t->real_parent = reaper;
717 if (t->parent == father) { 717 if (t->parent == father) {
718 BUG_ON(t->ptrace); 718 BUG_ON(t->ptrace);
719 t->parent = t->real_parent; 719 t->parent = t->real_parent;
720 } 720 }
721 if (t->pdeath_signal) 721 if (t->pdeath_signal)
722 group_send_sig_info(t->pdeath_signal, 722 group_send_sig_info(t->pdeath_signal,
723 SEND_SIG_NOINFO, t); 723 SEND_SIG_NOINFO, t);
724 } while_each_thread(p, t); 724 } while_each_thread(p, t);
725 reparent_leader(father, p, &dead_children); 725 reparent_leader(father, p, &dead_children);
726 } 726 }
727 write_unlock_irq(&tasklist_lock); 727 write_unlock_irq(&tasklist_lock);
728 728
729 BUG_ON(!list_empty(&father->children)); 729 BUG_ON(!list_empty(&father->children));
730 730
731 list_for_each_entry_safe(p, n, &dead_children, sibling) { 731 list_for_each_entry_safe(p, n, &dead_children, sibling) {
732 list_del_init(&p->sibling); 732 list_del_init(&p->sibling);
733 release_task(p); 733 release_task(p);
734 } 734 }
735 } 735 }
736 736
737 /* 737 /*
738 * Send signals to all our closest relatives so that they know 738 * Send signals to all our closest relatives so that they know
739 * to properly mourn us.. 739 * to properly mourn us..
740 */ 740 */
741 static void exit_notify(struct task_struct *tsk, int group_dead) 741 static void exit_notify(struct task_struct *tsk, int group_dead)
742 { 742 {
743 bool autoreap; 743 bool autoreap;
744 744
745 /* 745 /*
746 * This does two things: 746 * This does two things:
747 * 747 *
748 * A. Make init inherit all the child processes 748 * A. Make init inherit all the child processes
749 * B. Check to see if any process groups have become orphaned 749 * B. Check to see if any process groups have become orphaned
750 * as a result of our exiting, and if they have any stopped 750 * as a result of our exiting, and if they have any stopped
751 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 751 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
752 */ 752 */
753 forget_original_parent(tsk); 753 forget_original_parent(tsk);
754 exit_task_namespaces(tsk); 754 exit_task_namespaces(tsk);
755 755
756 write_lock_irq(&tasklist_lock); 756 write_lock_irq(&tasklist_lock);
757 if (group_dead) 757 if (group_dead)
758 kill_orphaned_pgrp(tsk->group_leader, NULL); 758 kill_orphaned_pgrp(tsk->group_leader, NULL);
759 759
760 if (unlikely(tsk->ptrace)) { 760 if (unlikely(tsk->ptrace)) {
761 int sig = thread_group_leader(tsk) && 761 int sig = thread_group_leader(tsk) &&
762 thread_group_empty(tsk) && 762 thread_group_empty(tsk) &&
763 !ptrace_reparented(tsk) ? 763 !ptrace_reparented(tsk) ?
764 tsk->exit_signal : SIGCHLD; 764 tsk->exit_signal : SIGCHLD;
765 autoreap = do_notify_parent(tsk, sig); 765 autoreap = do_notify_parent(tsk, sig);
766 } else if (thread_group_leader(tsk)) { 766 } else if (thread_group_leader(tsk)) {
767 autoreap = thread_group_empty(tsk) && 767 autoreap = thread_group_empty(tsk) &&
768 do_notify_parent(tsk, tsk->exit_signal); 768 do_notify_parent(tsk, tsk->exit_signal);
769 } else { 769 } else {
770 autoreap = true; 770 autoreap = true;
771 } 771 }
772 772
773 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; 773 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
774 774
775 /* mt-exec, de_thread() is waiting for group leader */ 775 /* mt-exec, de_thread() is waiting for group leader */
776 if (unlikely(tsk->signal->notify_count < 0)) 776 if (unlikely(tsk->signal->notify_count < 0))
777 wake_up_process(tsk->signal->group_exit_task); 777 wake_up_process(tsk->signal->group_exit_task);
778 write_unlock_irq(&tasklist_lock); 778 write_unlock_irq(&tasklist_lock);
779 779
780 /* If the process is dead, release it - nobody will wait for it */ 780 /* If the process is dead, release it - nobody will wait for it */
781 if (autoreap) 781 if (autoreap)
782 release_task(tsk); 782 release_task(tsk);
783 } 783 }
784 784
785 #ifdef CONFIG_DEBUG_STACK_USAGE 785 #ifdef CONFIG_DEBUG_STACK_USAGE
786 static void check_stack_usage(void) 786 static void check_stack_usage(void)
787 { 787 {
788 static DEFINE_SPINLOCK(low_water_lock); 788 static DEFINE_SPINLOCK(low_water_lock);
789 static int lowest_to_date = THREAD_SIZE; 789 static int lowest_to_date = THREAD_SIZE;
790 unsigned long free; 790 unsigned long free;
791 791
792 free = stack_not_used(current); 792 free = stack_not_used(current);
793 793
794 if (free >= lowest_to_date) 794 if (free >= lowest_to_date)
795 return; 795 return;
796 796
797 spin_lock(&low_water_lock); 797 spin_lock(&low_water_lock);
798 if (free < lowest_to_date) { 798 if (free < lowest_to_date) {
799 printk(KERN_WARNING "%s (%d) used greatest stack depth: " 799 printk(KERN_WARNING "%s (%d) used greatest stack depth: "
800 "%lu bytes left\n", 800 "%lu bytes left\n",
801 current->comm, task_pid_nr(current), free); 801 current->comm, task_pid_nr(current), free);
802 lowest_to_date = free; 802 lowest_to_date = free;
803 } 803 }
804 spin_unlock(&low_water_lock); 804 spin_unlock(&low_water_lock);
805 } 805 }
806 #else 806 #else
807 static inline void check_stack_usage(void) {} 807 static inline void check_stack_usage(void) {}
808 #endif 808 #endif
809 809
810 void do_exit(long code) 810 void do_exit(long code)
811 { 811 {
812 struct task_struct *tsk = current; 812 struct task_struct *tsk = current;
813 int group_dead; 813 int group_dead;
814 814
815 profile_task_exit(tsk); 815 profile_task_exit(tsk);
816 816
817 WARN_ON(blk_needs_flush_plug(tsk)); 817 WARN_ON(blk_needs_flush_plug(tsk));
818 818
819 if (unlikely(in_interrupt())) 819 if (unlikely(in_interrupt()))
820 panic("Aiee, killing interrupt handler!"); 820 panic("Aiee, killing interrupt handler!");
821 if (unlikely(!tsk->pid)) 821 if (unlikely(!tsk->pid))
822 panic("Attempted to kill the idle task!"); 822 panic("Attempted to kill the idle task!");
823 823
824 /* 824 /*
825 * If do_exit is called because this process oopsed, it's possible 825 * If do_exit is called because this process oopsed, it's possible
826 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before 826 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
827 * continuing. Amongst other possible reasons, this is to prevent 827 * continuing. Amongst other possible reasons, this is to prevent
828 * mm_release()->clear_child_tid() from writing to a user-controlled 828 * mm_release()->clear_child_tid() from writing to a user-controlled
829 * kernel address. 829 * kernel address.
830 */ 830 */
831 set_fs(USER_DS); 831 set_fs(USER_DS);
832 832
833 ptrace_event(PTRACE_EVENT_EXIT, code); 833 ptrace_event(PTRACE_EVENT_EXIT, code);
834 834
835 validate_creds_for_do_exit(tsk); 835 validate_creds_for_do_exit(tsk);
836 836
837 /* 837 /*
838 * We're taking recursive faults here in do_exit. Safest is to just 838 * We're taking recursive faults here in do_exit. Safest is to just
839 * leave this task alone and wait for reboot. 839 * leave this task alone and wait for reboot.
840 */ 840 */
841 if (unlikely(tsk->flags & PF_EXITING)) { 841 if (unlikely(tsk->flags & PF_EXITING)) {
842 printk(KERN_ALERT 842 printk(KERN_ALERT
843 "Fixing recursive fault but reboot is needed!\n"); 843 "Fixing recursive fault but reboot is needed!\n");
844 /* 844 /*
845 * We can do this unlocked here. The futex code uses 845 * We can do this unlocked here. The futex code uses
846 * this flag just to verify whether the pi state 846 * this flag just to verify whether the pi state
847 * cleanup has been done or not. In the worst case it 847 * cleanup has been done or not. In the worst case it
848 * loops once more. We pretend that the cleanup was 848 * loops once more. We pretend that the cleanup was
849 * done as there is no way to return. Either the 849 * done as there is no way to return. Either the
850 * OWNER_DIED bit is set by now or we push the blocked 850 * OWNER_DIED bit is set by now or we push the blocked
851 * task into the wait-forever nirvana as well. 851 * task into the wait-forever nirvana as well.
852 */ 852 */
853 tsk->flags |= PF_EXITPIDONE; 853 tsk->flags |= PF_EXITPIDONE;
854 set_current_state(TASK_UNINTERRUPTIBLE); 854 set_current_state(TASK_UNINTERRUPTIBLE);
855 schedule(); 855 schedule();
856 } 856 }
857 857
858 exit_signals(tsk); /* sets PF_EXITING */ 858 exit_signals(tsk); /* sets PF_EXITING */
859 /* 859 /*
860 * tsk->flags are checked in the futex code to protect against 860 * tsk->flags are checked in the futex code to protect against
861 * an exiting task cleaning up the robust pi futexes. 861 * an exiting task cleaning up the robust pi futexes.
862 */ 862 */
863 smp_mb(); 863 smp_mb();
864 raw_spin_unlock_wait(&tsk->pi_lock); 864 raw_spin_unlock_wait(&tsk->pi_lock);
865 865
866 if (unlikely(in_atomic())) 866 if (unlikely(in_atomic()))
867 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 867 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
868 current->comm, task_pid_nr(current), 868 current->comm, task_pid_nr(current),
869 preempt_count()); 869 preempt_count());
870 870
871 acct_update_integrals(tsk); 871 acct_update_integrals(tsk);
872 /* sync mm's RSS info before statistics gathering */ 872 /* sync mm's RSS info before statistics gathering */
873 if (tsk->mm) 873 if (tsk->mm)
874 sync_mm_rss(tsk->mm); 874 sync_mm_rss(tsk->mm);
875 group_dead = atomic_dec_and_test(&tsk->signal->live); 875 group_dead = atomic_dec_and_test(&tsk->signal->live);
876 if (group_dead) { 876 if (group_dead) {
877 hrtimer_cancel(&tsk->signal->real_timer); 877 hrtimer_cancel(&tsk->signal->real_timer);
878 exit_itimers(tsk->signal); 878 exit_itimers(tsk->signal);
879 if (tsk->mm) 879 if (tsk->mm)
880 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); 880 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
881 } 881 }
882 acct_collect(code, group_dead); 882 acct_collect(code, group_dead);
883 if (group_dead) 883 if (group_dead)
884 tty_audit_exit(); 884 tty_audit_exit();
885 audit_free(tsk); 885 audit_free(tsk);
886 886
887 tsk->exit_code = code; 887 tsk->exit_code = code;
888 taskstats_exit(tsk, group_dead); 888 taskstats_exit(tsk, group_dead);
889 889
890 exit_mm(tsk); 890 exit_mm(tsk);
891 891
892 if (group_dead) 892 if (group_dead)
893 acct_process(); 893 acct_process();
894 trace_sched_process_exit(tsk); 894 trace_sched_process_exit(tsk);
895 895
896 exit_sem(tsk); 896 exit_sem(tsk);
897 exit_shm(tsk); 897 exit_shm(tsk);
898 exit_files(tsk); 898 exit_files(tsk);
899 exit_fs(tsk); 899 exit_fs(tsk);
900 exit_task_work(tsk); 900 exit_task_work(tsk);
901 check_stack_usage(); 901 check_stack_usage();
902 exit_thread(); 902 exit_thread();
903 903
904 /* 904 /*
905 * Flush inherited counters to the parent - before the parent 905 * Flush inherited counters to the parent - before the parent
906 * gets woken up by child-exit notifications. 906 * gets woken up by child-exit notifications.
907 * 907 *
908 * because of cgroup mode, must be called before cgroup_exit() 908 * because of cgroup mode, must be called before cgroup_exit()
909 */ 909 */
910 perf_event_exit_task(tsk); 910 perf_event_exit_task(tsk);
911 911
912 cgroup_exit(tsk, 1); 912 cgroup_exit(tsk, 1);
913 913
914 if (group_dead) 914 if (group_dead)
915 disassociate_ctty(1); 915 disassociate_ctty(1);
916 916
917 module_put(task_thread_info(tsk)->exec_domain->module); 917 module_put(task_thread_info(tsk)->exec_domain->module);
918 918
919 proc_exit_connector(tsk); 919 proc_exit_connector(tsk);
920 920
921 /* 921 /*
922 * FIXME: do that only when needed, using sched_exit tracepoint 922 * FIXME: do that only when needed, using sched_exit tracepoint
923 */ 923 */
924 ptrace_put_breakpoints(tsk); 924 ptrace_put_breakpoints(tsk);
925 925
926 exit_notify(tsk, group_dead); 926 exit_notify(tsk, group_dead);
927 #ifdef CONFIG_NUMA 927 #ifdef CONFIG_NUMA
928 task_lock(tsk); 928 task_lock(tsk);
929 mpol_put(tsk->mempolicy); 929 mpol_put(tsk->mempolicy);
930 tsk->mempolicy = NULL; 930 tsk->mempolicy = NULL;
931 task_unlock(tsk); 931 task_unlock(tsk);
932 #endif 932 #endif
933 #ifdef CONFIG_FUTEX 933 #ifdef CONFIG_FUTEX
934 if (unlikely(current->pi_state_cache)) 934 if (unlikely(current->pi_state_cache))
935 kfree(current->pi_state_cache); 935 kfree(current->pi_state_cache);
936 #endif 936 #endif
937 /* 937 /*
938 * Make sure we are holding no locks: 938 * Make sure we are holding no locks:
939 */ 939 */
940 debug_check_no_locks_held(tsk); 940 debug_check_no_locks_held(tsk);
941 /* 941 /*
942 * We can do this unlocked here. The futex code uses this flag 942 * We can do this unlocked here. The futex code uses this flag
943 * just to verify whether the pi state cleanup has been done 943 * just to verify whether the pi state cleanup has been done
944 * or not. In the worst case it loops once more. 944 * or not. In the worst case it loops once more.
945 */ 945 */
946 tsk->flags |= PF_EXITPIDONE; 946 tsk->flags |= PF_EXITPIDONE;
947 947
948 if (tsk->io_context) 948 if (tsk->io_context)
949 exit_io_context(tsk); 949 exit_io_context(tsk);
950 950
951 if (tsk->splice_pipe) 951 if (tsk->splice_pipe)
952 __free_pipe_info(tsk->splice_pipe); 952 __free_pipe_info(tsk->splice_pipe);
953 953
954 if (tsk->task_frag.page) 954 if (tsk->task_frag.page)
955 put_page(tsk->task_frag.page); 955 put_page(tsk->task_frag.page);
956 956
957 validate_creds_for_do_exit(tsk); 957 validate_creds_for_do_exit(tsk);
958 958
959 preempt_disable(); 959 preempt_disable();
960 if (tsk->nr_dirtied) 960 if (tsk->nr_dirtied)
961 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 961 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
962 exit_rcu(); 962 exit_rcu();
963 963
964 /* 964 /*
965 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed 965 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
966 * when the following two conditions become true. 966 * when the following two conditions become true.
967 * - There is a race condition on mmap_sem (it is acquired by 967 * - There is a race condition on mmap_sem (it is acquired by
968 * exit_mm()), and 968 * exit_mm()), and
969 * - SMI occurs before setting TASK_RUNNING. 969 * - SMI occurs before setting TASK_RUNNING.
970 * (or the hypervisor of a virtual machine switches to another guest) 970 * (or the hypervisor of a virtual machine switches to another guest)
971 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD 971 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
972 * 972 *
973 * To avoid this, we have to wait for tsk->pi_lock, which is 973 * To avoid this, we have to wait for tsk->pi_lock, which is
974 * held by try_to_wake_up(), to be released. 974 * held by try_to_wake_up(), to be released.
975 */ 975 */
976 smp_mb(); 976 smp_mb();
977 raw_spin_unlock_wait(&tsk->pi_lock); 977 raw_spin_unlock_wait(&tsk->pi_lock);
978 978
979 /* causes final put_task_struct in finish_task_switch(). */ 979 /* causes final put_task_struct in finish_task_switch(). */
980 tsk->state = TASK_DEAD; 980 tsk->state = TASK_DEAD;
981 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ 981 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
982 schedule(); 982 schedule();
983 BUG(); 983 BUG();
984 /* Avoid "noreturn function does return". */ 984 /* Avoid "noreturn function does return". */
985 for (;;) 985 for (;;)
986 cpu_relax(); /* For when BUG is null */ 986 cpu_relax(); /* For when BUG is null */
987 } 987 }
988 988
989 EXPORT_SYMBOL_GPL(do_exit); 989 EXPORT_SYMBOL_GPL(do_exit);
990 990
991 void complete_and_exit(struct completion *comp, long code) 991 void complete_and_exit(struct completion *comp, long code)
992 { 992 {
993 if (comp) 993 if (comp)
994 complete(comp); 994 complete(comp);
995 995
996 do_exit(code); 996 do_exit(code);
997 } 997 }
998 998
999 EXPORT_SYMBOL(complete_and_exit); 999 EXPORT_SYMBOL(complete_and_exit);
1000 1000
1001 SYSCALL_DEFINE1(exit, int, error_code) 1001 SYSCALL_DEFINE1(exit, int, error_code)
1002 { 1002 {
1003 do_exit((error_code&0xff)<<8); 1003 do_exit((error_code&0xff)<<8);
1004 } 1004 }
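The "(error_code & 0xff) << 8" packing above is exactly what the standard wait-status macros undo on the other side. A minimal user-space sketch (not part of this diff; it assumes the usual <sys/wait.h> macros):

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0)
		_exit(300);		/* only the low byte survives: 300 & 0xff == 44 */

	waitpid(pid, &status, 0);
	if (WIFEXITED(status))		/* normal exit: low 7 bits of status are 0 */
		printf("exit status: %d\n", WEXITSTATUS(status));	/* prints 44 */
	return 0;
}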
1005 1005
1006 /* 1006 /*
1007 * Take down every thread in the group. This is called by fatal signals 1007 * Take down every thread in the group. This is called by fatal signals
1008 * as well as by sys_exit_group (below). 1008 * as well as by sys_exit_group (below).
1009 */ 1009 */
1010 void 1010 void
1011 do_group_exit(int exit_code) 1011 do_group_exit(int exit_code)
1012 { 1012 {
1013 struct signal_struct *sig = current->signal; 1013 struct signal_struct *sig = current->signal;
1014 1014
1015 BUG_ON(exit_code & 0x80); /* core dumps don't get here */ 1015 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1016 1016
1017 if (signal_group_exit(sig)) 1017 if (signal_group_exit(sig))
1018 exit_code = sig->group_exit_code; 1018 exit_code = sig->group_exit_code;
1019 else if (!thread_group_empty(current)) { 1019 else if (!thread_group_empty(current)) {
1020 struct sighand_struct *const sighand = current->sighand; 1020 struct sighand_struct *const sighand = current->sighand;
1021 spin_lock_irq(&sighand->siglock); 1021 spin_lock_irq(&sighand->siglock);
1022 if (signal_group_exit(sig)) 1022 if (signal_group_exit(sig))
1023 /* Another thread got here before we took the lock. */ 1023 /* Another thread got here before we took the lock. */
1024 exit_code = sig->group_exit_code; 1024 exit_code = sig->group_exit_code;
1025 else { 1025 else {
1026 sig->group_exit_code = exit_code; 1026 sig->group_exit_code = exit_code;
1027 sig->flags = SIGNAL_GROUP_EXIT; 1027 sig->flags = SIGNAL_GROUP_EXIT;
1028 zap_other_threads(current); 1028 zap_other_threads(current);
1029 } 1029 }
1030 spin_unlock_irq(&sighand->siglock); 1030 spin_unlock_irq(&sighand->siglock);
1031 } 1031 }
1032 1032
1033 do_exit(exit_code); 1033 do_exit(exit_code);
1034 /* NOTREACHED */ 1034 /* NOTREACHED */
1035 } 1035 }
1036 1036
1037 /* 1037 /*
1038 * this kills every thread in the thread group. Note that any externally 1038 * this kills every thread in the thread group. Note that any externally
1039 * wait4()-ing process will get the correct exit code - even if this 1039 * wait4()-ing process will get the correct exit code - even if this
1040 * thread is not the thread group leader. 1040 * thread is not the thread group leader.
1041 */ 1041 */
1042 SYSCALL_DEFINE1(exit_group, int, error_code) 1042 SYSCALL_DEFINE1(exit_group, int, error_code)
1043 { 1043 {
1044 do_group_exit((error_code & 0xff) << 8); 1044 do_group_exit((error_code & 0xff) << 8);
1045 /* NOTREACHED */ 1045 /* NOTREACHED */
1046 return 0; 1046 return 0;
1047 } 1047 }
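Since glibc's exit() ends up in sys_exit_group(), a call from any thread takes the whole group down with one shared exit code. A hedged user-space illustration (assumes POSIX threads, build with -pthread; not part of this diff):

#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

static void *worker(void *arg)
{
	(void)arg;
	sleep(1);
	exit(3);		/* libc exit() -> exit_group(): zaps every thread */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	for (;;)
		pause();	/* main never returns; a wait4()-ing parent sees exit code 3 */
}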
1048 1048
1049 struct wait_opts { 1049 struct wait_opts {
1050 enum pid_type wo_type; 1050 enum pid_type wo_type;
1051 int wo_flags; 1051 int wo_flags;
1052 struct pid *wo_pid; 1052 struct pid *wo_pid;
1053 1053
1054 struct siginfo __user *wo_info; 1054 struct siginfo __user *wo_info;
1055 int __user *wo_stat; 1055 int __user *wo_stat;
1056 struct rusage __user *wo_rusage; 1056 struct rusage __user *wo_rusage;
1057 1057
1058 wait_queue_t child_wait; 1058 wait_queue_t child_wait;
1059 int notask_error; 1059 int notask_error;
1060 }; 1060 };
1061 1061
1062 static inline 1062 static inline
1063 struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1063 struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1064 { 1064 {
1065 if (type != PIDTYPE_PID) 1065 if (type != PIDTYPE_PID)
1066 task = task->group_leader; 1066 task = task->group_leader;
1067 return task->pids[type].pid; 1067 return task->pids[type].pid;
1068 } 1068 }
1069 1069
1070 static int eligible_pid(struct wait_opts *wo, struct task_struct *p) 1070 static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1071 { 1071 {
1072 return wo->wo_type == PIDTYPE_MAX || 1072 return wo->wo_type == PIDTYPE_MAX ||
1073 task_pid_type(p, wo->wo_type) == wo->wo_pid; 1073 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1074 } 1074 }
1075 1075
1076 static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1076 static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1077 { 1077 {
1078 if (!eligible_pid(wo, p)) 1078 if (!eligible_pid(wo, p))
1079 return 0; 1079 return 0;
1080 /* Wait for all children (clone and not) if __WALL is set; 1080 /* Wait for all children (clone and not) if __WALL is set;
1081 * otherwise, wait for clone children *only* if __WCLONE is 1081 * otherwise, wait for clone children *only* if __WCLONE is
1082 * set; otherwise, wait for non-clone children *only*. (Note: 1082 * set; otherwise, wait for non-clone children *only*. (Note:
1083 * A "clone" child here is one that reports to its parent 1083 * A "clone" child here is one that reports to its parent
1084 * using a signal other than SIGCHLD.) */ 1084 * using a signal other than SIGCHLD.) */
1085 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) 1085 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1086 && !(wo->wo_flags & __WALL)) 1086 && !(wo->wo_flags & __WALL))
1087 return 0; 1087 return 0;
1088 1088
1089 return 1; 1089 return 1;
1090 } 1090 }
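The XOR test above can be hard to read at a glance. An equivalent stand-alone restatement (a sketch only; the __WCLONE/__WALL values are the ones from the Linux UAPI wait headers):

#include <stdbool.h>
#include <signal.h>

#define __WCLONE	0x80000000	/* wait for "clone" children only */
#define __WALL		0x40000000	/* wait for any child */

/* A child is eligible when __WALL is set, or when "reports with a signal
 * other than SIGCHLD" matches "caller passed __WCLONE". */
static bool wait_flags_match(int child_exit_signal, unsigned int wo_flags)
{
	if (wo_flags & __WALL)
		return true;
	if (wo_flags & __WCLONE)
		return child_exit_signal != SIGCHLD;
	return child_exit_signal == SIGCHLD;
}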
1091 1091
1092 static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, 1092 static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1093 pid_t pid, uid_t uid, int why, int status) 1093 pid_t pid, uid_t uid, int why, int status)
1094 { 1094 {
1095 struct siginfo __user *infop; 1095 struct siginfo __user *infop;
1096 int retval = wo->wo_rusage 1096 int retval = wo->wo_rusage
1097 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1097 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1098 1098
1099 put_task_struct(p); 1099 put_task_struct(p);
1100 infop = wo->wo_info; 1100 infop = wo->wo_info;
1101 if (infop) { 1101 if (infop) {
1102 if (!retval) 1102 if (!retval)
1103 retval = put_user(SIGCHLD, &infop->si_signo); 1103 retval = put_user(SIGCHLD, &infop->si_signo);
1104 if (!retval) 1104 if (!retval)
1105 retval = put_user(0, &infop->si_errno); 1105 retval = put_user(0, &infop->si_errno);
1106 if (!retval) 1106 if (!retval)
1107 retval = put_user((short)why, &infop->si_code); 1107 retval = put_user((short)why, &infop->si_code);
1108 if (!retval) 1108 if (!retval)
1109 retval = put_user(pid, &infop->si_pid); 1109 retval = put_user(pid, &infop->si_pid);
1110 if (!retval) 1110 if (!retval)
1111 retval = put_user(uid, &infop->si_uid); 1111 retval = put_user(uid, &infop->si_uid);
1112 if (!retval) 1112 if (!retval)
1113 retval = put_user(status, &infop->si_status); 1113 retval = put_user(status, &infop->si_status);
1114 } 1114 }
1115 if (!retval) 1115 if (!retval)
1116 retval = pid; 1116 retval = pid;
1117 return retval; 1117 return retval;
1118 } 1118 }
1119 1119
1120 /* 1120 /*
1121 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold 1121 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1122 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1122 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1123 * the lock and this task is uninteresting. If we return nonzero, we have 1123 * the lock and this task is uninteresting. If we return nonzero, we have
1124 * released the lock and the system call should return. 1124 * released the lock and the system call should return.
1125 */ 1125 */
1126 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 1126 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1127 { 1127 {
1128 unsigned long state; 1128 unsigned long state;
1129 int retval, status, traced; 1129 int retval, status, traced;
1130 pid_t pid = task_pid_vnr(p); 1130 pid_t pid = task_pid_vnr(p);
1131 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1131 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1132 struct siginfo __user *infop; 1132 struct siginfo __user *infop;
1133 1133
1134 if (!likely(wo->wo_flags & WEXITED)) 1134 if (!likely(wo->wo_flags & WEXITED))
1135 return 0; 1135 return 0;
1136 1136
1137 if (unlikely(wo->wo_flags & WNOWAIT)) { 1137 if (unlikely(wo->wo_flags & WNOWAIT)) {
1138 int exit_code = p->exit_code; 1138 int exit_code = p->exit_code;
1139 int why; 1139 int why;
1140 1140
1141 get_task_struct(p); 1141 get_task_struct(p);
1142 read_unlock(&tasklist_lock); 1142 read_unlock(&tasklist_lock);
1143 if ((exit_code & 0x7f) == 0) { 1143 if ((exit_code & 0x7f) == 0) {
1144 why = CLD_EXITED; 1144 why = CLD_EXITED;
1145 status = exit_code >> 8; 1145 status = exit_code >> 8;
1146 } else { 1146 } else {
1147 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1147 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1148 status = exit_code & 0x7f; 1148 status = exit_code & 0x7f;
1149 } 1149 }
1150 return wait_noreap_copyout(wo, p, pid, uid, why, status); 1150 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1151 } 1151 }
1152 1152
1153 /* 1153 /*
1154 * Try to move the task's state to DEAD 1154 * Try to move the task's state to DEAD
1155 * only one thread is allowed to do this: 1155 * only one thread is allowed to do this:
1156 */ 1156 */
1157 state = xchg(&p->exit_state, EXIT_DEAD); 1157 state = xchg(&p->exit_state, EXIT_DEAD);
1158 if (state != EXIT_ZOMBIE) { 1158 if (state != EXIT_ZOMBIE) {
1159 BUG_ON(state != EXIT_DEAD); 1159 BUG_ON(state != EXIT_DEAD);
1160 return 0; 1160 return 0;
1161 } 1161 }
1162 1162
1163 traced = ptrace_reparented(p); 1163 traced = ptrace_reparented(p);
1164 /* 1164 /*
1165 * It can be ptraced but not reparented, check 1165 * It can be ptraced but not reparented, check
1166 * thread_group_leader() to filter out sub-threads. 1166 * thread_group_leader() to filter out sub-threads.
1167 */ 1167 */
1168 if (likely(!traced) && thread_group_leader(p)) { 1168 if (likely(!traced) && thread_group_leader(p)) {
1169 struct signal_struct *psig; 1169 struct signal_struct *psig;
1170 struct signal_struct *sig; 1170 struct signal_struct *sig;
1171 unsigned long maxrss; 1171 unsigned long maxrss;
1172 cputime_t tgutime, tgstime; 1172 cputime_t tgutime, tgstime;
1173 1173
1174 /* 1174 /*
1175 * The resource counters for the group leader are in its 1175 * The resource counters for the group leader are in its
1176 * own task_struct. Those for dead threads in the group 1176 * own task_struct. Those for dead threads in the group
1177 * are in its signal_struct, as are those for the child 1177 * are in its signal_struct, as are those for the child
1178 * processes it has previously reaped. All these 1178 * processes it has previously reaped. All these
1179 * accumulate in the parent's signal_struct c* fields. 1179 * accumulate in the parent's signal_struct c* fields.
1180 * 1180 *
1181 * We don't bother to take a lock here to protect these 1181 * We don't bother to take a lock here to protect these
1182 * p->signal fields, because they are only touched by 1182 * p->signal fields, because they are only touched by
1183 * __exit_signal, which runs with tasklist_lock 1183 * __exit_signal, which runs with tasklist_lock
1184 * write-locked anyway, and so is excluded here. We do 1184 * write-locked anyway, and so is excluded here. We do
1185 * need to protect the access to parent->signal fields, 1185 * need to protect the access to parent->signal fields,
1186 * as other threads in the parent group can be right 1186 * as other threads in the parent group can be right
1187 * here reaping other children at the same time. 1187 * here reaping other children at the same time.
1188 * 1188 *
1189 * We use thread_group_times() to get times for the thread 1189 * We use thread_group_cputime_adjusted() to get times for the thread
1190 * group, which consolidates times for all threads in the 1190 * group, which consolidates times for all threads in the
1191 * group including the group leader. 1191 * group including the group leader.
1192 */ 1192 */
1193 thread_group_times(p, &tgutime, &tgstime); 1193 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1194 spin_lock_irq(&p->real_parent->sighand->siglock); 1194 spin_lock_irq(&p->real_parent->sighand->siglock);
1195 psig = p->real_parent->signal; 1195 psig = p->real_parent->signal;
1196 sig = p->signal; 1196 sig = p->signal;
1197 psig->cutime += tgutime + sig->cutime; 1197 psig->cutime += tgutime + sig->cutime;
1198 psig->cstime += tgstime + sig->cstime; 1198 psig->cstime += tgstime + sig->cstime;
1199 psig->cgtime += p->gtime + sig->gtime + sig->cgtime; 1199 psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
1200 psig->cmin_flt += 1200 psig->cmin_flt +=
1201 p->min_flt + sig->min_flt + sig->cmin_flt; 1201 p->min_flt + sig->min_flt + sig->cmin_flt;
1202 psig->cmaj_flt += 1202 psig->cmaj_flt +=
1203 p->maj_flt + sig->maj_flt + sig->cmaj_flt; 1203 p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1204 psig->cnvcsw += 1204 psig->cnvcsw +=
1205 p->nvcsw + sig->nvcsw + sig->cnvcsw; 1205 p->nvcsw + sig->nvcsw + sig->cnvcsw;
1206 psig->cnivcsw += 1206 psig->cnivcsw +=
1207 p->nivcsw + sig->nivcsw + sig->cnivcsw; 1207 p->nivcsw + sig->nivcsw + sig->cnivcsw;
1208 psig->cinblock += 1208 psig->cinblock +=
1209 task_io_get_inblock(p) + 1209 task_io_get_inblock(p) +
1210 sig->inblock + sig->cinblock; 1210 sig->inblock + sig->cinblock;
1211 psig->coublock += 1211 psig->coublock +=
1212 task_io_get_oublock(p) + 1212 task_io_get_oublock(p) +
1213 sig->oublock + sig->coublock; 1213 sig->oublock + sig->coublock;
1214 maxrss = max(sig->maxrss, sig->cmaxrss); 1214 maxrss = max(sig->maxrss, sig->cmaxrss);
1215 if (psig->cmaxrss < maxrss) 1215 if (psig->cmaxrss < maxrss)
1216 psig->cmaxrss = maxrss; 1216 psig->cmaxrss = maxrss;
1217 task_io_accounting_add(&psig->ioac, &p->ioac); 1217 task_io_accounting_add(&psig->ioac, &p->ioac);
1218 task_io_accounting_add(&psig->ioac, &sig->ioac); 1218 task_io_accounting_add(&psig->ioac, &sig->ioac);
1219 spin_unlock_irq(&p->real_parent->sighand->siglock); 1219 spin_unlock_irq(&p->real_parent->sighand->siglock);
1220 } 1220 }
1221 1221
1222 /* 1222 /*
1223 * Now we are sure this task is interesting, and no other 1223 * Now we are sure this task is interesting, and no other
1224 * thread can reap it because we set its state to EXIT_DEAD. 1224 * thread can reap it because we set its state to EXIT_DEAD.
1225 */ 1225 */
1226 read_unlock(&tasklist_lock); 1226 read_unlock(&tasklist_lock);
1227 1227
1228 retval = wo->wo_rusage 1228 retval = wo->wo_rusage
1229 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1229 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1230 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1230 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1231 ? p->signal->group_exit_code : p->exit_code; 1231 ? p->signal->group_exit_code : p->exit_code;
1232 if (!retval && wo->wo_stat) 1232 if (!retval && wo->wo_stat)
1233 retval = put_user(status, wo->wo_stat); 1233 retval = put_user(status, wo->wo_stat);
1234 1234
1235 infop = wo->wo_info; 1235 infop = wo->wo_info;
1236 if (!retval && infop) 1236 if (!retval && infop)
1237 retval = put_user(SIGCHLD, &infop->si_signo); 1237 retval = put_user(SIGCHLD, &infop->si_signo);
1238 if (!retval && infop) 1238 if (!retval && infop)
1239 retval = put_user(0, &infop->si_errno); 1239 retval = put_user(0, &infop->si_errno);
1240 if (!retval && infop) { 1240 if (!retval && infop) {
1241 int why; 1241 int why;
1242 1242
1243 if ((status & 0x7f) == 0) { 1243 if ((status & 0x7f) == 0) {
1244 why = CLD_EXITED; 1244 why = CLD_EXITED;
1245 status >>= 8; 1245 status >>= 8;
1246 } else { 1246 } else {
1247 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; 1247 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1248 status &= 0x7f; 1248 status &= 0x7f;
1249 } 1249 }
1250 retval = put_user((short)why, &infop->si_code); 1250 retval = put_user((short)why, &infop->si_code);
1251 if (!retval) 1251 if (!retval)
1252 retval = put_user(status, &infop->si_status); 1252 retval = put_user(status, &infop->si_status);
1253 } 1253 }
1254 if (!retval && infop) 1254 if (!retval && infop)
1255 retval = put_user(pid, &infop->si_pid); 1255 retval = put_user(pid, &infop->si_pid);
1256 if (!retval && infop) 1256 if (!retval && infop)
1257 retval = put_user(uid, &infop->si_uid); 1257 retval = put_user(uid, &infop->si_uid);
1258 if (!retval) 1258 if (!retval)
1259 retval = pid; 1259 retval = pid;
1260 1260
1261 if (traced) { 1261 if (traced) {
1262 write_lock_irq(&tasklist_lock); 1262 write_lock_irq(&tasklist_lock);
1263 /* We dropped tasklist, ptracer could die and untrace */ 1263 /* We dropped tasklist, ptracer could die and untrace */
1264 ptrace_unlink(p); 1264 ptrace_unlink(p);
1265 /* 1265 /*
1266 * If this is not a sub-thread, notify the parent. 1266 * If this is not a sub-thread, notify the parent.
1267 * If parent wants a zombie, don't release it now. 1267 * If parent wants a zombie, don't release it now.
1268 */ 1268 */
1269 if (thread_group_leader(p) && 1269 if (thread_group_leader(p) &&
1270 !do_notify_parent(p, p->exit_signal)) { 1270 !do_notify_parent(p, p->exit_signal)) {
1271 p->exit_state = EXIT_ZOMBIE; 1271 p->exit_state = EXIT_ZOMBIE;
1272 p = NULL; 1272 p = NULL;
1273 } 1273 }
1274 write_unlock_irq(&tasklist_lock); 1274 write_unlock_irq(&tasklist_lock);
1275 } 1275 }
1276 if (p != NULL) 1276 if (p != NULL)
1277 release_task(p); 1277 release_task(p);
1278 1278
1279 return retval; 1279 return retval;
1280 } 1280 }
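The tgutime/tgstime that thread_group_cputime_adjusted() returns above are folded into the parent's cutime/cstime while the zombie is reaped, which is what RUSAGE_CHILDREN later reports. A small user-space sketch of that round trip (assumptions: standard getrusage()/waitpid(); not part of this diff):

#include <stdio.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	struct rusage ru;
	pid_t pid = fork();

	if (pid == 0) {
		for (volatile long i = 0; i < 200000000L; i++)
			;			/* burn some user CPU time */
		_exit(0);
	}

	waitpid(pid, &status, 0);		/* reaping folds the child's times into cutime/cstime */
	getrusage(RUSAGE_CHILDREN, &ru);	/* reads back those accumulated totals */
	printf("children utime: %ld.%06ld s\n",
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
	return 0;
}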
1281 1281
1282 static int *task_stopped_code(struct task_struct *p, bool ptrace) 1282 static int *task_stopped_code(struct task_struct *p, bool ptrace)
1283 { 1283 {
1284 if (ptrace) { 1284 if (ptrace) {
1285 if (task_is_stopped_or_traced(p) && 1285 if (task_is_stopped_or_traced(p) &&
1286 !(p->jobctl & JOBCTL_LISTENING)) 1286 !(p->jobctl & JOBCTL_LISTENING))
1287 return &p->exit_code; 1287 return &p->exit_code;
1288 } else { 1288 } else {
1289 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1289 if (p->signal->flags & SIGNAL_STOP_STOPPED)
1290 return &p->signal->group_exit_code; 1290 return &p->signal->group_exit_code;
1291 } 1291 }
1292 return NULL; 1292 return NULL;
1293 } 1293 }
1294 1294
1295 /** 1295 /**
1296 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED 1296 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1297 * @wo: wait options 1297 * @wo: wait options
1298 * @ptrace: is the wait for ptrace 1298 * @ptrace: is the wait for ptrace
1299 * @p: task to wait for 1299 * @p: task to wait for
1300 * 1300 *
1301 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. 1301 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1302 * 1302 *
1303 * CONTEXT: 1303 * CONTEXT:
1304 * read_lock(&tasklist_lock), which is released if return value is 1304 * read_lock(&tasklist_lock), which is released if return value is
1305 * non-zero. Also, grabs and releases @p->sighand->siglock. 1305 * non-zero. Also, grabs and releases @p->sighand->siglock.
1306 * 1306 *
1307 * RETURNS: 1307 * RETURNS:
1308 * 0 if wait condition didn't exist and search for other wait conditions 1308 * 0 if wait condition didn't exist and search for other wait conditions
1309 * should continue. Non-zero return, -errno on failure and @p's pid on 1309 * should continue. Non-zero return, -errno on failure and @p's pid on
1310 * success, implies that tasklist_lock is released and wait condition 1310 * success, implies that tasklist_lock is released and wait condition
1311 * search should terminate. 1311 * search should terminate.
1312 */ 1312 */
1313 static int wait_task_stopped(struct wait_opts *wo, 1313 static int wait_task_stopped(struct wait_opts *wo,
1314 int ptrace, struct task_struct *p) 1314 int ptrace, struct task_struct *p)
1315 { 1315 {
1316 struct siginfo __user *infop; 1316 struct siginfo __user *infop;
1317 int retval, exit_code, *p_code, why; 1317 int retval, exit_code, *p_code, why;
1318 uid_t uid = 0; /* unneeded, required by compiler */ 1318 uid_t uid = 0; /* unneeded, required by compiler */
1319 pid_t pid; 1319 pid_t pid;
1320 1320
1321 /* 1321 /*
1322 * Traditionally we see ptrace'd stopped tasks regardless of options. 1322 * Traditionally we see ptrace'd stopped tasks regardless of options.
1323 */ 1323 */
1324 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1324 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1325 return 0; 1325 return 0;
1326 1326
1327 if (!task_stopped_code(p, ptrace)) 1327 if (!task_stopped_code(p, ptrace))
1328 return 0; 1328 return 0;
1329 1329
1330 exit_code = 0; 1330 exit_code = 0;
1331 spin_lock_irq(&p->sighand->siglock); 1331 spin_lock_irq(&p->sighand->siglock);
1332 1332
1333 p_code = task_stopped_code(p, ptrace); 1333 p_code = task_stopped_code(p, ptrace);
1334 if (unlikely(!p_code)) 1334 if (unlikely(!p_code))
1335 goto unlock_sig; 1335 goto unlock_sig;
1336 1336
1337 exit_code = *p_code; 1337 exit_code = *p_code;
1338 if (!exit_code) 1338 if (!exit_code)
1339 goto unlock_sig; 1339 goto unlock_sig;
1340 1340
1341 if (!unlikely(wo->wo_flags & WNOWAIT)) 1341 if (!unlikely(wo->wo_flags & WNOWAIT))
1342 *p_code = 0; 1342 *p_code = 0;
1343 1343
1344 uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1344 uid = from_kuid_munged(current_user_ns(), task_uid(p));
1345 unlock_sig: 1345 unlock_sig:
1346 spin_unlock_irq(&p->sighand->siglock); 1346 spin_unlock_irq(&p->sighand->siglock);
1347 if (!exit_code) 1347 if (!exit_code)
1348 return 0; 1348 return 0;
1349 1349
1350 /* 1350 /*
1351 * Now we are pretty sure this task is interesting. 1351 * Now we are pretty sure this task is interesting.
1352 * Make sure it doesn't get reaped out from under us while we 1352 * Make sure it doesn't get reaped out from under us while we
1353 * give up the lock and then examine it below. We don't want to 1353 * give up the lock and then examine it below. We don't want to
1354 * keep holding onto the tasklist_lock while we call getrusage and 1354 * keep holding onto the tasklist_lock while we call getrusage and
1355 * possibly take page faults for user memory. 1355 * possibly take page faults for user memory.
1356 */ 1356 */
1357 get_task_struct(p); 1357 get_task_struct(p);
1358 pid = task_pid_vnr(p); 1358 pid = task_pid_vnr(p);
1359 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1359 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1360 read_unlock(&tasklist_lock); 1360 read_unlock(&tasklist_lock);
1361 1361
1362 if (unlikely(wo->wo_flags & WNOWAIT)) 1362 if (unlikely(wo->wo_flags & WNOWAIT))
1363 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1363 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1364 1364
1365 retval = wo->wo_rusage 1365 retval = wo->wo_rusage
1366 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1366 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1367 if (!retval && wo->wo_stat) 1367 if (!retval && wo->wo_stat)
1368 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); 1368 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1369 1369
1370 infop = wo->wo_info; 1370 infop = wo->wo_info;
1371 if (!retval && infop) 1371 if (!retval && infop)
1372 retval = put_user(SIGCHLD, &infop->si_signo); 1372 retval = put_user(SIGCHLD, &infop->si_signo);
1373 if (!retval && infop) 1373 if (!retval && infop)
1374 retval = put_user(0, &infop->si_errno); 1374 retval = put_user(0, &infop->si_errno);
1375 if (!retval && infop) 1375 if (!retval && infop)
1376 retval = put_user((short)why, &infop->si_code); 1376 retval = put_user((short)why, &infop->si_code);
1377 if (!retval && infop) 1377 if (!retval && infop)
1378 retval = put_user(exit_code, &infop->si_status); 1378 retval = put_user(exit_code, &infop->si_status);
1379 if (!retval && infop) 1379 if (!retval && infop)
1380 retval = put_user(pid, &infop->si_pid); 1380 retval = put_user(pid, &infop->si_pid);
1381 if (!retval && infop) 1381 if (!retval && infop)
1382 retval = put_user(uid, &infop->si_uid); 1382 retval = put_user(uid, &infop->si_uid);
1383 if (!retval) 1383 if (!retval)
1384 retval = pid; 1384 retval = pid;
1385 put_task_struct(p); 1385 put_task_struct(p);
1386 1386
1387 BUG_ON(!retval); 1387 BUG_ON(!retval);
1388 return retval; 1388 return retval;
1389 } 1389 }
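The "(exit_code << 8) | 0x7f" value stored above is what WIFSTOPPED()/WSTOPSIG() decode. A hedged user-space sketch (assumes the usual <sys/wait.h> macros; not part of this diff):

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0) {
		raise(SIGSTOP);			/* stop ourselves */
		_exit(0);
	}

	waitpid(pid, &status, WUNTRACED);	/* sees the stop: (SIGSTOP << 8) | 0x7f */
	if (WIFSTOPPED(status))			/* true iff the low byte is 0x7f */
		printf("stopped by signal %d\n", WSTOPSIG(status));

	kill(pid, SIGCONT);
	waitpid(pid, &status, 0);		/* reap the eventual exit */
	return 0;
}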
1390 1390
1391 /* 1391 /*
1392 * Handle do_wait work for one task in a live, non-stopped state. 1392 * Handle do_wait work for one task in a live, non-stopped state.
1393 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1393 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1394 * the lock and this task is uninteresting. If we return nonzero, we have 1394 * the lock and this task is uninteresting. If we return nonzero, we have
1395 * released the lock and the system call should return. 1395 * released the lock and the system call should return.
1396 */ 1396 */
1397 static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) 1397 static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1398 { 1398 {
1399 int retval; 1399 int retval;
1400 pid_t pid; 1400 pid_t pid;
1401 uid_t uid; 1401 uid_t uid;
1402 1402
1403 if (!unlikely(wo->wo_flags & WCONTINUED)) 1403 if (!unlikely(wo->wo_flags & WCONTINUED))
1404 return 0; 1404 return 0;
1405 1405
1406 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1406 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1407 return 0; 1407 return 0;
1408 1408
1409 spin_lock_irq(&p->sighand->siglock); 1409 spin_lock_irq(&p->sighand->siglock);
1410 /* Re-check with the lock held. */ 1410 /* Re-check with the lock held. */
1411 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { 1411 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1412 spin_unlock_irq(&p->sighand->siglock); 1412 spin_unlock_irq(&p->sighand->siglock);
1413 return 0; 1413 return 0;
1414 } 1414 }
1415 if (!unlikely(wo->wo_flags & WNOWAIT)) 1415 if (!unlikely(wo->wo_flags & WNOWAIT))
1416 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1416 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1417 uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1417 uid = from_kuid_munged(current_user_ns(), task_uid(p));
1418 spin_unlock_irq(&p->sighand->siglock); 1418 spin_unlock_irq(&p->sighand->siglock);
1419 1419
1420 pid = task_pid_vnr(p); 1420 pid = task_pid_vnr(p);
1421 get_task_struct(p); 1421 get_task_struct(p);
1422 read_unlock(&tasklist_lock); 1422 read_unlock(&tasklist_lock);
1423 1423
1424 if (!wo->wo_info) { 1424 if (!wo->wo_info) {
1425 retval = wo->wo_rusage 1425 retval = wo->wo_rusage
1426 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1426 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1427 put_task_struct(p); 1427 put_task_struct(p);
1428 if (!retval && wo->wo_stat) 1428 if (!retval && wo->wo_stat)
1429 retval = put_user(0xffff, wo->wo_stat); 1429 retval = put_user(0xffff, wo->wo_stat);
1430 if (!retval) 1430 if (!retval)
1431 retval = pid; 1431 retval = pid;
1432 } else { 1432 } else {
1433 retval = wait_noreap_copyout(wo, p, pid, uid, 1433 retval = wait_noreap_copyout(wo, p, pid, uid,
1434 CLD_CONTINUED, SIGCONT); 1434 CLD_CONTINUED, SIGCONT);
1435 BUG_ON(retval == 0); 1435 BUG_ON(retval == 0);
1436 } 1436 }
1437 1437
1438 return retval; 1438 return retval;
1439 } 1439 }
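The 0xffff written to wo_stat above is the value WIFCONTINUED() tests for. A sketch of the continued-event path from user space (assumptions: standard waitpid()/kill(); not part of this diff):

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0) {
		raise(SIGSTOP);		/* stop until the parent resumes us */
		pause();		/* then just block */
		_exit(0);
	}

	waitpid(pid, &status, WUNTRACED);	/* consume the stop event first */
	kill(pid, SIGCONT);
	waitpid(pid, &status, WCONTINUED);	/* status becomes 0xffff */
	if (WIFCONTINUED(status))
		printf("child %d was continued\n", (int)pid);

	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);		/* finally reap it */
	return 0;
}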
1440 1440
1441 /* 1441 /*
1442 * Consider @p for a wait by @parent. 1442 * Consider @p for a wait by @parent.
1443 * 1443 *
1444 * -ECHILD should be in ->notask_error before the first call. 1444 * -ECHILD should be in ->notask_error before the first call.
1445 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1445 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1446 * Returns zero if the search for a child should continue; 1446 * Returns zero if the search for a child should continue;
1447 * then ->notask_error is 0 if @p is an eligible child, 1447 * then ->notask_error is 0 if @p is an eligible child,
1448 * or another error from security_task_wait(), or still -ECHILD. 1448 * or another error from security_task_wait(), or still -ECHILD.
1449 */ 1449 */
1450 static int wait_consider_task(struct wait_opts *wo, int ptrace, 1450 static int wait_consider_task(struct wait_opts *wo, int ptrace,
1451 struct task_struct *p) 1451 struct task_struct *p)
1452 { 1452 {
1453 int ret = eligible_child(wo, p); 1453 int ret = eligible_child(wo, p);
1454 if (!ret) 1454 if (!ret)
1455 return ret; 1455 return ret;
1456 1456
1457 ret = security_task_wait(p); 1457 ret = security_task_wait(p);
1458 if (unlikely(ret < 0)) { 1458 if (unlikely(ret < 0)) {
1459 /* 1459 /*
1460 * If we have not yet seen any eligible child, 1460 * If we have not yet seen any eligible child,
1461 * then let this error code replace -ECHILD. 1461 * then let this error code replace -ECHILD.
1462 * A permission error will give the user a clue 1462 * A permission error will give the user a clue
1463 * to look for security policy problems, rather 1463 * to look for security policy problems, rather
1464 * than for mysterious wait bugs. 1464 * than for mysterious wait bugs.
1465 */ 1465 */
1466 if (wo->notask_error) 1466 if (wo->notask_error)
1467 wo->notask_error = ret; 1467 wo->notask_error = ret;
1468 return 0; 1468 return 0;
1469 } 1469 }
1470 1470
1471 /* dead body doesn't have much to contribute */ 1471 /* dead body doesn't have much to contribute */
1472 if (unlikely(p->exit_state == EXIT_DEAD)) { 1472 if (unlikely(p->exit_state == EXIT_DEAD)) {
1473 /* 1473 /*
1474 * But do not ignore this task until the tracer does 1474 * But do not ignore this task until the tracer does
1475 * wait_task_zombie()->do_notify_parent(). 1475 * wait_task_zombie()->do_notify_parent().
1476 */ 1476 */
1477 if (likely(!ptrace) && unlikely(ptrace_reparented(p))) 1477 if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
1478 wo->notask_error = 0; 1478 wo->notask_error = 0;
1479 return 0; 1479 return 0;
1480 } 1480 }
1481 1481
1482 /* slay zombie? */ 1482 /* slay zombie? */
1483 if (p->exit_state == EXIT_ZOMBIE) { 1483 if (p->exit_state == EXIT_ZOMBIE) {
1484 /* 1484 /*
1485 * A zombie ptracee is only visible to its ptracer. 1485 * A zombie ptracee is only visible to its ptracer.
1486 * Notification and reaping will be cascaded to the real 1486 * Notification and reaping will be cascaded to the real
1487 * parent when the ptracer detaches. 1487 * parent when the ptracer detaches.
1488 */ 1488 */
1489 if (likely(!ptrace) && unlikely(p->ptrace)) { 1489 if (likely(!ptrace) && unlikely(p->ptrace)) {
1490 /* it will become visible, clear notask_error */ 1490 /* it will become visible, clear notask_error */
1491 wo->notask_error = 0; 1491 wo->notask_error = 0;
1492 return 0; 1492 return 0;
1493 } 1493 }
1494 1494
1495 /* we don't reap group leaders with subthreads */ 1495 /* we don't reap group leaders with subthreads */
1496 if (!delay_group_leader(p)) 1496 if (!delay_group_leader(p))
1497 return wait_task_zombie(wo, p); 1497 return wait_task_zombie(wo, p);
1498 1498
1499 /* 1499 /*
1500 * Allow access to stopped/continued state via zombie by 1500 * Allow access to stopped/continued state via zombie by
1501 * falling through. Clearing of notask_error is complex. 1501 * falling through. Clearing of notask_error is complex.
1502 * 1502 *
1503 * When !@ptrace: 1503 * When !@ptrace:
1504 * 1504 *
1505 * If WEXITED is set, notask_error should naturally be 1505 * If WEXITED is set, notask_error should naturally be
1506 * cleared. If not, subset of WSTOPPED|WCONTINUED is set, 1506 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1507 * so, if there are live subthreads, there are events to 1507 * so, if there are live subthreads, there are events to
1508 * wait for. If all subthreads are dead, it's still safe 1508 * wait for. If all subthreads are dead, it's still safe
1509 * to clear - this function will be called again in a finite 1509 * to clear - this function will be called again in a finite
1510 * amount of time once all the subthreads are released and 1510 * amount of time once all the subthreads are released and
1511 * will then return without clearing. 1511 * will then return without clearing.
1512 * 1512 *
1513 * When @ptrace: 1513 * When @ptrace:
1514 * 1514 *
1515 * Stopped state is per-task and thus can't change once the 1515 * Stopped state is per-task and thus can't change once the
1516 * target task dies. Only continued and exited can happen. 1516 * target task dies. Only continued and exited can happen.
1517 * Clear notask_error if WCONTINUED | WEXITED. 1517 * Clear notask_error if WCONTINUED | WEXITED.
1518 */ 1518 */
1519 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) 1519 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1520 wo->notask_error = 0; 1520 wo->notask_error = 0;
1521 } else { 1521 } else {
1522 /* 1522 /*
1523 * If @p is ptraced by a task in its real parent's group, 1523 * If @p is ptraced by a task in its real parent's group,
1524 * hide group stop/continued state when looking at @p as 1524 * hide group stop/continued state when looking at @p as
1525 * the real parent; otherwise, a single stop can be 1525 * the real parent; otherwise, a single stop can be
1526 * reported twice as group and ptrace stops. 1526 * reported twice as group and ptrace stops.
1527 * 1527 *
1528 * If a ptracer wants to distinguish the two events for its 1528 * If a ptracer wants to distinguish the two events for its
1529 * own children, it should create a separate process which 1529 * own children, it should create a separate process which
1530 * takes the role of real parent. 1530 * takes the role of real parent.
1531 */ 1531 */
1532 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) 1532 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
1533 return 0; 1533 return 0;
1534 1534
1535 /* 1535 /*
1536 * @p is alive and it's gonna stop, continue or exit, so 1536 * @p is alive and it's gonna stop, continue or exit, so
1537 * there always is something to wait for. 1537 * there always is something to wait for.
1538 */ 1538 */
1539 wo->notask_error = 0; 1539 wo->notask_error = 0;
1540 } 1540 }
1541 1541
1542 /* 1542 /*
1543 * Wait for stopped. Depending on @ptrace, different stopped state 1543 * Wait for stopped. Depending on @ptrace, different stopped state
1544 * is used and the two don't interact with each other. 1544 * is used and the two don't interact with each other.
1545 */ 1545 */
1546 ret = wait_task_stopped(wo, ptrace, p); 1546 ret = wait_task_stopped(wo, ptrace, p);
1547 if (ret) 1547 if (ret)
1548 return ret; 1548 return ret;
1549 1549
1550 /* 1550 /*
1551 * Wait for continued. There's only one continued state and the 1551 * Wait for continued. There's only one continued state and the
1552 * ptracer can consume it which can confuse the real parent. Don't 1552 * ptracer can consume it which can confuse the real parent. Don't
1553 * use WCONTINUED from ptracer. You don't need or want it. 1553 * use WCONTINUED from ptracer. You don't need or want it.
1554 */ 1554 */
1555 return wait_task_continued(wo, p); 1555 return wait_task_continued(wo, p);
1556 } 1556 }
1557 1557
1558 /* 1558 /*
1559 * Do the work of do_wait() for one thread in the group, @tsk. 1559 * Do the work of do_wait() for one thread in the group, @tsk.
1560 * 1560 *
1561 * -ECHILD should be in ->notask_error before the first call. 1561 * -ECHILD should be in ->notask_error before the first call.
1562 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1562 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1563 * Returns zero if the search for a child should continue; then 1563 * Returns zero if the search for a child should continue; then
1564 * ->notask_error is 0 if there were any eligible children, 1564 * ->notask_error is 0 if there were any eligible children,
1565 * or another error from security_task_wait(), or still -ECHILD. 1565 * or another error from security_task_wait(), or still -ECHILD.
1566 */ 1566 */
1567 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) 1567 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1568 { 1568 {
1569 struct task_struct *p; 1569 struct task_struct *p;
1570 1570
1571 list_for_each_entry(p, &tsk->children, sibling) { 1571 list_for_each_entry(p, &tsk->children, sibling) {
1572 int ret = wait_consider_task(wo, 0, p); 1572 int ret = wait_consider_task(wo, 0, p);
1573 if (ret) 1573 if (ret)
1574 return ret; 1574 return ret;
1575 } 1575 }
1576 1576
1577 return 0; 1577 return 0;
1578 } 1578 }
1579 1579
1580 static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) 1580 static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1581 { 1581 {
1582 struct task_struct *p; 1582 struct task_struct *p;
1583 1583
1584 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1584 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1585 int ret = wait_consider_task(wo, 1, p); 1585 int ret = wait_consider_task(wo, 1, p);
1586 if (ret) 1586 if (ret)
1587 return ret; 1587 return ret;
1588 } 1588 }
1589 1589
1590 return 0; 1590 return 0;
1591 } 1591 }
1592 1592
1593 static int child_wait_callback(wait_queue_t *wait, unsigned mode, 1593 static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1594 int sync, void *key) 1594 int sync, void *key)
1595 { 1595 {
1596 struct wait_opts *wo = container_of(wait, struct wait_opts, 1596 struct wait_opts *wo = container_of(wait, struct wait_opts,
1597 child_wait); 1597 child_wait);
1598 struct task_struct *p = key; 1598 struct task_struct *p = key;
1599 1599
1600 if (!eligible_pid(wo, p)) 1600 if (!eligible_pid(wo, p))
1601 return 0; 1601 return 0;
1602 1602
1603 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) 1603 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1604 return 0; 1604 return 0;
1605 1605
1606 return default_wake_function(wait, mode, sync, key); 1606 return default_wake_function(wait, mode, sync, key);
1607 } 1607 }
1608 1608
1609 void __wake_up_parent(struct task_struct *p, struct task_struct *parent) 1609 void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1610 { 1610 {
1611 __wake_up_sync_key(&parent->signal->wait_chldexit, 1611 __wake_up_sync_key(&parent->signal->wait_chldexit,
1612 TASK_INTERRUPTIBLE, 1, p); 1612 TASK_INTERRUPTIBLE, 1, p);
1613 } 1613 }
1614 1614
1615 static long do_wait(struct wait_opts *wo) 1615 static long do_wait(struct wait_opts *wo)
1616 { 1616 {
1617 struct task_struct *tsk; 1617 struct task_struct *tsk;
1618 int retval; 1618 int retval;
1619 1619
1620 trace_sched_process_wait(wo->wo_pid); 1620 trace_sched_process_wait(wo->wo_pid);
1621 1621
1622 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); 1622 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1623 wo->child_wait.private = current; 1623 wo->child_wait.private = current;
1624 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); 1624 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1625 repeat: 1625 repeat:
1626 /* 1626 /*
1627 * If there is nothing that can match our criteria, just get out. 1627 * If there is nothing that can match our criteria, just get out.
1628 * We will clear ->notask_error to zero if we see any child that 1628 * We will clear ->notask_error to zero if we see any child that
1629 * might later match our criteria, even if we are not able to reap 1629 * might later match our criteria, even if we are not able to reap
1630 * it yet. 1630 * it yet.
1631 */ 1631 */
1632 wo->notask_error = -ECHILD; 1632 wo->notask_error = -ECHILD;
1633 if ((wo->wo_type < PIDTYPE_MAX) && 1633 if ((wo->wo_type < PIDTYPE_MAX) &&
1634 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) 1634 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1635 goto notask; 1635 goto notask;
1636 1636
1637 set_current_state(TASK_INTERRUPTIBLE); 1637 set_current_state(TASK_INTERRUPTIBLE);
1638 read_lock(&tasklist_lock); 1638 read_lock(&tasklist_lock);
1639 tsk = current; 1639 tsk = current;
1640 do { 1640 do {
1641 retval = do_wait_thread(wo, tsk); 1641 retval = do_wait_thread(wo, tsk);
1642 if (retval) 1642 if (retval)
1643 goto end; 1643 goto end;
1644 1644
1645 retval = ptrace_do_wait(wo, tsk); 1645 retval = ptrace_do_wait(wo, tsk);
1646 if (retval) 1646 if (retval)
1647 goto end; 1647 goto end;
1648 1648
1649 if (wo->wo_flags & __WNOTHREAD) 1649 if (wo->wo_flags & __WNOTHREAD)
1650 break; 1650 break;
1651 } while_each_thread(current, tsk); 1651 } while_each_thread(current, tsk);
1652 read_unlock(&tasklist_lock); 1652 read_unlock(&tasklist_lock);
1653 1653
1654 notask: 1654 notask:
1655 retval = wo->notask_error; 1655 retval = wo->notask_error;
1656 if (!retval && !(wo->wo_flags & WNOHANG)) { 1656 if (!retval && !(wo->wo_flags & WNOHANG)) {
1657 retval = -ERESTARTSYS; 1657 retval = -ERESTARTSYS;
1658 if (!signal_pending(current)) { 1658 if (!signal_pending(current)) {
1659 schedule(); 1659 schedule();
1660 goto repeat; 1660 goto repeat;
1661 } 1661 }
1662 } 1662 }
1663 end: 1663 end:
1664 __set_current_state(TASK_RUNNING); 1664 __set_current_state(TASK_RUNNING);
1665 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); 1665 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1666 return retval; 1666 return retval;
1667 } 1667 }
1668 1668
1669 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1669 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1670 infop, int, options, struct rusage __user *, ru) 1670 infop, int, options, struct rusage __user *, ru)
1671 { 1671 {
1672 struct wait_opts wo; 1672 struct wait_opts wo;
1673 struct pid *pid = NULL; 1673 struct pid *pid = NULL;
1674 enum pid_type type; 1674 enum pid_type type;
1675 long ret; 1675 long ret;
1676 1676
1677 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) 1677 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
1678 return -EINVAL; 1678 return -EINVAL;
1679 if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) 1679 if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1680 return -EINVAL; 1680 return -EINVAL;
1681 1681
1682 switch (which) { 1682 switch (which) {
1683 case P_ALL: 1683 case P_ALL:
1684 type = PIDTYPE_MAX; 1684 type = PIDTYPE_MAX;
1685 break; 1685 break;
1686 case P_PID: 1686 case P_PID:
1687 type = PIDTYPE_PID; 1687 type = PIDTYPE_PID;
1688 if (upid <= 0) 1688 if (upid <= 0)
1689 return -EINVAL; 1689 return -EINVAL;
1690 break; 1690 break;
1691 case P_PGID: 1691 case P_PGID:
1692 type = PIDTYPE_PGID; 1692 type = PIDTYPE_PGID;
1693 if (upid <= 0) 1693 if (upid <= 0)
1694 return -EINVAL; 1694 return -EINVAL;
1695 break; 1695 break;
1696 default: 1696 default:
1697 return -EINVAL; 1697 return -EINVAL;
1698 } 1698 }
1699 1699
1700 if (type < PIDTYPE_MAX) 1700 if (type < PIDTYPE_MAX)
1701 pid = find_get_pid(upid); 1701 pid = find_get_pid(upid);
1702 1702
1703 wo.wo_type = type; 1703 wo.wo_type = type;
1704 wo.wo_pid = pid; 1704 wo.wo_pid = pid;
1705 wo.wo_flags = options; 1705 wo.wo_flags = options;
1706 wo.wo_info = infop; 1706 wo.wo_info = infop;
1707 wo.wo_stat = NULL; 1707 wo.wo_stat = NULL;
1708 wo.wo_rusage = ru; 1708 wo.wo_rusage = ru;
1709 ret = do_wait(&wo); 1709 ret = do_wait(&wo);
1710 1710
1711 if (ret > 0) { 1711 if (ret > 0) {
1712 ret = 0; 1712 ret = 0;
1713 } else if (infop) { 1713 } else if (infop) {
1714 /* 1714 /*
1715 * For a WNOHANG return, clear out all the fields 1715 * For a WNOHANG return, clear out all the fields
1716 * we would set so the user can easily tell the 1716 * we would set so the user can easily tell the
1717 * difference. 1717 * difference.
1718 */ 1718 */
1719 if (!ret) 1719 if (!ret)
1720 ret = put_user(0, &infop->si_signo); 1720 ret = put_user(0, &infop->si_signo);
1721 if (!ret) 1721 if (!ret)
1722 ret = put_user(0, &infop->si_errno); 1722 ret = put_user(0, &infop->si_errno);
1723 if (!ret) 1723 if (!ret)
1724 ret = put_user(0, &infop->si_code); 1724 ret = put_user(0, &infop->si_code);
1725 if (!ret) 1725 if (!ret)
1726 ret = put_user(0, &infop->si_pid); 1726 ret = put_user(0, &infop->si_pid);
1727 if (!ret) 1727 if (!ret)
1728 ret = put_user(0, &infop->si_uid); 1728 ret = put_user(0, &infop->si_uid);
1729 if (!ret) 1729 if (!ret)
1730 ret = put_user(0, &infop->si_status); 1730 ret = put_user(0, &infop->si_status);
1731 } 1731 }
1732 1732
1733 put_pid(pid); 1733 put_pid(pid);
1734 1734
1735 /* avoid REGPARM breakage on x86: */ 1735 /* avoid REGPARM breakage on x86: */
1736 asmlinkage_protect(5, ret, which, upid, infop, options, ru); 1736 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1737 return ret; 1737 return ret;
1738 } 1738 }
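The siginfo fields filled in by the zombie/stopped/continued helpers above come back through the glibc waitid() wrapper, and WNOWAIT maps to the "peek without reaping" branch of wait_task_zombie(). A small user-space sketch (not part of this diff):

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info = { 0 };
	pid_t pid = fork();

	if (pid == 0)
		_exit(7);

	/* WNOWAIT: look at the zombie without reaping it */
	if (waitid(P_PID, pid, &info, WEXITED | WNOWAIT) == 0)
		printf("peek:   pid=%d code=%d status=%d\n",
		       (int)info.si_pid, info.si_code, info.si_status);

	/* second call actually reaps; si_code is CLD_EXITED, si_status is 7 */
	waitid(P_PID, pid, &info, WEXITED);
	printf("reaped: status=%d\n", info.si_status);
	return 0;
}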
1739 1739
1740 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1740 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1741 int, options, struct rusage __user *, ru) 1741 int, options, struct rusage __user *, ru)
1742 { 1742 {
1743 struct wait_opts wo; 1743 struct wait_opts wo;
1744 struct pid *pid = NULL; 1744 struct pid *pid = NULL;
1745 enum pid_type type; 1745 enum pid_type type;
1746 long ret; 1746 long ret;
1747 1747
1748 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| 1748 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1749 __WNOTHREAD|__WCLONE|__WALL)) 1749 __WNOTHREAD|__WCLONE|__WALL))
1750 return -EINVAL; 1750 return -EINVAL;
1751 1751
1752 if (upid == -1) 1752 if (upid == -1)
1753 type = PIDTYPE_MAX; 1753 type = PIDTYPE_MAX;
1754 else if (upid < 0) { 1754 else if (upid < 0) {
1755 type = PIDTYPE_PGID; 1755 type = PIDTYPE_PGID;
1756 pid = find_get_pid(-upid); 1756 pid = find_get_pid(-upid);
1757 } else if (upid == 0) { 1757 } else if (upid == 0) {
1758 type = PIDTYPE_PGID; 1758 type = PIDTYPE_PGID;
1759 pid = get_task_pid(current, PIDTYPE_PGID); 1759 pid = get_task_pid(current, PIDTYPE_PGID);
1760 } else /* upid > 0 */ { 1760 } else /* upid > 0 */ {
1761 type = PIDTYPE_PID; 1761 type = PIDTYPE_PID;
1762 pid = find_get_pid(upid); 1762 pid = find_get_pid(upid);
1763 } 1763 }
1764 1764
1765 wo.wo_type = type; 1765 wo.wo_type = type;
1766 wo.wo_pid = pid; 1766 wo.wo_pid = pid;
1767 wo.wo_flags = options | WEXITED; 1767 wo.wo_flags = options | WEXITED;
1768 wo.wo_info = NULL; 1768 wo.wo_info = NULL;
1769 wo.wo_stat = stat_addr; 1769 wo.wo_stat = stat_addr;
1770 wo.wo_rusage = ru; 1770 wo.wo_rusage = ru;
1771 ret = do_wait(&wo); 1771 ret = do_wait(&wo);
1772 put_pid(pid); 1772 put_pid(pid);
1773 1773
1774 /* avoid REGPARM breakage on x86: */ 1774 /* avoid REGPARM breakage on x86: */
1775 asmlinkage_protect(4, ret, upid, stat_addr, options, ru); 1775 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1776 return ret; 1776 return ret;
1777 } 1777 }
1778 1778
1779 #ifdef __ARCH_WANT_SYS_WAITPID 1779 #ifdef __ARCH_WANT_SYS_WAITPID
1780 1780
1781 /* 1781 /*
1782 * sys_waitpid() remains for compatibility. waitpid() should be 1782 * sys_waitpid() remains for compatibility. waitpid() should be
1783 * implemented by calling sys_wait4() from libc.a. 1783 * implemented by calling sys_wait4() from libc.a.
1784 */ 1784 */
1785 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) 1785 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1786 { 1786 {
1787 return sys_wait4(pid, stat_addr, options, NULL); 1787 return sys_wait4(pid, stat_addr, options, NULL);
1788 } 1788 }
1789 1789
1790 #endif 1790 #endif
1791 1791
kernel/sched/cputime.c
1 #include <linux/export.h> 1 #include <linux/export.h>
2 #include <linux/sched.h> 2 #include <linux/sched.h>
3 #include <linux/tsacct_kern.h> 3 #include <linux/tsacct_kern.h>
4 #include <linux/kernel_stat.h> 4 #include <linux/kernel_stat.h>
5 #include <linux/static_key.h> 5 #include <linux/static_key.h>
6 #include "sched.h" 6 #include "sched.h"
7 7
8 8
9 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 9 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
10 10
11 /* 11 /*
12 * There are no locks covering percpu hardirq/softirq time. 12 * There are no locks covering percpu hardirq/softirq time.
13 * They are only modified in vtime_account, on corresponding CPU 13 * They are only modified in vtime_account, on corresponding CPU
14 * with interrupts disabled. So, writes are safe. 14 * with interrupts disabled. So, writes are safe.
15 * They are read and saved off onto struct rq in update_rq_clock(). 15 * They are read and saved off onto struct rq in update_rq_clock().
16 * This may result in other CPU reading this CPU's irq time and can 16 * This may result in other CPU reading this CPU's irq time and can
17 * race with irq/vtime_account on this CPU. We would either get old 17 * race with irq/vtime_account on this CPU. We would either get old
18 * or new value with a side effect of accounting a slice of irq time to wrong 18 * or new value with a side effect of accounting a slice of irq time to wrong
19 * task when irq is in progress while we read rq->clock. That is a worthy 19 * task when irq is in progress while we read rq->clock. That is a worthy
20 * compromise in place of having locks on each irq in account_system_time. 20 * compromise in place of having locks on each irq in account_system_time.
21 */ 21 */
22 DEFINE_PER_CPU(u64, cpu_hardirq_time); 22 DEFINE_PER_CPU(u64, cpu_hardirq_time);
23 DEFINE_PER_CPU(u64, cpu_softirq_time); 23 DEFINE_PER_CPU(u64, cpu_softirq_time);
24 24
25 static DEFINE_PER_CPU(u64, irq_start_time); 25 static DEFINE_PER_CPU(u64, irq_start_time);
26 static int sched_clock_irqtime; 26 static int sched_clock_irqtime;
27 27
28 void enable_sched_clock_irqtime(void) 28 void enable_sched_clock_irqtime(void)
29 { 29 {
30 sched_clock_irqtime = 1; 30 sched_clock_irqtime = 1;
31 } 31 }
32 32
33 void disable_sched_clock_irqtime(void) 33 void disable_sched_clock_irqtime(void)
34 { 34 {
35 sched_clock_irqtime = 0; 35 sched_clock_irqtime = 0;
36 } 36 }
37 37
38 #ifndef CONFIG_64BIT 38 #ifndef CONFIG_64BIT
39 DEFINE_PER_CPU(seqcount_t, irq_time_seq); 39 DEFINE_PER_CPU(seqcount_t, irq_time_seq);
40 #endif /* CONFIG_64BIT */ 40 #endif /* CONFIG_64BIT */
41 41
42 /* 42 /*
43 * Called before incrementing preempt_count on {soft,}irq_enter 43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit. 44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */ 45 */
46 void irqtime_account_irq(struct task_struct *curr) 46 void irqtime_account_irq(struct task_struct *curr)
47 { 47 {
48 unsigned long flags; 48 unsigned long flags;
49 s64 delta; 49 s64 delta;
50 int cpu; 50 int cpu;
51 51
52 if (!sched_clock_irqtime) 52 if (!sched_clock_irqtime)
53 return; 53 return;
54 54
55 local_irq_save(flags); 55 local_irq_save(flags);
56 56
57 cpu = smp_processor_id(); 57 cpu = smp_processor_id();
58 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 58 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
59 __this_cpu_add(irq_start_time, delta); 59 __this_cpu_add(irq_start_time, delta);
60 60
61 irq_time_write_begin(); 61 irq_time_write_begin();
62 /* 62 /*
63 * We do not account for softirq time from ksoftirqd here. 63 * We do not account for softirq time from ksoftirqd here.
64 * We want to continue accounting softirq time to ksoftirqd thread 64 * We want to continue accounting softirq time to ksoftirqd thread
65 * in that case, so as not to confuse the scheduler with a special task 65 * in that case, so as not to confuse the scheduler with a special task
66 * that does not consume any time, but still wants to run. 66 * that does not consume any time, but still wants to run.
67 */ 67 */
68 if (hardirq_count()) 68 if (hardirq_count())
69 __this_cpu_add(cpu_hardirq_time, delta); 69 __this_cpu_add(cpu_hardirq_time, delta);
70 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 70 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
71 __this_cpu_add(cpu_softirq_time, delta); 71 __this_cpu_add(cpu_softirq_time, delta);
72 72
73 irq_time_write_end(); 73 irq_time_write_end();
74 local_irq_restore(flags); 74 local_irq_restore(flags);
75 } 75 }
76 EXPORT_SYMBOL_GPL(irqtime_account_irq); 76 EXPORT_SYMBOL_GPL(irqtime_account_irq);
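On 32-bit kernels the per-CPU irq_time_seq declared below pairs with irq_time_write_begin()/irq_time_write_end() so readers can get a torn-free 64-bit view of cpu_hardirq_time/cpu_softirq_time. A sketch of such a reader (the kernel's own helper is not shown in this diff; the name here is illustrative):

#ifndef CONFIG_64BIT
/* Sketch only: read a CPU's accumulated irq time without tearing, retrying
 * if a writer ran concurrently. Assumes <linux/seqlock.h> and the per-CPU
 * variables declared in this file. */
static u64 irq_time_read_sketch(int cpu)
{
	unsigned int seq;
	u64 irq_time;

	do {
		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
		irq_time = per_cpu(cpu_softirq_time, cpu) +
			   per_cpu(cpu_hardirq_time, cpu);
	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));

	return irq_time;
}
#endif /* !CONFIG_64BIT */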
77 77
78 static int irqtime_account_hi_update(void) 78 static int irqtime_account_hi_update(void)
79 { 79 {
80 u64 *cpustat = kcpustat_this_cpu->cpustat; 80 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags; 81 unsigned long flags;
82 u64 latest_ns; 82 u64 latest_ns;
83 int ret = 0; 83 int ret = 0;
84 84
85 local_irq_save(flags); 85 local_irq_save(flags);
86 latest_ns = this_cpu_read(cpu_hardirq_time); 86 latest_ns = this_cpu_read(cpu_hardirq_time);
87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) 87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
88 ret = 1; 88 ret = 1;
89 local_irq_restore(flags); 89 local_irq_restore(flags);
90 return ret; 90 return ret;
91 } 91 }
92 92
93 static int irqtime_account_si_update(void) 93 static int irqtime_account_si_update(void)
94 { 94 {
95 u64 *cpustat = kcpustat_this_cpu->cpustat; 95 u64 *cpustat = kcpustat_this_cpu->cpustat;
96 unsigned long flags; 96 unsigned long flags;
97 u64 latest_ns; 97 u64 latest_ns;
98 int ret = 0; 98 int ret = 0;
99 99
100 local_irq_save(flags); 100 local_irq_save(flags);
101 latest_ns = this_cpu_read(cpu_softirq_time); 101 latest_ns = this_cpu_read(cpu_softirq_time);
102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) 102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
103 ret = 1; 103 ret = 1;
104 local_irq_restore(flags); 104 local_irq_restore(flags);
105 return ret; 105 return ret;
106 } 106 }
107 107
108 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 108 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
109 109
110 #define sched_clock_irqtime (0) 110 #define sched_clock_irqtime (0)
111 111
112 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 112 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
113 113
114 static inline void task_group_account_field(struct task_struct *p, int index, 114 static inline void task_group_account_field(struct task_struct *p, int index,
115 u64 tmp) 115 u64 tmp)
116 { 116 {
117 #ifdef CONFIG_CGROUP_CPUACCT 117 #ifdef CONFIG_CGROUP_CPUACCT
118 struct kernel_cpustat *kcpustat; 118 struct kernel_cpustat *kcpustat;
119 struct cpuacct *ca; 119 struct cpuacct *ca;
120 #endif 120 #endif
121 /* 121 /*
122 * Since all updates are sure to touch the root cgroup, we 122 * Since all updates are sure to touch the root cgroup, we
123 * get ourselves ahead and touch it first. If the root cgroup 123 * get ourselves ahead and touch it first. If the root cgroup
124 * is the only cgroup, then nothing else should be necessary. 124 * is the only cgroup, then nothing else should be necessary.
125 * 125 *
126 */ 126 */
127 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 127 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
128 128
129 #ifdef CONFIG_CGROUP_CPUACCT 129 #ifdef CONFIG_CGROUP_CPUACCT
130 if (unlikely(!cpuacct_subsys.active)) 130 if (unlikely(!cpuacct_subsys.active))
131 return; 131 return;
132 132
133 rcu_read_lock(); 133 rcu_read_lock();
134 ca = task_ca(p); 134 ca = task_ca(p);
135 while (ca && (ca != &root_cpuacct)) { 135 while (ca && (ca != &root_cpuacct)) {
136 kcpustat = this_cpu_ptr(ca->cpustat); 136 kcpustat = this_cpu_ptr(ca->cpustat);
137 kcpustat->cpustat[index] += tmp; 137 kcpustat->cpustat[index] += tmp;
138 ca = parent_ca(ca); 138 ca = parent_ca(ca);
139 } 139 }
140 rcu_read_unlock(); 140 rcu_read_unlock();
141 #endif 141 #endif
142 } 142 }
143 143
144 /* 144 /*
145 * Account user cpu time to a process. 145 * Account user cpu time to a process.
146 * @p: the process that the cpu time gets accounted to 146 * @p: the process that the cpu time gets accounted to
147 * @cputime: the cpu time spent in user space since the last update 147 * @cputime: the cpu time spent in user space since the last update
148 * @cputime_scaled: cputime scaled by cpu frequency 148 * @cputime_scaled: cputime scaled by cpu frequency
149 */ 149 */
150 void account_user_time(struct task_struct *p, cputime_t cputime, 150 void account_user_time(struct task_struct *p, cputime_t cputime,
151 cputime_t cputime_scaled) 151 cputime_t cputime_scaled)
152 { 152 {
153 int index; 153 int index;
154 154
155 /* Add user time to process. */ 155 /* Add user time to process. */
156 p->utime += cputime; 156 p->utime += cputime;
157 p->utimescaled += cputime_scaled; 157 p->utimescaled += cputime_scaled;
158 account_group_user_time(p, cputime); 158 account_group_user_time(p, cputime);
159 159
160 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 160 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
161 161
162 /* Add user time to cpustat. */ 162 /* Add user time to cpustat. */
163 task_group_account_field(p, index, (__force u64) cputime); 163 task_group_account_field(p, index, (__force u64) cputime);
164 164
165 /* Account for user time used */ 165 /* Account for user time used */
166 acct_update_integrals(p); 166 acct_update_integrals(p);
167 } 167 }
168 168
169 /* 169 /*
170 * Account guest cpu time to a process. 170 * Account guest cpu time to a process.
171 * @p: the process that the cpu time gets accounted to 171 * @p: the process that the cpu time gets accounted to
172 * @cputime: the cpu time spent in virtual machine since the last update 172 * @cputime: the cpu time spent in virtual machine since the last update
173 * @cputime_scaled: cputime scaled by cpu frequency 173 * @cputime_scaled: cputime scaled by cpu frequency
174 */ 174 */
175 static void account_guest_time(struct task_struct *p, cputime_t cputime, 175 static void account_guest_time(struct task_struct *p, cputime_t cputime,
176 cputime_t cputime_scaled) 176 cputime_t cputime_scaled)
177 { 177 {
178 u64 *cpustat = kcpustat_this_cpu->cpustat; 178 u64 *cpustat = kcpustat_this_cpu->cpustat;
179 179
180 /* Add guest time to process. */ 180 /* Add guest time to process. */
181 p->utime += cputime; 181 p->utime += cputime;
182 p->utimescaled += cputime_scaled; 182 p->utimescaled += cputime_scaled;
183 account_group_user_time(p, cputime); 183 account_group_user_time(p, cputime);
184 p->gtime += cputime; 184 p->gtime += cputime;
185 185
186 /* Add guest time to cpustat. */ 186 /* Add guest time to cpustat. */
187 if (TASK_NICE(p) > 0) { 187 if (TASK_NICE(p) > 0) {
188 cpustat[CPUTIME_NICE] += (__force u64) cputime; 188 cpustat[CPUTIME_NICE] += (__force u64) cputime;
189 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; 189 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
190 } else { 190 } else {
191 cpustat[CPUTIME_USER] += (__force u64) cputime; 191 cpustat[CPUTIME_USER] += (__force u64) cputime;
192 cpustat[CPUTIME_GUEST] += (__force u64) cputime; 192 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
193 } 193 }
194 } 194 }
195 195
196 /* 196 /*
197 * Account system cpu time to a process and desired cpustat field 197 * Account system cpu time to a process and desired cpustat field
198 * @p: the process that the cpu time gets accounted to 198 * @p: the process that the cpu time gets accounted to
199 * @cputime: the cpu time spent in kernel space since the last update 199 * @cputime: the cpu time spent in kernel space since the last update
200 * @cputime_scaled: cputime scaled by cpu frequency 200 * @cputime_scaled: cputime scaled by cpu frequency
201 * @index: index of the cpustat field that has to be updated 201 * @index: index of the cpustat field that has to be updated
202 */ 202 */
203 static inline 203 static inline
204 void __account_system_time(struct task_struct *p, cputime_t cputime, 204 void __account_system_time(struct task_struct *p, cputime_t cputime,
205 cputime_t cputime_scaled, int index) 205 cputime_t cputime_scaled, int index)
206 { 206 {
207 /* Add system time to process. */ 207 /* Add system time to process. */
208 p->stime += cputime; 208 p->stime += cputime;
209 p->stimescaled += cputime_scaled; 209 p->stimescaled += cputime_scaled;
210 account_group_system_time(p, cputime); 210 account_group_system_time(p, cputime);
211 211
212 /* Add system time to cpustat. */ 212 /* Add system time to cpustat. */
213 task_group_account_field(p, index, (__force u64) cputime); 213 task_group_account_field(p, index, (__force u64) cputime);
214 214
215 /* Account for system time used */ 215 /* Account for system time used */
216 acct_update_integrals(p); 216 acct_update_integrals(p);
217 } 217 }
218 218
219 /* 219 /*
220 * Account system cpu time to a process. 220 * Account system cpu time to a process.
221 * @p: the process that the cpu time gets accounted to 221 * @p: the process that the cpu time gets accounted to
222 * @hardirq_offset: the offset to subtract from hardirq_count() 222 * @hardirq_offset: the offset to subtract from hardirq_count()
223 * @cputime: the cpu time spent in kernel space since the last update 223 * @cputime: the cpu time spent in kernel space since the last update
224 * @cputime_scaled: cputime scaled by cpu frequency 224 * @cputime_scaled: cputime scaled by cpu frequency
225 */ 225 */
226 void account_system_time(struct task_struct *p, int hardirq_offset, 226 void account_system_time(struct task_struct *p, int hardirq_offset,
227 cputime_t cputime, cputime_t cputime_scaled) 227 cputime_t cputime, cputime_t cputime_scaled)
228 { 228 {
229 int index; 229 int index;
230 230
231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
232 account_guest_time(p, cputime, cputime_scaled); 232 account_guest_time(p, cputime, cputime_scaled);
233 return; 233 return;
234 } 234 }
235 235
236 if (hardirq_count() - hardirq_offset) 236 if (hardirq_count() - hardirq_offset)
237 index = CPUTIME_IRQ; 237 index = CPUTIME_IRQ;
238 else if (in_serving_softirq()) 238 else if (in_serving_softirq())
239 index = CPUTIME_SOFTIRQ; 239 index = CPUTIME_SOFTIRQ;
240 else 240 else
241 index = CPUTIME_SYSTEM; 241 index = CPUTIME_SYSTEM;
242 242
243 __account_system_time(p, cputime, cputime_scaled, index); 243 __account_system_time(p, cputime, cputime_scaled, index);
244 } 244 }
245 245
246 /* 246 /*
247 * Account for involuntary wait time. 247 * Account for involuntary wait time.
248 * @cputime: the cpu time spent in involuntary wait 248 * @cputime: the cpu time spent in involuntary wait
249 */ 249 */
250 void account_steal_time(cputime_t cputime) 250 void account_steal_time(cputime_t cputime)
251 { 251 {
252 u64 *cpustat = kcpustat_this_cpu->cpustat; 252 u64 *cpustat = kcpustat_this_cpu->cpustat;
253 253
254 cpustat[CPUTIME_STEAL] += (__force u64) cputime; 254 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
255 } 255 }
256 256
257 /* 257 /*
258 * Account for idle time. 258 * Account for idle time.
259 * @cputime: the cpu time spent in idle wait 259 * @cputime: the cpu time spent in idle wait
260 */ 260 */
261 void account_idle_time(cputime_t cputime) 261 void account_idle_time(cputime_t cputime)
262 { 262 {
263 u64 *cpustat = kcpustat_this_cpu->cpustat; 263 u64 *cpustat = kcpustat_this_cpu->cpustat;
264 struct rq *rq = this_rq(); 264 struct rq *rq = this_rq();
265 265
266 if (atomic_read(&rq->nr_iowait) > 0) 266 if (atomic_read(&rq->nr_iowait) > 0)
267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; 267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
268 else 268 else
269 cpustat[CPUTIME_IDLE] += (__force u64) cputime; 269 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
270 } 270 }
271 271
272 static __always_inline bool steal_account_process_tick(void) 272 static __always_inline bool steal_account_process_tick(void)
273 { 273 {
274 #ifdef CONFIG_PARAVIRT 274 #ifdef CONFIG_PARAVIRT
275 if (static_key_false(&paravirt_steal_enabled)) { 275 if (static_key_false(&paravirt_steal_enabled)) {
276 u64 steal, st = 0; 276 u64 steal, st = 0;
277 277
278 steal = paravirt_steal_clock(smp_processor_id()); 278 steal = paravirt_steal_clock(smp_processor_id());
279 steal -= this_rq()->prev_steal_time; 279 steal -= this_rq()->prev_steal_time;
280 280
281 st = steal_ticks(steal); 281 st = steal_ticks(steal);
282 this_rq()->prev_steal_time += st * TICK_NSEC; 282 this_rq()->prev_steal_time += st * TICK_NSEC;
283 283
284 account_steal_time(st); 284 account_steal_time(st);
285 return st; 285 return st;
286 } 286 }
287 #endif 287 #endif
288 return false; 288 return false;
289 } 289 }
290 290
291 /* 291 /*
292 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live 292 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
293 * tasks (sum on group iteration) belonging to @tsk's group. 293 * tasks (sum on group iteration) belonging to @tsk's group.
294 */ 294 */
295 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 295 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296 { 296 {
297 struct signal_struct *sig = tsk->signal; 297 struct signal_struct *sig = tsk->signal;
298 struct task_struct *t; 298 struct task_struct *t;
299 299
300 times->utime = sig->utime; 300 times->utime = sig->utime;
301 times->stime = sig->stime; 301 times->stime = sig->stime;
302 times->sum_exec_runtime = sig->sum_sched_runtime; 302 times->sum_exec_runtime = sig->sum_sched_runtime;
303 303
304 rcu_read_lock(); 304 rcu_read_lock();
305 /* make sure we can trust tsk->thread_group list */ 305 /* make sure we can trust tsk->thread_group list */
306 if (!likely(pid_alive(tsk))) 306 if (!likely(pid_alive(tsk)))
307 goto out; 307 goto out;
308 308
309 t = tsk; 309 t = tsk;
310 do { 310 do {
311 times->utime += t->utime; 311 times->utime += t->utime;
312 times->stime += t->stime; 312 times->stime += t->stime;
313 times->sum_exec_runtime += task_sched_runtime(t); 313 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t); 314 } while_each_thread(tsk, t);
315 out: 315 out:
316 rcu_read_unlock(); 316 rcu_read_unlock();
317 } 317 }
318 318
319 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 319 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
320 320
321 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 321 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
322 /* 322 /*
323 * Account a tick to a process and cpustat 323 * Account a tick to a process and cpustat
324 * @p: the process that the cpu time gets accounted to 324 * @p: the process that the cpu time gets accounted to
325 * @user_tick: whether the tick is from userspace 325 * @user_tick: whether the tick is from userspace
326 * @rq: the pointer to rq 326 * @rq: the pointer to rq
327 * 327 *
328 * Tick demultiplexing follows the order 328 * Tick demultiplexing follows the order
329 * - pending hardirq update 329 * - pending hardirq update
330 * - pending softirq update 330 * - pending softirq update
331 * - user_time 331 * - user_time
332 * - idle_time 332 * - idle_time
333 * - system time 333 * - system time
334 * - check for guest_time 334 * - check for guest_time
335 * - else account as system_time 335 * - else account as system_time
336 * 336 *
337 * The check for hardirq is done for both system and user time as there is 337 * The check for hardirq is done for both system and user time as there is
338 * no timer going off while we are on hardirq and hence we may never get an 338 * no timer going off while we are on hardirq and hence we may never get an
339 * opportunity to update it solely in system time. 339 * opportunity to update it solely in system time.
340 * p->stime and friends are updated only on system time, not on irq or 340 * p->stime and friends are updated only on system time, not on irq or
341 * softirq, as those no longer count in task exec_runtime. 341 * softirq, as those no longer count in task exec_runtime.
342 */ 342 */
343 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 343 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
344 struct rq *rq) 344 struct rq *rq)
345 { 345 {
346 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 346 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
347 u64 *cpustat = kcpustat_this_cpu->cpustat; 347 u64 *cpustat = kcpustat_this_cpu->cpustat;
348 348
349 if (steal_account_process_tick()) 349 if (steal_account_process_tick())
350 return; 350 return;
351 351
352 if (irqtime_account_hi_update()) { 352 if (irqtime_account_hi_update()) {
353 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; 353 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
354 } else if (irqtime_account_si_update()) { 354 } else if (irqtime_account_si_update()) {
355 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; 355 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
356 } else if (this_cpu_ksoftirqd() == p) { 356 } else if (this_cpu_ksoftirqd() == p) {
357 /* 357 /*
358 * ksoftirqd time does not get accounted in cpu_softirq_time. 358 * ksoftirqd time does not get accounted in cpu_softirq_time.
359 * So, we have to handle it separately here. 359 * So, we have to handle it separately here.
360 * Also, p->stime needs to be updated for ksoftirqd. 360 * Also, p->stime needs to be updated for ksoftirqd.
361 */ 361 */
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
363 CPUTIME_SOFTIRQ); 363 CPUTIME_SOFTIRQ);
364 } else if (user_tick) { 364 } else if (user_tick) {
365 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 365 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
366 } else if (p == rq->idle) { 366 } else if (p == rq->idle) {
367 account_idle_time(cputime_one_jiffy); 367 account_idle_time(cputime_one_jiffy);
368 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 368 } else if (p->flags & PF_VCPU) { /* System time or guest time */
369 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 369 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
370 } else { 370 } else {
371 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 371 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
372 CPUTIME_SYSTEM); 372 CPUTIME_SYSTEM);
373 } 373 }
374 } 374 }
375 375
376 static void irqtime_account_idle_ticks(int ticks) 376 static void irqtime_account_idle_ticks(int ticks)
377 { 377 {
378 int i; 378 int i;
379 struct rq *rq = this_rq(); 379 struct rq *rq = this_rq();
380 380
381 for (i = 0; i < ticks; i++) 381 for (i = 0; i < ticks; i++)
382 irqtime_account_process_tick(current, 0, rq); 382 irqtime_account_process_tick(current, 0, rq);
383 } 383 }
384 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 384 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
385 static void irqtime_account_idle_ticks(int ticks) {} 385 static void irqtime_account_idle_ticks(int ticks) {}
386 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 386 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
387 struct rq *rq) {} 387 struct rq *rq) {}
388 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 388 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
389 389
390 /* 390 /*
391 * Account a single tick of cpu time. 391 * Account a single tick of cpu time.
392 * @p: the process that the cpu time gets accounted to 392 * @p: the process that the cpu time gets accounted to
393 * @user_tick: indicates if the tick is a user or a system tick 393 * @user_tick: indicates if the tick is a user or a system tick
394 */ 394 */
395 void account_process_tick(struct task_struct *p, int user_tick) 395 void account_process_tick(struct task_struct *p, int user_tick)
396 { 396 {
397 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 397 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
398 struct rq *rq = this_rq(); 398 struct rq *rq = this_rq();
399 399
400 if (sched_clock_irqtime) { 400 if (sched_clock_irqtime) {
401 irqtime_account_process_tick(p, user_tick, rq); 401 irqtime_account_process_tick(p, user_tick, rq);
402 return; 402 return;
403 } 403 }
404 404
405 if (steal_account_process_tick()) 405 if (steal_account_process_tick())
406 return; 406 return;
407 407
408 if (user_tick) 408 if (user_tick)
409 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 409 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
410 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 410 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
411 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 411 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
412 one_jiffy_scaled); 412 one_jiffy_scaled);
413 else 413 else
414 account_idle_time(cputime_one_jiffy); 414 account_idle_time(cputime_one_jiffy);
415 } 415 }
416 416
417 /* 417 /*
418 * Account multiple ticks of steal time. 418 * Account multiple ticks of steal time.
419 * @p: the process from which the cpu time has been stolen 419 * @p: the process from which the cpu time has been stolen
420 * @ticks: number of stolen ticks 420 * @ticks: number of stolen ticks
421 */ 421 */
422 void account_steal_ticks(unsigned long ticks) 422 void account_steal_ticks(unsigned long ticks)
423 { 423 {
424 account_steal_time(jiffies_to_cputime(ticks)); 424 account_steal_time(jiffies_to_cputime(ticks));
425 } 425 }
426 426
427 /* 427 /*
428 * Account multiple ticks of idle time. 428 * Account multiple ticks of idle time.
429 * @ticks: number of idle ticks 429 * @ticks: number of idle ticks
430 */ 430 */
431 void account_idle_ticks(unsigned long ticks) 431 void account_idle_ticks(unsigned long ticks)
432 { 432 {
433 433
434 if (sched_clock_irqtime) { 434 if (sched_clock_irqtime) {
435 irqtime_account_idle_ticks(ticks); 435 irqtime_account_idle_ticks(ticks);
436 return; 436 return;
437 } 437 }
438 438
439 account_idle_time(jiffies_to_cputime(ticks)); 439 account_idle_time(jiffies_to_cputime(ticks));
440 } 440 }
441 441
442 #endif 442 #endif
443 443
444 /* 444 /*
445 * Use precise platform statistics if available: 445 * Use precise platform statistics if available:
446 */ 446 */
447 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 447 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
448 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 448 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
449 { 449 {
450 *ut = p->utime; 450 *ut = p->utime;
451 *st = p->stime; 451 *st = p->stime;
452 } 452 }
453 453
454 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 454 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
455 { 455 {
456 struct task_cputime cputime; 456 struct task_cputime cputime;
457 457
458 thread_group_cputime(p, &cputime); 458 thread_group_cputime(p, &cputime);
459 459
460 *ut = cputime.utime; 460 *ut = cputime.utime;
461 *st = cputime.stime; 461 *st = cputime.stime;
462 } 462 }
463 463
464 void vtime_account_system(struct task_struct *tsk) 464 void vtime_account_system(struct task_struct *tsk)
465 { 465 {
466 unsigned long flags; 466 unsigned long flags;
467 467
468 local_irq_save(flags); 468 local_irq_save(flags);
469 __vtime_account_system(tsk); 469 __vtime_account_system(tsk);
470 local_irq_restore(flags); 470 local_irq_restore(flags);
471 } 471 }
472 EXPORT_SYMBOL_GPL(vtime_account_system); 472 EXPORT_SYMBOL_GPL(vtime_account_system);
473 473
474 /* 474 /*
475 * Archs that account the whole time spent in the idle task 475 * Archs that account the whole time spent in the idle task
476 * (outside irq) as idle time can rely on this and just implement 476 * (outside irq) as idle time can rely on this and just implement
477 * __vtime_account_system() and __vtime_account_idle(). Archs that 477 * __vtime_account_system() and __vtime_account_idle(). Archs that
478 * give a different meaning to idle time (s390 only includes the 478 * give a different meaning to idle time (s390 only includes the
479 * time spent by the CPU when it's in low power mode) must override 479 * time spent by the CPU when it's in low power mode) must override
480 * vtime_account(). 480 * vtime_account().
481 */ 481 */
482 #ifndef __ARCH_HAS_VTIME_ACCOUNT 482 #ifndef __ARCH_HAS_VTIME_ACCOUNT
483 void vtime_account(struct task_struct *tsk) 483 void vtime_account(struct task_struct *tsk)
484 { 484 {
485 unsigned long flags; 485 unsigned long flags;
486 486
487 local_irq_save(flags); 487 local_irq_save(flags);
488 488
489 if (in_interrupt() || !is_idle_task(tsk)) 489 if (in_interrupt() || !is_idle_task(tsk))
490 __vtime_account_system(tsk); 490 __vtime_account_system(tsk);
491 else 491 else
492 __vtime_account_idle(tsk); 492 __vtime_account_idle(tsk);
493 493
494 local_irq_restore(flags); 494 local_irq_restore(flags);
495 } 495 }
496 EXPORT_SYMBOL_GPL(vtime_account); 496 EXPORT_SYMBOL_GPL(vtime_account);
497 #endif /* __ARCH_HAS_VTIME_ACCOUNT */ 497 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
498 498
499 #else 499 #else
500 500
501 #ifndef nsecs_to_cputime 501 #ifndef nsecs_to_cputime
502 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 502 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
503 #endif 503 #endif
504 504
505 static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) 505 static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
506 { 506 {
507 u64 temp = (__force u64) rtime; 507 u64 temp = (__force u64) rtime;
508 508
509 temp *= (__force u64) utime; 509 temp *= (__force u64) utime;
510 510
511 if (sizeof(cputime_t) == 4) 511 if (sizeof(cputime_t) == 4)
512 temp = div_u64(temp, (__force u32) total); 512 temp = div_u64(temp, (__force u32) total);
513 else 513 else
514 temp = div64_u64(temp, (__force u64) total); 514 temp = div64_u64(temp, (__force u64) total);
515 515
516 return (__force cputime_t) temp; 516 return (__force cputime_t) temp;
517 } 517 }
518 518
519 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 519 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
520 { 520 {
521 cputime_t rtime, utime = p->utime, total = utime + p->stime; 521 cputime_t rtime, utime = p->utime, total = utime + p->stime;
522 522
523 /* 523 /*
524 * Use CFS's precise accounting: 524 * Use CFS's precise accounting:
525 */ 525 */
526 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 526 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
527 527
528 if (total) 528 if (total)
529 utime = scale_utime(utime, rtime, total); 529 utime = scale_utime(utime, rtime, total);
530 else 530 else
531 utime = rtime; 531 utime = rtime;
532 532
533 /* 533 /*
534 * Compare with previous values, to keep monotonicity: 534 * Compare with previous values, to keep monotonicity:
535 */ 535 */
536 p->prev_utime = max(p->prev_utime, utime); 536 p->prev_utime = max(p->prev_utime, utime);
537 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); 537 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
538 538
539 *ut = p->prev_utime; 539 *ut = p->prev_utime;
540 *st = p->prev_stime; 540 *st = p->prev_stime;
541 } 541 }
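
To see what the adjustment above does numerically, here is a minimal, self-contained C sketch of the same scale-then-clamp arithmetic with hypothetical values (plain userspace code, not part of this change):

#include <stdio.h>

typedef unsigned long long u64;

/* Same idea as scale_utime(): utime * rtime / total. */
static u64 scale(u64 utime, u64 rtime, u64 total)
{
	return total ? (rtime * utime) / total : rtime;
}

int main(void)
{
	/* Hypothetical tick-sampled times and precise CFS runtime (same unit). */
	u64 utime = 6, stime = 4, rtime = 12;
	u64 prev_utime = 0, prev_stime = 0;
	u64 ut = scale(utime, rtime, utime + stime);	/* 12 * 6 / 10 = 7 */

	/* Clamp against previously reported values to keep them monotonic. */
	prev_utime = ut > prev_utime ? ut : prev_utime;
	prev_stime = (rtime - prev_utime) > prev_stime ? rtime - prev_utime : prev_stime;

	printf("ut=%llu st=%llu\n", prev_utime, prev_stime);	/* prints ut=7 st=5 */
	return 0;
}

Because each value is taken as a max() against what was reported before, repeated calls can only move the reported utime and stime forward.
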
542 542
543 /* 543 /*
544 * Must be called with siglock held. 544 * Must be called with siglock held.
545 */ 545 */
546 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 546 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
547 { 547 {
548 struct signal_struct *sig = p->signal; 548 struct signal_struct *sig = p->signal;
549 struct task_cputime cputime; 549 struct task_cputime cputime;
550 cputime_t rtime, utime, total; 550 cputime_t rtime, utime, total;
551 551
552 thread_group_cputime(p, &cputime); 552 thread_group_cputime(p, &cputime);
553 553
554 total = cputime.utime + cputime.stime; 554 total = cputime.utime + cputime.stime;
555 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 555 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
556 556
557 if (total) 557 if (total)
558 utime = scale_utime(cputime.utime, rtime, total); 558 utime = scale_utime(cputime.utime, rtime, total);
559 else 559 else
560 utime = rtime; 560 utime = rtime;
561 561
562 sig->prev_utime = max(sig->prev_utime, utime); 562 sig->prev_utime = max(sig->prev_utime, utime);
563 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); 563 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
564 564
565 *ut = sig->prev_utime; 565 *ut = sig->prev_utime;
566 *st = sig->prev_stime; 566 *st = sig->prev_stime;
567 } 567 }
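
As the comment notes, the group variant relies on the caller holding siglock around the sig->prev_[us]time update. A hedged sketch of how a caller might do that, using lock_task_sighand() (illustrative kernel-context code, not part of this patch):

#include <linux/sched.h>

/* Illustrative caller only; the function name is invented for the example. */
static void report_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	unsigned long flags;

	*ut = *st = 0;
	if (lock_task_sighand(p, &flags)) {
		thread_group_cputime_adjusted(p, ut, st);
		unlock_task_sighand(p, &flags);
	}
}
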
568 #endif 568 #endif
569 569
1 /* 1 /*
2 * linux/kernel/sys.c 2 * linux/kernel/sys.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/export.h> 7 #include <linux/export.h>
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/utsname.h> 9 #include <linux/utsname.h>
10 #include <linux/mman.h> 10 #include <linux/mman.h>
11 #include <linux/reboot.h> 11 #include <linux/reboot.h>
12 #include <linux/prctl.h> 12 #include <linux/prctl.h>
13 #include <linux/highuid.h> 13 #include <linux/highuid.h>
14 #include <linux/fs.h> 14 #include <linux/fs.h>
15 #include <linux/kmod.h> 15 #include <linux/kmod.h>
16 #include <linux/perf_event.h> 16 #include <linux/perf_event.h>
17 #include <linux/resource.h> 17 #include <linux/resource.h>
18 #include <linux/kernel.h> 18 #include <linux/kernel.h>
19 #include <linux/kexec.h> 19 #include <linux/kexec.h>
20 #include <linux/workqueue.h> 20 #include <linux/workqueue.h>
21 #include <linux/capability.h> 21 #include <linux/capability.h>
22 #include <linux/device.h> 22 #include <linux/device.h>
23 #include <linux/key.h> 23 #include <linux/key.h>
24 #include <linux/times.h> 24 #include <linux/times.h>
25 #include <linux/posix-timers.h> 25 #include <linux/posix-timers.h>
26 #include <linux/security.h> 26 #include <linux/security.h>
27 #include <linux/dcookies.h> 27 #include <linux/dcookies.h>
28 #include <linux/suspend.h> 28 #include <linux/suspend.h>
29 #include <linux/tty.h> 29 #include <linux/tty.h>
30 #include <linux/signal.h> 30 #include <linux/signal.h>
31 #include <linux/cn_proc.h> 31 #include <linux/cn_proc.h>
32 #include <linux/getcpu.h> 32 #include <linux/getcpu.h>
33 #include <linux/task_io_accounting_ops.h> 33 #include <linux/task_io_accounting_ops.h>
34 #include <linux/seccomp.h> 34 #include <linux/seccomp.h>
35 #include <linux/cpu.h> 35 #include <linux/cpu.h>
36 #include <linux/personality.h> 36 #include <linux/personality.h>
37 #include <linux/ptrace.h> 37 #include <linux/ptrace.h>
38 #include <linux/fs_struct.h> 38 #include <linux/fs_struct.h>
39 #include <linux/file.h> 39 #include <linux/file.h>
40 #include <linux/mount.h> 40 #include <linux/mount.h>
41 #include <linux/gfp.h> 41 #include <linux/gfp.h>
42 #include <linux/syscore_ops.h> 42 #include <linux/syscore_ops.h>
43 #include <linux/version.h> 43 #include <linux/version.h>
44 #include <linux/ctype.h> 44 #include <linux/ctype.h>
45 45
46 #include <linux/compat.h> 46 #include <linux/compat.h>
47 #include <linux/syscalls.h> 47 #include <linux/syscalls.h>
48 #include <linux/kprobes.h> 48 #include <linux/kprobes.h>
49 #include <linux/user_namespace.h> 49 #include <linux/user_namespace.h>
50 50
51 #include <linux/kmsg_dump.h> 51 #include <linux/kmsg_dump.h>
52 /* Move somewhere else to avoid recompiling? */ 52 /* Move somewhere else to avoid recompiling? */
53 #include <generated/utsrelease.h> 53 #include <generated/utsrelease.h>
54 54
55 #include <asm/uaccess.h> 55 #include <asm/uaccess.h>
56 #include <asm/io.h> 56 #include <asm/io.h>
57 #include <asm/unistd.h> 57 #include <asm/unistd.h>
58 58
59 #ifndef SET_UNALIGN_CTL 59 #ifndef SET_UNALIGN_CTL
60 # define SET_UNALIGN_CTL(a,b) (-EINVAL) 60 # define SET_UNALIGN_CTL(a,b) (-EINVAL)
61 #endif 61 #endif
62 #ifndef GET_UNALIGN_CTL 62 #ifndef GET_UNALIGN_CTL
63 # define GET_UNALIGN_CTL(a,b) (-EINVAL) 63 # define GET_UNALIGN_CTL(a,b) (-EINVAL)
64 #endif 64 #endif
65 #ifndef SET_FPEMU_CTL 65 #ifndef SET_FPEMU_CTL
66 # define SET_FPEMU_CTL(a,b) (-EINVAL) 66 # define SET_FPEMU_CTL(a,b) (-EINVAL)
67 #endif 67 #endif
68 #ifndef GET_FPEMU_CTL 68 #ifndef GET_FPEMU_CTL
69 # define GET_FPEMU_CTL(a,b) (-EINVAL) 69 # define GET_FPEMU_CTL(a,b) (-EINVAL)
70 #endif 70 #endif
71 #ifndef SET_FPEXC_CTL 71 #ifndef SET_FPEXC_CTL
72 # define SET_FPEXC_CTL(a,b) (-EINVAL) 72 # define SET_FPEXC_CTL(a,b) (-EINVAL)
73 #endif 73 #endif
74 #ifndef GET_FPEXC_CTL 74 #ifndef GET_FPEXC_CTL
75 # define GET_FPEXC_CTL(a,b) (-EINVAL) 75 # define GET_FPEXC_CTL(a,b) (-EINVAL)
76 #endif 76 #endif
77 #ifndef GET_ENDIAN 77 #ifndef GET_ENDIAN
78 # define GET_ENDIAN(a,b) (-EINVAL) 78 # define GET_ENDIAN(a,b) (-EINVAL)
79 #endif 79 #endif
80 #ifndef SET_ENDIAN 80 #ifndef SET_ENDIAN
81 # define SET_ENDIAN(a,b) (-EINVAL) 81 # define SET_ENDIAN(a,b) (-EINVAL)
82 #endif 82 #endif
83 #ifndef GET_TSC_CTL 83 #ifndef GET_TSC_CTL
84 # define GET_TSC_CTL(a) (-EINVAL) 84 # define GET_TSC_CTL(a) (-EINVAL)
85 #endif 85 #endif
86 #ifndef SET_TSC_CTL 86 #ifndef SET_TSC_CTL
87 # define SET_TSC_CTL(a) (-EINVAL) 87 # define SET_TSC_CTL(a) (-EINVAL)
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * this is where the system-wide overflow UID and GID are defined, for 91 * this is where the system-wide overflow UID and GID are defined, for
92 * architectures that now have 32-bit UID/GID but didn't in the past 92 * architectures that now have 32-bit UID/GID but didn't in the past
93 */ 93 */
94 94
95 int overflowuid = DEFAULT_OVERFLOWUID; 95 int overflowuid = DEFAULT_OVERFLOWUID;
96 int overflowgid = DEFAULT_OVERFLOWGID; 96 int overflowgid = DEFAULT_OVERFLOWGID;
97 97
98 EXPORT_SYMBOL(overflowuid); 98 EXPORT_SYMBOL(overflowuid);
99 EXPORT_SYMBOL(overflowgid); 99 EXPORT_SYMBOL(overflowgid);
100 100
101 /* 101 /*
102 * the same as above, but for filesystems which can only store a 16-bit 102 * the same as above, but for filesystems which can only store a 16-bit
103 * UID and GID. As such, this is needed on all architectures 103 * UID and GID. As such, this is needed on all architectures
104 */ 104 */
105 105
106 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; 106 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
107 int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; 107 int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
108 108
109 EXPORT_SYMBOL(fs_overflowuid); 109 EXPORT_SYMBOL(fs_overflowuid);
110 EXPORT_SYMBOL(fs_overflowgid); 110 EXPORT_SYMBOL(fs_overflowgid);
111 111
112 /* 112 /*
113 * this indicates whether you can reboot with ctrl-alt-del: the default is yes 113 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
114 */ 114 */
115 115
116 int C_A_D = 1; 116 int C_A_D = 1;
117 struct pid *cad_pid; 117 struct pid *cad_pid;
118 EXPORT_SYMBOL(cad_pid); 118 EXPORT_SYMBOL(cad_pid);
119 119
120 /* 120 /*
121 * If set, this is used for preparing the system to power off. 121 * If set, this is used for preparing the system to power off.
122 */ 122 */
123 123
124 void (*pm_power_off_prepare)(void); 124 void (*pm_power_off_prepare)(void);
125 125
126 /* 126 /*
127 * Returns true if current's euid is the same as p's uid or euid, 127 * Returns true if current's euid is the same as p's uid or euid,
128 * or if current has CAP_SYS_NICE in p's user_ns. 128 * or if current has CAP_SYS_NICE in p's user_ns.
129 * 129 *
130 * Called with rcu_read_lock, creds are safe 130 * Called with rcu_read_lock, creds are safe
131 */ 131 */
132 static bool set_one_prio_perm(struct task_struct *p) 132 static bool set_one_prio_perm(struct task_struct *p)
133 { 133 {
134 const struct cred *cred = current_cred(), *pcred = __task_cred(p); 134 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
135 135
136 if (uid_eq(pcred->uid, cred->euid) || 136 if (uid_eq(pcred->uid, cred->euid) ||
137 uid_eq(pcred->euid, cred->euid)) 137 uid_eq(pcred->euid, cred->euid))
138 return true; 138 return true;
139 if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) 139 if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
140 return true; 140 return true;
141 return false; 141 return false;
142 } 142 }
143 143
144 /* 144 /*
145 * set the priority of a task 145 * set the priority of a task
146 * - the caller must hold the RCU read lock 146 * - the caller must hold the RCU read lock
147 */ 147 */
148 static int set_one_prio(struct task_struct *p, int niceval, int error) 148 static int set_one_prio(struct task_struct *p, int niceval, int error)
149 { 149 {
150 int no_nice; 150 int no_nice;
151 151
152 if (!set_one_prio_perm(p)) { 152 if (!set_one_prio_perm(p)) {
153 error = -EPERM; 153 error = -EPERM;
154 goto out; 154 goto out;
155 } 155 }
156 if (niceval < task_nice(p) && !can_nice(p, niceval)) { 156 if (niceval < task_nice(p) && !can_nice(p, niceval)) {
157 error = -EACCES; 157 error = -EACCES;
158 goto out; 158 goto out;
159 } 159 }
160 no_nice = security_task_setnice(p, niceval); 160 no_nice = security_task_setnice(p, niceval);
161 if (no_nice) { 161 if (no_nice) {
162 error = no_nice; 162 error = no_nice;
163 goto out; 163 goto out;
164 } 164 }
165 if (error == -ESRCH) 165 if (error == -ESRCH)
166 error = 0; 166 error = 0;
167 set_user_nice(p, niceval); 167 set_user_nice(p, niceval);
168 out: 168 out:
169 return error; 169 return error;
170 } 170 }
171 171
172 SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) 172 SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
173 { 173 {
174 struct task_struct *g, *p; 174 struct task_struct *g, *p;
175 struct user_struct *user; 175 struct user_struct *user;
176 const struct cred *cred = current_cred(); 176 const struct cred *cred = current_cred();
177 int error = -EINVAL; 177 int error = -EINVAL;
178 struct pid *pgrp; 178 struct pid *pgrp;
179 kuid_t uid; 179 kuid_t uid;
180 180
181 if (which > PRIO_USER || which < PRIO_PROCESS) 181 if (which > PRIO_USER || which < PRIO_PROCESS)
182 goto out; 182 goto out;
183 183
184 /* normalize: avoid signed division (rounding problems) */ 184 /* normalize: avoid signed division (rounding problems) */
185 error = -ESRCH; 185 error = -ESRCH;
186 if (niceval < -20) 186 if (niceval < -20)
187 niceval = -20; 187 niceval = -20;
188 if (niceval > 19) 188 if (niceval > 19)
189 niceval = 19; 189 niceval = 19;
190 190
191 rcu_read_lock(); 191 rcu_read_lock();
192 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
193 switch (which) { 193 switch (which) {
194 case PRIO_PROCESS: 194 case PRIO_PROCESS:
195 if (who) 195 if (who)
196 p = find_task_by_vpid(who); 196 p = find_task_by_vpid(who);
197 else 197 else
198 p = current; 198 p = current;
199 if (p) 199 if (p)
200 error = set_one_prio(p, niceval, error); 200 error = set_one_prio(p, niceval, error);
201 break; 201 break;
202 case PRIO_PGRP: 202 case PRIO_PGRP:
203 if (who) 203 if (who)
204 pgrp = find_vpid(who); 204 pgrp = find_vpid(who);
205 else 205 else
206 pgrp = task_pgrp(current); 206 pgrp = task_pgrp(current);
207 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 207 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
208 error = set_one_prio(p, niceval, error); 208 error = set_one_prio(p, niceval, error);
209 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 209 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
210 break; 210 break;
211 case PRIO_USER: 211 case PRIO_USER:
212 uid = make_kuid(cred->user_ns, who); 212 uid = make_kuid(cred->user_ns, who);
213 user = cred->user; 213 user = cred->user;
214 if (!who) 214 if (!who)
215 uid = cred->uid; 215 uid = cred->uid;
216 else if (!uid_eq(uid, cred->uid) && 216 else if (!uid_eq(uid, cred->uid) &&
217 !(user = find_user(uid))) 217 !(user = find_user(uid)))
218 goto out_unlock; /* No processes for this user */ 218 goto out_unlock; /* No processes for this user */
219 219
220 do_each_thread(g, p) { 220 do_each_thread(g, p) {
221 if (uid_eq(task_uid(p), uid)) 221 if (uid_eq(task_uid(p), uid))
222 error = set_one_prio(p, niceval, error); 222 error = set_one_prio(p, niceval, error);
223 } while_each_thread(g, p); 223 } while_each_thread(g, p);
224 if (!uid_eq(uid, cred->uid)) 224 if (!uid_eq(uid, cred->uid))
225 free_uid(user); /* For find_user() */ 225 free_uid(user); /* For find_user() */
226 break; 226 break;
227 } 227 }
228 out_unlock: 228 out_unlock:
229 read_unlock(&tasklist_lock); 229 read_unlock(&tasklist_lock);
230 rcu_read_unlock(); 230 rcu_read_unlock();
231 out: 231 out:
232 return error; 232 return error;
233 } 233 }
234 234
235 /* 235 /*
236 * Ugh. To avoid negative return values, "getpriority()" will 236 * Ugh. To avoid negative return values, "getpriority()" will
237 * not return the normal nice-value, but a negated value that 237 * not return the normal nice-value, but a negated value that
238 * has been offset by 20 (ie it returns 40..1 instead of -20..19) 238 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
239 * to stay compatible. 239 * to stay compatible.
240 */ 240 */
241 SYSCALL_DEFINE2(getpriority, int, which, int, who) 241 SYSCALL_DEFINE2(getpriority, int, which, int, who)
242 { 242 {
243 struct task_struct *g, *p; 243 struct task_struct *g, *p;
244 struct user_struct *user; 244 struct user_struct *user;
245 const struct cred *cred = current_cred(); 245 const struct cred *cred = current_cred();
246 long niceval, retval = -ESRCH; 246 long niceval, retval = -ESRCH;
247 struct pid *pgrp; 247 struct pid *pgrp;
248 kuid_t uid; 248 kuid_t uid;
249 249
250 if (which > PRIO_USER || which < PRIO_PROCESS) 250 if (which > PRIO_USER || which < PRIO_PROCESS)
251 return -EINVAL; 251 return -EINVAL;
252 252
253 rcu_read_lock(); 253 rcu_read_lock();
254 read_lock(&tasklist_lock); 254 read_lock(&tasklist_lock);
255 switch (which) { 255 switch (which) {
256 case PRIO_PROCESS: 256 case PRIO_PROCESS:
257 if (who) 257 if (who)
258 p = find_task_by_vpid(who); 258 p = find_task_by_vpid(who);
259 else 259 else
260 p = current; 260 p = current;
261 if (p) { 261 if (p) {
262 niceval = 20 - task_nice(p); 262 niceval = 20 - task_nice(p);
263 if (niceval > retval) 263 if (niceval > retval)
264 retval = niceval; 264 retval = niceval;
265 } 265 }
266 break; 266 break;
267 case PRIO_PGRP: 267 case PRIO_PGRP:
268 if (who) 268 if (who)
269 pgrp = find_vpid(who); 269 pgrp = find_vpid(who);
270 else 270 else
271 pgrp = task_pgrp(current); 271 pgrp = task_pgrp(current);
272 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 272 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
273 niceval = 20 - task_nice(p); 273 niceval = 20 - task_nice(p);
274 if (niceval > retval) 274 if (niceval > retval)
275 retval = niceval; 275 retval = niceval;
276 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 276 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
277 break; 277 break;
278 case PRIO_USER: 278 case PRIO_USER:
279 uid = make_kuid(cred->user_ns, who); 279 uid = make_kuid(cred->user_ns, who);
280 user = cred->user; 280 user = cred->user;
281 if (!who) 281 if (!who)
282 uid = cred->uid; 282 uid = cred->uid;
283 else if (!uid_eq(uid, cred->uid) && 283 else if (!uid_eq(uid, cred->uid) &&
284 !(user = find_user(uid))) 284 !(user = find_user(uid)))
285 goto out_unlock; /* No processes for this user */ 285 goto out_unlock; /* No processes for this user */
286 286
287 do_each_thread(g, p) { 287 do_each_thread(g, p) {
288 if (uid_eq(task_uid(p), uid)) { 288 if (uid_eq(task_uid(p), uid)) {
289 niceval = 20 - task_nice(p); 289 niceval = 20 - task_nice(p);
290 if (niceval > retval) 290 if (niceval > retval)
291 retval = niceval; 291 retval = niceval;
292 } 292 }
293 } while_each_thread(g, p); 293 } while_each_thread(g, p);
294 if (!uid_eq(uid, cred->uid)) 294 if (!uid_eq(uid, cred->uid))
295 free_uid(user); /* for find_user() */ 295 free_uid(user); /* for find_user() */
296 break; 296 break;
297 } 297 }
298 out_unlock: 298 out_unlock:
299 read_unlock(&tasklist_lock); 299 read_unlock(&tasklist_lock);
300 rcu_read_unlock(); 300 rcu_read_unlock();
301 301
302 return retval; 302 return retval;
303 } 303 }
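
A worked example of the offset convention above: a task at nice -5 is reported as 25 and a task at nice 10 as 10, so userspace (normally the C library) converts back with 20 - value. A small raw-syscall illustration, assuming a Linux libc that defines SYS_getpriority:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/resource.h>

int main(void)
{
	/* The raw syscall returns 20 - nice (i.e. 40..1); errors are < 0. */
	long raw = syscall(SYS_getpriority, PRIO_PROCESS, 0);

	if (raw < 0) {
		perror("getpriority");
		return 1;
	}
	printf("raw=%ld nice=%ld\n", raw, 20 - raw);
	return 0;
}
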
304 304
305 /** 305 /**
306 * emergency_restart - reboot the system 306 * emergency_restart - reboot the system
307 * 307 *
308 * Without shutting down any hardware or taking any locks 308 * Without shutting down any hardware or taking any locks
309 * reboot the system. This is called when we know we are in 309 * reboot the system. This is called when we know we are in
310 * trouble so this is our best effort to reboot. This is 310 * trouble so this is our best effort to reboot. This is
311 * safe to call in interrupt context. 311 * safe to call in interrupt context.
312 */ 312 */
313 void emergency_restart(void) 313 void emergency_restart(void)
314 { 314 {
315 kmsg_dump(KMSG_DUMP_EMERG); 315 kmsg_dump(KMSG_DUMP_EMERG);
316 machine_emergency_restart(); 316 machine_emergency_restart();
317 } 317 }
318 EXPORT_SYMBOL_GPL(emergency_restart); 318 EXPORT_SYMBOL_GPL(emergency_restart);
319 319
320 void kernel_restart_prepare(char *cmd) 320 void kernel_restart_prepare(char *cmd)
321 { 321 {
322 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 322 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
323 system_state = SYSTEM_RESTART; 323 system_state = SYSTEM_RESTART;
324 usermodehelper_disable(); 324 usermodehelper_disable();
325 device_shutdown(); 325 device_shutdown();
326 syscore_shutdown(); 326 syscore_shutdown();
327 } 327 }
328 328
329 /** 329 /**
330 * register_reboot_notifier - Register function to be called at reboot time 330 * register_reboot_notifier - Register function to be called at reboot time
331 * @nb: Info about notifier function to be called 331 * @nb: Info about notifier function to be called
332 * 332 *
333 * Registers a function with the list of functions 333 * Registers a function with the list of functions
334 * to be called at reboot time. 334 * to be called at reboot time.
335 * 335 *
336 * Currently always returns zero, as blocking_notifier_chain_register() 336 * Currently always returns zero, as blocking_notifier_chain_register()
337 * always returns zero. 337 * always returns zero.
338 */ 338 */
339 int register_reboot_notifier(struct notifier_block *nb) 339 int register_reboot_notifier(struct notifier_block *nb)
340 { 340 {
341 return blocking_notifier_chain_register(&reboot_notifier_list, nb); 341 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
342 } 342 }
343 EXPORT_SYMBOL(register_reboot_notifier); 343 EXPORT_SYMBOL(register_reboot_notifier);
344 344
345 /** 345 /**
346 * unregister_reboot_notifier - Unregister previously registered reboot notifier 346 * unregister_reboot_notifier - Unregister previously registered reboot notifier
347 * @nb: Hook to be unregistered 347 * @nb: Hook to be unregistered
348 * 348 *
349 * Unregisters a previously registered reboot 349 * Unregisters a previously registered reboot
350 * notifier function. 350 * notifier function.
351 * 351 *
352 * Returns zero on success, or %-ENOENT on failure. 352 * Returns zero on success, or %-ENOENT on failure.
353 */ 353 */
354 int unregister_reboot_notifier(struct notifier_block *nb) 354 int unregister_reboot_notifier(struct notifier_block *nb)
355 { 355 {
356 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); 356 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
357 } 357 }
358 EXPORT_SYMBOL(unregister_reboot_notifier); 358 EXPORT_SYMBOL(unregister_reboot_notifier);
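
For context, a caller registers on the reboot chain by embedding a notifier_block whose callback has the standard notifier signature; a minimal illustrative sketch (the names here are invented for the example):

#include <linux/notifier.h>
#include <linux/reboot.h>

/* Called with SYS_RESTART, SYS_HALT or SYS_POWER_OFF as @action. */
static int example_reboot_cb(struct notifier_block *nb, unsigned long action, void *data)
{
	return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
	.notifier_call = example_reboot_cb,
};

/* Typically from module init/exit paths: */
/*	register_reboot_notifier(&example_reboot_nb);	*/
/*	unregister_reboot_notifier(&example_reboot_nb);	*/
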
359 359
360 /** 360 /**
361 * kernel_restart - reboot the system 361 * kernel_restart - reboot the system
362 * @cmd: pointer to buffer containing command to execute for restart 362 * @cmd: pointer to buffer containing command to execute for restart
363 * or %NULL 363 * or %NULL
364 * 364 *
365 * Shutdown everything and perform a clean reboot. 365 * Shutdown everything and perform a clean reboot.
366 * This is not safe to call in interrupt context. 366 * This is not safe to call in interrupt context.
367 */ 367 */
368 void kernel_restart(char *cmd) 368 void kernel_restart(char *cmd)
369 { 369 {
370 kernel_restart_prepare(cmd); 370 kernel_restart_prepare(cmd);
371 disable_nonboot_cpus(); 371 disable_nonboot_cpus();
372 if (!cmd) 372 if (!cmd)
373 printk(KERN_EMERG "Restarting system.\n"); 373 printk(KERN_EMERG "Restarting system.\n");
374 else 374 else
375 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 375 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
376 kmsg_dump(KMSG_DUMP_RESTART); 376 kmsg_dump(KMSG_DUMP_RESTART);
377 machine_restart(cmd); 377 machine_restart(cmd);
378 } 378 }
379 EXPORT_SYMBOL_GPL(kernel_restart); 379 EXPORT_SYMBOL_GPL(kernel_restart);
380 380
381 static void kernel_shutdown_prepare(enum system_states state) 381 static void kernel_shutdown_prepare(enum system_states state)
382 { 382 {
383 blocking_notifier_call_chain(&reboot_notifier_list, 383 blocking_notifier_call_chain(&reboot_notifier_list,
384 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 384 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
385 system_state = state; 385 system_state = state;
386 usermodehelper_disable(); 386 usermodehelper_disable();
387 device_shutdown(); 387 device_shutdown();
388 } 388 }
389 /** 389 /**
390 * kernel_halt - halt the system 390 * kernel_halt - halt the system
391 * 391 *
392 * Shutdown everything and perform a clean system halt. 392 * Shutdown everything and perform a clean system halt.
393 */ 393 */
394 void kernel_halt(void) 394 void kernel_halt(void)
395 { 395 {
396 kernel_shutdown_prepare(SYSTEM_HALT); 396 kernel_shutdown_prepare(SYSTEM_HALT);
397 syscore_shutdown(); 397 syscore_shutdown();
398 printk(KERN_EMERG "System halted.\n"); 398 printk(KERN_EMERG "System halted.\n");
399 kmsg_dump(KMSG_DUMP_HALT); 399 kmsg_dump(KMSG_DUMP_HALT);
400 machine_halt(); 400 machine_halt();
401 } 401 }
402 402
403 EXPORT_SYMBOL_GPL(kernel_halt); 403 EXPORT_SYMBOL_GPL(kernel_halt);
404 404
405 /** 405 /**
406 * kernel_power_off - power_off the system 406 * kernel_power_off - power_off the system
407 * 407 *
408 * Shutdown everything and perform a clean system power_off. 408 * Shutdown everything and perform a clean system power_off.
409 */ 409 */
410 void kernel_power_off(void) 410 void kernel_power_off(void)
411 { 411 {
412 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 412 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
413 if (pm_power_off_prepare) 413 if (pm_power_off_prepare)
414 pm_power_off_prepare(); 414 pm_power_off_prepare();
415 disable_nonboot_cpus(); 415 disable_nonboot_cpus();
416 syscore_shutdown(); 416 syscore_shutdown();
417 printk(KERN_EMERG "Power down.\n"); 417 printk(KERN_EMERG "Power down.\n");
418 kmsg_dump(KMSG_DUMP_POWEROFF); 418 kmsg_dump(KMSG_DUMP_POWEROFF);
419 machine_power_off(); 419 machine_power_off();
420 } 420 }
421 EXPORT_SYMBOL_GPL(kernel_power_off); 421 EXPORT_SYMBOL_GPL(kernel_power_off);
422 422
423 static DEFINE_MUTEX(reboot_mutex); 423 static DEFINE_MUTEX(reboot_mutex);
424 424
425 /* 425 /*
426 * Reboot system call: for obvious reasons only root may call it, 426 * Reboot system call: for obvious reasons only root may call it,
427 * and even root needs to set up some magic numbers in the registers 427 * and even root needs to set up some magic numbers in the registers
428 * so that some mistake won't make this reboot the whole machine. 428 * so that some mistake won't make this reboot the whole machine.
429 * You can also set the meaning of the ctrl-alt-del-key here. 429 * You can also set the meaning of the ctrl-alt-del-key here.
430 * 430 *
431 * reboot doesn't sync: do that yourself before calling this. 431 * reboot doesn't sync: do that yourself before calling this.
432 */ 432 */
433 SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, 433 SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
434 void __user *, arg) 434 void __user *, arg)
435 { 435 {
436 char buffer[256]; 436 char buffer[256];
437 int ret = 0; 437 int ret = 0;
438 438
439 /* We only trust the superuser with rebooting the system. */ 439 /* We only trust the superuser with rebooting the system. */
440 if (!capable(CAP_SYS_BOOT)) 440 if (!capable(CAP_SYS_BOOT))
441 return -EPERM; 441 return -EPERM;
442 442
443 /* For safety, we require "magic" arguments. */ 443 /* For safety, we require "magic" arguments. */
444 if (magic1 != LINUX_REBOOT_MAGIC1 || 444 if (magic1 != LINUX_REBOOT_MAGIC1 ||
445 (magic2 != LINUX_REBOOT_MAGIC2 && 445 (magic2 != LINUX_REBOOT_MAGIC2 &&
446 magic2 != LINUX_REBOOT_MAGIC2A && 446 magic2 != LINUX_REBOOT_MAGIC2A &&
447 magic2 != LINUX_REBOOT_MAGIC2B && 447 magic2 != LINUX_REBOOT_MAGIC2B &&
448 magic2 != LINUX_REBOOT_MAGIC2C)) 448 magic2 != LINUX_REBOOT_MAGIC2C))
449 return -EINVAL; 449 return -EINVAL;
450 450
451 /* 451 /*
452 * If pid namespaces are enabled and the current task is in a child 452 * If pid namespaces are enabled and the current task is in a child
453 * pid_namespace, the command is handled by reboot_pid_ns() which will 453 * pid_namespace, the command is handled by reboot_pid_ns() which will
454 * call do_exit(). 454 * call do_exit().
455 */ 455 */
456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd); 456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
457 if (ret) 457 if (ret)
458 return ret; 458 return ret;
459 459
460 /* Instead of trying to make the power_off code look like 460 /* Instead of trying to make the power_off code look like
461 * halt when pm_power_off is not set do it the easy way. 461 * halt when pm_power_off is not set do it the easy way.
462 */ 462 */
463 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 463 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
464 cmd = LINUX_REBOOT_CMD_HALT; 464 cmd = LINUX_REBOOT_CMD_HALT;
465 465
466 mutex_lock(&reboot_mutex); 466 mutex_lock(&reboot_mutex);
467 switch (cmd) { 467 switch (cmd) {
468 case LINUX_REBOOT_CMD_RESTART: 468 case LINUX_REBOOT_CMD_RESTART:
469 kernel_restart(NULL); 469 kernel_restart(NULL);
470 break; 470 break;
471 471
472 case LINUX_REBOOT_CMD_CAD_ON: 472 case LINUX_REBOOT_CMD_CAD_ON:
473 C_A_D = 1; 473 C_A_D = 1;
474 break; 474 break;
475 475
476 case LINUX_REBOOT_CMD_CAD_OFF: 476 case LINUX_REBOOT_CMD_CAD_OFF:
477 C_A_D = 0; 477 C_A_D = 0;
478 break; 478 break;
479 479
480 case LINUX_REBOOT_CMD_HALT: 480 case LINUX_REBOOT_CMD_HALT:
481 kernel_halt(); 481 kernel_halt();
482 do_exit(0); 482 do_exit(0);
483 panic("cannot halt"); 483 panic("cannot halt");
484 484
485 case LINUX_REBOOT_CMD_POWER_OFF: 485 case LINUX_REBOOT_CMD_POWER_OFF:
486 kernel_power_off(); 486 kernel_power_off();
487 do_exit(0); 487 do_exit(0);
488 break; 488 break;
489 489
490 case LINUX_REBOOT_CMD_RESTART2: 490 case LINUX_REBOOT_CMD_RESTART2:
491 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 491 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
492 ret = -EFAULT; 492 ret = -EFAULT;
493 break; 493 break;
494 } 494 }
495 buffer[sizeof(buffer) - 1] = '\0'; 495 buffer[sizeof(buffer) - 1] = '\0';
496 496
497 kernel_restart(buffer); 497 kernel_restart(buffer);
498 break; 498 break;
499 499
500 #ifdef CONFIG_KEXEC 500 #ifdef CONFIG_KEXEC
501 case LINUX_REBOOT_CMD_KEXEC: 501 case LINUX_REBOOT_CMD_KEXEC:
502 ret = kernel_kexec(); 502 ret = kernel_kexec();
503 break; 503 break;
504 #endif 504 #endif
505 505
506 #ifdef CONFIG_HIBERNATION 506 #ifdef CONFIG_HIBERNATION
507 case LINUX_REBOOT_CMD_SW_SUSPEND: 507 case LINUX_REBOOT_CMD_SW_SUSPEND:
508 ret = hibernate(); 508 ret = hibernate();
509 break; 509 break;
510 #endif 510 #endif
511 511
512 default: 512 default:
513 ret = -EINVAL; 513 ret = -EINVAL;
514 break; 514 break;
515 } 515 }
516 mutex_unlock(&reboot_mutex); 516 mutex_unlock(&reboot_mutex);
517 return ret; 517 return ret;
518 } 518 }
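
From userspace every call must supply the magic constants checked above; a hedged example of requesting a restart with a command string through the raw syscall (requires CAP_SYS_BOOT, and it really does reboot; the command string is purely illustrative):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
	/* RESTART2 passes an arbitrary command string down to machine_restart(). */
	if (syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
		    LINUX_REBOOT_CMD_RESTART2, "bootloader") < 0) {
		perror("reboot");
		return 1;
	}
	return 0;
}
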
519 519
520 static void deferred_cad(struct work_struct *dummy) 520 static void deferred_cad(struct work_struct *dummy)
521 { 521 {
522 kernel_restart(NULL); 522 kernel_restart(NULL);
523 } 523 }
524 524
525 /* 525 /*
526 * This function gets called by ctrl-alt-del - ie the keyboard interrupt. 526 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
527 * As it's called within an interrupt, it may NOT sync: the only choice 527 * As it's called within an interrupt, it may NOT sync: the only choice
528 * is whether to reboot at once, or just ignore the ctrl-alt-del. 528 * is whether to reboot at once, or just ignore the ctrl-alt-del.
529 */ 529 */
530 void ctrl_alt_del(void) 530 void ctrl_alt_del(void)
531 { 531 {
532 static DECLARE_WORK(cad_work, deferred_cad); 532 static DECLARE_WORK(cad_work, deferred_cad);
533 533
534 if (C_A_D) 534 if (C_A_D)
535 schedule_work(&cad_work); 535 schedule_work(&cad_work);
536 else 536 else
537 kill_cad_pid(SIGINT, 1); 537 kill_cad_pid(SIGINT, 1);
538 } 538 }
539 539
540 /* 540 /*
541 * Unprivileged users may change the real gid to the effective gid 541 * Unprivileged users may change the real gid to the effective gid
542 * or vice versa. (BSD-style) 542 * or vice versa. (BSD-style)
543 * 543 *
544 * If you set the real gid at all, or set the effective gid to a value not 544 * If you set the real gid at all, or set the effective gid to a value not
545 * equal to the real gid, then the saved gid is set to the new effective gid. 545 * equal to the real gid, then the saved gid is set to the new effective gid.
546 * 546 *
547 * This makes it possible for a setgid program to completely drop its 547 * This makes it possible for a setgid program to completely drop its
548 * privileges, which is often a useful assertion to make when you are doing 548 * privileges, which is often a useful assertion to make when you are doing
549 * a security audit over a program. 549 * a security audit over a program.
550 * 550 *
551 * The general idea is that a program which uses just setregid() will be 551 * The general idea is that a program which uses just setregid() will be
552 * 100% compatible with BSD. A program which uses just setgid() will be 552 * 100% compatible with BSD. A program which uses just setgid() will be
553 * 100% compatible with POSIX with saved IDs. 553 * 100% compatible with POSIX with saved IDs.
554 * 554 *
555 * SMP: There are no races; the GIDs are checked only by filesystem 555 * SMP: There are no races; the GIDs are checked only by filesystem
556 * operations (as far as semantic preservation is concerned). 556 * operations (as far as semantic preservation is concerned).
557 */ 557 */
558 SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 558 SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
559 { 559 {
560 struct user_namespace *ns = current_user_ns(); 560 struct user_namespace *ns = current_user_ns();
561 const struct cred *old; 561 const struct cred *old;
562 struct cred *new; 562 struct cred *new;
563 int retval; 563 int retval;
564 kgid_t krgid, kegid; 564 kgid_t krgid, kegid;
565 565
566 krgid = make_kgid(ns, rgid); 566 krgid = make_kgid(ns, rgid);
567 kegid = make_kgid(ns, egid); 567 kegid = make_kgid(ns, egid);
568 568
569 if ((rgid != (gid_t) -1) && !gid_valid(krgid)) 569 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
570 return -EINVAL; 570 return -EINVAL;
571 if ((egid != (gid_t) -1) && !gid_valid(kegid)) 571 if ((egid != (gid_t) -1) && !gid_valid(kegid))
572 return -EINVAL; 572 return -EINVAL;
573 573
574 new = prepare_creds(); 574 new = prepare_creds();
575 if (!new) 575 if (!new)
576 return -ENOMEM; 576 return -ENOMEM;
577 old = current_cred(); 577 old = current_cred();
578 578
579 retval = -EPERM; 579 retval = -EPERM;
580 if (rgid != (gid_t) -1) { 580 if (rgid != (gid_t) -1) {
581 if (gid_eq(old->gid, krgid) || 581 if (gid_eq(old->gid, krgid) ||
582 gid_eq(old->egid, krgid) || 582 gid_eq(old->egid, krgid) ||
583 nsown_capable(CAP_SETGID)) 583 nsown_capable(CAP_SETGID))
584 new->gid = krgid; 584 new->gid = krgid;
585 else 585 else
586 goto error; 586 goto error;
587 } 587 }
588 if (egid != (gid_t) -1) { 588 if (egid != (gid_t) -1) {
589 if (gid_eq(old->gid, kegid) || 589 if (gid_eq(old->gid, kegid) ||
590 gid_eq(old->egid, kegid) || 590 gid_eq(old->egid, kegid) ||
591 gid_eq(old->sgid, kegid) || 591 gid_eq(old->sgid, kegid) ||
592 nsown_capable(CAP_SETGID)) 592 nsown_capable(CAP_SETGID))
593 new->egid = kegid; 593 new->egid = kegid;
594 else 594 else
595 goto error; 595 goto error;
596 } 596 }
597 597
598 if (rgid != (gid_t) -1 || 598 if (rgid != (gid_t) -1 ||
599 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) 599 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
600 new->sgid = new->egid; 600 new->sgid = new->egid;
601 new->fsgid = new->egid; 601 new->fsgid = new->egid;
602 602
603 return commit_creds(new); 603 return commit_creds(new);
604 604
605 error: 605 error:
606 abort_creds(new); 606 abort_creds(new);
607 return retval; 607 return retval;
608 } 608 }
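
The rule that setting the real gid also resets the saved gid is what lets a setgid binary drop its elevated group for good; a minimal userspace sketch of that pattern (illustrative only):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* In a setgid binary, getgid() is the invoking user's real group.
	 * Setting both real and effective gid to it also resets the saved
	 * gid, so the setgid group cannot be regained later. */
	if (setregid(getgid(), getgid()) < 0) {
		perror("setregid");
		return 1;
	}
	return 0;
}
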
609 609
610 /* 610 /*
611 * setgid() is implemented like SysV w/ SAVED_IDS 611 * setgid() is implemented like SysV w/ SAVED_IDS
612 * 612 *
613 * SMP: Same implicit races as above. 613 * SMP: Same implicit races as above.
614 */ 614 */
615 SYSCALL_DEFINE1(setgid, gid_t, gid) 615 SYSCALL_DEFINE1(setgid, gid_t, gid)
616 { 616 {
617 struct user_namespace *ns = current_user_ns(); 617 struct user_namespace *ns = current_user_ns();
618 const struct cred *old; 618 const struct cred *old;
619 struct cred *new; 619 struct cred *new;
620 int retval; 620 int retval;
621 kgid_t kgid; 621 kgid_t kgid;
622 622
623 kgid = make_kgid(ns, gid); 623 kgid = make_kgid(ns, gid);
624 if (!gid_valid(kgid)) 624 if (!gid_valid(kgid))
625 return -EINVAL; 625 return -EINVAL;
626 626
627 new = prepare_creds(); 627 new = prepare_creds();
628 if (!new) 628 if (!new)
629 return -ENOMEM; 629 return -ENOMEM;
630 old = current_cred(); 630 old = current_cred();
631 631
632 retval = -EPERM; 632 retval = -EPERM;
633 if (nsown_capable(CAP_SETGID)) 633 if (nsown_capable(CAP_SETGID))
634 new->gid = new->egid = new->sgid = new->fsgid = kgid; 634 new->gid = new->egid = new->sgid = new->fsgid = kgid;
635 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 635 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
636 new->egid = new->fsgid = kgid; 636 new->egid = new->fsgid = kgid;
637 else 637 else
638 goto error; 638 goto error;
639 639
640 return commit_creds(new); 640 return commit_creds(new);
641 641
642 error: 642 error:
643 abort_creds(new); 643 abort_creds(new);
644 return retval; 644 return retval;
645 } 645 }
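
For context, a minimal userspace sketch (not part of this diff) of the rule implemented just above: because rgid is supplied explicitly, the saved gid follows the new effective gid, so a setgid helper can drop its elevated group permanently. Error handling is simplified and setgroups() is shown only for completeness (it may additionally require CAP_SETGID).

#include <grp.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Illustrative only: a setgid helper dropping its elevated group for good.
 * Because rgid is passed, the kernel also rewrites the saved gid
 * (new->sgid = new->egid), so the old group cannot be regained. */
int main(void)
{
    gid_t rgid = getgid();

    if (setgroups(1, &rgid) != 0)        /* shrink supplementary groups too */
        perror("setgroups");
    if (setregid(rgid, rgid) != 0) {     /* always check set*id() return values */
        perror("setregid");
        return 1;
    }
    printf("gid=%d egid=%d\n", (int)getgid(), (int)getegid());
    return 0;
}
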
646 646
647 /* 647 /*
648 * change the user struct in a credentials set to match the new UID 648 * change the user struct in a credentials set to match the new UID
649 */ 649 */
650 static int set_user(struct cred *new) 650 static int set_user(struct cred *new)
651 { 651 {
652 struct user_struct *new_user; 652 struct user_struct *new_user;
653 653
654 new_user = alloc_uid(new->uid); 654 new_user = alloc_uid(new->uid);
655 if (!new_user) 655 if (!new_user)
656 return -EAGAIN; 656 return -EAGAIN;
657 657
658 /* 658 /*
659 * We don't fail in case of NPROC limit excess here because too many 659 * We don't fail in case of NPROC limit excess here because too many
660 * poorly written programs don't check set*uid() return code, assuming 660 * poorly written programs don't check set*uid() return code, assuming
661 * it never fails if called by root. We may still enforce NPROC limit 661 * it never fails if called by root. We may still enforce NPROC limit
662 * for programs doing set*uid()+execve() by harmlessly deferring the 662 * for programs doing set*uid()+execve() by harmlessly deferring the
663 * failure to the execve() stage. 663 * failure to the execve() stage.
664 */ 664 */
665 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 665 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
666 new_user != INIT_USER) 666 new_user != INIT_USER)
667 current->flags |= PF_NPROC_EXCEEDED; 667 current->flags |= PF_NPROC_EXCEEDED;
668 else 668 else
669 current->flags &= ~PF_NPROC_EXCEEDED; 669 current->flags &= ~PF_NPROC_EXCEEDED;
670 670
671 free_uid(new->user); 671 free_uid(new->user);
672 new->user = new_user; 672 new->user = new_user;
673 return 0; 673 return 0;
674 } 674 }
675 675
676 /* 676 /*
677 * Unprivileged users may change the real uid to the effective uid 677 * Unprivileged users may change the real uid to the effective uid
678 * or vice versa. (BSD-style) 678 * or vice versa. (BSD-style)
679 * 679 *
680 * If you set the real uid at all, or set the effective uid to a value not 680 * If you set the real uid at all, or set the effective uid to a value not
681 * equal to the real uid, then the saved uid is set to the new effective uid. 681 * equal to the real uid, then the saved uid is set to the new effective uid.
682 * 682 *
683 * This makes it possible for a setuid program to completely drop its 683 * This makes it possible for a setuid program to completely drop its
684 * privileges, which is often a useful assertion to make when you are doing 684 * privileges, which is often a useful assertion to make when you are doing
685 * a security audit over a program. 685 * a security audit over a program.
686 * 686 *
687 * The general idea is that a program which uses just setreuid() will be 687 * The general idea is that a program which uses just setreuid() will be
688 * 100% compatible with BSD. A program which uses just setuid() will be 688 * 100% compatible with BSD. A program which uses just setuid() will be
689 * 100% compatible with POSIX with saved IDs. 689 * 100% compatible with POSIX with saved IDs.
690 */ 690 */
691 SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 691 SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
692 { 692 {
693 struct user_namespace *ns = current_user_ns(); 693 struct user_namespace *ns = current_user_ns();
694 const struct cred *old; 694 const struct cred *old;
695 struct cred *new; 695 struct cred *new;
696 int retval; 696 int retval;
697 kuid_t kruid, keuid; 697 kuid_t kruid, keuid;
698 698
699 kruid = make_kuid(ns, ruid); 699 kruid = make_kuid(ns, ruid);
700 keuid = make_kuid(ns, euid); 700 keuid = make_kuid(ns, euid);
701 701
702 if ((ruid != (uid_t) -1) && !uid_valid(kruid)) 702 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
703 return -EINVAL; 703 return -EINVAL;
704 if ((euid != (uid_t) -1) && !uid_valid(keuid)) 704 if ((euid != (uid_t) -1) && !uid_valid(keuid))
705 return -EINVAL; 705 return -EINVAL;
706 706
707 new = prepare_creds(); 707 new = prepare_creds();
708 if (!new) 708 if (!new)
709 return -ENOMEM; 709 return -ENOMEM;
710 old = current_cred(); 710 old = current_cred();
711 711
712 retval = -EPERM; 712 retval = -EPERM;
713 if (ruid != (uid_t) -1) { 713 if (ruid != (uid_t) -1) {
714 new->uid = kruid; 714 new->uid = kruid;
715 if (!uid_eq(old->uid, kruid) && 715 if (!uid_eq(old->uid, kruid) &&
716 !uid_eq(old->euid, kruid) && 716 !uid_eq(old->euid, kruid) &&
717 !nsown_capable(CAP_SETUID)) 717 !nsown_capable(CAP_SETUID))
718 goto error; 718 goto error;
719 } 719 }
720 720
721 if (euid != (uid_t) -1) { 721 if (euid != (uid_t) -1) {
722 new->euid = keuid; 722 new->euid = keuid;
723 if (!uid_eq(old->uid, keuid) && 723 if (!uid_eq(old->uid, keuid) &&
724 !uid_eq(old->euid, keuid) && 724 !uid_eq(old->euid, keuid) &&
725 !uid_eq(old->suid, keuid) && 725 !uid_eq(old->suid, keuid) &&
726 !nsown_capable(CAP_SETUID)) 726 !nsown_capable(CAP_SETUID))
727 goto error; 727 goto error;
728 } 728 }
729 729
730 if (!uid_eq(new->uid, old->uid)) { 730 if (!uid_eq(new->uid, old->uid)) {
731 retval = set_user(new); 731 retval = set_user(new);
732 if (retval < 0) 732 if (retval < 0)
733 goto error; 733 goto error;
734 } 734 }
735 if (ruid != (uid_t) -1 || 735 if (ruid != (uid_t) -1 ||
736 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) 736 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
737 new->suid = new->euid; 737 new->suid = new->euid;
738 new->fsuid = new->euid; 738 new->fsuid = new->euid;
739 739
740 retval = security_task_fix_setuid(new, old, LSM_SETID_RE); 740 retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
741 if (retval < 0) 741 if (retval < 0)
742 goto error; 742 goto error;
743 743
744 return commit_creds(new); 744 return commit_creds(new);
745 745
746 error: 746 error:
747 abort_creds(new); 747 abort_creds(new);
748 return retval; 748 return retval;
749 } 749 }
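
As a userspace illustration of the BSD-style behaviour described in the comment above (a sketch, not a recommendation), a setuid-root binary started by an ordinary user can swap its real and effective uids to drop privilege temporarily and swap back later:

#include <stdio.h>
#include <unistd.h>

/* Assumes a setuid-root binary run by an ordinary user, so initially
 * ruid == user and euid == 0. */
static void show(const char *when)
{
    printf("%s: ruid=%d euid=%d\n", when, (int)getuid(), (int)geteuid());
}

int main(void)
{
    uid_t user = getuid();

    show("start");
    if (setreuid(0, user) != 0)    /* drop: euid becomes the user, root kept in ruid */
        perror("setreuid(drop)");
    show("dropped");
    if (setreuid(user, 0) != 0)    /* regain: swap back, euid is root again */
        perror("setreuid(regain)");
    show("regained");
    return 0;
}
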
750 750
751 /* 751 /*
752 * setuid() is implemented like SysV with SAVED_IDS 752 * setuid() is implemented like SysV with SAVED_IDS
753 * 753 *
754 * Note that SAVED_ID's is deficient in that a setuid root program 754 * Note that SAVED_ID's is deficient in that a setuid root program
755 * like sendmail, for example, cannot set its uid to be a normal 755 * like sendmail, for example, cannot set its uid to be a normal
756 * user and then switch back, because if you're root, setuid() sets 756 * user and then switch back, because if you're root, setuid() sets
757 * the saved uid too. If you don't like this, blame the bright people 757 * the saved uid too. If you don't like this, blame the bright people
758 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 758 * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
759 * will allow a root program to temporarily drop privileges and be able to 759 * will allow a root program to temporarily drop privileges and be able to
760 * regain them by swapping the real and effective uid. 760 * regain them by swapping the real and effective uid.
761 */ 761 */
762 SYSCALL_DEFINE1(setuid, uid_t, uid) 762 SYSCALL_DEFINE1(setuid, uid_t, uid)
763 { 763 {
764 struct user_namespace *ns = current_user_ns(); 764 struct user_namespace *ns = current_user_ns();
765 const struct cred *old; 765 const struct cred *old;
766 struct cred *new; 766 struct cred *new;
767 int retval; 767 int retval;
768 kuid_t kuid; 768 kuid_t kuid;
769 769
770 kuid = make_kuid(ns, uid); 770 kuid = make_kuid(ns, uid);
771 if (!uid_valid(kuid)) 771 if (!uid_valid(kuid))
772 return -EINVAL; 772 return -EINVAL;
773 773
774 new = prepare_creds(); 774 new = prepare_creds();
775 if (!new) 775 if (!new)
776 return -ENOMEM; 776 return -ENOMEM;
777 old = current_cred(); 777 old = current_cred();
778 778
779 retval = -EPERM; 779 retval = -EPERM;
780 if (nsown_capable(CAP_SETUID)) { 780 if (nsown_capable(CAP_SETUID)) {
781 new->suid = new->uid = kuid; 781 new->suid = new->uid = kuid;
782 if (!uid_eq(kuid, old->uid)) { 782 if (!uid_eq(kuid, old->uid)) {
783 retval = set_user(new); 783 retval = set_user(new);
784 if (retval < 0) 784 if (retval < 0)
785 goto error; 785 goto error;
786 } 786 }
787 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { 787 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
788 goto error; 788 goto error;
789 } 789 }
790 790
791 new->fsuid = new->euid = kuid; 791 new->fsuid = new->euid = kuid;
792 792
793 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 793 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
794 if (retval < 0) 794 if (retval < 0)
795 goto error; 795 goto error;
796 796
797 return commit_creds(new); 797 return commit_creds(new);
798 798
799 error: 799 error:
800 abort_creds(new); 800 abort_creds(new);
801 return retval; 801 return retval;
802 } 802 }
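
A short sketch of the deficiency the comment above describes: when the caller is privileged, setuid() rewrites the real, effective and saved uid together, so the change is irreversible. The program is assumed to start as root; 65534 ("nobody") is just an example target uid.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    if (setuid(65534) != 0) {            /* permanent when running as root */
        perror("setuid");
        return 1;
    }
    /* Trying to come back fails: the saved uid is no longer 0. */
    if (setuid(0) != 0)
        perror("setuid(0) after drop");  /* expected: EPERM */
    printf("uid=%d euid=%d\n", (int)getuid(), (int)geteuid());
    return 0;
}
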
803 803
804 804
805 /* 805 /*
806 * This function implements a generic ability to update ruid, euid, 806 * This function implements a generic ability to update ruid, euid,
807 * and suid. This allows you to implement the 4.4 compatible seteuid(). 807 * and suid. This allows you to implement the 4.4 compatible seteuid().
808 */ 808 */
809 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 809 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
810 { 810 {
811 struct user_namespace *ns = current_user_ns(); 811 struct user_namespace *ns = current_user_ns();
812 const struct cred *old; 812 const struct cred *old;
813 struct cred *new; 813 struct cred *new;
814 int retval; 814 int retval;
815 kuid_t kruid, keuid, ksuid; 815 kuid_t kruid, keuid, ksuid;
816 816
817 kruid = make_kuid(ns, ruid); 817 kruid = make_kuid(ns, ruid);
818 keuid = make_kuid(ns, euid); 818 keuid = make_kuid(ns, euid);
819 ksuid = make_kuid(ns, suid); 819 ksuid = make_kuid(ns, suid);
820 820
821 if ((ruid != (uid_t) -1) && !uid_valid(kruid)) 821 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
822 return -EINVAL; 822 return -EINVAL;
823 823
824 if ((euid != (uid_t) -1) && !uid_valid(keuid)) 824 if ((euid != (uid_t) -1) && !uid_valid(keuid))
825 return -EINVAL; 825 return -EINVAL;
826 826
827 if ((suid != (uid_t) -1) && !uid_valid(ksuid)) 827 if ((suid != (uid_t) -1) && !uid_valid(ksuid))
828 return -EINVAL; 828 return -EINVAL;
829 829
830 new = prepare_creds(); 830 new = prepare_creds();
831 if (!new) 831 if (!new)
832 return -ENOMEM; 832 return -ENOMEM;
833 833
834 old = current_cred(); 834 old = current_cred();
835 835
836 retval = -EPERM; 836 retval = -EPERM;
837 if (!nsown_capable(CAP_SETUID)) { 837 if (!nsown_capable(CAP_SETUID)) {
838 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 838 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
839 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 839 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
840 goto error; 840 goto error;
841 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && 841 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
842 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) 842 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
843 goto error; 843 goto error;
844 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && 844 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
845 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) 845 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
846 goto error; 846 goto error;
847 } 847 }
848 848
849 if (ruid != (uid_t) -1) { 849 if (ruid != (uid_t) -1) {
850 new->uid = kruid; 850 new->uid = kruid;
851 if (!uid_eq(kruid, old->uid)) { 851 if (!uid_eq(kruid, old->uid)) {
852 retval = set_user(new); 852 retval = set_user(new);
853 if (retval < 0) 853 if (retval < 0)
854 goto error; 854 goto error;
855 } 855 }
856 } 856 }
857 if (euid != (uid_t) -1) 857 if (euid != (uid_t) -1)
858 new->euid = keuid; 858 new->euid = keuid;
859 if (suid != (uid_t) -1) 859 if (suid != (uid_t) -1)
860 new->suid = ksuid; 860 new->suid = ksuid;
861 new->fsuid = new->euid; 861 new->fsuid = new->euid;
862 862
863 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 863 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
864 if (retval < 0) 864 if (retval < 0)
865 goto error; 865 goto error;
866 866
867 return commit_creds(new); 867 return commit_creds(new);
868 868
869 error: 869 error:
870 abort_creds(new); 870 abort_creds(new);
871 return retval; 871 return retval;
872 } 872 }
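
The comment before setresuid() notes that it is general enough to build a 4.4BSD-compatible seteuid() on top of it. A hedged userspace sketch of that mapping (glibc's real wrappers may differ in detail): pass -1 for every id that should stay untouched.

#define _GNU_SOURCE
#include <sys/types.h>
#include <unistd.h>

static int my_seteuid(uid_t euid)
{
    return setresuid((uid_t)-1, euid, (uid_t)-1);   /* change euid only */
}

static int my_setegid(gid_t egid)
{
    return setresgid((gid_t)-1, egid, (gid_t)-1);   /* change egid only */
}

int main(void)
{
    /* Drop only the effective ids back to the real ones. */
    if (my_seteuid(getuid()) != 0 || my_setegid(getgid()) != 0)
        return 1;
    return 0;
}
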
873 873
874 SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) 874 SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
875 { 875 {
876 const struct cred *cred = current_cred(); 876 const struct cred *cred = current_cred();
877 int retval; 877 int retval;
878 uid_t ruid, euid, suid; 878 uid_t ruid, euid, suid;
879 879
880 ruid = from_kuid_munged(cred->user_ns, cred->uid); 880 ruid = from_kuid_munged(cred->user_ns, cred->uid);
881 euid = from_kuid_munged(cred->user_ns, cred->euid); 881 euid = from_kuid_munged(cred->user_ns, cred->euid);
882 suid = from_kuid_munged(cred->user_ns, cred->suid); 882 suid = from_kuid_munged(cred->user_ns, cred->suid);
883 883
884 if (!(retval = put_user(ruid, ruidp)) && 884 if (!(retval = put_user(ruid, ruidp)) &&
885 !(retval = put_user(euid, euidp))) 885 !(retval = put_user(euid, euidp)))
886 retval = put_user(suid, suidp); 886 retval = put_user(suid, suidp);
887 887
888 return retval; 888 return retval;
889 } 889 }
890 890
891 /* 891 /*
892 * Same as above, but for rgid, egid, sgid. 892 * Same as above, but for rgid, egid, sgid.
893 */ 893 */
894 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 894 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
895 { 895 {
896 struct user_namespace *ns = current_user_ns(); 896 struct user_namespace *ns = current_user_ns();
897 const struct cred *old; 897 const struct cred *old;
898 struct cred *new; 898 struct cred *new;
899 int retval; 899 int retval;
900 kgid_t krgid, kegid, ksgid; 900 kgid_t krgid, kegid, ksgid;
901 901
902 krgid = make_kgid(ns, rgid); 902 krgid = make_kgid(ns, rgid);
903 kegid = make_kgid(ns, egid); 903 kegid = make_kgid(ns, egid);
904 ksgid = make_kgid(ns, sgid); 904 ksgid = make_kgid(ns, sgid);
905 905
906 if ((rgid != (gid_t) -1) && !gid_valid(krgid)) 906 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
907 return -EINVAL; 907 return -EINVAL;
908 if ((egid != (gid_t) -1) && !gid_valid(kegid)) 908 if ((egid != (gid_t) -1) && !gid_valid(kegid))
909 return -EINVAL; 909 return -EINVAL;
910 if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) 910 if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
911 return -EINVAL; 911 return -EINVAL;
912 912
913 new = prepare_creds(); 913 new = prepare_creds();
914 if (!new) 914 if (!new)
915 return -ENOMEM; 915 return -ENOMEM;
916 old = current_cred(); 916 old = current_cred();
917 917
918 retval = -EPERM; 918 retval = -EPERM;
919 if (!nsown_capable(CAP_SETGID)) { 919 if (!nsown_capable(CAP_SETGID)) {
920 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 920 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
921 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 921 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
922 goto error; 922 goto error;
923 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && 923 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
924 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) 924 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
925 goto error; 925 goto error;
926 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && 926 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
927 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) 927 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
928 goto error; 928 goto error;
929 } 929 }
930 930
931 if (rgid != (gid_t) -1) 931 if (rgid != (gid_t) -1)
932 new->gid = krgid; 932 new->gid = krgid;
933 if (egid != (gid_t) -1) 933 if (egid != (gid_t) -1)
934 new->egid = kegid; 934 new->egid = kegid;
935 if (sgid != (gid_t) -1) 935 if (sgid != (gid_t) -1)
936 new->sgid = ksgid; 936 new->sgid = ksgid;
937 new->fsgid = new->egid; 937 new->fsgid = new->egid;
938 938
939 return commit_creds(new); 939 return commit_creds(new);
940 940
941 error: 941 error:
942 abort_creds(new); 942 abort_creds(new);
943 return retval; 943 return retval;
944 } 944 }
945 945
946 SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) 946 SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
947 { 947 {
948 const struct cred *cred = current_cred(); 948 const struct cred *cred = current_cred();
949 int retval; 949 int retval;
950 gid_t rgid, egid, sgid; 950 gid_t rgid, egid, sgid;
951 951
952 rgid = from_kgid_munged(cred->user_ns, cred->gid); 952 rgid = from_kgid_munged(cred->user_ns, cred->gid);
953 egid = from_kgid_munged(cred->user_ns, cred->egid); 953 egid = from_kgid_munged(cred->user_ns, cred->egid);
954 sgid = from_kgid_munged(cred->user_ns, cred->sgid); 954 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
955 955
956 if (!(retval = put_user(rgid, rgidp)) && 956 if (!(retval = put_user(rgid, rgidp)) &&
957 !(retval = put_user(egid, egidp))) 957 !(retval = put_user(egid, egidp)))
958 retval = put_user(sgid, sgidp); 958 retval = put_user(sgid, sgidp);
959 959
960 return retval; 960 return retval;
961 } 961 }
962 962
963 963
964 /* 964 /*
965 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This 965 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
966 * is used for "access()" and for the NFS daemon (letting nfsd stay at 966 * is used for "access()" and for the NFS daemon (letting nfsd stay at
967 * whatever uid it wants to). It normally shadows "euid", except when 967 * whatever uid it wants to). It normally shadows "euid", except when
968 * explicitly set by setfsuid() or for access.. 968 * explicitly set by setfsuid() or for access..
969 */ 969 */
970 SYSCALL_DEFINE1(setfsuid, uid_t, uid) 970 SYSCALL_DEFINE1(setfsuid, uid_t, uid)
971 { 971 {
972 const struct cred *old; 972 const struct cred *old;
973 struct cred *new; 973 struct cred *new;
974 uid_t old_fsuid; 974 uid_t old_fsuid;
975 kuid_t kuid; 975 kuid_t kuid;
976 976
977 old = current_cred(); 977 old = current_cred();
978 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); 978 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
979 979
980 kuid = make_kuid(old->user_ns, uid); 980 kuid = make_kuid(old->user_ns, uid);
981 if (!uid_valid(kuid)) 981 if (!uid_valid(kuid))
982 return old_fsuid; 982 return old_fsuid;
983 983
984 new = prepare_creds(); 984 new = prepare_creds();
985 if (!new) 985 if (!new)
986 return old_fsuid; 986 return old_fsuid;
987 987
988 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 988 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
989 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 989 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
990 nsown_capable(CAP_SETUID)) { 990 nsown_capable(CAP_SETUID)) {
991 if (!uid_eq(kuid, old->fsuid)) { 991 if (!uid_eq(kuid, old->fsuid)) {
992 new->fsuid = kuid; 992 new->fsuid = kuid;
993 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 993 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
994 goto change_okay; 994 goto change_okay;
995 } 995 }
996 } 996 }
997 997
998 abort_creds(new); 998 abort_creds(new);
999 return old_fsuid; 999 return old_fsuid;
1000 1000
1001 change_okay: 1001 change_okay:
1002 commit_creds(new); 1002 commit_creds(new);
1003 return old_fsuid; 1003 return old_fsuid;
1004 } 1004 }
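
A hedged sketch of the nfsd-style pattern the setfsuid() comment alludes to: a server task switches only its filesystem uid around an access check, leaving euid (and thus signalling rules) untouched. The path and uid below are made up for illustration; note that setfsuid() returns the previous fsuid rather than an error code.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/fsuid.h>
#include <unistd.h>

/* Open a file "as" the client user for permission purposes only. */
static int open_as_user(const char *path, uid_t client_uid)
{
    int saved = setfsuid(client_uid);   /* switch fsuid for this task */
    int fd = open(path, O_RDONLY);      /* permission check uses fsuid */

    setfsuid(saved);                    /* always restore */
    return fd;
}

int main(void)
{
    int fd = open_as_user("/tmp/example.txt", 1000);  /* hypothetical path/uid */

    if (fd >= 0)
        close(fd);
    else
        perror("open_as_user");
    return 0;
}
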
1005 1005
1006 /* 1006 /*
1007 * Samma på svenska.. 1007 * Samma på svenska..
1008 */ 1008 */
1009 SYSCALL_DEFINE1(setfsgid, gid_t, gid) 1009 SYSCALL_DEFINE1(setfsgid, gid_t, gid)
1010 { 1010 {
1011 const struct cred *old; 1011 const struct cred *old;
1012 struct cred *new; 1012 struct cred *new;
1013 gid_t old_fsgid; 1013 gid_t old_fsgid;
1014 kgid_t kgid; 1014 kgid_t kgid;
1015 1015
1016 old = current_cred(); 1016 old = current_cred();
1017 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); 1017 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
1018 1018
1019 kgid = make_kgid(old->user_ns, gid); 1019 kgid = make_kgid(old->user_ns, gid);
1020 if (!gid_valid(kgid)) 1020 if (!gid_valid(kgid))
1021 return old_fsgid; 1021 return old_fsgid;
1022 1022
1023 new = prepare_creds(); 1023 new = prepare_creds();
1024 if (!new) 1024 if (!new)
1025 return old_fsgid; 1025 return old_fsgid;
1026 1026
1027 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 1027 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
1028 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 1028 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
1029 nsown_capable(CAP_SETGID)) { 1029 nsown_capable(CAP_SETGID)) {
1030 if (!gid_eq(kgid, old->fsgid)) { 1030 if (!gid_eq(kgid, old->fsgid)) {
1031 new->fsgid = kgid; 1031 new->fsgid = kgid;
1032 goto change_okay; 1032 goto change_okay;
1033 } 1033 }
1034 } 1034 }
1035 1035
1036 abort_creds(new); 1036 abort_creds(new);
1037 return old_fsgid; 1037 return old_fsgid;
1038 1038
1039 change_okay: 1039 change_okay:
1040 commit_creds(new); 1040 commit_creds(new);
1041 return old_fsgid; 1041 return old_fsgid;
1042 } 1042 }
1043 1043
1044 void do_sys_times(struct tms *tms) 1044 void do_sys_times(struct tms *tms)
1045 { 1045 {
1046 cputime_t tgutime, tgstime, cutime, cstime; 1046 cputime_t tgutime, tgstime, cutime, cstime;
1047 1047
1048 spin_lock_irq(&current->sighand->siglock); 1048 spin_lock_irq(&current->sighand->siglock);
1049 thread_group_times(current, &tgutime, &tgstime); 1049 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
1050 cutime = current->signal->cutime; 1050 cutime = current->signal->cutime;
1051 cstime = current->signal->cstime; 1051 cstime = current->signal->cstime;
1052 spin_unlock_irq(&current->sighand->siglock); 1052 spin_unlock_irq(&current->sighand->siglock);
1053 tms->tms_utime = cputime_to_clock_t(tgutime); 1053 tms->tms_utime = cputime_to_clock_t(tgutime);
1054 tms->tms_stime = cputime_to_clock_t(tgstime); 1054 tms->tms_stime = cputime_to_clock_t(tgstime);
1055 tms->tms_cutime = cputime_to_clock_t(cutime); 1055 tms->tms_cutime = cputime_to_clock_t(cutime);
1056 tms->tms_cstime = cputime_to_clock_t(cstime); 1056 tms->tms_cstime = cputime_to_clock_t(cstime);
1057 } 1057 }
1058 1058
1059 SYSCALL_DEFINE1(times, struct tms __user *, tbuf) 1059 SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
1060 { 1060 {
1061 if (tbuf) { 1061 if (tbuf) {
1062 struct tms tmp; 1062 struct tms tmp;
1063 1063
1064 do_sys_times(&tmp); 1064 do_sys_times(&tmp);
1065 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 1065 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
1066 return -EFAULT; 1066 return -EFAULT;
1067 } 1067 }
1068 force_successful_syscall_return(); 1068 force_successful_syscall_return();
1069 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 1069 return (long) jiffies_64_to_clock_t(get_jiffies_64());
1070 } 1070 }
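
For reference, a small userspace sketch of how the tms values filled in by do_sys_times() are typically consumed; the busy loop is only an illustrative workload.

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
    struct tms t;
    long hz = sysconf(_SC_CLK_TCK);     /* clock ticks per second */
    volatile double x = 0;

    for (long i = 0; i < 50 * 1000 * 1000; i++)   /* burn some CPU */
        x += i;

    clock_t elapsed = times(&t);        /* wall-clock ticks since an arbitrary origin */
    printf("user=%.2fs sys=%.2fs (elapsed ticks: %ld)\n",
           (double)t.tms_utime / hz, (double)t.tms_stime / hz, (long)elapsed);
    return 0;
}
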
1071 1071
1072 /* 1072 /*
1073 * This needs some heavy checking ... 1073 * This needs some heavy checking ...
1074 * I just haven't the stomach for it. I also don't fully 1074 * I just haven't the stomach for it. I also don't fully
1075 * understand sessions/pgrp etc. Let somebody who does explain it. 1075 * understand sessions/pgrp etc. Let somebody who does explain it.
1076 * 1076 *
1077 * OK, I think I have the protection semantics right.... this is really 1077 * OK, I think I have the protection semantics right.... this is really
1078 * only important on a multi-user system anyway, to make sure one user 1078 * only important on a multi-user system anyway, to make sure one user
1079 * can't send a signal to a process owned by another. -TYT, 12/12/91 1079 * can't send a signal to a process owned by another. -TYT, 12/12/91
1080 * 1080 *
1081 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 1081 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
1082 * LBT 04.03.94 1082 * LBT 04.03.94
1083 */ 1083 */
1084 SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) 1084 SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
1085 { 1085 {
1086 struct task_struct *p; 1086 struct task_struct *p;
1087 struct task_struct *group_leader = current->group_leader; 1087 struct task_struct *group_leader = current->group_leader;
1088 struct pid *pgrp; 1088 struct pid *pgrp;
1089 int err; 1089 int err;
1090 1090
1091 if (!pid) 1091 if (!pid)
1092 pid = task_pid_vnr(group_leader); 1092 pid = task_pid_vnr(group_leader);
1093 if (!pgid) 1093 if (!pgid)
1094 pgid = pid; 1094 pgid = pid;
1095 if (pgid < 0) 1095 if (pgid < 0)
1096 return -EINVAL; 1096 return -EINVAL;
1097 rcu_read_lock(); 1097 rcu_read_lock();
1098 1098
1099 /* From this point forward we keep holding onto the tasklist lock 1099 /* From this point forward we keep holding onto the tasklist lock
1100 * so that our parent does not change from under us. -DaveM 1100 * so that our parent does not change from under us. -DaveM
1101 */ 1101 */
1102 write_lock_irq(&tasklist_lock); 1102 write_lock_irq(&tasklist_lock);
1103 1103
1104 err = -ESRCH; 1104 err = -ESRCH;
1105 p = find_task_by_vpid(pid); 1105 p = find_task_by_vpid(pid);
1106 if (!p) 1106 if (!p)
1107 goto out; 1107 goto out;
1108 1108
1109 err = -EINVAL; 1109 err = -EINVAL;
1110 if (!thread_group_leader(p)) 1110 if (!thread_group_leader(p))
1111 goto out; 1111 goto out;
1112 1112
1113 if (same_thread_group(p->real_parent, group_leader)) { 1113 if (same_thread_group(p->real_parent, group_leader)) {
1114 err = -EPERM; 1114 err = -EPERM;
1115 if (task_session(p) != task_session(group_leader)) 1115 if (task_session(p) != task_session(group_leader))
1116 goto out; 1116 goto out;
1117 err = -EACCES; 1117 err = -EACCES;
1118 if (p->did_exec) 1118 if (p->did_exec)
1119 goto out; 1119 goto out;
1120 } else { 1120 } else {
1121 err = -ESRCH; 1121 err = -ESRCH;
1122 if (p != group_leader) 1122 if (p != group_leader)
1123 goto out; 1123 goto out;
1124 } 1124 }
1125 1125
1126 err = -EPERM; 1126 err = -EPERM;
1127 if (p->signal->leader) 1127 if (p->signal->leader)
1128 goto out; 1128 goto out;
1129 1129
1130 pgrp = task_pid(p); 1130 pgrp = task_pid(p);
1131 if (pgid != pid) { 1131 if (pgid != pid) {
1132 struct task_struct *g; 1132 struct task_struct *g;
1133 1133
1134 pgrp = find_vpid(pgid); 1134 pgrp = find_vpid(pgid);
1135 g = pid_task(pgrp, PIDTYPE_PGID); 1135 g = pid_task(pgrp, PIDTYPE_PGID);
1136 if (!g || task_session(g) != task_session(group_leader)) 1136 if (!g || task_session(g) != task_session(group_leader))
1137 goto out; 1137 goto out;
1138 } 1138 }
1139 1139
1140 err = security_task_setpgid(p, pgid); 1140 err = security_task_setpgid(p, pgid);
1141 if (err) 1141 if (err)
1142 goto out; 1142 goto out;
1143 1143
1144 if (task_pgrp(p) != pgrp) 1144 if (task_pgrp(p) != pgrp)
1145 change_pid(p, PIDTYPE_PGID, pgrp); 1145 change_pid(p, PIDTYPE_PGID, pgrp);
1146 1146
1147 err = 0; 1147 err = 0;
1148 out: 1148 out:
1149 /* All paths lead to here, thus we are safe. -DaveM */ 1149 /* All paths lead to here, thus we are safe. -DaveM */
1150 write_unlock_irq(&tasklist_lock); 1150 write_unlock_irq(&tasklist_lock);
1151 rcu_read_unlock(); 1151 rcu_read_unlock();
1152 return err; 1152 return err;
1153 } 1153 }
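
To make the pgid rules above concrete, a userspace sketch of the usual shell job-control pattern (a sketch only; real shells also manage the controlling terminal). Both parent and child call setpgid() so the group exists whichever runs first; an EACCES in the parent after the child has exec'd is harmless, which is exactly the did_exec case checked above.

#include <errno.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t child = fork();

    if (child == 0) {
        setpgid(0, 0);            /* child: new group, pgid == its own pid */
        execlp("sleep", "sleep", "1", (char *)NULL);
        _exit(127);
    }
    if (setpgid(child, child) != 0 && errno != EACCES)
        perror("setpgid");
    printf("child %d in process group %d\n", (int)child, (int)getpgid(child));
    waitpid(child, NULL, 0);
    return 0;
}
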
1154 1154
1155 SYSCALL_DEFINE1(getpgid, pid_t, pid) 1155 SYSCALL_DEFINE1(getpgid, pid_t, pid)
1156 { 1156 {
1157 struct task_struct *p; 1157 struct task_struct *p;
1158 struct pid *grp; 1158 struct pid *grp;
1159 int retval; 1159 int retval;
1160 1160
1161 rcu_read_lock(); 1161 rcu_read_lock();
1162 if (!pid) 1162 if (!pid)
1163 grp = task_pgrp(current); 1163 grp = task_pgrp(current);
1164 else { 1164 else {
1165 retval = -ESRCH; 1165 retval = -ESRCH;
1166 p = find_task_by_vpid(pid); 1166 p = find_task_by_vpid(pid);
1167 if (!p) 1167 if (!p)
1168 goto out; 1168 goto out;
1169 grp = task_pgrp(p); 1169 grp = task_pgrp(p);
1170 if (!grp) 1170 if (!grp)
1171 goto out; 1171 goto out;
1172 1172
1173 retval = security_task_getpgid(p); 1173 retval = security_task_getpgid(p);
1174 if (retval) 1174 if (retval)
1175 goto out; 1175 goto out;
1176 } 1176 }
1177 retval = pid_vnr(grp); 1177 retval = pid_vnr(grp);
1178 out: 1178 out:
1179 rcu_read_unlock(); 1179 rcu_read_unlock();
1180 return retval; 1180 return retval;
1181 } 1181 }
1182 1182
1183 #ifdef __ARCH_WANT_SYS_GETPGRP 1183 #ifdef __ARCH_WANT_SYS_GETPGRP
1184 1184
1185 SYSCALL_DEFINE0(getpgrp) 1185 SYSCALL_DEFINE0(getpgrp)
1186 { 1186 {
1187 return sys_getpgid(0); 1187 return sys_getpgid(0);
1188 } 1188 }
1189 1189
1190 #endif 1190 #endif
1191 1191
1192 SYSCALL_DEFINE1(getsid, pid_t, pid) 1192 SYSCALL_DEFINE1(getsid, pid_t, pid)
1193 { 1193 {
1194 struct task_struct *p; 1194 struct task_struct *p;
1195 struct pid *sid; 1195 struct pid *sid;
1196 int retval; 1196 int retval;
1197 1197
1198 rcu_read_lock(); 1198 rcu_read_lock();
1199 if (!pid) 1199 if (!pid)
1200 sid = task_session(current); 1200 sid = task_session(current);
1201 else { 1201 else {
1202 retval = -ESRCH; 1202 retval = -ESRCH;
1203 p = find_task_by_vpid(pid); 1203 p = find_task_by_vpid(pid);
1204 if (!p) 1204 if (!p)
1205 goto out; 1205 goto out;
1206 sid = task_session(p); 1206 sid = task_session(p);
1207 if (!sid) 1207 if (!sid)
1208 goto out; 1208 goto out;
1209 1209
1210 retval = security_task_getsid(p); 1210 retval = security_task_getsid(p);
1211 if (retval) 1211 if (retval)
1212 goto out; 1212 goto out;
1213 } 1213 }
1214 retval = pid_vnr(sid); 1214 retval = pid_vnr(sid);
1215 out: 1215 out:
1216 rcu_read_unlock(); 1216 rcu_read_unlock();
1217 return retval; 1217 return retval;
1218 } 1218 }
1219 1219
1220 SYSCALL_DEFINE0(setsid) 1220 SYSCALL_DEFINE0(setsid)
1221 { 1221 {
1222 struct task_struct *group_leader = current->group_leader; 1222 struct task_struct *group_leader = current->group_leader;
1223 struct pid *sid = task_pid(group_leader); 1223 struct pid *sid = task_pid(group_leader);
1224 pid_t session = pid_vnr(sid); 1224 pid_t session = pid_vnr(sid);
1225 int err = -EPERM; 1225 int err = -EPERM;
1226 1226
1227 write_lock_irq(&tasklist_lock); 1227 write_lock_irq(&tasklist_lock);
1228 /* Fail if I am already a session leader */ 1228 /* Fail if I am already a session leader */
1229 if (group_leader->signal->leader) 1229 if (group_leader->signal->leader)
1230 goto out; 1230 goto out;
1231 1231
1232 /* Fail if a process group id already exists that equals the 1232 /* Fail if a process group id already exists that equals the
1233 * proposed session id. 1233 * proposed session id.
1234 */ 1234 */
1235 if (pid_task(sid, PIDTYPE_PGID)) 1235 if (pid_task(sid, PIDTYPE_PGID))
1236 goto out; 1236 goto out;
1237 1237
1238 group_leader->signal->leader = 1; 1238 group_leader->signal->leader = 1;
1239 __set_special_pids(sid); 1239 __set_special_pids(sid);
1240 1240
1241 proc_clear_tty(group_leader); 1241 proc_clear_tty(group_leader);
1242 1242
1243 err = session; 1243 err = session;
1244 out: 1244 out:
1245 write_unlock_irq(&tasklist_lock); 1245 write_unlock_irq(&tasklist_lock);
1246 if (err > 0) { 1246 if (err > 0) {
1247 proc_sid_connector(group_leader); 1247 proc_sid_connector(group_leader);
1248 sched_autogroup_create_attach(group_leader); 1248 sched_autogroup_create_attach(group_leader);
1249 } 1249 }
1250 return err; 1250 return err;
1251 } 1251 }
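
A minimal daemonization sketch showing why setsid() is normally called from a forked child: a process launched as a shell job is typically already a process-group leader, so its own setsid() would fail with the -EPERM check above.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid < 0)
        exit(EXIT_FAILURE);
    if (pid > 0)
        exit(EXIT_SUCCESS);       /* parent exits; child is not a group leader */

    if (setsid() < 0) {           /* child becomes session + group leader, no tty */
        perror("setsid");
        exit(EXIT_FAILURE);
    }
    /* ... continue daemon setup: chdir("/"), close fds, second fork, etc. ... */
    return 0;
}
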
1252 1252
1253 DECLARE_RWSEM(uts_sem); 1253 DECLARE_RWSEM(uts_sem);
1254 1254
1255 #ifdef COMPAT_UTS_MACHINE 1255 #ifdef COMPAT_UTS_MACHINE
1256 #define override_architecture(name) \ 1256 #define override_architecture(name) \
1257 (personality(current->personality) == PER_LINUX32 && \ 1257 (personality(current->personality) == PER_LINUX32 && \
1258 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ 1258 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1259 sizeof(COMPAT_UTS_MACHINE))) 1259 sizeof(COMPAT_UTS_MACHINE)))
1260 #else 1260 #else
1261 #define override_architecture(name) 0 1261 #define override_architecture(name) 0
1262 #endif 1262 #endif
1263 1263
1264 /* 1264 /*
1265 * Work around broken programs that cannot handle "Linux 3.0". 1265 * Work around broken programs that cannot handle "Linux 3.0".
1266 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1266 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1267 */ 1267 */
1268 static int override_release(char __user *release, size_t len) 1268 static int override_release(char __user *release, size_t len)
1269 { 1269 {
1270 int ret = 0; 1270 int ret = 0;
1271 1271
1272 if (current->personality & UNAME26) { 1272 if (current->personality & UNAME26) {
1273 const char *rest = UTS_RELEASE; 1273 const char *rest = UTS_RELEASE;
1274 char buf[65] = { 0 }; 1274 char buf[65] = { 0 };
1275 int ndots = 0; 1275 int ndots = 0;
1276 unsigned v; 1276 unsigned v;
1277 size_t copy; 1277 size_t copy;
1278 1278
1279 while (*rest) { 1279 while (*rest) {
1280 if (*rest == '.' && ++ndots >= 3) 1280 if (*rest == '.' && ++ndots >= 3)
1281 break; 1281 break;
1282 if (!isdigit(*rest) && *rest != '.') 1282 if (!isdigit(*rest) && *rest != '.')
1283 break; 1283 break;
1284 rest++; 1284 rest++;
1285 } 1285 }
1286 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; 1286 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1287 copy = clamp_t(size_t, len, 1, sizeof(buf)); 1287 copy = clamp_t(size_t, len, 1, sizeof(buf));
1288 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); 1288 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
1289 ret = copy_to_user(release, buf, copy + 1); 1289 ret = copy_to_user(release, buf, copy + 1);
1290 } 1290 }
1291 return ret; 1291 return ret;
1292 } 1292 }
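
A worked example of the mapping performed by override_release() (the FAKE_* constants below are stand-ins for the kernel's build-time LINUX_VERSION_CODE and UTS_RELEASE, assumed here to describe a 3.7.1 kernel): the minor number 7 becomes 40 + 7 = 47, and everything after the digit/dot prefix is appended unchanged.

#include <stdio.h>

#define FAKE_LINUX_VERSION_CODE ((3 << 16) + (7 << 8) + 1)   /* 3.7.1 */
#define FAKE_UTS_RELEASE "3.7.1-extra"

int main(void)
{
    unsigned v = ((FAKE_LINUX_VERSION_CODE >> 8) & 0xff) + 40;
    const char *rest = "-extra";   /* what the digit/dot scan leaves over */

    printf("2.6.%u%s\n", v, rest); /* prints "2.6.47-extra" */
    return 0;
}
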
1293 1293
1294 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1294 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1295 { 1295 {
1296 int errno = 0; 1296 int errno = 0;
1297 1297
1298 down_read(&uts_sem); 1298 down_read(&uts_sem);
1299 if (copy_to_user(name, utsname(), sizeof *name)) 1299 if (copy_to_user(name, utsname(), sizeof *name))
1300 errno = -EFAULT; 1300 errno = -EFAULT;
1301 up_read(&uts_sem); 1301 up_read(&uts_sem);
1302 1302
1303 if (!errno && override_release(name->release, sizeof(name->release))) 1303 if (!errno && override_release(name->release, sizeof(name->release)))
1304 errno = -EFAULT; 1304 errno = -EFAULT;
1305 if (!errno && override_architecture(name)) 1305 if (!errno && override_architecture(name))
1306 errno = -EFAULT; 1306 errno = -EFAULT;
1307 return errno; 1307 return errno;
1308 } 1308 }
1309 1309
1310 #ifdef __ARCH_WANT_SYS_OLD_UNAME 1310 #ifdef __ARCH_WANT_SYS_OLD_UNAME
1311 /* 1311 /*
1312 * Old cruft 1312 * Old cruft
1313 */ 1313 */
1314 SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) 1314 SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1315 { 1315 {
1316 int error = 0; 1316 int error = 0;
1317 1317
1318 if (!name) 1318 if (!name)
1319 return -EFAULT; 1319 return -EFAULT;
1320 1320
1321 down_read(&uts_sem); 1321 down_read(&uts_sem);
1322 if (copy_to_user(name, utsname(), sizeof(*name))) 1322 if (copy_to_user(name, utsname(), sizeof(*name)))
1323 error = -EFAULT; 1323 error = -EFAULT;
1324 up_read(&uts_sem); 1324 up_read(&uts_sem);
1325 1325
1326 if (!error && override_release(name->release, sizeof(name->release))) 1326 if (!error && override_release(name->release, sizeof(name->release)))
1327 error = -EFAULT; 1327 error = -EFAULT;
1328 if (!error && override_architecture(name)) 1328 if (!error && override_architecture(name))
1329 error = -EFAULT; 1329 error = -EFAULT;
1330 return error; 1330 return error;
1331 } 1331 }
1332 1332
1333 SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) 1333 SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1334 { 1334 {
1335 int error; 1335 int error;
1336 1336
1337 if (!name) 1337 if (!name)
1338 return -EFAULT; 1338 return -EFAULT;
1339 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) 1339 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1340 return -EFAULT; 1340 return -EFAULT;
1341 1341
1342 down_read(&uts_sem); 1342 down_read(&uts_sem);
1343 error = __copy_to_user(&name->sysname, &utsname()->sysname, 1343 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1344 __OLD_UTS_LEN); 1344 __OLD_UTS_LEN);
1345 error |= __put_user(0, name->sysname + __OLD_UTS_LEN); 1345 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1346 error |= __copy_to_user(&name->nodename, &utsname()->nodename, 1346 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1347 __OLD_UTS_LEN); 1347 __OLD_UTS_LEN);
1348 error |= __put_user(0, name->nodename + __OLD_UTS_LEN); 1348 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1349 error |= __copy_to_user(&name->release, &utsname()->release, 1349 error |= __copy_to_user(&name->release, &utsname()->release,
1350 __OLD_UTS_LEN); 1350 __OLD_UTS_LEN);
1351 error |= __put_user(0, name->release + __OLD_UTS_LEN); 1351 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1352 error |= __copy_to_user(&name->version, &utsname()->version, 1352 error |= __copy_to_user(&name->version, &utsname()->version,
1353 __OLD_UTS_LEN); 1353 __OLD_UTS_LEN);
1354 error |= __put_user(0, name->version + __OLD_UTS_LEN); 1354 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1355 error |= __copy_to_user(&name->machine, &utsname()->machine, 1355 error |= __copy_to_user(&name->machine, &utsname()->machine,
1356 __OLD_UTS_LEN); 1356 __OLD_UTS_LEN);
1357 error |= __put_user(0, name->machine + __OLD_UTS_LEN); 1357 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1358 up_read(&uts_sem); 1358 up_read(&uts_sem);
1359 1359
1360 if (!error && override_architecture(name)) 1360 if (!error && override_architecture(name))
1361 error = -EFAULT; 1361 error = -EFAULT;
1362 if (!error && override_release(name->release, sizeof(name->release))) 1362 if (!error && override_release(name->release, sizeof(name->release)))
1363 error = -EFAULT; 1363 error = -EFAULT;
1364 return error ? -EFAULT : 0; 1364 return error ? -EFAULT : 0;
1365 } 1365 }
1366 #endif 1366 #endif
1367 1367
1368 SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1368 SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1369 { 1369 {
1370 int errno; 1370 int errno;
1371 char tmp[__NEW_UTS_LEN]; 1371 char tmp[__NEW_UTS_LEN];
1372 1372
1373 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) 1373 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1374 return -EPERM; 1374 return -EPERM;
1375 1375
1376 if (len < 0 || len > __NEW_UTS_LEN) 1376 if (len < 0 || len > __NEW_UTS_LEN)
1377 return -EINVAL; 1377 return -EINVAL;
1378 down_write(&uts_sem); 1378 down_write(&uts_sem);
1379 errno = -EFAULT; 1379 errno = -EFAULT;
1380 if (!copy_from_user(tmp, name, len)) { 1380 if (!copy_from_user(tmp, name, len)) {
1381 struct new_utsname *u = utsname(); 1381 struct new_utsname *u = utsname();
1382 1382
1383 memcpy(u->nodename, tmp, len); 1383 memcpy(u->nodename, tmp, len);
1384 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1384 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1385 errno = 0; 1385 errno = 0;
1386 uts_proc_notify(UTS_PROC_HOSTNAME); 1386 uts_proc_notify(UTS_PROC_HOSTNAME);
1387 } 1387 }
1388 up_write(&uts_sem); 1388 up_write(&uts_sem);
1389 return errno; 1389 return errno;
1390 } 1390 }
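
Userspace sketch of the call implemented above; it needs CAP_SYS_ADMIN in the UTS namespace's owning user namespace, so it is typically run as root (the hostname string is an arbitrary example).

#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char *name = "buildbox-01";          /* example hostname */

    if (sethostname(name, strlen(name)) != 0)  /* len must not exceed __NEW_UTS_LEN */
        perror("sethostname");                 /* EPERM without CAP_SYS_ADMIN */
    return 0;
}
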
1391 1391
1392 #ifdef __ARCH_WANT_SYS_GETHOSTNAME 1392 #ifdef __ARCH_WANT_SYS_GETHOSTNAME
1393 1393
1394 SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) 1394 SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
1395 { 1395 {
1396 int i, errno; 1396 int i, errno;
1397 struct new_utsname *u; 1397 struct new_utsname *u;
1398 1398
1399 if (len < 0) 1399 if (len < 0)
1400 return -EINVAL; 1400 return -EINVAL;
1401 down_read(&uts_sem); 1401 down_read(&uts_sem);
1402 u = utsname(); 1402 u = utsname();
1403 i = 1 + strlen(u->nodename); 1403 i = 1 + strlen(u->nodename);
1404 if (i > len) 1404 if (i > len)
1405 i = len; 1405 i = len;
1406 errno = 0; 1406 errno = 0;
1407 if (copy_to_user(name, u->nodename, i)) 1407 if (copy_to_user(name, u->nodename, i))
1408 errno = -EFAULT; 1408 errno = -EFAULT;
1409 up_read(&uts_sem); 1409 up_read(&uts_sem);
1410 return errno; 1410 return errno;
1411 } 1411 }
1412 1412
1413 #endif 1413 #endif
1414 1414
1415 /* 1415 /*
1416 * Only setdomainname; getdomainname can be implemented by calling 1416 * Only setdomainname; getdomainname can be implemented by calling
1417 * uname() 1417 * uname()
1418 */ 1418 */
1419 SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) 1419 SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1420 { 1420 {
1421 int errno; 1421 int errno;
1422 char tmp[__NEW_UTS_LEN]; 1422 char tmp[__NEW_UTS_LEN];
1423 1423
1424 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) 1424 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1425 return -EPERM; 1425 return -EPERM;
1426 if (len < 0 || len > __NEW_UTS_LEN) 1426 if (len < 0 || len > __NEW_UTS_LEN)
1427 return -EINVAL; 1427 return -EINVAL;
1428 1428
1429 down_write(&uts_sem); 1429 down_write(&uts_sem);
1430 errno = -EFAULT; 1430 errno = -EFAULT;
1431 if (!copy_from_user(tmp, name, len)) { 1431 if (!copy_from_user(tmp, name, len)) {
1432 struct new_utsname *u = utsname(); 1432 struct new_utsname *u = utsname();
1433 1433
1434 memcpy(u->domainname, tmp, len); 1434 memcpy(u->domainname, tmp, len);
1435 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1435 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1436 errno = 0; 1436 errno = 0;
1437 uts_proc_notify(UTS_PROC_DOMAINNAME); 1437 uts_proc_notify(UTS_PROC_DOMAINNAME);
1438 } 1438 }
1439 up_write(&uts_sem); 1439 up_write(&uts_sem);
1440 return errno; 1440 return errno;
1441 } 1441 }
1442 1442
1443 SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1443 SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1444 { 1444 {
1445 struct rlimit value; 1445 struct rlimit value;
1446 int ret; 1446 int ret;
1447 1447
1448 ret = do_prlimit(current, resource, NULL, &value); 1448 ret = do_prlimit(current, resource, NULL, &value);
1449 if (!ret) 1449 if (!ret)
1450 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1450 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1451 1451
1452 return ret; 1452 return ret;
1453 } 1453 }
1454 1454
1455 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1455 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
1456 1456
1457 /* 1457 /*
1458 * Back compatibility for getrlimit. Needed for some apps. 1458 * Back compatibility for getrlimit. Needed for some apps.
1459 */ 1459 */
1460 1460
1461 SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1461 SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1462 struct rlimit __user *, rlim) 1462 struct rlimit __user *, rlim)
1463 { 1463 {
1464 struct rlimit x; 1464 struct rlimit x;
1465 if (resource >= RLIM_NLIMITS) 1465 if (resource >= RLIM_NLIMITS)
1466 return -EINVAL; 1466 return -EINVAL;
1467 1467
1468 task_lock(current->group_leader); 1468 task_lock(current->group_leader);
1469 x = current->signal->rlim[resource]; 1469 x = current->signal->rlim[resource];
1470 task_unlock(current->group_leader); 1470 task_unlock(current->group_leader);
1471 if (x.rlim_cur > 0x7FFFFFFF) 1471 if (x.rlim_cur > 0x7FFFFFFF)
1472 x.rlim_cur = 0x7FFFFFFF; 1472 x.rlim_cur = 0x7FFFFFFF;
1473 if (x.rlim_max > 0x7FFFFFFF) 1473 if (x.rlim_max > 0x7FFFFFFF)
1474 x.rlim_max = 0x7FFFFFFF; 1474 x.rlim_max = 0x7FFFFFFF;
1475 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; 1475 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
1476 } 1476 }
1477 1477
1478 #endif 1478 #endif
1479 1479
1480 static inline bool rlim64_is_infinity(__u64 rlim64) 1480 static inline bool rlim64_is_infinity(__u64 rlim64)
1481 { 1481 {
1482 #if BITS_PER_LONG < 64 1482 #if BITS_PER_LONG < 64
1483 return rlim64 >= ULONG_MAX; 1483 return rlim64 >= ULONG_MAX;
1484 #else 1484 #else
1485 return rlim64 == RLIM64_INFINITY; 1485 return rlim64 == RLIM64_INFINITY;
1486 #endif 1486 #endif
1487 } 1487 }
1488 1488
1489 static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) 1489 static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1490 { 1490 {
1491 if (rlim->rlim_cur == RLIM_INFINITY) 1491 if (rlim->rlim_cur == RLIM_INFINITY)
1492 rlim64->rlim_cur = RLIM64_INFINITY; 1492 rlim64->rlim_cur = RLIM64_INFINITY;
1493 else 1493 else
1494 rlim64->rlim_cur = rlim->rlim_cur; 1494 rlim64->rlim_cur = rlim->rlim_cur;
1495 if (rlim->rlim_max == RLIM_INFINITY) 1495 if (rlim->rlim_max == RLIM_INFINITY)
1496 rlim64->rlim_max = RLIM64_INFINITY; 1496 rlim64->rlim_max = RLIM64_INFINITY;
1497 else 1497 else
1498 rlim64->rlim_max = rlim->rlim_max; 1498 rlim64->rlim_max = rlim->rlim_max;
1499 } 1499 }
1500 1500
1501 static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) 1501 static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1502 { 1502 {
1503 if (rlim64_is_infinity(rlim64->rlim_cur)) 1503 if (rlim64_is_infinity(rlim64->rlim_cur))
1504 rlim->rlim_cur = RLIM_INFINITY; 1504 rlim->rlim_cur = RLIM_INFINITY;
1505 else 1505 else
1506 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; 1506 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1507 if (rlim64_is_infinity(rlim64->rlim_max)) 1507 if (rlim64_is_infinity(rlim64->rlim_max))
1508 rlim->rlim_max = RLIM_INFINITY; 1508 rlim->rlim_max = RLIM_INFINITY;
1509 else 1509 else
1510 rlim->rlim_max = (unsigned long)rlim64->rlim_max; 1510 rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1511 } 1511 }
1512 1512
1513 /* make sure you are allowed to change @tsk limits before calling this */ 1513 /* make sure you are allowed to change @tsk limits before calling this */
1514 int do_prlimit(struct task_struct *tsk, unsigned int resource, 1514 int do_prlimit(struct task_struct *tsk, unsigned int resource,
1515 struct rlimit *new_rlim, struct rlimit *old_rlim) 1515 struct rlimit *new_rlim, struct rlimit *old_rlim)
1516 { 1516 {
1517 struct rlimit *rlim; 1517 struct rlimit *rlim;
1518 int retval = 0; 1518 int retval = 0;
1519 1519
1520 if (resource >= RLIM_NLIMITS) 1520 if (resource >= RLIM_NLIMITS)
1521 return -EINVAL; 1521 return -EINVAL;
1522 if (new_rlim) { 1522 if (new_rlim) {
1523 if (new_rlim->rlim_cur > new_rlim->rlim_max) 1523 if (new_rlim->rlim_cur > new_rlim->rlim_max)
1524 return -EINVAL; 1524 return -EINVAL;
1525 if (resource == RLIMIT_NOFILE && 1525 if (resource == RLIMIT_NOFILE &&
1526 new_rlim->rlim_max > sysctl_nr_open) 1526 new_rlim->rlim_max > sysctl_nr_open)
1527 return -EPERM; 1527 return -EPERM;
1528 } 1528 }
1529 1529
1530 /* protect tsk->signal and tsk->sighand from disappearing */ 1530 /* protect tsk->signal and tsk->sighand from disappearing */
1531 read_lock(&tasklist_lock); 1531 read_lock(&tasklist_lock);
1532 if (!tsk->sighand) { 1532 if (!tsk->sighand) {
1533 retval = -ESRCH; 1533 retval = -ESRCH;
1534 goto out; 1534 goto out;
1535 } 1535 }
1536 1536
1537 rlim = tsk->signal->rlim + resource; 1537 rlim = tsk->signal->rlim + resource;
1538 task_lock(tsk->group_leader); 1538 task_lock(tsk->group_leader);
1539 if (new_rlim) { 1539 if (new_rlim) {
1540 /* Keep the capable check against init_user_ns until 1540 /* Keep the capable check against init_user_ns until
1541 cgroups can contain all limits */ 1541 cgroups can contain all limits */
1542 if (new_rlim->rlim_max > rlim->rlim_max && 1542 if (new_rlim->rlim_max > rlim->rlim_max &&
1543 !capable(CAP_SYS_RESOURCE)) 1543 !capable(CAP_SYS_RESOURCE))
1544 retval = -EPERM; 1544 retval = -EPERM;
1545 if (!retval) 1545 if (!retval)
1546 retval = security_task_setrlimit(tsk->group_leader, 1546 retval = security_task_setrlimit(tsk->group_leader,
1547 resource, new_rlim); 1547 resource, new_rlim);
1548 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { 1548 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1549 /* 1549 /*
1550 * The caller is asking for an immediate RLIMIT_CPU 1550 * The caller is asking for an immediate RLIMIT_CPU
1551 * expiry. But we use the zero value to mean "it was 1551 * expiry. But we use the zero value to mean "it was
1552 * never set". So let's cheat and make it one second 1552 * never set". So let's cheat and make it one second
1553 * instead 1553 * instead
1554 */ 1554 */
1555 new_rlim->rlim_cur = 1; 1555 new_rlim->rlim_cur = 1;
1556 } 1556 }
1557 } 1557 }
1558 if (!retval) { 1558 if (!retval) {
1559 if (old_rlim) 1559 if (old_rlim)
1560 *old_rlim = *rlim; 1560 *old_rlim = *rlim;
1561 if (new_rlim) 1561 if (new_rlim)
1562 *rlim = *new_rlim; 1562 *rlim = *new_rlim;
1563 } 1563 }
1564 task_unlock(tsk->group_leader); 1564 task_unlock(tsk->group_leader);
1565 1565
1566 /* 1566 /*
1567 * RLIMIT_CPU handling. Note that the kernel fails to return an error 1567 * RLIMIT_CPU handling. Note that the kernel fails to return an error
1568 * code if it rejected the user's attempt to set RLIMIT_CPU. This is a 1568 * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
1569 * very long-standing error, and fixing it now risks breakage of 1569 * very long-standing error, and fixing it now risks breakage of
1570 * applications, so we live with it 1570 * applications, so we live with it
1571 */ 1571 */
1572 if (!retval && new_rlim && resource == RLIMIT_CPU && 1572 if (!retval && new_rlim && resource == RLIMIT_CPU &&
1573 new_rlim->rlim_cur != RLIM_INFINITY) 1573 new_rlim->rlim_cur != RLIM_INFINITY)
1574 update_rlimit_cpu(tsk, new_rlim->rlim_cur); 1574 update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1575 out: 1575 out:
1576 read_unlock(&tasklist_lock); 1576 read_unlock(&tasklist_lock);
1577 return retval; 1577 return retval;
1578 } 1578 }
1579 1579
1580 /* rcu lock must be held */ 1580 /* rcu lock must be held */
1581 static int check_prlimit_permission(struct task_struct *task) 1581 static int check_prlimit_permission(struct task_struct *task)
1582 { 1582 {
1583 const struct cred *cred = current_cred(), *tcred; 1583 const struct cred *cred = current_cred(), *tcred;
1584 1584
1585 if (current == task) 1585 if (current == task)
1586 return 0; 1586 return 0;
1587 1587
1588 tcred = __task_cred(task); 1588 tcred = __task_cred(task);
1589 if (uid_eq(cred->uid, tcred->euid) && 1589 if (uid_eq(cred->uid, tcred->euid) &&
1590 uid_eq(cred->uid, tcred->suid) && 1590 uid_eq(cred->uid, tcred->suid) &&
1591 uid_eq(cred->uid, tcred->uid) && 1591 uid_eq(cred->uid, tcred->uid) &&
1592 gid_eq(cred->gid, tcred->egid) && 1592 gid_eq(cred->gid, tcred->egid) &&
1593 gid_eq(cred->gid, tcred->sgid) && 1593 gid_eq(cred->gid, tcred->sgid) &&
1594 gid_eq(cred->gid, tcred->gid)) 1594 gid_eq(cred->gid, tcred->gid))
1595 return 0; 1595 return 0;
1596 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) 1596 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
1597 return 0; 1597 return 0;
1598 1598
1599 return -EPERM; 1599 return -EPERM;
1600 } 1600 }
1601 1601
1602 SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1602 SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1603 const struct rlimit64 __user *, new_rlim, 1603 const struct rlimit64 __user *, new_rlim,
1604 struct rlimit64 __user *, old_rlim) 1604 struct rlimit64 __user *, old_rlim)
1605 { 1605 {
1606 struct rlimit64 old64, new64; 1606 struct rlimit64 old64, new64;
1607 struct rlimit old, new; 1607 struct rlimit old, new;
1608 struct task_struct *tsk; 1608 struct task_struct *tsk;
1609 int ret; 1609 int ret;
1610 1610
1611 if (new_rlim) { 1611 if (new_rlim) {
1612 if (copy_from_user(&new64, new_rlim, sizeof(new64))) 1612 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1613 return -EFAULT; 1613 return -EFAULT;
1614 rlim64_to_rlim(&new64, &new); 1614 rlim64_to_rlim(&new64, &new);
1615 } 1615 }
1616 1616
1617 rcu_read_lock(); 1617 rcu_read_lock();
1618 tsk = pid ? find_task_by_vpid(pid) : current; 1618 tsk = pid ? find_task_by_vpid(pid) : current;
1619 if (!tsk) { 1619 if (!tsk) {
1620 rcu_read_unlock(); 1620 rcu_read_unlock();
1621 return -ESRCH; 1621 return -ESRCH;
1622 } 1622 }
1623 ret = check_prlimit_permission(tsk); 1623 ret = check_prlimit_permission(tsk);
1624 if (ret) { 1624 if (ret) {
1625 rcu_read_unlock(); 1625 rcu_read_unlock();
1626 return ret; 1626 return ret;
1627 } 1627 }
1628 get_task_struct(tsk); 1628 get_task_struct(tsk);
1629 rcu_read_unlock(); 1629 rcu_read_unlock();
1630 1630
1631 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, 1631 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1632 old_rlim ? &old : NULL); 1632 old_rlim ? &old : NULL);
1633 1633
1634 if (!ret && old_rlim) { 1634 if (!ret && old_rlim) {
1635 rlim_to_rlim64(&old, &old64); 1635 rlim_to_rlim64(&old, &old64);
1636 if (copy_to_user(old_rlim, &old64, sizeof(old64))) 1636 if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1637 ret = -EFAULT; 1637 ret = -EFAULT;
1638 } 1638 }
1639 1639
1640 put_task_struct(tsk); 1640 put_task_struct(tsk);
1641 return ret; 1641 return ret;
1642 } 1642 }
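
A userspace sketch of driving the prlimit64 syscall above through the glibc prlimit() wrapper: one call both installs a new RLIMIT_NOFILE and returns the previous one for another process. The pid 1234 is an arbitrary example; the caller needs matching credentials or CAP_SYS_RESOURCE, as enforced by check_prlimit_permission().

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/resource.h>
#include <sys/types.h>

int main(void)
{
    pid_t pid = 1234;                       /* example target process */
    struct rlimit new_lim = { .rlim_cur = 4096, .rlim_max = 4096 };
    struct rlimit old_lim;

    if (prlimit(pid, RLIMIT_NOFILE, &new_lim, &old_lim) != 0) {
        perror("prlimit");
        return 1;
    }
    printf("old soft=%llu hard=%llu\n",
           (unsigned long long)old_lim.rlim_cur,
           (unsigned long long)old_lim.rlim_max);
    return 0;
}
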
1643 1643
1644 SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1644 SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1645 { 1645 {
1646 struct rlimit new_rlim; 1646 struct rlimit new_rlim;
1647 1647
1648 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1648 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1649 return -EFAULT; 1649 return -EFAULT;
1650 return do_prlimit(current, resource, &new_rlim, NULL); 1650 return do_prlimit(current, resource, &new_rlim, NULL);
1651 } 1651 }
1652 1652
1653 /* 1653 /*
1654 * It would make sense to put struct rusage in the task_struct, 1654 * It would make sense to put struct rusage in the task_struct,
1655 * except that would make the task_struct be *really big*. After 1655 * except that would make the task_struct be *really big*. After
1656 * task_struct gets moved into malloc'ed memory, it would 1656 * task_struct gets moved into malloc'ed memory, it would
1657 * make sense to do this. It will make moving the rest of the information 1657 * make sense to do this. It will make moving the rest of the information
1658 * a lot simpler! (Which we're not doing right now because we're not 1658 * a lot simpler! (Which we're not doing right now because we're not
1659 * measuring them yet). 1659 * measuring them yet).
1660 * 1660 *
1661 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1661 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1662 * races with threads incrementing their own counters. But since word 1662 * races with threads incrementing their own counters. But since word
1663 * reads are atomic, we either get new values or old values and we don't 1663 * reads are atomic, we either get new values or old values and we don't
1664 * care which for the sums. We always take the siglock to protect reading 1664 * care which for the sums. We always take the siglock to protect reading
1665 * the c* fields from p->signal from races with exit.c updating those 1665 * the c* fields from p->signal from races with exit.c updating those
1666 * fields when reaping, so a sample either gets all the additions of a 1666 * fields when reaping, so a sample either gets all the additions of a
1667 * given child after it's reaped, or none so this sample is before reaping. 1667 * given child after it's reaped, or none so this sample is before reaping.
1668 * 1668 *
1669 * Locking: 1669 * Locking:
1670 * We need to take the siglock for CHILDREN, SELF and BOTH 1670 * We need to take the siglock for CHILDREN, SELF and BOTH
1671 * for the cases current multithreaded, non-current single threaded 1671 * for the cases current multithreaded, non-current single threaded
1672 * non-current multithreaded. Thread traversal is now safe with 1672 * non-current multithreaded. Thread traversal is now safe with
1673 * the siglock held. 1673 * the siglock held.
1674 * Strictly speaking, we do not need to take the siglock if we are current and 1674 * Strictly speaking, we do not need to take the siglock if we are current and
1675 * single threaded, as no one else can take our signal_struct away, no one 1675 * single threaded, as no one else can take our signal_struct away, no one
1676 * else can reap the children to update signal->c* counters, and no one else 1676 * else can reap the children to update signal->c* counters, and no one else
1677 * can race with the signal-> fields. If we do not take any lock, the 1677 * can race with the signal-> fields. If we do not take any lock, the
1678 * signal-> fields could be read out of order while another thread was just 1678 * signal-> fields could be read out of order while another thread was just
1679 * exiting. So we should place a read memory barrier when we avoid the lock. 1679 * exiting. So we should place a read memory barrier when we avoid the lock.
1680 * On the writer side, write memory barrier is implied in __exit_signal 1680 * On the writer side, write memory barrier is implied in __exit_signal
1681 * as __exit_signal releases the siglock spinlock after updating the signal-> 1681 * as __exit_signal releases the siglock spinlock after updating the signal->
1682 * fields. But we don't do this yet to keep things simple. 1682 * fields. But we don't do this yet to keep things simple.
1683 * 1683 *
1684 */ 1684 */
1685 1685
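The locking rules above cover the three who values userspace can pass; a sketch exercising all of them (RUSAGE_THREAD needs _GNU_SOURCE with glibc, and the printed fields are just examples):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/resource.h>

static void show(int who, const char *name)
{
	struct rusage ru;

	if (getrusage(who, &ru) != 0) {
		perror(name);
		return;
	}
	printf("%-16s utime=%ld.%06lds minflt=%ld\n", name,
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
	       ru.ru_minflt);
}

int main(void)
{
	show(RUSAGE_SELF, "RUSAGE_SELF");		/* whole thread group */
	show(RUSAGE_CHILDREN, "RUSAGE_CHILDREN");	/* reaped children only */
	show(RUSAGE_THREAD, "RUSAGE_THREAD");		/* calling thread only */
	return 0;
}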
1686 static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) 1686 static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1687 { 1687 {
1688 r->ru_nvcsw += t->nvcsw; 1688 r->ru_nvcsw += t->nvcsw;
1689 r->ru_nivcsw += t->nivcsw; 1689 r->ru_nivcsw += t->nivcsw;
1690 r->ru_minflt += t->min_flt; 1690 r->ru_minflt += t->min_flt;
1691 r->ru_majflt += t->maj_flt; 1691 r->ru_majflt += t->maj_flt;
1692 r->ru_inblock += task_io_get_inblock(t); 1692 r->ru_inblock += task_io_get_inblock(t);
1693 r->ru_oublock += task_io_get_oublock(t); 1693 r->ru_oublock += task_io_get_oublock(t);
1694 } 1694 }
1695 1695
1696 static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1696 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1697 { 1697 {
1698 struct task_struct *t; 1698 struct task_struct *t;
1699 unsigned long flags; 1699 unsigned long flags;
1700 cputime_t tgutime, tgstime, utime, stime; 1700 cputime_t tgutime, tgstime, utime, stime;
1701 unsigned long maxrss = 0; 1701 unsigned long maxrss = 0;
1702 1702
1703 memset((char *) r, 0, sizeof *r); 1703 memset((char *) r, 0, sizeof *r);
1704 utime = stime = 0; 1704 utime = stime = 0;
1705 1705
1706 if (who == RUSAGE_THREAD) { 1706 if (who == RUSAGE_THREAD) {
1707 task_times(current, &utime, &stime); 1707 task_cputime_adjusted(current, &utime, &stime);
1708 accumulate_thread_rusage(p, r); 1708 accumulate_thread_rusage(p, r);
1709 maxrss = p->signal->maxrss; 1709 maxrss = p->signal->maxrss;
1710 goto out; 1710 goto out;
1711 } 1711 }
1712 1712
1713 if (!lock_task_sighand(p, &flags)) 1713 if (!lock_task_sighand(p, &flags))
1714 return; 1714 return;
1715 1715
1716 switch (who) { 1716 switch (who) {
1717 case RUSAGE_BOTH: 1717 case RUSAGE_BOTH:
1718 case RUSAGE_CHILDREN: 1718 case RUSAGE_CHILDREN:
1719 utime = p->signal->cutime; 1719 utime = p->signal->cutime;
1720 stime = p->signal->cstime; 1720 stime = p->signal->cstime;
1721 r->ru_nvcsw = p->signal->cnvcsw; 1721 r->ru_nvcsw = p->signal->cnvcsw;
1722 r->ru_nivcsw = p->signal->cnivcsw; 1722 r->ru_nivcsw = p->signal->cnivcsw;
1723 r->ru_minflt = p->signal->cmin_flt; 1723 r->ru_minflt = p->signal->cmin_flt;
1724 r->ru_majflt = p->signal->cmaj_flt; 1724 r->ru_majflt = p->signal->cmaj_flt;
1725 r->ru_inblock = p->signal->cinblock; 1725 r->ru_inblock = p->signal->cinblock;
1726 r->ru_oublock = p->signal->coublock; 1726 r->ru_oublock = p->signal->coublock;
1727 maxrss = p->signal->cmaxrss; 1727 maxrss = p->signal->cmaxrss;
1728 1728
1729 if (who == RUSAGE_CHILDREN) 1729 if (who == RUSAGE_CHILDREN)
1730 break; 1730 break;
1731 1731
1732 case RUSAGE_SELF: 1732 case RUSAGE_SELF:
1733 thread_group_times(p, &tgutime, &tgstime); 1733 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1734 utime += tgutime; 1734 utime += tgutime;
1735 stime += tgstime; 1735 stime += tgstime;
1736 r->ru_nvcsw += p->signal->nvcsw; 1736 r->ru_nvcsw += p->signal->nvcsw;
1737 r->ru_nivcsw += p->signal->nivcsw; 1737 r->ru_nivcsw += p->signal->nivcsw;
1738 r->ru_minflt += p->signal->min_flt; 1738 r->ru_minflt += p->signal->min_flt;
1739 r->ru_majflt += p->signal->maj_flt; 1739 r->ru_majflt += p->signal->maj_flt;
1740 r->ru_inblock += p->signal->inblock; 1740 r->ru_inblock += p->signal->inblock;
1741 r->ru_oublock += p->signal->oublock; 1741 r->ru_oublock += p->signal->oublock;
1742 if (maxrss < p->signal->maxrss) 1742 if (maxrss < p->signal->maxrss)
1743 maxrss = p->signal->maxrss; 1743 maxrss = p->signal->maxrss;
1744 t = p; 1744 t = p;
1745 do { 1745 do {
1746 accumulate_thread_rusage(t, r); 1746 accumulate_thread_rusage(t, r);
1747 t = next_thread(t); 1747 t = next_thread(t);
1748 } while (t != p); 1748 } while (t != p);
1749 break; 1749 break;
1750 1750
1751 default: 1751 default:
1752 BUG(); 1752 BUG();
1753 } 1753 }
1754 unlock_task_sighand(p, &flags); 1754 unlock_task_sighand(p, &flags);
1755 1755
1756 out: 1756 out:
1757 cputime_to_timeval(utime, &r->ru_utime); 1757 cputime_to_timeval(utime, &r->ru_utime);
1758 cputime_to_timeval(stime, &r->ru_stime); 1758 cputime_to_timeval(stime, &r->ru_stime);
1759 1759
1760 if (who != RUSAGE_CHILDREN) { 1760 if (who != RUSAGE_CHILDREN) {
1761 struct mm_struct *mm = get_task_mm(p); 1761 struct mm_struct *mm = get_task_mm(p);
1762 if (mm) { 1762 if (mm) {
1763 setmax_mm_hiwater_rss(&maxrss, mm); 1763 setmax_mm_hiwater_rss(&maxrss, mm);
1764 mmput(mm); 1764 mmput(mm);
1765 } 1765 }
1766 } 1766 }
1767 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ 1767 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1768 } 1768 }
1769 1769
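As k_getrusage() shows, ru_utime/ru_stime are the task_cputime_adjusted()/thread_group_cputime_adjusted() values converted by cputime_to_timeval(), and ru_maxrss is always reported in kilobytes (maxrss pages times PAGE_SIZE/1024). A quick userspace check, assuming a 4 MiB allocation is enough to move the high-water mark:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>

int main(void)
{
	size_t sz = 4 * 1024 * 1024;
	char *buf = malloc(sz);
	struct rusage ru;

	if (buf)
		memset(buf, 1, sz);	/* touch the pages so they count toward RSS */

	if (getrusage(RUSAGE_SELF, &ru) != 0) {
		perror("getrusage");
		return 1;
	}
	printf("cpu: %ld.%06lds user, peak rss: %ld kB\n",
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
	       ru.ru_maxrss);
	free(buf);
	return 0;
}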
1770 int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1770 int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1771 { 1771 {
1772 struct rusage r; 1772 struct rusage r;
1773 k_getrusage(p, who, &r); 1773 k_getrusage(p, who, &r);
1774 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1774 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1775 } 1775 }
1776 1776
1777 SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) 1777 SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1778 { 1778 {
1779 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1779 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1780 who != RUSAGE_THREAD) 1780 who != RUSAGE_THREAD)
1781 return -EINVAL; 1781 return -EINVAL;
1782 return getrusage(current, who, ru); 1782 return getrusage(current, who, ru);
1783 } 1783 }
1784 1784
1785 SYSCALL_DEFINE1(umask, int, mask) 1785 SYSCALL_DEFINE1(umask, int, mask)
1786 { 1786 {
1787 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1787 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1788 return mask; 1788 return mask;
1789 } 1789 }
1790 1790
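The umask() syscall above just exchanges the new mask for the old one, which is what the usual save-and-restore idiom relies on:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	/* umask() cannot fail; it always returns the previous mask. */
	mode_t old = umask(S_IWGRP | S_IWOTH);		/* i.e. 022 */

	printf("previous umask: %03o\n", (unsigned int)old);
	umask(old);					/* restore it */
	return 0;
}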
1791 #ifdef CONFIG_CHECKPOINT_RESTORE 1791 #ifdef CONFIG_CHECKPOINT_RESTORE
1792 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1792 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1793 { 1793 {
1794 struct fd exe; 1794 struct fd exe;
1795 struct dentry *dentry; 1795 struct dentry *dentry;
1796 int err; 1796 int err;
1797 1797
1798 exe = fdget(fd); 1798 exe = fdget(fd);
1799 if (!exe.file) 1799 if (!exe.file)
1800 return -EBADF; 1800 return -EBADF;
1801 1801
1802 dentry = exe.file->f_path.dentry; 1802 dentry = exe.file->f_path.dentry;
1803 1803
1804 /* 1804 /*
1805 * Because the original mm->exe_file points to executable file, make 1805 * Because the original mm->exe_file points to executable file, make
1806 * sure that this one is executable as well, to avoid breaking an 1806 * sure that this one is executable as well, to avoid breaking an
1807 * overall picture. 1807 * overall picture.
1808 */ 1808 */
1809 err = -EACCES; 1809 err = -EACCES;
1810 if (!S_ISREG(dentry->d_inode->i_mode) || 1810 if (!S_ISREG(dentry->d_inode->i_mode) ||
1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1812 goto exit; 1812 goto exit;
1813 1813
1814 err = inode_permission(dentry->d_inode, MAY_EXEC); 1814 err = inode_permission(dentry->d_inode, MAY_EXEC);
1815 if (err) 1815 if (err)
1816 goto exit; 1816 goto exit;
1817 1817
1818 down_write(&mm->mmap_sem); 1818 down_write(&mm->mmap_sem);
1819 1819
1820 /* 1820 /*
1821 * Forbid mm->exe_file change if old file still mapped. 1821 * Forbid mm->exe_file change if old file still mapped.
1822 */ 1822 */
1823 err = -EBUSY; 1823 err = -EBUSY;
1824 if (mm->exe_file) { 1824 if (mm->exe_file) {
1825 struct vm_area_struct *vma; 1825 struct vm_area_struct *vma;
1826 1826
1827 for (vma = mm->mmap; vma; vma = vma->vm_next) 1827 for (vma = mm->mmap; vma; vma = vma->vm_next)
1828 if (vma->vm_file && 1828 if (vma->vm_file &&
1829 path_equal(&vma->vm_file->f_path, 1829 path_equal(&vma->vm_file->f_path,
1830 &mm->exe_file->f_path)) 1830 &mm->exe_file->f_path))
1831 goto exit_unlock; 1831 goto exit_unlock;
1832 } 1832 }
1833 1833
1834 /* 1834 /*
1835 * The symlink can be changed only once, just to disallow arbitrary 1835 * The symlink can be changed only once, just to disallow arbitrary
1836 * transitions malicious software might bring in. This means one 1836 * transitions malicious software might bring in. This means one
1837 * could make a snapshot over all processes running and monitor 1837 * could make a snapshot over all processes running and monitor
1838 * /proc/pid/exe changes to notice unusual activity if needed. 1838 * /proc/pid/exe changes to notice unusual activity if needed.
1839 */ 1839 */
1840 err = -EPERM; 1840 err = -EPERM;
1841 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1841 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1842 goto exit_unlock; 1842 goto exit_unlock;
1843 1843
1844 err = 0; 1844 err = 0;
1845 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ 1845 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1846 exit_unlock: 1846 exit_unlock:
1847 up_write(&mm->mmap_sem); 1847 up_write(&mm->mmap_sem);
1848 1848
1849 exit: 1849 exit:
1850 fdput(exe); 1850 fdput(exe);
1851 return err; 1851 return err;
1852 } 1852 }
1853 1853
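A hedged sketch of how a checkpoint/restore tool might reach prctl_set_mm_exe_file() from userspace; the binary path is illustrative, CAP_SYS_RESOURCE is required, and the call fails with -EBUSY while the old exe file is still mapped, as enforced above:

#include <fcntl.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

int main(void)
{
	/* Must be a regular file on a mount without MNT_NOEXEC. */
	int fd = open("/usr/bin/true", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* arg3 carries the fd, matching prctl_set_mm_exe_file(mm, (unsigned int)addr). */
	if (prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, (unsigned long)fd, 0, 0) != 0)
		perror("PR_SET_MM_EXE_FILE");	/* typically -EBUSY or -EPERM here */
	close(fd);
	return 0;
}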
1854 static int prctl_set_mm(int opt, unsigned long addr, 1854 static int prctl_set_mm(int opt, unsigned long addr,
1855 unsigned long arg4, unsigned long arg5) 1855 unsigned long arg4, unsigned long arg5)
1856 { 1856 {
1857 unsigned long rlim = rlimit(RLIMIT_DATA); 1857 unsigned long rlim = rlimit(RLIMIT_DATA);
1858 struct mm_struct *mm = current->mm; 1858 struct mm_struct *mm = current->mm;
1859 struct vm_area_struct *vma; 1859 struct vm_area_struct *vma;
1860 int error; 1860 int error;
1861 1861
1862 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) 1862 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
1863 return -EINVAL; 1863 return -EINVAL;
1864 1864
1865 if (!capable(CAP_SYS_RESOURCE)) 1865 if (!capable(CAP_SYS_RESOURCE))
1866 return -EPERM; 1866 return -EPERM;
1867 1867
1868 if (opt == PR_SET_MM_EXE_FILE) 1868 if (opt == PR_SET_MM_EXE_FILE)
1869 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 1869 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1870 1870
1871 if (addr >= TASK_SIZE || addr < mmap_min_addr) 1871 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1872 return -EINVAL; 1872 return -EINVAL;
1873 1873
1874 error = -EINVAL; 1874 error = -EINVAL;
1875 1875
1876 down_read(&mm->mmap_sem); 1876 down_read(&mm->mmap_sem);
1877 vma = find_vma(mm, addr); 1877 vma = find_vma(mm, addr);
1878 1878
1879 switch (opt) { 1879 switch (opt) {
1880 case PR_SET_MM_START_CODE: 1880 case PR_SET_MM_START_CODE:
1881 mm->start_code = addr; 1881 mm->start_code = addr;
1882 break; 1882 break;
1883 case PR_SET_MM_END_CODE: 1883 case PR_SET_MM_END_CODE:
1884 mm->end_code = addr; 1884 mm->end_code = addr;
1885 break; 1885 break;
1886 case PR_SET_MM_START_DATA: 1886 case PR_SET_MM_START_DATA:
1887 mm->start_data = addr; 1887 mm->start_data = addr;
1888 break; 1888 break;
1889 case PR_SET_MM_END_DATA: 1889 case PR_SET_MM_END_DATA:
1890 mm->end_data = addr; 1890 mm->end_data = addr;
1891 break; 1891 break;
1892 1892
1893 case PR_SET_MM_START_BRK: 1893 case PR_SET_MM_START_BRK:
1894 if (addr <= mm->end_data) 1894 if (addr <= mm->end_data)
1895 goto out; 1895 goto out;
1896 1896
1897 if (rlim < RLIM_INFINITY && 1897 if (rlim < RLIM_INFINITY &&
1898 (mm->brk - addr) + 1898 (mm->brk - addr) +
1899 (mm->end_data - mm->start_data) > rlim) 1899 (mm->end_data - mm->start_data) > rlim)
1900 goto out; 1900 goto out;
1901 1901
1902 mm->start_brk = addr; 1902 mm->start_brk = addr;
1903 break; 1903 break;
1904 1904
1905 case PR_SET_MM_BRK: 1905 case PR_SET_MM_BRK:
1906 if (addr <= mm->end_data) 1906 if (addr <= mm->end_data)
1907 goto out; 1907 goto out;
1908 1908
1909 if (rlim < RLIM_INFINITY && 1909 if (rlim < RLIM_INFINITY &&
1910 (addr - mm->start_brk) + 1910 (addr - mm->start_brk) +
1911 (mm->end_data - mm->start_data) > rlim) 1911 (mm->end_data - mm->start_data) > rlim)
1912 goto out; 1912 goto out;
1913 1913
1914 mm->brk = addr; 1914 mm->brk = addr;
1915 break; 1915 break;
1916 1916
1917 /* 1917 /*
1918 * If command line arguments and environment 1918 * If command line arguments and environment
1919 * are placed somewhere else on stack, we can 1919 * are placed somewhere else on stack, we can
1920 * set them up here, ARG_START/END to setup 1920 * set them up here, ARG_START/END to setup
1921 * command line arguments and ENV_START/END 1921 * command line arguments and ENV_START/END
1922 * for environment. 1922 * for environment.
1923 */ 1923 */
1924 case PR_SET_MM_START_STACK: 1924 case PR_SET_MM_START_STACK:
1925 case PR_SET_MM_ARG_START: 1925 case PR_SET_MM_ARG_START:
1926 case PR_SET_MM_ARG_END: 1926 case PR_SET_MM_ARG_END:
1927 case PR_SET_MM_ENV_START: 1927 case PR_SET_MM_ENV_START:
1928 case PR_SET_MM_ENV_END: 1928 case PR_SET_MM_ENV_END:
1929 if (!vma) { 1929 if (!vma) {
1930 error = -EFAULT; 1930 error = -EFAULT;
1931 goto out; 1931 goto out;
1932 } 1932 }
1933 if (opt == PR_SET_MM_START_STACK) 1933 if (opt == PR_SET_MM_START_STACK)
1934 mm->start_stack = addr; 1934 mm->start_stack = addr;
1935 else if (opt == PR_SET_MM_ARG_START) 1935 else if (opt == PR_SET_MM_ARG_START)
1936 mm->arg_start = addr; 1936 mm->arg_start = addr;
1937 else if (opt == PR_SET_MM_ARG_END) 1937 else if (opt == PR_SET_MM_ARG_END)
1938 mm->arg_end = addr; 1938 mm->arg_end = addr;
1939 else if (opt == PR_SET_MM_ENV_START) 1939 else if (opt == PR_SET_MM_ENV_START)
1940 mm->env_start = addr; 1940 mm->env_start = addr;
1941 else if (opt == PR_SET_MM_ENV_END) 1941 else if (opt == PR_SET_MM_ENV_END)
1942 mm->env_end = addr; 1942 mm->env_end = addr;
1943 break; 1943 break;
1944 1944
1945 /* 1945 /*
1946 * This doesn't move the auxiliary vector itself 1946 * This doesn't move the auxiliary vector itself
1947 * since it's pinned to mm_struct, but allows 1947 * since it's pinned to mm_struct, but allows
1948 * filling the vector with new values. It's up 1948 * filling the vector with new values. It's up
1949 * to a caller to provide sane values here 1949 * to a caller to provide sane values here
1950 * otherwise user space tools which use this 1950 * otherwise user space tools which use this
1951 * vector might be unhappy. 1951 * vector might be unhappy.
1952 */ 1952 */
1953 case PR_SET_MM_AUXV: { 1953 case PR_SET_MM_AUXV: {
1954 unsigned long user_auxv[AT_VECTOR_SIZE]; 1954 unsigned long user_auxv[AT_VECTOR_SIZE];
1955 1955
1956 if (arg4 > sizeof(user_auxv)) 1956 if (arg4 > sizeof(user_auxv))
1957 goto out; 1957 goto out;
1958 up_read(&mm->mmap_sem); 1958 up_read(&mm->mmap_sem);
1959 1959
1960 if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) 1960 if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
1961 return -EFAULT; 1961 return -EFAULT;
1962 1962
1963 /* Make sure the last entry is always AT_NULL */ 1963 /* Make sure the last entry is always AT_NULL */
1964 user_auxv[AT_VECTOR_SIZE - 2] = 0; 1964 user_auxv[AT_VECTOR_SIZE - 2] = 0;
1965 user_auxv[AT_VECTOR_SIZE - 1] = 0; 1965 user_auxv[AT_VECTOR_SIZE - 1] = 0;
1966 1966
1967 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); 1967 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1968 1968
1969 task_lock(current); 1969 task_lock(current);
1970 memcpy(mm->saved_auxv, user_auxv, arg4); 1970 memcpy(mm->saved_auxv, user_auxv, arg4);
1971 task_unlock(current); 1971 task_unlock(current);
1972 1972
1973 return 0; 1973 return 0;
1974 } 1974 }
1975 default: 1975 default:
1976 goto out; 1976 goto out;
1977 } 1977 }
1978 1978
1979 error = 0; 1979 error = 0;
1980 out: 1980 out:
1981 up_read(&mm->mmap_sem); 1981 up_read(&mm->mmap_sem);
1982 return error; 1982 return error;
1983 } 1983 }
1984 1984
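The remaining PR_SET_MM options all take an address in arg3, with arg4/arg5 required to be zero except for PR_SET_MM_AUXV; a hedged sketch (CONFIG_CHECKPOINT_RESTORE and CAP_SYS_RESOURCE are assumed, and reusing sbrk(0) is only meant to produce a plausible value):

#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

int main(void)
{
	/* The current program break is at least a sane candidate for PR_SET_MM_BRK. */
	unsigned long brk = (unsigned long)sbrk(0);

	if (prctl(PR_SET_MM, PR_SET_MM_BRK, brk, 0, 0) != 0)
		perror("PR_SET_MM_BRK");	/* -EPERM without CAP_SYS_RESOURCE */
	return 0;
}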
1985 static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 1985 static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1986 { 1986 {
1987 return put_user(me->clear_child_tid, tid_addr); 1987 return put_user(me->clear_child_tid, tid_addr);
1988 } 1988 }
1989 1989
1990 #else /* CONFIG_CHECKPOINT_RESTORE */ 1990 #else /* CONFIG_CHECKPOINT_RESTORE */
1991 static int prctl_set_mm(int opt, unsigned long addr, 1991 static int prctl_set_mm(int opt, unsigned long addr,
1992 unsigned long arg4, unsigned long arg5) 1992 unsigned long arg4, unsigned long arg5)
1993 { 1993 {
1994 return -EINVAL; 1994 return -EINVAL;
1995 } 1995 }
1996 static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 1996 static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1997 { 1997 {
1998 return -EINVAL; 1998 return -EINVAL;
1999 } 1999 }
2000 #endif 2000 #endif
2001 2001
2002 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 2002 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2003 unsigned long, arg4, unsigned long, arg5) 2003 unsigned long, arg4, unsigned long, arg5)
2004 { 2004 {
2005 struct task_struct *me = current; 2005 struct task_struct *me = current;
2006 unsigned char comm[sizeof(me->comm)]; 2006 unsigned char comm[sizeof(me->comm)];
2007 long error; 2007 long error;
2008 2008
2009 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 2009 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
2010 if (error != -ENOSYS) 2010 if (error != -ENOSYS)
2011 return error; 2011 return error;
2012 2012
2013 error = 0; 2013 error = 0;
2014 switch (option) { 2014 switch (option) {
2015 case PR_SET_PDEATHSIG: 2015 case PR_SET_PDEATHSIG:
2016 if (!valid_signal(arg2)) { 2016 if (!valid_signal(arg2)) {
2017 error = -EINVAL; 2017 error = -EINVAL;
2018 break; 2018 break;
2019 } 2019 }
2020 me->pdeath_signal = arg2; 2020 me->pdeath_signal = arg2;
2021 break; 2021 break;
2022 case PR_GET_PDEATHSIG: 2022 case PR_GET_PDEATHSIG:
2023 error = put_user(me->pdeath_signal, (int __user *)arg2); 2023 error = put_user(me->pdeath_signal, (int __user *)arg2);
2024 break; 2024 break;
2025 case PR_GET_DUMPABLE: 2025 case PR_GET_DUMPABLE:
2026 error = get_dumpable(me->mm); 2026 error = get_dumpable(me->mm);
2027 break; 2027 break;
2028 case PR_SET_DUMPABLE: 2028 case PR_SET_DUMPABLE:
2029 if (arg2 < 0 || arg2 > 1) { 2029 if (arg2 < 0 || arg2 > 1) {
2030 error = -EINVAL; 2030 error = -EINVAL;
2031 break; 2031 break;
2032 } 2032 }
2033 set_dumpable(me->mm, arg2); 2033 set_dumpable(me->mm, arg2);
2034 break; 2034 break;
2035 2035
2036 case PR_SET_UNALIGN: 2036 case PR_SET_UNALIGN:
2037 error = SET_UNALIGN_CTL(me, arg2); 2037 error = SET_UNALIGN_CTL(me, arg2);
2038 break; 2038 break;
2039 case PR_GET_UNALIGN: 2039 case PR_GET_UNALIGN:
2040 error = GET_UNALIGN_CTL(me, arg2); 2040 error = GET_UNALIGN_CTL(me, arg2);
2041 break; 2041 break;
2042 case PR_SET_FPEMU: 2042 case PR_SET_FPEMU:
2043 error = SET_FPEMU_CTL(me, arg2); 2043 error = SET_FPEMU_CTL(me, arg2);
2044 break; 2044 break;
2045 case PR_GET_FPEMU: 2045 case PR_GET_FPEMU:
2046 error = GET_FPEMU_CTL(me, arg2); 2046 error = GET_FPEMU_CTL(me, arg2);
2047 break; 2047 break;
2048 case PR_SET_FPEXC: 2048 case PR_SET_FPEXC:
2049 error = SET_FPEXC_CTL(me, arg2); 2049 error = SET_FPEXC_CTL(me, arg2);
2050 break; 2050 break;
2051 case PR_GET_FPEXC: 2051 case PR_GET_FPEXC:
2052 error = GET_FPEXC_CTL(me, arg2); 2052 error = GET_FPEXC_CTL(me, arg2);
2053 break; 2053 break;
2054 case PR_GET_TIMING: 2054 case PR_GET_TIMING:
2055 error = PR_TIMING_STATISTICAL; 2055 error = PR_TIMING_STATISTICAL;
2056 break; 2056 break;
2057 case PR_SET_TIMING: 2057 case PR_SET_TIMING:
2058 if (arg2 != PR_TIMING_STATISTICAL) 2058 if (arg2 != PR_TIMING_STATISTICAL)
2059 error = -EINVAL; 2059 error = -EINVAL;
2060 break; 2060 break;
2061 case PR_SET_NAME: 2061 case PR_SET_NAME:
2062 comm[sizeof(me->comm)-1] = 0; 2062 comm[sizeof(me->comm)-1] = 0;
2063 if (strncpy_from_user(comm, (char __user *)arg2, 2063 if (strncpy_from_user(comm, (char __user *)arg2,
2064 sizeof(me->comm) - 1) < 0) 2064 sizeof(me->comm) - 1) < 0)
2065 return -EFAULT; 2065 return -EFAULT;
2066 set_task_comm(me, comm); 2066 set_task_comm(me, comm);
2067 proc_comm_connector(me); 2067 proc_comm_connector(me);
2068 break; 2068 break;
2069 case PR_GET_NAME: 2069 case PR_GET_NAME:
2070 get_task_comm(comm, me); 2070 get_task_comm(comm, me);
2071 if (copy_to_user((char __user *)arg2, comm, 2071 if (copy_to_user((char __user *)arg2, comm,
2072 sizeof(comm))) 2072 sizeof(comm)))
2073 return -EFAULT; 2073 return -EFAULT;
2074 break; 2074 break;
2075 case PR_GET_ENDIAN: 2075 case PR_GET_ENDIAN:
2076 error = GET_ENDIAN(me, arg2); 2076 error = GET_ENDIAN(me, arg2);
2077 break; 2077 break;
2078 case PR_SET_ENDIAN: 2078 case PR_SET_ENDIAN:
2079 error = SET_ENDIAN(me, arg2); 2079 error = SET_ENDIAN(me, arg2);
2080 break; 2080 break;
2081 case PR_GET_SECCOMP: 2081 case PR_GET_SECCOMP:
2082 error = prctl_get_seccomp(); 2082 error = prctl_get_seccomp();
2083 break; 2083 break;
2084 case PR_SET_SECCOMP: 2084 case PR_SET_SECCOMP:
2085 error = prctl_set_seccomp(arg2, (char __user *)arg3); 2085 error = prctl_set_seccomp(arg2, (char __user *)arg3);
2086 break; 2086 break;
2087 case PR_GET_TSC: 2087 case PR_GET_TSC:
2088 error = GET_TSC_CTL(arg2); 2088 error = GET_TSC_CTL(arg2);
2089 break; 2089 break;
2090 case PR_SET_TSC: 2090 case PR_SET_TSC:
2091 error = SET_TSC_CTL(arg2); 2091 error = SET_TSC_CTL(arg2);
2092 break; 2092 break;
2093 case PR_TASK_PERF_EVENTS_DISABLE: 2093 case PR_TASK_PERF_EVENTS_DISABLE:
2094 error = perf_event_task_disable(); 2094 error = perf_event_task_disable();
2095 break; 2095 break;
2096 case PR_TASK_PERF_EVENTS_ENABLE: 2096 case PR_TASK_PERF_EVENTS_ENABLE:
2097 error = perf_event_task_enable(); 2097 error = perf_event_task_enable();
2098 break; 2098 break;
2099 case PR_GET_TIMERSLACK: 2099 case PR_GET_TIMERSLACK:
2100 error = current->timer_slack_ns; 2100 error = current->timer_slack_ns;
2101 break; 2101 break;
2102 case PR_SET_TIMERSLACK: 2102 case PR_SET_TIMERSLACK:
2103 if (arg2 <= 0) 2103 if (arg2 <= 0)
2104 current->timer_slack_ns = 2104 current->timer_slack_ns =
2105 current->default_timer_slack_ns; 2105 current->default_timer_slack_ns;
2106 else 2106 else
2107 current->timer_slack_ns = arg2; 2107 current->timer_slack_ns = arg2;
2108 break; 2108 break;
2109 case PR_MCE_KILL: 2109 case PR_MCE_KILL:
2110 if (arg4 | arg5) 2110 if (arg4 | arg5)
2111 return -EINVAL; 2111 return -EINVAL;
2112 switch (arg2) { 2112 switch (arg2) {
2113 case PR_MCE_KILL_CLEAR: 2113 case PR_MCE_KILL_CLEAR:
2114 if (arg3 != 0) 2114 if (arg3 != 0)
2115 return -EINVAL; 2115 return -EINVAL;
2116 current->flags &= ~PF_MCE_PROCESS; 2116 current->flags &= ~PF_MCE_PROCESS;
2117 break; 2117 break;
2118 case PR_MCE_KILL_SET: 2118 case PR_MCE_KILL_SET:
2119 current->flags |= PF_MCE_PROCESS; 2119 current->flags |= PF_MCE_PROCESS;
2120 if (arg3 == PR_MCE_KILL_EARLY) 2120 if (arg3 == PR_MCE_KILL_EARLY)
2121 current->flags |= PF_MCE_EARLY; 2121 current->flags |= PF_MCE_EARLY;
2122 else if (arg3 == PR_MCE_KILL_LATE) 2122 else if (arg3 == PR_MCE_KILL_LATE)
2123 current->flags &= ~PF_MCE_EARLY; 2123 current->flags &= ~PF_MCE_EARLY;
2124 else if (arg3 == PR_MCE_KILL_DEFAULT) 2124 else if (arg3 == PR_MCE_KILL_DEFAULT)
2125 current->flags &= 2125 current->flags &=
2126 ~(PF_MCE_EARLY|PF_MCE_PROCESS); 2126 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2127 else 2127 else
2128 return -EINVAL; 2128 return -EINVAL;
2129 break; 2129 break;
2130 default: 2130 default:
2131 return -EINVAL; 2131 return -EINVAL;
2132 } 2132 }
2133 break; 2133 break;
2134 case PR_MCE_KILL_GET: 2134 case PR_MCE_KILL_GET:
2135 if (arg2 | arg3 | arg4 | arg5) 2135 if (arg2 | arg3 | arg4 | arg5)
2136 return -EINVAL; 2136 return -EINVAL;
2137 if (current->flags & PF_MCE_PROCESS) 2137 if (current->flags & PF_MCE_PROCESS)
2138 error = (current->flags & PF_MCE_EARLY) ? 2138 error = (current->flags & PF_MCE_EARLY) ?
2139 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; 2139 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
2140 else 2140 else
2141 error = PR_MCE_KILL_DEFAULT; 2141 error = PR_MCE_KILL_DEFAULT;
2142 break; 2142 break;
2143 case PR_SET_MM: 2143 case PR_SET_MM:
2144 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2144 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2145 break; 2145 break;
2146 case PR_GET_TID_ADDRESS: 2146 case PR_GET_TID_ADDRESS:
2147 error = prctl_get_tid_address(me, (int __user **)arg2); 2147 error = prctl_get_tid_address(me, (int __user **)arg2);
2148 break; 2148 break;
2149 case PR_SET_CHILD_SUBREAPER: 2149 case PR_SET_CHILD_SUBREAPER:
2150 me->signal->is_child_subreaper = !!arg2; 2150 me->signal->is_child_subreaper = !!arg2;
2151 break; 2151 break;
2152 case PR_GET_CHILD_SUBREAPER: 2152 case PR_GET_CHILD_SUBREAPER:
2153 error = put_user(me->signal->is_child_subreaper, 2153 error = put_user(me->signal->is_child_subreaper,
2154 (int __user *) arg2); 2154 (int __user *) arg2);
2155 break; 2155 break;
2156 case PR_SET_NO_NEW_PRIVS: 2156 case PR_SET_NO_NEW_PRIVS:
2157 if (arg2 != 1 || arg3 || arg4 || arg5) 2157 if (arg2 != 1 || arg3 || arg4 || arg5)
2158 return -EINVAL; 2158 return -EINVAL;
2159 2159
2160 current->no_new_privs = 1; 2160 current->no_new_privs = 1;
2161 break; 2161 break;
2162 case PR_GET_NO_NEW_PRIVS: 2162 case PR_GET_NO_NEW_PRIVS:
2163 if (arg2 || arg3 || arg4 || arg5) 2163 if (arg2 || arg3 || arg4 || arg5)
2164 return -EINVAL; 2164 return -EINVAL;
2165 return current->no_new_privs ? 1 : 0; 2165 return current->no_new_privs ? 1 : 0;
2166 default: 2166 default:
2167 error = -EINVAL; 2167 error = -EINVAL;
2168 break; 2168 break;
2169 } 2169 }
2170 return error; 2170 return error;
2171 } 2171 }
2172 2172
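Two of the simpler prctl options dispatched above, exercised from userspace; the thread name is arbitrary and gets truncated to sizeof(me->comm) - 1 characters exactly as the PR_SET_NAME branch enforces:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	char name[16];	/* same size as task_struct::comm */

	if (prctl(PR_SET_NAME, (unsigned long)"worker-0", 0, 0, 0) != 0)
		perror("PR_SET_NAME");
	if (prctl(PR_GET_NAME, (unsigned long)name, 0, 0, 0) != 0)
		perror("PR_GET_NAME");
	else
		printf("comm: %s\n", name);
	return 0;
}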
2173 SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, 2173 SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2174 struct getcpu_cache __user *, unused) 2174 struct getcpu_cache __user *, unused)
2175 { 2175 {
2176 int err = 0; 2176 int err = 0;
2177 int cpu = raw_smp_processor_id(); 2177 int cpu = raw_smp_processor_id();
2178 if (cpup) 2178 if (cpup)
2179 err |= put_user(cpu, cpup); 2179 err |= put_user(cpu, cpup);
2180 if (nodep) 2180 if (nodep)
2181 err |= put_user(cpu_to_node(cpu), nodep); 2181 err |= put_user(cpu_to_node(cpu), nodep);
2182 return err ? -EFAULT : 0; 2182 return err ? -EFAULT : 0;
2183 } 2183 }
2184 2184
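getcpu() is normally reached through the vDSO/glibc wrapper (getcpu() since glibc 2.29, sched_getcpu() before that); the answer is only a hint because the task may migrate right after the call returns:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	unsigned int cpu, node;

	/* Either pointer may be NULL; the syscall above fills whichever is set. */
	if (getcpu(&cpu, &node) != 0) {
		perror("getcpu");
		return 1;
	}
	printf("running on cpu %u, node %u\n", cpu, node);
	return 0;
}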
2185 char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2185 char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2186 2186
2187 static void argv_cleanup(struct subprocess_info *info) 2187 static void argv_cleanup(struct subprocess_info *info)
2188 { 2188 {
2189 argv_free(info->argv); 2189 argv_free(info->argv);
2190 } 2190 }
2191 2191
2192 static int __orderly_poweroff(void) 2192 static int __orderly_poweroff(void)
2193 { 2193 {
2194 int argc; 2194 int argc;
2195 char **argv; 2195 char **argv;
2196 static char *envp[] = { 2196 static char *envp[] = {
2197 "HOME=/", 2197 "HOME=/",
2198 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 2198 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2199 NULL 2199 NULL
2200 }; 2200 };
2201 int ret; 2201 int ret;
2202 2202
2203 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2203 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2204 if (argv == NULL) { 2204 if (argv == NULL) {
2205 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2205 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2206 __func__, poweroff_cmd); 2206 __func__, poweroff_cmd);
2207 return -ENOMEM; 2207 return -ENOMEM;
2208 } 2208 }
2209 2209
2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
2211 NULL, argv_cleanup, NULL); 2211 NULL, argv_cleanup, NULL);
2212 if (ret == -ENOMEM) 2212 if (ret == -ENOMEM)
2213 argv_free(argv); 2213 argv_free(argv);
2214 2214
2215 return ret; 2215 return ret;
2216 } 2216 }
2217 2217
2218 /** 2218 /**
2219 * orderly_poweroff - Trigger an orderly system poweroff 2219 * orderly_poweroff - Trigger an orderly system poweroff
2220 * @force: force poweroff if command execution fails 2220 * @force: force poweroff if command execution fails
2221 * 2221 *
2222 * This may be called from any context to trigger a system shutdown. 2222 * This may be called from any context to trigger a system shutdown.
2223 * If the orderly shutdown fails, it will force an immediate shutdown. 2223 * If the orderly shutdown fails, it will force an immediate shutdown.
2224 */ 2224 */
2225 int orderly_poweroff(bool force) 2225 int orderly_poweroff(bool force)
2226 { 2226 {
2227 int ret = __orderly_poweroff(); 2227 int ret = __orderly_poweroff();
2228 2228
2229 if (ret && force) { 2229 if (ret && force) {
2230 printk(KERN_WARNING "Failed to start orderly shutdown: " 2230 printk(KERN_WARNING "Failed to start orderly shutdown: "
2231 "forcing the issue\n"); 2231 "forcing the issue\n");
2232 2232
2233 /* 2233 /*
2234 * I guess this should try to kick off some daemon to sync and 2234 * I guess this should try to kick off some daemon to sync and
2235 * poweroff asap. Or not even bother syncing if we're doing an 2235 * poweroff asap. Or not even bother syncing if we're doing an
2236 * emergency shutdown? 2236 * emergency shutdown?
2237 */ 2237 */
2238 emergency_sync(); 2238 emergency_sync();
2239 kernel_power_off(); 2239 kernel_power_off();
2240 } 2240 }
2241 2241
2242 return ret; 2242 return ret;
2243 } 2243 }
2244 EXPORT_SYMBOL_GPL(orderly_poweroff); 2244 EXPORT_SYMBOL_GPL(orderly_poweroff);
2245 2245
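Since orderly_poweroff() is exported GPL, a driver can trigger the orderly path directly; a hedged in-kernel sketch where the overheat handler and its caller are hypothetical:

#include <linux/module.h>
#include <linux/reboot.h>

/* Hypothetical thermal handler: invoked when a sensor reports a critical trip point. */
void demo_critical_overheat(void)
{
	/*
	 * Run poweroff_cmd through the usermode helper; with force == true a
	 * failure falls back to emergency_sync() + kernel_power_off().
	 */
	orderly_poweroff(true);
}

MODULE_LICENSE("GPL");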