Blame view

kernel/fork.c 74.1 KB
457c89965   Thomas Gleixner   treewide: Add SPD...
1
  // SPDX-License-Identifier: GPL-2.0-only
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2
3
4
5
6
7
8
9
10
11
12
13
  /*
   *  linux/kernel/fork.c
   *
   *  Copyright (C) 1991, 1992  Linus Torvalds
   */
  
  /*
   *  'fork.c' contains the help-routines for the 'fork' system call
   * (see also entry.S and others).
   * Fork is rather simple, once you get the hang of it, but the memory
   * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
   */
b3e583825   Christian Brauner   clone: add CLONE_...
14
  #include <linux/anon_inodes.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
15
  #include <linux/slab.h>
4eb5aaa3a   Ingo Molnar   sched/headers: Pr...
16
  #include <linux/sched/autogroup.h>
6e84f3152   Ingo Molnar   sched/headers: Pr...
17
  #include <linux/sched/mm.h>
f7ccbae45   Ingo Molnar   sched/headers: Pr...
18
  #include <linux/sched/coredump.h>
8703e8a46   Ingo Molnar   sched/headers: Pr...
19
  #include <linux/sched/user.h>
6a3827d75   Ingo Molnar   sched/headers: Pr...
20
  #include <linux/sched/numa_balancing.h>
03441a348   Ingo Molnar   sched/headers: Pr...
21
  #include <linux/sched/stat.h>
299300258   Ingo Molnar   sched/headers: Pr...
22
  #include <linux/sched/task.h>
68db0cf10   Ingo Molnar   sched/headers: Pr...
23
  #include <linux/sched/task_stack.h>
32ef5517c   Ingo Molnar   sched/headers: Pr...
24
  #include <linux/sched/cputime.h>
b3e583825   Christian Brauner   clone: add CLONE_...
25
  #include <linux/seq_file.h>
037741a6d   Ingo Molnar   sched/headers: Pr...
26
  #include <linux/rtmutex.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
27
28
  #include <linux/init.h>
  #include <linux/unistd.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
29
30
31
  #include <linux/module.h>
  #include <linux/vmalloc.h>
  #include <linux/completion.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
32
33
34
35
  #include <linux/personality.h>
  #include <linux/mempolicy.h>
  #include <linux/sem.h>
  #include <linux/file.h>
9f3acc314   Al Viro   [PATCH] split lin...
36
  #include <linux/fdtable.h>
da9cbc873   Jens Axboe   block: blkdev.h c...
37
  #include <linux/iocontext.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
38
39
40
  #include <linux/key.h>
  #include <linux/binfmts.h>
  #include <linux/mman.h>
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
41
  #include <linux/mmu_notifier.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
  #include <linux/fs.h>
615d6e875   Davidlohr Bueso   mm: per-thread vm...
43
44
  #include <linux/mm.h>
  #include <linux/vmacache.h>
ab516013a   Serge E. Hallyn   [PATCH] namespace...
45
  #include <linux/nsproxy.h>
c59ede7b7   Randy.Dunlap   [PATCH] move capa...
46
  #include <linux/capability.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
47
  #include <linux/cpu.h>
b4f48b636   Paul Menage   Task Control Grou...
48
  #include <linux/cgroup.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
49
  #include <linux/security.h>
a1e78772d   Mel Gorman   hugetlb: reserve ...
50
  #include <linux/hugetlb.h>
e2cfabdfd   Will Drewry   seccomp: add syst...
51
  #include <linux/seccomp.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
52
53
54
55
  #include <linux/swap.h>
  #include <linux/syscalls.h>
  #include <linux/jiffies.h>
  #include <linux/futex.h>
8141c7f3e   Linus Torvalds   Move "exit_robust...
56
  #include <linux/compat.h>
207205a2b   Eric Dumazet   kthread: NUMA awa...
57
  #include <linux/kthread.h>
7c3ab7381   Andrew Morton   [PATCH] io-accoun...
58
  #include <linux/task_io_accounting_ops.h>
ab2af1f50   Dipankar Sarma   [PATCH] files: fi...
59
  #include <linux/rcupdate.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
60
61
62
  #include <linux/ptrace.h>
  #include <linux/mount.h>
  #include <linux/audit.h>
78fb74669   Pavel Emelianov   Memory controller...
63
  #include <linux/memcontrol.h>
f201ae235   Frederic Weisbecker   tracing/function-...
64
  #include <linux/ftrace.h>
5e2bf0142   Mike Galbraith   namespaces, pid_n...
65
  #include <linux/proc_fs.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
66
67
  #include <linux/profile.h>
  #include <linux/rmap.h>
f8af4da3b   Hugh Dickins   ksm: the mm inter...
68
  #include <linux/ksm.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
69
  #include <linux/acct.h>
893e26e61   Pavel Emelyanov   userfaultfd: non-...
70
  #include <linux/userfaultfd_k.h>
8f0ab5147   Jay Lan   [PATCH] csa: conv...
71
  #include <linux/tsacct_kern.h>
9f46080c4   Matt Helsley   [PATCH] Process E...
72
  #include <linux/cn_proc.h>
ba96a0c88   Rafael J. Wysocki   freezer: fix vfor...
73
  #include <linux/freezer.h>
ca74e92b4   Shailabh Nagar   [PATCH] per-task-...
74
  #include <linux/delayacct.h>
ad4ecbcba   Shailabh Nagar   [PATCH] delay acc...
75
  #include <linux/taskstats_kern.h>
0a4254058   Arjan van de Ven   [PATCH] Add the c...
76
  #include <linux/random.h>
522ed7767   Miloslav Trmac   Audit: add TTY in...
77
  #include <linux/tty.h>
fd0928df9   Jens Axboe   ioprio: move io p...
78
  #include <linux/blkdev.h>
5ad4e53bd   Al Viro   Get rid of indire...
79
  #include <linux/fs_struct.h>
7c9f8861e   Eric Sandeen   stackprotector: u...
80
  #include <linux/magic.h>
cdd6c482c   Ingo Molnar   perf: Do the big ...
81
  #include <linux/perf_event.h>
42c4ab41a   Stanislaw Gruszka   itimers: Merge IT...
82
  #include <linux/posix-timers.h>
8e7cac798   Avi Kivity   core: Fix user re...
83
  #include <linux/user-return-notifier.h>
3d5992d2a   Ying Han   oom: add per-mm o...
84
  #include <linux/oom.h>
ba76149f4   Andrea Arcangeli   thp: khugepaged
85
  #include <linux/khugepaged.h>
d80e731ec   Oleg Nesterov   epoll: introduce ...
86
  #include <linux/signalfd.h>
0326f5a94   Srikar Dronamraju   uprobes/core: Han...
87
  #include <linux/uprobes.h>
a27bb332c   Kent Overstreet   aio: don't includ...
88
  #include <linux/aio.h>
52f5684c8   Gideon Israel Dsouza   kernel: use macro...
89
  #include <linux/compiler.h>
16db3d3f1   Heinrich Schuchardt   kernel/sysctl.c: ...
90
  #include <linux/sysctl.h>
5c9a8750a   Dmitry Vyukov   kernel: add kcov ...
91
  #include <linux/kcov.h>
d83a7cb37   Josh Poimboeuf   livepatch: change...
92
  #include <linux/livepatch.h>
48ac3c18c   Mark Rutland   fork: allow arch-...
93
  #include <linux/thread_info.h>
afaef01c0   Alexander Popov   x86/entry: Add ST...
94
  #include <linux/stackleak.h>
eafb149ed   Daniel Axtens   fork: support VMA...
95
  #include <linux/kasan.h>
d08b9f0ca   Sami Tolvanen   scs: Add support ...
96
  #include <linux/scs.h>
0f2122045   Jens Axboe   io_uring: don't r...
97
  #include <linux/io_uring.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
98

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
99
  #include <asm/pgalloc.h>
7c0f6ba68   Linus Torvalds   Replace <asm/uacc...
100
  #include <linux/uaccess.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
101
102
103
  #include <asm/mmu_context.h>
  #include <asm/cacheflush.h>
  #include <asm/tlbflush.h>
ad8d75fff   Steven Rostedt   tracing/events: m...
104
  #include <trace/events/sched.h>
43d2b1132   KAMEZAWA Hiroyuki   tracepoint: add t...
105
106
  #define CREATE_TRACE_POINTS
  #include <trace/events/task.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
107
  /*
ac1b398de   Heinrich Schuchardt   kernel/fork.c: av...
108
109
110
111
112
113
114
115
116
117
   * Minimum number of threads to boot the kernel
   */
  #define MIN_THREADS 20
  
  /*
   * Maximum number of threads
   */
  #define MAX_THREADS FUTEX_TID_MASK
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
118
119
120
   * Counters protected by write_lock_irq(&tasklist_lock)
   */
  unsigned long total_forks;	/* Handle normal Linux uptimes. */
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
121
  int nr_threads;			/* The idle threads do not count.. */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
122

8856ae4df   Kefeng Wang   kernel/fork.c: ma...
123
  static int max_threads;		/* tunable limit on nr_threads */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
124

8495f7e67   Sai Praneeth Prakhya   fork: improve err...
125
126
127
128
129
130
131
132
  /* Designated-initializer helper: slot [x] gets the stringified name of x. */
  #define NAMED_ARRAY_INDEX(x)	[x] = __stringify(x)
  
  /*
   * Printable names of the per-mm counters, indexed by counter id.
   * check_mm() indexes this table when it reports a non-zero counter and
   * checks at build time that it has NR_MM_COUNTERS entries.
   */
  static const char * const resident_page_types[] = {
  	NAMED_ARRAY_INDEX(MM_FILEPAGES),
  	NAMED_ARRAY_INDEX(MM_ANONPAGES),
  	NAMED_ARRAY_INDEX(MM_SWAPENTS),
  	NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
133
  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
c59923a15   Christoph Hellwig   [PATCH] remove th...
134
  __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
db1466b3e   Paul E. McKenney   rcu: Use wrapper ...
135
136
137
138
139
140
141
142
  
  #ifdef CONFIG_PROVE_RCU
  /*
   * Tell the caller what lockdep_is_held() reports for tasklist_lock.
   * Only compiled in when CONFIG_PROVE_RCU is set.
   */
  int lockdep_tasklist_lock_is_held(void)
  {
  	int held = lockdep_is_held(&tasklist_lock);

  	return held;
  }
  EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
  #endif /* #ifdef CONFIG_PROVE_RCU */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
143
144
145
146
147
  
  /*
   * Return the sum of the per-CPU process_counts counters, taken over
   * every possible CPU.
   */
  int nr_processes(void)
  {
  	int cpu;
  	int total = 0;
1d5107509   Ian Campbell   Correct nr_proces...
148
  	for_each_possible_cpu(cpu)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
149
150
151
152
  		total += per_cpu(process_counts, cpu);
  
  	return total;
  }
f19b9f74b   Akinobu Mita   fork: fix error h...
153
154
155
  /*
   * Default, empty implementation of the hook that free_task() calls
   * before the task_struct is freed.
   * NOTE(review): marked __weak, presumably so an architecture can provide
   * its own definition -- confirm against the arch code.
   */
  void __weak arch_release_task_struct(struct task_struct *tsk)
  {
  }
f5e102873   Thomas Gleixner   task_allocator: U...
156
  #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
157
  static struct kmem_cache *task_struct_cachep;
41101809a   Thomas Gleixner   fork: Provide wea...
158
159
160
161
162
  
  /*
   * Allocate a task_struct from task_struct_cachep, passing @node through
   * to kmem_cache_alloc_node(). The allocator's result is returned as is.
   */
  static inline struct task_struct *alloc_task_struct_node(int node)
  {
  	struct task_struct *tsk;

  	tsk = kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
  	return tsk;
  }
41101809a   Thomas Gleixner   fork: Provide wea...
163
164
  /* Hand @tsk back to task_struct_cachep. */
  static inline void free_task_struct(struct task_struct *tsk)
  {
41101809a   Thomas Gleixner   fork: Provide wea...
165
166
  	kmem_cache_free(task_struct_cachep, tsk);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
167
  #endif
b235beea9   Linus Torvalds   Clarify naming of...
168
  #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
41101809a   Thomas Gleixner   fork: Provide wea...
169

0d15d74a1   Thomas Gleixner   fork: Provide kme...
170
171
172
173
  /*
   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
   * kmemcache based allocator.
   */
ba14a194a   Andy Lutomirski   fork: Add generic...
174
  # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
ac496bf48   Andy Lutomirski   fork: Optimize ta...
175
176
177
178
179
180
181
182
  
  #ifdef CONFIG_VMAP_STACK
  /*
   * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
   * flush.  Try to minimize the number of calls by caching stacks.
   */
  #define NR_CACHED_STACKS 2
  static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
19659c59a   Hoeun Ryu   fork: free vmappe...
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
  
  static int free_vm_stack_cache(unsigned int cpu)
  {
  	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
  	int i;
  
  	for (i = 0; i < NR_CACHED_STACKS; i++) {
  		struct vm_struct *vm_stack = cached_vm_stacks[i];
  
  		if (!vm_stack)
  			continue;
  
  		vfree(vm_stack->addr);
  		cached_vm_stacks[i] = NULL;
  	}
  
  	return 0;
  }
ac496bf48   Andy Lutomirski   fork: Optimize ta...
201
  #endif
ba14a194a   Andy Lutomirski   fork: Add generic...
202
  static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
b69c49b78   FUJITA Tomonori   clean up duplicat...
203
  {
ba14a194a   Andy Lutomirski   fork: Add generic...
204
  #ifdef CONFIG_VMAP_STACK
ac496bf48   Andy Lutomirski   fork: Optimize ta...
205
206
  	void *stack;
  	int i;
ac496bf48   Andy Lutomirski   fork: Optimize ta...
207
  	for (i = 0; i < NR_CACHED_STACKS; i++) {
112166f88   Christoph Lameter   kernel/fork.c: vi...
208
209
210
  		struct vm_struct *s;
  
  		s = this_cpu_xchg(cached_stacks[i], NULL);
ac496bf48   Andy Lutomirski   fork: Optimize ta...
211
212
213
  
  		if (!s)
  			continue;
ac496bf48   Andy Lutomirski   fork: Optimize ta...
214

eafb149ed   Daniel Axtens   fork: support VMA...
215
216
  		/* Clear the KASAN shadow of the stack. */
  		kasan_unpoison_shadow(s->addr, THREAD_SIZE);
ca1825518   Konstantin Khlebnikov   kmemleak: clear s...
217
218
  		/* Clear stale pointers from reused stack. */
  		memset(s->addr, 0, THREAD_SIZE);
e01e80634   Kees Cook   fork: uncondition...
219

ac496bf48   Andy Lutomirski   fork: Optimize ta...
220
  		tsk->stack_vm_area = s;
ba4a45746   Shakeel Butt   fork, memcg: fix ...
221
  		tsk->stack = s->addr;
ac496bf48   Andy Lutomirski   fork: Optimize ta...
222
223
  		return s->addr;
  	}
ac496bf48   Andy Lutomirski   fork: Optimize ta...
224

9b6f7e163   Roman Gushchin   mm: rework memcg ...
225
226
227
228
229
  	/*
  	 * Allocated stacks are cached and later reused by new threads,
  	 * so memcg accounting is performed manually on assigning/releasing
  	 * stacks to tasks. Drop __GFP_ACCOUNT.
  	 */
48ac3c18c   Mark Rutland   fork: allow arch-...
230
  	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
ac496bf48   Andy Lutomirski   fork: Optimize ta...
231
  				     VMALLOC_START, VMALLOC_END,
9b6f7e163   Roman Gushchin   mm: rework memcg ...
232
  				     THREADINFO_GFP & ~__GFP_ACCOUNT,
ac496bf48   Andy Lutomirski   fork: Optimize ta...
233
234
  				     PAGE_KERNEL,
  				     0, node, __builtin_return_address(0));
ba14a194a   Andy Lutomirski   fork: Add generic...
235
236
237
238
239
240
  
  	/*
  	 * We can't call find_vm_area() in interrupt context, and
  	 * free_thread_stack() can be called in interrupt context,
  	 * so cache the vm_struct.
  	 */
5eed6f1df   Rik van Riel   fork,memcg: fix c...
241
  	if (stack) {
ba14a194a   Andy Lutomirski   fork: Add generic...
242
  		tsk->stack_vm_area = find_vm_area(stack);
5eed6f1df   Rik van Riel   fork,memcg: fix c...
243
244
  		tsk->stack = stack;
  	}
ba14a194a   Andy Lutomirski   fork: Add generic...
245
246
  	return stack;
  #else
4949148ad   Vladimir Davydov   mm: charge/unchar...
247
248
  	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
  					     THREAD_SIZE_ORDER);
b6a84016b   Eric Dumazet   mm: NUMA aware al...
249

1bf4580e0   Andrea Arcangeli   fork,memcg: alloc...
250
  	if (likely(page)) {
8dcc1d346   Andrey Konovalov   kasan: don't tag ...
251
  		tsk->stack = kasan_reset_tag(page_address(page));
1bf4580e0   Andrea Arcangeli   fork,memcg: alloc...
252
253
254
  		return tsk->stack;
  	}
  	return NULL;
ba14a194a   Andy Lutomirski   fork: Add generic...
255
  #endif
b69c49b78   FUJITA Tomonori   clean up duplicat...
256
  }
ba14a194a   Andy Lutomirski   fork: Add generic...
257
  static inline void free_thread_stack(struct task_struct *tsk)
b69c49b78   FUJITA Tomonori   clean up duplicat...
258
  {
ac496bf48   Andy Lutomirski   fork: Optimize ta...
259
  #ifdef CONFIG_VMAP_STACK
9b6f7e163   Roman Gushchin   mm: rework memcg ...
260
261
262
  	struct vm_struct *vm = task_stack_vm_area(tsk);
  
  	if (vm) {
ac496bf48   Andy Lutomirski   fork: Optimize ta...
263
  		int i;
991e76738   Shakeel Butt   mm: memcontrol: a...
264
  		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
f4b00eab5   Roman Gushchin   mm: kmem: rename ...
265
  			memcg_kmem_uncharge_page(vm->pages[i], 0);
9b6f7e163   Roman Gushchin   mm: rework memcg ...
266

ac496bf48   Andy Lutomirski   fork: Optimize ta...
267
  		for (i = 0; i < NR_CACHED_STACKS; i++) {
112166f88   Christoph Lameter   kernel/fork.c: vi...
268
269
  			if (this_cpu_cmpxchg(cached_stacks[i],
  					NULL, tsk->stack_vm_area) != NULL)
ac496bf48   Andy Lutomirski   fork: Optimize ta...
270
  				continue;
ac496bf48   Andy Lutomirski   fork: Optimize ta...
271
272
  			return;
  		}
ac496bf48   Andy Lutomirski   fork: Optimize ta...
273

0f110a9b9   Andrey Ryabinin   kernel/fork: use ...
274
  		vfree_atomic(tsk->stack);
ac496bf48   Andy Lutomirski   fork: Optimize ta...
275
276
277
278
279
  		return;
  	}
  #endif
  
  	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
b69c49b78   FUJITA Tomonori   clean up duplicat...
280
  }
0d15d74a1   Thomas Gleixner   fork: Provide kme...
281
  # else
b235beea9   Linus Torvalds   Clarify naming of...
282
  static struct kmem_cache *thread_stack_cache;
0d15d74a1   Thomas Gleixner   fork: Provide kme...
283

9521d3997   Michael Ellerman   Fix build break i...
284
  /*
   * kmem-cache variant (used when the page/vmap allocator above is not
   * selected): take a stack from thread_stack_cache, strip the KASAN tag
   * from the pointer, record it in tsk->stack and return it. The result of
   * the allocation is not checked here, so the returned pointer may be NULL.
   */
  static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
0d15d74a1   Thomas Gleixner   fork: Provide kme...
285
286
  						  int node)
  {
5eed6f1df   Rik van Riel   fork,memcg: fix c...
287
288
  	unsigned long *stack;
  	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
8dcc1d346   Andrey Konovalov   kasan: don't tag ...
289
  	stack = kasan_reset_tag(stack);
5eed6f1df   Rik van Riel   fork,memcg: fix c...
290
291
  	tsk->stack = stack;
  	return stack;
0d15d74a1   Thomas Gleixner   fork: Provide kme...
292
  }
ba14a194a   Andy Lutomirski   fork: Add generic...
293
  /* kmem-cache variant: return tsk->stack to thread_stack_cache. */
  static void free_thread_stack(struct task_struct *tsk)
0d15d74a1   Thomas Gleixner   fork: Provide kme...
294
  {
ba14a194a   Andy Lutomirski   fork: Add generic...
295
  	kmem_cache_free(thread_stack_cache, tsk->stack);
0d15d74a1   Thomas Gleixner   fork: Provide kme...
296
  }
b235beea9   Linus Torvalds   Clarify naming of...
297
  /*
   * Create the "thread_stack" kmem cache with THREAD_SIZE passed as object
   * size, alignment and usercopy region size. BUG()s if the cache cannot
   * be created.
   */
  void thread_stack_cache_init(void)
0d15d74a1   Thomas Gleixner   fork: Provide kme...
298
  {
f9d29946c   David Windsor   fork: Define user...
299
300
301
  	thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
  					THREAD_SIZE, THREAD_SIZE, 0, 0,
  					THREAD_SIZE, NULL);
b235beea9   Linus Torvalds   Clarify naming of...
302
  	BUG_ON(thread_stack_cache == NULL);
0d15d74a1   Thomas Gleixner   fork: Provide kme...
303
304
  }
  # endif
b69c49b78   FUJITA Tomonori   clean up duplicat...
305
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
306
  /* SLAB cache for signal_struct structures (tsk->signal) */
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
307
  static struct kmem_cache *signal_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
308
309
  
  /* SLAB cache for sighand_struct structures (tsk->sighand) */
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
310
  struct kmem_cache *sighand_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
311
312
  
  /* SLAB cache for files_struct structures (tsk->files) */
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
313
  struct kmem_cache *files_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
314
315
  
  /* SLAB cache for fs_struct structures (tsk->fs) */
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
316
  struct kmem_cache *fs_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
317
318
  
  /* SLAB cache for vm_area_struct structures */
3928d4f5e   Linus Torvalds   mm: use helper fu...
319
  static struct kmem_cache *vm_area_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
320
321
  
  /* SLAB cache for mm_struct structures (tsk->mm) */
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
322
  static struct kmem_cache *mm_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
323

490fc0538   Linus Torvalds   mm: make vm_area_...
324
  /*
   * Allocate a vm_area_struct from vm_area_cachep and, if the allocation
   * succeeded, initialise it for @mm with vma_init().
   * Returns the new vma, or NULL when the allocation failed.
   */
  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
3928d4f5e   Linus Torvalds   mm: use helper fu...
325
  {
a670468f5   Andrew Morton   mm: zero out the ...
326
  	struct vm_area_struct *vma;
490fc0538   Linus Torvalds   mm: make vm_area_...
327

a670468f5   Andrew Morton   mm: zero out the ...
328
  	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
027232da7   Kirill A. Shutemov   mm: introduce vma...
329
330
  	if (vma)
  		vma_init(vma, mm);
490fc0538   Linus Torvalds   mm: make vm_area_...
331
  	return vma;
3928d4f5e   Linus Torvalds   mm: use helper fu...
332
333
334
335
  }
  
  /*
   * Allocate a vm_area_struct from vm_area_cachep and fill it with a
   * structure copy of @orig. In the copy, the anon_vma_chain list head is
   * re-initialised and the vm_next/vm_prev links are set to NULL.
   * Returns the new vma, or NULL when the allocation failed.
   */
  struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
  {
95faf6992   Linus Torvalds   mm: make vm_area_...
336
337
338
  	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
  
  	if (new) {
cda099b37   Qian Cai   fork: Annotate a ...
339
340
341
342
343
344
345
  		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
  		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
  		/*
  		 * orig->shared.rb may be modified concurrently, but the clone
  		 * will be reinitialized.
  		 */
  		*new = data_race(*orig);
95faf6992   Linus Torvalds   mm: make vm_area_...
346
  		INIT_LIST_HEAD(&new->anon_vma_chain);
e39a4b332   Li Xinhai   mm: set vm_next a...
347
  		new->vm_next = new->vm_prev = NULL;
95faf6992   Linus Torvalds   mm: make vm_area_...
348
349
  	}
  	return new;
3928d4f5e   Linus Torvalds   mm: use helper fu...
350
351
352
353
354
355
  }
  
  /* Return @vma to vm_area_cachep. */
  void vm_area_free(struct vm_area_struct *vma)
  {
  	kmem_cache_free(vm_area_cachep, vma);
  }
ba14a194a   Andy Lutomirski   fork: Add generic...
356
  /*
   * Adjust the NR_KERNEL_STACK_KB statistic by @account times the stack
   * size in KiB (THREAD_SIZE / 1024). release_task_stack() passes -1 when
   * a stack is released. When the task has a stack vm area, the update is
   * made via that area's first page; otherwise it is made via the stack
   * address itself.
   */
  static void account_kernel_stack(struct task_struct *tsk, int account)
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
357
  {
ba14a194a   Andy Lutomirski   fork: Add generic...
358
359
  	void *stack = task_stack_page(tsk);
  	struct vm_struct *vm = task_stack_vm_area(tsk);
ba14a194a   Andy Lutomirski   fork: Add generic...
360

991e76738   Shakeel Butt   mm: memcontrol: a...
361
362
363
364
365
366
367
  	/* All stack pages are in the same node. */
  	if (vm)
  		mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
  				      account * (THREAD_SIZE / 1024));
  	else
  		mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
  				      account * (THREAD_SIZE / 1024));
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
368
  }
9b6f7e163   Roman Gushchin   mm: rework memcg ...
369
370
371
372
373
  /*
   * With CONFIG_VMAP_STACK, charge each page of the task's stack vm area
   * via memcg_kmem_charge_page(). Returns the first non-zero error from
   * that call, leaving the remaining pages uncharged, or 0 on success.
   * Without CONFIG_VMAP_STACK, or when the task has no stack vm area, this
   * does nothing and returns 0.
   */
  static int memcg_charge_kernel_stack(struct task_struct *tsk)
  {
  #ifdef CONFIG_VMAP_STACK
  	struct vm_struct *vm = task_stack_vm_area(tsk);
  	int ret;
991e76738   Shakeel Butt   mm: memcontrol: a...
374
  	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
9b6f7e163   Roman Gushchin   mm: rework memcg ...
375
376
  	if (vm) {
  		int i;
991e76738   Shakeel Butt   mm: memcontrol: a...
377
  		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
9b6f7e163   Roman Gushchin   mm: rework memcg ...
378
379
  		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
  			/*
f4b00eab5   Roman Gushchin   mm: kmem: rename ...
380
  			 * If memcg_kmem_charge_page() fails, page->mem_cgroup
991e76738   Shakeel Butt   mm: memcontrol: a...
381
382
  			 * pointer is NULL, and memcg_kmem_uncharge_page() in
  			 * free_thread_stack() will ignore this page.
9b6f7e163   Roman Gushchin   mm: rework memcg ...
383
  			 */
f4b00eab5   Roman Gushchin   mm: kmem: rename ...
384
385
  			ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
  						     0);
9b6f7e163   Roman Gushchin   mm: rework memcg ...
386
387
  			if (ret)
  				return ret;
9b6f7e163   Roman Gushchin   mm: rework memcg ...
388
389
390
391
392
  		}
  	}
  #endif
  	return 0;
  }
68f24b08e   Andy Lutomirski   sched/core: Free ...
393
  /*
   * Un-account and free the stack of @tsk, then clear tsk->stack (and
   * tsk->stack_vm_area under CONFIG_VMAP_STACK). If the task is not in
   * TASK_DEAD state, this warns and returns without freeing anything.
   */
  static void release_task_stack(struct task_struct *tsk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
394
  {
405c07597   Andy Lutomirski   fork: Add task st...
395
396
  	if (WARN_ON(tsk->state != TASK_DEAD))
  		return;  /* Better to leak the stack than to free prematurely */
ba14a194a   Andy Lutomirski   fork: Add generic...
397
  	account_kernel_stack(tsk, -1);
ba14a194a   Andy Lutomirski   fork: Add generic...
398
  	free_thread_stack(tsk);
68f24b08e   Andy Lutomirski   sched/core: Free ...
399
400
401
402
403
404
405
406
407
  	tsk->stack = NULL;
  #ifdef CONFIG_VMAP_STACK
  	tsk->stack_vm_area = NULL;
  #endif
  }
  
  #ifdef CONFIG_THREAD_INFO_IN_TASK
  /*
   * Drop one reference on tsk->stack_refcount; when it reaches zero,
   * release the task's stack via release_task_stack().
   */
  void put_task_stack(struct task_struct *tsk)
  {
f0b89d395   Elena Reshetova   sched/core: Conve...
408
  	if (refcount_dec_and_test(&tsk->stack_refcount))
68f24b08e   Andy Lutomirski   sched/core: Free ...
409
410
411
412
413
414
  		release_task_stack(tsk);
  }
  #endif
  
  /*
   * Final teardown of a task_struct. In order: release the shadow call
   * stack state (scs_release), release the stack here when
   * CONFIG_THREAD_INFO_IN_TASK is not set (otherwise only warn if the
   * stack refcount is not already zero), run the rt-mutex debug, ftrace
   * graph and arch release hooks, free the kthread struct for PF_KTHREAD
   * tasks, and finally free the task_struct itself.
   */
  void free_task(struct task_struct *tsk)
  {
d08b9f0ca   Sami Tolvanen   scs: Add support ...
415
  	scs_release(tsk);
68f24b08e   Andy Lutomirski   sched/core: Free ...
416
417
418
419
420
421
422
423
424
425
426
  #ifndef CONFIG_THREAD_INFO_IN_TASK
  	/*
  	 * The task is finally done with both the stack and thread_info,
  	 * so free both.
  	 */
  	release_task_stack(tsk);
  #else
  	/*
  	 * If the task had a separate stack allocation, it should be gone
  	 * by now.
  	 */
f0b89d395   Elena Reshetova   sched/core: Conve...
427
  	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
68f24b08e   Andy Lutomirski   sched/core: Free ...
428
  #endif
23f78d4a0   Ingo Molnar   [PATCH] pi-futex:...
429
  	rt_mutex_debug_task_free(tsk);
fb52607af   Frederic Weisbecker   tracing/function-...
430
  	ftrace_graph_exit_task(tsk);
f19b9f74b   Akinobu Mita   fork: fix error h...
431
  	arch_release_task_struct(tsk);
1da5c46fa   Oleg Nesterov   kthread: Make str...
432
433
  	if (tsk->flags & PF_KTHREAD)
  		free_kthread_struct(tsk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
434
435
436
  	free_task_struct(tsk);
  }
  EXPORT_SYMBOL(free_task);
d70f2a14b   Andrew Morton   include/linux/sch...
437
438
439
440
441
442
443
444
445
446
447
  #ifdef CONFIG_MMU
  static __latent_entropy int dup_mmap(struct mm_struct *mm,
  					struct mm_struct *oldmm)
  {
  	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
  	struct rb_node **rb_link, *rb_parent;
  	int retval;
  	unsigned long charge;
  	LIST_HEAD(uf);
  
  	uprobe_start_dup_mmap();
d8ed45c5d   Michel Lespinasse   mmap locking API:...
448
  	if (mmap_write_lock_killable(oldmm)) {
d70f2a14b   Andrew Morton   include/linux/sch...
449
450
451
452
453
454
455
456
  		retval = -EINTR;
  		goto fail_uprobe_end;
  	}
  	flush_cache_dup_mm(oldmm);
  	uprobe_dup_mmap(oldmm, mm);
  	/*
  	 * Not linked in yet - no deadlock potential:
  	 */
aaa2cc56c   Michel Lespinasse   mmap locking API:...
457
  	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
d70f2a14b   Andrew Morton   include/linux/sch...
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
  
  	/* No ordering required: file already has been exposed. */
  	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
  
  	mm->total_vm = oldmm->total_vm;
  	mm->data_vm = oldmm->data_vm;
  	mm->exec_vm = oldmm->exec_vm;
  	mm->stack_vm = oldmm->stack_vm;
  
  	rb_link = &mm->mm_rb.rb_node;
  	rb_parent = NULL;
  	pprev = &mm->mmap;
  	retval = ksm_fork(mm, oldmm);
  	if (retval)
  		goto out;
  	retval = khugepaged_fork(mm, oldmm);
  	if (retval)
  		goto out;
  
  	prev = NULL;
  	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
  		struct file *file;
  
  		if (mpnt->vm_flags & VM_DONTCOPY) {
  			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
  			continue;
  		}
  		charge = 0;
655c79bb4   Tetsuo Handa   mm: check for SIG...
486
487
488
489
490
491
492
493
  		/*
  		 * Don't duplicate many vmas if we've been oom-killed (for
  		 * example)
  		 */
  		if (fatal_signal_pending(current)) {
  			retval = -EINTR;
  			goto out;
  		}
d70f2a14b   Andrew Morton   include/linux/sch...
494
495
496
497
498
499
500
  		if (mpnt->vm_flags & VM_ACCOUNT) {
  			unsigned long len = vma_pages(mpnt);
  
  			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
  				goto fail_nomem;
  			charge = len;
  		}
3928d4f5e   Linus Torvalds   mm: use helper fu...
501
  		tmp = vm_area_dup(mpnt);
d70f2a14b   Andrew Morton   include/linux/sch...
502
503
  		if (!tmp)
  			goto fail_nomem;
d70f2a14b   Andrew Morton   include/linux/sch...
504
505
506
507
508
509
510
511
  		retval = vma_dup_policy(mpnt, tmp);
  		if (retval)
  			goto fail_nomem_policy;
  		tmp->vm_mm = mm;
  		retval = dup_userfaultfd(tmp, &uf);
  		if (retval)
  			goto fail_nomem_anon_vma_fork;
  		if (tmp->vm_flags & VM_WIPEONFORK) {
93949bb21   Li Xinhai   mm: don't prepare...
512
513
514
515
516
  			/*
  			 * VM_WIPEONFORK gets a clean slate in the child.
  			 * Don't prepare anon_vma until fault since we don't
  			 * copy page for current vma.
  			 */
d70f2a14b   Andrew Morton   include/linux/sch...
517
  			tmp->anon_vma = NULL;
d70f2a14b   Andrew Morton   include/linux/sch...
518
519
520
  		} else if (anon_vma_fork(tmp, mpnt))
  			goto fail_nomem_anon_vma_fork;
  		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
d70f2a14b   Andrew Morton   include/linux/sch...
521
522
523
524
525
526
527
528
529
530
  		file = tmp->vm_file;
  		if (file) {
  			struct inode *inode = file_inode(file);
  			struct address_space *mapping = file->f_mapping;
  
  			get_file(file);
  			if (tmp->vm_flags & VM_DENYWRITE)
  				atomic_dec(&inode->i_writecount);
  			i_mmap_lock_write(mapping);
  			if (tmp->vm_flags & VM_SHARED)
cf508b584   Miaohe Lin   mm: use helper fu...
531
  				mapping_allow_writable(mapping);
d70f2a14b   Andrew Morton   include/linux/sch...
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
  			flush_dcache_mmap_lock(mapping);
  			/* insert tmp into the share list, just after mpnt */
  			vma_interval_tree_insert_after(tmp, mpnt,
  					&mapping->i_mmap);
  			flush_dcache_mmap_unlock(mapping);
  			i_mmap_unlock_write(mapping);
  		}
  
  		/*
  		 * Clear hugetlb-related page reserves for children. This only
  		 * affects MAP_PRIVATE mappings. Faults generated by the child
  		 * are not guaranteed to succeed, even if read-only
  		 */
  		if (is_vm_hugetlb_page(tmp))
  			reset_vma_resv_huge_pages(tmp);
  
  		/*
  		 * Link in the new vma and copy the page table entries.
  		 */
  		*pprev = tmp;
  		pprev = &tmp->vm_next;
  		tmp->vm_prev = prev;
  		prev = tmp;
  
  		__vma_link_rb(mm, tmp, rb_link, rb_parent);
  		rb_link = &tmp->vm_rb.rb_right;
  		rb_parent = &tmp->vm_rb;
  
  		mm->map_count++;
  		if (!(tmp->vm_flags & VM_WIPEONFORK))
c78f46364   Peter Xu   mm: remove src/ds...
562
  			retval = copy_page_range(tmp, mpnt);
d70f2a14b   Andrew Morton   include/linux/sch...
563
564
565
566
567
568
569
570
  
  		if (tmp->vm_ops && tmp->vm_ops->open)
  			tmp->vm_ops->open(tmp);
  
  		if (retval)
  			goto out;
  	}
  	/* a new mm has just been created */
1ed0cc5a0   Nadav Amit   mm: respect arch_...
571
  	retval = arch_dup_mmap(oldmm, mm);
d70f2a14b   Andrew Morton   include/linux/sch...
572
  out:
d8ed45c5d   Michel Lespinasse   mmap locking API:...
573
  	mmap_write_unlock(mm);
d70f2a14b   Andrew Morton   include/linux/sch...
574
  	flush_tlb_mm(oldmm);
d8ed45c5d   Michel Lespinasse   mmap locking API:...
575
  	mmap_write_unlock(oldmm);
d70f2a14b   Andrew Morton   include/linux/sch...
576
577
578
579
580
581
582
  	dup_userfaultfd_complete(&uf);
  fail_uprobe_end:
  	uprobe_end_dup_mmap();
  	return retval;
  fail_nomem_anon_vma_fork:
  	mpol_put(vma_policy(tmp));
  fail_nomem_policy:
3928d4f5e   Linus Torvalds   mm: use helper fu...
583
  	vm_area_free(tmp);
d70f2a14b   Andrew Morton   include/linux/sch...
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
  fail_nomem:
  	retval = -ENOMEM;
  	vm_unacct_memory(charge);
  	goto out;
  }
  
  /*
   * Allocate a pgd for @mm with pgd_alloc() and store it in mm->pgd.
   * Returns 0 on success, or -ENOMEM when pgd_alloc() returned NULL.
   */
  static inline int mm_alloc_pgd(struct mm_struct *mm)
  {
  	mm->pgd = pgd_alloc(mm);

  	return unlikely(!mm->pgd) ? -ENOMEM : 0;
  }
  
  /* Free the pgd stored in mm->pgd via pgd_free(). */
  static inline void mm_free_pgd(struct mm_struct *mm)
  {
  	pgd_free(mm, mm->pgd);
  }
  #else
  /*
   * !CONFIG_MMU variant: no mappings are duplicated. Only mm->exe_file is
   * set, from get_mm_exe_file(oldmm), while oldmm's mmap lock is held for
   * writing. Always returns 0.
   */
  static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
  {
d8ed45c5d   Michel Lespinasse   mmap locking API:...
605
  	mmap_write_lock(oldmm);
d70f2a14b   Andrew Morton   include/linux/sch...
606
  	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
d8ed45c5d   Michel Lespinasse   mmap locking API:...
607
  	mmap_write_unlock(oldmm);
d70f2a14b   Andrew Morton   include/linux/sch...
608
609
610
611
612
613
614
615
616
  	return 0;
  }
  #define mm_alloc_pgd(mm)	(0)
  #define mm_free_pgd(mm)
  #endif /* CONFIG_MMU */
  
  static void check_mm(struct mm_struct *mm)
  {
  	int i;
8495f7e67   Sai Praneeth Prakhya   fork: improve err...
617
618
  	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
  			 "Please make sure 'struct resident_page_types[]' is updated as well");
d70f2a14b   Andrew Morton   include/linux/sch...
619
620
621
622
  	for (i = 0; i < NR_MM_COUNTERS; i++) {
  		long x = atomic_long_read(&mm->rss_stat.count[i]);
  
  		if (unlikely(x))
8495f7e67   Sai Praneeth Prakhya   fork: improve err...
623
624
625
  			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld
  ",
  				 mm, resident_page_types[i], x);
d70f2a14b   Andrew Morton   include/linux/sch...
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
  	}
  
  	if (mm_pgtables_bytes(mm))
  		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld
  ",
  				mm_pgtables_bytes(mm));
  
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
  	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
  #endif
  }
  
  #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
  #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
  
  /*
   * Called when the last reference to the mm
   * is dropped: either by a lazy thread or by
   * mmput. Free the page directory and the mm.
   */
d34bc48f8   Andrew Morton   include/linux/sch...
646
  void __mmdrop(struct mm_struct *mm)
d70f2a14b   Andrew Morton   include/linux/sch...
647
648
  {
  	BUG_ON(mm == &init_mm);
3eda69c92   Mark Rutland   kernel/fork.c: de...
649
650
  	WARN_ON_ONCE(mm == current->mm);
  	WARN_ON_ONCE(mm == current->active_mm);
d70f2a14b   Andrew Morton   include/linux/sch...
651
652
  	mm_free_pgd(mm);
  	destroy_context(mm);
984cfe4e2   Jason Gunthorpe   mm/mmu_notifier: ...
653
  	mmu_notifier_subscriptions_destroy(mm);
d70f2a14b   Andrew Morton   include/linux/sch...
654
655
656
657
  	check_mm(mm);
  	put_user_ns(mm->user_ns);
  	free_mm(mm);
  }
d34bc48f8   Andrew Morton   include/linux/sch...
658
  EXPORT_SYMBOL_GPL(__mmdrop);
d70f2a14b   Andrew Morton   include/linux/sch...
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
  
  /* Work handler: recover the mm embedding @work and run __mmdrop() on it. */
  static void mmdrop_async_fn(struct work_struct *work)
  {
  	struct mm_struct *victim = container_of(work, struct mm_struct,
  						async_put_work);

  	__mmdrop(victim);
  }
  
  /*
   * Drop one mm_count reference. If it was the last one, defer the actual
   * __mmdrop() to a work item instead of running it in the caller's context.
   */
  static void mmdrop_async(struct mm_struct *mm)
  {
  	if (!unlikely(atomic_dec_and_test(&mm->mm_count)))
  		return;

  	INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
  	schedule_work(&mm->async_put_work);
  }
ea6d290ca   Oleg Nesterov   signals: make tas...
675
676
  static inline void free_signal_struct(struct signal_struct *sig)
  {
97101eb41   Oleg Nesterov   exit: move taskst...
677
  	taskstats_tgid_free(sig);
1c5354de9   Mike Galbraith   sched: Move sched...
678
  	sched_autogroup_exit(sig);
7283094ec   Michal Hocko   kernel, oom: fix ...
679
680
681
682
  	/*
  	 * __mmdrop is not safe to call from softirq context on x86 due to
  	 * pgd_dtor so postpone it to the async context
  	 */
26db62f17   Michal Hocko   oom: keep mm of t...
683
  	if (sig->oom_mm)
7283094ec   Michal Hocko   kernel, oom: fix ...
684
  		mmdrop_async(sig->oom_mm);
ea6d290ca   Oleg Nesterov   signals: make tas...
685
686
687
688
689
  	kmem_cache_free(signal_cachep, sig);
  }
  
  static inline void put_signal_struct(struct signal_struct *sig)
  {
60d4de3ff   Elena Reshetova   sched/core: Conve...
690
  	if (refcount_dec_and_test(&sig->sigcnt))
ea6d290ca   Oleg Nesterov   signals: make tas...
691
692
  		free_signal_struct(sig);
  }
158d9ebd1   Andrew Morton   [PATCH] resurrect...
693
  void __put_task_struct(struct task_struct *tsk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
694
  {
270f722d4   Eugene Teo   Fix tsk->exit_sta...
695
  	WARN_ON(!tsk->exit_state);
ec1d28192   Elena Reshetova   sched/core: Conve...
696
  	WARN_ON(refcount_read(&tsk->usage));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
697
  	WARN_ON(tsk == current);
0f2122045   Jens Axboe   io_uring: don't r...
698
  	io_uring_free(tsk);
2e91fa7f6   Tejun Heo   cgroup: keep zomb...
699
  	cgroup_free(tsk);
16d51a590   Jann Horn   sched/fair: Don't...
700
  	task_numa_free(tsk, true);
1a2a4d06e   Kees Cook   security: create ...
701
  	security_task_free(tsk);
e0e817392   David Howells   CRED: Add some co...
702
  	exit_creds(tsk);
35df17c57   Shailabh Nagar   [PATCH] task dela...
703
  	delayacct_tsk_free(tsk);
ea6d290ca   Oleg Nesterov   signals: make tas...
704
  	put_signal_struct(tsk->signal);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
705
706
707
708
  
  	if (!profile_handoff_task(tsk))
  		free_task(tsk);
  }
77c100c83   Rik van Riel   export pid symbol...
709
  EXPORT_SYMBOL_GPL(__put_task_struct);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
710

6c0a9fa62   Thomas Gleixner   fork: Remove the ...
711
  /* Weak, empty default; invoked from fork_init() for arch task-cache setup. */
  void __init __weak arch_task_cache_init(void) { }
61c4628b5   Suresh Siddha   x86, fpu: split F...
712

ff691f6e0   Heinrich Schuchardt   kernel/fork.c: ne...
713
714
715
  /*
   * set_max_threads
   */
16db3d3f1   Heinrich Schuchardt   kernel/sysctl.c: ...
716
  static void set_max_threads(unsigned int max_threads_suggested)
ff691f6e0   Heinrich Schuchardt   kernel/fork.c: ne...
717
  {
ac1b398de   Heinrich Schuchardt   kernel/fork.c: av...
718
  	u64 threads;
ca79b0c21   Arun KS   mm: convert total...
719
  	unsigned long nr_pages = totalram_pages();
ff691f6e0   Heinrich Schuchardt   kernel/fork.c: ne...
720
721
  
  	/*
ac1b398de   Heinrich Schuchardt   kernel/fork.c: av...
722
723
  	 * The number of threads shall be limited such that the thread
  	 * structures may only consume a small part of the available memory.
ff691f6e0   Heinrich Schuchardt   kernel/fork.c: ne...
724
  	 */
3d6357de8   Arun KS   mm: reference tot...
725
  	if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
ac1b398de   Heinrich Schuchardt   kernel/fork.c: av...
726
727
  		threads = MAX_THREADS;
  	else
3d6357de8   Arun KS   mm: reference tot...
728
  		threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
ac1b398de   Heinrich Schuchardt   kernel/fork.c: av...
729
  				    (u64) THREAD_SIZE * 8UL);
16db3d3f1   Heinrich Schuchardt   kernel/sysctl.c: ...
730
731
  	if (threads > max_threads_suggested)
  		threads = max_threads_suggested;
ac1b398de   Heinrich Schuchardt   kernel/fork.c: av...
732
  	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
ff691f6e0   Heinrich Schuchardt   kernel/fork.c: ne...
733
  }
5aaeb5c01   Ingo Molnar   x86/fpu, sched: I...
734
735
736
737
  #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
  /* Initialized by the architecture: */
  int arch_task_struct_size __read_mostly;
  #endif
0c8c0f03e   Dave Hansen   x86/fpu, sched: D...
738

4189ff234   Christoph Hellwig   kernel: only defi...
739
  #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
5905429ad   Kees Cook   fork: Provide use...
740
741
742
743
744
745
746
747
748
749
750
751
752
753
  /*
   * Report the usercopy whitelist window of task_struct through @offset and
   * @size, starting from the architecture's thread_struct whitelist.
   */
  static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
  {
  	/* Ask the architecture for its thread_struct window first. */
  	arch_thread_struct_whitelist(offset, size);

  	/* Nothing whitelisted (or empty thread_struct): report offset 0. */
  	if (unlikely(*size == 0)) {
  		*offset = 0;
  		return;
  	}

  	/* Rebase the offset from thread_struct to the enclosing task_struct. */
  	*offset += offsetof(struct task_struct, thread);
  }
4189ff234   Christoph Hellwig   kernel: only defi...
754
  #endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
5905429ad   Kees Cook   fork: Provide use...
755

ff691f6e0   Heinrich Schuchardt   kernel/fork.c: ne...
756
  /*
   * Boot-time setup for fork: create the task_struct slab (unless the
   * architecture provides its own allocator), size max_threads, derive the
   * init task's RLIMIT_NPROC/RLIMIT_SIGPENDING and the initial user
   * namespace ucount limits from it, and initialise the remaining
   * fork-related subsystems.
   */
  void __init fork_init(void)
  {
  	int i;
  #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  #ifndef ARCH_MIN_TASKALIGN
  #define ARCH_MIN_TASKALIGN	0
  #endif
  	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
  	unsigned long useroffset, usersize;

  	/* create a slab on which task_structs can be allocated */
  	task_struct_whitelist(&useroffset, &usersize);
  	task_struct_cachep = kmem_cache_create_usercopy("task_struct",
  			arch_task_struct_size, align,
  			SLAB_PANIC|SLAB_ACCOUNT,
  			useroffset, usersize, NULL);
  #endif

  	/* do the arch specific task caches init */
  	arch_task_cache_init();

  	set_max_threads(MAX_THREADS);

  	/* Both NPROC limits start at half of max_threads. */
  	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
  	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
  	init_task.signal->rlim[RLIMIT_SIGPENDING] =
  		init_task.signal->rlim[RLIMIT_NPROC];

  	for (i = 0; i < UCOUNT_COUNTS; i++) {
  		init_user_ns.ucount_max[i] = max_threads/2;
  	}

  #ifdef CONFIG_VMAP_STACK
  	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
  			  NULL, free_vm_stack_cache);
  #endif

  	scs_init();

  	lockdep_init_task(&init_task);
  	uprobes_init();
  }
52f5684c8   Gideon Israel Dsouza   kernel: use macro...
795
  /*
   * Weak default: copy the whole task_struct from @src to @dst by structure
   * assignment. Always returns 0.
   */
  int __weak arch_dup_task_struct(struct task_struct *dst,
  					       struct task_struct *src)
  {
  	*dst = *src;
  	return 0;
  }
d4311ff1a   Aaron Tomlin   init/main.c: Give...
801
802
803
804
805
806
807
  /* Write STACK_END_MAGIC at the end of @tsk's stack for overflow detection. */
  void set_task_stack_end_magic(struct task_struct *tsk)
  {
  	unsigned long *magic_slot = end_of_stack(tsk);

  	*magic_slot = STACK_END_MAGIC;
  }
725fc629f   Andi Kleen   kernek/fork.c: al...
808
  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
809
810
  {
  	struct task_struct *tsk;
b235beea9   Linus Torvalds   Clarify naming of...
811
  	unsigned long *stack;
0f4991e8f   YueHaibing   kernel/fork.c: ma...
812
  	struct vm_struct *stack_vm_area __maybe_unused;
3e26c149c   Peter Zijlstra   mm: dirty balanci...
813
  	int err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
814

725fc629f   Andi Kleen   kernek/fork.c: al...
815
816
  	if (node == NUMA_NO_NODE)
  		node = tsk_fork_get_node(orig);
504f52b54   Eric Dumazet   mm: NUMA aware al...
817
  	tsk = alloc_task_struct_node(node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
818
819
  	if (!tsk)
  		return NULL;
b235beea9   Linus Torvalds   Clarify naming of...
820
821
  	stack = alloc_thread_stack_node(tsk, node);
  	if (!stack)
f19b9f74b   Akinobu Mita   fork: fix error h...
822
  		goto free_tsk;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
823

9b6f7e163   Roman Gushchin   mm: rework memcg ...
824
825
  	if (memcg_charge_kernel_stack(tsk))
  		goto free_stack;
ba14a194a   Andy Lutomirski   fork: Add generic...
826
  	stack_vm_area = task_stack_vm_area(tsk);
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
827
  	err = arch_dup_task_struct(tsk, orig);
ba14a194a   Andy Lutomirski   fork: Add generic...
828
829
830
831
832
833
834
835
836
837
  
  	/*
  	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
  	 * sure they're properly initialized before using any stack-related
  	 * functions again.
  	 */
  	tsk->stack = stack;
  #ifdef CONFIG_VMAP_STACK
  	tsk->stack_vm_area = stack_vm_area;
  #endif
68f24b08e   Andy Lutomirski   sched/core: Free ...
838
  #ifdef CONFIG_THREAD_INFO_IN_TASK
f0b89d395   Elena Reshetova   sched/core: Conve...
839
  	refcount_set(&tsk->stack_refcount, 1);
68f24b08e   Andy Lutomirski   sched/core: Free ...
840
  #endif
ba14a194a   Andy Lutomirski   fork: Add generic...
841

164c33c6a   Salman Qazi   sched: Fix fork()...
842
  	if (err)
b235beea9   Linus Torvalds   Clarify naming of...
843
  		goto free_stack;
164c33c6a   Salman Qazi   sched: Fix fork()...
844

d08b9f0ca   Sami Tolvanen   scs: Add support ...
845
846
847
  	err = scs_prepare(tsk, node);
  	if (err)
  		goto free_stack;
dbd952127   Kees Cook   seccomp: introduc...
848
849
850
851
852
853
854
855
856
  #ifdef CONFIG_SECCOMP
  	/*
  	 * We must handle setting up seccomp filters once we're under
  	 * the sighand lock in case orig has changed between now and
  	 * then. Until then, filter must be NULL to avoid messing up
  	 * the usage counts on the error path calling free_task.
  	 */
  	tsk->seccomp.filter = NULL;
  #endif
87bec58a5   Andrew Morton   revert "sched: Fi...
857
858
  
  	setup_thread_stack(tsk, orig);
8e7cac798   Avi Kivity   core: Fix user re...
859
  	clear_user_return_notifier(tsk);
f26f9aff6   Mike Galbraith   Sched: fix skip_c...
860
  	clear_tsk_need_resched(tsk);
d4311ff1a   Aaron Tomlin   init/main.c: Give...
861
  	set_task_stack_end_magic(tsk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
862

050e9baa9   Linus Torvalds   Kbuild: rename CC...
863
  #ifdef CONFIG_STACKPROTECTOR
7cd815bce   Rik van Riel   fork,random: use ...
864
  	tsk->stack_canary = get_random_canary();
0a4254058   Arjan van de Ven   [PATCH] Add the c...
865
  #endif
3bd370625   Sebastian Andrzej Siewior   sched/core: Provi...
866
867
  	if (orig->cpus_ptr == &orig->cpus_mask)
  		tsk->cpus_ptr = &tsk->cpus_mask;
0a4254058   Arjan van de Ven   [PATCH] Add the c...
868

fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
869
  	/*
0ff7b2cfb   Eric W. Biederman   tasks, sched/core...
870
871
  	 * One for the user space visible state that goes away when reaped.
  	 * One for the scheduler.
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
872
  	 */
0ff7b2cfb   Eric W. Biederman   tasks, sched/core...
873
874
875
  	refcount_set(&tsk->rcu_users, 2);
  	/* One for the rcu users */
  	refcount_set(&tsk->usage, 1);
6c5c93415   Alexey Dobriyan   [PATCH] ifdef blk...
876
  #ifdef CONFIG_BLK_DEV_IO_TRACE
2056a782f   Jens Axboe   [PATCH] Block que...
877
  	tsk->btrace_seq = 0;
6c5c93415   Alexey Dobriyan   [PATCH] ifdef blk...
878
  #endif
a0aa7f68a   Jens Axboe   [PATCH] Don't inh...
879
  	tsk->splice_pipe = NULL;
5640f7685   Eric Dumazet   net: use a per ta...
880
  	tsk->task_frag.page = NULL;
093e5840a   Sebastian Andrzej Siewior   sched/core: Reset...
881
  	tsk->wake_q.next = NULL;
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
882

ba14a194a   Andy Lutomirski   fork: Add generic...
883
  	account_kernel_stack(tsk, 1);
c6a7f5728   KOSAKI Motohiro   mm: oom analysis:...
884

5c9a8750a   Dmitry Vyukov   kernel: add kcov ...
885
  	kcov_task_init(tsk);
e41d58185   Dmitry Vyukov   fault-inject: sup...
886
887
888
  #ifdef CONFIG_FAULT_INJECTION
  	tsk->fail_nth = 0;
  #endif
2c323017e   Josef Bacik   blk-cgroup: clear...
889
890
891
892
  #ifdef CONFIG_BLK_CGROUP
  	tsk->throttle_queue = NULL;
  	tsk->use_memdelay = 0;
  #endif
d46eb14b7   Shakeel Butt   fs: fsnotify: acc...
893
894
895
  #ifdef CONFIG_MEMCG
  	tsk->active_memcg = NULL;
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
896
  	return tsk;
61c4628b5   Suresh Siddha   x86, fpu: split F...
897

b235beea9   Linus Torvalds   Clarify naming of...
898
  free_stack:
ba14a194a   Andy Lutomirski   fork: Add generic...
899
  	free_thread_stack(tsk);
f19b9f74b   Akinobu Mita   fork: fix error h...
900
  free_tsk:
61c4628b5   Suresh Siddha   x86, fpu: split F...
901
902
  	free_task_struct(tsk);
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
903
  }
23ff44402   Daniel Walker   whitespace fixes:...
904
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
905

4cb0e11b1   Hidehiro Kawai   coredump_filter: ...
906
907
908
909
910
911
912
913
914
915
916
  static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
  
  /*
   * Parse the "coredump_filter=" boot parameter value @s and store it,
   * shifted into position and masked, in default_dump_filter.
   * Always returns 1.
   */
  static int __init coredump_filter_setup(char *s)
  {
  	unsigned long requested = simple_strtoul(s, NULL, 0);

  	default_dump_filter = (requested << MMF_DUMP_FILTER_SHIFT) &
  			      MMF_DUMP_FILTER_MASK;
  	return 1;
  }
  
  __setup("coredump_filter=", coredump_filter_setup);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
917
  #include <linux/init_task.h>
858f09930   Alexey Dobriyan   aio: ifdef fields...
918
919
920
921
  /* With CONFIG_AIO: initialise the ioctx lock and clear the ioctx table. */
  static void mm_init_aio(struct mm_struct *mm)
  {
  #ifdef CONFIG_AIO
  	spin_lock_init(&mm->ioctx_lock);
  	mm->ioctx_table = NULL;
  #endif
  }
c3f3ce049   Andrea Arcangeli   userfaultfd: use ...
925
926
927
928
929
930
931
932
  /* With CONFIG_MEMCG: reset @mm->owner to NULL, but only if it is @p. */
  static __always_inline void mm_clear_owner(struct mm_struct *mm,
  					   struct task_struct *p)
  {
  #ifdef CONFIG_MEMCG
  	if (mm->owner != p)
  		return;

  	WRITE_ONCE(mm->owner, NULL);
  #endif
  }
33144e842   Vladimir Davydov   kernel/fork.c: ma...
933
934
935
936
937
938
  /* With CONFIG_MEMCG: record @p as the owner of @mm; otherwise a no-op. */
  static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
  {
  #ifdef CONFIG_MEMCG
  	mm->owner = p;
  #endif
  }
355627f51   Eric Biggers   mm, uprobes: fix ...
939
940
941
942
943
944
  /* With CONFIG_UPROBES: start with no xol_area; otherwise a no-op. */
  static void mm_init_uprobes_state(struct mm_struct *mm)
  {
  #ifdef CONFIG_UPROBES
  	mm->uprobes_state.xol_area = NULL;
  #endif
  }
bfedb5892   Eric W. Biederman   mm: Add a user_ns...
945
946
  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
  	struct user_namespace *user_ns)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
947
  {
41f727fde   Vladimir Davydov   fork/exec: cleanu...
948
949
950
  	mm->mmap = NULL;
  	mm->mm_rb = RB_ROOT;
  	mm->vmacache_seqnum = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
951
952
  	atomic_set(&mm->mm_users, 1);
  	atomic_set(&mm->mm_count, 1);
d8ed45c5d   Michel Lespinasse   mmap locking API:...
953
  	mmap_init_lock(mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
954
  	INIT_LIST_HEAD(&mm->mmlist);
999d9fc16   Oleg Nesterov   coredump: move mm...
955
  	mm->core_state = NULL;
af5b0f6a0   Kirill A. Shutemov   mm: consolidate p...
956
  	mm_pgtables_bytes_init(mm);
41f727fde   Vladimir Davydov   fork/exec: cleanu...
957
958
  	mm->map_count = 0;
  	mm->locked_vm = 0;
008cfe441   Peter Xu   mm: Introduce mm_...
959
  	atomic_set(&mm->has_pinned, 0);
70f8a3ca6   Davidlohr Bueso   mm: make mm->pinn...
960
  	atomic64_set(&mm->pinned_vm, 0);
d559db086   KAMEZAWA Hiroyuki   mm: clean up mm_c...
961
  	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
962
  	spin_lock_init(&mm->page_table_lock);
88aa7cc68   Yang Shi   mm: introduce arg...
963
  	spin_lock_init(&mm->arg_lock);
41f727fde   Vladimir Davydov   fork/exec: cleanu...
964
  	mm_init_cpumask(mm);
858f09930   Alexey Dobriyan   aio: ifdef fields...
965
  	mm_init_aio(mm);
cf475ad28   Balbir Singh   cgroups: add an o...
966
  	mm_init_owner(mm, p);
2b7e8665b   Eric Biggers   fork: fix incorre...
967
  	RCU_INIT_POINTER(mm->exe_file, NULL);
984cfe4e2   Jason Gunthorpe   mm/mmu_notifier: ...
968
  	mmu_notifier_subscriptions_init(mm);
16af97dc5   Nadav Amit   mm: migrate: prev...
969
  	init_tlb_flush_pending(mm);
41f727fde   Vladimir Davydov   fork/exec: cleanu...
970
971
972
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
  	mm->pmd_huge_pte = NULL;
  #endif
355627f51   Eric Biggers   mm, uprobes: fix ...
973
  	mm_init_uprobes_state(mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
974

a0715cc22   Alex Thorlton   mm, thp: add VM_I...
975
976
977
978
979
  	if (current->mm) {
  		mm->flags = current->mm->flags & MMF_INIT_MASK;
  		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
  	} else {
  		mm->flags = default_dump_filter;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
980
  		mm->def_flags = 0;
a0715cc22   Alex Thorlton   mm, thp: add VM_I...
981
  	}
41f727fde   Vladimir Davydov   fork/exec: cleanu...
982
983
984
985
986
  	if (mm_alloc_pgd(mm))
  		goto fail_nopgd;
  
  	if (init_new_context(p, mm))
  		goto fail_nocontext;
78fb74669   Pavel Emelianov   Memory controller...
987

bfedb5892   Eric W. Biederman   mm: Add a user_ns...
988
  	mm->user_ns = get_user_ns(user_ns);
41f727fde   Vladimir Davydov   fork/exec: cleanu...
989
990
991
992
993
  	return mm;
  
  fail_nocontext:
  	mm_free_pgd(mm);
  fail_nopgd:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
994
995
996
997
998
999
1000
  	free_mm(mm);
  	return NULL;
  }
  
  /*
   * Allocate and initialize an mm_struct.
   */
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1001
  struct mm_struct *mm_alloc(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1002
  {
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1003
  	struct mm_struct *mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1004
1005
  
  	mm = allocate_mm();
de03c72cf   KOSAKI Motohiro   mm: convert mm->c...
1006
1007
1008
1009
  	if (!mm)
  		return NULL;
  
  	memset(mm, 0, sizeof(*mm));
bfedb5892   Eric W. Biederman   mm: Add a user_ns...
1010
  	return mm_init(mm, current, current_user_ns());
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1011
  }
ec8d7c14e   Michal Hocko   mm, oom_reaper: d...
1012
1013
1014
1015
1016
1017
1018
1019
1020
  static inline void __mmput(struct mm_struct *mm)
  {
  	VM_BUG_ON(atomic_read(&mm->mm_users));
  
  	uprobe_clear_state(mm);
  	exit_aio(mm);
  	ksm_exit(mm);
  	khugepaged_exit(mm); /* must run before exit_mmap */
  	exit_mmap(mm);
6fcb52a56   Aaron Lu   thp: reduce usage...
1021
  	mm_put_huge_zero_page(mm);
ec8d7c14e   Michal Hocko   mm, oom_reaper: d...
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
  	set_mm_exe_file(mm, NULL);
  	if (!list_empty(&mm->mmlist)) {
  		spin_lock(&mmlist_lock);
  		list_del(&mm->mmlist);
  		spin_unlock(&mmlist_lock);
  	}
  	if (mm->binfmt)
  		module_put(mm->binfmt->module);
  	mmdrop(mm);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1032
1033
1034
1035
1036
  /*
   * Decrement the use count and release all resources for an mm.
   */
  void mmput(struct mm_struct *mm)
  {
0ae26f1b3   Andrew Morton   [PATCH] mmput() m...
1037
  	might_sleep();
ec8d7c14e   Michal Hocko   mm, oom_reaper: d...
1038
1039
1040
1041
  	if (atomic_dec_and_test(&mm->mm_users))
  		__mmput(mm);
  }
  EXPORT_SYMBOL_GPL(mmput);
a1b2289ce   Sherry Yang   android: binder: ...
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
  #ifdef CONFIG_MMU
  /* Work handler: recover the mm embedding @work and run __mmput() on it. */
  static void mmput_async_fn(struct work_struct *work)
  {
  	struct mm_struct *target;

  	target = container_of(work, struct mm_struct, async_put_work);
  	__mmput(target);
  }
  
  /*
   * Like mmput(), except that when the last mm_users reference is dropped
   * the teardown is handed to a work item rather than run by the caller.
   */
  void mmput_async(struct mm_struct *mm)
  {
  	if (!atomic_dec_and_test(&mm->mm_users))
  		return;

  	INIT_WORK(&mm->async_put_work, mmput_async_fn);
  	schedule_work(&mm->async_put_work);
  }
  #endif
90f31d0ea   Konstantin Khlebnikov   mm: rcu-protected...
1059
1060
1061
1062
1063
  /**
   * set_mm_exe_file - change a reference to the mm's executable file
   *
   * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
   *
6e399cd14   Davidlohr Bueso   prctl: avoid usin...
1064
1065
1066
1067
1068
   * Main users are mmput() and sys_execve(). Callers prevent concurrent
   * invocations: in mmput() nobody alive left, in execve task is single
   * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
   * mm->exe_file, but does so without using set_mm_exe_file() in order
   * to do avoid the need for any locks.
90f31d0ea   Konstantin Khlebnikov   mm: rcu-protected...
1069
   */
386460138   Jiri Slaby   mm: extract exe_f...
1070
1071
  void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
  {
6e399cd14   Davidlohr Bueso   prctl: avoid usin...
1072
1073
1074
1075
1076
1077
1078
1079
  	struct file *old_exe_file;
  
  	/*
  	 * It is safe to dereference the exe_file without RCU as
  	 * this function is only called if nobody else can access
  	 * this mm -- see comment above for justification.
  	 */
  	old_exe_file = rcu_dereference_raw(mm->exe_file);
90f31d0ea   Konstantin Khlebnikov   mm: rcu-protected...
1080

386460138   Jiri Slaby   mm: extract exe_f...
1081
1082
  	if (new_exe_file)
  		get_file(new_exe_file);
90f31d0ea   Konstantin Khlebnikov   mm: rcu-protected...
1083
1084
1085
  	rcu_assign_pointer(mm->exe_file, new_exe_file);
  	if (old_exe_file)
  		fput(old_exe_file);
386460138   Jiri Slaby   mm: extract exe_f...
1086
  }
90f31d0ea   Konstantin Khlebnikov   mm: rcu-protected...
1087
1088
1089
1090
1091
1092
  /**
   * get_mm_exe_file - acquire a reference to the mm's executable file
   *
   * Returns %NULL if mm has no associated executable file.
   * User must release file via fput().
   */
386460138   Jiri Slaby   mm: extract exe_f...
1093
1094
1095
  struct file *get_mm_exe_file(struct mm_struct *mm)
  {
  	struct file *exe_file;
90f31d0ea   Konstantin Khlebnikov   mm: rcu-protected...
1096
1097
1098
1099
1100
  	rcu_read_lock();
  	exe_file = rcu_dereference(mm->exe_file);
  	if (exe_file && !get_file_rcu(exe_file))
  		exe_file = NULL;
  	rcu_read_unlock();
386460138   Jiri Slaby   mm: extract exe_f...
1101
1102
  	return exe_file;
  }
11163348a   Davidlohr Bueso   oprofile: reduce ...
1103
  EXPORT_SYMBOL(get_mm_exe_file);
386460138   Jiri Slaby   mm: extract exe_f...
1104

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1105
  /**
   * get_task_exe_file - acquire a reference to the task's executable file
   * @task: the task to query
   *
   * Returns %NULL if task's mm (if any) has no associated executable file or
   * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
   * User must release file via fput().
   */
  struct file *get_task_exe_file(struct task_struct *task)
  {
  	struct file *exe_file = NULL;
  	struct mm_struct *mm;

  	task_lock(task);
  	mm = task->mm;
  	/* Only a task with an mm that is not a kernel thread has an exe file. */
  	if (mm && !(task->flags & PF_KTHREAD))
  		exe_file = get_mm_exe_file(mm);
  	task_unlock(task);

  	return exe_file;
  }
  EXPORT_SYMBOL(get_task_exe_file);
386460138   Jiri Slaby   mm: extract exe_f...
1127

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1128
1129
1130
  /**
   * get_task_mm - acquire a reference to the task's mm
   *
246bb0b1d   Oleg Nesterov   kill PF_BORROWED_...
1131
   * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
   * this kernel workthread has transiently adopted a user mm with use_mm,
   * to do its AIO) is not set and if so returns a reference to it, after
   * bumping up the use count.  User must release the mm via mmput()
   * after use.  Typically used by /proc and ptrace.
   */
  struct mm_struct *get_task_mm(struct task_struct *task)
  {
  	struct mm_struct *mm;
  
  	task_lock(task);
  	mm = task->mm;
  	if (mm) {
246bb0b1d   Oleg Nesterov   kill PF_BORROWED_...
1144
  		if (task->flags & PF_KTHREAD)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1145
1146
  			mm = NULL;
  		else
3fce371bf   Vegard Nossum   mm: add new mmget...
1147
  			mmget(mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1148
1149
1150
1151
1152
  	}
  	task_unlock(task);
  	return mm;
  }
  EXPORT_SYMBOL_GPL(get_task_mm);
8cdb878dc   Christopher Yeoh   Fix race in proce...
1153
1154
1155
1156
  struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
  {
  	struct mm_struct *mm;
  	int err;
3e74fabd3   Bernd Edlinger   exec: Fix a deadl...
1157
  	err =  mutex_lock_killable(&task->signal->exec_update_mutex);
8cdb878dc   Christopher Yeoh   Fix race in proce...
1158
1159
1160
1161
1162
1163
1164
1165
1166
  	if (err)
  		return ERR_PTR(err);
  
  	mm = get_task_mm(task);
  	if (mm && mm != current->mm &&
  			!ptrace_may_access(task, mode)) {
  		mmput(mm);
  		mm = ERR_PTR(-EACCES);
  	}
3e74fabd3   Bernd Edlinger   exec: Fix a deadl...
1167
  	mutex_unlock(&task->signal->exec_update_mutex);
8cdb878dc   Christopher Yeoh   Fix race in proce...
1168
1169
1170
  
  	return mm;
  }
57b59c4a1   Oleg Nesterov   coredump_wait: do...
1171
  static void complete_vfork_done(struct task_struct *tsk)
c415c3b47   Oleg Nesterov   vfork: introduce ...
1172
  {
d68b46fe1   Oleg Nesterov   vfork: make it ki...
1173
  	struct completion *vfork;
c415c3b47   Oleg Nesterov   vfork: introduce ...
1174

d68b46fe1   Oleg Nesterov   vfork: make it ki...
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
  	task_lock(tsk);
  	vfork = tsk->vfork_done;
  	if (likely(vfork)) {
  		tsk->vfork_done = NULL;
  		complete(vfork);
  	}
  	task_unlock(tsk);
  }
  
  static int wait_for_vfork_done(struct task_struct *child,
  				struct completion *vfork)
  {
  	int killed;
  
  	freezer_do_not_count();
76f969e89   Roman Gushchin   cgroup: cgroup v2...
1190
  	cgroup_enter_frozen();
d68b46fe1   Oleg Nesterov   vfork: make it ki...
1191
  	killed = wait_for_completion_killable(vfork);
76f969e89   Roman Gushchin   cgroup: cgroup v2...
1192
  	cgroup_leave_frozen(false);
d68b46fe1   Oleg Nesterov   vfork: make it ki...
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
  	freezer_count();
  
  	if (killed) {
  		task_lock(child);
  		child->vfork_done = NULL;
  		task_unlock(child);
  	}
  
  	put_task_struct(child);
  	return killed;
c415c3b47   Oleg Nesterov   vfork: introduce ...
1203
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
  /* Please note the differences between mmput and mm_release.
   * mmput is called whenever we stop holding onto a mm_struct,
   * error success whatever.
   *
   * mm_release is called after a mm_struct has been removed
   * from the current process.
   *
   * This difference is important for error handling, when we
   * only half set up a mm_struct for a new process and need to restore
   * the old one.  Because we mmput the new mm_struct before
   * restoring the old one. . .
   * Eric Biederman 10 January 1998
   */
4610ba7ad   Thomas Gleixner   exit/exec: Sepera...
1217
  static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1218
  {
0326f5a94   Srikar Dronamraju   uprobes/core: Han...
1219
  	uprobe_free_utask(tsk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1220
1221
  	/* Get rid of any cached register state */
  	deactivate_mm(tsk, mm);
fec1d0115   Roland McGrath   [PATCH] Disable C...
1222
  	/*
735f2770a   Michal Hocko   kernel/fork: fix ...
1223
1224
1225
  	 * Signal userspace if we're not exiting with a core dump
  	 * because we want to leave the value intact for debugging
  	 * purposes.
fec1d0115   Roland McGrath   [PATCH] Disable C...
1226
  	 */
9c8a8228d   Eric Dumazet   execve: must clea...
1227
  	if (tsk->clear_child_tid) {
735f2770a   Michal Hocko   kernel/fork: fix ...
1228
  		if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
9c8a8228d   Eric Dumazet   execve: must clea...
1229
1230
1231
1232
1233
1234
  		    atomic_read(&mm->mm_users) > 1) {
  			/*
  			 * We don't check the error code - if userspace has
  			 * not set up a proper pointer then tough luck.
  			 */
  			put_user(0, tsk->clear_child_tid);
2de0db992   Dominik Brodowski   mm: use do_futex(...
1235
1236
  			do_futex(tsk->clear_child_tid, FUTEX_WAKE,
  					1, NULL, NULL, 0, 0);
9c8a8228d   Eric Dumazet   execve: must clea...
1237
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1238
  		tsk->clear_child_tid = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1239
  	}
f7505d64f   Konstantin Khlebnikov   fork: call comple...
1240
1241
1242
1243
1244
1245
1246
  
  	/*
  	 * All done, finally we can wake up parent and return this mm to him.
  	 * Also kthread_stop() uses this completion for synchronization.
  	 */
  	if (tsk->vfork_done)
  		complete_vfork_done(tsk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1247
  }
4610ba7ad   Thomas Gleixner   exit/exec: Sepera...
1248
1249
  /*
   * Exit-side wrapper around mm_release(): run futex_exit_release() for
   * @tsk first, then the common mm_release() work for @tsk and @mm.
   */
  void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
  {
  	futex_exit_release(tsk);
  	mm_release(tsk, mm);
  }
  
  /*
   * Exec-side wrapper around mm_release(): run futex_exec_release() for
   * @tsk first, then the common mm_release() work for @tsk and @mm.
   */
  void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
  {
  	futex_exec_release(tsk);
  	mm_release(tsk, mm);
  }
13585fa06   Nadav Amit   fork: Provide a f...
1259
1260
1261
1262
1263
1264
1265
1266
1267
  /**
   * dup_mm() - duplicates an existing mm structure
   * @tsk: the task_struct with which the new mm will be associated.
   * @oldmm: the mm to duplicate.
   *
   * Allocates a new mm structure and duplicates the provided @oldmm structure
   * content into it.
   *
   * Return: the duplicated mm or NULL on failure.
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1268
   */
13585fa06   Nadav Amit   fork: Provide a f...
1269
1270
  static struct mm_struct *dup_mm(struct task_struct *tsk,
  				struct mm_struct *oldmm)
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1271
  {
13585fa06   Nadav Amit   fork: Provide a f...
1272
  	struct mm_struct *mm;
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1273
  	int err;
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1274
1275
1276
1277
1278
  	mm = allocate_mm();
  	if (!mm)
  		goto fail_nomem;
  
  	memcpy(mm, oldmm, sizeof(*mm));
bfedb5892   Eric W. Biederman   mm: Add a user_ns...
1279
  	if (!mm_init(mm, tsk, mm->user_ns))
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1280
  		goto fail_nomem;
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1281
1282
1283
1284
1285
1286
  	err = dup_mmap(mm, oldmm);
  	if (err)
  		goto free_pt;
  
  	mm->hiwater_rss = get_mm_rss(mm);
  	mm->hiwater_vm = mm->total_vm;
801460d0c   Hiroshi Shimamoto   task_struct clean...
1287
1288
  	if (mm->binfmt && !try_module_get(mm->binfmt->module))
  		goto free_pt;
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1289
1290
1291
  	return mm;
  
  free_pt:
801460d0c   Hiroshi Shimamoto   task_struct clean...
1292
1293
  	/* don't put binfmt in mmput, we haven't got module yet */
  	mm->binfmt = NULL;
c3f3ce049   Andrea Arcangeli   userfaultfd: use ...
1294
  	mm_init_owner(mm, NULL);
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1295
1296
1297
1298
  	mmput(mm);
  
  fail_nomem:
  	return NULL;
a0a7ec308   JANAK DESAI   [PATCH] unshare s...
1299
  }
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1300
  static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1301
  {
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1302
  	struct mm_struct *mm, *oldmm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1303
1304
1305
1306
  	int retval;
  
  	tsk->min_flt = tsk->maj_flt = 0;
  	tsk->nvcsw = tsk->nivcsw = 0;
17406b82d   Mandeep Singh Baines   softlockup: remov...
1307
1308
  #ifdef CONFIG_DETECT_HUNG_TASK
  	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
a2e514453   Dmitry Vyukov   kernel/hung_task....
1309
  	tsk->last_switch_time = 0;
17406b82d   Mandeep Singh Baines   softlockup: remov...
1310
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
  
  	tsk->mm = NULL;
  	tsk->active_mm = NULL;
  
  	/*
  	 * Are we cloning a kernel thread?
  	 *
  	 * We need to steal a active VM for that..
  	 */
  	oldmm = current->mm;
  	if (!oldmm)
  		return 0;
615d6e875   Davidlohr Bueso   mm: per-thread vm...
1323
1324
  	/* initialize the new vmacache entries */
  	vmacache_flush(tsk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1325
  	if (clone_flags & CLONE_VM) {
3fce371bf   Vegard Nossum   mm: add new mmget...
1326
  		mmget(oldmm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1327
  		mm = oldmm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1328
1329
1330
1331
  		goto good_mm;
  	}
  
  	retval = -ENOMEM;
13585fa06   Nadav Amit   fork: Provide a f...
1332
  	mm = dup_mm(tsk, current->mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1333
1334
  	if (!mm)
  		goto fail_nomem;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1335
1336
1337
1338
  good_mm:
  	tsk->mm = mm;
  	tsk->active_mm = mm;
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1339
1340
  fail_nomem:
  	return retval;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1341
  }
a39bc5169   Alexey Dobriyan   Uninline fork.c/e...
1342
  static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1343
  {
498052bba   Al Viro   New locking/refco...
1344
  	struct fs_struct *fs = current->fs;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1345
  	if (clone_flags & CLONE_FS) {
498052bba   Al Viro   New locking/refco...
1346
  		/* tsk->fs is already what we want */
2a4419b5b   Nick Piggin   fs: fs_struct rwl...
1347
  		spin_lock(&fs->lock);
498052bba   Al Viro   New locking/refco...
1348
  		if (fs->in_exec) {
2a4419b5b   Nick Piggin   fs: fs_struct rwl...
1349
  			spin_unlock(&fs->lock);
498052bba   Al Viro   New locking/refco...
1350
1351
1352
  			return -EAGAIN;
  		}
  		fs->users++;
2a4419b5b   Nick Piggin   fs: fs_struct rwl...
1353
  		spin_unlock(&fs->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1354
1355
  		return 0;
  	}
498052bba   Al Viro   New locking/refco...
1356
  	tsk->fs = copy_fs_struct(fs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1357
1358
1359
1360
  	if (!tsk->fs)
  		return -ENOMEM;
  	return 0;
  }
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1361
  static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
a016f3389   JANAK DESAI   [PATCH] unshare s...
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
  {
  	struct files_struct *oldf, *newf;
  	int error = 0;
  
  	/*
  	 * A background process may not have any files ...
  	 */
  	oldf = current->files;
  	if (!oldf)
  		goto out;
  
  	if (clone_flags & CLONE_FILES) {
  		atomic_inc(&oldf->count);
  		goto out;
  	}
60997c3d4   Christian Brauner   close_range: add ...
1377
  	newf = dup_fd(oldf, NR_OPEN_MAX, &error);
a016f3389   JANAK DESAI   [PATCH] unshare s...
1378
1379
1380
1381
1382
1383
1384
1385
  	if (!newf)
  		goto out;
  
  	tsk->files = newf;
  	error = 0;
  out:
  	return error;
  }
fadad878c   Jens Axboe   kernel: add CLONE...
1386
  /*
   * Set up the io_context of the new task @tsk (CONFIG_BLOCK only).
   *
   * With CLONE_IO the parent's io_context is linked and shared.  Otherwise,
   * if the parent has a valid ioprio, a context is obtained for the child
   * and the priority is copied into it.  Returns 0, or -ENOMEM if no
   * context could be obtained for the child.
   */
  static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
  {
  #ifdef CONFIG_BLOCK
  	struct io_context *parent_ioc = current->io_context;
  	struct io_context *child_ioc;
  
  	if (!parent_ioc)
  		return 0;
  
  	/* Share the io context with the parent if CLONE_IO is set. */
  	if (clone_flags & CLONE_IO) {
  		ioc_task_link(parent_ioc);
  		tsk->io_context = parent_ioc;
  		return 0;
  	}
  
  	/* Nothing to inherit unless the parent has a valid priority. */
  	if (!ioprio_valid(parent_ioc->ioprio))
  		return 0;
  
  	child_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
  	if (unlikely(!child_ioc))
  		return -ENOMEM;
  
  	child_ioc->ioprio = parent_ioc->ioprio;
  	put_io_context(child_ioc);
  #endif
  	return 0;
  }
a39bc5169   Alexey Dobriyan   Uninline fork.c/e...
1410
  static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1411
1412
  {
  	struct sighand_struct *sig;
60348802e   Zhaolei   fork.c: cleanup f...
1413
  	if (clone_flags & CLONE_SIGHAND) {
d036bda7d   Elena Reshetova   sched/core: Conve...
1414
  		refcount_inc(&current->sighand->count);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1415
1416
1417
  		return 0;
  	}
  	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
0c282b068   Madhuparna Bhowmik   fork: Use RCU_INI...
1418
  	RCU_INIT_POINTER(tsk->sighand, sig);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1419
1420
  	if (!sig)
  		return -ENOMEM;
9d7fb0427   Peter Zijlstra   sched/cputime: Gu...
1421

d036bda7d   Elena Reshetova   sched/core: Conve...
1422
  	refcount_set(&sig->count, 1);
06e62a46b   Jann Horn   fork: don't copy ...
1423
  	spin_lock_irq(&current->sighand->siglock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1424
  	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
06e62a46b   Jann Horn   fork: don't copy ...
1425
  	spin_unlock_irq(&current->sighand->siglock);
b612e5df4   Christian Brauner   clone3: add CLONE...
1426
1427
1428
1429
  
  	/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
  	if (clone_flags & CLONE_CLEAR_SIGHAND)
  		flush_signal_handlers(tsk, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1430
1431
  	return 0;
  }
a7e5328a0   Oleg Nesterov   [PATCH] cleanup _...
1432
  void __cleanup_sighand(struct sighand_struct *sighand)
c81addc9d   Oleg Nesterov   [PATCH] rename __...
1433
  {
d036bda7d   Elena Reshetova   sched/core: Conve...
1434
  	if (refcount_dec_and_test(&sighand->count)) {
d80e731ec   Oleg Nesterov   epoll: introduce ...
1435
  		signalfd_cleanup(sighand);
392809b25   Oleg Nesterov   signal: Document ...
1436
  		/*
5f0d5a3ae   Paul E. McKenney   mm: Rename SLAB_D...
1437
  		 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
392809b25   Oleg Nesterov   signal: Document ...
1438
1439
  		 * without an RCU grace period, see __lock_task_sighand().
  		 */
c81addc9d   Oleg Nesterov   [PATCH] rename __...
1440
  		kmem_cache_free(sighand_cachep, sighand);
d80e731ec   Oleg Nesterov   epoll: introduce ...
1441
  	}
c81addc9d   Oleg Nesterov   [PATCH] rename __...
1442
  }
f06febc96   Frank Mayhar   timers: fix itime...
1443
1444
1445
1446
1447
  /*
   * Initialize POSIX timer handling for a thread group.
   */
  static void posix_cpu_timers_init_group(struct signal_struct *sig)
  {
2b69942f9   Thomas Gleixner   posix-cpu-timers:...
1448
  	struct posix_cputimers *pct = &sig->posix_cputimers;
78d7d407b   Jiri Slaby   kernel core: use ...
1449
  	unsigned long cpu_limit;
316c1608d   Jason Low   sched, timer: Con...
1450
  	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
3a245c0f1   Thomas Gleixner   posix-cpu-timers:...
1451
  	posix_cputimers_group_init(pct, cpu_limit);
f06febc96   Frank Mayhar   timers: fix itime...
1452
  }
a39bc5169   Alexey Dobriyan   Uninline fork.c/e...
1453
  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1454
1455
  {
  	struct signal_struct *sig;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1456

4ab6c0833   Oleg Nesterov   clone(): fix race...
1457
  	if (clone_flags & CLONE_THREAD)
490dea45d   Peter Zijlstra   itimers: remove t...
1458
  		return 0;
490dea45d   Peter Zijlstra   itimers: remove t...
1459

a56704ef6   Veaceslav Falico   copy_signal() cle...
1460
  	sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1461
1462
1463
  	tsk->signal = sig;
  	if (!sig)
  		return -ENOMEM;
b3ac022cb   Oleg Nesterov   proc: turn signal...
1464
  	sig->nr_threads = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1465
  	atomic_set(&sig->live, 1);
60d4de3ff   Elena Reshetova   sched/core: Conve...
1466
  	refcount_set(&sig->sigcnt, 1);
0c740d0af   Oleg Nesterov   introduce for_eac...
1467
1468
1469
1470
  
  	/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
  	sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
  	tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1471
  	init_waitqueue_head(&sig->wait_chldexit);
db51aeccd   Oleg Nesterov   signals: microopt...
1472
  	sig->curr_target = tsk;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1473
  	init_sigpending(&sig->shared_pending);
c3ad2c3b0   Eric W. Biederman   signal: Don't res...
1474
  	INIT_HLIST_HEAD(&sig->multiprocess);
e78c34967   Rik van Riel   time, signal: Pro...
1475
  	seqlock_init(&sig->stats_lock);
9d7fb0427   Peter Zijlstra   sched/cputime: Gu...
1476
  	prev_cputime_init(&sig->prev_cputime);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1477

baa73d9e4   Nicolas Pitre   posix-timers: Mak...
1478
  #ifdef CONFIG_POSIX_TIMERS
b18b6a9ce   Nicolas Pitre   timers: Omit POSI...
1479
  	INIT_LIST_HEAD(&sig->posix_timers);
c9cb2e3d7   Thomas Gleixner   [PATCH] hrtimers:...
1480
  	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1481
  	sig->real_timer.function = it_real_fn;
baa73d9e4   Nicolas Pitre   posix-timers: Mak...
1482
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1483

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1484
1485
1486
  	task_lock(current->group_leader);
  	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
  	task_unlock(current->group_leader);
6279a751f   Oleg Nesterov   posix-timers: fix...
1487
  	posix_cpu_timers_init_group(sig);
522ed7767   Miloslav Trmac   Audit: add TTY in...
1488
  	tty_audit_fork(sig);
5091faa44   Mike Galbraith   sched: Add 'autog...
1489
  	sched_autogroup_fork(sig);
522ed7767   Miloslav Trmac   Audit: add TTY in...
1490

a63d83f42   David Rientjes   oom: badness heur...
1491
  	sig->oom_score_adj = current->signal->oom_score_adj;
dabb16f63   Mandeep Singh Baines   oom: allow a non-...
1492
  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
28b83c519   KOSAKI Motohiro   oom: move oom_adj...
1493

9b1bf12d5   KOSAKI Motohiro   signals: move cre...
1494
  	mutex_init(&sig->cred_guard_mutex);
eea967325   Eric W. Biederman   exec: Add exec_up...
1495
  	mutex_init(&sig->exec_update_mutex);
9b1bf12d5   KOSAKI Motohiro   signals: move cre...
1496

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1497
1498
  	return 0;
  }
dbd952127   Kees Cook   seccomp: introduc...
1499
1500
1501
1502
1503
1504
1505
1506
1507
  /*
   * Propagate the parent's seccomp state to the new task @p
   * (CONFIG_SECCOMP only).
   */
  static void copy_seccomp(struct task_struct *p)
  {
  #ifdef CONFIG_SECCOMP
  	/*
  	 * The caller must hold current->sighand->siglock, which is shared
  	 * by every thread in the group.  cred_guard_mutex is not needed:
  	 * the new task is not running yet and cannot be racing exec.
  	 */
  	assert_spin_locked(&current->sighand->siglock);
  
  	/* Take a filter reference for the new user, then assign the state. */
  	get_seccomp_filter(current);
  	p->seccomp = current->seccomp;
  
  	/*
  	 * no_new_privs may have been set on the parent after the
  	 * task_struct was duplicated but before the sighand lock was
  	 * taken.  Set it explicitly so that seccomp state and nnp stay
  	 * in sync.
  	 */
  	if (task_no_new_privs(current))
  		task_set_no_new_privs(p);
  
  	/*
  	 * Likewise, if the parent gained a seccomp mode after the thread
  	 * flags were copied but before the sighand lock was taken, the
  	 * seccomp thread flag has to be enabled here by hand.
  	 */
  	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
  		set_tsk_thread_flag(p, TIF_SECCOMP);
  #endif
  }
17da2bd90   Heiko Carstens   [CVE-2009-0029] S...
1531
  /*
   * set_tid_address() - record @tidptr as the calling task's
   * clear_child_tid address (the word that mm_release() zeroes and wakes
   * a futex waiter on).
   *
   * Returns the value of task_pid_vnr() for the caller.
   */
  SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
  {
  	current->clear_child_tid = tidptr;
  
  	return task_pid_vnr(current);
  }
a39bc5169   Alexey Dobriyan   Uninline fork.c/e...
1536
  static void rt_mutex_init_task(struct task_struct *p)
23f78d4a0   Ingo Molnar   [PATCH] pi-futex:...
1537
  {
1d6154825   Thomas Gleixner   sched: Convert pi...
1538
  	raw_spin_lock_init(&p->pi_lock);
e29e175b0   Zilvinas Valinskas   [PATCH] initialis...
1539
  #ifdef CONFIG_RT_MUTEXES
a23ba907d   Davidlohr Bueso   locking/rtmutex: ...
1540
  	p->pi_waiters = RB_ROOT_CACHED;
e96a7705e   Xunlei Pang   sched/rtmutex/dea...
1541
  	p->pi_top_task = NULL;
23f78d4a0   Ingo Molnar   [PATCH] pi-futex:...
1542
  	p->pi_blocked_on = NULL;
23f78d4a0   Ingo Molnar   [PATCH] pi-futex:...
1543
1544
  #endif
  }
2c4704756   Eric W. Biederman   pids: Move the pg...
1545
1546
1547
1548
1549
1550
1551
1552
  static inline void init_task_pid_links(struct task_struct *task)
  {
  	enum pid_type type;
  
  	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
  		INIT_HLIST_NODE(&task->pid_links[type]);
  	}
  }
819077398   Oleg Nesterov   kernel/fork.c:cop...
1553
1554
1555
  static inline void
  init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
  {
2c4704756   Eric W. Biederman   pids: Move the pg...
1556
1557
1558
1559
  	if (type == PIDTYPE_PID)
  		task->thread_pid = pid;
  	else
  		task->signal->pids[type] = pid;
819077398   Oleg Nesterov   kernel/fork.c:cop...
1560
  }
6bfbaa51e   Ingo Molnar   sched/headers, RC...
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
  /*
   * Reset the per-task RCU bookkeeping of the new task @p.  Each group of
   * fields is only present, and only reset, when the corresponding RCU
   * flavor is configured.
   */
  static inline void rcu_copy_process(struct task_struct *p)
  {
  #ifdef CONFIG_PREEMPT_RCU
  	/* Preemptible RCU: no read-side nesting, not queued on any node. */
  	p->rcu_read_lock_nesting = 0;
  	p->rcu_read_unlock_special.s = 0;
  	p->rcu_blocked_node = NULL;
  	INIT_LIST_HEAD(&p->rcu_node_entry);
  #endif /* #ifdef CONFIG_PREEMPT_RCU */
  #ifdef CONFIG_TASKS_RCU
  	/* Tasks RCU: not a holdout. */
  	p->rcu_tasks_holdout = false;
  	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
  	p->rcu_tasks_idle_cpu = -1;
  #endif /* #ifdef CONFIG_TASKS_RCU */
  #ifdef CONFIG_TASKS_TRACE_RCU
  	/* Tasks Trace RCU: no reader nesting, empty holdout list. */
  	p->trc_reader_nesting = 0;
  	p->trc_reader_special.s = 0;
  	INIT_LIST_HEAD(&p->trc_holdout_list);
  #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
  }
3695eae5f   Christian Brauner   pidfd: add P_PIDF...
1580
1581
1582
1583
1584
1585
1586
  /*
   * Return the struct pid stored in @file's private_data if @file is a
   * pidfd, i.e. its f_op is pidfd_fops; otherwise return ERR_PTR(-EBADF).
   */
  struct pid *pidfd_pid(const struct file *file)
  {
  	if (file->f_op != &pidfd_fops)
  		return ERR_PTR(-EBADF);
  
  	return file->private_data;
  }
b3e583825   Christian Brauner   clone: add CLONE_...
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
  /*
   * ->release() for pidfd files: detach the struct pid from @file and drop
   * the reference on it.  @inode is unused.  Always returns 0.
   */
  static int pidfd_release(struct inode *inode, struct file *file)
  {
  	struct pid *held = file->private_data;
  
  	/* Clear the pointer first, then put the reference. */
  	file->private_data = NULL;
  	put_pid(held);
  
  	return 0;
  }
  
  #ifdef CONFIG_PROC_FS
15d42eb26   Christian Kellner   pidfd: add NSpid ...
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
  /**
   * pidfd_show_fdinfo - print information about a pidfd
   * @m: proc fdinfo file
   * @f: file referencing a pidfd
   *
   * Pid:
   * This function will print the pid that a given pidfd refers to in the
   * pid namespace of the procfs instance.
   * If the pid namespace of the process is not a descendant of the pid
   * namespace of the procfs instance 0 will be shown as its pid. This is
   * similar to calling getppid() on a process whose parent is outside of
   * its pid namespace.
   *
   * NSpid:
   * If pid namespaces are supported then this function will also print
   * the pid that a given pidfd refers to for all descendant pid namespaces
   * starting from the current pid namespace of the instance, i.e. the
   * Pid field and the first entry in the NSpid field will be identical.
   * If the pid namespace of the process is not a descendant of the pid
   * namespace of the procfs instance 0 will be shown as its first NSpid
   * entry and no others will be shown.
   * Note that this differs from the Pid and NSpid fields in
   * /proc/<pid>/status where Pid and NSpid are always shown relative to
   * the  pid namespace of the procfs instance. The difference becomes
   * obvious when sending around a pidfd between pid namespaces from a
   * different branch of the tree, i.e. where no ancestral relation is
   * present between the pid namespaces:
   * - create two new pid namespaces ns1 and ns2 in the initial pid
   *   namespace (also take care to create new mount namespaces in the
   *   new pid namespace and mount procfs)
   * - create a process with a pidfd in ns1
   * - send pidfd from ns1 to ns2
   * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
   *   have exactly one entry, which is 0
   */
b3e583825   Christian Brauner   clone: add CLONE_...
1632
1633
  static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
  {
b3e583825   Christian Brauner   clone: add CLONE_...
1634
  	struct pid *pid = f->private_data;
3d6d8da48   Christian Brauner   pidfd: check pid ...
1635
1636
  	struct pid_namespace *ns;
  	pid_t nr = -1;
15d42eb26   Christian Kellner   pidfd: add NSpid ...
1637

3d6d8da48   Christian Brauner   pidfd: check pid ...
1638
  	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
9d78edeae   Alexey Gladkov   proc: proc_pid_ns...
1639
  		ns = proc_pid_ns(file_inode(m->file)->i_sb);
3d6d8da48   Christian Brauner   pidfd: check pid ...
1640
1641
1642
1643
  		nr = pid_nr_ns(pid, ns);
  	}
  
  	seq_put_decimal_ll(m, "Pid:\t", nr);
b3e583825   Christian Brauner   clone: add CLONE_...
1644

15d42eb26   Christian Kellner   pidfd: add NSpid ...
1645
  #ifdef CONFIG_PID_NS
3d6d8da48   Christian Brauner   pidfd: check pid ...
1646
1647
1648
  	seq_put_decimal_ll(m, "
  NSpid:\t", nr);
  	if (nr > 0) {
15d42eb26   Christian Kellner   pidfd: add NSpid ...
1649
  		int i;
b3e583825   Christian Brauner   clone: add CLONE_...
1650

15d42eb26   Christian Kellner   pidfd: add NSpid ...
1651
1652
1653
1654
1655
1656
  		/* If nr is non-zero it means that 'pid' is valid and that
  		 * ns, i.e. the pid namespace associated with the procfs
  		 * instance, is in the pid namespace hierarchy of pid.
  		 * Start at one below the already printed level.
  		 */
  		for (i = ns->level + 1; i <= pid->level; i++)
3d6d8da48   Christian Brauner   pidfd: check pid ...
1657
  			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
15d42eb26   Christian Kellner   pidfd: add NSpid ...
1658
1659
  	}
  #endif
b3e583825   Christian Brauner   clone: add CLONE_...
1660
1661
1662
1663
  	seq_putc(m, '
  ');
  }
  #endif
b53b0b9d9   Joel Fernandes (Google)   pidfd: add pollin...
1664
1665
1666
  /*
   * Poll support for process exit notification.
   */
9e77716a7   Luc Van Oostenryck   fork: fix pidfd_p...
1667
  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
b53b0b9d9   Joel Fernandes (Google)   pidfd: add pollin...
1668
  {
b53b0b9d9   Joel Fernandes (Google)   pidfd: add pollin...
1669
  	struct pid *pid = file->private_data;
9e77716a7   Luc Van Oostenryck   fork: fix pidfd_p...
1670
  	__poll_t poll_flags = 0;
b53b0b9d9   Joel Fernandes (Google)   pidfd: add pollin...
1671
1672
  
  	poll_wait(file, &pid->wait_pidfd, pts);
b53b0b9d9   Joel Fernandes (Google)   pidfd: add pollin...
1673
1674
1675
1676
1677
  	/*
  	 * Inform pollers only when the whole thread group exits.
  	 * If the thread group leader exits before all other threads in the
  	 * group, then poll(2) should block, similar to the wait(2) family.
  	 */
38fd525a4   Eric W. Biederman   exit: Factor thre...
1678
  	if (thread_group_exited(pid))
9e77716a7   Luc Van Oostenryck   fork: fix pidfd_p...
1679
  		poll_flags = EPOLLIN | EPOLLRDNORM;
b53b0b9d9   Joel Fernandes (Google)   pidfd: add pollin...
1680
1681
1682
  
  	return poll_flags;
  }
b3e583825   Christian Brauner   clone: add CLONE_...
1683
1684
  const struct file_operations pidfd_fops = {
  	.release = pidfd_release,
b53b0b9d9   Joel Fernandes (Google)   pidfd: add pollin...
1685
  	.poll = pidfd_poll,
b3e583825   Christian Brauner   clone: add CLONE_...
1686
1687
1688
1689
  #ifdef CONFIG_PROC_FS
  	.show_fdinfo = pidfd_show_fdinfo,
  #endif
  };
c3f3ce049   Andrea Arcangeli   userfaultfd: use ...
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
  /*
   * RCU callback queued by delayed_free_task(): recover the task_struct
   * that embeds @rhp as its 'rcu' member and free it.
   */
  static void __delayed_free_task(struct rcu_head *rhp)
  {
  	free_task(container_of(rhp, struct task_struct, rcu));
  }
  
  /*
   * Free @tsk: immediately when CONFIG_MEMCG is off, otherwise from an
   * RCU callback.
   * NOTE(review): presumably memcg code may still reach the task under
   * RCU protection - confirm against the commit that introduced this.
   */
  static __always_inline void delayed_free_task(struct task_struct *tsk)
  {
  	if (!IS_ENABLED(CONFIG_MEMCG)) {
  		free_task(tsk);
  		return;
  	}
  
  	call_rcu(&tsk->rcu, __delayed_free_task);
  }
67197a4f2   Suren Baghdasaryan   mm, oom_adj: don'...
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
  /*
   * Mark the mm of @tsk as shared between processes and refresh the
   * child's OOM score adjustments from current.
   *
   * This only applies to a child that has an mm and was created with
   * CLONE_VM but without CLONE_THREAD or CLONE_VFORK; every other case
   * returns without doing anything.
   */
  static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
  {
  	const u64 share_mask = CLONE_VM | CLONE_THREAD | CLONE_VFORK;
  
  	/*
  	 * Skip kernel threads (no mm), threads, vfork children and
  	 * children that do not share the VM at all.
  	 */
  	if (!tsk->mm || (clone_flags & share_mask) != CLONE_VM)
  		return;
  
  	/* oom_adj_mutex synchronizes this with __set_oom_adj. */
  	mutex_lock(&oom_adj_mutex);
  	set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
  	/* Re-read the values in case they changed after copy_signal. */
  	tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
  	tsk->signal->oom_score_adj = current->signal->oom_score_adj;
  	mutex_unlock(&oom_adj_mutex);
  }
f06febc96   Frank Mayhar   timers: fix itime...
1722
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1723
1724
1725
1726
1727
1728
1729
   * This creates a new process as a copy of the old one,
   * but does not actually start it yet.
   *
   * It copies the registers, and all the appropriate
   * parts of the process environment (as per the clone
   * flags). The actual kick-off is left to the caller.
   */
0766f788e   Emese Revfy   latent_entropy: M...
1730
  static __latent_entropy struct task_struct *copy_process(
09a05394f   Roland McGrath   tracehook: clone
1731
  					struct pid *pid,
3033f14ab   Josh Triplett   clone: support pa...
1732
  					int trace,
7f192e3cd   Christian Brauner   fork: add clone3
1733
1734
  					int node,
  					struct kernel_clone_args *args)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1735
  {
b3e583825   Christian Brauner   clone: add CLONE_...
1736
  	int pidfd = -1, retval;
a24efe62d   Mariusz Kozlowski   kernel/fork.c: re...
1737
  	struct task_struct *p;
c3ad2c3b0   Eric W. Biederman   signal: Don't res...
1738
  	struct multiprocess_signals delayed;
6fd2fe494   Al Viro   copy_process(): d...
1739
  	struct file *pidfile = NULL;
7f192e3cd   Christian Brauner   fork: add clone3
1740
  	u64 clone_flags = args->flags;
769071ac9   Andrei Vagin   ns: Introduce Tim...
1741
  	struct nsproxy *nsp = current->nsproxy;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1742

667b60946   Marcos Paulo de Souza   kernel/fork.c: ad...
1743
1744
1745
1746
  	/*
  	 * Don't allow sharing the root directory with processes in a different
  	 * namespace
  	 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1747
1748
  	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
  		return ERR_PTR(-EINVAL);
e66eded83   Eric W. Biederman   userns: Don't all...
1749
1750
  	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
  		return ERR_PTR(-EINVAL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
  	/*
  	 * Thread groups must share signals as well, and detached threads
  	 * can only be started up within the thread group.
  	 */
  	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
  		return ERR_PTR(-EINVAL);
  
  	/*
  	 * Shared signal handlers imply shared VM. By way of the above,
  	 * thread groups also imply shared VM. Blocking this case allows
  	 * for various simplifications in other code.
  	 */
  	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
  		return ERR_PTR(-EINVAL);
123be07b0   Sukadev Bhattiprolu   fork(): disable C...
1765
1766
1767
1768
1769
1770
1771
1772
1773
  	/*
  	 * Siblings of global init remain as zombies on exit since they are
  	 * not reaped by their parent (swapper). To solve this and to avoid
  	 * multi-rooted process trees, prevent global and container-inits
  	 * from creating siblings.
  	 */
  	if ((clone_flags & CLONE_PARENT) &&
  				current->signal->flags & SIGNAL_UNKILLABLE)
  		return ERR_PTR(-EINVAL);
8382fcac1   Eric W. Biederman   pidns: Outlaw thr...
1774
  	/*
40a0d32d1   Oleg Nesterov   fork: unify and t...
1775
  	 * If the new process will be in a different pid or user namespace
faf00da54   Eric W. Biederman   userns,pidns: For...
1776
  	 * do not allow it to share a thread group with the forking task.
8382fcac1   Eric W. Biederman   pidns: Outlaw thr...
1777
  	 */
faf00da54   Eric W. Biederman   userns,pidns: For...
1778
  	if (clone_flags & CLONE_THREAD) {
40a0d32d1   Oleg Nesterov   fork: unify and t...
1779
  		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
769071ac9   Andrei Vagin   ns: Introduce Tim...
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
  		    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
  			return ERR_PTR(-EINVAL);
  	}
  
  	/*
  	 * If the new process will be in a different time namespace
  	 * do not allow it to share VM or a thread group with the forking task.
  	 */
  	if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
  		if (nsp->time_ns != nsp->time_ns_for_children)
40a0d32d1   Oleg Nesterov   fork: unify and t...
1790
1791
  			return ERR_PTR(-EINVAL);
  	}
8382fcac1   Eric W. Biederman   pidns: Outlaw thr...
1792

b3e583825   Christian Brauner   clone: add CLONE_...
1793
  	if (clone_flags & CLONE_PIDFD) {
b3e583825   Christian Brauner   clone: add CLONE_...
1794
  		/*
b3e583825   Christian Brauner   clone: add CLONE_...
1795
1796
1797
1798
  		 * - CLONE_DETACHED is blocked so that we can potentially
  		 *   reuse it later for CLONE_PIDFD.
  		 * - CLONE_THREAD is blocked until someone really needs it.
  		 */
7f192e3cd   Christian Brauner   fork: add clone3
1799
  		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
b3e583825   Christian Brauner   clone: add CLONE_...
1800
  			return ERR_PTR(-EINVAL);
b3e583825   Christian Brauner   clone: add CLONE_...
1801
  	}
c3ad2c3b0   Eric W. Biederman   signal: Don't res...
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
  	/*
  	 * Force any signals received before this point to be delivered
  	 * before the fork happens.  Collect up signals sent to multiple
  	 * processes that happen during the fork and delay them so that
  	 * they appear to happen after the fork.
  	 */
  	sigemptyset(&delayed.signal);
  	INIT_HLIST_NODE(&delayed.node);
  
  	spin_lock_irq(&current->sighand->siglock);
  	if (!(clone_flags & CLONE_THREAD))
  		hlist_add_head(&delayed.node, &current->signal->multiprocess);
  	recalc_sigpending();
  	spin_unlock_irq(&current->sighand->siglock);
  	retval = -ERESTARTNOINTR;
  	if (signal_pending(current))
  		goto fork_out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1819
  	retval = -ENOMEM;
725fc629f   Andi Kleen   kernek/fork.c: al...
1820
  	p = dup_task_struct(current, node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1821
1822
  	if (!p)
  		goto fork_out;
4d6501dce   Vegard Nossum   kthread: Fix use-...
1823
1824
1825
1826
1827
1828
  	/*
  	 * This _must_ happen before we call free_task(), i.e. before we jump
  	 * to any of the bad_fork_* labels. This is to avoid freeing
  	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
  	 * kernel threads (PF_KTHREAD).
  	 */
7f192e3cd   Christian Brauner   fork: add clone3
1829
  	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
4d6501dce   Vegard Nossum   kthread: Fix use-...
1830
1831
1832
  	/*
  	 * Clear TID on mm_release()?
  	 */
7f192e3cd   Christian Brauner   fork: add clone3
1833
  	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
4d6501dce   Vegard Nossum   kthread: Fix use-...
1834

f7e8b616e   Steven Rostedt   function-graph: m...
1835
  	ftrace_graph_init_task(p);
bea493a03   Peter Zijlstra   [PATCH] rt-mutex:...
1836
  	rt_mutex_init_task(p);
a21ee6055   Peter Zijlstra   lockdep: Change h...
1837
  	lockdep_assert_irqs_enabled();
d12c1a379   Ingo Molnar   lockdep: fix kern...
1838
  #ifdef CONFIG_PROVE_LOCKING
de30a2b35   Ingo Molnar   [PATCH] lockdep: ...
1839
1840
  	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1841
  	retval = -EAGAIN;
3b11a1dec   David Howells   CRED: Differentia...
1842
  	if (atomic_read(&p->real_cred->user->processes) >=
78d7d407b   Jiri Slaby   kernel core: use ...
1843
  			task_rlimit(p, RLIMIT_NPROC)) {
b57922b6c   Eric Paris   fork: reorder per...
1844
1845
  		if (p->real_cred->user != INIT_USER &&
  		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1846
1847
  			goto bad_fork_free;
  	}
72fa59970   Vasiliy Kulikov   move RLIMIT_NPROC...
1848
  	current->flags &= ~PF_NPROC_EXCEEDED;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1849

f1752eec6   David Howells   CRED: Detach the ...
1850
1851
1852
  	retval = copy_creds(p, clone_flags);
  	if (retval < 0)
  		goto bad_fork_free;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1853
1854
1855
1856
1857
1858
  
  	/*
  	 * If multiple threads are within copy_process(), then this check
  	 * triggers too late. This doesn't hurt, the check is only there
  	 * to stop root fork bombs.
  	 */
04ec93fe9   Li Zefan   fork.c: fix NULL ...
1859
  	retval = -EAGAIN;
c17d1a3a8   Weilong Chen   fork: annotate da...
1860
  	if (data_race(nr_threads >= max_threads))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1861
  		goto bad_fork_cleanup_count;
ca74e92b4   Shailabh Nagar   [PATCH] per-task-...
1862
  	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
c1de45ca8   Peter Zijlstra   sched/idle: Add s...
1863
  	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
514ddb446   David Rientjes   fork: collapse co...
1864
  	p->flags |= PF_FORKNOEXEC;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1865
1866
  	INIT_LIST_HEAD(&p->children);
  	INIT_LIST_HEAD(&p->sibling);
f41d911f8   Paul E. McKenney   rcu: Merge preemp...
1867
  	rcu_copy_process(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1868
1869
  	p->vfork_done = NULL;
  	spin_lock_init(&p->alloc_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1870

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1871
  	init_sigpending(&p->pending);
648616343   Martin Schwidefsky   [S390] cputime: a...
1872
  	p->utime = p->stime = p->gtime = 0;
40565b5ae   Stanislaw Gruszka   sched/cputime, po...
1873
  #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
648616343   Martin Schwidefsky   [S390] cputime: a...
1874
  	p->utimescaled = p->stimescaled = 0;
40565b5ae   Stanislaw Gruszka   sched/cputime, po...
1875
  #endif
9d7fb0427   Peter Zijlstra   sched/cputime: Gu...
1876
  	prev_cputime_init(&p->prev_cputime);
6a61671bb   Frederic Weisbecker   cputime: Safely r...
1877
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
bac5b6b6b   Frederic Weisbecker   sched/cputime: Mo...
1878
1879
1880
  	seqcount_init(&p->vtime.seqcount);
  	p->vtime.starttime = 0;
  	p->vtime.state = VTIME_INACTIVE;
6a61671bb   Frederic Weisbecker   cputime: Safely r...
1881
  #endif
0f2122045   Jens Axboe   io_uring: don't r...
1882
1883
1884
  #ifdef CONFIG_IO_URING
  	p->io_uring = NULL;
  #endif
a3a2e76c7   KAMEZAWA Hiroyuki   mm: avoid null-po...
1885
1886
1887
  #if defined(SPLIT_RSS_COUNTING)
  	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
  #endif
172ba844a   Balbir Singh   sched: update del...
1888

6976675d9   Arjan van de Ven   hrtimer: create a...
1889
  	p->default_timer_slack_ns = current->timer_slack_ns;
eb414681d   Johannes Weiner   psi: pressure sta...
1890
1891
1892
  #ifdef CONFIG_PSI
  	p->psi_flags = 0;
  #endif
5995477ab   Andrea Righi   task IO accountin...
1893
  	task_io_accounting_init(&p->ioac);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1894
  	acct_clear_integrals(p);
3a245c0f1   Thomas Gleixner   posix-cpu-timers:...
1895
  	posix_cputimers_init(&p->posix_cputimers);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1896

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1897
  	p->io_context = NULL;
c0b0ae8a8   Richard Guy Briggs   audit: use inline...
1898
  	audit_set_context(p, NULL);
b4f48b636   Paul Menage   Task Control Grou...
1899
  	cgroup_fork(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1900
  #ifdef CONFIG_NUMA
846a16bf0   Lee Schermerhorn   mempolicy: rename...
1901
  	p->mempolicy = mpol_dup(p->mempolicy);
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1902
1903
1904
  	if (IS_ERR(p->mempolicy)) {
  		retval = PTR_ERR(p->mempolicy);
  		p->mempolicy = NULL;
e8604cb43   Li Zefan   cgroup: fix spuri...
1905
  		goto bad_fork_cleanup_threadgroup_lock;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1906
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1907
  #endif
778d3b0ff   Michal Hocko   cpusets: randomiz...
1908
1909
1910
  #ifdef CONFIG_CPUSETS
  	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
  	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
b75058614   Ahmed S. Darwish   sched: tasks: Use...
1911
  	seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
778d3b0ff   Michal Hocko   cpusets: randomiz...
1912
  #endif
de30a2b35   Ingo Molnar   [PATCH] lockdep: ...
1913
  #ifdef CONFIG_TRACE_IRQFLAGS
0584df9c1   Marco Elver   lockdep: Refactor...
1914
1915
1916
1917
1918
  	memset(&p->irqtrace, 0, sizeof(p->irqtrace));
  	p->irqtrace.hardirq_disable_ip	= _THIS_IP_;
  	p->irqtrace.softirq_enable_ip	= _THIS_IP_;
  	p->softirqs_enabled		= 1;
  	p->softirq_context		= 0;
de30a2b35   Ingo Molnar   [PATCH] lockdep: ...
1919
  #endif
8bcbde548   David Hildenbrand   sched/preempt, mm...
1920
1921
  
  	p->pagefault_disabled = 0;
fbb9ce953   Ingo Molnar   [PATCH] lockdep: ...
1922
  #ifdef CONFIG_LOCKDEP
b09be676e   Byungchul Park   locking/lockdep: ...
1923
  	lockdep_init_task(p);
fbb9ce953   Ingo Molnar   [PATCH] lockdep: ...
1924
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1925

408894ee4   Ingo Molnar   [PATCH] mutex sub...
1926
1927
1928
  #ifdef CONFIG_DEBUG_MUTEXES
  	p->blocked_on = NULL; /* not blocked yet */
  #endif
cafe56359   Kent Overstreet   bcache: A block l...
1929
1930
1931
1932
  #ifdef CONFIG_BCACHE
  	p->sequential_io	= 0;
  	p->sequential_io_avg	= 0;
  #endif
0f4814065   Markus Metzger   x86, ptrace: add ...
1933

3c90e6e99   Srivatsa Vaddagiri   sched: fix copy_n...
1934
  	/* Perform scheduler related setup. Assign this task to a CPU. */
aab03e05e   Dario Faggioli   sched/deadline: A...
1935
1936
1937
  	retval = sched_fork(clone_flags, p);
  	if (retval)
  		goto bad_fork_cleanup_policy;
6ab423e0e   Peter Zijlstra   perf_counter: Pro...
1938

cdd6c482c   Ingo Molnar   perf: Do the big ...
1939
  	retval = perf_event_init_task(p);
6ab423e0e   Peter Zijlstra   perf_counter: Pro...
1940
1941
  	if (retval)
  		goto bad_fork_cleanup_policy;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1942
1943
  	retval = audit_alloc(p);
  	if (retval)
6c72e3501   Peter Zijlstra   perf: fix perf bu...
1944
  		goto bad_fork_cleanup_perf;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1945
  	/* copy all the process information */
ab602f799   Jack Miller   shm: make exit_sh...
1946
  	shm_init_task(p);
e4e55b47e   Tetsuo Handa   LSM: Revive secur...
1947
  	retval = security_task_alloc(p, clone_flags);
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1948
  	if (retval)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1949
  		goto bad_fork_cleanup_audit;
e4e55b47e   Tetsuo Handa   LSM: Revive secur...
1950
1951
1952
  	retval = copy_semundo(clone_flags, p);
  	if (retval)
  		goto bad_fork_cleanup_security;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1953
1954
  	retval = copy_files(clone_flags, p);
  	if (retval)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1955
  		goto bad_fork_cleanup_semundo;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1956
1957
  	retval = copy_fs(clone_flags, p);
  	if (retval)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1958
  		goto bad_fork_cleanup_files;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1959
1960
  	retval = copy_sighand(clone_flags, p);
  	if (retval)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1961
  		goto bad_fork_cleanup_fs;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1962
1963
  	retval = copy_signal(clone_flags, p);
  	if (retval)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1964
  		goto bad_fork_cleanup_sighand;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1965
1966
  	retval = copy_mm(clone_flags, p);
  	if (retval)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1967
  		goto bad_fork_cleanup_signal;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1968
1969
  	retval = copy_namespaces(clone_flags, p);
  	if (retval)
d84f4f992   David Howells   CRED: Inaugurate ...
1970
  		goto bad_fork_cleanup_mm;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
1971
1972
  	retval = copy_io(clone_flags, p);
  	if (retval)
fd0928df9   Jens Axboe   ioprio: move io p...
1973
  		goto bad_fork_cleanup_namespaces;
714acdbd1   Christian Brauner   arch: rename copy...
1974
  	retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1975
  	if (retval)
fd0928df9   Jens Axboe   ioprio: move io p...
1976
  		goto bad_fork_cleanup_io;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1977

afaef01c0   Alexander Popov   x86/entry: Add ST...
1978
  	stackleak_task_init(p);
425fb2b4b   Pavel Emelyanov   pid namespaces: m...
1979
  	if (pid != &init_struct_pid) {
49cb2fc42   Adrian Reber   fork: extend clon...
1980
1981
  		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
  				args->set_tid_size);
35f71bc0a   Michal Hocko   fork: report pid ...
1982
1983
  		if (IS_ERR(pid)) {
  			retval = PTR_ERR(pid);
0740aa5f6   Jiri Slaby   fork: free thread...
1984
  			goto bad_fork_cleanup_thread;
35f71bc0a   Michal Hocko   fork: report pid ...
1985
  		}
425fb2b4b   Pavel Emelyanov   pid namespaces: m...
1986
  	}
b3e583825   Christian Brauner   clone: add CLONE_...
1987
1988
1989
1990
1991
1992
  	/*
  	 * This has to happen after we've potentially unshared the file
  	 * descriptor table (so that the pidfd doesn't leak into the child
  	 * if the fd table isn't shared).
  	 */
  	if (clone_flags & CLONE_PIDFD) {
6fd2fe494   Al Viro   copy_process(): d...
1993
  		retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
b3e583825   Christian Brauner   clone: add CLONE_...
1994
1995
1996
1997
  		if (retval < 0)
  			goto bad_fork_free_pid;
  
  		pidfd = retval;
6fd2fe494   Al Viro   copy_process(): d...
1998
1999
2000
2001
2002
  
  		pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
  					      O_RDWR | O_CLOEXEC);
  		if (IS_ERR(pidfile)) {
  			put_unused_fd(pidfd);
28dd29c06   Christian Brauner   fork: return prop...
2003
  			retval = PTR_ERR(pidfile);
6fd2fe494   Al Viro   copy_process(): d...
2004
2005
2006
  			goto bad_fork_free_pid;
  		}
  		get_pid(pid);	/* held by pidfile now */
7f192e3cd   Christian Brauner   fork: add clone3
2007
  		retval = put_user(pidfd, args->pidfd);
b3e583825   Christian Brauner   clone: add CLONE_...
2008
2009
2010
  		if (retval)
  			goto bad_fork_put_pidfd;
  	}
73c101011   Jens Axboe   block: initial pa...
2011
2012
2013
  #ifdef CONFIG_BLOCK
  	p->plug = NULL;
  #endif
ba31c1a48   Thomas Gleixner   futex: Move futex...
2014
  	futex_init_task(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2015
  	/*
f9a3879ab   GOTO Masanori   [PATCH] Fix sigal...
2016
2017
2018
  	 * sigaltstack should be cleared when sharing the same VM
  	 */
  	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
2a7421383   Stas Sergeev   signals/sigaltsta...
2019
  		sas_ss_reset(p);
f9a3879ab   GOTO Masanori   [PATCH] Fix sigal...
2020
2021
  
  	/*
6580807da   Oleg Nesterov   ptrace: copy_proc...
2022
2023
  	 * Syscall tracing and stepping should be turned off in the
  	 * child regardless of CLONE_PTRACE.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2024
  	 */
6580807da   Oleg Nesterov   ptrace: copy_proc...
2025
  	user_disable_single_step(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2026
  	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
ed75e8d58   Laurent Vivier   [PATCH] UML Suppo...
2027
2028
2029
  #ifdef TIF_SYSCALL_EMU
  	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
  #endif
e02c9b0d6   Lin Feng   kernel/latencytop...
2030
  	clear_tsk_latency_tracing(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2031

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2032
  	/* ok, now we should be set up.. */
18c830df7   Oleg Nesterov   kernel/fork.c:cop...
2033
2034
  	p->pid = pid_nr(pid);
  	if (clone_flags & CLONE_THREAD) {
5f8aadd8b   Oleg Nesterov   CLONE_PARENT shou...
2035
  		p->exit_signal = -1;
18c830df7   Oleg Nesterov   kernel/fork.c:cop...
2036
2037
2038
2039
2040
2041
  		p->group_leader = current->group_leader;
  		p->tgid = current->tgid;
  	} else {
  		if (clone_flags & CLONE_PARENT)
  			p->exit_signal = current->group_leader->exit_signal;
  		else
7f192e3cd   Christian Brauner   fork: add clone3
2042
  			p->exit_signal = args->exit_signal;
18c830df7   Oleg Nesterov   kernel/fork.c:cop...
2043
2044
2045
  		p->group_leader = p;
  		p->tgid = p->pid;
  	}
5f8aadd8b   Oleg Nesterov   CLONE_PARENT shou...
2046

9d823e8f6   Wu Fengguang   writeback: per ta...
2047
2048
  	p->nr_dirtied = 0;
  	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
83712358b   Wu Fengguang   writeback: dirty ...
2049
  	p->dirty_paused_when = 0;
9d823e8f6   Wu Fengguang   writeback: per ta...
2050

bb8cbbfee   Oleg Nesterov   tasks/fork: Remov...
2051
  	p->pdeath_signal = 0;
47e65328a   Oleg Nesterov   [PATCH] pids: kil...
2052
  	INIT_LIST_HEAD(&p->thread_group);
158e1645e   Al Viro   trim task_work: g...
2053
  	p->task_works = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2054

18c830df7   Oleg Nesterov   kernel/fork.c:cop...
2055
  	/*
7e47682ea   Aleksa Sarai   cgroup: allow a c...
2056
2057
2058
2059
2060
  	 * Ensure that the cgroup subsystem policies allow the new process to be
  	 * forked. It should be noted that the new process's css_set can be changed
  	 * between here and cgroup_post_fork() if an organisation operation is in
  	 * progress.
  	 */
ef2c41cf3   Christian Brauner   clone3: allow spa...
2061
  	retval = cgroup_can_fork(p, args);
7e47682ea   Aleksa Sarai   cgroup: allow a c...
2062
  	if (retval)
5a5cf5cb3   Christian Brauner   cgroup: refactor ...
2063
  		goto bad_fork_put_pidfd;
7e47682ea   Aleksa Sarai   cgroup: allow a c...
2064
2065
  
  	/*
7b5585136   David Herrmann   fork: record star...
2066
2067
2068
2069
2070
2071
2072
2073
  	 * From this point on we must avoid any synchronous user-space
  	 * communication until we take the tasklist-lock. In particular, we do
  	 * not want user-space to be able to predict the process start-time by
  	 * stalling fork(2) after we recorded the start_time but before it is
  	 * visible to the system.
  	 */
  
  	p->start_time = ktime_get_ns();
cf25e24db   Peter Zijlstra   time: Rename tsk-...
2074
  	p->start_boottime = ktime_get_boottime_ns();
7b5585136   David Herrmann   fork: record star...
2075
2076
  
  	/*
18c830df7   Oleg Nesterov   kernel/fork.c:cop...
2077
2078
2079
  	 * Make it visible to the rest of the system, but don't wake it up yet.
  	 * Need tasklist lock for parent etc handling!
  	 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2080
  	write_lock_irq(&tasklist_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2081
  	/* CLONE_PARENT re-uses the old parent */
2d5516cbb   Oleg Nesterov   copy_process: fix...
2082
  	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2083
  		p->real_parent = current->real_parent;
2d5516cbb   Oleg Nesterov   copy_process: fix...
2084
2085
  		p->parent_exec_id = current->parent_exec_id;
  	} else {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2086
  		p->real_parent = current;
2d5516cbb   Oleg Nesterov   copy_process: fix...
2087
2088
  		p->parent_exec_id = current->self_exec_id;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2089

d83a7cb37   Josh Poimboeuf   livepatch: change...
2090
  	klp_copy_process(p);
3f17da699   Oleg Nesterov   [PATCH] fix kill_...
2091
  	spin_lock(&current->sighand->siglock);
4a2c7a783   Oleg Nesterov   [PATCH] make fork...
2092
2093
  
  	/*
dbd952127   Kees Cook   seccomp: introduc...
2094
2095
2096
2097
  	 * Copy seccomp details explicitly here, in case they were changed
  	 * before holding sighand lock.
  	 */
  	copy_seccomp(p);
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
2098
  	rseq_fork(p, clone_flags);
4ca1d3ee4   Eric W. Biederman   fork: Move and de...
2099
  	/* Don't start children in a dying pid namespace */
e8cfbc245   Gargi Sharma   pid: remove pidhash
2100
  	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
3fd372262   Kirill Tkhai   pid_ns: Fix race ...
2101
2102
2103
  		retval = -ENOMEM;
  		goto bad_fork_cancel_cgroup;
  	}
4a2c7a783   Oleg Nesterov   [PATCH] make fork...
2104

7673bf553   Eric W. Biederman   fork: Uncondition...
2105
2106
2107
2108
2109
  	/* Let kill terminate clone/fork in the middle */
  	if (fatal_signal_pending(current)) {
  		retval = -EINTR;
  		goto bad_fork_cancel_cgroup;
  	}
6fd2fe494   Al Viro   copy_process(): d...
2110
2111
2112
  	/* past the last point of failure */
  	if (pidfile)
  		fd_install(pidfd, pidfile);
4a2c7a783   Oleg Nesterov   [PATCH] make fork...
2113

2c4704756   Eric W. Biederman   pids: Move the pg...
2114
  	init_task_pid_links(p);
73b9ebfe1   Oleg Nesterov   [PATCH] pidhash: ...
2115
  	if (likely(p->pid)) {
4b9d33e6d   Tejun Heo   ptrace: kill clon...
2116
  		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
73b9ebfe1   Oleg Nesterov   [PATCH] pidhash: ...
2117

819077398   Oleg Nesterov   kernel/fork.c:cop...
2118
  		init_task_pid(p, PIDTYPE_PID, pid);
73b9ebfe1   Oleg Nesterov   [PATCH] pidhash: ...
2119
  		if (thread_group_leader(p)) {
6883f81aa   Eric W. Biederman   pid: Implement PI...
2120
  			init_task_pid(p, PIDTYPE_TGID, pid);
819077398   Oleg Nesterov   kernel/fork.c:cop...
2121
2122
  			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
  			init_task_pid(p, PIDTYPE_SID, task_session(current));
1c4042c29   Eric W. Biederman   pidns: Consolidat...
2123
  			if (is_child_reaper(pid)) {
17cf22c33   Eric W. Biederman   pidns: Use task_a...
2124
  				ns_of_pid(pid)->child_reaper = p;
1c4042c29   Eric W. Biederman   pidns: Consolidat...
2125
2126
  				p->signal->flags |= SIGNAL_UNKILLABLE;
  			}
c3ad2c3b0   Eric W. Biederman   signal: Don't res...
2127
  			p->signal->shared_pending.signal = delayed.signal;
9c9f4ded9   Alan Cox   tty: Add a kref c...
2128
  			p->signal->tty = tty_kref_get(current->signal->tty);
749860ce2   Pavel Tikhomirov   prctl: propagate ...
2129
2130
2131
2132
2133
2134
2135
  			/*
  			 * Inherit has_child_subreaper flag under the same
  			 * tasklist_lock with adding child to the process tree
  			 * for propagate_has_child_subreaper optimization.
  			 */
  			p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
  							 p->real_parent->signal->is_child_subreaper;
9cd80bbb0   Oleg Nesterov   do_wait() optimiz...
2136
  			list_add_tail(&p->sibling, &p->real_parent->children);
5e85d4abe   Eric W. Biederman   [PATCH] task: Mak...
2137
  			list_add_tail_rcu(&p->tasks, &init_task.tasks);
6883f81aa   Eric W. Biederman   pid: Implement PI...
2138
  			attach_pid(p, PIDTYPE_TGID);
819077398   Oleg Nesterov   kernel/fork.c:cop...
2139
2140
  			attach_pid(p, PIDTYPE_PGID);
  			attach_pid(p, PIDTYPE_SID);
909ea9646   Christoph Lameter   core: Replace __g...
2141
  			__this_cpu_inc(process_counts);
80628ca06   Oleg Nesterov   kernel/fork.c:cop...
2142
2143
2144
  		} else {
  			current->signal->nr_threads++;
  			atomic_inc(&current->signal->live);
60d4de3ff   Elena Reshetova   sched/core: Conve...
2145
  			refcount_inc(&current->signal->sigcnt);
924de3b8c   Eric W. Biederman   fork: Have new th...
2146
  			task_join_group_stop(p);
80628ca06   Oleg Nesterov   kernel/fork.c:cop...
2147
2148
  			list_add_tail_rcu(&p->thread_group,
  					  &p->group_leader->thread_group);
0c740d0af   Oleg Nesterov   introduce for_eac...
2149
2150
  			list_add_tail_rcu(&p->thread_node,
  					  &p->signal->thread_head);
73b9ebfe1   Oleg Nesterov   [PATCH] pidhash: ...
2151
  		}
819077398   Oleg Nesterov   kernel/fork.c:cop...
2152
  		attach_pid(p, PIDTYPE_PID);
73b9ebfe1   Oleg Nesterov   [PATCH] pidhash: ...
2153
  		nr_threads++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2154
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2155
  	total_forks++;
c3ad2c3b0   Eric W. Biederman   signal: Don't res...
2156
  	hlist_del_init(&delayed.node);
3f17da699   Oleg Nesterov   [PATCH] fix kill_...
2157
  	spin_unlock(&current->sighand->siglock);
4af4206be   Oleg Nesterov   tracing: Fix sysc...
2158
  	syscall_tracepoint_update(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2159
  	write_unlock_irq(&tasklist_lock);
4af4206be   Oleg Nesterov   tracing: Fix sysc...
2160

c13cf856c   Andrew Morton   [PATCH] fork.c: p...
2161
  	proc_fork_connector(p);
13685c4a0   Qais Yousef   sched/uclamp: Add...
2162
  	sched_post_fork(p);
ef2c41cf3   Christian Brauner   clone3: allow spa...
2163
  	cgroup_post_fork(p, args);
cdd6c482c   Ingo Molnar   perf: Do the big ...
2164
  	perf_event_fork(p);
43d2b1132   KAMEZAWA Hiroyuki   tracepoint: add t...
2165
2166
  
  	trace_task_newtask(p, clone_flags);
3ab679661   Oleg Nesterov   uprobes: Teach up...
2167
  	uprobe_copy_process(p, clone_flags);
43d2b1132   KAMEZAWA Hiroyuki   tracepoint: add t...
2168

67197a4f2   Suren Baghdasaryan   mm, oom_adj: don'...
2169
  	copy_oom_score_adj(clone_flags, p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2170
  	return p;
7e47682ea   Aleksa Sarai   cgroup: allow a c...
2171
  bad_fork_cancel_cgroup:
3fd372262   Kirill Tkhai   pid_ns: Fix race ...
2172
2173
  	spin_unlock(&current->sighand->siglock);
  	write_unlock_irq(&tasklist_lock);
ef2c41cf3   Christian Brauner   clone3: allow spa...
2174
  	cgroup_cancel_fork(p, args);
b3e583825   Christian Brauner   clone: add CLONE_...
2175
  bad_fork_put_pidfd:
6fd2fe494   Al Viro   copy_process(): d...
2176
2177
2178
2179
  	if (clone_flags & CLONE_PIDFD) {
  		fput(pidfile);
  		put_unused_fd(pidfd);
  	}
425fb2b4b   Pavel Emelyanov   pid namespaces: m...
2180
2181
2182
  bad_fork_free_pid:
  	if (pid != &init_struct_pid)
  		free_pid(pid);
0740aa5f6   Jiri Slaby   fork: free thread...
2183
2184
  bad_fork_cleanup_thread:
  	exit_thread(p);
fd0928df9   Jens Axboe   ioprio: move io p...
2185
  bad_fork_cleanup_io:
b69f22920   Louis Rilling   block: Fix io_con...
2186
2187
  	if (p->io_context)
  		exit_io_context(p);
ab516013a   Serge E. Hallyn   [PATCH] namespace...
2188
  bad_fork_cleanup_namespaces:
444f378b2   Linus Torvalds   Revert "[PATCH] n...
2189
  	exit_task_namespaces(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2190
  bad_fork_cleanup_mm:
c3f3ce049   Andrea Arcangeli   userfaultfd: use ...
2191
2192
  	if (p->mm) {
  		mm_clear_owner(p->mm, p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2193
  		mmput(p->mm);
c3f3ce049   Andrea Arcangeli   userfaultfd: use ...
2194
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2195
  bad_fork_cleanup_signal:
4ab6c0833   Oleg Nesterov   clone(): fix race...
2196
  	if (!(clone_flags & CLONE_THREAD))
1c5354de9   Mike Galbraith   sched: Move sched...
2197
  		free_signal_struct(p->signal);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2198
  bad_fork_cleanup_sighand:
a7e5328a0   Oleg Nesterov   [PATCH] cleanup _...
2199
  	__cleanup_sighand(p->sighand);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2200
2201
2202
2203
2204
2205
  bad_fork_cleanup_fs:
  	exit_fs(p); /* blocking */
  bad_fork_cleanup_files:
  	exit_files(p); /* blocking */
  bad_fork_cleanup_semundo:
  	exit_sem(p);
e4e55b47e   Tetsuo Handa   LSM: Revive secur...
2206
2207
  bad_fork_cleanup_security:
  	security_task_free(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2208
2209
  bad_fork_cleanup_audit:
  	audit_free(p);
6c72e3501   Peter Zijlstra   perf: fix perf bu...
2210
  bad_fork_cleanup_perf:
cdd6c482c   Ingo Molnar   perf: Do the big ...
2211
  	perf_event_free_task(p);
6c72e3501   Peter Zijlstra   perf: fix perf bu...
2212
  bad_fork_cleanup_policy:
b09be676e   Byungchul Park   locking/lockdep: ...
2213
  	lockdep_free_task(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2214
  #ifdef CONFIG_NUMA
f0be3d32b   Lee Schermerhorn   mempolicy: rename...
2215
  	mpol_put(p->mempolicy);
e8604cb43   Li Zefan   cgroup: fix spuri...
2216
  bad_fork_cleanup_threadgroup_lock:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2217
  #endif
35df17c57   Shailabh Nagar   [PATCH] task dela...
2218
  	delayacct_tsk_free(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2219
  bad_fork_cleanup_count:
d84f4f992   David Howells   CRED: Inaugurate ...
2220
  	atomic_dec(&p->cred->user->processes);
e0e817392   David Howells   CRED: Add some co...
2221
  	exit_creds(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2222
  bad_fork_free:
405c07597   Andy Lutomirski   fork: Add task st...
2223
  	p->state = TASK_DEAD;
68f24b08e   Andy Lutomirski   sched/core: Free ...
2224
  	put_task_stack(p);
c3f3ce049   Andrea Arcangeli   userfaultfd: use ...
2225
  	delayed_free_task(p);
fe7d37d1f   Oleg Nesterov   [PATCH] copy_proc...
2226
  fork_out:
c3ad2c3b0   Eric W. Biederman   signal: Don't res...
2227
2228
2229
  	spin_lock_irq(&current->sighand->siglock);
  	hlist_del_init(&delayed.node);
  	spin_unlock_irq(&current->sighand->siglock);
fe7d37d1f   Oleg Nesterov   [PATCH] copy_proc...
2230
  	return ERR_PTR(retval);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2231
  }
2c4704756   Eric W. Biederman   pids: Move the pg...
2232
  static inline void init_idle_pids(struct task_struct *idle)
f106eee10   Oleg Nesterov   pids: fix fork_id...
2233
2234
2235
2236
  {
  	enum pid_type type;
  
  	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2c4704756   Eric W. Biederman   pids: Move the pg...
2237
2238
  		INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
  		init_task_pid(idle, type, &init_struct_pid);
f106eee10   Oleg Nesterov   pids: fix fork_id...
2239
2240
  	}
  }
0db0628d9   Paul Gortmaker   kernel: delete __...
2241
  struct task_struct *fork_idle(int cpu)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2242
  {
36c8b5868   Ingo Molnar   [PATCH] sched: cl...
2243
  	struct task_struct *task;
7f192e3cd   Christian Brauner   fork: add clone3
2244
2245
2246
2247
2248
  	struct kernel_clone_args args = {
  		.flags = CLONE_VM,
  	};
  
  	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
f106eee10   Oleg Nesterov   pids: fix fork_id...
2249
  	if (!IS_ERR(task)) {
2c4704756   Eric W. Biederman   pids: Move the pg...
2250
  		init_idle_pids(task);
753ca4f31   Akinobu Mita   [PATCH] fix copy_...
2251
  		init_idle(task, cpu);
f106eee10   Oleg Nesterov   pids: fix fork_id...
2252
  	}
73b9ebfe1   Oleg Nesterov   [PATCH] pidhash: ...
2253

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2254
2255
  	return task;
  }
13585fa06   Nadav Amit   fork: Provide a f...
2256
2257
2258
2259
  /*
   * Duplicate init_mm by calling dup_mm() with NULL as its first argument
   * and hand back whatever dup_mm() returns.
   */
  struct mm_struct *copy_init_mm(void)
  {
  	struct mm_struct *mm = dup_mm(NULL, &init_mm);

  	return mm;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2260
2261
2262
2263
2264
  /*
   *  Ok, this is the main fork-routine.
   *
   * It copies the process, and if successful kick-starts
   * it and waits for it to finish using the VM if required.
a0eb9abd8   Eugene Syromiatnikov   fork: block inval...
2265
2266
   *
   * args->exit_signal is expected to be checked for sanity by the caller.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2267
   */
cad6967ac   Christian Brauner   fork: introduce k...
2268
  pid_t kernel_clone(struct kernel_clone_args *args)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2269
  {
7f192e3cd   Christian Brauner   fork: add clone3
2270
  	u64 clone_flags = args->flags;
9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2271
2272
  	struct completion vfork;
  	struct pid *pid;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2273
2274
  	struct task_struct *p;
  	int trace = 0;
cad6967ac   Christian Brauner   fork: introduce k...
2275
  	pid_t nr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2276

bdff746a3   Andrew Morton   clone: prepare to...
2277
  	/*
3af8588c7   Christian Brauner   fork: fold legacy...
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
  	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
  	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
  	 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
  	 * field in struct clone_args and it still doesn't make sense to have
  	 * them both point at the same memory location. Performing this check
  	 * here has the advantage that we don't need to have a separate helper
  	 * to check for legacy clone().
  	 */
  	if ((args->flags & CLONE_PIDFD) &&
  	    (args->flags & CLONE_PARENT_SETTID) &&
  	    (args->pidfd == args->parent_tid))
  		return -EINVAL;
  
  	/*
4b9d33e6d   Tejun Heo   ptrace: kill clon...
2292
2293
2294
2295
  	 * Determine whether and which event to report to ptracer.  When
  	 * called from kernel_thread or CLONE_UNTRACED is explicitly
  	 * requested, no event is reported; otherwise, report if the event
  	 * for the type of forking is enabled.
09a05394f   Roland McGrath   tracehook: clone
2296
  	 */
e80d6661c   Al Viro   flagday: kill pt_...
2297
  	if (!(clone_flags & CLONE_UNTRACED)) {
4b9d33e6d   Tejun Heo   ptrace: kill clon...
2298
2299
  		if (clone_flags & CLONE_VFORK)
  			trace = PTRACE_EVENT_VFORK;
7f192e3cd   Christian Brauner   fork: add clone3
2300
  		else if (args->exit_signal != SIGCHLD)
4b9d33e6d   Tejun Heo   ptrace: kill clon...
2301
2302
2303
2304
2305
2306
2307
  			trace = PTRACE_EVENT_CLONE;
  		else
  			trace = PTRACE_EVENT_FORK;
  
  		if (likely(!ptrace_event_enabled(current, trace)))
  			trace = 0;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2308

7f192e3cd   Christian Brauner   fork: add clone3
2309
  	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
38addce8b   Emese Revfy   gcc-plugins: Add ...
2310
  	add_latent_entropy();
9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2311
2312
2313
  
  	if (IS_ERR(p))
  		return PTR_ERR(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2314
2315
2316
2317
  	/*
  	 * Do this prior waking up the new thread - the thread pointer
  	 * might get invalid after that point, if the thread exits quickly.
  	 */
9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2318
  	trace_sched_process_fork(current, p);
0a16b6075   Mathieu Desnoyers   tracing, sched: L...
2319

9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2320
2321
  	pid = get_task_pid(p, PIDTYPE_PID);
  	nr = pid_vnr(pid);
30e49c263   Pavel Emelyanov   pid namespaces: a...
2322

9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2323
  	if (clone_flags & CLONE_PARENT_SETTID)
7f192e3cd   Christian Brauner   fork: add clone3
2324
  		put_user(nr, args->parent_tid);
a6f5e0637   Sukadev Bhattiprolu   pid namespaces: m...
2325

9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2326
2327
2328
2329
2330
  	if (clone_flags & CLONE_VFORK) {
  		p->vfork_done = &vfork;
  		init_completion(&vfork);
  		get_task_struct(p);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2331

9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2332
  	wake_up_new_task(p);
09a05394f   Roland McGrath   tracehook: clone
2333

9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2334
2335
2336
  	/* forking complete and child started to run, tell ptracer */
  	if (unlikely(trace))
  		ptrace_event_pid(trace, pid);
4e52365f2   Matthew Dempsky   ptrace: fix fork ...
2337

9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2338
2339
2340
  	if (clone_flags & CLONE_VFORK) {
  		if (!wait_for_vfork_done(p, &vfork))
  			ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2341
  	}
9f5325aa3   Marcos Paulo de Souza   kernel/fork.c: ch...
2342
2343
  
  	put_pid(pid);
92476d7fc   Eric W. Biederman   [PATCH] pidhash: ...
2344
  	return nr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2345
  }
2aa3a7f86   Al Viro   preparation for g...
2346
2347
2348
2349
2350
  /*
   * Create a kernel thread.
   */
  pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
  {
7f192e3cd   Christian Brauner   fork: add clone3
2351
  	struct kernel_clone_args args = {
3f2c788a1   Christian Brauner   fork: prevent acc...
2352
2353
2354
  		.flags		= ((lower_32_bits(flags) | CLONE_VM |
  				    CLONE_UNTRACED) & ~CSIGNAL),
  		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
7f192e3cd   Christian Brauner   fork: add clone3
2355
2356
2357
  		.stack		= (unsigned long)fn,
  		.stack_size	= (unsigned long)arg,
  	};
cad6967ac   Christian Brauner   fork: introduce k...
2358
  	return kernel_clone(&args);
2aa3a7f86   Al Viro   preparation for g...
2359
  }
2aa3a7f86   Al Viro   preparation for g...
2360

d2125043a   Al Viro   generic sys_fork ...
2361
2362
2363
2364
  #ifdef __ARCH_WANT_SYS_FORK
  /*
   * fork() system call: clone with no flags and SIGCHLD as the exit signal.
   * Without CONFIG_MMU the call is rejected with -EINVAL.
   */
  SYSCALL_DEFINE0(fork)
  {
  #ifndef CONFIG_MMU
  	/* can not support in nommu mode */
  	return -EINVAL;
  #else
  	struct kernel_clone_args args = {
  		.exit_signal = SIGCHLD,
  	};

  	return kernel_clone(&args);
  #endif
  }
  #endif
  
  #ifdef __ARCH_WANT_SYS_VFORK
  /*
   * vfork() system call: clone with CLONE_VFORK | CLONE_VM and SIGCHLD as
   * the exit signal.
   */
  SYSCALL_DEFINE0(vfork)
  {
  	struct kernel_clone_args args = {
  		.exit_signal	= SIGCHLD,
  		.flags		= CLONE_VM | CLONE_VFORK,
  	};

  	return kernel_clone(&args);
  }
  #endif
  
  #ifdef __ARCH_WANT_SYS_CLONE
  /*
   * Legacy clone() system call.  The argument order depends on the
   * CONFIG_CLONE_BACKWARDS* variant selected by the architecture.
   *
   * Only the low 32 bits of clone_flags are used: the CSIGNAL bits become the
   * exit signal and the rest become the clone flags.  parent_tidptr is used
   * for both the pidfd and the parent_tid destination.
   */
  #ifdef CONFIG_CLONE_BACKWARDS
  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
  		 int __user *, parent_tidptr,
  		 unsigned long, tls,
  		 int __user *, child_tidptr)
  #elif defined(CONFIG_CLONE_BACKWARDS2)
  SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
  		 int __user *, parent_tidptr,
  		 int __user *, child_tidptr,
  		 unsigned long, tls)
  #elif defined(CONFIG_CLONE_BACKWARDS3)
  SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
  		int, stack_size,
  		int __user *, parent_tidptr,
  		int __user *, child_tidptr,
  		unsigned long, tls)
  #else
  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
  		 int __user *, parent_tidptr,
  		 int __user *, child_tidptr,
  		 unsigned long, tls)
  #endif
  {
  	const u32 low_flags = lower_32_bits(clone_flags);
  	struct kernel_clone_args args = {
  		.flags		= low_flags & ~CSIGNAL,
  		.exit_signal	= low_flags & CSIGNAL,
  		.pidfd		= parent_tidptr,
  		.parent_tid	= parent_tidptr,
  		.child_tid	= child_tidptr,
  		.stack		= newsp,
  		.tls		= tls,
  	};

  	return kernel_clone(&args);
  }
  #endif
7f192e3cd   Christian Brauner   fork: add clone3
2423

d68dbb0c9   Christian Brauner   arch: handle arch...
2424
  #ifdef __ARCH_WANT_SYS_CLONE3
dd499f7a7   Amanieu d'Antras   clone3: ensure co...
2425

7f192e3cd   Christian Brauner   fork: add clone3
2426
2427
  /*
   * Copy a userspace struct clone_args of size @usize into @kargs.
   *
   * On entry @kargs->set_tid must point at a buffer that can hold
   * MAX_PID_NS_LEVEL pid_t entries; the pointer is saved before @kargs is
   * overwritten, filled from userspace when a set_tid array was supplied,
   * and re-installed on success.
   *
   * Returns 0 on success, -E2BIG if @usize exceeds PAGE_SIZE, -EINVAL for
   * inconsistent or out-of-range arguments, -EFAULT if the set_tid array
   * cannot be copied, or the error from copy_struct_from_user().
   */
  noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
  					      struct clone_args __user *uargs,
  					      size_t usize)
  {
  	struct clone_args args;
  	pid_t *tid_buf = kargs->set_tid;
  	int ret;

  	/* Each published struct size must match the end of its last field. */
  	BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
  		     CLONE_ARGS_SIZE_VER0);
  	BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
  		     CLONE_ARGS_SIZE_VER1);
  	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
  		     CLONE_ARGS_SIZE_VER2);
  	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

  	if (unlikely(usize > PAGE_SIZE))
  		return -E2BIG;
  	if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
  		return -EINVAL;

  	ret = copy_struct_from_user(&args, sizeof(args), uargs, usize);
  	if (ret)
  		return ret;

  	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
  		return -EINVAL;

  	/* set_tid and set_tid_size must be either both set or both zero. */
  	if (unlikely(!args.set_tid != !args.set_tid_size))
  		return -EINVAL;

  	/*
  	 * The upper 32 bits of exit_signal must be clear and the value must
  	 * be a valid signal number.
  	 */
  	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
  		     !valid_signal(args.exit_signal)))
  		return -EINVAL;

  	/*
  	 * CLONE_INTO_CGROUP needs a struct large enough to carry the cgroup
  	 * field and a value that fits in an int.
  	 */
  	if ((args.flags & CLONE_INTO_CGROUP) &&
  	    (usize < CLONE_ARGS_SIZE_VER2 || args.cgroup > INT_MAX))
  		return -EINVAL;

  	*kargs = (struct kernel_clone_args){
  		.flags		= args.flags,
  		.pidfd		= u64_to_user_ptr(args.pidfd),
  		.child_tid	= u64_to_user_ptr(args.child_tid),
  		.parent_tid	= u64_to_user_ptr(args.parent_tid),
  		.exit_signal	= args.exit_signal,
  		.stack		= args.stack,
  		.stack_size	= args.stack_size,
  		.tls		= args.tls,
  		.set_tid_size	= args.set_tid_size,
  		.cgroup		= args.cgroup,
  	};

  	if (args.set_tid) {
  		if (copy_from_user(tid_buf, u64_to_user_ptr(args.set_tid),
  				   kargs->set_tid_size * sizeof(pid_t)))
  			return -EFAULT;
  	}

  	/* The assignment above cleared set_tid; restore the caller's buffer. */
  	kargs->set_tid = tid_buf;

  	return 0;
  }
fa729c4df   Christian Brauner   clone3: validate ...
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
  /**
   * clone3_stack_valid - check and prepare stack
   * @kargs: kernel clone args
   *
   * Check the stack/stack_size pair supplied by userspace: either both are
   * zero, or both are non-zero and the range passes access_ok().
   * Unless the architecture has an upward-growing stack (or is IA64), the
   * stack value is advanced by stack_size so userspace does not have to
   * account for the stack direction itself.
   *
   * Return: true if the stack arguments are acceptable, false otherwise.
   */
  static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
  {
  	/* No stack given: a size on its own makes no sense. */
  	if (kargs->stack == 0)
  		return kargs->stack_size == 0;

  	/* A stack was given, so it needs a non-zero, accessible extent. */
  	if (kargs->stack_size == 0)
  		return false;

  	if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
  		return false;

  #if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
  	kargs->stack += kargs->stack_size;
  #endif

  	return true;
  }
  
  static bool clone3_args_valid(struct kernel_clone_args *kargs)
7f192e3cd   Christian Brauner   fork: add clone3
2516
  {
b612e5df4   Christian Brauner   clone3: add CLONE...
2517
  	/* Verify that no unknown flags are passed along. */
ef2c41cf3   Christian Brauner   clone3: allow spa...
2518
2519
  	if (kargs->flags &
  	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
7f192e3cd   Christian Brauner   fork: add clone3
2520
2521
2522
2523
2524
2525
2526
2527
  		return false;
  
  	/*
  	 * - make the CLONE_DETACHED bit reuseable for clone3
  	 * - make the CSIGNAL bits reuseable for clone3
  	 */
  	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
  		return false;
b612e5df4   Christian Brauner   clone3: add CLONE...
2528
2529
2530
  	if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
  	    (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
  		return false;
7f192e3cd   Christian Brauner   fork: add clone3
2531
2532
2533
  	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
  	    kargs->exit_signal)
  		return false;
fa729c4df   Christian Brauner   clone3: validate ...
2534
2535
  	if (!clone3_stack_valid(kargs))
  		return false;
7f192e3cd   Christian Brauner   fork: add clone3
2536
2537
  	return true;
  }
501bd0166   Christian Brauner   fork: add kernel-...
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
  /**
   * clone3 - create a new process with specific properties
   * @uargs: argument structure
   * @size:  size of @uargs
   *
   * clone3() is the extensible successor to clone()/clone2().
   * It takes a struct as argument that is versioned by its size.
   *
   * Return: On success, a positive PID for the child process.
   *         On error, a negative errno number.
   */
7f192e3cd   Christian Brauner   fork: add clone3
2549
2550
2551
2552
2553
  SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
  {
  	int err;
  
  	struct kernel_clone_args kargs;
49cb2fc42   Adrian Reber   fork: extend clon...
2554
2555
2556
  	pid_t set_tid[MAX_PID_NS_LEVEL];
  
  	kargs.set_tid = set_tid;
7f192e3cd   Christian Brauner   fork: add clone3
2557
2558
2559
2560
2561
2562
2563
  
  	err = copy_clone_args_from_user(&kargs, uargs, size);
  	if (err)
  		return err;
  
  	if (!clone3_args_valid(&kargs))
  		return -EINVAL;
cad6967ac   Christian Brauner   fork: introduce k...
2564
  	return kernel_clone(&kargs);
d2125043a   Al Viro   generic sys_fork ...
2565
2566
  }
  #endif
0f1b92cbd   Oleg Nesterov   introduce the wal...
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
  /*
   * Walk the descendants of @top, calling @visitor(child, @data) for each
   * child task found on the ->children lists, with tasklist_lock held for
   * reading for the whole walk.
   *
   * The visitor's return value steers the walk:
   *   < 0  stop the walk immediately,
   *   = 0  continue with the next sibling,
   *   > 0  descend into the children of the task just visited.
   */
  void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
  {
  	struct task_struct *leader, *parent, *child;
  	int res;
  
  	read_lock(&tasklist_lock);
  	/* Always start from the group leader of the task passed in. */
  	leader = top = top->group_leader;
  down:
  	/* Visit the children of every thread in leader's thread group. */
  	for_each_thread(leader, parent) {
  		list_for_each_entry(child, &parent->children, sibling) {
  			res = visitor(child, data);
  			if (res) {
  				if (res < 0)
  					goto out;
  				/* Positive result: restart one level down. */
  				leader = child;
  				goto down;
  			}
  up:
  			/*
  			 * Re-entry point when returning from a deeper level:
  			 * child/parent/leader have been restored, so both
  			 * loops simply advance past the finished child.
  			 */
  			;
  		}
  	}
  
  	/* This level is done; unless it was the top, resume in the parent. */
  	if (leader != top) {
  		child = leader;
  		parent = child->real_parent;
  		leader = parent->group_leader;
  		goto up;
  	}
  out:
  	read_unlock(&tasklist_lock);
  }
5fd63b308   Ravikiran G Thirumalai   [PATCH] x86_64: I...
2598
2599
2600
  /*
   * Fallback when ARCH_MIN_MMSTRUCT_ALIGN is not already defined: 0 is used
   * as the alignment argument when the mm_struct cache is created.
   */
  #ifndef ARCH_MIN_MMSTRUCT_ALIGN
  #define ARCH_MIN_MMSTRUCT_ALIGN 0
  #endif
51cc50685   Alexey Dobriyan   SL*B: drop kmem c...
2601
  static void sighand_ctor(void *data)
aa1757f90   Oleg Nesterov   [PATCH] convert s...
2602
2603
  {
  	struct sighand_struct *sighand = data;
a35afb830   Christoph Lameter   Remove SLAB_CTOR_...
2604
  	spin_lock_init(&sighand->siglock);
b8fceee17   Davide Libenzi   signalfd simplifi...
2605
  	init_waitqueue_head(&sighand->signalfd_wqh);
aa1757f90   Oleg Nesterov   [PATCH] convert s...
2606
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2607
2608
  /*
   * Create the slab caches for sighand, signal, files, fs, mm and vm_area
   * structures, then run mmap_init() and nsproxy_cache_init().
   */
  void __init proc_caches_init(void)
  {
  	/*
  	 * The mm_cpumask is located at the end of mm_struct, and is
  	 * dynamically sized based on the maximum CPU number this system
  	 * can have, taking hotplug into account (nr_cpu_ids).
  	 */
  	unsigned int mm_size = sizeof(struct mm_struct) + cpumask_size();

  	sighand_cachep = kmem_cache_create("sighand_cache",
  					   sizeof(struct sighand_struct), 0,
  					   SLAB_HWCACHE_ALIGN | SLAB_PANIC |
  					   SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT,
  					   sighand_ctor);

  	signal_cachep = kmem_cache_create("signal_cache",
  					  sizeof(struct signal_struct), 0,
  					  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
  					  SLAB_ACCOUNT,
  					  NULL);

  	files_cachep = kmem_cache_create("files_cache",
  					 sizeof(struct files_struct), 0,
  					 SLAB_HWCACHE_ALIGN | SLAB_PANIC |
  					 SLAB_ACCOUNT,
  					 NULL);

  	fs_cachep = kmem_cache_create("fs_cache",
  				      sizeof(struct fs_struct), 0,
  				      SLAB_HWCACHE_ALIGN | SLAB_PANIC |
  				      SLAB_ACCOUNT,
  				      NULL);

  	/* Only the saved_auxv field is declared as a usercopy region. */
  	mm_cachep = kmem_cache_create_usercopy("mm_struct",
  					       mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
  					       SLAB_HWCACHE_ALIGN | SLAB_PANIC |
  					       SLAB_ACCOUNT,
  					       offsetof(struct mm_struct, saved_auxv),
  					       sizeof_field(struct mm_struct, saved_auxv),
  					       NULL);

  	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC | SLAB_ACCOUNT);

  	mmap_init();
  	nsproxy_cache_init();
  }
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2643

cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2644
  /*
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2645
   * Check constraints on flags passed to the unshare system call.
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2646
   */
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2647
  static int check_unshare_flags(unsigned long unshare_flags)
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2648
  {
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2649
2650
  	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
  				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
50804fe37   Eric W. Biederman   pidns: Support un...
2651
  				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
769071ac9   Andrei Vagin   ns: Introduce Tim...
2652
2653
  				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
  				CLONE_NEWTIME))
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2654
  		return -EINVAL;
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2655
  	/*
12c641ab8   Eric W. Biederman   unshare: Unsharin...
2656
2657
2658
2659
  	 * Not implemented, but pretend it works if there is nothing
  	 * to unshare.  Note that unsharing the address space or the
  	 * signal handlers also need to unshare the signal queues (aka
  	 * CLONE_THREAD).
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2660
  	 */
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2661
  	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
12c641ab8   Eric W. Biederman   unshare: Unsharin...
2662
2663
2664
2665
  		if (!thread_group_empty(current))
  			return -EINVAL;
  	}
  	if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
d036bda7d   Elena Reshetova   sched/core: Conve...
2666
  		if (refcount_read(&current->sighand->count) > 1)
12c641ab8   Eric W. Biederman   unshare: Unsharin...
2667
2668
2669
2670
  			return -EINVAL;
  	}
  	if (unshare_flags & CLONE_VM) {
  		if (!current_is_single_threaded())
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2671
2672
  			return -EINVAL;
  	}
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2673
2674
2675
2676
2677
  
  	return 0;
  }
  
  /*
99d1419d9   JANAK DESAI   [PATCH] unshare s...
2678
   * Unshare the filesystem structure if it is being shared
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2679
2680
2681
2682
   */
  static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
  {
  	struct fs_struct *fs = current->fs;
498052bba   Al Viro   New locking/refco...
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
  	if (!(unshare_flags & CLONE_FS) || !fs)
  		return 0;
  
  	/* don't need lock here; in the worst case we'll do useless copy */
  	if (fs->users == 1)
  		return 0;
  
  	*new_fsp = copy_fs_struct(fs);
  	if (!*new_fsp)
  		return -ENOMEM;
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2693
2694
2695
2696
2697
  
  	return 0;
  }
  
  /*
a016f3389   JANAK DESAI   [PATCH] unshare s...
2698
   * Unshare file descriptor table if it is being shared
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2699
   */
60997c3d4   Christian Brauner   close_range: add ...
2700
2701
  int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
  	       struct files_struct **new_fdp)
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2702
2703
  {
  	struct files_struct *fd = current->files;
a016f3389   JANAK DESAI   [PATCH] unshare s...
2704
  	int error = 0;
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2705
2706
  
  	if ((unshare_flags & CLONE_FILES) &&
a016f3389   JANAK DESAI   [PATCH] unshare s...
2707
  	    (fd && atomic_read(&fd->count) > 1)) {
60997c3d4   Christian Brauner   close_range: add ...
2708
  		*new_fdp = dup_fd(fd, max_fds, &error);
a016f3389   JANAK DESAI   [PATCH] unshare s...
2709
2710
2711
  		if (!*new_fdp)
  			return error;
  	}
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2712
2713
2714
2715
2716
  
  	return 0;
  }
  
  /*
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2717
2718
   * unshare allows a process to 'unshare' part of the process
   * context which was originally shared using clone.  copy_*
cad6967ac   Christian Brauner   fork: introduce k...
2719
   * functions used by kernel_clone() cannot be used here directly
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2720
2721
2722
2723
   * because they modify an inactive task_struct that is being
   * constructed. Here we are modifying the current, active,
   * task_struct.
   */
9b32105ec   Dominik Brodowski   kernel: add ksys_...
2724
  int ksys_unshare(unsigned long unshare_flags)
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2725
  {
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2726
  	struct fs_struct *fs, *new_fs = NULL;
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2727
  	struct files_struct *fd, *new_fd = NULL;
b2e0d9870   Eric W. Biederman   userns: Implement...
2728
  	struct cred *new_cred = NULL;
cf7b708c8   Pavel Emelyanov   Make access to ta...
2729
  	struct nsproxy *new_nsproxy = NULL;
9edff4ab1   Manfred Spraul   ipc: sysvsem: imp...
2730
  	int do_sysvsem = 0;
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2731
  	int err;
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2732

50804fe37   Eric W. Biederman   pidns: Support un...
2733
  	/*
faf00da54   Eric W. Biederman   userns,pidns: For...
2734
2735
  	 * If unsharing a user namespace must also unshare the thread group
  	 * and unshare the filesystem root and working directories.
b2e0d9870   Eric W. Biederman   userns: Implement...
2736
2737
  	 */
  	if (unshare_flags & CLONE_NEWUSER)
e66eded83   Eric W. Biederman   userns: Don't all...
2738
  		unshare_flags |= CLONE_THREAD | CLONE_FS;
b2e0d9870   Eric W. Biederman   userns: Implement...
2739
  	/*
50804fe37   Eric W. Biederman   pidns: Support un...
2740
2741
2742
2743
  	 * If unsharing vm, must also unshare signal handlers.
  	 */
  	if (unshare_flags & CLONE_VM)
  		unshare_flags |= CLONE_SIGHAND;
6013f67fc   Manfred Spraul   ipc: sysvsem: for...
2744
  	/*
12c641ab8   Eric W. Biederman   unshare: Unsharin...
2745
2746
2747
2748
2749
  	 * If unsharing a signal handlers, must also unshare the signal queues.
  	 */
  	if (unshare_flags & CLONE_SIGHAND)
  		unshare_flags |= CLONE_THREAD;
  	/*
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2750
2751
2752
2753
  	 * If unsharing namespace, must also unshare filesystem information.
  	 */
  	if (unshare_flags & CLONE_NEWNS)
  		unshare_flags |= CLONE_FS;
50804fe37   Eric W. Biederman   pidns: Support un...
2754
2755
2756
2757
  
  	err = check_unshare_flags(unshare_flags);
  	if (err)
  		goto bad_unshare_out;
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2758
  	/*
6013f67fc   Manfred Spraul   ipc: sysvsem: for...
2759
2760
2761
2762
2763
  	 * CLONE_NEWIPC must also detach from the undolist: after switching
  	 * to a new ipc namespace, the semaphore arrays from the old
  	 * namespace are unreachable.
  	 */
  	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
9edff4ab1   Manfred Spraul   ipc: sysvsem: imp...
2764
  		do_sysvsem = 1;
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
2765
2766
  	err = unshare_fs(unshare_flags, &new_fs);
  	if (err)
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2767
  		goto bad_unshare_out;
60997c3d4   Christian Brauner   close_range: add ...
2768
  	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
2769
  	if (err)
9bfb23fc4   Oleg Nesterov   sys_unshare: remo...
2770
  		goto bad_unshare_cleanup_fs;
b2e0d9870   Eric W. Biederman   userns: Implement...
2771
  	err = unshare_userns(unshare_flags, &new_cred);
fb0a685cb   Daniel Rebelo de Oliveira   kernel/fork.c: fi...
2772
  	if (err)
9edff4ab1   Manfred Spraul   ipc: sysvsem: imp...
2773
  		goto bad_unshare_cleanup_fd;
b2e0d9870   Eric W. Biederman   userns: Implement...
2774
2775
2776
2777
  	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
  					 new_cred, new_fs);
  	if (err)
  		goto bad_unshare_cleanup_cred;
c0b2fc316   Serge Hallyn   [PATCH] uts: copy...
2778

b2e0d9870   Eric W. Biederman   userns: Implement...
2779
  	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
9edff4ab1   Manfred Spraul   ipc: sysvsem: imp...
2780
2781
2782
2783
2784
2785
  		if (do_sysvsem) {
  			/*
  			 * CLONE_SYSVSEM is equivalent to sys_exit().
  			 */
  			exit_sem(current);
  		}
ab602f799   Jack Miller   shm: make exit_sh...
2786
2787
2788
2789
2790
  		if (unshare_flags & CLONE_NEWIPC) {
  			/* Orphan segments in old ns (see sem above). */
  			exit_shm(current);
  			shm_init_task(current);
  		}
ab516013a   Serge E. Hallyn   [PATCH] namespace...
2791

6f977e6b2   Alan Cox   fork: unshare: re...
2792
  		if (new_nsproxy)
cf7b708c8   Pavel Emelyanov   Make access to ta...
2793
  			switch_task_namespaces(current, new_nsproxy);
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2794

cf7b708c8   Pavel Emelyanov   Make access to ta...
2795
  		task_lock(current);
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2796
2797
  		if (new_fs) {
  			fs = current->fs;
2a4419b5b   Nick Piggin   fs: fs_struct rwl...
2798
  			spin_lock(&fs->lock);
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2799
  			current->fs = new_fs;
498052bba   Al Viro   New locking/refco...
2800
2801
2802
2803
  			if (--fs->users)
  				new_fs = NULL;
  			else
  				new_fs = fs;
2a4419b5b   Nick Piggin   fs: fs_struct rwl...
2804
  			spin_unlock(&fs->lock);
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2805
  		}
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2806
2807
2808
2809
2810
2811
2812
  		if (new_fd) {
  			fd = current->files;
  			current->files = new_fd;
  			new_fd = fd;
  		}
  
  		task_unlock(current);
b2e0d9870   Eric W. Biederman   userns: Implement...
2813
2814
2815
2816
2817
2818
  
  		if (new_cred) {
  			/* Install the new user namespace */
  			commit_creds(new_cred);
  			new_cred = NULL;
  		}
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2819
  	}
e42226732   Hari Bathini   perf: Add PERF_RE...
2820
  	perf_event_namespaces(current);
b2e0d9870   Eric W. Biederman   userns: Implement...
2821
2822
2823
  bad_unshare_cleanup_cred:
  	if (new_cred)
  		put_cred(new_cred);
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2824
2825
2826
  bad_unshare_cleanup_fd:
  	if (new_fd)
  		put_files_struct(new_fd);
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2827
2828
  bad_unshare_cleanup_fs:
  	if (new_fs)
498052bba   Al Viro   New locking/refco...
2829
  		free_fs_struct(new_fs);
cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2830

cf2e340f4   JANAK DESAI   [PATCH] unshare s...
2831
2832
2833
  bad_unshare_out:
  	return err;
  }
3b1253880   Al Viro   [PATCH] sanitize ...
2834

9b32105ec   Dominik Brodowski   kernel: add ksys_...
2835
2836
2837
2838
  /* unshare() system call: thin wrapper that forwards to ksys_unshare(). */
  SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
  {
  	return ksys_unshare(unshare_flags);
  }
3b1253880   Al Viro   [PATCH] sanitize ...
2839
2840
2841
2842
2843
2844
2845
2846
2847
  /*
   *	Helper to unshare the files of the current task.
   *	We don't want to expose copy_files internals to
   *	the exec layer of the kernel.
   */
  
  int unshare_files(struct files_struct **displaced)
  {
  	struct task_struct *task = current;
50704516f   Al Viro   Fix uninitialized...
2848
  	struct files_struct *copy = NULL;
3b1253880   Al Viro   [PATCH] sanitize ...
2849
  	int error;
60997c3d4   Christian Brauner   close_range: add ...
2850
  	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
3b1253880   Al Viro   [PATCH] sanitize ...
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
  	if (error || !copy) {
  		*displaced = NULL;
  		return error;
  	}
  	*displaced = task->files;
  	task_lock(task);
  	task->files = copy;
  	task_unlock(task);
  	return 0;
  }
16db3d3f1   Heinrich Schuchardt   kernel/sysctl.c: ...
2861
2862
  
  /*
   * sysctl handler for max_threads.
   *
   * Works on a private copy of @table whose data points at a local snapshot
   * of max_threads and whose extra1/extra2 point at 1 and MAX_THREADS, then
   * lets proc_dointvec_minmax() do the parsing/formatting.  max_threads is
   * updated only after a successful write.
   *
   * Returns the result of proc_dointvec_minmax().
   */
  int sysctl_max_threads(struct ctl_table *table, int write,
  		       void *buffer, size_t *lenp, loff_t *ppos)
  {
  	struct ctl_table tmp = *table;
  	int new_threads = max_threads;
  	int lower = 1;
  	int upper = MAX_THREADS;
  	int ret;

  	tmp.data = &new_threads;
  	tmp.extra1 = &lower;
  	tmp.extra2 = &upper;

  	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
  	if (!ret && write)
  		max_threads = new_threads;

  	return ret;
  }