Commit 02afc6267f6d55d47aba9fcafdbd1b7230d2294a

Authored by Al Viro
Parent: f52111b154

[PATCH] dup_fd() fixes, part 1

Move the sucker to fs/file.c in preparation for the rest

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 3 changed files with 131 additions and 130 deletions

fs/file.c
1 /* 1 /*
2 * linux/fs/file.c 2 * linux/fs/file.c
3 * 3 *
4 * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes 4 * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
5 * 5 *
6 * Manage the dynamic fd arrays in the process files_struct. 6 * Manage the dynamic fd arrays in the process files_struct.
7 */ 7 */
8 8
9 #include <linux/fs.h> 9 #include <linux/fs.h>
10 #include <linux/mm.h> 10 #include <linux/mm.h>
11 #include <linux/time.h> 11 #include <linux/time.h>
12 #include <linux/slab.h> 12 #include <linux/slab.h>
13 #include <linux/vmalloc.h> 13 #include <linux/vmalloc.h>
14 #include <linux/file.h> 14 #include <linux/file.h>
15 #include <linux/fdtable.h> 15 #include <linux/fdtable.h>
16 #include <linux/bitops.h> 16 #include <linux/bitops.h>
17 #include <linux/interrupt.h> 17 #include <linux/interrupt.h>
18 #include <linux/spinlock.h> 18 #include <linux/spinlock.h>
19 #include <linux/rcupdate.h> 19 #include <linux/rcupdate.h>
20 #include <linux/workqueue.h> 20 #include <linux/workqueue.h>
21 21
22 struct fdtable_defer { 22 struct fdtable_defer {
23 spinlock_t lock; 23 spinlock_t lock;
24 struct work_struct wq; 24 struct work_struct wq;
25 struct fdtable *next; 25 struct fdtable *next;
26 }; 26 };
27 27
28 int sysctl_nr_open __read_mostly = 1024*1024; 28 int sysctl_nr_open __read_mostly = 1024*1024;
29 29
30 /* 30 /*
31 * We use this list to defer free fdtables that have vmalloced 31 * We use this list to defer free fdtables that have vmalloced
32 * sets/arrays. By keeping a per-cpu list, we avoid having to embed 32 * sets/arrays. By keeping a per-cpu list, we avoid having to embed
33 * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in 33 * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
34 * this per-task structure. 34 * this per-task structure.
35 */ 35 */
36 static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); 36 static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
37 37
38 static inline void * alloc_fdmem(unsigned int size) 38 static inline void * alloc_fdmem(unsigned int size)
39 { 39 {
40 if (size <= PAGE_SIZE) 40 if (size <= PAGE_SIZE)
41 return kmalloc(size, GFP_KERNEL); 41 return kmalloc(size, GFP_KERNEL);
42 else 42 else
43 return vmalloc(size); 43 return vmalloc(size);
44 } 44 }
45 45
46 static inline void free_fdarr(struct fdtable *fdt) 46 static inline void free_fdarr(struct fdtable *fdt)
47 { 47 {
48 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) 48 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
49 kfree(fdt->fd); 49 kfree(fdt->fd);
50 else 50 else
51 vfree(fdt->fd); 51 vfree(fdt->fd);
52 } 52 }
53 53
54 static inline void free_fdset(struct fdtable *fdt) 54 static inline void free_fdset(struct fdtable *fdt)
55 { 55 {
56 if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2)) 56 if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
57 kfree(fdt->open_fds); 57 kfree(fdt->open_fds);
58 else 58 else
59 vfree(fdt->open_fds); 59 vfree(fdt->open_fds);
60 } 60 }
61 61
62 static void free_fdtable_work(struct work_struct *work) 62 static void free_fdtable_work(struct work_struct *work)
63 { 63 {
64 struct fdtable_defer *f = 64 struct fdtable_defer *f =
65 container_of(work, struct fdtable_defer, wq); 65 container_of(work, struct fdtable_defer, wq);
66 struct fdtable *fdt; 66 struct fdtable *fdt;
67 67
68 spin_lock_bh(&f->lock); 68 spin_lock_bh(&f->lock);
69 fdt = f->next; 69 fdt = f->next;
70 f->next = NULL; 70 f->next = NULL;
71 spin_unlock_bh(&f->lock); 71 spin_unlock_bh(&f->lock);
72 while(fdt) { 72 while(fdt) {
73 struct fdtable *next = fdt->next; 73 struct fdtable *next = fdt->next;
74 vfree(fdt->fd); 74 vfree(fdt->fd);
75 free_fdset(fdt); 75 free_fdset(fdt);
76 kfree(fdt); 76 kfree(fdt);
77 fdt = next; 77 fdt = next;
78 } 78 }
79 } 79 }
80 80
81 void free_fdtable_rcu(struct rcu_head *rcu) 81 void free_fdtable_rcu(struct rcu_head *rcu)
82 { 82 {
83 struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); 83 struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
84 struct fdtable_defer *fddef; 84 struct fdtable_defer *fddef;
85 85
86 BUG_ON(!fdt); 86 BUG_ON(!fdt);
87 87
88 if (fdt->max_fds <= NR_OPEN_DEFAULT) { 88 if (fdt->max_fds <= NR_OPEN_DEFAULT) {
89 /* 89 /*
90 * This fdtable is embedded in the files structure and that 90 * This fdtable is embedded in the files structure and that
91 * structure itself is getting destroyed. 91 * structure itself is getting destroyed.
92 */ 92 */
93 kmem_cache_free(files_cachep, 93 kmem_cache_free(files_cachep,
94 container_of(fdt, struct files_struct, fdtab)); 94 container_of(fdt, struct files_struct, fdtab));
95 return; 95 return;
96 } 96 }
97 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) { 97 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
98 kfree(fdt->fd); 98 kfree(fdt->fd);
99 kfree(fdt->open_fds); 99 kfree(fdt->open_fds);
100 kfree(fdt); 100 kfree(fdt);
101 } else { 101 } else {
102 fddef = &get_cpu_var(fdtable_defer_list); 102 fddef = &get_cpu_var(fdtable_defer_list);
103 spin_lock(&fddef->lock); 103 spin_lock(&fddef->lock);
104 fdt->next = fddef->next; 104 fdt->next = fddef->next;
105 fddef->next = fdt; 105 fddef->next = fdt;
106 /* vmallocs are handled from the workqueue context */ 106 /* vmallocs are handled from the workqueue context */
107 schedule_work(&fddef->wq); 107 schedule_work(&fddef->wq);
108 spin_unlock(&fddef->lock); 108 spin_unlock(&fddef->lock);
109 put_cpu_var(fdtable_defer_list); 109 put_cpu_var(fdtable_defer_list);
110 } 110 }
111 } 111 }
112 112
113 /* 113 /*
114 * Expand the fdset in the files_struct. Called with the files spinlock 114 * Expand the fdset in the files_struct. Called with the files spinlock
115 * held for write. 115 * held for write.
116 */ 116 */
117 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) 117 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
118 { 118 {
119 unsigned int cpy, set; 119 unsigned int cpy, set;
120 120
121 BUG_ON(nfdt->max_fds < ofdt->max_fds); 121 BUG_ON(nfdt->max_fds < ofdt->max_fds);
122 if (ofdt->max_fds == 0) 122 if (ofdt->max_fds == 0)
123 return; 123 return;
124 124
125 cpy = ofdt->max_fds * sizeof(struct file *); 125 cpy = ofdt->max_fds * sizeof(struct file *);
126 set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); 126 set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
127 memcpy(nfdt->fd, ofdt->fd, cpy); 127 memcpy(nfdt->fd, ofdt->fd, cpy);
128 memset((char *)(nfdt->fd) + cpy, 0, set); 128 memset((char *)(nfdt->fd) + cpy, 0, set);
129 129
130 cpy = ofdt->max_fds / BITS_PER_BYTE; 130 cpy = ofdt->max_fds / BITS_PER_BYTE;
131 set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; 131 set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
132 memcpy(nfdt->open_fds, ofdt->open_fds, cpy); 132 memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
133 memset((char *)(nfdt->open_fds) + cpy, 0, set); 133 memset((char *)(nfdt->open_fds) + cpy, 0, set);
134 memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); 134 memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
135 memset((char *)(nfdt->close_on_exec) + cpy, 0, set); 135 memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
136 } 136 }
137 137
138 static struct fdtable * alloc_fdtable(unsigned int nr) 138 static struct fdtable * alloc_fdtable(unsigned int nr)
139 { 139 {
140 struct fdtable *fdt; 140 struct fdtable *fdt;
141 char *data; 141 char *data;
142 142
143 /* 143 /*
144 * Figure out how many fds we actually want to support in this fdtable. 144 * Figure out how many fds we actually want to support in this fdtable.
145 * Allocation steps are keyed to the size of the fdarray, since it 145 * Allocation steps are keyed to the size of the fdarray, since it
146 * grows far faster than any of the other dynamic data. We try to fit 146 * grows far faster than any of the other dynamic data. We try to fit
147 * the fdarray into comfortable page-tuned chunks: starting at 1024B 147 * the fdarray into comfortable page-tuned chunks: starting at 1024B
148 * and growing in powers of two from there on. 148 * and growing in powers of two from there on.
149 */ 149 */
150 nr /= (1024 / sizeof(struct file *)); 150 nr /= (1024 / sizeof(struct file *));
151 nr = roundup_pow_of_two(nr + 1); 151 nr = roundup_pow_of_two(nr + 1);
152 nr *= (1024 / sizeof(struct file *)); 152 nr *= (1024 / sizeof(struct file *));
153 /* 153 /*
154 * Note that this can drive nr *below* what we had passed if sysctl_nr_open 154 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
155 * had been set lower between the check in expand_files() and here. Deal 155 * had been set lower between the check in expand_files() and here. Deal
156 * with that in caller, it's cheaper that way. 156 * with that in caller, it's cheaper that way.
157 * 157 *
158 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise 158 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
159 * bitmaps handling below becomes unpleasant, to put it mildly... 159 * bitmaps handling below becomes unpleasant, to put it mildly...
160 */ 160 */
161 if (unlikely(nr > sysctl_nr_open)) 161 if (unlikely(nr > sysctl_nr_open))
162 nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; 162 nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
163 163
164 fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); 164 fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
165 if (!fdt) 165 if (!fdt)
166 goto out; 166 goto out;
167 fdt->max_fds = nr; 167 fdt->max_fds = nr;
168 data = alloc_fdmem(nr * sizeof(struct file *)); 168 data = alloc_fdmem(nr * sizeof(struct file *));
169 if (!data) 169 if (!data)
170 goto out_fdt; 170 goto out_fdt;
171 fdt->fd = (struct file **)data; 171 fdt->fd = (struct file **)data;
172 data = alloc_fdmem(max_t(unsigned int, 172 data = alloc_fdmem(max_t(unsigned int,
173 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); 173 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
174 if (!data) 174 if (!data)
175 goto out_arr; 175 goto out_arr;
176 fdt->open_fds = (fd_set *)data; 176 fdt->open_fds = (fd_set *)data;
177 data += nr / BITS_PER_BYTE; 177 data += nr / BITS_PER_BYTE;
178 fdt->close_on_exec = (fd_set *)data; 178 fdt->close_on_exec = (fd_set *)data;
179 INIT_RCU_HEAD(&fdt->rcu); 179 INIT_RCU_HEAD(&fdt->rcu);
180 fdt->next = NULL; 180 fdt->next = NULL;
181 181
182 return fdt; 182 return fdt;
183 183
184 out_arr: 184 out_arr:
185 free_fdarr(fdt); 185 free_fdarr(fdt);
186 out_fdt: 186 out_fdt:
187 kfree(fdt); 187 kfree(fdt);
188 out: 188 out:
189 return NULL; 189 return NULL;
190 } 190 }
191 191
192 /* 192 /*
193 * Expand the file descriptor table. 193 * Expand the file descriptor table.
194 * This function will allocate a new fdtable and both fd array and fdset, of 194 * This function will allocate a new fdtable and both fd array and fdset, of
195 * the given size. 195 * the given size.
196 * Return <0 error code on error; 1 on successful completion. 196 * Return <0 error code on error; 1 on successful completion.
197 * The files->file_lock should be held on entry, and will be held on exit. 197 * The files->file_lock should be held on entry, and will be held on exit.
198 */ 198 */
199 static int expand_fdtable(struct files_struct *files, int nr) 199 static int expand_fdtable(struct files_struct *files, int nr)
200 __releases(files->file_lock) 200 __releases(files->file_lock)
201 __acquires(files->file_lock) 201 __acquires(files->file_lock)
202 { 202 {
203 struct fdtable *new_fdt, *cur_fdt; 203 struct fdtable *new_fdt, *cur_fdt;
204 204
205 spin_unlock(&files->file_lock); 205 spin_unlock(&files->file_lock);
206 new_fdt = alloc_fdtable(nr); 206 new_fdt = alloc_fdtable(nr);
207 spin_lock(&files->file_lock); 207 spin_lock(&files->file_lock);
208 if (!new_fdt) 208 if (!new_fdt)
209 return -ENOMEM; 209 return -ENOMEM;
210 /* 210 /*
211 * extremely unlikely race - sysctl_nr_open decreased between the check in 211 * extremely unlikely race - sysctl_nr_open decreased between the check in
212 * caller and alloc_fdtable(). Cheaper to catch it here... 212 * caller and alloc_fdtable(). Cheaper to catch it here...
213 */ 213 */
214 if (unlikely(new_fdt->max_fds <= nr)) { 214 if (unlikely(new_fdt->max_fds <= nr)) {
215 free_fdarr(new_fdt); 215 free_fdarr(new_fdt);
216 free_fdset(new_fdt); 216 free_fdset(new_fdt);
217 kfree(new_fdt); 217 kfree(new_fdt);
218 return -EMFILE; 218 return -EMFILE;
219 } 219 }
220 /* 220 /*
221 * Check again since another task may have expanded the fd table while 221 * Check again since another task may have expanded the fd table while
222 * we dropped the lock 222 * we dropped the lock
223 */ 223 */
224 cur_fdt = files_fdtable(files); 224 cur_fdt = files_fdtable(files);
225 if (nr >= cur_fdt->max_fds) { 225 if (nr >= cur_fdt->max_fds) {
226 /* Continue as planned */ 226 /* Continue as planned */
227 copy_fdtable(new_fdt, cur_fdt); 227 copy_fdtable(new_fdt, cur_fdt);
228 rcu_assign_pointer(files->fdt, new_fdt); 228 rcu_assign_pointer(files->fdt, new_fdt);
229 if (cur_fdt->max_fds > NR_OPEN_DEFAULT) 229 if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
230 free_fdtable(cur_fdt); 230 free_fdtable(cur_fdt);
231 } else { 231 } else {
232 /* Somebody else expanded, so undo our attempt */ 232 /* Somebody else expanded, so undo our attempt */
233 free_fdarr(new_fdt); 233 free_fdarr(new_fdt);
234 free_fdset(new_fdt); 234 free_fdset(new_fdt);
235 kfree(new_fdt); 235 kfree(new_fdt);
236 } 236 }
237 return 1; 237 return 1;
238 } 238 }
239 239
240 /* 240 /*
241 * Expand files. 241 * Expand files.
242 * This function will expand the file structures, if the requested size exceeds 242 * This function will expand the file structures, if the requested size exceeds
243 * the current capacity and there is room for expansion. 243 * the current capacity and there is room for expansion.
244 * Return <0 error code on error; 0 when nothing done; 1 when files were 244 * Return <0 error code on error; 0 when nothing done; 1 when files were
245 * expanded and execution may have blocked. 245 * expanded and execution may have blocked.
246 * The files->file_lock should be held on entry, and will be held on exit. 246 * The files->file_lock should be held on entry, and will be held on exit.
247 */ 247 */
248 int expand_files(struct files_struct *files, int nr) 248 int expand_files(struct files_struct *files, int nr)
249 { 249 {
250 struct fdtable *fdt; 250 struct fdtable *fdt;
251 251
252 fdt = files_fdtable(files); 252 fdt = files_fdtable(files);
253 /* Do we need to expand? */ 253 /* Do we need to expand? */
254 if (nr < fdt->max_fds) 254 if (nr < fdt->max_fds)
255 return 0; 255 return 0;
256 /* Can we expand? */ 256 /* Can we expand? */
257 if (nr >= sysctl_nr_open) 257 if (nr >= sysctl_nr_open)
258 return -EMFILE; 258 return -EMFILE;
259 259
260 /* All good, so we try */ 260 /* All good, so we try */
261 return expand_fdtable(files, nr); 261 return expand_fdtable(files, nr);
262 } 262 }
263 263
264 static int count_open_files(struct fdtable *fdt)
265 {
266 int size = fdt->max_fds;
267 int i;
268
269 /* Find the last open fd */
270 for (i = size/(8*sizeof(long)); i > 0; ) {
271 if (fdt->open_fds->fds_bits[--i])
272 break;
273 }
274 i = (i+1) * 8 * sizeof(long);
275 return i;
276 }
277
278 static struct files_struct *alloc_files(void)
279 {
280 struct files_struct *newf;
281 struct fdtable *fdt;
282
283 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
284 if (!newf)
285 goto out;
286
287 atomic_set(&newf->count, 1);
288
289 spin_lock_init(&newf->file_lock);
290 newf->next_fd = 0;
291 fdt = &newf->fdtab;
292 fdt->max_fds = NR_OPEN_DEFAULT;
293 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
294 fdt->open_fds = (fd_set *)&newf->open_fds_init;
295 fdt->fd = &newf->fd_array[0];
296 INIT_RCU_HEAD(&fdt->rcu);
297 fdt->next = NULL;
298 rcu_assign_pointer(newf->fdt, fdt);
299 out:
300 return newf;
301 }
302
303 /*
304 * Allocate a new files structure and copy contents from the
305 * passed in files structure.
306 * errorp will be valid only when the returned files_struct is NULL.
307 */
308 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
309 {
310 struct files_struct *newf;
311 struct file **old_fds, **new_fds;
312 int open_files, size, i;
313 struct fdtable *old_fdt, *new_fdt;
314
315 *errorp = -ENOMEM;
316 newf = alloc_files();
317 if (!newf)
318 goto out;
319
320 spin_lock(&oldf->file_lock);
321 old_fdt = files_fdtable(oldf);
322 new_fdt = files_fdtable(newf);
323 open_files = count_open_files(old_fdt);
324
325 /*
326 * Check whether we need to allocate a larger fd array and fd set.
327 * Note: we're not a clone task, so the open count won't change.
328 */
329 if (open_files > new_fdt->max_fds) {
330 new_fdt->max_fds = 0;
331 spin_unlock(&oldf->file_lock);
332 spin_lock(&newf->file_lock);
333 *errorp = expand_files(newf, open_files-1);
334 spin_unlock(&newf->file_lock);
335 if (*errorp < 0)
336 goto out_release;
337 new_fdt = files_fdtable(newf);
338 /*
339 * Reacquire the oldf lock and a pointer to its fd table;
340 * it may have a new, bigger fd table by now, so we need
341 * the latest pointer.
342 */
343 spin_lock(&oldf->file_lock);
344 old_fdt = files_fdtable(oldf);
345 }
346
347 old_fds = old_fdt->fd;
348 new_fds = new_fdt->fd;
349
350 memcpy(new_fdt->open_fds->fds_bits,
351 old_fdt->open_fds->fds_bits, open_files/8);
352 memcpy(new_fdt->close_on_exec->fds_bits,
353 old_fdt->close_on_exec->fds_bits, open_files/8);
354
355 for (i = open_files; i != 0; i--) {
356 struct file *f = *old_fds++;
357 if (f) {
358 get_file(f);
359 } else {
360 /*
361 * The fd may be claimed in the fd bitmap but not yet
362 * instantiated in the files array if a sibling thread
363 * is partway through open(). So make sure that this
364 * fd is available to the new process.
365 */
366 FD_CLR(open_files - i, new_fdt->open_fds);
367 }
368 rcu_assign_pointer(*new_fds++, f);
369 }
370 spin_unlock(&oldf->file_lock);
371
372 /* compute the remainder to be cleared */
373 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
374
375 /* This is long-word aligned, thus could use an optimized version */
376 memset(new_fds, 0, size);
377
378 if (new_fdt->max_fds > open_files) {
379 int left = (new_fdt->max_fds-open_files)/8;
380 int start = open_files / (8 * sizeof(unsigned long));
381
382 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
383 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
384 }
385
386 return newf;
387
388 out_release:
389 kmem_cache_free(files_cachep, newf);
390 out:
391 return NULL;
392 }
393
264 static void __devinit fdtable_defer_list_init(int cpu) 394 static void __devinit fdtable_defer_list_init(int cpu)
265 { 395 {
266 struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); 396 struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
267 spin_lock_init(&fddef->lock); 397 spin_lock_init(&fddef->lock);
268 INIT_WORK(&fddef->wq, free_fdtable_work); 398 INIT_WORK(&fddef->wq, free_fdtable_work);
269 fddef->next = NULL; 399 fddef->next = NULL;
270 } 400 }
271 401
272 void __init files_defer_init(void) 402 void __init files_defer_init(void)
273 { 403 {
274 int i; 404 int i;
275 for_each_possible_cpu(i) 405 for_each_possible_cpu(i)
276 fdtable_defer_list_init(i); 406 fdtable_defer_list_init(i);
277 } 407 }
278 408
279 struct files_struct init_files = { 409 struct files_struct init_files = {
280 .count = ATOMIC_INIT(1), 410 .count = ATOMIC_INIT(1),
281 .fdt = &init_files.fdtab, 411 .fdt = &init_files.fdtab,
282 .fdtab = { 412 .fdtab = {
283 .max_fds = NR_OPEN_DEFAULT, 413 .max_fds = NR_OPEN_DEFAULT,
284 .fd = &init_files.fd_array[0], 414 .fd = &init_files.fd_array[0],
285 .close_on_exec = (fd_set *)&init_files.close_on_exec_init, 415 .close_on_exec = (fd_set *)&init_files.close_on_exec_init,
286 .open_fds = (fd_set *)&init_files.open_fds_init, 416 .open_fds = (fd_set *)&init_files.open_fds_init,
287 .rcu = RCU_HEAD_INIT, 417 .rcu = RCU_HEAD_INIT,
288 }, 418 },
289 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 419 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
290 }; 420 };
291 421
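For orientation (an editorial note, not part of the commit): dup_fd() is the helper fork() relies on when the child gets a private descriptor table instead of sharing its parent's. Below is a minimal sketch of such a caller, modelled on copy_files() in kernel/fork.c of this era; only the dup_fd() prototype is taken from the patch, while the function name and exact control flow are illustrative assumptions.

/*
 * Sketch of a dup_fd() caller -- names and flow are illustrative,
 * not lifted from this diff.
 */
#include <linux/fdtable.h>
#include <linux/sched.h>

static int copy_files_sketch(unsigned long clone_flags, struct task_struct *tsk)
{
	struct files_struct *oldf = current->files;
	int error = 0;

	if (!oldf)				/* kernel threads may have no files */
		return 0;

	if (clone_flags & CLONE_FILES) {	/* share the parent's table */
		atomic_inc(&oldf->count);
		return 0;
	}

	tsk->files = dup_fd(oldf, &error);	/* private copy for the child */
	if (!tsk->files)
		return error;			/* *errorp is valid only on failure */
	return 0;
}

Note that the error convention matches the comment above dup_fd(): errorp is only meaningful when the returned files_struct is NULL.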
include/linux/fdtable.h
1 /* 1 /*
2 * descriptor table internals; you almost certainly want file.h instead. 2 * descriptor table internals; you almost certainly want file.h instead.
3 */ 3 */
4 4
5 #ifndef __LINUX_FDTABLE_H 5 #ifndef __LINUX_FDTABLE_H
6 #define __LINUX_FDTABLE_H 6 #define __LINUX_FDTABLE_H
7 7
8 #include <asm/atomic.h> 8 #include <asm/atomic.h>
9 #include <linux/posix_types.h> 9 #include <linux/posix_types.h>
10 #include <linux/compiler.h> 10 #include <linux/compiler.h>
11 #include <linux/spinlock.h> 11 #include <linux/spinlock.h>
12 #include <linux/rcupdate.h> 12 #include <linux/rcupdate.h>
13 #include <linux/types.h> 13 #include <linux/types.h>
14 14
15 /* 15 /*
16 * The default fd array needs to be at least BITS_PER_LONG, 16 * The default fd array needs to be at least BITS_PER_LONG,
17 * as this is the granularity returned by copy_fdset(). 17 * as this is the granularity returned by copy_fdset().
18 */ 18 */
19 #define NR_OPEN_DEFAULT BITS_PER_LONG 19 #define NR_OPEN_DEFAULT BITS_PER_LONG
20 20
21 /* 21 /*
22 * The embedded_fd_set is a small fd_set, 22 * The embedded_fd_set is a small fd_set,
23 * suitable for most tasks (which open <= BITS_PER_LONG files) 23 * suitable for most tasks (which open <= BITS_PER_LONG files)
24 */ 24 */
25 struct embedded_fd_set { 25 struct embedded_fd_set {
26 unsigned long fds_bits[1]; 26 unsigned long fds_bits[1];
27 }; 27 };
28 28
29 struct fdtable { 29 struct fdtable {
30 unsigned int max_fds; 30 unsigned int max_fds;
31 struct file ** fd; /* current fd array */ 31 struct file ** fd; /* current fd array */
32 fd_set *close_on_exec; 32 fd_set *close_on_exec;
33 fd_set *open_fds; 33 fd_set *open_fds;
34 struct rcu_head rcu; 34 struct rcu_head rcu;
35 struct fdtable *next; 35 struct fdtable *next;
36 }; 36 };
37 37
38 /* 38 /*
39 * Open file table structure 39 * Open file table structure
40 */ 40 */
41 struct files_struct { 41 struct files_struct {
42 /* 42 /*
43 * read mostly part 43 * read mostly part
44 */ 44 */
45 atomic_t count; 45 atomic_t count;
46 struct fdtable *fdt; 46 struct fdtable *fdt;
47 struct fdtable fdtab; 47 struct fdtable fdtab;
48 /* 48 /*
49 * written part on a separate cache line in SMP 49 * written part on a separate cache line in SMP
50 */ 50 */
51 spinlock_t file_lock ____cacheline_aligned_in_smp; 51 spinlock_t file_lock ____cacheline_aligned_in_smp;
52 int next_fd; 52 int next_fd;
53 struct embedded_fd_set close_on_exec_init; 53 struct embedded_fd_set close_on_exec_init;
54 struct embedded_fd_set open_fds_init; 54 struct embedded_fd_set open_fds_init;
55 struct file * fd_array[NR_OPEN_DEFAULT]; 55 struct file * fd_array[NR_OPEN_DEFAULT];
56 }; 56 };
57 57
58 #define files_fdtable(files) (rcu_dereference((files)->fdt)) 58 #define files_fdtable(files) (rcu_dereference((files)->fdt))
59 59
60 extern struct kmem_cache *filp_cachep; 60 extern struct kmem_cache *filp_cachep;
61 61
62 struct file_operations; 62 struct file_operations;
63 struct vfsmount; 63 struct vfsmount;
64 struct dentry; 64 struct dentry;
65 65
66 extern int expand_files(struct files_struct *, int nr); 66 extern int expand_files(struct files_struct *, int nr);
67 extern void free_fdtable_rcu(struct rcu_head *rcu); 67 extern void free_fdtable_rcu(struct rcu_head *rcu);
68 extern void __init files_defer_init(void); 68 extern void __init files_defer_init(void);
69 69
70 static inline void free_fdtable(struct fdtable *fdt) 70 static inline void free_fdtable(struct fdtable *fdt)
71 { 71 {
72 call_rcu(&fdt->rcu, free_fdtable_rcu); 72 call_rcu(&fdt->rcu, free_fdtable_rcu);
73 } 73 }
74 74
75 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) 75 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
76 { 76 {
77 struct file * file = NULL; 77 struct file * file = NULL;
78 struct fdtable *fdt = files_fdtable(files); 78 struct fdtable *fdt = files_fdtable(files);
79 79
80 if (fd < fdt->max_fds) 80 if (fd < fdt->max_fds)
81 file = rcu_dereference(fdt->fd[fd]); 81 file = rcu_dereference(fdt->fd[fd]);
82 return file; 82 return file;
83 } 83 }
84 84
85 /* 85 /*
86 * Check whether the specified fd has an open file. 86 * Check whether the specified fd has an open file.
87 */ 87 */
88 #define fcheck(fd) fcheck_files(current->files, fd) 88 #define fcheck(fd) fcheck_files(current->files, fd)
89 89
90 struct task_struct; 90 struct task_struct;
91 91
92 struct files_struct *get_files_struct(struct task_struct *); 92 struct files_struct *get_files_struct(struct task_struct *);
93 void put_files_struct(struct files_struct *fs); 93 void put_files_struct(struct files_struct *fs);
94 void reset_files_struct(struct files_struct *); 94 void reset_files_struct(struct files_struct *);
95 int unshare_files(struct files_struct **); 95 int unshare_files(struct files_struct **);
96 struct files_struct *dup_fd(struct files_struct *, int *);
96 97
97 extern struct kmem_cache *files_cachep; 98 extern struct kmem_cache *files_cachep;
98 99
99 #endif /* __LINUX_FDTABLE_H */ 100 #endif /* __LINUX_FDTABLE_H */
100 101
kernel/fork.c
1 /* 1 /*
2 * linux/kernel/fork.c 2 * linux/kernel/fork.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * 'fork.c' contains the help-routines for the 'fork' system call 8 * 'fork.c' contains the help-routines for the 'fork' system call
9 * (see also entry.S and others). 9 * (see also entry.S and others).
10 * Fork is rather simple, once you get the hang of it, but the memory 10 * Fork is rather simple, once you get the hang of it, but the memory
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */ 12 */
13 13
14 #include <linux/slab.h> 14 #include <linux/slab.h>
15 #include <linux/init.h> 15 #include <linux/init.h>
16 #include <linux/unistd.h> 16 #include <linux/unistd.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/vmalloc.h> 18 #include <linux/vmalloc.h>
19 #include <linux/completion.h> 19 #include <linux/completion.h>
20 #include <linux/mnt_namespace.h> 20 #include <linux/mnt_namespace.h>
21 #include <linux/personality.h> 21 #include <linux/personality.h>
22 #include <linux/mempolicy.h> 22 #include <linux/mempolicy.h>
23 #include <linux/sem.h> 23 #include <linux/sem.h>
24 #include <linux/file.h> 24 #include <linux/file.h>
25 #include <linux/fdtable.h> 25 #include <linux/fdtable.h>
26 #include <linux/key.h> 26 #include <linux/key.h>
27 #include <linux/binfmts.h> 27 #include <linux/binfmts.h>
28 #include <linux/mman.h> 28 #include <linux/mman.h>
29 #include <linux/fs.h> 29 #include <linux/fs.h>
30 #include <linux/nsproxy.h> 30 #include <linux/nsproxy.h>
31 #include <linux/capability.h> 31 #include <linux/capability.h>
32 #include <linux/cpu.h> 32 #include <linux/cpu.h>
33 #include <linux/cgroup.h> 33 #include <linux/cgroup.h>
34 #include <linux/security.h> 34 #include <linux/security.h>
35 #include <linux/swap.h> 35 #include <linux/swap.h>
36 #include <linux/syscalls.h> 36 #include <linux/syscalls.h>
37 #include <linux/jiffies.h> 37 #include <linux/jiffies.h>
38 #include <linux/futex.h> 38 #include <linux/futex.h>
39 #include <linux/task_io_accounting_ops.h> 39 #include <linux/task_io_accounting_ops.h>
40 #include <linux/rcupdate.h> 40 #include <linux/rcupdate.h>
41 #include <linux/ptrace.h> 41 #include <linux/ptrace.h>
42 #include <linux/mount.h> 42 #include <linux/mount.h>
43 #include <linux/audit.h> 43 #include <linux/audit.h>
44 #include <linux/memcontrol.h> 44 #include <linux/memcontrol.h>
45 #include <linux/profile.h> 45 #include <linux/profile.h>
46 #include <linux/rmap.h> 46 #include <linux/rmap.h>
47 #include <linux/acct.h> 47 #include <linux/acct.h>
48 #include <linux/tsacct_kern.h> 48 #include <linux/tsacct_kern.h>
49 #include <linux/cn_proc.h> 49 #include <linux/cn_proc.h>
50 #include <linux/freezer.h> 50 #include <linux/freezer.h>
51 #include <linux/delayacct.h> 51 #include <linux/delayacct.h>
52 #include <linux/taskstats_kern.h> 52 #include <linux/taskstats_kern.h>
53 #include <linux/random.h> 53 #include <linux/random.h>
54 #include <linux/tty.h> 54 #include <linux/tty.h>
55 #include <linux/proc_fs.h> 55 #include <linux/proc_fs.h>
56 #include <linux/blkdev.h> 56 #include <linux/blkdev.h>
57 57
58 #include <asm/pgtable.h> 58 #include <asm/pgtable.h>
59 #include <asm/pgalloc.h> 59 #include <asm/pgalloc.h>
60 #include <asm/uaccess.h> 60 #include <asm/uaccess.h>
61 #include <asm/mmu_context.h> 61 #include <asm/mmu_context.h>
62 #include <asm/cacheflush.h> 62 #include <asm/cacheflush.h>
63 #include <asm/tlbflush.h> 63 #include <asm/tlbflush.h>
64 64
65 /* 65 /*
66 * Protected counters by write_lock_irq(&tasklist_lock) 66 * Protected counters by write_lock_irq(&tasklist_lock)
67 */ 67 */
68 unsigned long total_forks; /* Handle normal Linux uptimes. */ 68 unsigned long total_forks; /* Handle normal Linux uptimes. */
69 int nr_threads; /* The idle threads do not count.. */ 69 int nr_threads; /* The idle threads do not count.. */
70 70
71 int max_threads; /* tunable limit on nr_threads */ 71 int max_threads; /* tunable limit on nr_threads */
72 72
73 DEFINE_PER_CPU(unsigned long, process_counts) = 0; 73 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
74 74
75 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 75 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
76 76
77 int nr_processes(void) 77 int nr_processes(void)
78 { 78 {
79 int cpu; 79 int cpu;
80 int total = 0; 80 int total = 0;
81 81
82 for_each_online_cpu(cpu) 82 for_each_online_cpu(cpu)
83 total += per_cpu(process_counts, cpu); 83 total += per_cpu(process_counts, cpu);
84 84
85 return total; 85 return total;
86 } 86 }
87 87
88 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 88 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
89 # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 89 # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
90 # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 90 # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
91 static struct kmem_cache *task_struct_cachep; 91 static struct kmem_cache *task_struct_cachep;
92 #endif 92 #endif
93 93
94 /* SLAB cache for signal_struct structures (tsk->signal) */ 94 /* SLAB cache for signal_struct structures (tsk->signal) */
95 static struct kmem_cache *signal_cachep; 95 static struct kmem_cache *signal_cachep;
96 96
97 /* SLAB cache for sighand_struct structures (tsk->sighand) */ 97 /* SLAB cache for sighand_struct structures (tsk->sighand) */
98 struct kmem_cache *sighand_cachep; 98 struct kmem_cache *sighand_cachep;
99 99
100 /* SLAB cache for files_struct structures (tsk->files) */ 100 /* SLAB cache for files_struct structures (tsk->files) */
101 struct kmem_cache *files_cachep; 101 struct kmem_cache *files_cachep;
102 102
103 /* SLAB cache for fs_struct structures (tsk->fs) */ 103 /* SLAB cache for fs_struct structures (tsk->fs) */
104 struct kmem_cache *fs_cachep; 104 struct kmem_cache *fs_cachep;
105 105
106 /* SLAB cache for vm_area_struct structures */ 106 /* SLAB cache for vm_area_struct structures */
107 struct kmem_cache *vm_area_cachep; 107 struct kmem_cache *vm_area_cachep;
108 108
109 /* SLAB cache for mm_struct structures (tsk->mm) */ 109 /* SLAB cache for mm_struct structures (tsk->mm) */
110 static struct kmem_cache *mm_cachep; 110 static struct kmem_cache *mm_cachep;
111 111
112 void free_task(struct task_struct *tsk) 112 void free_task(struct task_struct *tsk)
113 { 113 {
114 prop_local_destroy_single(&tsk->dirties); 114 prop_local_destroy_single(&tsk->dirties);
115 free_thread_info(tsk->stack); 115 free_thread_info(tsk->stack);
116 rt_mutex_debug_task_free(tsk); 116 rt_mutex_debug_task_free(tsk);
117 free_task_struct(tsk); 117 free_task_struct(tsk);
118 } 118 }
119 EXPORT_SYMBOL(free_task); 119 EXPORT_SYMBOL(free_task);
120 120
121 void __put_task_struct(struct task_struct *tsk) 121 void __put_task_struct(struct task_struct *tsk)
122 { 122 {
123 WARN_ON(!tsk->exit_state); 123 WARN_ON(!tsk->exit_state);
124 WARN_ON(atomic_read(&tsk->usage)); 124 WARN_ON(atomic_read(&tsk->usage));
125 WARN_ON(tsk == current); 125 WARN_ON(tsk == current);
126 126
127 security_task_free(tsk); 127 security_task_free(tsk);
128 free_uid(tsk->user); 128 free_uid(tsk->user);
129 put_group_info(tsk->group_info); 129 put_group_info(tsk->group_info);
130 delayacct_tsk_free(tsk); 130 delayacct_tsk_free(tsk);
131 131
132 if (!profile_handoff_task(tsk)) 132 if (!profile_handoff_task(tsk))
133 free_task(tsk); 133 free_task(tsk);
134 } 134 }
135 135
136 /* 136 /*
137 * macro override instead of weak attribute alias, to workaround 137 * macro override instead of weak attribute alias, to workaround
138 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. 138 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
139 */ 139 */
140 #ifndef arch_task_cache_init 140 #ifndef arch_task_cache_init
141 #define arch_task_cache_init() 141 #define arch_task_cache_init()
142 #endif 142 #endif
143 143
144 void __init fork_init(unsigned long mempages) 144 void __init fork_init(unsigned long mempages)
145 { 145 {
146 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 146 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
147 #ifndef ARCH_MIN_TASKALIGN 147 #ifndef ARCH_MIN_TASKALIGN
148 #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES 148 #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
149 #endif 149 #endif
150 /* create a slab on which task_structs can be allocated */ 150 /* create a slab on which task_structs can be allocated */
151 task_struct_cachep = 151 task_struct_cachep =
152 kmem_cache_create("task_struct", sizeof(struct task_struct), 152 kmem_cache_create("task_struct", sizeof(struct task_struct),
153 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 153 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
154 #endif 154 #endif
155 155
156 /* do the arch specific task caches init */ 156 /* do the arch specific task caches init */
157 arch_task_cache_init(); 157 arch_task_cache_init();
158 158
159 /* 159 /*
160 * The default maximum number of threads is set to a safe 160 * The default maximum number of threads is set to a safe
161 * value: the thread structures can take up at most half 161 * value: the thread structures can take up at most half
162 * of memory. 162 * of memory.
163 */ 163 */
164 max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); 164 max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
165 165
166 /* 166 /*
167 * we need to allow at least 20 threads to boot a system 167 * we need to allow at least 20 threads to boot a system
168 */ 168 */
169 if(max_threads < 20) 169 if(max_threads < 20)
170 max_threads = 20; 170 max_threads = 20;
171 171
172 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; 172 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
173 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; 173 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
174 init_task.signal->rlim[RLIMIT_SIGPENDING] = 174 init_task.signal->rlim[RLIMIT_SIGPENDING] =
175 init_task.signal->rlim[RLIMIT_NPROC]; 175 init_task.signal->rlim[RLIMIT_NPROC];
176 } 176 }
177 177
178 int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, 178 int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
179 struct task_struct *src) 179 struct task_struct *src)
180 { 180 {
181 *dst = *src; 181 *dst = *src;
182 return 0; 182 return 0;
183 } 183 }
184 184
185 static struct task_struct *dup_task_struct(struct task_struct *orig) 185 static struct task_struct *dup_task_struct(struct task_struct *orig)
186 { 186 {
187 struct task_struct *tsk; 187 struct task_struct *tsk;
188 struct thread_info *ti; 188 struct thread_info *ti;
189 int err; 189 int err;
190 190
191 prepare_to_copy(orig); 191 prepare_to_copy(orig);
192 192
193 tsk = alloc_task_struct(); 193 tsk = alloc_task_struct();
194 if (!tsk) 194 if (!tsk)
195 return NULL; 195 return NULL;
196 196
197 ti = alloc_thread_info(tsk); 197 ti = alloc_thread_info(tsk);
198 if (!ti) { 198 if (!ti) {
199 free_task_struct(tsk); 199 free_task_struct(tsk);
200 return NULL; 200 return NULL;
201 } 201 }
202 202
203 err = arch_dup_task_struct(tsk, orig); 203 err = arch_dup_task_struct(tsk, orig);
204 if (err) 204 if (err)
205 goto out; 205 goto out;
206 206
207 tsk->stack = ti; 207 tsk->stack = ti;
208 208
209 err = prop_local_init_single(&tsk->dirties); 209 err = prop_local_init_single(&tsk->dirties);
210 if (err) 210 if (err)
211 goto out; 211 goto out;
212 212
213 setup_thread_stack(tsk, orig); 213 setup_thread_stack(tsk, orig);
214 214
215 #ifdef CONFIG_CC_STACKPROTECTOR 215 #ifdef CONFIG_CC_STACKPROTECTOR
216 tsk->stack_canary = get_random_int(); 216 tsk->stack_canary = get_random_int();
217 #endif 217 #endif
218 218
219 /* One for us, one for whoever does the "release_task()" (usually parent) */ 219 /* One for us, one for whoever does the "release_task()" (usually parent) */
220 atomic_set(&tsk->usage,2); 220 atomic_set(&tsk->usage,2);
221 atomic_set(&tsk->fs_excl, 0); 221 atomic_set(&tsk->fs_excl, 0);
222 #ifdef CONFIG_BLK_DEV_IO_TRACE 222 #ifdef CONFIG_BLK_DEV_IO_TRACE
223 tsk->btrace_seq = 0; 223 tsk->btrace_seq = 0;
224 #endif 224 #endif
225 tsk->splice_pipe = NULL; 225 tsk->splice_pipe = NULL;
226 return tsk; 226 return tsk;
227 227
228 out: 228 out:
229 free_thread_info(ti); 229 free_thread_info(ti);
230 free_task_struct(tsk); 230 free_task_struct(tsk);
231 return NULL; 231 return NULL;
232 } 232 }
233 233
234 #ifdef CONFIG_MMU 234 #ifdef CONFIG_MMU
235 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 235 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
236 { 236 {
237 struct vm_area_struct *mpnt, *tmp, **pprev; 237 struct vm_area_struct *mpnt, *tmp, **pprev;
238 struct rb_node **rb_link, *rb_parent; 238 struct rb_node **rb_link, *rb_parent;
239 int retval; 239 int retval;
240 unsigned long charge; 240 unsigned long charge;
241 struct mempolicy *pol; 241 struct mempolicy *pol;
242 242
243 down_write(&oldmm->mmap_sem); 243 down_write(&oldmm->mmap_sem);
244 flush_cache_dup_mm(oldmm); 244 flush_cache_dup_mm(oldmm);
245 /* 245 /*
246 * Not linked in yet - no deadlock potential: 246 * Not linked in yet - no deadlock potential:
247 */ 247 */
248 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); 248 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
249 249
250 mm->locked_vm = 0; 250 mm->locked_vm = 0;
251 mm->mmap = NULL; 251 mm->mmap = NULL;
252 mm->mmap_cache = NULL; 252 mm->mmap_cache = NULL;
253 mm->free_area_cache = oldmm->mmap_base; 253 mm->free_area_cache = oldmm->mmap_base;
254 mm->cached_hole_size = ~0UL; 254 mm->cached_hole_size = ~0UL;
255 mm->map_count = 0; 255 mm->map_count = 0;
256 cpus_clear(mm->cpu_vm_mask); 256 cpus_clear(mm->cpu_vm_mask);
257 mm->mm_rb = RB_ROOT; 257 mm->mm_rb = RB_ROOT;
258 rb_link = &mm->mm_rb.rb_node; 258 rb_link = &mm->mm_rb.rb_node;
259 rb_parent = NULL; 259 rb_parent = NULL;
260 pprev = &mm->mmap; 260 pprev = &mm->mmap;
261 261
262 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 262 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
263 struct file *file; 263 struct file *file;
264 264
265 if (mpnt->vm_flags & VM_DONTCOPY) { 265 if (mpnt->vm_flags & VM_DONTCOPY) {
266 long pages = vma_pages(mpnt); 266 long pages = vma_pages(mpnt);
267 mm->total_vm -= pages; 267 mm->total_vm -= pages;
268 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 268 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
269 -pages); 269 -pages);
270 continue; 270 continue;
271 } 271 }
272 charge = 0; 272 charge = 0;
273 if (mpnt->vm_flags & VM_ACCOUNT) { 273 if (mpnt->vm_flags & VM_ACCOUNT) {
274 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 274 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
275 if (security_vm_enough_memory(len)) 275 if (security_vm_enough_memory(len))
276 goto fail_nomem; 276 goto fail_nomem;
277 charge = len; 277 charge = len;
278 } 278 }
279 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 279 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
280 if (!tmp) 280 if (!tmp)
281 goto fail_nomem; 281 goto fail_nomem;
282 *tmp = *mpnt; 282 *tmp = *mpnt;
283 pol = mpol_dup(vma_policy(mpnt)); 283 pol = mpol_dup(vma_policy(mpnt));
284 retval = PTR_ERR(pol); 284 retval = PTR_ERR(pol);
285 if (IS_ERR(pol)) 285 if (IS_ERR(pol))
286 goto fail_nomem_policy; 286 goto fail_nomem_policy;
287 vma_set_policy(tmp, pol); 287 vma_set_policy(tmp, pol);
288 tmp->vm_flags &= ~VM_LOCKED; 288 tmp->vm_flags &= ~VM_LOCKED;
289 tmp->vm_mm = mm; 289 tmp->vm_mm = mm;
290 tmp->vm_next = NULL; 290 tmp->vm_next = NULL;
291 anon_vma_link(tmp); 291 anon_vma_link(tmp);
292 file = tmp->vm_file; 292 file = tmp->vm_file;
293 if (file) { 293 if (file) {
294 struct inode *inode = file->f_path.dentry->d_inode; 294 struct inode *inode = file->f_path.dentry->d_inode;
295 get_file(file); 295 get_file(file);
296 if (tmp->vm_flags & VM_DENYWRITE) 296 if (tmp->vm_flags & VM_DENYWRITE)
297 atomic_dec(&inode->i_writecount); 297 atomic_dec(&inode->i_writecount);
298 298
299 /* insert tmp into the share list, just after mpnt */ 299 /* insert tmp into the share list, just after mpnt */
300 spin_lock(&file->f_mapping->i_mmap_lock); 300 spin_lock(&file->f_mapping->i_mmap_lock);
301 tmp->vm_truncate_count = mpnt->vm_truncate_count; 301 tmp->vm_truncate_count = mpnt->vm_truncate_count;
302 flush_dcache_mmap_lock(file->f_mapping); 302 flush_dcache_mmap_lock(file->f_mapping);
303 vma_prio_tree_add(tmp, mpnt); 303 vma_prio_tree_add(tmp, mpnt);
304 flush_dcache_mmap_unlock(file->f_mapping); 304 flush_dcache_mmap_unlock(file->f_mapping);
305 spin_unlock(&file->f_mapping->i_mmap_lock); 305 spin_unlock(&file->f_mapping->i_mmap_lock);
306 } 306 }
307 307
308 /* 308 /*
309 * Link in the new vma and copy the page table entries. 309 * Link in the new vma and copy the page table entries.
310 */ 310 */
311 *pprev = tmp; 311 *pprev = tmp;
312 pprev = &tmp->vm_next; 312 pprev = &tmp->vm_next;
313 313
314 __vma_link_rb(mm, tmp, rb_link, rb_parent); 314 __vma_link_rb(mm, tmp, rb_link, rb_parent);
315 rb_link = &tmp->vm_rb.rb_right; 315 rb_link = &tmp->vm_rb.rb_right;
316 rb_parent = &tmp->vm_rb; 316 rb_parent = &tmp->vm_rb;
317 317
318 mm->map_count++; 318 mm->map_count++;
319 retval = copy_page_range(mm, oldmm, mpnt); 319 retval = copy_page_range(mm, oldmm, mpnt);
320 320
321 if (tmp->vm_ops && tmp->vm_ops->open) 321 if (tmp->vm_ops && tmp->vm_ops->open)
322 tmp->vm_ops->open(tmp); 322 tmp->vm_ops->open(tmp);
323 323
324 if (retval) 324 if (retval)
325 goto out; 325 goto out;
326 } 326 }
327 /* a new mm has just been created */ 327 /* a new mm has just been created */
328 arch_dup_mmap(oldmm, mm); 328 arch_dup_mmap(oldmm, mm);
329 retval = 0; 329 retval = 0;
330 out: 330 out:
331 up_write(&mm->mmap_sem); 331 up_write(&mm->mmap_sem);
332 flush_tlb_mm(oldmm); 332 flush_tlb_mm(oldmm);
333 up_write(&oldmm->mmap_sem); 333 up_write(&oldmm->mmap_sem);
334 return retval; 334 return retval;
335 fail_nomem_policy: 335 fail_nomem_policy:
336 kmem_cache_free(vm_area_cachep, tmp); 336 kmem_cache_free(vm_area_cachep, tmp);
337 fail_nomem: 337 fail_nomem:
338 retval = -ENOMEM; 338 retval = -ENOMEM;
339 vm_unacct_memory(charge); 339 vm_unacct_memory(charge);
340 goto out; 340 goto out;
341 } 341 }
342 342
343 static inline int mm_alloc_pgd(struct mm_struct * mm) 343 static inline int mm_alloc_pgd(struct mm_struct * mm)
344 { 344 {
345 mm->pgd = pgd_alloc(mm); 345 mm->pgd = pgd_alloc(mm);
346 if (unlikely(!mm->pgd)) 346 if (unlikely(!mm->pgd))
347 return -ENOMEM; 347 return -ENOMEM;
348 return 0; 348 return 0;
349 } 349 }
350 350
351 static inline void mm_free_pgd(struct mm_struct * mm) 351 static inline void mm_free_pgd(struct mm_struct * mm)
352 { 352 {
353 pgd_free(mm, mm->pgd); 353 pgd_free(mm, mm->pgd);
354 } 354 }
355 #else 355 #else
356 #define dup_mmap(mm, oldmm) (0) 356 #define dup_mmap(mm, oldmm) (0)
357 #define mm_alloc_pgd(mm) (0) 357 #define mm_alloc_pgd(mm) (0)
358 #define mm_free_pgd(mm) 358 #define mm_free_pgd(mm)
359 #endif /* CONFIG_MMU */ 359 #endif /* CONFIG_MMU */
360 360
361 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 361 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
362 362
363 #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) 363 #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
364 #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 364 #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
365 365
366 #include <linux/init_task.h> 366 #include <linux/init_task.h>
367 367
368 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 368 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
369 { 369 {
370 atomic_set(&mm->mm_users, 1); 370 atomic_set(&mm->mm_users, 1);
371 atomic_set(&mm->mm_count, 1); 371 atomic_set(&mm->mm_count, 1);
372 init_rwsem(&mm->mmap_sem); 372 init_rwsem(&mm->mmap_sem);
373 INIT_LIST_HEAD(&mm->mmlist); 373 INIT_LIST_HEAD(&mm->mmlist);
374 mm->flags = (current->mm) ? current->mm->flags 374 mm->flags = (current->mm) ? current->mm->flags
375 : MMF_DUMP_FILTER_DEFAULT; 375 : MMF_DUMP_FILTER_DEFAULT;
376 mm->core_waiters = 0; 376 mm->core_waiters = 0;
377 mm->nr_ptes = 0; 377 mm->nr_ptes = 0;
378 set_mm_counter(mm, file_rss, 0); 378 set_mm_counter(mm, file_rss, 0);
379 set_mm_counter(mm, anon_rss, 0); 379 set_mm_counter(mm, anon_rss, 0);
380 spin_lock_init(&mm->page_table_lock); 380 spin_lock_init(&mm->page_table_lock);
381 rwlock_init(&mm->ioctx_list_lock); 381 rwlock_init(&mm->ioctx_list_lock);
382 mm->ioctx_list = NULL; 382 mm->ioctx_list = NULL;
383 mm->free_area_cache = TASK_UNMAPPED_BASE; 383 mm->free_area_cache = TASK_UNMAPPED_BASE;
384 mm->cached_hole_size = ~0UL; 384 mm->cached_hole_size = ~0UL;
385 mm_init_owner(mm, p); 385 mm_init_owner(mm, p);
386 386
387 if (likely(!mm_alloc_pgd(mm))) { 387 if (likely(!mm_alloc_pgd(mm))) {
388 mm->def_flags = 0; 388 mm->def_flags = 0;
389 return mm; 389 return mm;
390 } 390 }
391 391
392 free_mm(mm); 392 free_mm(mm);
393 return NULL; 393 return NULL;
394 } 394 }
395 395
396 /* 396 /*
397 * Allocate and initialize an mm_struct. 397 * Allocate and initialize an mm_struct.
398 */ 398 */
399 struct mm_struct * mm_alloc(void) 399 struct mm_struct * mm_alloc(void)
400 { 400 {
401 struct mm_struct * mm; 401 struct mm_struct * mm;
402 402
403 mm = allocate_mm(); 403 mm = allocate_mm();
404 if (mm) { 404 if (mm) {
405 memset(mm, 0, sizeof(*mm)); 405 memset(mm, 0, sizeof(*mm));
406 mm = mm_init(mm, current); 406 mm = mm_init(mm, current);
407 } 407 }
408 return mm; 408 return mm;
409 } 409 }
410 410
411 /* 411 /*
412 * Called when the last reference to the mm 412 * Called when the last reference to the mm
413 * is dropped: either by a lazy thread or by 413 * is dropped: either by a lazy thread or by
414 * mmput. Free the page directory and the mm. 414 * mmput. Free the page directory and the mm.
415 */ 415 */
416 void __mmdrop(struct mm_struct *mm) 416 void __mmdrop(struct mm_struct *mm)
417 { 417 {
418 BUG_ON(mm == &init_mm); 418 BUG_ON(mm == &init_mm);
419 mm_free_pgd(mm); 419 mm_free_pgd(mm);
420 destroy_context(mm); 420 destroy_context(mm);
421 free_mm(mm); 421 free_mm(mm);
422 } 422 }
423 EXPORT_SYMBOL_GPL(__mmdrop); 423 EXPORT_SYMBOL_GPL(__mmdrop);
424 424
425 /* 425 /*
426 * Decrement the use count and release all resources for an mm. 426 * Decrement the use count and release all resources for an mm.
427 */ 427 */
428 void mmput(struct mm_struct *mm) 428 void mmput(struct mm_struct *mm)
429 { 429 {
430 might_sleep(); 430 might_sleep();
431 431
432 if (atomic_dec_and_test(&mm->mm_users)) { 432 if (atomic_dec_and_test(&mm->mm_users)) {
433 exit_aio(mm); 433 exit_aio(mm);
434 exit_mmap(mm); 434 exit_mmap(mm);
435 set_mm_exe_file(mm, NULL); 435 set_mm_exe_file(mm, NULL);
436 if (!list_empty(&mm->mmlist)) { 436 if (!list_empty(&mm->mmlist)) {
437 spin_lock(&mmlist_lock); 437 spin_lock(&mmlist_lock);
438 list_del(&mm->mmlist); 438 list_del(&mm->mmlist);
439 spin_unlock(&mmlist_lock); 439 spin_unlock(&mmlist_lock);
440 } 440 }
441 put_swap_token(mm); 441 put_swap_token(mm);
442 mmdrop(mm); 442 mmdrop(mm);
443 } 443 }
444 } 444 }
445 EXPORT_SYMBOL_GPL(mmput); 445 EXPORT_SYMBOL_GPL(mmput);
446 446
447 /** 447 /**
448 * get_task_mm - acquire a reference to the task's mm 448 * get_task_mm - acquire a reference to the task's mm
449 * 449 *
450 * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning 450 * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning
451 * this kernel workthread has transiently adopted a user mm with use_mm, 451 * this kernel workthread has transiently adopted a user mm with use_mm,
452 * to do its AIO) is not set and if so returns a reference to it, after 452 * to do its AIO) is not set and if so returns a reference to it, after
453 * bumping up the use count. User must release the mm via mmput() 453 * bumping up the use count. User must release the mm via mmput()
454 * after use. Typically used by /proc and ptrace. 454 * after use. Typically used by /proc and ptrace.
455 */ 455 */
456 struct mm_struct *get_task_mm(struct task_struct *task) 456 struct mm_struct *get_task_mm(struct task_struct *task)
457 { 457 {
458 struct mm_struct *mm; 458 struct mm_struct *mm;
459 459
460 task_lock(task); 460 task_lock(task);
461 mm = task->mm; 461 mm = task->mm;
462 if (mm) { 462 if (mm) {
463 if (task->flags & PF_BORROWED_MM) 463 if (task->flags & PF_BORROWED_MM)
464 mm = NULL; 464 mm = NULL;
465 else 465 else
466 atomic_inc(&mm->mm_users); 466 atomic_inc(&mm->mm_users);
467 } 467 }
468 task_unlock(task); 468 task_unlock(task);
469 return mm; 469 return mm;
470 } 470 }
471 EXPORT_SYMBOL_GPL(get_task_mm); 471 EXPORT_SYMBOL_GPL(get_task_mm);
472 472
473 /* Please note the differences between mmput and mm_release. 473 /* Please note the differences between mmput and mm_release.
474 * mmput is called whenever we stop holding onto a mm_struct, 474 * mmput is called whenever we stop holding onto a mm_struct,
475 * error success whatever. 475 * error success whatever.
476 * 476 *
477 * mm_release is called after a mm_struct has been removed 477 * mm_release is called after a mm_struct has been removed
478 * from the current process. 478 * from the current process.
479 * 479 *
480 * This difference is important for error handling, when we 480 * This difference is important for error handling, when we
481 * only half set up a mm_struct for a new process and need to restore 481 * only half set up a mm_struct for a new process and need to restore
482 * the old one. Because we mmput the new mm_struct before 482 * the old one. Because we mmput the new mm_struct before
483 * restoring the old one. . . 483 * restoring the old one. . .
484 * Eric Biederman 10 January 1998 484 * Eric Biederman 10 January 1998
485 */ 485 */
486 void mm_release(struct task_struct *tsk, struct mm_struct *mm) 486 void mm_release(struct task_struct *tsk, struct mm_struct *mm)
487 { 487 {
488 struct completion *vfork_done = tsk->vfork_done; 488 struct completion *vfork_done = tsk->vfork_done;
489 489
490 /* Get rid of any cached register state */ 490 /* Get rid of any cached register state */
491 deactivate_mm(tsk, mm); 491 deactivate_mm(tsk, mm);
492 492
493 /* notify parent sleeping on vfork() */ 493 /* notify parent sleeping on vfork() */
494 if (vfork_done) { 494 if (vfork_done) {
495 tsk->vfork_done = NULL; 495 tsk->vfork_done = NULL;
496 complete(vfork_done); 496 complete(vfork_done);
497 } 497 }
498 498
499 /* 499 /*
500 * If we're exiting normally, clear a user-space tid field if 500 * If we're exiting normally, clear a user-space tid field if
501 * requested. We leave this alone when dying by signal, to leave 501 * requested. We leave this alone when dying by signal, to leave
502 * the value intact in a core dump, and to save the unnecessary 502 * the value intact in a core dump, and to save the unnecessary
503 * trouble otherwise. Userland only wants this done for a sys_exit. 503 * trouble otherwise. Userland only wants this done for a sys_exit.
504 */ 504 */
505 if (tsk->clear_child_tid 505 if (tsk->clear_child_tid
506 && !(tsk->flags & PF_SIGNALED) 506 && !(tsk->flags & PF_SIGNALED)
507 && atomic_read(&mm->mm_users) > 1) { 507 && atomic_read(&mm->mm_users) > 1) {
508 u32 __user * tidptr = tsk->clear_child_tid; 508 u32 __user * tidptr = tsk->clear_child_tid;
509 tsk->clear_child_tid = NULL; 509 tsk->clear_child_tid = NULL;
510 510
511 /* 511 /*
512 * We don't check the error code - if userspace has 512 * We don't check the error code - if userspace has
513 * not set up a proper pointer then tough luck. 513 * not set up a proper pointer then tough luck.
514 */ 514 */
515 put_user(0, tidptr); 515 put_user(0, tidptr);
516 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); 516 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
517 } 517 }
518 } 518 }
519 519
520 /* 520 /*
521 * Allocate a new mm structure and copy contents from the 521 * Allocate a new mm structure and copy contents from the
522 * mm structure of the passed in task structure. 522 * mm structure of the passed in task structure.
523 */ 523 */
524 struct mm_struct *dup_mm(struct task_struct *tsk) 524 struct mm_struct *dup_mm(struct task_struct *tsk)
525 { 525 {
526 struct mm_struct *mm, *oldmm = current->mm; 526 struct mm_struct *mm, *oldmm = current->mm;
527 int err; 527 int err;
528 528
529 if (!oldmm) 529 if (!oldmm)
530 return NULL; 530 return NULL;
531 531
532 mm = allocate_mm(); 532 mm = allocate_mm();
533 if (!mm) 533 if (!mm)
534 goto fail_nomem; 534 goto fail_nomem;
535 535
536 memcpy(mm, oldmm, sizeof(*mm)); 536 memcpy(mm, oldmm, sizeof(*mm));
537 537
538 /* Initializing for Swap token stuff */ 538 /* Initializing for Swap token stuff */
539 mm->token_priority = 0; 539 mm->token_priority = 0;
540 mm->last_interval = 0; 540 mm->last_interval = 0;
541 541
542 if (!mm_init(mm, tsk)) 542 if (!mm_init(mm, tsk))
543 goto fail_nomem; 543 goto fail_nomem;
544 544
545 if (init_new_context(tsk, mm)) 545 if (init_new_context(tsk, mm))
546 goto fail_nocontext; 546 goto fail_nocontext;
547 547
548 dup_mm_exe_file(oldmm, mm); 548 dup_mm_exe_file(oldmm, mm);
549 549
550 err = dup_mmap(mm, oldmm); 550 err = dup_mmap(mm, oldmm);
551 if (err) 551 if (err)
552 goto free_pt; 552 goto free_pt;
553 553
554 mm->hiwater_rss = get_mm_rss(mm); 554 mm->hiwater_rss = get_mm_rss(mm);
555 mm->hiwater_vm = mm->total_vm; 555 mm->hiwater_vm = mm->total_vm;
556 556
557 return mm; 557 return mm;
558 558
559 free_pt: 559 free_pt:
560 mmput(mm); 560 mmput(mm);
561 561
562 fail_nomem: 562 fail_nomem:
563 return NULL; 563 return NULL;
564 564
565 fail_nocontext: 565 fail_nocontext:
566 /* 566 /*
567 * If init_new_context() failed, we cannot use mmput() to free the mm 567 * If init_new_context() failed, we cannot use mmput() to free the mm
568 * because it calls destroy_context() 568 * because it calls destroy_context()
569 */ 569 */
570 mm_free_pgd(mm); 570 mm_free_pgd(mm);
571 free_mm(mm); 571 free_mm(mm);
572 return NULL; 572 return NULL;
573 } 573 }
574 574
575 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) 575 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
576 { 576 {
577 struct mm_struct * mm, *oldmm; 577 struct mm_struct * mm, *oldmm;
578 int retval; 578 int retval;
579 579
580 tsk->min_flt = tsk->maj_flt = 0; 580 tsk->min_flt = tsk->maj_flt = 0;
581 tsk->nvcsw = tsk->nivcsw = 0; 581 tsk->nvcsw = tsk->nivcsw = 0;
582 582
583 tsk->mm = NULL; 583 tsk->mm = NULL;
584 tsk->active_mm = NULL; 584 tsk->active_mm = NULL;
585 585
586 /* 586 /*
587 * Are we cloning a kernel thread? 587 * Are we cloning a kernel thread?
588 * 588 *
589 * We need to steal a active VM for that.. 589 * We need to steal a active VM for that..
590 */ 590 */
591 oldmm = current->mm; 591 oldmm = current->mm;
592 if (!oldmm) 592 if (!oldmm)
593 return 0; 593 return 0;
594 594
595 if (clone_flags & CLONE_VM) { 595 if (clone_flags & CLONE_VM) {
596 atomic_inc(&oldmm->mm_users); 596 atomic_inc(&oldmm->mm_users);
597 mm = oldmm; 597 mm = oldmm;
598 goto good_mm; 598 goto good_mm;
599 } 599 }
600 600
601 retval = -ENOMEM; 601 retval = -ENOMEM;
602 mm = dup_mm(tsk); 602 mm = dup_mm(tsk);
603 if (!mm) 603 if (!mm)
604 goto fail_nomem; 604 goto fail_nomem;
605 605
606 good_mm: 606 good_mm:
607 /* Initializing for Swap token stuff */ 607 /* Initializing for Swap token stuff */
608 mm->token_priority = 0; 608 mm->token_priority = 0;
609 mm->last_interval = 0; 609 mm->last_interval = 0;
610 610
611 tsk->mm = mm; 611 tsk->mm = mm;
612 tsk->active_mm = mm; 612 tsk->active_mm = mm;
613 return 0; 613 return 0;
614 614
615 fail_nomem: 615 fail_nomem:
616 return retval; 616 return retval;
617 } 617 }
618 618
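copy_mm() only shares the address space when CLONE_VM is set; a plain fork() goes through dup_mm() and the child ends up with its own (copy-on-write) copy. A minimal user-space sketch of those semantics, assuming an ordinary POSIX fork(); the variable name and value are arbitrary:

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

static int counter;		/* inherited as a private, copy-on-write page */

int main(void)
{
	pid_t pid = fork();	/* no CLONE_VM, so copy_mm() uses dup_mm() */
	if (pid == 0) {
		counter = 42;	/* touches only the child's copy */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("parent still sees counter = %d\n", counter);	/* prints 0 */
	return 0;
}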
619 static struct fs_struct *__copy_fs_struct(struct fs_struct *old) 619 static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
620 { 620 {
621 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); 621 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
622 /* We don't need to lock fs - think why ;-) */ 622 /* We don't need to lock fs - think why ;-) */
623 if (fs) { 623 if (fs) {
624 atomic_set(&fs->count, 1); 624 atomic_set(&fs->count, 1);
625 rwlock_init(&fs->lock); 625 rwlock_init(&fs->lock);
626 fs->umask = old->umask; 626 fs->umask = old->umask;
627 read_lock(&old->lock); 627 read_lock(&old->lock);
628 fs->root = old->root; 628 fs->root = old->root;
629 path_get(&old->root); 629 path_get(&old->root);
630 fs->pwd = old->pwd; 630 fs->pwd = old->pwd;
631 path_get(&old->pwd); 631 path_get(&old->pwd);
632 if (old->altroot.dentry) { 632 if (old->altroot.dentry) {
633 fs->altroot = old->altroot; 633 fs->altroot = old->altroot;
634 path_get(&old->altroot); 634 path_get(&old->altroot);
635 } else { 635 } else {
636 fs->altroot.mnt = NULL; 636 fs->altroot.mnt = NULL;
637 fs->altroot.dentry = NULL; 637 fs->altroot.dentry = NULL;
638 } 638 }
639 read_unlock(&old->lock); 639 read_unlock(&old->lock);
640 } 640 }
641 return fs; 641 return fs;
642 } 642 }
643 643
644 struct fs_struct *copy_fs_struct(struct fs_struct *old) 644 struct fs_struct *copy_fs_struct(struct fs_struct *old)
645 { 645 {
646 return __copy_fs_struct(old); 646 return __copy_fs_struct(old);
647 } 647 }
648 648
649 EXPORT_SYMBOL_GPL(copy_fs_struct); 649 EXPORT_SYMBOL_GPL(copy_fs_struct);
650 650
651 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) 651 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
652 { 652 {
653 if (clone_flags & CLONE_FS) { 653 if (clone_flags & CLONE_FS) {
654 atomic_inc(&current->fs->count); 654 atomic_inc(&current->fs->count);
655 return 0; 655 return 0;
656 } 656 }
657 tsk->fs = __copy_fs_struct(current->fs); 657 tsk->fs = __copy_fs_struct(current->fs);
658 if (!tsk->fs) 658 if (!tsk->fs)
659 return -ENOMEM; 659 return -ENOMEM;
660 return 0; 660 return 0;
661 } 661 }
662 662
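Without CLONE_FS, copy_fs() above hands the child its own fs_struct, so a chdir() in the child leaves the parent's working directory untouched. A short sketch under that assumption; /tmp is just an example directory:

#include <stdio.h>
#include <unistd.h>
#include <limits.h>
#include <sys/wait.h>

int main(void)
{
	char buf[PATH_MAX];

	pid_t pid = fork();	/* no CLONE_FS: __copy_fs_struct() makes a copy */
	if (pid == 0) {
		chdir("/tmp");	/* updates only the child's ->pwd */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	/* The parent's cwd is unchanged by the child's chdir(). */
	printf("parent cwd: %s\n", getcwd(buf, sizeof(buf)));
	return 0;
}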
663 static int count_open_files(struct fdtable *fdt)
664 {
665 int size = fdt->max_fds;
666 int i;
667
668 /* Find the last open fd */
669 for (i = size/(8*sizeof(long)); i > 0; ) {
670 if (fdt->open_fds->fds_bits[--i])
671 break;
672 }
673 i = (i+1) * 8 * sizeof(long);
674 return i;
675 }
676
677 static struct files_struct *alloc_files(void)
678 {
679 struct files_struct *newf;
680 struct fdtable *fdt;
681
682 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
683 if (!newf)
684 goto out;
685
686 atomic_set(&newf->count, 1);
687
688 spin_lock_init(&newf->file_lock);
689 newf->next_fd = 0;
690 fdt = &newf->fdtab;
691 fdt->max_fds = NR_OPEN_DEFAULT;
692 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
693 fdt->open_fds = (fd_set *)&newf->open_fds_init;
694 fdt->fd = &newf->fd_array[0];
695 INIT_RCU_HEAD(&fdt->rcu);
696 fdt->next = NULL;
697 rcu_assign_pointer(newf->fdt, fdt);
698 out:
699 return newf;
700 }
701
702 /*
703 * Allocate a new files structure and copy contents from the
704 * passed in files structure.
705 * errorp will be valid only when the returned files_struct is NULL.
706 */
707 static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
708 {
709 struct files_struct *newf;
710 struct file **old_fds, **new_fds;
711 int open_files, size, i;
712 struct fdtable *old_fdt, *new_fdt;
713
714 *errorp = -ENOMEM;
715 newf = alloc_files();
716 if (!newf)
717 goto out;
718
719 spin_lock(&oldf->file_lock);
720 old_fdt = files_fdtable(oldf);
721 new_fdt = files_fdtable(newf);
722 open_files = count_open_files(old_fdt);
723
724 /*
725 * Check whether we need to allocate a larger fd array and fd set.
726 * Note: we're not a clone task, so the open count won't change.
727 */
728 if (open_files > new_fdt->max_fds) {
729 new_fdt->max_fds = 0;
730 spin_unlock(&oldf->file_lock);
731 spin_lock(&newf->file_lock);
732 *errorp = expand_files(newf, open_files-1);
733 spin_unlock(&newf->file_lock);
734 if (*errorp < 0)
735 goto out_release;
736 new_fdt = files_fdtable(newf);
737 /*
738 * Reacquire the oldf lock and a pointer to its fd table;
739 * it may have been replaced by a bigger one in the meantime,
740 * so we need the latest pointer.
741 */
742 spin_lock(&oldf->file_lock);
743 old_fdt = files_fdtable(oldf);
744 }
745
746 old_fds = old_fdt->fd;
747 new_fds = new_fdt->fd;
748
749 memcpy(new_fdt->open_fds->fds_bits,
750 old_fdt->open_fds->fds_bits, open_files/8);
751 memcpy(new_fdt->close_on_exec->fds_bits,
752 old_fdt->close_on_exec->fds_bits, open_files/8);
753
754 for (i = open_files; i != 0; i--) {
755 struct file *f = *old_fds++;
756 if (f) {
757 get_file(f);
758 } else {
759 /*
760 * The fd may be claimed in the fd bitmap but not yet
761 * instantiated in the files array if a sibling thread
762 * is partway through open(). So make sure that this
763 * fd is available to the new process.
764 */
765 FD_CLR(open_files - i, new_fdt->open_fds);
766 }
767 rcu_assign_pointer(*new_fds++, f);
768 }
769 spin_unlock(&oldf->file_lock);
770
771 /* compute the remainder to be cleared */
772 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
773
774 /* This is long-word aligned, thus could use an optimized version */
775 memset(new_fds, 0, size);
776
777 if (new_fdt->max_fds > open_files) {
778 int left = (new_fdt->max_fds-open_files)/8;
779 int start = open_files / (8 * sizeof(unsigned long));
780
781 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
782 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
783 }
784
785 return newf;
786
787 out_release:
788 kmem_cache_free(files_cachep, newf);
789 out:
790 return NULL;
791 }
792
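dup_fd(), the function this commit begins moving to fs/file.c, copies the descriptor table and takes a reference on each file with get_file(); the open file descriptions themselves are not copied. So after fork() the child has private slots but shares offsets with the parent. A hedged illustration, assuming /etc/hostname is a readable file on the test machine:

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);
	if (fd < 0)
		return 1;

	pid_t pid = fork();	/* copy_files() -> dup_fd() copies the table */
	if (pid == 0) {
		/* The child's slot is a copy, but it points at the same
		 * struct file, so the file offset is shared. */
		lseek(fd, 3, SEEK_SET);
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	/* The parent sees the child's seek: only the fd table was duplicated. */
	printf("offset seen by parent: %ld\n", (long)lseek(fd, 0, SEEK_CUR));
	close(fd);
	return 0;
}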
793 static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 663 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
794 { 664 {
795 struct files_struct *oldf, *newf; 665 struct files_struct *oldf, *newf;
796 int error = 0; 666 int error = 0;
797 667
798 /* 668 /*
799 * A background process may not have any files ... 669 * A background process may not have any files ...
800 */ 670 */
801 oldf = current->files; 671 oldf = current->files;
802 if (!oldf) 672 if (!oldf)
803 goto out; 673 goto out;
804 674
805 if (clone_flags & CLONE_FILES) { 675 if (clone_flags & CLONE_FILES) {
806 atomic_inc(&oldf->count); 676 atomic_inc(&oldf->count);
807 goto out; 677 goto out;
808 } 678 }
809 679
810 newf = dup_fd(oldf, &error); 680 newf = dup_fd(oldf, &error);
811 if (!newf) 681 if (!newf)
812 goto out; 682 goto out;
813 683
814 tsk->files = newf; 684 tsk->files = newf;
815 error = 0; 685 error = 0;
816 out: 686 out:
817 return error; 687 return error;
818 } 688 }
819 689
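When CLONE_FILES is set, copy_files() skips dup_fd() entirely and just bumps the reference count, so both tasks operate on one files_struct. A sketch of the difference using the glibc clone() wrapper; the 1 MiB stack and /dev/null are arbitrary choices for illustration:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>

static int child_fn(void *arg)
{
	close(*(int *)arg);	/* removes the entry from the shared table */
	return 0;
}

int main(void)
{
	int fd = open("/dev/null", O_WRONLY);
	char *stack = malloc(1024 * 1024);

	/* CLONE_FILES: parent and child share one files_struct, so the
	 * child's close() is immediately visible to the parent. */
	pid_t pid = clone(child_fn, stack + 1024 * 1024,
			  CLONE_FILES | SIGCHLD, &fd);
	waitpid(pid, NULL, 0);

	if (write(fd, "x", 1) < 0)
		perror("write after child's close");	/* fails with EBADF */

	free(stack);
	return 0;
}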
820 static int copy_io(unsigned long clone_flags, struct task_struct *tsk) 690 static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
821 { 691 {
822 #ifdef CONFIG_BLOCK 692 #ifdef CONFIG_BLOCK
823 struct io_context *ioc = current->io_context; 693 struct io_context *ioc = current->io_context;
824 694
825 if (!ioc) 695 if (!ioc)
826 return 0; 696 return 0;
827 /* 697 /*
828 * Share io context with parent, if CLONE_IO is set 698 * Share io context with parent, if CLONE_IO is set
829 */ 699 */
830 if (clone_flags & CLONE_IO) { 700 if (clone_flags & CLONE_IO) {
831 tsk->io_context = ioc_task_link(ioc); 701 tsk->io_context = ioc_task_link(ioc);
832 if (unlikely(!tsk->io_context)) 702 if (unlikely(!tsk->io_context))
833 return -ENOMEM; 703 return -ENOMEM;
834 } else if (ioprio_valid(ioc->ioprio)) { 704 } else if (ioprio_valid(ioc->ioprio)) {
835 tsk->io_context = alloc_io_context(GFP_KERNEL, -1); 705 tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
836 if (unlikely(!tsk->io_context)) 706 if (unlikely(!tsk->io_context))
837 return -ENOMEM; 707 return -ENOMEM;
838 708
839 tsk->io_context->ioprio = ioc->ioprio; 709 tsk->io_context->ioprio = ioc->ioprio;
840 } 710 }
841 #endif 711 #endif
842 return 0; 712 return 0;
843 } 713 }
844 714
845 static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) 715 static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
846 { 716 {
847 struct sighand_struct *sig; 717 struct sighand_struct *sig;
848 718
849 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { 719 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
850 atomic_inc(&current->sighand->count); 720 atomic_inc(&current->sighand->count);
851 return 0; 721 return 0;
852 } 722 }
853 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); 723 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
854 rcu_assign_pointer(tsk->sighand, sig); 724 rcu_assign_pointer(tsk->sighand, sig);
855 if (!sig) 725 if (!sig)
856 return -ENOMEM; 726 return -ENOMEM;
857 atomic_set(&sig->count, 1); 727 atomic_set(&sig->count, 1);
858 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 728 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
859 return 0; 729 return 0;
860 } 730 }
861 731
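copy_sighand() memcpy()s the parent's action table into the fresh sighand_struct, which is why handlers installed before fork() are still in force in the child. A minimal sketch; SIGUSR1 and the message are arbitrary:

#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

static void on_usr1(int sig)
{
	(void)sig;
	write(STDOUT_FILENO, "child caught SIGUSR1\n", 21);
}

int main(void)
{
	/* Installed before fork(): the child inherits a copy of the
	 * action table via copy_sighand(). */
	signal(SIGUSR1, on_usr1);

	pid_t pid = fork();
	if (pid == 0) {
		raise(SIGUSR1);	/* serviced by the inherited handler */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	return 0;
}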
862 void __cleanup_sighand(struct sighand_struct *sighand) 732 void __cleanup_sighand(struct sighand_struct *sighand)
863 { 733 {
864 if (atomic_dec_and_test(&sighand->count)) 734 if (atomic_dec_and_test(&sighand->count))
865 kmem_cache_free(sighand_cachep, sighand); 735 kmem_cache_free(sighand_cachep, sighand);
866 } 736 }
867 737
868 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) 738 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
869 { 739 {
870 struct signal_struct *sig; 740 struct signal_struct *sig;
871 int ret; 741 int ret;
872 742
873 if (clone_flags & CLONE_THREAD) { 743 if (clone_flags & CLONE_THREAD) {
874 atomic_inc(&current->signal->count); 744 atomic_inc(&current->signal->count);
875 atomic_inc(&current->signal->live); 745 atomic_inc(&current->signal->live);
876 return 0; 746 return 0;
877 } 747 }
878 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 748 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
879 tsk->signal = sig; 749 tsk->signal = sig;
880 if (!sig) 750 if (!sig)
881 return -ENOMEM; 751 return -ENOMEM;
882 752
883 ret = copy_thread_group_keys(tsk); 753 ret = copy_thread_group_keys(tsk);
884 if (ret < 0) { 754 if (ret < 0) {
885 kmem_cache_free(signal_cachep, sig); 755 kmem_cache_free(signal_cachep, sig);
886 return ret; 756 return ret;
887 } 757 }
888 758
889 atomic_set(&sig->count, 1); 759 atomic_set(&sig->count, 1);
890 atomic_set(&sig->live, 1); 760 atomic_set(&sig->live, 1);
891 init_waitqueue_head(&sig->wait_chldexit); 761 init_waitqueue_head(&sig->wait_chldexit);
892 sig->flags = 0; 762 sig->flags = 0;
893 sig->group_exit_code = 0; 763 sig->group_exit_code = 0;
894 sig->group_exit_task = NULL; 764 sig->group_exit_task = NULL;
895 sig->group_stop_count = 0; 765 sig->group_stop_count = 0;
896 sig->curr_target = tsk; 766 sig->curr_target = tsk;
897 init_sigpending(&sig->shared_pending); 767 init_sigpending(&sig->shared_pending);
898 INIT_LIST_HEAD(&sig->posix_timers); 768 INIT_LIST_HEAD(&sig->posix_timers);
899 769
900 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 770 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
901 sig->it_real_incr.tv64 = 0; 771 sig->it_real_incr.tv64 = 0;
902 sig->real_timer.function = it_real_fn; 772 sig->real_timer.function = it_real_fn;
903 773
904 sig->it_virt_expires = cputime_zero; 774 sig->it_virt_expires = cputime_zero;
905 sig->it_virt_incr = cputime_zero; 775 sig->it_virt_incr = cputime_zero;
906 sig->it_prof_expires = cputime_zero; 776 sig->it_prof_expires = cputime_zero;
907 sig->it_prof_incr = cputime_zero; 777 sig->it_prof_incr = cputime_zero;
908 778
909 sig->leader = 0; /* session leadership doesn't inherit */ 779 sig->leader = 0; /* session leadership doesn't inherit */
910 sig->tty_old_pgrp = NULL; 780 sig->tty_old_pgrp = NULL;
911 781
912 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 782 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
913 sig->gtime = cputime_zero; 783 sig->gtime = cputime_zero;
914 sig->cgtime = cputime_zero; 784 sig->cgtime = cputime_zero;
915 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 785 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
916 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 786 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
917 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 787 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
918 sig->sum_sched_runtime = 0; 788 sig->sum_sched_runtime = 0;
919 INIT_LIST_HEAD(&sig->cpu_timers[0]); 789 INIT_LIST_HEAD(&sig->cpu_timers[0]);
920 INIT_LIST_HEAD(&sig->cpu_timers[1]); 790 INIT_LIST_HEAD(&sig->cpu_timers[1]);
921 INIT_LIST_HEAD(&sig->cpu_timers[2]); 791 INIT_LIST_HEAD(&sig->cpu_timers[2]);
922 taskstats_tgid_init(sig); 792 taskstats_tgid_init(sig);
923 793
924 task_lock(current->group_leader); 794 task_lock(current->group_leader);
925 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 795 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
926 task_unlock(current->group_leader); 796 task_unlock(current->group_leader);
927 797
928 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 798 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
929 /* 799 /*
930 * New sole thread in the process gets an expiry time 800 * New sole thread in the process gets an expiry time
931 * of the whole CPU time limit. 801 * of the whole CPU time limit.
932 */ 802 */
933 tsk->it_prof_expires = 803 tsk->it_prof_expires =
934 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 804 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
935 } 805 }
936 acct_init_pacct(&sig->pacct); 806 acct_init_pacct(&sig->pacct);
937 807
938 tty_audit_fork(sig); 808 tty_audit_fork(sig);
939 809
940 return 0; 810 return 0;
941 } 811 }
942 812
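Because a non-CLONE_THREAD child gets a brand-new signal_struct here (it_real_incr zeroed, real_timer freshly initialised, and so on), interval timers are not inherited across fork(). A short sketch of that behaviour; the 5-second one-shot timer is arbitrary:

#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/wait.h>

int main(void)
{
	struct itimerval it = { {0, 0}, {5, 0} };	/* no interval, fires in 5s */

	setitimer(ITIMER_REAL, &it, NULL);

	pid_t pid = fork();	/* copy_signal() builds a fresh signal_struct */
	if (pid == 0) {
		struct itimerval cur;
		getitimer(ITIMER_REAL, &cur);
		/* Prints 0: the child's signal_struct has no armed timer. */
		printf("child it_value: %ld s\n", (long)cur.it_value.tv_sec);
		fflush(stdout);
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	return 0;
}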
943 void __cleanup_signal(struct signal_struct *sig) 813 void __cleanup_signal(struct signal_struct *sig)
944 { 814 {
945 exit_thread_group_keys(sig); 815 exit_thread_group_keys(sig);
946 kmem_cache_free(signal_cachep, sig); 816 kmem_cache_free(signal_cachep, sig);
947 } 817 }
948 818
949 static void cleanup_signal(struct task_struct *tsk) 819 static void cleanup_signal(struct task_struct *tsk)
950 { 820 {
951 struct signal_struct *sig = tsk->signal; 821 struct signal_struct *sig = tsk->signal;
952 822
953 atomic_dec(&sig->live); 823 atomic_dec(&sig->live);
954 824
955 if (atomic_dec_and_test(&sig->count)) 825 if (atomic_dec_and_test(&sig->count))
956 __cleanup_signal(sig); 826 __cleanup_signal(sig);
957 } 827 }
958 828
959 static void copy_flags(unsigned long clone_flags, struct task_struct *p) 829 static void copy_flags(unsigned long clone_flags, struct task_struct *p)
960 { 830 {
961 unsigned long new_flags = p->flags; 831 unsigned long new_flags = p->flags;
962 832
963 new_flags &= ~PF_SUPERPRIV; 833 new_flags &= ~PF_SUPERPRIV;
964 new_flags |= PF_FORKNOEXEC; 834 new_flags |= PF_FORKNOEXEC;
965 if (!(clone_flags & CLONE_PTRACE)) 835 if (!(clone_flags & CLONE_PTRACE))
966 p->ptrace = 0; 836 p->ptrace = 0;
967 p->flags = new_flags; 837 p->flags = new_flags;
968 clear_freeze_flag(p); 838 clear_freeze_flag(p);
969 } 839 }
970 840
971 asmlinkage long sys_set_tid_address(int __user *tidptr) 841 asmlinkage long sys_set_tid_address(int __user *tidptr)
972 { 842 {
973 current->clear_child_tid = tidptr; 843 current->clear_child_tid = tidptr;
974 844
975 return task_pid_vnr(current); 845 return task_pid_vnr(current);
976 } 846 }
977 847
978 static void rt_mutex_init_task(struct task_struct *p) 848 static void rt_mutex_init_task(struct task_struct *p)
979 { 849 {
980 spin_lock_init(&p->pi_lock); 850 spin_lock_init(&p->pi_lock);
981 #ifdef CONFIG_RT_MUTEXES 851 #ifdef CONFIG_RT_MUTEXES
982 plist_head_init(&p->pi_waiters, &p->pi_lock); 852 plist_head_init(&p->pi_waiters, &p->pi_lock);
983 p->pi_blocked_on = NULL; 853 p->pi_blocked_on = NULL;
984 #endif 854 #endif
985 } 855 }
986 856
987 #ifdef CONFIG_MM_OWNER 857 #ifdef CONFIG_MM_OWNER
988 void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 858 void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
989 { 859 {
990 mm->owner = p; 860 mm->owner = p;
991 } 861 }
992 #endif /* CONFIG_MM_OWNER */ 862 #endif /* CONFIG_MM_OWNER */
993 863
994 /* 864 /*
995 * This creates a new process as a copy of the old one, 865 * This creates a new process as a copy of the old one,
996 * but does not actually start it yet. 866 * but does not actually start it yet.
997 * 867 *
998 * It copies the registers, and all the appropriate 868 * It copies the registers, and all the appropriate
999 * parts of the process environment (as per the clone 869 * parts of the process environment (as per the clone
1000 * flags). The actual kick-off is left to the caller. 870 * flags). The actual kick-off is left to the caller.
1001 */ 871 */
1002 static struct task_struct *copy_process(unsigned long clone_flags, 872 static struct task_struct *copy_process(unsigned long clone_flags,
1003 unsigned long stack_start, 873 unsigned long stack_start,
1004 struct pt_regs *regs, 874 struct pt_regs *regs,
1005 unsigned long stack_size, 875 unsigned long stack_size,
1006 int __user *child_tidptr, 876 int __user *child_tidptr,
1007 struct pid *pid) 877 struct pid *pid)
1008 { 878 {
1009 int retval; 879 int retval;
1010 struct task_struct *p; 880 struct task_struct *p;
1011 int cgroup_callbacks_done = 0; 881 int cgroup_callbacks_done = 0;
1012 882
1013 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 883 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1014 return ERR_PTR(-EINVAL); 884 return ERR_PTR(-EINVAL);
1015 885
1016 /* 886 /*
1017 * Thread groups must share signals as well, and detached threads 887 * Thread groups must share signals as well, and detached threads
1018 * can only be started up within the thread group. 888 * can only be started up within the thread group.
1019 */ 889 */
1020 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) 890 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
1021 return ERR_PTR(-EINVAL); 891 return ERR_PTR(-EINVAL);
1022 892
1023 /* 893 /*
1024 * Shared signal handlers imply shared VM. By way of the above, 894 * Shared signal handlers imply shared VM. By way of the above,
1025 * thread groups also imply shared VM. Blocking this case allows 895 * thread groups also imply shared VM. Blocking this case allows
1026 * for various simplifications in other code. 896 * for various simplifications in other code.
1027 */ 897 */
1028 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 898 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
1029 return ERR_PTR(-EINVAL); 899 return ERR_PTR(-EINVAL);
1030 900
1031 retval = security_task_create(clone_flags); 901 retval = security_task_create(clone_flags);
1032 if (retval) 902 if (retval)
1033 goto fork_out; 903 goto fork_out;
1034 904
1035 retval = -ENOMEM; 905 retval = -ENOMEM;
1036 p = dup_task_struct(current); 906 p = dup_task_struct(current);
1037 if (!p) 907 if (!p)
1038 goto fork_out; 908 goto fork_out;
1039 909
1040 rt_mutex_init_task(p); 910 rt_mutex_init_task(p);
1041 911
1042 #ifdef CONFIG_TRACE_IRQFLAGS 912 #ifdef CONFIG_TRACE_IRQFLAGS
1043 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 913 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
1044 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 914 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
1045 #endif 915 #endif
1046 retval = -EAGAIN; 916 retval = -EAGAIN;
1047 if (atomic_read(&p->user->processes) >= 917 if (atomic_read(&p->user->processes) >=
1048 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 918 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
1049 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 919 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1050 p->user != current->nsproxy->user_ns->root_user) 920 p->user != current->nsproxy->user_ns->root_user)
1051 goto bad_fork_free; 921 goto bad_fork_free;
1052 } 922 }
1053 923
1054 atomic_inc(&p->user->__count); 924 atomic_inc(&p->user->__count);
1055 atomic_inc(&p->user->processes); 925 atomic_inc(&p->user->processes);
1056 get_group_info(p->group_info); 926 get_group_info(p->group_info);
1057 927
1058 /* 928 /*
1059 * If multiple threads are within copy_process(), then this check 929 * If multiple threads are within copy_process(), then this check
1060 * triggers too late. This doesn't hurt, the check is only there 930 * triggers too late. This doesn't hurt, the check is only there
1061 * to stop root fork bombs. 931 * to stop root fork bombs.
1062 */ 932 */
1063 if (nr_threads >= max_threads) 933 if (nr_threads >= max_threads)
1064 goto bad_fork_cleanup_count; 934 goto bad_fork_cleanup_count;
1065 935
1066 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 936 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1067 goto bad_fork_cleanup_count; 937 goto bad_fork_cleanup_count;
1068 938
1069 if (p->binfmt && !try_module_get(p->binfmt->module)) 939 if (p->binfmt && !try_module_get(p->binfmt->module))
1070 goto bad_fork_cleanup_put_domain; 940 goto bad_fork_cleanup_put_domain;
1071 941
1072 p->did_exec = 0; 942 p->did_exec = 0;
1073 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 943 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1074 copy_flags(clone_flags, p); 944 copy_flags(clone_flags, p);
1075 INIT_LIST_HEAD(&p->children); 945 INIT_LIST_HEAD(&p->children);
1076 INIT_LIST_HEAD(&p->sibling); 946 INIT_LIST_HEAD(&p->sibling);
1077 #ifdef CONFIG_PREEMPT_RCU 947 #ifdef CONFIG_PREEMPT_RCU
1078 p->rcu_read_lock_nesting = 0; 948 p->rcu_read_lock_nesting = 0;
1079 p->rcu_flipctr_idx = 0; 949 p->rcu_flipctr_idx = 0;
1080 #endif /* #ifdef CONFIG_PREEMPT_RCU */ 950 #endif /* #ifdef CONFIG_PREEMPT_RCU */
1081 p->vfork_done = NULL; 951 p->vfork_done = NULL;
1082 spin_lock_init(&p->alloc_lock); 952 spin_lock_init(&p->alloc_lock);
1083 953
1084 clear_tsk_thread_flag(p, TIF_SIGPENDING); 954 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1085 init_sigpending(&p->pending); 955 init_sigpending(&p->pending);
1086 956
1087 p->utime = cputime_zero; 957 p->utime = cputime_zero;
1088 p->stime = cputime_zero; 958 p->stime = cputime_zero;
1089 p->gtime = cputime_zero; 959 p->gtime = cputime_zero;
1090 p->utimescaled = cputime_zero; 960 p->utimescaled = cputime_zero;
1091 p->stimescaled = cputime_zero; 961 p->stimescaled = cputime_zero;
1092 p->prev_utime = cputime_zero; 962 p->prev_utime = cputime_zero;
1093 p->prev_stime = cputime_zero; 963 p->prev_stime = cputime_zero;
1094 964
1095 #ifdef CONFIG_DETECT_SOFTLOCKUP 965 #ifdef CONFIG_DETECT_SOFTLOCKUP
1096 p->last_switch_count = 0; 966 p->last_switch_count = 0;
1097 p->last_switch_timestamp = 0; 967 p->last_switch_timestamp = 0;
1098 #endif 968 #endif
1099 969
1100 #ifdef CONFIG_TASK_XACCT 970 #ifdef CONFIG_TASK_XACCT
1101 p->rchar = 0; /* I/O counter: bytes read */ 971 p->rchar = 0; /* I/O counter: bytes read */
1102 p->wchar = 0; /* I/O counter: bytes written */ 972 p->wchar = 0; /* I/O counter: bytes written */
1103 p->syscr = 0; /* I/O counter: read syscalls */ 973 p->syscr = 0; /* I/O counter: read syscalls */
1104 p->syscw = 0; /* I/O counter: write syscalls */ 974 p->syscw = 0; /* I/O counter: write syscalls */
1105 #endif 975 #endif
1106 task_io_accounting_init(p); 976 task_io_accounting_init(p);
1107 acct_clear_integrals(p); 977 acct_clear_integrals(p);
1108 978
1109 p->it_virt_expires = cputime_zero; 979 p->it_virt_expires = cputime_zero;
1110 p->it_prof_expires = cputime_zero; 980 p->it_prof_expires = cputime_zero;
1111 p->it_sched_expires = 0; 981 p->it_sched_expires = 0;
1112 INIT_LIST_HEAD(&p->cpu_timers[0]); 982 INIT_LIST_HEAD(&p->cpu_timers[0]);
1113 INIT_LIST_HEAD(&p->cpu_timers[1]); 983 INIT_LIST_HEAD(&p->cpu_timers[1]);
1114 INIT_LIST_HEAD(&p->cpu_timers[2]); 984 INIT_LIST_HEAD(&p->cpu_timers[2]);
1115 985
1116 p->lock_depth = -1; /* -1 = no lock */ 986 p->lock_depth = -1; /* -1 = no lock */
1117 do_posix_clock_monotonic_gettime(&p->start_time); 987 do_posix_clock_monotonic_gettime(&p->start_time);
1118 p->real_start_time = p->start_time; 988 p->real_start_time = p->start_time;
1119 monotonic_to_bootbased(&p->real_start_time); 989 monotonic_to_bootbased(&p->real_start_time);
1120 #ifdef CONFIG_SECURITY 990 #ifdef CONFIG_SECURITY
1121 p->security = NULL; 991 p->security = NULL;
1122 #endif 992 #endif
1123 p->cap_bset = current->cap_bset; 993 p->cap_bset = current->cap_bset;
1124 p->io_context = NULL; 994 p->io_context = NULL;
1125 p->audit_context = NULL; 995 p->audit_context = NULL;
1126 cgroup_fork(p); 996 cgroup_fork(p);
1127 #ifdef CONFIG_NUMA 997 #ifdef CONFIG_NUMA
1128 p->mempolicy = mpol_dup(p->mempolicy); 998 p->mempolicy = mpol_dup(p->mempolicy);
1129 if (IS_ERR(p->mempolicy)) { 999 if (IS_ERR(p->mempolicy)) {
1130 retval = PTR_ERR(p->mempolicy); 1000 retval = PTR_ERR(p->mempolicy);
1131 p->mempolicy = NULL; 1001 p->mempolicy = NULL;
1132 goto bad_fork_cleanup_cgroup; 1002 goto bad_fork_cleanup_cgroup;
1133 } 1003 }
1134 mpol_fix_fork_child_flag(p); 1004 mpol_fix_fork_child_flag(p);
1135 #endif 1005 #endif
1136 #ifdef CONFIG_TRACE_IRQFLAGS 1006 #ifdef CONFIG_TRACE_IRQFLAGS
1137 p->irq_events = 0; 1007 p->irq_events = 0;
1138 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1008 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1139 p->hardirqs_enabled = 1; 1009 p->hardirqs_enabled = 1;
1140 #else 1010 #else
1141 p->hardirqs_enabled = 0; 1011 p->hardirqs_enabled = 0;
1142 #endif 1012 #endif
1143 p->hardirq_enable_ip = 0; 1013 p->hardirq_enable_ip = 0;
1144 p->hardirq_enable_event = 0; 1014 p->hardirq_enable_event = 0;
1145 p->hardirq_disable_ip = _THIS_IP_; 1015 p->hardirq_disable_ip = _THIS_IP_;
1146 p->hardirq_disable_event = 0; 1016 p->hardirq_disable_event = 0;
1147 p->softirqs_enabled = 1; 1017 p->softirqs_enabled = 1;
1148 p->softirq_enable_ip = _THIS_IP_; 1018 p->softirq_enable_ip = _THIS_IP_;
1149 p->softirq_enable_event = 0; 1019 p->softirq_enable_event = 0;
1150 p->softirq_disable_ip = 0; 1020 p->softirq_disable_ip = 0;
1151 p->softirq_disable_event = 0; 1021 p->softirq_disable_event = 0;
1152 p->hardirq_context = 0; 1022 p->hardirq_context = 0;
1153 p->softirq_context = 0; 1023 p->softirq_context = 0;
1154 #endif 1024 #endif
1155 #ifdef CONFIG_LOCKDEP 1025 #ifdef CONFIG_LOCKDEP
1156 p->lockdep_depth = 0; /* no locks held yet */ 1026 p->lockdep_depth = 0; /* no locks held yet */
1157 p->curr_chain_key = 0; 1027 p->curr_chain_key = 0;
1158 p->lockdep_recursion = 0; 1028 p->lockdep_recursion = 0;
1159 #endif 1029 #endif
1160 1030
1161 #ifdef CONFIG_DEBUG_MUTEXES 1031 #ifdef CONFIG_DEBUG_MUTEXES
1162 p->blocked_on = NULL; /* not blocked yet */ 1032 p->blocked_on = NULL; /* not blocked yet */
1163 #endif 1033 #endif
1164 1034
1165 /* Perform scheduler related setup. Assign this task to a CPU. */ 1035 /* Perform scheduler related setup. Assign this task to a CPU. */
1166 sched_fork(p, clone_flags); 1036 sched_fork(p, clone_flags);
1167 1037
1168 if ((retval = security_task_alloc(p))) 1038 if ((retval = security_task_alloc(p)))
1169 goto bad_fork_cleanup_policy; 1039 goto bad_fork_cleanup_policy;
1170 if ((retval = audit_alloc(p))) 1040 if ((retval = audit_alloc(p)))
1171 goto bad_fork_cleanup_security; 1041 goto bad_fork_cleanup_security;
1172 /* copy all the process information */ 1042 /* copy all the process information */
1173 if ((retval = copy_semundo(clone_flags, p))) 1043 if ((retval = copy_semundo(clone_flags, p)))
1174 goto bad_fork_cleanup_audit; 1044 goto bad_fork_cleanup_audit;
1175 if ((retval = copy_files(clone_flags, p))) 1045 if ((retval = copy_files(clone_flags, p)))
1176 goto bad_fork_cleanup_semundo; 1046 goto bad_fork_cleanup_semundo;
1177 if ((retval = copy_fs(clone_flags, p))) 1047 if ((retval = copy_fs(clone_flags, p)))
1178 goto bad_fork_cleanup_files; 1048 goto bad_fork_cleanup_files;
1179 if ((retval = copy_sighand(clone_flags, p))) 1049 if ((retval = copy_sighand(clone_flags, p)))
1180 goto bad_fork_cleanup_fs; 1050 goto bad_fork_cleanup_fs;
1181 if ((retval = copy_signal(clone_flags, p))) 1051 if ((retval = copy_signal(clone_flags, p)))
1182 goto bad_fork_cleanup_sighand; 1052 goto bad_fork_cleanup_sighand;
1183 if ((retval = copy_mm(clone_flags, p))) 1053 if ((retval = copy_mm(clone_flags, p)))
1184 goto bad_fork_cleanup_signal; 1054 goto bad_fork_cleanup_signal;
1185 if ((retval = copy_keys(clone_flags, p))) 1055 if ((retval = copy_keys(clone_flags, p)))
1186 goto bad_fork_cleanup_mm; 1056 goto bad_fork_cleanup_mm;
1187 if ((retval = copy_namespaces(clone_flags, p))) 1057 if ((retval = copy_namespaces(clone_flags, p)))
1188 goto bad_fork_cleanup_keys; 1058 goto bad_fork_cleanup_keys;
1189 if ((retval = copy_io(clone_flags, p))) 1059 if ((retval = copy_io(clone_flags, p)))
1190 goto bad_fork_cleanup_namespaces; 1060 goto bad_fork_cleanup_namespaces;
1191 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1061 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
1192 if (retval) 1062 if (retval)
1193 goto bad_fork_cleanup_io; 1063 goto bad_fork_cleanup_io;
1194 1064
1195 if (pid != &init_struct_pid) { 1065 if (pid != &init_struct_pid) {
1196 retval = -ENOMEM; 1066 retval = -ENOMEM;
1197 pid = alloc_pid(task_active_pid_ns(p)); 1067 pid = alloc_pid(task_active_pid_ns(p));
1198 if (!pid) 1068 if (!pid)
1199 goto bad_fork_cleanup_io; 1069 goto bad_fork_cleanup_io;
1200 1070
1201 if (clone_flags & CLONE_NEWPID) { 1071 if (clone_flags & CLONE_NEWPID) {
1202 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1072 retval = pid_ns_prepare_proc(task_active_pid_ns(p));
1203 if (retval < 0) 1073 if (retval < 0)
1204 goto bad_fork_free_pid; 1074 goto bad_fork_free_pid;
1205 } 1075 }
1206 } 1076 }
1207 1077
1208 p->pid = pid_nr(pid); 1078 p->pid = pid_nr(pid);
1209 p->tgid = p->pid; 1079 p->tgid = p->pid;
1210 if (clone_flags & CLONE_THREAD) 1080 if (clone_flags & CLONE_THREAD)
1211 p->tgid = current->tgid; 1081 p->tgid = current->tgid;
1212 1082
1213 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1083 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1214 /* 1084 /*
1215 * Clear TID on mm_release()? 1085 * Clear TID on mm_release()?
1216 */ 1086 */
1217 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1087 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1218 #ifdef CONFIG_FUTEX 1088 #ifdef CONFIG_FUTEX
1219 p->robust_list = NULL; 1089 p->robust_list = NULL;
1220 #ifdef CONFIG_COMPAT 1090 #ifdef CONFIG_COMPAT
1221 p->compat_robust_list = NULL; 1091 p->compat_robust_list = NULL;
1222 #endif 1092 #endif
1223 INIT_LIST_HEAD(&p->pi_state_list); 1093 INIT_LIST_HEAD(&p->pi_state_list);
1224 p->pi_state_cache = NULL; 1094 p->pi_state_cache = NULL;
1225 #endif 1095 #endif
1226 /* 1096 /*
1227 * sigaltstack should be cleared when sharing the same VM 1097 * sigaltstack should be cleared when sharing the same VM
1228 */ 1098 */
1229 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) 1099 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
1230 p->sas_ss_sp = p->sas_ss_size = 0; 1100 p->sas_ss_sp = p->sas_ss_size = 0;
1231 1101
1232 /* 1102 /*
1233 * Syscall tracing should be turned off in the child regardless 1103 * Syscall tracing should be turned off in the child regardless
1234 * of CLONE_PTRACE. 1104 * of CLONE_PTRACE.
1235 */ 1105 */
1236 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1106 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1237 #ifdef TIF_SYSCALL_EMU 1107 #ifdef TIF_SYSCALL_EMU
1238 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1108 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1239 #endif 1109 #endif
1240 clear_all_latency_tracing(p); 1110 clear_all_latency_tracing(p);
1241 1111
1242 /* Our parent execution domain becomes current domain. 1112 /* Our parent execution domain becomes current domain.
1243 These must match for thread signalling to apply */ 1113 These must match for thread signalling to apply */
1244 p->parent_exec_id = p->self_exec_id; 1114 p->parent_exec_id = p->self_exec_id;
1245 1115
1246 /* ok, now we should be set up.. */ 1116 /* ok, now we should be set up.. */
1247 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); 1117 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1248 p->pdeath_signal = 0; 1118 p->pdeath_signal = 0;
1249 p->exit_state = 0; 1119 p->exit_state = 0;
1250 1120
1251 /* 1121 /*
1252 * Ok, make it visible to the rest of the system. 1122 * Ok, make it visible to the rest of the system.
1253 * We don't wake it up yet. 1123 * We don't wake it up yet.
1254 */ 1124 */
1255 p->group_leader = p; 1125 p->group_leader = p;
1256 INIT_LIST_HEAD(&p->thread_group); 1126 INIT_LIST_HEAD(&p->thread_group);
1257 INIT_LIST_HEAD(&p->ptrace_children); 1127 INIT_LIST_HEAD(&p->ptrace_children);
1258 INIT_LIST_HEAD(&p->ptrace_list); 1128 INIT_LIST_HEAD(&p->ptrace_list);
1259 1129
1260 /* Now that the task is set up, run cgroup callbacks if 1130 /* Now that the task is set up, run cgroup callbacks if
1261 * necessary. We need to run them before the task is visible 1131 * necessary. We need to run them before the task is visible
1262 * on the tasklist. */ 1132 * on the tasklist. */
1263 cgroup_fork_callbacks(p); 1133 cgroup_fork_callbacks(p);
1264 cgroup_callbacks_done = 1; 1134 cgroup_callbacks_done = 1;
1265 1135
1266 /* Need tasklist lock for parent etc handling! */ 1136 /* Need tasklist lock for parent etc handling! */
1267 write_lock_irq(&tasklist_lock); 1137 write_lock_irq(&tasklist_lock);
1268 1138
1269 /* 1139 /*
1270 * The task hasn't been attached yet, so its cpus_allowed mask will 1140 * The task hasn't been attached yet, so its cpus_allowed mask will
1271 * not be changed, nor will its assigned CPU. 1141 * not be changed, nor will its assigned CPU.
1272 * 1142 *
1273 * The cpus_allowed mask of the parent may have changed after it was 1143 * The cpus_allowed mask of the parent may have changed after it was
1274 * copied first time - so re-copy it here, then check the child's CPU 1144 * copied first time - so re-copy it here, then check the child's CPU
1275 * to ensure it is on a valid CPU (and if not, just force it back to 1145 * to ensure it is on a valid CPU (and if not, just force it back to
1276 * parent's CPU). This avoids a lot of nasty races. 1146 * parent's CPU). This avoids a lot of nasty races.
1277 */ 1147 */
1278 p->cpus_allowed = current->cpus_allowed; 1148 p->cpus_allowed = current->cpus_allowed;
1279 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; 1149 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1280 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || 1150 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1281 !cpu_online(task_cpu(p)))) 1151 !cpu_online(task_cpu(p))))
1282 set_task_cpu(p, smp_processor_id()); 1152 set_task_cpu(p, smp_processor_id());
1283 1153
1284 /* CLONE_PARENT re-uses the old parent */ 1154 /* CLONE_PARENT re-uses the old parent */
1285 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1155 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1286 p->real_parent = current->real_parent; 1156 p->real_parent = current->real_parent;
1287 else 1157 else
1288 p->real_parent = current; 1158 p->real_parent = current;
1289 p->parent = p->real_parent; 1159 p->parent = p->real_parent;
1290 1160
1291 spin_lock(&current->sighand->siglock); 1161 spin_lock(&current->sighand->siglock);
1292 1162
1293 /* 1163 /*
1294 * Process group and session signals need to be delivered to just the 1164 * Process group and session signals need to be delivered to just the
1295 * parent before the fork or both the parent and the child after the 1165 * parent before the fork or both the parent and the child after the
1296 * fork. Restart if a signal comes in before we add the new process to 1166 * fork. Restart if a signal comes in before we add the new process to
1297 * its process group. 1167 * its process group.
1298 * A fatal signal pending means that current will exit, so the new 1168 * A fatal signal pending means that current will exit, so the new
1299 * thread can't slip out of an OOM kill (or normal SIGKILL). 1169 * thread can't slip out of an OOM kill (or normal SIGKILL).
1300 */ 1170 */
1301 recalc_sigpending(); 1171 recalc_sigpending();
1302 if (signal_pending(current)) { 1172 if (signal_pending(current)) {
1303 spin_unlock(&current->sighand->siglock); 1173 spin_unlock(&current->sighand->siglock);
1304 write_unlock_irq(&tasklist_lock); 1174 write_unlock_irq(&tasklist_lock);
1305 retval = -ERESTARTNOINTR; 1175 retval = -ERESTARTNOINTR;
1306 goto bad_fork_free_pid; 1176 goto bad_fork_free_pid;
1307 } 1177 }
1308 1178
1309 if (clone_flags & CLONE_THREAD) { 1179 if (clone_flags & CLONE_THREAD) {
1310 p->group_leader = current->group_leader; 1180 p->group_leader = current->group_leader;
1311 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1181 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1312 1182
1313 if (!cputime_eq(current->signal->it_virt_expires, 1183 if (!cputime_eq(current->signal->it_virt_expires,
1314 cputime_zero) || 1184 cputime_zero) ||
1315 !cputime_eq(current->signal->it_prof_expires, 1185 !cputime_eq(current->signal->it_prof_expires,
1316 cputime_zero) || 1186 cputime_zero) ||
1317 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || 1187 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
1318 !list_empty(&current->signal->cpu_timers[0]) || 1188 !list_empty(&current->signal->cpu_timers[0]) ||
1319 !list_empty(&current->signal->cpu_timers[1]) || 1189 !list_empty(&current->signal->cpu_timers[1]) ||
1320 !list_empty(&current->signal->cpu_timers[2])) { 1190 !list_empty(&current->signal->cpu_timers[2])) {
1321 /* 1191 /*
1322 * Have child wake up on its first tick to check 1192 * Have child wake up on its first tick to check
1323 * for process CPU timers. 1193 * for process CPU timers.
1324 */ 1194 */
1325 p->it_prof_expires = jiffies_to_cputime(1); 1195 p->it_prof_expires = jiffies_to_cputime(1);
1326 } 1196 }
1327 } 1197 }
1328 1198
1329 if (likely(p->pid)) { 1199 if (likely(p->pid)) {
1330 add_parent(p); 1200 add_parent(p);
1331 if (unlikely(p->ptrace & PT_PTRACED)) 1201 if (unlikely(p->ptrace & PT_PTRACED))
1332 __ptrace_link(p, current->parent); 1202 __ptrace_link(p, current->parent);
1333 1203
1334 if (thread_group_leader(p)) { 1204 if (thread_group_leader(p)) {
1335 if (clone_flags & CLONE_NEWPID) 1205 if (clone_flags & CLONE_NEWPID)
1336 p->nsproxy->pid_ns->child_reaper = p; 1206 p->nsproxy->pid_ns->child_reaper = p;
1337 1207
1338 p->signal->leader_pid = pid; 1208 p->signal->leader_pid = pid;
1339 p->signal->tty = current->signal->tty; 1209 p->signal->tty = current->signal->tty;
1340 set_task_pgrp(p, task_pgrp_nr(current)); 1210 set_task_pgrp(p, task_pgrp_nr(current));
1341 set_task_session(p, task_session_nr(current)); 1211 set_task_session(p, task_session_nr(current));
1342 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1212 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1343 attach_pid(p, PIDTYPE_SID, task_session(current)); 1213 attach_pid(p, PIDTYPE_SID, task_session(current));
1344 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1214 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1345 __get_cpu_var(process_counts)++; 1215 __get_cpu_var(process_counts)++;
1346 } 1216 }
1347 attach_pid(p, PIDTYPE_PID, pid); 1217 attach_pid(p, PIDTYPE_PID, pid);
1348 nr_threads++; 1218 nr_threads++;
1349 } 1219 }
1350 1220
1351 total_forks++; 1221 total_forks++;
1352 spin_unlock(&current->sighand->siglock); 1222 spin_unlock(&current->sighand->siglock);
1353 write_unlock_irq(&tasklist_lock); 1223 write_unlock_irq(&tasklist_lock);
1354 proc_fork_connector(p); 1224 proc_fork_connector(p);
1355 cgroup_post_fork(p); 1225 cgroup_post_fork(p);
1356 return p; 1226 return p;
1357 1227
1358 bad_fork_free_pid: 1228 bad_fork_free_pid:
1359 if (pid != &init_struct_pid) 1229 if (pid != &init_struct_pid)
1360 free_pid(pid); 1230 free_pid(pid);
1361 bad_fork_cleanup_io: 1231 bad_fork_cleanup_io:
1362 put_io_context(p->io_context); 1232 put_io_context(p->io_context);
1363 bad_fork_cleanup_namespaces: 1233 bad_fork_cleanup_namespaces:
1364 exit_task_namespaces(p); 1234 exit_task_namespaces(p);
1365 bad_fork_cleanup_keys: 1235 bad_fork_cleanup_keys:
1366 exit_keys(p); 1236 exit_keys(p);
1367 bad_fork_cleanup_mm: 1237 bad_fork_cleanup_mm:
1368 if (p->mm) 1238 if (p->mm)
1369 mmput(p->mm); 1239 mmput(p->mm);
1370 bad_fork_cleanup_signal: 1240 bad_fork_cleanup_signal:
1371 cleanup_signal(p); 1241 cleanup_signal(p);
1372 bad_fork_cleanup_sighand: 1242 bad_fork_cleanup_sighand:
1373 __cleanup_sighand(p->sighand); 1243 __cleanup_sighand(p->sighand);
1374 bad_fork_cleanup_fs: 1244 bad_fork_cleanup_fs:
1375 exit_fs(p); /* blocking */ 1245 exit_fs(p); /* blocking */
1376 bad_fork_cleanup_files: 1246 bad_fork_cleanup_files:
1377 exit_files(p); /* blocking */ 1247 exit_files(p); /* blocking */
1378 bad_fork_cleanup_semundo: 1248 bad_fork_cleanup_semundo:
1379 exit_sem(p); 1249 exit_sem(p);
1380 bad_fork_cleanup_audit: 1250 bad_fork_cleanup_audit:
1381 audit_free(p); 1251 audit_free(p);
1382 bad_fork_cleanup_security: 1252 bad_fork_cleanup_security:
1383 security_task_free(p); 1253 security_task_free(p);
1384 bad_fork_cleanup_policy: 1254 bad_fork_cleanup_policy:
1385 #ifdef CONFIG_NUMA 1255 #ifdef CONFIG_NUMA
1386 mpol_put(p->mempolicy); 1256 mpol_put(p->mempolicy);
1387 bad_fork_cleanup_cgroup: 1257 bad_fork_cleanup_cgroup:
1388 #endif 1258 #endif
1389 cgroup_exit(p, cgroup_callbacks_done); 1259 cgroup_exit(p, cgroup_callbacks_done);
1390 delayacct_tsk_free(p); 1260 delayacct_tsk_free(p);
1391 if (p->binfmt) 1261 if (p->binfmt)
1392 module_put(p->binfmt->module); 1262 module_put(p->binfmt->module);
1393 bad_fork_cleanup_put_domain: 1263 bad_fork_cleanup_put_domain:
1394 module_put(task_thread_info(p)->exec_domain->module); 1264 module_put(task_thread_info(p)->exec_domain->module);
1395 bad_fork_cleanup_count: 1265 bad_fork_cleanup_count:
1396 put_group_info(p->group_info); 1266 put_group_info(p->group_info);
1397 atomic_dec(&p->user->processes); 1267 atomic_dec(&p->user->processes);
1398 free_uid(p->user); 1268 free_uid(p->user);
1399 bad_fork_free: 1269 bad_fork_free:
1400 free_task(p); 1270 free_task(p);
1401 fork_out: 1271 fork_out:
1402 return ERR_PTR(retval); 1272 return ERR_PTR(retval);
1403 } 1273 }
1404 1274
1405 noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1275 noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1406 { 1276 {
1407 memset(regs, 0, sizeof(struct pt_regs)); 1277 memset(regs, 0, sizeof(struct pt_regs));
1408 return regs; 1278 return regs;
1409 } 1279 }
1410 1280
1411 struct task_struct * __cpuinit fork_idle(int cpu) 1281 struct task_struct * __cpuinit fork_idle(int cpu)
1412 { 1282 {
1413 struct task_struct *task; 1283 struct task_struct *task;
1414 struct pt_regs regs; 1284 struct pt_regs regs;
1415 1285
1416 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1286 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1417 &init_struct_pid); 1287 &init_struct_pid);
1418 if (!IS_ERR(task)) 1288 if (!IS_ERR(task))
1419 init_idle(task, cpu); 1289 init_idle(task, cpu);
1420 1290
1421 return task; 1291 return task;
1422 } 1292 }
1423 1293
1424 static int fork_traceflag(unsigned clone_flags) 1294 static int fork_traceflag(unsigned clone_flags)
1425 { 1295 {
1426 if (clone_flags & CLONE_UNTRACED) 1296 if (clone_flags & CLONE_UNTRACED)
1427 return 0; 1297 return 0;
1428 else if (clone_flags & CLONE_VFORK) { 1298 else if (clone_flags & CLONE_VFORK) {
1429 if (current->ptrace & PT_TRACE_VFORK) 1299 if (current->ptrace & PT_TRACE_VFORK)
1430 return PTRACE_EVENT_VFORK; 1300 return PTRACE_EVENT_VFORK;
1431 } else if ((clone_flags & CSIGNAL) != SIGCHLD) { 1301 } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
1432 if (current->ptrace & PT_TRACE_CLONE) 1302 if (current->ptrace & PT_TRACE_CLONE)
1433 return PTRACE_EVENT_CLONE; 1303 return PTRACE_EVENT_CLONE;
1434 } else if (current->ptrace & PT_TRACE_FORK) 1304 } else if (current->ptrace & PT_TRACE_FORK)
1435 return PTRACE_EVENT_FORK; 1305 return PTRACE_EVENT_FORK;
1436 1306
1437 return 0; 1307 return 0;
1438 } 1308 }
1439 1309
1440 /* 1310 /*
1441 * Ok, this is the main fork-routine. 1311 * Ok, this is the main fork-routine.
1442 * 1312 *
1443 * It copies the process, and if successful kick-starts 1313 * It copies the process, and if successful kick-starts
1444 * it and waits for it to finish using the VM if required. 1314 * it and waits for it to finish using the VM if required.
1445 */ 1315 */
1446 long do_fork(unsigned long clone_flags, 1316 long do_fork(unsigned long clone_flags,
1447 unsigned long stack_start, 1317 unsigned long stack_start,
1448 struct pt_regs *regs, 1318 struct pt_regs *regs,
1449 unsigned long stack_size, 1319 unsigned long stack_size,
1450 int __user *parent_tidptr, 1320 int __user *parent_tidptr,
1451 int __user *child_tidptr) 1321 int __user *child_tidptr)
1452 { 1322 {
1453 struct task_struct *p; 1323 struct task_struct *p;
1454 int trace = 0; 1324 int trace = 0;
1455 long nr; 1325 long nr;
1456 1326
1457 /* 1327 /*
1458 * We hope to recycle these flags after 2.6.26 1328 * We hope to recycle these flags after 2.6.26
1459 */ 1329 */
1460 if (unlikely(clone_flags & CLONE_STOPPED)) { 1330 if (unlikely(clone_flags & CLONE_STOPPED)) {
1461 static int __read_mostly count = 100; 1331 static int __read_mostly count = 100;
1462 1332
1463 if (count > 0 && printk_ratelimit()) { 1333 if (count > 0 && printk_ratelimit()) {
1464 char comm[TASK_COMM_LEN]; 1334 char comm[TASK_COMM_LEN];
1465 1335
1466 count--; 1336 count--;
1467 printk(KERN_INFO "fork(): process `%s' used deprecated " 1337 printk(KERN_INFO "fork(): process `%s' used deprecated "
1468 "clone flags 0x%lx\n", 1338 "clone flags 0x%lx\n",
1469 get_task_comm(comm, current), 1339 get_task_comm(comm, current),
1470 clone_flags & CLONE_STOPPED); 1340 clone_flags & CLONE_STOPPED);
1471 } 1341 }
1472 } 1342 }
1473 1343
1474 if (unlikely(current->ptrace)) { 1344 if (unlikely(current->ptrace)) {
1475 trace = fork_traceflag (clone_flags); 1345 trace = fork_traceflag (clone_flags);
1476 if (trace) 1346 if (trace)
1477 clone_flags |= CLONE_PTRACE; 1347 clone_flags |= CLONE_PTRACE;
1478 } 1348 }
1479 1349
1480 p = copy_process(clone_flags, stack_start, regs, stack_size, 1350 p = copy_process(clone_flags, stack_start, regs, stack_size,
1481 child_tidptr, NULL); 1351 child_tidptr, NULL);
1482 /* 1352 /*
1483 * Do this prior to waking up the new thread - the thread pointer 1353 * Do this prior to waking up the new thread - the thread pointer
1484 * might get invalid after that point, if the thread exits quickly. 1354 * might get invalid after that point, if the thread exits quickly.
1485 */ 1355 */
1486 if (!IS_ERR(p)) { 1356 if (!IS_ERR(p)) {
1487 struct completion vfork; 1357 struct completion vfork;
1488 1358
1489 nr = task_pid_vnr(p); 1359 nr = task_pid_vnr(p);
1490 1360
1491 if (clone_flags & CLONE_PARENT_SETTID) 1361 if (clone_flags & CLONE_PARENT_SETTID)
1492 put_user(nr, parent_tidptr); 1362 put_user(nr, parent_tidptr);
1493 1363
1494 if (clone_flags & CLONE_VFORK) { 1364 if (clone_flags & CLONE_VFORK) {
1495 p->vfork_done = &vfork; 1365 p->vfork_done = &vfork;
1496 init_completion(&vfork); 1366 init_completion(&vfork);
1497 } 1367 }
1498 1368
1499 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { 1369 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
1500 /* 1370 /*
1501 * We'll start up with an immediate SIGSTOP. 1371 * We'll start up with an immediate SIGSTOP.
1502 */ 1372 */
1503 sigaddset(&p->pending.signal, SIGSTOP); 1373 sigaddset(&p->pending.signal, SIGSTOP);
1504 set_tsk_thread_flag(p, TIF_SIGPENDING); 1374 set_tsk_thread_flag(p, TIF_SIGPENDING);
1505 } 1375 }
1506 1376
1507 if (!(clone_flags & CLONE_STOPPED)) 1377 if (!(clone_flags & CLONE_STOPPED))
1508 wake_up_new_task(p, clone_flags); 1378 wake_up_new_task(p, clone_flags);
1509 else 1379 else
1510 __set_task_state(p, TASK_STOPPED); 1380 __set_task_state(p, TASK_STOPPED);
1511 1381
1512 if (unlikely (trace)) { 1382 if (unlikely (trace)) {
1513 current->ptrace_message = nr; 1383 current->ptrace_message = nr;
1514 ptrace_notify ((trace << 8) | SIGTRAP); 1384 ptrace_notify ((trace << 8) | SIGTRAP);
1515 } 1385 }
1516 1386
1517 if (clone_flags & CLONE_VFORK) { 1387 if (clone_flags & CLONE_VFORK) {
1518 freezer_do_not_count(); 1388 freezer_do_not_count();
1519 wait_for_completion(&vfork); 1389 wait_for_completion(&vfork);
1520 freezer_count(); 1390 freezer_count();
1521 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { 1391 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
1522 current->ptrace_message = nr; 1392 current->ptrace_message = nr;
1523 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1393 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1524 } 1394 }
1525 } 1395 }
1526 } else { 1396 } else {
1527 nr = PTR_ERR(p); 1397 nr = PTR_ERR(p);
1528 } 1398 }
1529 return nr; 1399 return nr;
1530 } 1400 }
1531 1401
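In the CLONE_VFORK branch above, do_fork() sleeps on the vfork completion until the child releases its mm, which is the ordering vfork(2) and posix_spawn() rely on. A sketch using the glibc clone() wrapper so the child can safely run on its own stack; the 1 MiB stack and the one-second sleep are arbitrary:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

static int child_fn(void *arg)
{
	(void)arg;
	sleep(1);	/* the parent is still blocked inside clone() here */
	return 0;	/* exiting completes vfork_done; the parent resumes */
}

int main(void)
{
	char *stack = malloc(1024 * 1024);

	printf("parent: calling clone(CLONE_VM | CLONE_VFORK)\n");
	fflush(stdout);

	/* do_fork() waits on the completion, so clone() does not return
	 * to the parent until child_fn() has finished. */
	pid_t pid = clone(child_fn, stack + 1024 * 1024,
			  CLONE_VM | CLONE_VFORK | SIGCHLD, NULL);

	printf("parent: resumed roughly one second later\n");
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}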
1532 #ifndef ARCH_MIN_MMSTRUCT_ALIGN 1402 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
1533 #define ARCH_MIN_MMSTRUCT_ALIGN 0 1403 #define ARCH_MIN_MMSTRUCT_ALIGN 0
1534 #endif 1404 #endif
1535 1405
1536 static void sighand_ctor(struct kmem_cache *cachep, void *data) 1406 static void sighand_ctor(struct kmem_cache *cachep, void *data)
1537 { 1407 {
1538 struct sighand_struct *sighand = data; 1408 struct sighand_struct *sighand = data;
1539 1409
1540 spin_lock_init(&sighand->siglock); 1410 spin_lock_init(&sighand->siglock);
1541 init_waitqueue_head(&sighand->signalfd_wqh); 1411 init_waitqueue_head(&sighand->signalfd_wqh);
1542 } 1412 }
1543 1413
1544 void __init proc_caches_init(void) 1414 void __init proc_caches_init(void)
1545 { 1415 {
1546 sighand_cachep = kmem_cache_create("sighand_cache", 1416 sighand_cachep = kmem_cache_create("sighand_cache",
1547 sizeof(struct sighand_struct), 0, 1417 sizeof(struct sighand_struct), 0,
1548 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1418 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1549 sighand_ctor); 1419 sighand_ctor);
1550 signal_cachep = kmem_cache_create("signal_cache", 1420 signal_cachep = kmem_cache_create("signal_cache",
1551 sizeof(struct signal_struct), 0, 1421 sizeof(struct signal_struct), 0,
1552 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1422 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1553 files_cachep = kmem_cache_create("files_cache", 1423 files_cachep = kmem_cache_create("files_cache",
1554 sizeof(struct files_struct), 0, 1424 sizeof(struct files_struct), 0,
1555 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1425 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1556 fs_cachep = kmem_cache_create("fs_cache", 1426 fs_cachep = kmem_cache_create("fs_cache",
1557 sizeof(struct fs_struct), 0, 1427 sizeof(struct fs_struct), 0,
1558 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1428 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1559 vm_area_cachep = kmem_cache_create("vm_area_struct", 1429 vm_area_cachep = kmem_cache_create("vm_area_struct",
1560 sizeof(struct vm_area_struct), 0, 1430 sizeof(struct vm_area_struct), 0,
1561 SLAB_PANIC, NULL); 1431 SLAB_PANIC, NULL);
1562 mm_cachep = kmem_cache_create("mm_struct", 1432 mm_cachep = kmem_cache_create("mm_struct",
1563 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1433 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1564 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1434 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1565 } 1435 }
1566 1436
1567 /* 1437 /*
1568 * Check constraints on flags passed to the unshare system call and 1438 * Check constraints on flags passed to the unshare system call and
1569 * force unsharing of additional process context as appropriate. 1439 * force unsharing of additional process context as appropriate.
1570 */ 1440 */
1571 static void check_unshare_flags(unsigned long *flags_ptr) 1441 static void check_unshare_flags(unsigned long *flags_ptr)
1572 { 1442 {
1573 /* 1443 /*
1574 * If unsharing a thread from a thread group, must also 1444 * If unsharing a thread from a thread group, must also
1575 * unshare vm. 1445 * unshare vm.
1576 */ 1446 */
1577 if (*flags_ptr & CLONE_THREAD) 1447 if (*flags_ptr & CLONE_THREAD)
1578 *flags_ptr |= CLONE_VM; 1448 *flags_ptr |= CLONE_VM;
1579 1449
1580 /* 1450 /*
1581 * If unsharing vm, must also unshare signal handlers. 1451 * If unsharing vm, must also unshare signal handlers.
1582 */ 1452 */
1583 if (*flags_ptr & CLONE_VM) 1453 if (*flags_ptr & CLONE_VM)
1584 *flags_ptr |= CLONE_SIGHAND; 1454 *flags_ptr |= CLONE_SIGHAND;
1585 1455
1586 /* 1456 /*
1587 * If unsharing signal handlers and the task was created 1457 * If unsharing signal handlers and the task was created
1588 * using CLONE_THREAD, then the thread must be unshared as well 1458 * using CLONE_THREAD, then the thread must be unshared as well
1589 */ 1459 */
1590 if ((*flags_ptr & CLONE_SIGHAND) && 1460 if ((*flags_ptr & CLONE_SIGHAND) &&
1591 (atomic_read(&current->signal->count) > 1)) 1461 (atomic_read(&current->signal->count) > 1))
1592 *flags_ptr |= CLONE_THREAD; 1462 *flags_ptr |= CLONE_THREAD;
1593 1463
1594 /* 1464 /*
1595 * If unsharing namespace, must also unshare filesystem information. 1465 * If unsharing namespace, must also unshare filesystem information.
1596 */ 1466 */
1597 if (*flags_ptr & CLONE_NEWNS) 1467 if (*flags_ptr & CLONE_NEWNS)
1598 *flags_ptr |= CLONE_FS; 1468 *flags_ptr |= CLONE_FS;
1599 } 1469 }
1600 1470
1601 /* 1471 /*
1602 * Unsharing of tasks created with CLONE_THREAD is not supported yet 1472 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1603 */ 1473 */
1604 static int unshare_thread(unsigned long unshare_flags) 1474 static int unshare_thread(unsigned long unshare_flags)
1605 { 1475 {
1606 if (unshare_flags & CLONE_THREAD) 1476 if (unshare_flags & CLONE_THREAD)
1607 return -EINVAL; 1477 return -EINVAL;
1608 1478
1609 return 0; 1479 return 0;
1610 } 1480 }
1611 1481
1612 /* 1482 /*
1613 * Unshare the filesystem structure if it is being shared 1483 * Unshare the filesystem structure if it is being shared
1614 */ 1484 */
1615 static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) 1485 static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1616 { 1486 {
1617 struct fs_struct *fs = current->fs; 1487 struct fs_struct *fs = current->fs;
1618 1488
1619 if ((unshare_flags & CLONE_FS) && 1489 if ((unshare_flags & CLONE_FS) &&
1620 (fs && atomic_read(&fs->count) > 1)) { 1490 (fs && atomic_read(&fs->count) > 1)) {
1621 *new_fsp = __copy_fs_struct(current->fs); 1491 *new_fsp = __copy_fs_struct(current->fs);
1622 if (!*new_fsp) 1492 if (!*new_fsp)
1623 return -ENOMEM; 1493 return -ENOMEM;
1624 } 1494 }
1625 1495
1626 return 0; 1496 return 0;
1627 } 1497 }
1628 1498
1629 /* 1499 /*
1630 * Unsharing of sighand is not supported yet 1500 * Unsharing of sighand is not supported yet
1631 */ 1501 */
1632 static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) 1502 static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1633 { 1503 {
1634 struct sighand_struct *sigh = current->sighand; 1504 struct sighand_struct *sigh = current->sighand;
1635 1505
1636 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) 1506 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1637 return -EINVAL; 1507 return -EINVAL;
1638 else 1508 else
1639 return 0; 1509 return 0;
1640 } 1510 }
1641 1511
1642 /* 1512 /*
1643 * Unshare vm if it is being shared 1513 * Unshare vm if it is being shared
1644 */ 1514 */
1645 static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) 1515 static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1646 { 1516 {
1647 struct mm_struct *mm = current->mm; 1517 struct mm_struct *mm = current->mm;
1648 1518
1649 if ((unshare_flags & CLONE_VM) && 1519 if ((unshare_flags & CLONE_VM) &&
1650 (mm && atomic_read(&mm->mm_users) > 1)) { 1520 (mm && atomic_read(&mm->mm_users) > 1)) {
1651 return -EINVAL; 1521 return -EINVAL;
1652 } 1522 }
1653 1523
1654 return 0; 1524 return 0;
1655 } 1525 }
1656 1526
1657 /* 1527 /*
1658 * Unshare file descriptor table if it is being shared 1528 * Unshare file descriptor table if it is being shared
1659 */ 1529 */
1660 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) 1530 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
1661 { 1531 {
1662 struct files_struct *fd = current->files; 1532 struct files_struct *fd = current->files;
1663 int error = 0; 1533 int error = 0;
1664 1534
1665 if ((unshare_flags & CLONE_FILES) && 1535 if ((unshare_flags & CLONE_FILES) &&
1666 (fd && atomic_read(&fd->count) > 1)) { 1536 (fd && atomic_read(&fd->count) > 1)) {
1667 *new_fdp = dup_fd(fd, &error); 1537 *new_fdp = dup_fd(fd, &error);
1668 if (!*new_fdp) 1538 if (!*new_fdp)
1669 return error; 1539 return error;
1670 } 1540 }
1671 1541
1672 return 0; 1542 return 0;
1673 } 1543 }
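
Editor's note: unshare_fd() leans on the calling convention of dup_fd(), the function this series is moving into fs/file.c: the copy comes back as the return value, the error code through a pointer, so a NULL result must be read together with *errorp. A hedged sketch of that convention from a hypothetical caller; my_clone_files() is illustrative and not a kernel function:

/* Hypothetical caller illustrating dup_fd()'s error-pointer convention;
 * assumes dup_fd() is visible to this translation unit. */
static struct files_struct *my_clone_files(struct files_struct *orig, int *errp)
{
        struct files_struct *copy;
        int error = 0;

        copy = dup_fd(orig, &error);    /* NULL on failure, error set */
        if (!copy) {
                *errp = error;          /* e.g. -ENOMEM */
                return NULL;
        }
        *errp = 0;
        return copy;
}
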
1674 1544
1675 /* 1545 /*
1676 * unshare allows a process to 'unshare' part of the process 1546 * unshare allows a process to 'unshare' part of the process
1677 * context which was originally shared using clone. copy_* 1547 * context which was originally shared using clone. copy_*
1678 * functions used by do_fork() cannot be used here directly 1548 * functions used by do_fork() cannot be used here directly
1679 * because they modify an inactive task_struct that is being 1549 * because they modify an inactive task_struct that is being
1680 * constructed. Here we are modifying the current, active, 1550 * constructed. Here we are modifying the current, active,
1681 * task_struct. 1551 * task_struct.
1682 */ 1552 */
1683 asmlinkage long sys_unshare(unsigned long unshare_flags) 1553 asmlinkage long sys_unshare(unsigned long unshare_flags)
1684 { 1554 {
1685 int err = 0; 1555 int err = 0;
1686 struct fs_struct *fs, *new_fs = NULL; 1556 struct fs_struct *fs, *new_fs = NULL;
1687 struct sighand_struct *new_sigh = NULL; 1557 struct sighand_struct *new_sigh = NULL;
1688 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1558 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1689 struct files_struct *fd, *new_fd = NULL; 1559 struct files_struct *fd, *new_fd = NULL;
1690 struct nsproxy *new_nsproxy = NULL; 1560 struct nsproxy *new_nsproxy = NULL;
1691 int do_sysvsem = 0; 1561 int do_sysvsem = 0;
1692 1562
1693 check_unshare_flags(&unshare_flags); 1563 check_unshare_flags(&unshare_flags);
1694 1564
1695 /* Return -EINVAL for all unsupported flags */ 1565 /* Return -EINVAL for all unsupported flags */
1696 err = -EINVAL; 1566 err = -EINVAL;
1697 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1567 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1698 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1568 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1699 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| 1569 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
1700 CLONE_NEWNET)) 1570 CLONE_NEWNET))
1701 goto bad_unshare_out; 1571 goto bad_unshare_out;
1702 1572
1703 /* 1573 /*
1704 * CLONE_NEWIPC must also detach from the undolist: after switching 1574 * CLONE_NEWIPC must also detach from the undolist: after switching
1705 * to a new ipc namespace, the semaphore arrays from the old 1575 * to a new ipc namespace, the semaphore arrays from the old
1706 * namespace are unreachable. 1576 * namespace are unreachable.
1707 */ 1577 */
1708 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1578 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1709 do_sysvsem = 1; 1579 do_sysvsem = 1;
1710 if ((err = unshare_thread(unshare_flags))) 1580 if ((err = unshare_thread(unshare_flags)))
1711 goto bad_unshare_out; 1581 goto bad_unshare_out;
1712 if ((err = unshare_fs(unshare_flags, &new_fs))) 1582 if ((err = unshare_fs(unshare_flags, &new_fs)))
1713 goto bad_unshare_cleanup_thread; 1583 goto bad_unshare_cleanup_thread;
1714 if ((err = unshare_sighand(unshare_flags, &new_sigh))) 1584 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1715 goto bad_unshare_cleanup_fs; 1585 goto bad_unshare_cleanup_fs;
1716 if ((err = unshare_vm(unshare_flags, &new_mm))) 1586 if ((err = unshare_vm(unshare_flags, &new_mm)))
1717 goto bad_unshare_cleanup_sigh; 1587 goto bad_unshare_cleanup_sigh;
1718 if ((err = unshare_fd(unshare_flags, &new_fd))) 1588 if ((err = unshare_fd(unshare_flags, &new_fd)))
1719 goto bad_unshare_cleanup_vm; 1589 goto bad_unshare_cleanup_vm;
1720 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1590 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1721 new_fs))) 1591 new_fs)))
1722 goto bad_unshare_cleanup_fd; 1592 goto bad_unshare_cleanup_fd;
1723 1593
1724 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { 1594 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
1725 if (do_sysvsem) { 1595 if (do_sysvsem) {
1726 /* 1596 /*
1727 * For the semaphore undo list, CLONE_SYSVSEM is equivalent to exiting: detach it. 1597 * For the semaphore undo list, CLONE_SYSVSEM is equivalent to exiting: detach it.
1728 */ 1598 */
1729 exit_sem(current); 1599 exit_sem(current);
1730 } 1600 }
1731 1601
1732 if (new_nsproxy) { 1602 if (new_nsproxy) {
1733 switch_task_namespaces(current, new_nsproxy); 1603 switch_task_namespaces(current, new_nsproxy);
1734 new_nsproxy = NULL; 1604 new_nsproxy = NULL;
1735 } 1605 }
1736 1606
1737 task_lock(current); 1607 task_lock(current);
1738 1608
1739 if (new_fs) { 1609 if (new_fs) {
1740 fs = current->fs; 1610 fs = current->fs;
1741 current->fs = new_fs; 1611 current->fs = new_fs;
1742 new_fs = fs; 1612 new_fs = fs;
1743 } 1613 }
1744 1614
1745 if (new_mm) { 1615 if (new_mm) {
1746 mm = current->mm; 1616 mm = current->mm;
1747 active_mm = current->active_mm; 1617 active_mm = current->active_mm;
1748 current->mm = new_mm; 1618 current->mm = new_mm;
1749 current->active_mm = new_mm; 1619 current->active_mm = new_mm;
1750 activate_mm(active_mm, new_mm); 1620 activate_mm(active_mm, new_mm);
1751 new_mm = mm; 1621 new_mm = mm;
1752 } 1622 }
1753 1623
1754 if (new_fd) { 1624 if (new_fd) {
1755 fd = current->files; 1625 fd = current->files;
1756 current->files = new_fd; 1626 current->files = new_fd;
1757 new_fd = fd; 1627 new_fd = fd;
1758 } 1628 }
1759 1629
1760 task_unlock(current); 1630 task_unlock(current);
1761 } 1631 }
1762 1632
1763 if (new_nsproxy) 1633 if (new_nsproxy)
1764 put_nsproxy(new_nsproxy); 1634 put_nsproxy(new_nsproxy);
1765 1635
1766 bad_unshare_cleanup_fd: 1636 bad_unshare_cleanup_fd:
1767 if (new_fd) 1637 if (new_fd)
1768 put_files_struct(new_fd); 1638 put_files_struct(new_fd);
1769 1639
1770 bad_unshare_cleanup_vm: 1640 bad_unshare_cleanup_vm:
1771 if (new_mm) 1641 if (new_mm)
1772 mmput(new_mm); 1642 mmput(new_mm);
1773 1643
1774 bad_unshare_cleanup_sigh: 1644 bad_unshare_cleanup_sigh:
1775 if (new_sigh) 1645 if (new_sigh)
1776 if (atomic_dec_and_test(&new_sigh->count)) 1646 if (atomic_dec_and_test(&new_sigh->count))
1777 kmem_cache_free(sighand_cachep, new_sigh); 1647 kmem_cache_free(sighand_cachep, new_sigh);
1778 1648
1779 bad_unshare_cleanup_fs: 1649 bad_unshare_cleanup_fs:
1780 if (new_fs) 1650 if (new_fs)
1781 put_fs_struct(new_fs); 1651 put_fs_struct(new_fs);
1782 1652
1783 bad_unshare_cleanup_thread: 1653 bad_unshare_cleanup_thread:
1784 bad_unshare_out: 1654 bad_unshare_out:
1785 return err; 1655 return err;
1786 } 1656 }
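
Editor's note: sys_unshare() is reached from userspace through the unshare(2) wrapper. The sketch below exercises the paths handled above and assumes a glibc that exposes unshare(); it is an illustration, not part of the patch:

#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Give this process a private fd table and fs_struct
         * (handled by unshare_fd() and unshare_fs() above). */
        if (unshare(CLONE_FILES | CLONE_FS) == -1) {
                fprintf(stderr, "unshare: %s\n", strerror(errno));
                return 1;
        }
        /* Note: unshare(CLONE_VM) or unshare(CLONE_SIGHAND) would fail with
         * EINVAL if the mm or sighand is actually shared, as unshare_vm()
         * and unshare_sighand() above reject those cases. */
        return 0;
}
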
1787 1657
1788 /* 1658 /*
1789 * Helper to unshare the files of the current task. 1659 * Helper to unshare the files of the current task.
1790 * We don't want to expose copy_files internals to 1660 * We don't want to expose copy_files internals to
1791 * the exec layer of the kernel. 1661 * the exec layer of the kernel.
1792 */ 1662 */
1793 1663
1794 int unshare_files(struct files_struct **displaced) 1664 int unshare_files(struct files_struct **displaced)
1795 { 1665 {
1796 struct task_struct *task = current; 1666 struct task_struct *task = current;
1797 struct files_struct *copy = NULL; 1667 struct files_struct *copy = NULL;
1798 int error; 1668 int error;
1799 1669
1800 error = unshare_fd(CLONE_FILES, &copy); 1670 error = unshare_fd(CLONE_FILES, &copy);
1801 if (error || !copy) { 1671 if (error || !copy) {
1802 *displaced = NULL; 1672 *displaced = NULL;
1803 return error; 1673 return error;
1804 } 1674 }
1805 *displaced = task->files; 1675 *displaced = task->files;
1806 task_lock(task); 1676 task_lock(task);
1807 task->files = copy; 1677 task->files = copy;
1808 task_unlock(task); 1678 task_unlock(task);
1809 return 0; 1679 return 0;
1810 } 1680 }
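
Editor's note: the exec layer is the intended caller of this helper; it swaps in a private descriptor table before loading the new image and drops the displaced one afterwards. A sketch of that calling pattern, modelled on the exec path but not a verbatim copy of fs/exec.c; example_exec_prep() is illustrative only:

/* Sketch of the intended calling pattern for unshare_files(). */
static int example_exec_prep(void)
{
        struct files_struct *displaced;
        int retval;

        retval = unshare_files(&displaced);     /* current gets a private copy */
        if (retval)
                return retval;

        /* ... load the new image using current->files ... */

        if (displaced)                          /* NULL if it was not shared */
                put_files_struct(displaced);    /* drop the old, shared table */
        return 0;
}
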
1811 1681