Commit 864bdb3b6cbd9911222543fef1cfe36f88183f44
1 parent: 2be7fd55d4
Exists in smarc-l5.0.0_1.0.0-ga and in 5 other branches
new helper: daemonize_descriptors()
descriptor-related parts of daemonize, done right. As a result we simplify the locking rules for ->files: we hold task_lock in *all* cases when we modify ->files.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Showing 3 changed files with 8 additions and 3 deletions (inline diff)
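The helper itself is tiny: it pins init_files and then hands the swap to reset_files_struct(), which (as seen further down in this file) performs the tsk->files assignment under task_lock(). The other two changed files are not part of this excerpt; presumably the open-coded descriptor handover in daemonize() is replaced by a single call to the helper. A sketch of the change follows; the call-site portion is an assumption, not taken from the diff below:

	/* fs/file.c -- the helper added by this commit (new lines 522-527 below).
	 * reset_files_struct() takes task_lock(current) around the tsk->files
	 * assignment, which is what gives the simplified locking rule quoted in
	 * the commit message. */
	void daemonize_descriptors(void)
	{
		atomic_inc(&init_files.count);		/* pin the shared init_files */
		reset_files_struct(&init_files);	/* swap current->files under task_lock() */
	}

	/* Presumed call-site change in daemonize() (kernel/exit.c, not shown in
	 * this excerpt) -- roughly:
	 *
	 *	- exit_files(current);
	 *	- current->files = init_task.files;
	 *	- atomic_inc(&current->files->count);
	 *	+ daemonize_descriptors();
	 */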
fs/file.c
1 | /* | 1 | /* |
2 | * linux/fs/file.c | 2 | * linux/fs/file.c |
3 | * | 3 | * |
4 | * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes | 4 | * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes |
5 | * | 5 | * |
6 | * Manage the dynamic fd arrays in the process files_struct. | 6 | * Manage the dynamic fd arrays in the process files_struct. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/syscalls.h> | 9 | #include <linux/syscalls.h> |
10 | #include <linux/export.h> | 10 | #include <linux/export.h> |
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/mmzone.h> | 13 | #include <linux/mmzone.h> |
14 | #include <linux/time.h> | 14 | #include <linux/time.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/vmalloc.h> | 17 | #include <linux/vmalloc.h> |
18 | #include <linux/file.h> | 18 | #include <linux/file.h> |
19 | #include <linux/fdtable.h> | 19 | #include <linux/fdtable.h> |
20 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> |
21 | #include <linux/interrupt.h> | 21 | #include <linux/interrupt.h> |
22 | #include <linux/spinlock.h> | 22 | #include <linux/spinlock.h> |
23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
24 | #include <linux/workqueue.h> | 24 | #include <linux/workqueue.h> |
25 | 25 | ||
26 | struct fdtable_defer { | 26 | struct fdtable_defer { |
27 | spinlock_t lock; | 27 | spinlock_t lock; |
28 | struct work_struct wq; | 28 | struct work_struct wq; |
29 | struct fdtable *next; | 29 | struct fdtable *next; |
30 | }; | 30 | }; |
31 | 31 | ||
32 | int sysctl_nr_open __read_mostly = 1024*1024; | 32 | int sysctl_nr_open __read_mostly = 1024*1024; |
33 | int sysctl_nr_open_min = BITS_PER_LONG; | 33 | int sysctl_nr_open_min = BITS_PER_LONG; |
34 | int sysctl_nr_open_max = 1024 * 1024; /* raised later */ | 34 | int sysctl_nr_open_max = 1024 * 1024; /* raised later */ |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * We use this list to defer free fdtables that have vmalloced | 37 | * We use this list to defer free fdtables that have vmalloced |
38 | * sets/arrays. By keeping a per-cpu list, we avoid having to embed | 38 | * sets/arrays. By keeping a per-cpu list, we avoid having to embed |
39 | * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in | 39 | * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in |
40 | * this per-task structure. | 40 | * this per-task structure. |
41 | */ | 41 | */ |
42 | static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); | 42 | static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); |
43 | 43 | ||
44 | static void *alloc_fdmem(size_t size) | 44 | static void *alloc_fdmem(size_t size) |
45 | { | 45 | { |
46 | /* | 46 | /* |
47 | * Very large allocations can stress page reclaim, so fall back to | 47 | * Very large allocations can stress page reclaim, so fall back to |
48 | * vmalloc() if the allocation size will be considered "large" by the VM. | 48 | * vmalloc() if the allocation size will be considered "large" by the VM. |
49 | */ | 49 | */ |
50 | if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { | 50 | if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { |
51 | void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); | 51 | void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); |
52 | if (data != NULL) | 52 | if (data != NULL) |
53 | return data; | 53 | return data; |
54 | } | 54 | } |
55 | return vmalloc(size); | 55 | return vmalloc(size); |
56 | } | 56 | } |
57 | 57 | ||
58 | static void free_fdmem(void *ptr) | 58 | static void free_fdmem(void *ptr) |
59 | { | 59 | { |
60 | is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr); | 60 | is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr); |
61 | } | 61 | } |
62 | 62 | ||
63 | static void __free_fdtable(struct fdtable *fdt) | 63 | static void __free_fdtable(struct fdtable *fdt) |
64 | { | 64 | { |
65 | free_fdmem(fdt->fd); | 65 | free_fdmem(fdt->fd); |
66 | free_fdmem(fdt->open_fds); | 66 | free_fdmem(fdt->open_fds); |
67 | kfree(fdt); | 67 | kfree(fdt); |
68 | } | 68 | } |
69 | 69 | ||
70 | static void free_fdtable_work(struct work_struct *work) | 70 | static void free_fdtable_work(struct work_struct *work) |
71 | { | 71 | { |
72 | struct fdtable_defer *f = | 72 | struct fdtable_defer *f = |
73 | container_of(work, struct fdtable_defer, wq); | 73 | container_of(work, struct fdtable_defer, wq); |
74 | struct fdtable *fdt; | 74 | struct fdtable *fdt; |
75 | 75 | ||
76 | spin_lock_bh(&f->lock); | 76 | spin_lock_bh(&f->lock); |
77 | fdt = f->next; | 77 | fdt = f->next; |
78 | f->next = NULL; | 78 | f->next = NULL; |
79 | spin_unlock_bh(&f->lock); | 79 | spin_unlock_bh(&f->lock); |
80 | while(fdt) { | 80 | while(fdt) { |
81 | struct fdtable *next = fdt->next; | 81 | struct fdtable *next = fdt->next; |
82 | 82 | ||
83 | __free_fdtable(fdt); | 83 | __free_fdtable(fdt); |
84 | fdt = next; | 84 | fdt = next; |
85 | } | 85 | } |
86 | } | 86 | } |
87 | 87 | ||
88 | static void free_fdtable_rcu(struct rcu_head *rcu) | 88 | static void free_fdtable_rcu(struct rcu_head *rcu) |
89 | { | 89 | { |
90 | struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); | 90 | struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); |
91 | struct fdtable_defer *fddef; | 91 | struct fdtable_defer *fddef; |
92 | 92 | ||
93 | BUG_ON(!fdt); | 93 | BUG_ON(!fdt); |
94 | BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT); | 94 | BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT); |
95 | 95 | ||
96 | if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { | 96 | if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { |
97 | kfree(fdt->fd); | 97 | kfree(fdt->fd); |
98 | kfree(fdt->open_fds); | 98 | kfree(fdt->open_fds); |
99 | kfree(fdt); | 99 | kfree(fdt); |
100 | } else { | 100 | } else { |
101 | fddef = &get_cpu_var(fdtable_defer_list); | 101 | fddef = &get_cpu_var(fdtable_defer_list); |
102 | spin_lock(&fddef->lock); | 102 | spin_lock(&fddef->lock); |
103 | fdt->next = fddef->next; | 103 | fdt->next = fddef->next; |
104 | fddef->next = fdt; | 104 | fddef->next = fdt; |
105 | /* vmallocs are handled from the workqueue context */ | 105 | /* vmallocs are handled from the workqueue context */ |
106 | schedule_work(&fddef->wq); | 106 | schedule_work(&fddef->wq); |
107 | spin_unlock(&fddef->lock); | 107 | spin_unlock(&fddef->lock); |
108 | put_cpu_var(fdtable_defer_list); | 108 | put_cpu_var(fdtable_defer_list); |
109 | } | 109 | } |
110 | } | 110 | } |
111 | 111 | ||
112 | /* | 112 | /* |
113 | * Expand the fdset in the files_struct. Called with the files spinlock | 113 | * Expand the fdset in the files_struct. Called with the files spinlock |
114 | * held for write. | 114 | * held for write. |
115 | */ | 115 | */ |
116 | static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) | 116 | static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) |
117 | { | 117 | { |
118 | unsigned int cpy, set; | 118 | unsigned int cpy, set; |
119 | 119 | ||
120 | BUG_ON(nfdt->max_fds < ofdt->max_fds); | 120 | BUG_ON(nfdt->max_fds < ofdt->max_fds); |
121 | 121 | ||
122 | cpy = ofdt->max_fds * sizeof(struct file *); | 122 | cpy = ofdt->max_fds * sizeof(struct file *); |
123 | set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); | 123 | set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); |
124 | memcpy(nfdt->fd, ofdt->fd, cpy); | 124 | memcpy(nfdt->fd, ofdt->fd, cpy); |
125 | memset((char *)(nfdt->fd) + cpy, 0, set); | 125 | memset((char *)(nfdt->fd) + cpy, 0, set); |
126 | 126 | ||
127 | cpy = ofdt->max_fds / BITS_PER_BYTE; | 127 | cpy = ofdt->max_fds / BITS_PER_BYTE; |
128 | set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; | 128 | set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; |
129 | memcpy(nfdt->open_fds, ofdt->open_fds, cpy); | 129 | memcpy(nfdt->open_fds, ofdt->open_fds, cpy); |
130 | memset((char *)(nfdt->open_fds) + cpy, 0, set); | 130 | memset((char *)(nfdt->open_fds) + cpy, 0, set); |
131 | memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); | 131 | memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); |
132 | memset((char *)(nfdt->close_on_exec) + cpy, 0, set); | 132 | memset((char *)(nfdt->close_on_exec) + cpy, 0, set); |
133 | } | 133 | } |
134 | 134 | ||
135 | static struct fdtable * alloc_fdtable(unsigned int nr) | 135 | static struct fdtable * alloc_fdtable(unsigned int nr) |
136 | { | 136 | { |
137 | struct fdtable *fdt; | 137 | struct fdtable *fdt; |
138 | void *data; | 138 | void *data; |
139 | 139 | ||
140 | /* | 140 | /* |
141 | * Figure out how many fds we actually want to support in this fdtable. | 141 | * Figure out how many fds we actually want to support in this fdtable. |
142 | * Allocation steps are keyed to the size of the fdarray, since it | 142 | * Allocation steps are keyed to the size of the fdarray, since it |
143 | * grows far faster than any of the other dynamic data. We try to fit | 143 | * grows far faster than any of the other dynamic data. We try to fit |
144 | * the fdarray into comfortable page-tuned chunks: starting at 1024B | 144 | * the fdarray into comfortable page-tuned chunks: starting at 1024B |
145 | * and growing in powers of two from there on. | 145 | * and growing in powers of two from there on. |
146 | */ | 146 | */ |
147 | nr /= (1024 / sizeof(struct file *)); | 147 | nr /= (1024 / sizeof(struct file *)); |
148 | nr = roundup_pow_of_two(nr + 1); | 148 | nr = roundup_pow_of_two(nr + 1); |
149 | nr *= (1024 / sizeof(struct file *)); | 149 | nr *= (1024 / sizeof(struct file *)); |
150 | /* | 150 | /* |
151 | * Note that this can drive nr *below* what we had passed if sysctl_nr_open | 151 | * Note that this can drive nr *below* what we had passed if sysctl_nr_open |
152 | * had been set lower between the check in expand_files() and here. Deal | 152 | * had been set lower between the check in expand_files() and here. Deal |
153 | * with that in caller, it's cheaper that way. | 153 | * with that in caller, it's cheaper that way. |
154 | * | 154 | * |
155 | * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise | 155 | * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise |
156 | * bitmaps handling below becomes unpleasant, to put it mildly... | 156 | * bitmaps handling below becomes unpleasant, to put it mildly... |
157 | */ | 157 | */ |
158 | if (unlikely(nr > sysctl_nr_open)) | 158 | if (unlikely(nr > sysctl_nr_open)) |
159 | nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; | 159 | nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; |
160 | 160 | ||
161 | fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); | 161 | fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); |
162 | if (!fdt) | 162 | if (!fdt) |
163 | goto out; | 163 | goto out; |
164 | fdt->max_fds = nr; | 164 | fdt->max_fds = nr; |
165 | data = alloc_fdmem(nr * sizeof(struct file *)); | 165 | data = alloc_fdmem(nr * sizeof(struct file *)); |
166 | if (!data) | 166 | if (!data) |
167 | goto out_fdt; | 167 | goto out_fdt; |
168 | fdt->fd = data; | 168 | fdt->fd = data; |
169 | 169 | ||
170 | data = alloc_fdmem(max_t(size_t, | 170 | data = alloc_fdmem(max_t(size_t, |
171 | 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); | 171 | 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); |
172 | if (!data) | 172 | if (!data) |
173 | goto out_arr; | 173 | goto out_arr; |
174 | fdt->open_fds = data; | 174 | fdt->open_fds = data; |
175 | data += nr / BITS_PER_BYTE; | 175 | data += nr / BITS_PER_BYTE; |
176 | fdt->close_on_exec = data; | 176 | fdt->close_on_exec = data; |
177 | fdt->next = NULL; | 177 | fdt->next = NULL; |
178 | 178 | ||
179 | return fdt; | 179 | return fdt; |
180 | 180 | ||
181 | out_arr: | 181 | out_arr: |
182 | free_fdmem(fdt->fd); | 182 | free_fdmem(fdt->fd); |
183 | out_fdt: | 183 | out_fdt: |
184 | kfree(fdt); | 184 | kfree(fdt); |
185 | out: | 185 | out: |
186 | return NULL; | 186 | return NULL; |
187 | } | 187 | } |
188 | 188 | ||
189 | /* | 189 | /* |
190 | * Expand the file descriptor table. | 190 | * Expand the file descriptor table. |
191 | * This function will allocate a new fdtable and both fd array and fdset, of | 191 | * This function will allocate a new fdtable and both fd array and fdset, of |
192 | * the given size. | 192 | * the given size. |
193 | * Return <0 error code on error; 1 on successful completion. | 193 | * Return <0 error code on error; 1 on successful completion. |
194 | * The files->file_lock should be held on entry, and will be held on exit. | 194 | * The files->file_lock should be held on entry, and will be held on exit. |
195 | */ | 195 | */ |
196 | static int expand_fdtable(struct files_struct *files, int nr) | 196 | static int expand_fdtable(struct files_struct *files, int nr) |
197 | __releases(files->file_lock) | 197 | __releases(files->file_lock) |
198 | __acquires(files->file_lock) | 198 | __acquires(files->file_lock) |
199 | { | 199 | { |
200 | struct fdtable *new_fdt, *cur_fdt; | 200 | struct fdtable *new_fdt, *cur_fdt; |
201 | 201 | ||
202 | spin_unlock(&files->file_lock); | 202 | spin_unlock(&files->file_lock); |
203 | new_fdt = alloc_fdtable(nr); | 203 | new_fdt = alloc_fdtable(nr); |
204 | spin_lock(&files->file_lock); | 204 | spin_lock(&files->file_lock); |
205 | if (!new_fdt) | 205 | if (!new_fdt) |
206 | return -ENOMEM; | 206 | return -ENOMEM; |
207 | /* | 207 | /* |
208 | * extremely unlikely race - sysctl_nr_open decreased between the check in | 208 | * extremely unlikely race - sysctl_nr_open decreased between the check in |
209 | * caller and alloc_fdtable(). Cheaper to catch it here... | 209 | * caller and alloc_fdtable(). Cheaper to catch it here... |
210 | */ | 210 | */ |
211 | if (unlikely(new_fdt->max_fds <= nr)) { | 211 | if (unlikely(new_fdt->max_fds <= nr)) { |
212 | __free_fdtable(new_fdt); | 212 | __free_fdtable(new_fdt); |
213 | return -EMFILE; | 213 | return -EMFILE; |
214 | } | 214 | } |
215 | /* | 215 | /* |
216 | * Check again since another task may have expanded the fd table while | 216 | * Check again since another task may have expanded the fd table while |
217 | * we dropped the lock | 217 | * we dropped the lock |
218 | */ | 218 | */ |
219 | cur_fdt = files_fdtable(files); | 219 | cur_fdt = files_fdtable(files); |
220 | if (nr >= cur_fdt->max_fds) { | 220 | if (nr >= cur_fdt->max_fds) { |
221 | /* Continue as planned */ | 221 | /* Continue as planned */ |
222 | copy_fdtable(new_fdt, cur_fdt); | 222 | copy_fdtable(new_fdt, cur_fdt); |
223 | rcu_assign_pointer(files->fdt, new_fdt); | 223 | rcu_assign_pointer(files->fdt, new_fdt); |
224 | if (cur_fdt->max_fds > NR_OPEN_DEFAULT) | 224 | if (cur_fdt->max_fds > NR_OPEN_DEFAULT) |
225 | call_rcu(&cur_fdt->rcu, free_fdtable_rcu); | 225 | call_rcu(&cur_fdt->rcu, free_fdtable_rcu); |
226 | } else { | 226 | } else { |
227 | /* Somebody else expanded, so undo our attempt */ | 227 | /* Somebody else expanded, so undo our attempt */ |
228 | __free_fdtable(new_fdt); | 228 | __free_fdtable(new_fdt); |
229 | } | 229 | } |
230 | return 1; | 230 | return 1; |
231 | } | 231 | } |
232 | 232 | ||
233 | /* | 233 | /* |
234 | * Expand files. | 234 | * Expand files. |
235 | * This function will expand the file structures, if the requested size exceeds | 235 | * This function will expand the file structures, if the requested size exceeds |
236 | * the current capacity and there is room for expansion. | 236 | * the current capacity and there is room for expansion. |
237 | * Return <0 error code on error; 0 when nothing done; 1 when files were | 237 | * Return <0 error code on error; 0 when nothing done; 1 when files were |
238 | * expanded and execution may have blocked. | 238 | * expanded and execution may have blocked. |
239 | * The files->file_lock should be held on entry, and will be held on exit. | 239 | * The files->file_lock should be held on entry, and will be held on exit. |
240 | */ | 240 | */ |
241 | static int expand_files(struct files_struct *files, int nr) | 241 | static int expand_files(struct files_struct *files, int nr) |
242 | { | 242 | { |
243 | struct fdtable *fdt; | 243 | struct fdtable *fdt; |
244 | 244 | ||
245 | fdt = files_fdtable(files); | 245 | fdt = files_fdtable(files); |
246 | 246 | ||
247 | /* Do we need to expand? */ | 247 | /* Do we need to expand? */ |
248 | if (nr < fdt->max_fds) | 248 | if (nr < fdt->max_fds) |
249 | return 0; | 249 | return 0; |
250 | 250 | ||
251 | /* Can we expand? */ | 251 | /* Can we expand? */ |
252 | if (nr >= sysctl_nr_open) | 252 | if (nr >= sysctl_nr_open) |
253 | return -EMFILE; | 253 | return -EMFILE; |
254 | 254 | ||
255 | /* All good, so we try */ | 255 | /* All good, so we try */ |
256 | return expand_fdtable(files, nr); | 256 | return expand_fdtable(files, nr); |
257 | } | 257 | } |
258 | 258 | ||
259 | static inline void __set_close_on_exec(int fd, struct fdtable *fdt) | 259 | static inline void __set_close_on_exec(int fd, struct fdtable *fdt) |
260 | { | 260 | { |
261 | __set_bit(fd, fdt->close_on_exec); | 261 | __set_bit(fd, fdt->close_on_exec); |
262 | } | 262 | } |
263 | 263 | ||
264 | static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) | 264 | static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) |
265 | { | 265 | { |
266 | __clear_bit(fd, fdt->close_on_exec); | 266 | __clear_bit(fd, fdt->close_on_exec); |
267 | } | 267 | } |
268 | 268 | ||
269 | static inline void __set_open_fd(int fd, struct fdtable *fdt) | 269 | static inline void __set_open_fd(int fd, struct fdtable *fdt) |
270 | { | 270 | { |
271 | __set_bit(fd, fdt->open_fds); | 271 | __set_bit(fd, fdt->open_fds); |
272 | } | 272 | } |
273 | 273 | ||
274 | static inline void __clear_open_fd(int fd, struct fdtable *fdt) | 274 | static inline void __clear_open_fd(int fd, struct fdtable *fdt) |
275 | { | 275 | { |
276 | __clear_bit(fd, fdt->open_fds); | 276 | __clear_bit(fd, fdt->open_fds); |
277 | } | 277 | } |
278 | 278 | ||
279 | static int count_open_files(struct fdtable *fdt) | 279 | static int count_open_files(struct fdtable *fdt) |
280 | { | 280 | { |
281 | int size = fdt->max_fds; | 281 | int size = fdt->max_fds; |
282 | int i; | 282 | int i; |
283 | 283 | ||
284 | /* Find the last open fd */ | 284 | /* Find the last open fd */ |
285 | for (i = size / BITS_PER_LONG; i > 0; ) { | 285 | for (i = size / BITS_PER_LONG; i > 0; ) { |
286 | if (fdt->open_fds[--i]) | 286 | if (fdt->open_fds[--i]) |
287 | break; | 287 | break; |
288 | } | 288 | } |
289 | i = (i + 1) * BITS_PER_LONG; | 289 | i = (i + 1) * BITS_PER_LONG; |
290 | return i; | 290 | return i; |
291 | } | 291 | } |
292 | 292 | ||
293 | /* | 293 | /* |
294 | * Allocate a new files structure and copy contents from the | 294 | * Allocate a new files structure and copy contents from the |
295 | * passed in files structure. | 295 | * passed in files structure. |
296 | * errorp will be valid only when the returned files_struct is NULL. | 296 | * errorp will be valid only when the returned files_struct is NULL. |
297 | */ | 297 | */ |
298 | struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | 298 | struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) |
299 | { | 299 | { |
300 | struct files_struct *newf; | 300 | struct files_struct *newf; |
301 | struct file **old_fds, **new_fds; | 301 | struct file **old_fds, **new_fds; |
302 | int open_files, size, i; | 302 | int open_files, size, i; |
303 | struct fdtable *old_fdt, *new_fdt; | 303 | struct fdtable *old_fdt, *new_fdt; |
304 | 304 | ||
305 | *errorp = -ENOMEM; | 305 | *errorp = -ENOMEM; |
306 | newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); | 306 | newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); |
307 | if (!newf) | 307 | if (!newf) |
308 | goto out; | 308 | goto out; |
309 | 309 | ||
310 | atomic_set(&newf->count, 1); | 310 | atomic_set(&newf->count, 1); |
311 | 311 | ||
312 | spin_lock_init(&newf->file_lock); | 312 | spin_lock_init(&newf->file_lock); |
313 | newf->next_fd = 0; | 313 | newf->next_fd = 0; |
314 | new_fdt = &newf->fdtab; | 314 | new_fdt = &newf->fdtab; |
315 | new_fdt->max_fds = NR_OPEN_DEFAULT; | 315 | new_fdt->max_fds = NR_OPEN_DEFAULT; |
316 | new_fdt->close_on_exec = newf->close_on_exec_init; | 316 | new_fdt->close_on_exec = newf->close_on_exec_init; |
317 | new_fdt->open_fds = newf->open_fds_init; | 317 | new_fdt->open_fds = newf->open_fds_init; |
318 | new_fdt->fd = &newf->fd_array[0]; | 318 | new_fdt->fd = &newf->fd_array[0]; |
319 | new_fdt->next = NULL; | 319 | new_fdt->next = NULL; |
320 | 320 | ||
321 | spin_lock(&oldf->file_lock); | 321 | spin_lock(&oldf->file_lock); |
322 | old_fdt = files_fdtable(oldf); | 322 | old_fdt = files_fdtable(oldf); |
323 | open_files = count_open_files(old_fdt); | 323 | open_files = count_open_files(old_fdt); |
324 | 324 | ||
325 | /* | 325 | /* |
326 | * Check whether we need to allocate a larger fd array and fd set. | 326 | * Check whether we need to allocate a larger fd array and fd set. |
327 | */ | 327 | */ |
328 | while (unlikely(open_files > new_fdt->max_fds)) { | 328 | while (unlikely(open_files > new_fdt->max_fds)) { |
329 | spin_unlock(&oldf->file_lock); | 329 | spin_unlock(&oldf->file_lock); |
330 | 330 | ||
331 | if (new_fdt != &newf->fdtab) | 331 | if (new_fdt != &newf->fdtab) |
332 | __free_fdtable(new_fdt); | 332 | __free_fdtable(new_fdt); |
333 | 333 | ||
334 | new_fdt = alloc_fdtable(open_files - 1); | 334 | new_fdt = alloc_fdtable(open_files - 1); |
335 | if (!new_fdt) { | 335 | if (!new_fdt) { |
336 | *errorp = -ENOMEM; | 336 | *errorp = -ENOMEM; |
337 | goto out_release; | 337 | goto out_release; |
338 | } | 338 | } |
339 | 339 | ||
340 | /* beyond sysctl_nr_open; nothing to do */ | 340 | /* beyond sysctl_nr_open; nothing to do */ |
341 | if (unlikely(new_fdt->max_fds < open_files)) { | 341 | if (unlikely(new_fdt->max_fds < open_files)) { |
342 | __free_fdtable(new_fdt); | 342 | __free_fdtable(new_fdt); |
343 | *errorp = -EMFILE; | 343 | *errorp = -EMFILE; |
344 | goto out_release; | 344 | goto out_release; |
345 | } | 345 | } |
346 | 346 | ||
347 | /* | 347 | /* |
348 | * Reacquire the oldf lock and a pointer to its fd table | 348 | * Reacquire the oldf lock and a pointer to its fd table |
349 | * who knows it may have a new bigger fd table. We need | 349 | * who knows it may have a new bigger fd table. We need |
350 | * the latest pointer. | 350 | * the latest pointer. |
351 | */ | 351 | */ |
352 | spin_lock(&oldf->file_lock); | 352 | spin_lock(&oldf->file_lock); |
353 | old_fdt = files_fdtable(oldf); | 353 | old_fdt = files_fdtable(oldf); |
354 | open_files = count_open_files(old_fdt); | 354 | open_files = count_open_files(old_fdt); |
355 | } | 355 | } |
356 | 356 | ||
357 | old_fds = old_fdt->fd; | 357 | old_fds = old_fdt->fd; |
358 | new_fds = new_fdt->fd; | 358 | new_fds = new_fdt->fd; |
359 | 359 | ||
360 | memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); | 360 | memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); |
361 | memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); | 361 | memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); |
362 | 362 | ||
363 | for (i = open_files; i != 0; i--) { | 363 | for (i = open_files; i != 0; i--) { |
364 | struct file *f = *old_fds++; | 364 | struct file *f = *old_fds++; |
365 | if (f) { | 365 | if (f) { |
366 | get_file(f); | 366 | get_file(f); |
367 | } else { | 367 | } else { |
368 | /* | 368 | /* |
369 | * The fd may be claimed in the fd bitmap but not yet | 369 | * The fd may be claimed in the fd bitmap but not yet |
370 | * instantiated in the files array if a sibling thread | 370 | * instantiated in the files array if a sibling thread |
371 | * is partway through open(). So make sure that this | 371 | * is partway through open(). So make sure that this |
372 | * fd is available to the new process. | 372 | * fd is available to the new process. |
373 | */ | 373 | */ |
374 | __clear_open_fd(open_files - i, new_fdt); | 374 | __clear_open_fd(open_files - i, new_fdt); |
375 | } | 375 | } |
376 | rcu_assign_pointer(*new_fds++, f); | 376 | rcu_assign_pointer(*new_fds++, f); |
377 | } | 377 | } |
378 | spin_unlock(&oldf->file_lock); | 378 | spin_unlock(&oldf->file_lock); |
379 | 379 | ||
380 | /* compute the remainder to be cleared */ | 380 | /* compute the remainder to be cleared */ |
381 | size = (new_fdt->max_fds - open_files) * sizeof(struct file *); | 381 | size = (new_fdt->max_fds - open_files) * sizeof(struct file *); |
382 | 382 | ||
383 | /* This is long word aligned thus could use a optimized version */ | 383 | /* This is long word aligned thus could use a optimized version */ |
384 | memset(new_fds, 0, size); | 384 | memset(new_fds, 0, size); |
385 | 385 | ||
386 | if (new_fdt->max_fds > open_files) { | 386 | if (new_fdt->max_fds > open_files) { |
387 | int left = (new_fdt->max_fds - open_files) / 8; | 387 | int left = (new_fdt->max_fds - open_files) / 8; |
388 | int start = open_files / BITS_PER_LONG; | 388 | int start = open_files / BITS_PER_LONG; |
389 | 389 | ||
390 | memset(&new_fdt->open_fds[start], 0, left); | 390 | memset(&new_fdt->open_fds[start], 0, left); |
391 | memset(&new_fdt->close_on_exec[start], 0, left); | 391 | memset(&new_fdt->close_on_exec[start], 0, left); |
392 | } | 392 | } |
393 | 393 | ||
394 | rcu_assign_pointer(newf->fdt, new_fdt); | 394 | rcu_assign_pointer(newf->fdt, new_fdt); |
395 | 395 | ||
396 | return newf; | 396 | return newf; |
397 | 397 | ||
398 | out_release: | 398 | out_release: |
399 | kmem_cache_free(files_cachep, newf); | 399 | kmem_cache_free(files_cachep, newf); |
400 | out: | 400 | out: |
401 | return NULL; | 401 | return NULL; |
402 | } | 402 | } |
403 | 403 | ||
404 | static void close_files(struct files_struct * files) | 404 | static void close_files(struct files_struct * files) |
405 | { | 405 | { |
406 | int i, j; | 406 | int i, j; |
407 | struct fdtable *fdt; | 407 | struct fdtable *fdt; |
408 | 408 | ||
409 | j = 0; | 409 | j = 0; |
410 | 410 | ||
411 | /* | 411 | /* |
412 | * It is safe to dereference the fd table without RCU or | 412 | * It is safe to dereference the fd table without RCU or |
413 | * ->file_lock because this is the last reference to the | 413 | * ->file_lock because this is the last reference to the |
414 | * files structure. But use RCU to shut RCU-lockdep up. | 414 | * files structure. But use RCU to shut RCU-lockdep up. |
415 | */ | 415 | */ |
416 | rcu_read_lock(); | 416 | rcu_read_lock(); |
417 | fdt = files_fdtable(files); | 417 | fdt = files_fdtable(files); |
418 | rcu_read_unlock(); | 418 | rcu_read_unlock(); |
419 | for (;;) { | 419 | for (;;) { |
420 | unsigned long set; | 420 | unsigned long set; |
421 | i = j * BITS_PER_LONG; | 421 | i = j * BITS_PER_LONG; |
422 | if (i >= fdt->max_fds) | 422 | if (i >= fdt->max_fds) |
423 | break; | 423 | break; |
424 | set = fdt->open_fds[j++]; | 424 | set = fdt->open_fds[j++]; |
425 | while (set) { | 425 | while (set) { |
426 | if (set & 1) { | 426 | if (set & 1) { |
427 | struct file * file = xchg(&fdt->fd[i], NULL); | 427 | struct file * file = xchg(&fdt->fd[i], NULL); |
428 | if (file) { | 428 | if (file) { |
429 | filp_close(file, files); | 429 | filp_close(file, files); |
430 | cond_resched(); | 430 | cond_resched(); |
431 | } | 431 | } |
432 | } | 432 | } |
433 | i++; | 433 | i++; |
434 | set >>= 1; | 434 | set >>= 1; |
435 | } | 435 | } |
436 | } | 436 | } |
437 | } | 437 | } |
438 | 438 | ||
439 | struct files_struct *get_files_struct(struct task_struct *task) | 439 | struct files_struct *get_files_struct(struct task_struct *task) |
440 | { | 440 | { |
441 | struct files_struct *files; | 441 | struct files_struct *files; |
442 | 442 | ||
443 | task_lock(task); | 443 | task_lock(task); |
444 | files = task->files; | 444 | files = task->files; |
445 | if (files) | 445 | if (files) |
446 | atomic_inc(&files->count); | 446 | atomic_inc(&files->count); |
447 | task_unlock(task); | 447 | task_unlock(task); |
448 | 448 | ||
449 | return files; | 449 | return files; |
450 | } | 450 | } |
451 | 451 | ||
452 | void put_files_struct(struct files_struct *files) | 452 | void put_files_struct(struct files_struct *files) |
453 | { | 453 | { |
454 | struct fdtable *fdt; | 454 | struct fdtable *fdt; |
455 | 455 | ||
456 | if (atomic_dec_and_test(&files->count)) { | 456 | if (atomic_dec_and_test(&files->count)) { |
457 | close_files(files); | 457 | close_files(files); |
458 | /* not really needed, since nobody can see us */ | 458 | /* not really needed, since nobody can see us */ |
459 | rcu_read_lock(); | 459 | rcu_read_lock(); |
460 | fdt = files_fdtable(files); | 460 | fdt = files_fdtable(files); |
461 | rcu_read_unlock(); | 461 | rcu_read_unlock(); |
462 | /* free the arrays if they are not embedded */ | 462 | /* free the arrays if they are not embedded */ |
463 | if (fdt != &files->fdtab) | 463 | if (fdt != &files->fdtab) |
464 | __free_fdtable(fdt); | 464 | __free_fdtable(fdt); |
465 | kmem_cache_free(files_cachep, files); | 465 | kmem_cache_free(files_cachep, files); |
466 | } | 466 | } |
467 | } | 467 | } |
468 | 468 | ||
469 | void reset_files_struct(struct files_struct *files) | 469 | void reset_files_struct(struct files_struct *files) |
470 | { | 470 | { |
471 | struct task_struct *tsk = current; | 471 | struct task_struct *tsk = current; |
472 | struct files_struct *old; | 472 | struct files_struct *old; |
473 | 473 | ||
474 | old = tsk->files; | 474 | old = tsk->files; |
475 | task_lock(tsk); | 475 | task_lock(tsk); |
476 | tsk->files = files; | 476 | tsk->files = files; |
477 | task_unlock(tsk); | 477 | task_unlock(tsk); |
478 | put_files_struct(old); | 478 | put_files_struct(old); |
479 | } | 479 | } |
480 | 480 | ||
481 | void exit_files(struct task_struct *tsk) | 481 | void exit_files(struct task_struct *tsk) |
482 | { | 482 | { |
483 | struct files_struct * files = tsk->files; | 483 | struct files_struct * files = tsk->files; |
484 | 484 | ||
485 | if (files) { | 485 | if (files) { |
486 | task_lock(tsk); | 486 | task_lock(tsk); |
487 | tsk->files = NULL; | 487 | tsk->files = NULL; |
488 | task_unlock(tsk); | 488 | task_unlock(tsk); |
489 | put_files_struct(files); | 489 | put_files_struct(files); |
490 | } | 490 | } |
491 | } | 491 | } |
492 | 492 | ||
493 | static void __devinit fdtable_defer_list_init(int cpu) | 493 | static void __devinit fdtable_defer_list_init(int cpu) |
494 | { | 494 | { |
495 | struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); | 495 | struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); |
496 | spin_lock_init(&fddef->lock); | 496 | spin_lock_init(&fddef->lock); |
497 | INIT_WORK(&fddef->wq, free_fdtable_work); | 497 | INIT_WORK(&fddef->wq, free_fdtable_work); |
498 | fddef->next = NULL; | 498 | fddef->next = NULL; |
499 | } | 499 | } |
500 | 500 | ||
501 | void __init files_defer_init(void) | 501 | void __init files_defer_init(void) |
502 | { | 502 | { |
503 | int i; | 503 | int i; |
504 | for_each_possible_cpu(i) | 504 | for_each_possible_cpu(i) |
505 | fdtable_defer_list_init(i); | 505 | fdtable_defer_list_init(i); |
506 | sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & | 506 | sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & |
507 | -BITS_PER_LONG; | 507 | -BITS_PER_LONG; |
508 | } | 508 | } |
509 | 509 | ||
510 | struct files_struct init_files = { | 510 | struct files_struct init_files = { |
511 | .count = ATOMIC_INIT(1), | 511 | .count = ATOMIC_INIT(1), |
512 | .fdt = &init_files.fdtab, | 512 | .fdt = &init_files.fdtab, |
513 | .fdtab = { | 513 | .fdtab = { |
514 | .max_fds = NR_OPEN_DEFAULT, | 514 | .max_fds = NR_OPEN_DEFAULT, |
515 | .fd = &init_files.fd_array[0], | 515 | .fd = &init_files.fd_array[0], |
516 | .close_on_exec = init_files.close_on_exec_init, | 516 | .close_on_exec = init_files.close_on_exec_init, |
517 | .open_fds = init_files.open_fds_init, | 517 | .open_fds = init_files.open_fds_init, |
518 | }, | 518 | }, |
519 | .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), | 519 | .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), |
520 | }; | 520 | }; |
521 | 521 | ||
| | 522 | void daemonize_descriptors(void) |
| | 523 | { |
| | 524 | atomic_inc(&init_files.count); |
| | 525 | reset_files_struct(&init_files); |
| | 526 | } |
| | 527 | |
522 | /* | 528 | /* |
523 | * allocate a file descriptor, mark it busy. | 529 | * allocate a file descriptor, mark it busy. |
524 | */ | 530 | */ |
525 | int __alloc_fd(struct files_struct *files, | 531 | int __alloc_fd(struct files_struct *files, |
526 | unsigned start, unsigned end, unsigned flags) | 532 | unsigned start, unsigned end, unsigned flags) |
527 | { | 533 | { |
528 | unsigned int fd; | 534 | unsigned int fd; |
529 | int error; | 535 | int error; |
530 | struct fdtable *fdt; | 536 | struct fdtable *fdt; |
531 | 537 | ||
532 | spin_lock(&files->file_lock); | 538 | spin_lock(&files->file_lock); |
533 | repeat: | 539 | repeat: |
534 | fdt = files_fdtable(files); | 540 | fdt = files_fdtable(files); |
535 | fd = start; | 541 | fd = start; |
536 | if (fd < files->next_fd) | 542 | if (fd < files->next_fd) |
537 | fd = files->next_fd; | 543 | fd = files->next_fd; |
538 | 544 | ||
539 | if (fd < fdt->max_fds) | 545 | if (fd < fdt->max_fds) |
540 | fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); | 546 | fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); |
541 | 547 | ||
542 | /* | 548 | /* |
543 | * N.B. For clone tasks sharing a files structure, this test | 549 | * N.B. For clone tasks sharing a files structure, this test |
544 | * will limit the total number of files that can be opened. | 550 | * will limit the total number of files that can be opened. |
545 | */ | 551 | */ |
546 | error = -EMFILE; | 552 | error = -EMFILE; |
547 | if (fd >= end) | 553 | if (fd >= end) |
548 | goto out; | 554 | goto out; |
549 | 555 | ||
550 | error = expand_files(files, fd); | 556 | error = expand_files(files, fd); |
551 | if (error < 0) | 557 | if (error < 0) |
552 | goto out; | 558 | goto out; |
553 | 559 | ||
554 | /* | 560 | /* |
555 | * If we needed to expand the fs array we | 561 | * If we needed to expand the fs array we |
556 | * might have blocked - try again. | 562 | * might have blocked - try again. |
557 | */ | 563 | */ |
558 | if (error) | 564 | if (error) |
559 | goto repeat; | 565 | goto repeat; |
560 | 566 | ||
561 | if (start <= files->next_fd) | 567 | if (start <= files->next_fd) |
562 | files->next_fd = fd + 1; | 568 | files->next_fd = fd + 1; |
563 | 569 | ||
564 | __set_open_fd(fd, fdt); | 570 | __set_open_fd(fd, fdt); |
565 | if (flags & O_CLOEXEC) | 571 | if (flags & O_CLOEXEC) |
566 | __set_close_on_exec(fd, fdt); | 572 | __set_close_on_exec(fd, fdt); |
567 | else | 573 | else |
568 | __clear_close_on_exec(fd, fdt); | 574 | __clear_close_on_exec(fd, fdt); |
569 | error = fd; | 575 | error = fd; |
570 | #if 1 | 576 | #if 1 |
571 | /* Sanity check */ | 577 | /* Sanity check */ |
572 | if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { | 578 | if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { |
573 | printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); | 579 | printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); |
574 | rcu_assign_pointer(fdt->fd[fd], NULL); | 580 | rcu_assign_pointer(fdt->fd[fd], NULL); |
575 | } | 581 | } |
576 | #endif | 582 | #endif |
577 | 583 | ||
578 | out: | 584 | out: |
579 | spin_unlock(&files->file_lock); | 585 | spin_unlock(&files->file_lock); |
580 | return error; | 586 | return error; |
581 | } | 587 | } |
582 | 588 | ||
583 | static int alloc_fd(unsigned start, unsigned flags) | 589 | static int alloc_fd(unsigned start, unsigned flags) |
584 | { | 590 | { |
585 | return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); | 591 | return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); |
586 | } | 592 | } |
587 | 593 | ||
588 | int get_unused_fd_flags(unsigned flags) | 594 | int get_unused_fd_flags(unsigned flags) |
589 | { | 595 | { |
590 | return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); | 596 | return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); |
591 | } | 597 | } |
592 | EXPORT_SYMBOL(get_unused_fd_flags); | 598 | EXPORT_SYMBOL(get_unused_fd_flags); |
593 | 599 | ||
594 | static void __put_unused_fd(struct files_struct *files, unsigned int fd) | 600 | static void __put_unused_fd(struct files_struct *files, unsigned int fd) |
595 | { | 601 | { |
596 | struct fdtable *fdt = files_fdtable(files); | 602 | struct fdtable *fdt = files_fdtable(files); |
597 | __clear_open_fd(fd, fdt); | 603 | __clear_open_fd(fd, fdt); |
598 | if (fd < files->next_fd) | 604 | if (fd < files->next_fd) |
599 | files->next_fd = fd; | 605 | files->next_fd = fd; |
600 | } | 606 | } |
601 | 607 | ||
602 | void put_unused_fd(unsigned int fd) | 608 | void put_unused_fd(unsigned int fd) |
603 | { | 609 | { |
604 | struct files_struct *files = current->files; | 610 | struct files_struct *files = current->files; |
605 | spin_lock(&files->file_lock); | 611 | spin_lock(&files->file_lock); |
606 | __put_unused_fd(files, fd); | 612 | __put_unused_fd(files, fd); |
607 | spin_unlock(&files->file_lock); | 613 | spin_unlock(&files->file_lock); |
608 | } | 614 | } |
609 | 615 | ||
610 | EXPORT_SYMBOL(put_unused_fd); | 616 | EXPORT_SYMBOL(put_unused_fd); |
611 | 617 | ||
612 | /* | 618 | /* |
613 | * Install a file pointer in the fd array. | 619 | * Install a file pointer in the fd array. |
614 | * | 620 | * |
615 | * The VFS is full of places where we drop the files lock between | 621 | * The VFS is full of places where we drop the files lock between |
616 | * setting the open_fds bitmap and installing the file in the file | 622 | * setting the open_fds bitmap and installing the file in the file |
617 | * array. At any such point, we are vulnerable to a dup2() race | 623 | * array. At any such point, we are vulnerable to a dup2() race |
618 | * installing a file in the array before us. We need to detect this and | 624 | * installing a file in the array before us. We need to detect this and |
619 | * fput() the struct file we are about to overwrite in this case. | 625 | * fput() the struct file we are about to overwrite in this case. |
620 | * | 626 | * |
621 | * It should never happen - if we allow dup2() do it, _really_ bad things | 627 | * It should never happen - if we allow dup2() do it, _really_ bad things |
622 | * will follow. | 628 | * will follow. |
623 | * | 629 | * |
624 | * NOTE: __fd_install() variant is really, really low-level; don't | 630 | * NOTE: __fd_install() variant is really, really low-level; don't |
625 | * use it unless you are forced to by truly lousy API shoved down | 631 | * use it unless you are forced to by truly lousy API shoved down |
626 | * your throat. 'files' *MUST* be either current->files or obtained | 632 | * your throat. 'files' *MUST* be either current->files or obtained |
627 | * by get_files_struct(current) done by whoever had given it to you, | 633 | * by get_files_struct(current) done by whoever had given it to you, |
628 | * or really bad things will happen. Normally you want to use | 634 | * or really bad things will happen. Normally you want to use |
629 | * fd_install() instead. | 635 | * fd_install() instead. |
630 | */ | 636 | */ |
631 | 637 | ||
632 | void __fd_install(struct files_struct *files, unsigned int fd, | 638 | void __fd_install(struct files_struct *files, unsigned int fd, |
633 | struct file *file) | 639 | struct file *file) |
634 | { | 640 | { |
635 | struct fdtable *fdt; | 641 | struct fdtable *fdt; |
636 | spin_lock(&files->file_lock); | 642 | spin_lock(&files->file_lock); |
637 | fdt = files_fdtable(files); | 643 | fdt = files_fdtable(files); |
638 | BUG_ON(fdt->fd[fd] != NULL); | 644 | BUG_ON(fdt->fd[fd] != NULL); |
639 | rcu_assign_pointer(fdt->fd[fd], file); | 645 | rcu_assign_pointer(fdt->fd[fd], file); |
640 | spin_unlock(&files->file_lock); | 646 | spin_unlock(&files->file_lock); |
641 | } | 647 | } |
642 | 648 | ||
643 | void fd_install(unsigned int fd, struct file *file) | 649 | void fd_install(unsigned int fd, struct file *file) |
644 | { | 650 | { |
645 | __fd_install(current->files, fd, file); | 651 | __fd_install(current->files, fd, file); |
646 | } | 652 | } |
647 | 653 | ||
648 | EXPORT_SYMBOL(fd_install); | 654 | EXPORT_SYMBOL(fd_install); |
649 | 655 | ||
650 | /* | 656 | /* |
651 | * The same warnings as for __alloc_fd()/__fd_install() apply here... | 657 | * The same warnings as for __alloc_fd()/__fd_install() apply here... |
652 | */ | 658 | */ |
653 | int __close_fd(struct files_struct *files, unsigned fd) | 659 | int __close_fd(struct files_struct *files, unsigned fd) |
654 | { | 660 | { |
655 | struct file *file; | 661 | struct file *file; |
656 | struct fdtable *fdt; | 662 | struct fdtable *fdt; |
657 | 663 | ||
658 | spin_lock(&files->file_lock); | 664 | spin_lock(&files->file_lock); |
659 | fdt = files_fdtable(files); | 665 | fdt = files_fdtable(files); |
660 | if (fd >= fdt->max_fds) | 666 | if (fd >= fdt->max_fds) |
661 | goto out_unlock; | 667 | goto out_unlock; |
662 | file = fdt->fd[fd]; | 668 | file = fdt->fd[fd]; |
663 | if (!file) | 669 | if (!file) |
664 | goto out_unlock; | 670 | goto out_unlock; |
665 | rcu_assign_pointer(fdt->fd[fd], NULL); | 671 | rcu_assign_pointer(fdt->fd[fd], NULL); |
666 | __clear_close_on_exec(fd, fdt); | 672 | __clear_close_on_exec(fd, fdt); |
667 | __put_unused_fd(files, fd); | 673 | __put_unused_fd(files, fd); |
668 | spin_unlock(&files->file_lock); | 674 | spin_unlock(&files->file_lock); |
669 | return filp_close(file, files); | 675 | return filp_close(file, files); |
670 | 676 | ||
671 | out_unlock: | 677 | out_unlock: |
672 | spin_unlock(&files->file_lock); | 678 | spin_unlock(&files->file_lock); |
673 | return -EBADF; | 679 | return -EBADF; |
674 | } | 680 | } |
675 | 681 | ||
676 | void do_close_on_exec(struct files_struct *files) | 682 | void do_close_on_exec(struct files_struct *files) |
677 | { | 683 | { |
678 | unsigned i; | 684 | unsigned i; |
679 | struct fdtable *fdt; | 685 | struct fdtable *fdt; |
680 | 686 | ||
681 | /* exec unshares first */ | 687 | /* exec unshares first */ |
682 | BUG_ON(atomic_read(&files->count) != 1); | 688 | BUG_ON(atomic_read(&files->count) != 1); |
683 | spin_lock(&files->file_lock); | 689 | spin_lock(&files->file_lock); |
684 | for (i = 0; ; i++) { | 690 | for (i = 0; ; i++) { |
685 | unsigned long set; | 691 | unsigned long set; |
686 | unsigned fd = i * BITS_PER_LONG; | 692 | unsigned fd = i * BITS_PER_LONG; |
687 | fdt = files_fdtable(files); | 693 | fdt = files_fdtable(files); |
688 | if (fd >= fdt->max_fds) | 694 | if (fd >= fdt->max_fds) |
689 | break; | 695 | break; |
690 | set = fdt->close_on_exec[i]; | 696 | set = fdt->close_on_exec[i]; |
691 | if (!set) | 697 | if (!set) |
692 | continue; | 698 | continue; |
693 | fdt->close_on_exec[i] = 0; | 699 | fdt->close_on_exec[i] = 0; |
694 | for ( ; set ; fd++, set >>= 1) { | 700 | for ( ; set ; fd++, set >>= 1) { |
695 | struct file *file; | 701 | struct file *file; |
696 | if (!(set & 1)) | 702 | if (!(set & 1)) |
697 | continue; | 703 | continue; |
698 | file = fdt->fd[fd]; | 704 | file = fdt->fd[fd]; |
699 | if (!file) | 705 | if (!file) |
700 | continue; | 706 | continue; |
701 | rcu_assign_pointer(fdt->fd[fd], NULL); | 707 | rcu_assign_pointer(fdt->fd[fd], NULL); |
702 | __put_unused_fd(files, fd); | 708 | __put_unused_fd(files, fd); |
703 | spin_unlock(&files->file_lock); | 709 | spin_unlock(&files->file_lock); |
704 | filp_close(file, files); | 710 | filp_close(file, files); |
705 | cond_resched(); | 711 | cond_resched(); |
706 | spin_lock(&files->file_lock); | 712 | spin_lock(&files->file_lock); |
707 | } | 713 | } |
708 | 714 | ||
709 | } | 715 | } |
710 | spin_unlock(&files->file_lock); | 716 | spin_unlock(&files->file_lock); |
711 | } | 717 | } |
712 | 718 | ||
713 | struct file *fget(unsigned int fd) | 719 | struct file *fget(unsigned int fd) |
714 | { | 720 | { |
715 | struct file *file; | 721 | struct file *file; |
716 | struct files_struct *files = current->files; | 722 | struct files_struct *files = current->files; |
717 | 723 | ||
718 | rcu_read_lock(); | 724 | rcu_read_lock(); |
719 | file = fcheck_files(files, fd); | 725 | file = fcheck_files(files, fd); |
720 | if (file) { | 726 | if (file) { |
721 | /* File object ref couldn't be taken */ | 727 | /* File object ref couldn't be taken */ |
722 | if (file->f_mode & FMODE_PATH || | 728 | if (file->f_mode & FMODE_PATH || |
723 | !atomic_long_inc_not_zero(&file->f_count)) | 729 | !atomic_long_inc_not_zero(&file->f_count)) |
724 | file = NULL; | 730 | file = NULL; |
725 | } | 731 | } |
726 | rcu_read_unlock(); | 732 | rcu_read_unlock(); |
727 | 733 | ||
728 | return file; | 734 | return file; |
729 | } | 735 | } |
730 | 736 | ||
731 | EXPORT_SYMBOL(fget); | 737 | EXPORT_SYMBOL(fget); |
732 | 738 | ||
733 | struct file *fget_raw(unsigned int fd) | 739 | struct file *fget_raw(unsigned int fd) |
734 | { | 740 | { |
735 | struct file *file; | 741 | struct file *file; |
736 | struct files_struct *files = current->files; | 742 | struct files_struct *files = current->files; |
737 | 743 | ||
738 | rcu_read_lock(); | 744 | rcu_read_lock(); |
739 | file = fcheck_files(files, fd); | 745 | file = fcheck_files(files, fd); |
740 | if (file) { | 746 | if (file) { |
741 | /* File object ref couldn't be taken */ | 747 | /* File object ref couldn't be taken */ |
742 | if (!atomic_long_inc_not_zero(&file->f_count)) | 748 | if (!atomic_long_inc_not_zero(&file->f_count)) |
743 | file = NULL; | 749 | file = NULL; |
744 | } | 750 | } |
745 | rcu_read_unlock(); | 751 | rcu_read_unlock(); |
746 | 752 | ||
747 | return file; | 753 | return file; |
748 | } | 754 | } |
749 | 755 | ||
750 | EXPORT_SYMBOL(fget_raw); | 756 | EXPORT_SYMBOL(fget_raw); |
751 | 757 | ||
752 | /* | 758 | /* |
753 | * Lightweight file lookup - no refcnt increment if fd table isn't shared. | 759 | * Lightweight file lookup - no refcnt increment if fd table isn't shared. |
754 | * | 760 | * |
755 | * You can use this instead of fget if you satisfy all of the following | 761 | * You can use this instead of fget if you satisfy all of the following |
756 | * conditions: | 762 | * conditions: |
757 | * 1) You must call fput_light before exiting the syscall and returning control | 763 | * 1) You must call fput_light before exiting the syscall and returning control |
758 | * to userspace (i.e. you cannot remember the returned struct file * after | 764 | * to userspace (i.e. you cannot remember the returned struct file * after |
759 | * returning to userspace). | 765 | * returning to userspace). |
760 | * 2) You must not call filp_close on the returned struct file * in between | 766 | * 2) You must not call filp_close on the returned struct file * in between |
761 | * calls to fget_light and fput_light. | 767 | * calls to fget_light and fput_light. |
762 | * 3) You must not clone the current task in between the calls to fget_light | 768 | * 3) You must not clone the current task in between the calls to fget_light |
763 | * and fput_light. | 769 | * and fput_light. |
764 | * | 770 | * |
765 | * The fput_needed flag returned by fget_light should be passed to the | 771 | * The fput_needed flag returned by fget_light should be passed to the |
766 | * corresponding fput_light. | 772 | * corresponding fput_light. |
767 | */ | 773 | */ |
768 | struct file *fget_light(unsigned int fd, int *fput_needed) | 774 | struct file *fget_light(unsigned int fd, int *fput_needed) |
769 | { | 775 | { |
770 | struct file *file; | 776 | struct file *file; |
771 | struct files_struct *files = current->files; | 777 | struct files_struct *files = current->files; |
772 | 778 | ||
773 | *fput_needed = 0; | 779 | *fput_needed = 0; |
774 | if (atomic_read(&files->count) == 1) { | 780 | if (atomic_read(&files->count) == 1) { |
775 | file = fcheck_files(files, fd); | 781 | file = fcheck_files(files, fd); |
776 | if (file && (file->f_mode & FMODE_PATH)) | 782 | if (file && (file->f_mode & FMODE_PATH)) |
777 | file = NULL; | 783 | file = NULL; |
778 | } else { | 784 | } else { |
779 | rcu_read_lock(); | 785 | rcu_read_lock(); |
780 | file = fcheck_files(files, fd); | 786 | file = fcheck_files(files, fd); |
781 | if (file) { | 787 | if (file) { |
782 | if (!(file->f_mode & FMODE_PATH) && | 788 | if (!(file->f_mode & FMODE_PATH) && |
783 | atomic_long_inc_not_zero(&file->f_count)) | 789 | atomic_long_inc_not_zero(&file->f_count)) |
784 | *fput_needed = 1; | 790 | *fput_needed = 1; |
785 | else | 791 | else |
786 | /* Didn't get the reference, someone's freed */ | 792 | /* Didn't get the reference, someone's freed */ |
787 | file = NULL; | 793 | file = NULL; |
788 | } | 794 | } |
789 | rcu_read_unlock(); | 795 | rcu_read_unlock(); |
790 | } | 796 | } |
791 | 797 | ||
792 | return file; | 798 | return file; |
793 | } | 799 | } |
794 | 800 | ||
795 | struct file *fget_raw_light(unsigned int fd, int *fput_needed) | 801 | struct file *fget_raw_light(unsigned int fd, int *fput_needed) |
796 | { | 802 | { |
797 | struct file *file; | 803 | struct file *file; |
798 | struct files_struct *files = current->files; | 804 | struct files_struct *files = current->files; |
799 | 805 | ||
800 | *fput_needed = 0; | 806 | *fput_needed = 0; |
801 | if (atomic_read(&files->count) == 1) { | 807 | if (atomic_read(&files->count) == 1) { |
802 | file = fcheck_files(files, fd); | 808 | file = fcheck_files(files, fd); |
803 | } else { | 809 | } else { |
804 | rcu_read_lock(); | 810 | rcu_read_lock(); |
805 | file = fcheck_files(files, fd); | 811 | file = fcheck_files(files, fd); |
806 | if (file) { | 812 | if (file) { |
807 | if (atomic_long_inc_not_zero(&file->f_count)) | 813 | if (atomic_long_inc_not_zero(&file->f_count)) |
808 | *fput_needed = 1; | 814 | *fput_needed = 1; |
809 | else | 815 | else |
810 | /* Didn't get the reference, someone's freed */ | 816 | /* Didn't get the reference, someone's freed */ |
811 | file = NULL; | 817 | file = NULL; |
812 | } | 818 | } |
813 | rcu_read_unlock(); | 819 | rcu_read_unlock(); |
814 | } | 820 | } |
815 | 821 | ||
816 | return file; | 822 | return file; |
817 | } | 823 | } |
818 | 824 | ||
819 | void set_close_on_exec(unsigned int fd, int flag) | 825 | void set_close_on_exec(unsigned int fd, int flag) |
820 | { | 826 | { |
821 | struct files_struct *files = current->files; | 827 | struct files_struct *files = current->files; |
822 | struct fdtable *fdt; | 828 | struct fdtable *fdt; |
823 | spin_lock(&files->file_lock); | 829 | spin_lock(&files->file_lock); |
824 | fdt = files_fdtable(files); | 830 | fdt = files_fdtable(files); |
825 | if (flag) | 831 | if (flag) |
826 | __set_close_on_exec(fd, fdt); | 832 | __set_close_on_exec(fd, fdt); |
827 | else | 833 | else |
828 | __clear_close_on_exec(fd, fdt); | 834 | __clear_close_on_exec(fd, fdt); |
829 | spin_unlock(&files->file_lock); | 835 | spin_unlock(&files->file_lock); |
830 | } | 836 | } |
831 | 837 | ||
832 | bool get_close_on_exec(unsigned int fd) | 838 | bool get_close_on_exec(unsigned int fd) |
833 | { | 839 | { |
834 | struct files_struct *files = current->files; | 840 | struct files_struct *files = current->files; |
835 | struct fdtable *fdt; | 841 | struct fdtable *fdt; |
836 | bool res; | 842 | bool res; |
837 | rcu_read_lock(); | 843 | rcu_read_lock(); |
838 | fdt = files_fdtable(files); | 844 | fdt = files_fdtable(files); |
839 | res = close_on_exec(fd, fdt); | 845 | res = close_on_exec(fd, fdt); |
840 | rcu_read_unlock(); | 846 | rcu_read_unlock(); |
841 | return res; | 847 | return res; |
842 | } | 848 | } |
843 | 849 | ||
844 | static int do_dup2(struct files_struct *files, | 850 | static int do_dup2(struct files_struct *files, |
845 | struct file *file, unsigned fd, unsigned flags) | 851 | struct file *file, unsigned fd, unsigned flags) |
846 | { | 852 | { |
847 | struct file *tofree; | 853 | struct file *tofree; |
848 | struct fdtable *fdt; | 854 | struct fdtable *fdt; |
849 | 855 | ||
850 | /* | 856 | /* |
851 | * We need to detect attempts to do dup2() over allocated but still | 857 | * We need to detect attempts to do dup2() over allocated but still |
852 | * not finished descriptor. NB: OpenBSD avoids that at the price of | 858 | * not finished descriptor. NB: OpenBSD avoids that at the price of |
853 | * extra work in their equivalent of fget() - they insert struct | 859 | * extra work in their equivalent of fget() - they insert struct |
854 | * file immediately after grabbing descriptor, mark it larval if | 860 | * file immediately after grabbing descriptor, mark it larval if |
855 | * more work (e.g. actual opening) is needed and make sure that | 861 | * more work (e.g. actual opening) is needed and make sure that |
856 | * fget() treats larval files as absent. Potentially interesting, | 862 | * fget() treats larval files as absent. Potentially interesting, |
857 | * but while extra work in fget() is trivial, locking implications | 863 | * but while extra work in fget() is trivial, locking implications |
858 | * and amount of surgery on open()-related paths in VFS are not. | 864 | * and amount of surgery on open()-related paths in VFS are not. |
859 | * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" | 865 | * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" |
860 | * deadlocks in rather amusing ways, AFAICS. All of that is out of | 866 | * deadlocks in rather amusing ways, AFAICS. All of that is out of |
861 | * scope of POSIX or SUS, since neither considers shared descriptor | 867 | * scope of POSIX or SUS, since neither considers shared descriptor |
862 | * tables and this condition does not arise without those. | 868 | * tables and this condition does not arise without those. |
863 | */ | 869 | */ |
864 | fdt = files_fdtable(files); | 870 | fdt = files_fdtable(files); |
865 | tofree = fdt->fd[fd]; | 871 | tofree = fdt->fd[fd]; |
866 | if (!tofree && fd_is_open(fd, fdt)) | 872 | if (!tofree && fd_is_open(fd, fdt)) |
867 | goto Ebusy; | 873 | goto Ebusy; |
868 | get_file(file); | 874 | get_file(file); |
869 | rcu_assign_pointer(fdt->fd[fd], file); | 875 | rcu_assign_pointer(fdt->fd[fd], file); |
870 | __set_open_fd(fd, fdt); | 876 | __set_open_fd(fd, fdt); |
871 | if (flags & O_CLOEXEC) | 877 | if (flags & O_CLOEXEC) |
872 | __set_close_on_exec(fd, fdt); | 878 | __set_close_on_exec(fd, fdt); |
873 | else | 879 | else |
874 | __clear_close_on_exec(fd, fdt); | 880 | __clear_close_on_exec(fd, fdt); |
875 | spin_unlock(&files->file_lock); | 881 | spin_unlock(&files->file_lock); |
876 | 882 | ||
877 | if (tofree) | 883 | if (tofree) |
878 | filp_close(tofree, files); | 884 | filp_close(tofree, files); |
879 | 885 | ||
880 | return fd; | 886 | return fd; |
881 | 887 | ||
882 | Ebusy: | 888 | Ebusy: |
883 | spin_unlock(&files->file_lock); | 889 | spin_unlock(&files->file_lock); |
884 | return -EBUSY; | 890 | return -EBUSY; |
885 | } | 891 | } |
886 | 892 | ||
887 | int replace_fd(unsigned fd, struct file *file, unsigned flags) | 893 | int replace_fd(unsigned fd, struct file *file, unsigned flags) |
888 | { | 894 | { |
889 | int err; | 895 | int err; |
890 | struct files_struct *files = current->files; | 896 | struct files_struct *files = current->files; |
891 | 897 | ||
892 | if (!file) | 898 | if (!file) |
893 | return __close_fd(files, fd); | 899 | return __close_fd(files, fd); |
894 | 900 | ||
895 | if (fd >= rlimit(RLIMIT_NOFILE)) | 901 | if (fd >= rlimit(RLIMIT_NOFILE)) |
896 | return -EMFILE; | 902 | return -EMFILE; |
897 | 903 | ||
898 | spin_lock(&files->file_lock); | 904 | spin_lock(&files->file_lock); |
899 | err = expand_files(files, fd); | 905 | err = expand_files(files, fd); |
900 | if (unlikely(err < 0)) | 906 | if (unlikely(err < 0)) |
901 | goto out_unlock; | 907 | goto out_unlock; |
902 | return do_dup2(files, file, fd, flags); | 908 | return do_dup2(files, file, fd, flags); |
903 | 909 | ||
904 | out_unlock: | 910 | out_unlock: |
905 | spin_unlock(&files->file_lock); | 911 | spin_unlock(&files->file_lock); |
906 | return err; | 912 | return err; |
907 | } | 913 | } |
908 | 914 | ||
909 | SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) | 915 | SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) |
910 | { | 916 | { |
911 | int err = -EBADF; | 917 | int err = -EBADF; |
912 | struct file *file; | 918 | struct file *file; |
913 | struct files_struct *files = current->files; | 919 | struct files_struct *files = current->files; |
914 | 920 | ||
915 | if ((flags & ~O_CLOEXEC) != 0) | 921 | if ((flags & ~O_CLOEXEC) != 0) |
916 | return -EINVAL; | 922 | return -EINVAL; |
917 | 923 | ||
918 | if (newfd >= rlimit(RLIMIT_NOFILE)) | 924 | if (newfd >= rlimit(RLIMIT_NOFILE)) |
919 | return -EMFILE; | 925 | return -EMFILE; |
920 | 926 | ||
921 | spin_lock(&files->file_lock); | 927 | spin_lock(&files->file_lock); |
922 | err = expand_files(files, newfd); | 928 | err = expand_files(files, newfd); |
923 | file = fcheck(oldfd); | 929 | file = fcheck(oldfd); |
924 | if (unlikely(!file)) | 930 | if (unlikely(!file)) |
925 | goto Ebadf; | 931 | goto Ebadf; |
926 | if (unlikely(err < 0)) { | 932 | if (unlikely(err < 0)) { |
927 | if (err == -EMFILE) | 933 | if (err == -EMFILE) |
928 | goto Ebadf; | 934 | goto Ebadf; |
929 | goto out_unlock; | 935 | goto out_unlock; |
930 | } | 936 | } |
931 | return do_dup2(files, file, newfd, flags); | 937 | return do_dup2(files, file, newfd, flags); |
932 | 938 | ||
933 | Ebadf: | 939 | Ebadf: |
934 | err = -EBADF; | 940 | err = -EBADF; |
935 | out_unlock: | 941 | out_unlock: |
936 | spin_unlock(&files->file_lock); | 942 | spin_unlock(&files->file_lock); |
937 | return err; | 943 | return err; |
938 | } | 944 | } |
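Relative to dup2(), dup3() adds only the flags argument, restricted above to O_CLOEXEC, so the duplicate can be created with close-on-exec already set instead of racing a separate fcntl(). A minimal userspace sketch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Duplicate oldfd onto newfd with close-on-exec already set, avoiding
 * the window a dup2() + fcntl(F_SETFD, FD_CLOEXEC) pair would leave. */
static int dup_cloexec(int oldfd, int newfd)
{
        return dup3(oldfd, newfd, O_CLOEXEC);
}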
939 | 945 | ||
940 | SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) | 946 | SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) |
941 | { | 947 | { |
942 | if (unlikely(newfd == oldfd)) { /* corner case */ | 948 | if (unlikely(newfd == oldfd)) { /* corner case */ |
943 | struct files_struct *files = current->files; | 949 | struct files_struct *files = current->files; |
944 | int retval = oldfd; | 950 | int retval = oldfd; |
945 | 951 | ||
946 | rcu_read_lock(); | 952 | rcu_read_lock(); |
947 | if (!fcheck_files(files, oldfd)) | 953 | if (!fcheck_files(files, oldfd)) |
948 | retval = -EBADF; | 954 | retval = -EBADF; |
949 | rcu_read_unlock(); | 955 | rcu_read_unlock(); |
950 | return retval; | 956 | return retval; |
951 | } | 957 | } |
952 | return sys_dup3(oldfd, newfd, 0); | 958 | return sys_dup3(oldfd, newfd, 0); |
953 | } | 959 | } |
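The corner case above is the POSIX-mandated behaviour of dup2(fd, fd): nothing is duplicated, the call just reports whether fd is open, returning fd itself or failing with EBADF. A tiny userspace probe that relies on exactly that:

#include <unistd.h>

/* Returns non-zero if fd refers to an open descriptor; dup2(fd, fd)
 * leaves the descriptor table untouched either way. */
static int fd_is_valid(int fd)
{
        return dup2(fd, fd) == fd;
}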
954 | 960 | ||
955 | SYSCALL_DEFINE1(dup, unsigned int, fildes) | 961 | SYSCALL_DEFINE1(dup, unsigned int, fildes) |
956 | { | 962 | { |
957 | int ret = -EBADF; | 963 | int ret = -EBADF; |
958 | struct file *file = fget_raw(fildes); | 964 | struct file *file = fget_raw(fildes); |
959 | 965 | ||
960 | if (file) { | 966 | if (file) { |
961 | ret = get_unused_fd(); | 967 | ret = get_unused_fd(); |
962 | if (ret >= 0) | 968 | if (ret >= 0) |
963 | fd_install(ret, file); | 969 | fd_install(ret, file); |
964 | else | 970 | else |
965 | fput(file); | 971 | fput(file); |
966 | } | 972 | } |
967 | return ret; | 973 | return ret; |
968 | } | 974 | } |
969 | 975 | ||
970 | int f_dupfd(unsigned int from, struct file *file, unsigned flags) | 976 | int f_dupfd(unsigned int from, struct file *file, unsigned flags) |
971 | { | 977 | { |
972 | int err; | 978 | int err; |
973 | if (from >= rlimit(RLIMIT_NOFILE)) | 979 | if (from >= rlimit(RLIMIT_NOFILE)) |
974 | return -EINVAL; | 980 | return -EINVAL; |
975 | err = alloc_fd(from, flags); | 981 | err = alloc_fd(from, flags); |
976 | if (err >= 0) { | 982 | if (err >= 0) { |
977 | get_file(file); | 983 | get_file(file); |
978 | fd_install(err, file); | 984 | fd_install(err, file); |
979 | } | 985 | } |
980 | return err; | 986 | return err; |
981 | } | 987 | } |
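f_dupfd() is the backend for fcntl(F_DUPFD) and F_DUPFD_CLOEXEC: @from is the lowest acceptable descriptor number and alloc_fd() picks the first free slot at or above it (note the limit check returns -EINVAL here rather than -EMFILE, matching fcntl() semantics). The classic userspace use is keeping a duplicate away from the low descriptors:

#define _GNU_SOURCE
#include <fcntl.h>

/* Copy fd to the first free descriptor >= 10, with close-on-exec set,
 * leaving the conventional stdio range untouched. */
static int dup_above_ten(int fd)
{
        return fcntl(fd, F_DUPFD_CLOEXEC, 10);
}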
982 | 988 | ||
983 | int iterate_fd(struct files_struct *files, unsigned n, | 989 | int iterate_fd(struct files_struct *files, unsigned n, |
984 | int (*f)(const void *, struct file *, unsigned), | 990 | int (*f)(const void *, struct file *, unsigned), |
985 | const void *p) | 991 | const void *p) |
986 | { | 992 | { |
987 | struct fdtable *fdt; | 993 | struct fdtable *fdt; |
988 | struct file *file; | 994 | struct file *file; |
989 | int res = 0; | 995 | int res = 0; |
990 | if (!files) | 996 | if (!files) |
991 | return 0; | 997 | return 0; |
992 | spin_lock(&files->file_lock); | 998 | spin_lock(&files->file_lock); |
993 | fdt = files_fdtable(files); | 999 | fdt = files_fdtable(files); |
994 | while (!res && n < fdt->max_fds) { | 1000 | while (!res && n < fdt->max_fds) { |
995 | file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); | 1001 | file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); |
996 | if (file) | 1002 | if (file) |
997 | res = f(p, file, n); | 1003 | res = f(p, file, n); |
998 | } | 1004 | } |
999 | spin_unlock(&files->file_lock); | 1005 | spin_unlock(&files->file_lock); |
1000 | return res; | 1006 | return res; |
1001 | } | 1007 | } |
1002 | EXPORT_SYMBOL(iterate_fd); | 1008 | EXPORT_SYMBOL(iterate_fd); |
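iterate_fd() walks a task's descriptor table under ->file_lock and hands each installed file to the callback, stopping at the first non-zero return; with the loop above the callback sees the index after the post-increment, i.e. fd + 1. A hypothetical caller, just to show the calling convention:

/* Hypothetical: does any installed descriptor in @files point at
 * @victim?  The callback runs under ->file_lock and must not sleep;
 * a non-zero return stops the walk and becomes iterate_fd()'s result. */
static int match_file(const void *victim, struct file *file, unsigned fd)
{
        return file == victim;
}

static bool files_contain(struct files_struct *files, struct file *victim)
{
        return iterate_fd(files, 0, match_file, victim) != 0;
}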
1003 | 1009 |
include/linux/fdtable.h
1 | /* | 1 | /* |
2 | * descriptor table internals; you almost certainly want file.h instead. | 2 | * descriptor table internals; you almost certainly want file.h instead. |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #ifndef __LINUX_FDTABLE_H | 5 | #ifndef __LINUX_FDTABLE_H |
6 | #define __LINUX_FDTABLE_H | 6 | #define __LINUX_FDTABLE_H |
7 | 7 | ||
8 | #include <linux/posix_types.h> | 8 | #include <linux/posix_types.h> |
9 | #include <linux/compiler.h> | 9 | #include <linux/compiler.h> |
10 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
11 | #include <linux/rcupdate.h> | 11 | #include <linux/rcupdate.h> |
12 | #include <linux/types.h> | 12 | #include <linux/types.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
15 | 15 | ||
16 | #include <linux/atomic.h> | 16 | #include <linux/atomic.h> |
17 | 17 | ||
18 | /* | 18 | /* |
19 | * The default fd array needs to be at least BITS_PER_LONG, | 19 | * The default fd array needs to be at least BITS_PER_LONG, |
20 | * as this is the granularity returned by copy_fdset(). | 20 | * as this is the granularity returned by copy_fdset(). |
21 | */ | 21 | */ |
22 | #define NR_OPEN_DEFAULT BITS_PER_LONG | 22 | #define NR_OPEN_DEFAULT BITS_PER_LONG |
23 | 23 | ||
24 | struct fdtable { | 24 | struct fdtable { |
25 | unsigned int max_fds; | 25 | unsigned int max_fds; |
26 | struct file __rcu **fd; /* current fd array */ | 26 | struct file __rcu **fd; /* current fd array */ |
27 | unsigned long *close_on_exec; | 27 | unsigned long *close_on_exec; |
28 | unsigned long *open_fds; | 28 | unsigned long *open_fds; |
29 | struct rcu_head rcu; | 29 | struct rcu_head rcu; |
30 | struct fdtable *next; | 30 | struct fdtable *next; |
31 | }; | 31 | }; |
32 | 32 | ||
33 | static inline bool close_on_exec(int fd, const struct fdtable *fdt) | 33 | static inline bool close_on_exec(int fd, const struct fdtable *fdt) |
34 | { | 34 | { |
35 | return test_bit(fd, fdt->close_on_exec); | 35 | return test_bit(fd, fdt->close_on_exec); |
36 | } | 36 | } |
37 | 37 | ||
38 | static inline bool fd_is_open(int fd, const struct fdtable *fdt) | 38 | static inline bool fd_is_open(int fd, const struct fdtable *fdt) |
39 | { | 39 | { |
40 | return test_bit(fd, fdt->open_fds); | 40 | return test_bit(fd, fdt->open_fds); |
41 | } | 41 | } |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Open file table structure | 44 | * Open file table structure |
45 | */ | 45 | */ |
46 | struct files_struct { | 46 | struct files_struct { |
47 | /* | 47 | /* |
48 | * read mostly part | 48 | * read mostly part |
49 | */ | 49 | */ |
50 | atomic_t count; | 50 | atomic_t count; |
51 | struct fdtable __rcu *fdt; | 51 | struct fdtable __rcu *fdt; |
52 | struct fdtable fdtab; | 52 | struct fdtable fdtab; |
53 | /* | 53 | /* |
54 | * written part on a separate cache line in SMP | 54 | * written part on a separate cache line in SMP |
55 | */ | 55 | */ |
56 | spinlock_t file_lock ____cacheline_aligned_in_smp; | 56 | spinlock_t file_lock ____cacheline_aligned_in_smp; |
57 | int next_fd; | 57 | int next_fd; |
58 | unsigned long close_on_exec_init[1]; | 58 | unsigned long close_on_exec_init[1]; |
59 | unsigned long open_fds_init[1]; | 59 | unsigned long open_fds_init[1]; |
60 | struct file __rcu * fd_array[NR_OPEN_DEFAULT]; | 60 | struct file __rcu * fd_array[NR_OPEN_DEFAULT]; |
61 | }; | 61 | }; |
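The embedded fdtab, the two *_init bitmap words and fd_array let a freshly created files_struct describe up to NR_OPEN_DEFAULT descriptors without any extra allocation; only when that is outgrown does the expansion path install a heap-allocated fdtable and repoint ->fdt. A sketch of the wiring (mirrors what the allocation paths in fs/file.c set up; the function name is illustrative):

/* Illustration: make the RCU-visible ->fdt point at the embedded table. */
static void wire_embedded_fdtable(struct files_struct *f)
{
        struct fdtable *fdt = &f->fdtab;

        fdt->max_fds = NR_OPEN_DEFAULT;
        fdt->fd = &f->fd_array[0];
        fdt->open_fds = f->open_fds_init;
        fdt->close_on_exec = f->close_on_exec_init;
        rcu_assign_pointer(f->fdt, fdt);
}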
62 | 62 | ||
63 | #define rcu_dereference_check_fdtable(files, fdtfd) \ | 63 | #define rcu_dereference_check_fdtable(files, fdtfd) \ |
64 | (rcu_dereference_check((fdtfd), \ | 64 | (rcu_dereference_check((fdtfd), \ |
65 | lockdep_is_held(&(files)->file_lock) || \ | 65 | lockdep_is_held(&(files)->file_lock) || \ |
66 | atomic_read(&(files)->count) == 1 || \ | 66 | atomic_read(&(files)->count) == 1 || \ |
67 | rcu_my_thread_group_empty())) | 67 | rcu_my_thread_group_empty())) |
68 | 68 | ||
69 | #define files_fdtable(files) \ | 69 | #define files_fdtable(files) \ |
70 | (rcu_dereference_check_fdtable((files), (files)->fdt)) | 70 | (rcu_dereference_check_fdtable((files), (files)->fdt)) |
71 | 71 | ||
72 | struct file_operations; | 72 | struct file_operations; |
73 | struct vfsmount; | 73 | struct vfsmount; |
74 | struct dentry; | 74 | struct dentry; |
75 | 75 | ||
76 | extern void __init files_defer_init(void); | 76 | extern void __init files_defer_init(void); |
77 | 77 | ||
78 | static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) | 78 | static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) |
79 | { | 79 | { |
80 | struct file * file = NULL; | 80 | struct file * file = NULL; |
81 | struct fdtable *fdt = files_fdtable(files); | 81 | struct fdtable *fdt = files_fdtable(files); |
82 | 82 | ||
83 | if (fd < fdt->max_fds) | 83 | if (fd < fdt->max_fds) |
84 | file = rcu_dereference_check_fdtable(files, fdt->fd[fd]); | 84 | file = rcu_dereference_check_fdtable(files, fdt->fd[fd]); |
85 | return file; | 85 | return file; |
86 | } | 86 | } |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * Check whether the specified fd has an open file. | 89 | * Check whether the specified fd has an open file. |
90 | */ | 90 | */ |
91 | #define fcheck(fd) fcheck_files(current->files, fd) | 91 | #define fcheck(fd) fcheck_files(current->files, fd) |
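The pointer returned by fcheck()/fcheck_files() is only stable while rcu_read_lock() or ->file_lock is held (that is what the lockdep condition in rcu_dereference_check_fdtable() encodes); keeping it longer requires pinning the file the way fget() does. A sketch of the read-only pattern, with a made-up query:

/* Sketch: look at a descriptor's file without pinning it.  struct file
 * is RCU-freed, so reading fields inside the critical section is safe;
 * the answer may be stale if the fd is closed concurrently. */
static bool fd_is_append_only(unsigned int fd)
{
        struct file *file;
        bool ret = false;

        rcu_read_lock();
        file = fcheck(fd);
        if (file)
                ret = (file->f_flags & O_APPEND) != 0;
        rcu_read_unlock();
        return ret;
}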
92 | 92 | ||
93 | struct task_struct; | 93 | struct task_struct; |
94 | 94 | ||
95 | struct files_struct *get_files_struct(struct task_struct *); | 95 | struct files_struct *get_files_struct(struct task_struct *); |
96 | void put_files_struct(struct files_struct *fs); | 96 | void put_files_struct(struct files_struct *fs); |
97 | void reset_files_struct(struct files_struct *); | 97 | void reset_files_struct(struct files_struct *); |
98 | void daemonize_descriptors(void); | ||
98 | int unshare_files(struct files_struct **); | 99 | int unshare_files(struct files_struct **); |
99 | struct files_struct *dup_fd(struct files_struct *, int *); | 100 | struct files_struct *dup_fd(struct files_struct *, int *); |
100 | void do_close_on_exec(struct files_struct *); | 101 | void do_close_on_exec(struct files_struct *); |
101 | int iterate_fd(struct files_struct *, unsigned, | 102 | int iterate_fd(struct files_struct *, unsigned, |
102 | int (*)(const void *, struct file *, unsigned), | 103 | int (*)(const void *, struct file *, unsigned), |
103 | const void *); | 104 | const void *); |
104 | 105 | ||
105 | extern int __alloc_fd(struct files_struct *files, | 106 | extern int __alloc_fd(struct files_struct *files, |
106 | unsigned start, unsigned end, unsigned flags); | 107 | unsigned start, unsigned end, unsigned flags); |
107 | extern void __fd_install(struct files_struct *files, | 108 | extern void __fd_install(struct files_struct *files, |
108 | unsigned int fd, struct file *file); | 109 | unsigned int fd, struct file *file); |
109 | extern int __close_fd(struct files_struct *files, | 110 | extern int __close_fd(struct files_struct *files, |
110 | unsigned int fd); | 111 | unsigned int fd); |
111 | 112 | ||
112 | extern struct kmem_cache *files_cachep; | 113 | extern struct kmem_cache *files_cachep; |
113 | 114 | ||
114 | #endif /* __LINUX_FDTABLE_H */ | 115 | #endif /* __LINUX_FDTABLE_H */ |
115 | 116 |
kernel/exit.c
1 | /* | 1 | /* |
2 | * linux/kernel/exit.c | 2 | * linux/kernel/exit.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/capability.h> | 11 | #include <linux/capability.h> |
12 | #include <linux/completion.h> | 12 | #include <linux/completion.h> |
13 | #include <linux/personality.h> | 13 | #include <linux/personality.h> |
14 | #include <linux/tty.h> | 14 | #include <linux/tty.h> |
15 | #include <linux/iocontext.h> | 15 | #include <linux/iocontext.h> |
16 | #include <linux/key.h> | 16 | #include <linux/key.h> |
17 | #include <linux/security.h> | 17 | #include <linux/security.h> |
18 | #include <linux/cpu.h> | 18 | #include <linux/cpu.h> |
19 | #include <linux/acct.h> | 19 | #include <linux/acct.h> |
20 | #include <linux/tsacct_kern.h> | 20 | #include <linux/tsacct_kern.h> |
21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
22 | #include <linux/fdtable.h> | 22 | #include <linux/fdtable.h> |
23 | #include <linux/binfmts.h> | 23 | #include <linux/binfmts.h> |
24 | #include <linux/nsproxy.h> | 24 | #include <linux/nsproxy.h> |
25 | #include <linux/pid_namespace.h> | 25 | #include <linux/pid_namespace.h> |
26 | #include <linux/ptrace.h> | 26 | #include <linux/ptrace.h> |
27 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
28 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
29 | #include <linux/proc_fs.h> | 29 | #include <linux/proc_fs.h> |
30 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
31 | #include <linux/mempolicy.h> | 31 | #include <linux/mempolicy.h> |
32 | #include <linux/taskstats_kern.h> | 32 | #include <linux/taskstats_kern.h> |
33 | #include <linux/delayacct.h> | 33 | #include <linux/delayacct.h> |
34 | #include <linux/freezer.h> | 34 | #include <linux/freezer.h> |
35 | #include <linux/cgroup.h> | 35 | #include <linux/cgroup.h> |
36 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
37 | #include <linux/signal.h> | 37 | #include <linux/signal.h> |
38 | #include <linux/posix-timers.h> | 38 | #include <linux/posix-timers.h> |
39 | #include <linux/cn_proc.h> | 39 | #include <linux/cn_proc.h> |
40 | #include <linux/mutex.h> | 40 | #include <linux/mutex.h> |
41 | #include <linux/futex.h> | 41 | #include <linux/futex.h> |
42 | #include <linux/pipe_fs_i.h> | 42 | #include <linux/pipe_fs_i.h> |
43 | #include <linux/audit.h> /* for audit_free() */ | 43 | #include <linux/audit.h> /* for audit_free() */ |
44 | #include <linux/resource.h> | 44 | #include <linux/resource.h> |
45 | #include <linux/blkdev.h> | 45 | #include <linux/blkdev.h> |
46 | #include <linux/task_io_accounting_ops.h> | 46 | #include <linux/task_io_accounting_ops.h> |
47 | #include <linux/tracehook.h> | 47 | #include <linux/tracehook.h> |
48 | #include <linux/fs_struct.h> | 48 | #include <linux/fs_struct.h> |
49 | #include <linux/init_task.h> | 49 | #include <linux/init_task.h> |
50 | #include <linux/perf_event.h> | 50 | #include <linux/perf_event.h> |
51 | #include <trace/events/sched.h> | 51 | #include <trace/events/sched.h> |
52 | #include <linux/hw_breakpoint.h> | 52 | #include <linux/hw_breakpoint.h> |
53 | #include <linux/oom.h> | 53 | #include <linux/oom.h> |
54 | #include <linux/writeback.h> | 54 | #include <linux/writeback.h> |
55 | #include <linux/shm.h> | 55 | #include <linux/shm.h> |
56 | 56 | ||
57 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
58 | #include <asm/unistd.h> | 58 | #include <asm/unistd.h> |
59 | #include <asm/pgtable.h> | 59 | #include <asm/pgtable.h> |
60 | #include <asm/mmu_context.h> | 60 | #include <asm/mmu_context.h> |
61 | 61 | ||
62 | static void exit_mm(struct task_struct * tsk); | 62 | static void exit_mm(struct task_struct * tsk); |
63 | 63 | ||
64 | static void __unhash_process(struct task_struct *p, bool group_dead) | 64 | static void __unhash_process(struct task_struct *p, bool group_dead) |
65 | { | 65 | { |
66 | nr_threads--; | 66 | nr_threads--; |
67 | detach_pid(p, PIDTYPE_PID); | 67 | detach_pid(p, PIDTYPE_PID); |
68 | if (group_dead) { | 68 | if (group_dead) { |
69 | detach_pid(p, PIDTYPE_PGID); | 69 | detach_pid(p, PIDTYPE_PGID); |
70 | detach_pid(p, PIDTYPE_SID); | 70 | detach_pid(p, PIDTYPE_SID); |
71 | 71 | ||
72 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
73 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | /* | 75 | /* |
76 | * If we are the last child process in a pid namespace to be | 76 | * If we are the last child process in a pid namespace to be |
77 | * reaped, notify the reaper sleeping zap_pid_ns_processes(). | 77 | * reaped, notify the reaper sleeping zap_pid_ns_processes(). |
78 | */ | 78 | */ |
79 | if (IS_ENABLED(CONFIG_PID_NS)) { | 79 | if (IS_ENABLED(CONFIG_PID_NS)) { |
80 | struct task_struct *parent = p->real_parent; | 80 | struct task_struct *parent = p->real_parent; |
81 | 81 | ||
82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && | 82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && |
83 | list_empty(&parent->children) && | 83 | list_empty(&parent->children) && |
84 | (parent->flags & PF_EXITING)) | 84 | (parent->flags & PF_EXITING)) |
85 | wake_up_process(parent); | 85 | wake_up_process(parent); |
86 | } | 86 | } |
87 | } | 87 | } |
88 | list_del_rcu(&p->thread_group); | 88 | list_del_rcu(&p->thread_group); |
89 | } | 89 | } |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * This function expects the tasklist_lock write-locked. | 92 | * This function expects the tasklist_lock write-locked. |
93 | */ | 93 | */ |
94 | static void __exit_signal(struct task_struct *tsk) | 94 | static void __exit_signal(struct task_struct *tsk) |
95 | { | 95 | { |
96 | struct signal_struct *sig = tsk->signal; | 96 | struct signal_struct *sig = tsk->signal; |
97 | bool group_dead = thread_group_leader(tsk); | 97 | bool group_dead = thread_group_leader(tsk); |
98 | struct sighand_struct *sighand; | 98 | struct sighand_struct *sighand; |
99 | struct tty_struct *uninitialized_var(tty); | 99 | struct tty_struct *uninitialized_var(tty); |
100 | 100 | ||
101 | sighand = rcu_dereference_check(tsk->sighand, | 101 | sighand = rcu_dereference_check(tsk->sighand, |
102 | lockdep_tasklist_lock_is_held()); | 102 | lockdep_tasklist_lock_is_held()); |
103 | spin_lock(&sighand->siglock); | 103 | spin_lock(&sighand->siglock); |
104 | 104 | ||
105 | posix_cpu_timers_exit(tsk); | 105 | posix_cpu_timers_exit(tsk); |
106 | if (group_dead) { | 106 | if (group_dead) { |
107 | posix_cpu_timers_exit_group(tsk); | 107 | posix_cpu_timers_exit_group(tsk); |
108 | tty = sig->tty; | 108 | tty = sig->tty; |
109 | sig->tty = NULL; | 109 | sig->tty = NULL; |
110 | } else { | 110 | } else { |
111 | /* | 111 | /* |
112 | * This can only happen if the caller is de_thread(). | 112 | * This can only happen if the caller is de_thread(). |
113 | * FIXME: this is the temporary hack, we should teach | 113 | * FIXME: this is the temporary hack, we should teach |
114 | * posix-cpu-timers to handle this case correctly. | 114 | * posix-cpu-timers to handle this case correctly. |
115 | */ | 115 | */ |
116 | if (unlikely(has_group_leader_pid(tsk))) | 116 | if (unlikely(has_group_leader_pid(tsk))) |
117 | posix_cpu_timers_exit_group(tsk); | 117 | posix_cpu_timers_exit_group(tsk); |
118 | 118 | ||
119 | /* | 119 | /* |
120 | * If there is any task waiting for the group exit | 120 | * If there is any task waiting for the group exit |
121 | * then notify it: | 121 | * then notify it: |
122 | */ | 122 | */ |
123 | if (sig->notify_count > 0 && !--sig->notify_count) | 123 | if (sig->notify_count > 0 && !--sig->notify_count) |
124 | wake_up_process(sig->group_exit_task); | 124 | wake_up_process(sig->group_exit_task); |
125 | 125 | ||
126 | if (tsk == sig->curr_target) | 126 | if (tsk == sig->curr_target) |
127 | sig->curr_target = next_thread(tsk); | 127 | sig->curr_target = next_thread(tsk); |
128 | /* | 128 | /* |
129 | * Accumulate here the counters for all threads but the | 129 | * Accumulate here the counters for all threads but the |
130 | * group leader as they die, so they can be added into | 130 | * group leader as they die, so they can be added into |
131 | * the process-wide totals when those are taken. | 131 | * the process-wide totals when those are taken. |
132 | * The group leader stays around as a zombie as long | 132 | * The group leader stays around as a zombie as long |
133 | * as there are other threads. When it gets reaped, | 133 | * as there are other threads. When it gets reaped, |
134 | * the exit.c code will add its counts into these totals. | 134 | * the exit.c code will add its counts into these totals. |
135 | * We won't ever get here for the group leader, since it | 135 | * We won't ever get here for the group leader, since it |
136 | * will have been the last reference on the signal_struct. | 136 | * will have been the last reference on the signal_struct. |
137 | */ | 137 | */ |
138 | sig->utime += tsk->utime; | 138 | sig->utime += tsk->utime; |
139 | sig->stime += tsk->stime; | 139 | sig->stime += tsk->stime; |
140 | sig->gtime += tsk->gtime; | 140 | sig->gtime += tsk->gtime; |
141 | sig->min_flt += tsk->min_flt; | 141 | sig->min_flt += tsk->min_flt; |
142 | sig->maj_flt += tsk->maj_flt; | 142 | sig->maj_flt += tsk->maj_flt; |
143 | sig->nvcsw += tsk->nvcsw; | 143 | sig->nvcsw += tsk->nvcsw; |
144 | sig->nivcsw += tsk->nivcsw; | 144 | sig->nivcsw += tsk->nivcsw; |
145 | sig->inblock += task_io_get_inblock(tsk); | 145 | sig->inblock += task_io_get_inblock(tsk); |
146 | sig->oublock += task_io_get_oublock(tsk); | 146 | sig->oublock += task_io_get_oublock(tsk); |
147 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | 147 | task_io_accounting_add(&sig->ioac, &tsk->ioac); |
148 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | 148 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; |
149 | } | 149 | } |
150 | 150 | ||
151 | sig->nr_threads--; | 151 | sig->nr_threads--; |
152 | __unhash_process(tsk, group_dead); | 152 | __unhash_process(tsk, group_dead); |
153 | 153 | ||
154 | /* | 154 | /* |
155 | * Do this under ->siglock, we can race with another thread | 155 | * Do this under ->siglock, we can race with another thread |
156 | * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. | 156 | * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. |
157 | */ | 157 | */ |
158 | flush_sigqueue(&tsk->pending); | 158 | flush_sigqueue(&tsk->pending); |
159 | tsk->sighand = NULL; | 159 | tsk->sighand = NULL; |
160 | spin_unlock(&sighand->siglock); | 160 | spin_unlock(&sighand->siglock); |
161 | 161 | ||
162 | __cleanup_sighand(sighand); | 162 | __cleanup_sighand(sighand); |
163 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | 163 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); |
164 | if (group_dead) { | 164 | if (group_dead) { |
165 | flush_sigqueue(&sig->shared_pending); | 165 | flush_sigqueue(&sig->shared_pending); |
166 | tty_kref_put(tty); | 166 | tty_kref_put(tty); |
167 | } | 167 | } |
168 | } | 168 | } |
169 | 169 | ||
170 | static void delayed_put_task_struct(struct rcu_head *rhp) | 170 | static void delayed_put_task_struct(struct rcu_head *rhp) |
171 | { | 171 | { |
172 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); | 172 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
173 | 173 | ||
174 | perf_event_delayed_put(tsk); | 174 | perf_event_delayed_put(tsk); |
175 | trace_sched_process_free(tsk); | 175 | trace_sched_process_free(tsk); |
176 | put_task_struct(tsk); | 176 | put_task_struct(tsk); |
177 | } | 177 | } |
178 | 178 | ||
179 | 179 | ||
180 | void release_task(struct task_struct * p) | 180 | void release_task(struct task_struct * p) |
181 | { | 181 | { |
182 | struct task_struct *leader; | 182 | struct task_struct *leader; |
183 | int zap_leader; | 183 | int zap_leader; |
184 | repeat: | 184 | repeat: |
185 | /* don't need to get the RCU readlock here - the process is dead and | 185 | /* don't need to get the RCU readlock here - the process is dead and |
186 | * can't be modifying its own credentials. But shut RCU-lockdep up */ | 186 | * can't be modifying its own credentials. But shut RCU-lockdep up */ |
187 | rcu_read_lock(); | 187 | rcu_read_lock(); |
188 | atomic_dec(&__task_cred(p)->user->processes); | 188 | atomic_dec(&__task_cred(p)->user->processes); |
189 | rcu_read_unlock(); | 189 | rcu_read_unlock(); |
190 | 190 | ||
191 | proc_flush_task(p); | 191 | proc_flush_task(p); |
192 | 192 | ||
193 | write_lock_irq(&tasklist_lock); | 193 | write_lock_irq(&tasklist_lock); |
194 | ptrace_release_task(p); | 194 | ptrace_release_task(p); |
195 | __exit_signal(p); | 195 | __exit_signal(p); |
196 | 196 | ||
197 | /* | 197 | /* |
198 | * If we are the last non-leader member of the thread | 198 | * If we are the last non-leader member of the thread |
199 | * group, and the leader is zombie, then notify the | 199 | * group, and the leader is zombie, then notify the |
200 | * group leader's parent process. (if it wants notification.) | 200 | * group leader's parent process. (if it wants notification.) |
201 | */ | 201 | */ |
202 | zap_leader = 0; | 202 | zap_leader = 0; |
203 | leader = p->group_leader; | 203 | leader = p->group_leader; |
204 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { | 204 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { |
205 | /* | 205 | /* |
206 | * If we were the last child thread and the leader has | 206 | * If we were the last child thread and the leader has |
207 | * exited already, and the leader's parent ignores SIGCHLD, | 207 | * exited already, and the leader's parent ignores SIGCHLD, |
208 | * then we are the one who should release the leader. | 208 | * then we are the one who should release the leader. |
209 | */ | 209 | */ |
210 | zap_leader = do_notify_parent(leader, leader->exit_signal); | 210 | zap_leader = do_notify_parent(leader, leader->exit_signal); |
211 | if (zap_leader) | 211 | if (zap_leader) |
212 | leader->exit_state = EXIT_DEAD; | 212 | leader->exit_state = EXIT_DEAD; |
213 | } | 213 | } |
214 | 214 | ||
215 | write_unlock_irq(&tasklist_lock); | 215 | write_unlock_irq(&tasklist_lock); |
216 | release_thread(p); | 216 | release_thread(p); |
217 | call_rcu(&p->rcu, delayed_put_task_struct); | 217 | call_rcu(&p->rcu, delayed_put_task_struct); |
218 | 218 | ||
219 | p = leader; | 219 | p = leader; |
220 | if (unlikely(zap_leader)) | 220 | if (unlikely(zap_leader)) |
221 | goto repeat; | 221 | goto repeat; |
222 | } | 222 | } |
223 | 223 | ||
224 | /* | 224 | /* |
225 | * This checks not only the pgrp, but falls back on the pid if no | 225 | * This checks not only the pgrp, but falls back on the pid if no |
226 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly | 226 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly |
227 | * without this... | 227 | * without this... |
228 | * | 228 | * |
229 | * The caller must hold rcu lock or the tasklist lock. | 229 | * The caller must hold rcu lock or the tasklist lock. |
230 | */ | 230 | */ |
231 | struct pid *session_of_pgrp(struct pid *pgrp) | 231 | struct pid *session_of_pgrp(struct pid *pgrp) |
232 | { | 232 | { |
233 | struct task_struct *p; | 233 | struct task_struct *p; |
234 | struct pid *sid = NULL; | 234 | struct pid *sid = NULL; |
235 | 235 | ||
236 | p = pid_task(pgrp, PIDTYPE_PGID); | 236 | p = pid_task(pgrp, PIDTYPE_PGID); |
237 | if (p == NULL) | 237 | if (p == NULL) |
238 | p = pid_task(pgrp, PIDTYPE_PID); | 238 | p = pid_task(pgrp, PIDTYPE_PID); |
239 | if (p != NULL) | 239 | if (p != NULL) |
240 | sid = task_session(p); | 240 | sid = task_session(p); |
241 | 241 | ||
242 | return sid; | 242 | return sid; |
243 | } | 243 | } |
244 | 244 | ||
245 | /* | 245 | /* |
246 | * Determine if a process group is "orphaned", according to the POSIX | 246 | * Determine if a process group is "orphaned", according to the POSIX |
247 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 247 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
248 | * by terminal-generated stop signals. Newly orphaned process groups are | 248 | * by terminal-generated stop signals. Newly orphaned process groups are |
249 | * to receive a SIGHUP and a SIGCONT. | 249 | * to receive a SIGHUP and a SIGCONT. |
250 | * | 250 | * |
251 | * "I ask you, have you ever known what it is to be an orphan?" | 251 | * "I ask you, have you ever known what it is to be an orphan?" |
252 | */ | 252 | */ |
253 | static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) | 253 | static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) |
254 | { | 254 | { |
255 | struct task_struct *p; | 255 | struct task_struct *p; |
256 | 256 | ||
257 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 257 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
258 | if ((p == ignored_task) || | 258 | if ((p == ignored_task) || |
259 | (p->exit_state && thread_group_empty(p)) || | 259 | (p->exit_state && thread_group_empty(p)) || |
260 | is_global_init(p->real_parent)) | 260 | is_global_init(p->real_parent)) |
261 | continue; | 261 | continue; |
262 | 262 | ||
263 | if (task_pgrp(p->real_parent) != pgrp && | 263 | if (task_pgrp(p->real_parent) != pgrp && |
264 | task_session(p->real_parent) == task_session(p)) | 264 | task_session(p->real_parent) == task_session(p)) |
265 | return 0; | 265 | return 0; |
266 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 266 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
267 | 267 | ||
268 | return 1; | 268 | return 1; |
269 | } | 269 | } |
270 | 270 | ||
271 | int is_current_pgrp_orphaned(void) | 271 | int is_current_pgrp_orphaned(void) |
272 | { | 272 | { |
273 | int retval; | 273 | int retval; |
274 | 274 | ||
275 | read_lock(&tasklist_lock); | 275 | read_lock(&tasklist_lock); |
276 | retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); | 276 | retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); |
277 | read_unlock(&tasklist_lock); | 277 | read_unlock(&tasklist_lock); |
278 | 278 | ||
279 | return retval; | 279 | return retval; |
280 | } | 280 | } |
281 | 281 | ||
282 | static bool has_stopped_jobs(struct pid *pgrp) | 282 | static bool has_stopped_jobs(struct pid *pgrp) |
283 | { | 283 | { |
284 | struct task_struct *p; | 284 | struct task_struct *p; |
285 | 285 | ||
286 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 286 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
287 | if (p->signal->flags & SIGNAL_STOP_STOPPED) | 287 | if (p->signal->flags & SIGNAL_STOP_STOPPED) |
288 | return true; | 288 | return true; |
289 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 289 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
290 | 290 | ||
291 | return false; | 291 | return false; |
292 | } | 292 | } |
293 | 293 | ||
294 | /* | 294 | /* |
295 | * Check to see if any process groups have become orphaned as | 295 | * Check to see if any process groups have become orphaned as |
296 | * a result of our exiting, and if they have any stopped jobs, | 296 | * a result of our exiting, and if they have any stopped jobs, |
297 | * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | 297 | * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) |
298 | */ | 298 | */ |
299 | static void | 299 | static void |
300 | kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | 300 | kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) |
301 | { | 301 | { |
302 | struct pid *pgrp = task_pgrp(tsk); | 302 | struct pid *pgrp = task_pgrp(tsk); |
303 | struct task_struct *ignored_task = tsk; | 303 | struct task_struct *ignored_task = tsk; |
304 | 304 | ||
305 | if (!parent) | 305 | if (!parent) |
306 | /* exit: our father is in a different pgrp than | 306 | /* exit: our father is in a different pgrp than |
307 | * we are and we were the only connection outside. | 307 | * we are and we were the only connection outside. |
308 | */ | 308 | */ |
309 | parent = tsk->real_parent; | 309 | parent = tsk->real_parent; |
310 | else | 310 | else |
311 | /* reparent: our child is in a different pgrp than | 311 | /* reparent: our child is in a different pgrp than |
312 | * we are, and it was the only connection outside. | 312 | * we are, and it was the only connection outside. |
313 | */ | 313 | */ |
314 | ignored_task = NULL; | 314 | ignored_task = NULL; |
315 | 315 | ||
316 | if (task_pgrp(parent) != pgrp && | 316 | if (task_pgrp(parent) != pgrp && |
317 | task_session(parent) == task_session(tsk) && | 317 | task_session(parent) == task_session(tsk) && |
318 | will_become_orphaned_pgrp(pgrp, ignored_task) && | 318 | will_become_orphaned_pgrp(pgrp, ignored_task) && |
319 | has_stopped_jobs(pgrp)) { | 319 | has_stopped_jobs(pgrp)) { |
320 | __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); | 320 | __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
321 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); | 321 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
322 | } | 322 | } |
323 | } | 323 | } |
324 | 324 | ||
325 | /** | 325 | /** |
326 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd | 326 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd |
327 | * | 327 | * |
328 | * If a kernel thread is launched as a result of a system call, or if | 328 | * If a kernel thread is launched as a result of a system call, or if |
329 | * it ever exits, it should generally reparent itself to kthreadd so it | 329 | * it ever exits, it should generally reparent itself to kthreadd so it |
330 | * isn't in the way of other processes and is correctly cleaned up on exit. | 330 | * isn't in the way of other processes and is correctly cleaned up on exit. |
331 | * | 331 | * |
332 | * The various task state such as scheduling policy and priority may have | 332 | * The various task state such as scheduling policy and priority may have |
333 | * been inherited from a user process, so we reset them to sane values here. | 333 | * been inherited from a user process, so we reset them to sane values here. |
334 | * | 334 | * |
335 | * NOTE that reparent_to_kthreadd() gives the caller full capabilities. | 335 | * NOTE that reparent_to_kthreadd() gives the caller full capabilities. |
336 | */ | 336 | */ |
337 | static void reparent_to_kthreadd(void) | 337 | static void reparent_to_kthreadd(void) |
338 | { | 338 | { |
339 | write_lock_irq(&tasklist_lock); | 339 | write_lock_irq(&tasklist_lock); |
340 | 340 | ||
341 | ptrace_unlink(current); | 341 | ptrace_unlink(current); |
342 | /* Reparent to init */ | 342 | /* Reparent to init */ |
343 | current->real_parent = current->parent = kthreadd_task; | 343 | current->real_parent = current->parent = kthreadd_task; |
344 | list_move_tail(&current->sibling, &current->real_parent->children); | 344 | list_move_tail(&current->sibling, &current->real_parent->children); |
345 | 345 | ||
346 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 346 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
347 | current->exit_signal = SIGCHLD; | 347 | current->exit_signal = SIGCHLD; |
348 | 348 | ||
349 | if (task_nice(current) < 0) | 349 | if (task_nice(current) < 0) |
350 | set_user_nice(current, 0); | 350 | set_user_nice(current, 0); |
351 | /* cpus_allowed? */ | 351 | /* cpus_allowed? */ |
352 | /* rt_priority? */ | 352 | /* rt_priority? */ |
353 | /* signals? */ | 353 | /* signals? */ |
354 | memcpy(current->signal->rlim, init_task.signal->rlim, | 354 | memcpy(current->signal->rlim, init_task.signal->rlim, |
355 | sizeof(current->signal->rlim)); | 355 | sizeof(current->signal->rlim)); |
356 | 356 | ||
357 | atomic_inc(&init_cred.usage); | 357 | atomic_inc(&init_cred.usage); |
358 | commit_creds(&init_cred); | 358 | commit_creds(&init_cred); |
359 | write_unlock_irq(&tasklist_lock); | 359 | write_unlock_irq(&tasklist_lock); |
360 | } | 360 | } |
361 | 361 | ||
362 | void __set_special_pids(struct pid *pid) | 362 | void __set_special_pids(struct pid *pid) |
363 | { | 363 | { |
364 | struct task_struct *curr = current->group_leader; | 364 | struct task_struct *curr = current->group_leader; |
365 | 365 | ||
366 | if (task_session(curr) != pid) | 366 | if (task_session(curr) != pid) |
367 | change_pid(curr, PIDTYPE_SID, pid); | 367 | change_pid(curr, PIDTYPE_SID, pid); |
368 | 368 | ||
369 | if (task_pgrp(curr) != pid) | 369 | if (task_pgrp(curr) != pid) |
370 | change_pid(curr, PIDTYPE_PGID, pid); | 370 | change_pid(curr, PIDTYPE_PGID, pid); |
371 | } | 371 | } |
372 | 372 | ||
373 | static void set_special_pids(struct pid *pid) | 373 | static void set_special_pids(struct pid *pid) |
374 | { | 374 | { |
375 | write_lock_irq(&tasklist_lock); | 375 | write_lock_irq(&tasklist_lock); |
376 | __set_special_pids(pid); | 376 | __set_special_pids(pid); |
377 | write_unlock_irq(&tasklist_lock); | 377 | write_unlock_irq(&tasklist_lock); |
378 | } | 378 | } |
379 | 379 | ||
380 | /* | 380 | /* |
381 | * Let kernel threads use this to say that they allow a certain signal. | 381 | * Let kernel threads use this to say that they allow a certain signal. |
382 | * Must not be used if kthread was cloned with CLONE_SIGHAND. | 382 | * Must not be used if kthread was cloned with CLONE_SIGHAND. |
383 | */ | 383 | */ |
384 | int allow_signal(int sig) | 384 | int allow_signal(int sig) |
385 | { | 385 | { |
386 | if (!valid_signal(sig) || sig < 1) | 386 | if (!valid_signal(sig) || sig < 1) |
387 | return -EINVAL; | 387 | return -EINVAL; |
388 | 388 | ||
389 | spin_lock_irq(&current->sighand->siglock); | 389 | spin_lock_irq(&current->sighand->siglock); |
390 | /* This is only needed for daemonize()'ed kthreads */ | 390 | /* This is only needed for daemonize()'ed kthreads */ |
391 | sigdelset(&current->blocked, sig); | 391 | sigdelset(&current->blocked, sig); |
392 | /* | 392 | /* |
393 | * Kernel threads handle their own signals. Let the signal code | 393 | * Kernel threads handle their own signals. Let the signal code |
394 | * know it'll be handled, so that they don't get converted to | 394 | * know it'll be handled, so that they don't get converted to |
395 | * SIGKILL or just silently dropped. | 395 | * SIGKILL or just silently dropped. |
396 | */ | 396 | */ |
397 | current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; | 397 | current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; |
398 | recalc_sigpending(); | 398 | recalc_sigpending(); |
399 | spin_unlock_irq(&current->sighand->siglock); | 399 | spin_unlock_irq(&current->sighand->siglock); |
400 | return 0; | 400 | return 0; |
401 | } | 401 | } |
402 | 402 | ||
403 | EXPORT_SYMBOL(allow_signal); | 403 | EXPORT_SYMBOL(allow_signal); |
404 | 404 | ||
405 | int disallow_signal(int sig) | 405 | int disallow_signal(int sig) |
406 | { | 406 | { |
407 | if (!valid_signal(sig) || sig < 1) | 407 | if (!valid_signal(sig) || sig < 1) |
408 | return -EINVAL; | 408 | return -EINVAL; |
409 | 409 | ||
410 | spin_lock_irq(&current->sighand->siglock); | 410 | spin_lock_irq(&current->sighand->siglock); |
411 | current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; | 411 | current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; |
412 | recalc_sigpending(); | 412 | recalc_sigpending(); |
413 | spin_unlock_irq(&current->sighand->siglock); | 413 | spin_unlock_irq(&current->sighand->siglock); |
414 | return 0; | 414 | return 0; |
415 | } | 415 | } |
416 | 416 | ||
417 | EXPORT_SYMBOL(disallow_signal); | 417 | EXPORT_SYMBOL(disallow_signal); |
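allow_signal()/disallow_signal() exist for kernel threads, which normally never see signals; the typical consumer unblocks one signal and then polls for it in its main loop. A sketch (thread body and exit condition are illustrative):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>

/* Sketch of a kthread that opts in to SIGKILL via allow_signal(). */
static int demo_thread(void *data)
{
        allow_signal(SIGKILL);

        while (!kthread_should_stop()) {
                if (signal_pending(current)) {
                        flush_signals(current); /* handled: just exit */
                        break;
                }
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}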
418 | 418 | ||
419 | /* | 419 | /* |
420 | * Put all the gunge required to become a kernel thread without | 420 | * Put all the gunge required to become a kernel thread without |
421 | * attached user resources in one place where it belongs. | 421 | * attached user resources in one place where it belongs. |
422 | */ | 422 | */ |
423 | 423 | ||
424 | void daemonize(const char *name, ...) | 424 | void daemonize(const char *name, ...) |
425 | { | 425 | { |
426 | va_list args; | 426 | va_list args; |
427 | sigset_t blocked; | 427 | sigset_t blocked; |
428 | 428 | ||
429 | va_start(args, name); | 429 | va_start(args, name); |
430 | vsnprintf(current->comm, sizeof(current->comm), name, args); | 430 | vsnprintf(current->comm, sizeof(current->comm), name, args); |
431 | va_end(args); | 431 | va_end(args); |
432 | 432 | ||
433 | /* | 433 | /* |
434 | * If we were started as result of loading a module, close all of the | 434 | * If we were started as result of loading a module, close all of the |
435 | * user space pages. We don't need them, and if we didn't close them | 435 | * user space pages. We don't need them, and if we didn't close them |
436 | * they would be locked into memory. | 436 | * they would be locked into memory. |
437 | */ | 437 | */ |
438 | exit_mm(current); | 438 | exit_mm(current); |
439 | /* | 439 | /* |
440 | * We don't want to get frozen, in case system-wide hibernation | 440 | * We don't want to get frozen, in case system-wide hibernation |
441 | * or suspend transition begins right now. | 441 | * or suspend transition begins right now. |
442 | */ | 442 | */ |
443 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); | 443 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); |
444 | 444 | ||
445 | if (current->nsproxy != &init_nsproxy) { | 445 | if (current->nsproxy != &init_nsproxy) { |
446 | get_nsproxy(&init_nsproxy); | 446 | get_nsproxy(&init_nsproxy); |
447 | switch_task_namespaces(current, &init_nsproxy); | 447 | switch_task_namespaces(current, &init_nsproxy); |
448 | } | 448 | } |
449 | set_special_pids(&init_struct_pid); | 449 | set_special_pids(&init_struct_pid); |
450 | proc_clear_tty(current); | 450 | proc_clear_tty(current); |
451 | 451 | ||
452 | /* Block and flush all signals */ | 452 | /* Block and flush all signals */ |
453 | sigfillset(&blocked); | 453 | sigfillset(&blocked); |
454 | sigprocmask(SIG_BLOCK, &blocked, NULL); | 454 | sigprocmask(SIG_BLOCK, &blocked, NULL); |
455 | flush_signals(current); | 455 | flush_signals(current); |
456 | 456 | ||
457 | /* Become as one with the init task */ | 457 | /* Become as one with the init task */ |
458 | 458 | ||
459 | daemonize_fs_struct(); | 459 | daemonize_fs_struct(); |
460 | exit_files(current); | 460 | daemonize_descriptors(); |
461 | current->files = init_task.files; | ||
462 | atomic_inc(&current->files->count); | ||
463 | 461 | ||
464 | reparent_to_kthreadd(); | 462 | reparent_to_kthreadd(); |
465 | } | 463 | } |
466 | 464 | ||
467 | EXPORT_SYMBOL(daemonize); | 465 | EXPORT_SYMBOL(daemonize); |
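The hunk above is where this patch touches daemonize(): the removed lines dropped the old table with exit_files(), then wrote current->files and bumped the new table's count by hand, with no task_lock in sight. The replacement, daemonize_descriptors() (declared in fdtable.h above; its body is added to fs/file.c in this commit), funnels the switch through one helper. A plausible reconstruction, stated as an assumption rather than quoted from the diff:

/* Assumed shape of the new helper -- see fs/file.c in this commit for
 * the real body.  Assuming reset_files_struct() swaps ->files under
 * task_lock and drops the old table afterwards, the descriptor switch
 * no longer modifies ->files unlocked. */
void daemonize_descriptors(void)
{
        atomic_inc(&init_files.count);
        reset_files_struct(&init_files);
}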
468 | 466 | ||
469 | #ifdef CONFIG_MM_OWNER | 467 | #ifdef CONFIG_MM_OWNER |
470 | /* | 468 | /* |
471 | * A task is exiting. If it owned this mm, find a new owner for the mm. | 469 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
472 | */ | 470 | */ |
473 | void mm_update_next_owner(struct mm_struct *mm) | 471 | void mm_update_next_owner(struct mm_struct *mm) |
474 | { | 472 | { |
475 | struct task_struct *c, *g, *p = current; | 473 | struct task_struct *c, *g, *p = current; |
476 | 474 | ||
477 | retry: | 475 | retry: |
478 | /* | 476 | /* |
479 | * If the exiting or execing task is not the owner, it's | 477 | * If the exiting or execing task is not the owner, it's |
480 | * someone else's problem. | 478 | * someone else's problem. |
481 | */ | 479 | */ |
482 | if (mm->owner != p) | 480 | if (mm->owner != p) |
483 | return; | 481 | return; |
484 | /* | 482 | /* |
485 | * The current owner is exiting/execing and there are no other | 483 | * The current owner is exiting/execing and there are no other |
486 | * candidates. Do not leave the mm pointing to a possibly | 484 | * candidates. Do not leave the mm pointing to a possibly |
487 | * freed task structure. | 485 | * freed task structure. |
488 | */ | 486 | */ |
489 | if (atomic_read(&mm->mm_users) <= 1) { | 487 | if (atomic_read(&mm->mm_users) <= 1) { |
490 | mm->owner = NULL; | 488 | mm->owner = NULL; |
491 | return; | 489 | return; |
492 | } | 490 | } |
493 | 491 | ||
494 | read_lock(&tasklist_lock); | 492 | read_lock(&tasklist_lock); |
495 | /* | 493 | /* |
496 | * Search in the children | 494 | * Search in the children |
497 | */ | 495 | */ |
498 | list_for_each_entry(c, &p->children, sibling) { | 496 | list_for_each_entry(c, &p->children, sibling) { |
499 | if (c->mm == mm) | 497 | if (c->mm == mm) |
500 | goto assign_new_owner; | 498 | goto assign_new_owner; |
501 | } | 499 | } |
502 | 500 | ||
503 | /* | 501 | /* |
504 | * Search in the siblings | 502 | * Search in the siblings |
505 | */ | 503 | */ |
506 | list_for_each_entry(c, &p->real_parent->children, sibling) { | 504 | list_for_each_entry(c, &p->real_parent->children, sibling) { |
507 | if (c->mm == mm) | 505 | if (c->mm == mm) |
508 | goto assign_new_owner; | 506 | goto assign_new_owner; |
509 | } | 507 | } |
510 | 508 | ||
511 | /* | 509 | /* |
512 | * Search through everything else. We should not get | 510 | * Search through everything else. We should not get |
513 | * here often | 511 | * here often |
514 | */ | 512 | */ |
515 | do_each_thread(g, c) { | 513 | do_each_thread(g, c) { |
516 | if (c->mm == mm) | 514 | if (c->mm == mm) |
517 | goto assign_new_owner; | 515 | goto assign_new_owner; |
518 | } while_each_thread(g, c); | 516 | } while_each_thread(g, c); |
519 | 517 | ||
520 | read_unlock(&tasklist_lock); | 518 | read_unlock(&tasklist_lock); |
521 | /* | 519 | /* |
522 | * We found no owner yet mm_users > 1: this implies that we are | 520 | * We found no owner yet mm_users > 1: this implies that we are |
523 | * most likely racing with swapoff (try_to_unuse()) or /proc or | 521 | * most likely racing with swapoff (try_to_unuse()) or /proc or |
524 | * ptrace or page migration (get_task_mm()). Mark owner as NULL. | 522 | * ptrace or page migration (get_task_mm()). Mark owner as NULL. |
525 | */ | 523 | */ |
526 | mm->owner = NULL; | 524 | mm->owner = NULL; |
527 | return; | 525 | return; |
528 | 526 | ||
529 | assign_new_owner: | 527 | assign_new_owner: |
530 | BUG_ON(c == p); | 528 | BUG_ON(c == p); |
531 | get_task_struct(c); | 529 | get_task_struct(c); |
532 | /* | 530 | /* |
533 | * The task_lock protects c->mm from changing. | 531 | * The task_lock protects c->mm from changing. |
534 | * We always want mm->owner->mm == mm | 532 | * We always want mm->owner->mm == mm |
535 | */ | 533 | */ |
536 | task_lock(c); | 534 | task_lock(c); |
537 | /* | 535 | /* |
538 | * Delay read_unlock() till we have the task_lock() | 536 | * Delay read_unlock() till we have the task_lock() |
539 | * to ensure that c does not slip away underneath us | 537 | * to ensure that c does not slip away underneath us |
540 | */ | 538 | */ |
541 | read_unlock(&tasklist_lock); | 539 | read_unlock(&tasklist_lock); |
542 | if (c->mm != mm) { | 540 | if (c->mm != mm) { |
543 | task_unlock(c); | 541 | task_unlock(c); |
544 | put_task_struct(c); | 542 | put_task_struct(c); |
545 | goto retry; | 543 | goto retry; |
546 | } | 544 | } |
547 | mm->owner = c; | 545 | mm->owner = c; |
548 | task_unlock(c); | 546 | task_unlock(c); |
549 | put_task_struct(c); | 547 | put_task_struct(c); |
550 | } | 548 | } |
551 | #endif /* CONFIG_MM_OWNER */ | 549 | #endif /* CONFIG_MM_OWNER */ |
552 | 550 | ||
553 | /* | 551 | /* |
554 | * Turn us into a lazy TLB process if we | 552 | * Turn us into a lazy TLB process if we |
555 | * aren't already.. | 553 | * aren't already.. |
556 | */ | 554 | */ |
557 | static void exit_mm(struct task_struct * tsk) | 555 | static void exit_mm(struct task_struct * tsk) |
558 | { | 556 | { |
559 | struct mm_struct *mm = tsk->mm; | 557 | struct mm_struct *mm = tsk->mm; |
560 | struct core_state *core_state; | 558 | struct core_state *core_state; |
561 | 559 | ||
562 | mm_release(tsk, mm); | 560 | mm_release(tsk, mm); |
563 | if (!mm) | 561 | if (!mm) |
564 | return; | 562 | return; |
565 | sync_mm_rss(mm); | 563 | sync_mm_rss(mm); |
566 | /* | 564 | /* |
567 | * Serialize with any possible pending coredump. | 565 | * Serialize with any possible pending coredump. |
568 | * We must hold mmap_sem around checking core_state | 566 | * We must hold mmap_sem around checking core_state |
569 | * and clearing tsk->mm. The core-inducing thread | 567 | * and clearing tsk->mm. The core-inducing thread |
570 | * will increment ->nr_threads for each thread in the | 568 | * will increment ->nr_threads for each thread in the |
571 | * group with ->mm != NULL. | 569 | * group with ->mm != NULL. |
572 | */ | 570 | */ |
573 | down_read(&mm->mmap_sem); | 571 | down_read(&mm->mmap_sem); |
574 | core_state = mm->core_state; | 572 | core_state = mm->core_state; |
575 | if (core_state) { | 573 | if (core_state) { |
576 | struct core_thread self; | 574 | struct core_thread self; |
577 | up_read(&mm->mmap_sem); | 575 | up_read(&mm->mmap_sem); |
578 | 576 | ||
579 | self.task = tsk; | 577 | self.task = tsk; |
580 | self.next = xchg(&core_state->dumper.next, &self); | 578 | self.next = xchg(&core_state->dumper.next, &self); |
581 | /* | 579 | /* |
582 | * Implies mb(), the result of xchg() must be visible | 580 | * Implies mb(), the result of xchg() must be visible |
583 | * to core_state->dumper. | 581 | * to core_state->dumper. |
584 | */ | 582 | */ |
585 | if (atomic_dec_and_test(&core_state->nr_threads)) | 583 | if (atomic_dec_and_test(&core_state->nr_threads)) |
586 | complete(&core_state->startup); | 584 | complete(&core_state->startup); |
587 | 585 | ||
588 | for (;;) { | 586 | for (;;) { |
589 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 587 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
590 | if (!self.task) /* see coredump_finish() */ | 588 | if (!self.task) /* see coredump_finish() */ |
591 | break; | 589 | break; |
592 | schedule(); | 590 | schedule(); |
593 | } | 591 | } |
594 | __set_task_state(tsk, TASK_RUNNING); | 592 | __set_task_state(tsk, TASK_RUNNING); |
595 | down_read(&mm->mmap_sem); | 593 | down_read(&mm->mmap_sem); |
596 | } | 594 | } |
597 | atomic_inc(&mm->mm_count); | 595 | atomic_inc(&mm->mm_count); |
598 | BUG_ON(mm != tsk->active_mm); | 596 | BUG_ON(mm != tsk->active_mm); |
599 | /* more a memory barrier than a real lock */ | 597 | /* more a memory barrier than a real lock */ |
600 | task_lock(tsk); | 598 | task_lock(tsk); |
601 | tsk->mm = NULL; | 599 | tsk->mm = NULL; |
602 | up_read(&mm->mmap_sem); | 600 | up_read(&mm->mmap_sem); |
603 | enter_lazy_tlb(mm, current); | 601 | enter_lazy_tlb(mm, current); |
604 | task_unlock(tsk); | 602 | task_unlock(tsk); |
605 | mm_update_next_owner(mm); | 603 | mm_update_next_owner(mm); |
606 | mmput(mm); | 604 | mmput(mm); |
607 | } | 605 | } |
608 | 606 | ||
609 | /* | 607 | /* |
610 | * When we die, we re-parent all our children, and try to: | 608 | * When we die, we re-parent all our children, and try to: |
611 | * 1. give them to another thread in our thread group, if such a member exists | 609 | * 1. give them to another thread in our thread group, if such a member exists |
612 | * 2. give it to the first ancestor process which prctl'd itself as a | 610 | * 2. give it to the first ancestor process which prctl'd itself as a |
613 | * child_subreaper for its children (like a service manager) | 611 | * child_subreaper for its children (like a service manager) |
614 | * 3. give it to the init process (PID 1) in our pid namespace | 612 | * 3. give it to the init process (PID 1) in our pid namespace |
615 | */ | 613 | */ |
616 | static struct task_struct *find_new_reaper(struct task_struct *father) | 614 | static struct task_struct *find_new_reaper(struct task_struct *father) |
617 | __releases(&tasklist_lock) | 615 | __releases(&tasklist_lock) |
618 | __acquires(&tasklist_lock) | 616 | __acquires(&tasklist_lock) |
619 | { | 617 | { |
620 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 618 | struct pid_namespace *pid_ns = task_active_pid_ns(father); |
621 | struct task_struct *thread; | 619 | struct task_struct *thread; |
622 | 620 | ||
623 | thread = father; | 621 | thread = father; |
624 | while_each_thread(father, thread) { | 622 | while_each_thread(father, thread) { |
625 | if (thread->flags & PF_EXITING) | 623 | if (thread->flags & PF_EXITING) |
626 | continue; | 624 | continue; |
627 | if (unlikely(pid_ns->child_reaper == father)) | 625 | if (unlikely(pid_ns->child_reaper == father)) |
628 | pid_ns->child_reaper = thread; | 626 | pid_ns->child_reaper = thread; |
629 | return thread; | 627 | return thread; |
630 | } | 628 | } |
631 | 629 | ||
632 | if (unlikely(pid_ns->child_reaper == father)) { | 630 | if (unlikely(pid_ns->child_reaper == father)) { |
633 | write_unlock_irq(&tasklist_lock); | 631 | write_unlock_irq(&tasklist_lock); |
634 | if (unlikely(pid_ns == &init_pid_ns)) { | 632 | if (unlikely(pid_ns == &init_pid_ns)) { |
635 | panic("Attempted to kill init! exitcode=0x%08x\n", | 633 | panic("Attempted to kill init! exitcode=0x%08x\n", |
636 | father->signal->group_exit_code ?: | 634 | father->signal->group_exit_code ?: |
637 | father->exit_code); | 635 | father->exit_code); |
638 | } | 636 | } |
639 | 637 | ||
640 | zap_pid_ns_processes(pid_ns); | 638 | zap_pid_ns_processes(pid_ns); |
641 | write_lock_irq(&tasklist_lock); | 639 | write_lock_irq(&tasklist_lock); |
642 | } else if (father->signal->has_child_subreaper) { | 640 | } else if (father->signal->has_child_subreaper) { |
643 | struct task_struct *reaper; | 641 | struct task_struct *reaper; |
644 | 642 | ||
645 | /* | 643 | /* |
646 | * Find the first ancestor marked as child_subreaper. | 644 | * Find the first ancestor marked as child_subreaper. |
647 | * Note that the code below checks same_thread_group(reaper, | 645 | * Note that the code below checks same_thread_group(reaper, |
648 | * pid_ns->child_reaper). This is what we need to DTRT in a | 646 | * pid_ns->child_reaper). This is what we need to DTRT in a |
649 | * PID namespace. However we still need the check above, see | 647 | * PID namespace. However we still need the check above, see |
650 | * http://marc.info/?l=linux-kernel&m=131385460420380 | 648 | * http://marc.info/?l=linux-kernel&m=131385460420380 |
651 | */ | 649 | */ |
652 | for (reaper = father->real_parent; | 650 | for (reaper = father->real_parent; |
653 | reaper != &init_task; | 651 | reaper != &init_task; |
654 | reaper = reaper->real_parent) { | 652 | reaper = reaper->real_parent) { |
655 | if (same_thread_group(reaper, pid_ns->child_reaper)) | 653 | if (same_thread_group(reaper, pid_ns->child_reaper)) |
656 | break; | 654 | break; |
657 | if (!reaper->signal->is_child_subreaper) | 655 | if (!reaper->signal->is_child_subreaper) |
658 | continue; | 656 | continue; |
659 | thread = reaper; | 657 | thread = reaper; |
660 | do { | 658 | do { |
661 | if (!(thread->flags & PF_EXITING)) | 659 | if (!(thread->flags & PF_EXITING)) |
662 | return reaper; | 660 | return reaper; |
663 | } while_each_thread(reaper, thread); | 661 | } while_each_thread(reaper, thread); |
664 | } | 662 | } |
665 | } | 663 | } |
666 | 664 | ||
667 | return pid_ns->child_reaper; | 665 | return pid_ns->child_reaper; |
668 | } | 666 | } |
669 | 667 | ||
670 | /* | 668 | /* |
671 | * Any that need to be release_task'd are put on the @dead list. | 669 | * Any that need to be release_task'd are put on the @dead list. |
672 | */ | 670 | */ |
673 | static void reparent_leader(struct task_struct *father, struct task_struct *p, | 671 | static void reparent_leader(struct task_struct *father, struct task_struct *p, |
674 | struct list_head *dead) | 672 | struct list_head *dead) |
675 | { | 673 | { |
676 | list_move_tail(&p->sibling, &p->real_parent->children); | 674 | list_move_tail(&p->sibling, &p->real_parent->children); |
677 | 675 | ||
678 | if (p->exit_state == EXIT_DEAD) | 676 | if (p->exit_state == EXIT_DEAD) |
679 | return; | 677 | return; |
680 | /* | 678 | /* |
681 | * If this is a threaded reparent there is no need to | 679 | * If this is a threaded reparent there is no need to |
682 | * notify anyone anything has happened. | 680 | * notify anyone anything has happened. |
683 | */ | 681 | */ |
684 | if (same_thread_group(p->real_parent, father)) | 682 | if (same_thread_group(p->real_parent, father)) |
685 | return; | 683 | return; |
686 | 684 | ||
687 | /* We don't want people slaying init. */ | 685 | /* We don't want people slaying init. */ |
688 | p->exit_signal = SIGCHLD; | 686 | p->exit_signal = SIGCHLD; |
689 | 687 | ||
690 | /* If it has exited notify the new parent about this child's death. */ | 688 | /* If it has exited notify the new parent about this child's death. */ |
691 | if (!p->ptrace && | 689 | if (!p->ptrace && |
692 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { | 690 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { |
693 | if (do_notify_parent(p, p->exit_signal)) { | 691 | if (do_notify_parent(p, p->exit_signal)) { |
694 | p->exit_state = EXIT_DEAD; | 692 | p->exit_state = EXIT_DEAD; |
695 | list_move_tail(&p->sibling, dead); | 693 | list_move_tail(&p->sibling, dead); |
696 | } | 694 | } |
697 | } | 695 | } |
698 | 696 | ||
699 | kill_orphaned_pgrp(p, father); | 697 | kill_orphaned_pgrp(p, father); |
700 | } | 698 | } |
701 | 699 | ||
702 | static void forget_original_parent(struct task_struct *father) | 700 | static void forget_original_parent(struct task_struct *father) |
703 | { | 701 | { |
704 | struct task_struct *p, *n, *reaper; | 702 | struct task_struct *p, *n, *reaper; |
705 | LIST_HEAD(dead_children); | 703 | LIST_HEAD(dead_children); |
706 | 704 | ||
707 | write_lock_irq(&tasklist_lock); | 705 | write_lock_irq(&tasklist_lock); |
708 | /* | 706 | /* |
709 | * Note that exit_ptrace() and find_new_reaper() might | 707 | * Note that exit_ptrace() and find_new_reaper() might |
710 | * drop tasklist_lock and reacquire it. | 708 | * drop tasklist_lock and reacquire it. |
711 | */ | 709 | */ |
712 | exit_ptrace(father); | 710 | exit_ptrace(father); |
713 | reaper = find_new_reaper(father); | 711 | reaper = find_new_reaper(father); |
714 | 712 | ||
715 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 713 | list_for_each_entry_safe(p, n, &father->children, sibling) { |
716 | struct task_struct *t = p; | 714 | struct task_struct *t = p; |
717 | do { | 715 | do { |
718 | t->real_parent = reaper; | 716 | t->real_parent = reaper; |
719 | if (t->parent == father) { | 717 | if (t->parent == father) { |
720 | BUG_ON(t->ptrace); | 718 | BUG_ON(t->ptrace); |
721 | t->parent = t->real_parent; | 719 | t->parent = t->real_parent; |
722 | } | 720 | } |
723 | if (t->pdeath_signal) | 721 | if (t->pdeath_signal) |
724 | group_send_sig_info(t->pdeath_signal, | 722 | group_send_sig_info(t->pdeath_signal, |
725 | SEND_SIG_NOINFO, t); | 723 | SEND_SIG_NOINFO, t); |
726 | } while_each_thread(p, t); | 724 | } while_each_thread(p, t); |
727 | reparent_leader(father, p, &dead_children); | 725 | reparent_leader(father, p, &dead_children); |
728 | } | 726 | } |
729 | write_unlock_irq(&tasklist_lock); | 727 | write_unlock_irq(&tasklist_lock); |
730 | 728 | ||
731 | BUG_ON(!list_empty(&father->children)); | 729 | BUG_ON(!list_empty(&father->children)); |
732 | 730 | ||
733 | list_for_each_entry_safe(p, n, &dead_children, sibling) { | 731 | list_for_each_entry_safe(p, n, &dead_children, sibling) { |
734 | list_del_init(&p->sibling); | 732 | list_del_init(&p->sibling); |
735 | release_task(p); | 733 | release_task(p); |
736 | } | 734 | } |
737 | } | 735 | } |
738 | 736 | ||
739 | /* | 737 | /* |
740 | * Send signals to all our closest relatives so that they know | 738 | * Send signals to all our closest relatives so that they know |
741 | * to properly mourn us.. | 739 | * to properly mourn us.. |
742 | */ | 740 | */ |
743 | static void exit_notify(struct task_struct *tsk, int group_dead) | 741 | static void exit_notify(struct task_struct *tsk, int group_dead) |
744 | { | 742 | { |
745 | bool autoreap; | 743 | bool autoreap; |
746 | 744 | ||
747 | /* | 745 | /* |
748 | * This does two things: | 746 | * This does two things: |
749 | * | 747 | * |
750 | * A. Make init inherit all the child processes | 748 | * A. Make init inherit all the child processes |
751 | * B. Check to see if any process groups have become orphaned | 749 | * B. Check to see if any process groups have become orphaned |
752 | * as a result of our exiting, and if they have any stopped | 750 | * as a result of our exiting, and if they have any stopped |
753 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | 751 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) |
754 | */ | 752 | */ |
755 | forget_original_parent(tsk); | 753 | forget_original_parent(tsk); |
756 | exit_task_namespaces(tsk); | 754 | exit_task_namespaces(tsk); |
757 | 755 | ||
758 | write_lock_irq(&tasklist_lock); | 756 | write_lock_irq(&tasklist_lock); |
759 | if (group_dead) | 757 | if (group_dead) |
760 | kill_orphaned_pgrp(tsk->group_leader, NULL); | 758 | kill_orphaned_pgrp(tsk->group_leader, NULL); |
761 | 759 | ||
762 | if (unlikely(tsk->ptrace)) { | 760 | if (unlikely(tsk->ptrace)) { |
763 | int sig = thread_group_leader(tsk) && | 761 | int sig = thread_group_leader(tsk) && |
764 | thread_group_empty(tsk) && | 762 | thread_group_empty(tsk) && |
765 | !ptrace_reparented(tsk) ? | 763 | !ptrace_reparented(tsk) ? |
766 | tsk->exit_signal : SIGCHLD; | 764 | tsk->exit_signal : SIGCHLD; |
767 | autoreap = do_notify_parent(tsk, sig); | 765 | autoreap = do_notify_parent(tsk, sig); |
768 | } else if (thread_group_leader(tsk)) { | 766 | } else if (thread_group_leader(tsk)) { |
769 | autoreap = thread_group_empty(tsk) && | 767 | autoreap = thread_group_empty(tsk) && |
770 | do_notify_parent(tsk, tsk->exit_signal); | 768 | do_notify_parent(tsk, tsk->exit_signal); |
771 | } else { | 769 | } else { |
772 | autoreap = true; | 770 | autoreap = true; |
773 | } | 771 | } |
774 | 772 | ||
775 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; | 773 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; |
776 | 774 | ||
777 | /* mt-exec, de_thread() is waiting for group leader */ | 775 | /* mt-exec, de_thread() is waiting for group leader */ |
778 | if (unlikely(tsk->signal->notify_count < 0)) | 776 | if (unlikely(tsk->signal->notify_count < 0)) |
779 | wake_up_process(tsk->signal->group_exit_task); | 777 | wake_up_process(tsk->signal->group_exit_task); |
780 | write_unlock_irq(&tasklist_lock); | 778 | write_unlock_irq(&tasklist_lock); |
781 | 779 | ||
782 | /* If the process is dead, release it - nobody will wait for it */ | 780 | /* If the process is dead, release it - nobody will wait for it */ |
783 | if (autoreap) | 781 | if (autoreap) |
784 | release_task(tsk); | 782 | release_task(tsk); |
785 | } | 783 | } |
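
The autoreap path in exit_notify() is taken when do_notify_parent() reports that the parent is not interested, most commonly because it ignores SIGCHLD. A hypothetical userspace sketch (not part of this commit) of that classic zombie-free arrangement:

    /* Hypothetical userspace sketch: with SIGCHLD set to SIG_IGN, the
     * exiting child goes straight to EXIT_DEAD in exit_notify() and never
     * lingers as a zombie; wait() then fails with ECHILD. */
    #include <signal.h>
    #include <unistd.h>
    #include <sys/wait.h>
    #include <stdio.h>
    #include <errno.h>

    int main(void)
    {
            signal(SIGCHLD, SIG_IGN);       /* children are reaped automatically */

            if (fork() == 0)
                    _exit(0);               /* child exits; no zombie remains */

            sleep(1);                       /* give the child time to exit */
            if (wait(NULL) == -1 && errno == ECHILD)
                    printf("child was autoreaped\n");
            return 0;
    }
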
786 | 784 | ||
787 | #ifdef CONFIG_DEBUG_STACK_USAGE | 785 | #ifdef CONFIG_DEBUG_STACK_USAGE |
788 | static void check_stack_usage(void) | 786 | static void check_stack_usage(void) |
789 | { | 787 | { |
790 | static DEFINE_SPINLOCK(low_water_lock); | 788 | static DEFINE_SPINLOCK(low_water_lock); |
791 | static int lowest_to_date = THREAD_SIZE; | 789 | static int lowest_to_date = THREAD_SIZE; |
792 | unsigned long free; | 790 | unsigned long free; |
793 | 791 | ||
794 | free = stack_not_used(current); | 792 | free = stack_not_used(current); |
795 | 793 | ||
796 | if (free >= lowest_to_date) | 794 | if (free >= lowest_to_date) |
797 | return; | 795 | return; |
798 | 796 | ||
799 | spin_lock(&low_water_lock); | 797 | spin_lock(&low_water_lock); |
800 | if (free < lowest_to_date) { | 798 | if (free < lowest_to_date) { |
801 | printk(KERN_WARNING "%s (%d) used greatest stack depth: " | 799 | printk(KERN_WARNING "%s (%d) used greatest stack depth: " |
802 | "%lu bytes left\n", | 800 | "%lu bytes left\n", |
803 | current->comm, task_pid_nr(current), free); | 801 | current->comm, task_pid_nr(current), free); |
804 | lowest_to_date = free; | 802 | lowest_to_date = free; |
805 | } | 803 | } |
806 | spin_unlock(&low_water_lock); | 804 | spin_unlock(&low_water_lock); |
807 | } | 805 | } |
808 | #else | 806 | #else |
809 | static inline void check_stack_usage(void) {} | 807 | static inline void check_stack_usage(void) {} |
810 | #endif | 808 | #endif |
811 | 809 | ||
812 | void do_exit(long code) | 810 | void do_exit(long code) |
813 | { | 811 | { |
814 | struct task_struct *tsk = current; | 812 | struct task_struct *tsk = current; |
815 | int group_dead; | 813 | int group_dead; |
816 | 814 | ||
817 | profile_task_exit(tsk); | 815 | profile_task_exit(tsk); |
818 | 816 | ||
819 | WARN_ON(blk_needs_flush_plug(tsk)); | 817 | WARN_ON(blk_needs_flush_plug(tsk)); |
820 | 818 | ||
821 | if (unlikely(in_interrupt())) | 819 | if (unlikely(in_interrupt())) |
822 | panic("Aiee, killing interrupt handler!"); | 820 | panic("Aiee, killing interrupt handler!"); |
823 | if (unlikely(!tsk->pid)) | 821 | if (unlikely(!tsk->pid)) |
824 | panic("Attempted to kill the idle task!"); | 822 | panic("Attempted to kill the idle task!"); |
825 | 823 | ||
826 | /* | 824 | /* |
827 | * If do_exit is called because this process oopsed, it's possible | 825 | * If do_exit is called because this process oopsed, it's possible |
828 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before | 826 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before |
829 | * continuing. Amongst other possible reasons, this is to prevent | 827 | * continuing. Amongst other possible reasons, this is to prevent |
830 | * mm_release()->clear_child_tid() from writing to a user-controlled | 828 | * mm_release()->clear_child_tid() from writing to a user-controlled |
831 | * kernel address. | 829 | * kernel address. |
832 | */ | 830 | */ |
833 | set_fs(USER_DS); | 831 | set_fs(USER_DS); |
834 | 832 | ||
835 | ptrace_event(PTRACE_EVENT_EXIT, code); | 833 | ptrace_event(PTRACE_EVENT_EXIT, code); |
836 | 834 | ||
837 | validate_creds_for_do_exit(tsk); | 835 | validate_creds_for_do_exit(tsk); |
838 | 836 | ||
839 | /* | 837 | /* |
840 | * We're taking recursive faults here in do_exit. Safest is to just | 838 | * We're taking recursive faults here in do_exit. Safest is to just |
841 | * leave this task alone and wait for reboot. | 839 | * leave this task alone and wait for reboot. |
842 | */ | 840 | */ |
843 | if (unlikely(tsk->flags & PF_EXITING)) { | 841 | if (unlikely(tsk->flags & PF_EXITING)) { |
844 | printk(KERN_ALERT | 842 | printk(KERN_ALERT |
845 | "Fixing recursive fault but reboot is needed!\n"); | 843 | "Fixing recursive fault but reboot is needed!\n"); |
846 | /* | 844 | /* |
847 | * We can do this unlocked here. The futex code uses | 845 | * We can do this unlocked here. The futex code uses |
848 | * this flag just to verify whether the pi state | 846 | * this flag just to verify whether the pi state |
849 | * cleanup has been done or not. In the worst case it | 847 | * cleanup has been done or not. In the worst case it |
850 | * loops once more. We pretend that the cleanup was | 848 | * loops once more. We pretend that the cleanup was |
851 | * done as there is no way to return. Either the | 849 | * done as there is no way to return. Either the |
852 | * OWNER_DIED bit is set by now or we push the blocked | 850 | * OWNER_DIED bit is set by now or we push the blocked |
853 | * task into the wait-forever nirvana as well. | 851 | * task into the wait-forever nirvana as well. |
854 | */ | 852 | */ |
855 | tsk->flags |= PF_EXITPIDONE; | 853 | tsk->flags |= PF_EXITPIDONE; |
856 | set_current_state(TASK_UNINTERRUPTIBLE); | 854 | set_current_state(TASK_UNINTERRUPTIBLE); |
857 | schedule(); | 855 | schedule(); |
858 | } | 856 | } |
859 | 857 | ||
860 | exit_signals(tsk); /* sets PF_EXITING */ | 858 | exit_signals(tsk); /* sets PF_EXITING */ |
861 | /* | 859 | /* |
862 | * tsk->flags are checked in the futex code to protect against | 860 | * tsk->flags are checked in the futex code to protect against |
863 | * an exiting task cleaning up the robust pi futexes. | 861 | * an exiting task cleaning up the robust pi futexes. |
864 | */ | 862 | */ |
865 | smp_mb(); | 863 | smp_mb(); |
866 | raw_spin_unlock_wait(&tsk->pi_lock); | 864 | raw_spin_unlock_wait(&tsk->pi_lock); |
867 | 865 | ||
868 | if (unlikely(in_atomic())) | 866 | if (unlikely(in_atomic())) |
869 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 867 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
870 | current->comm, task_pid_nr(current), | 868 | current->comm, task_pid_nr(current), |
871 | preempt_count()); | 869 | preempt_count()); |
872 | 870 | ||
873 | acct_update_integrals(tsk); | 871 | acct_update_integrals(tsk); |
874 | /* sync mm's RSS info before statistics gathering */ | 872 | /* sync mm's RSS info before statistics gathering */ |
875 | if (tsk->mm) | 873 | if (tsk->mm) |
876 | sync_mm_rss(tsk->mm); | 874 | sync_mm_rss(tsk->mm); |
877 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 875 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
878 | if (group_dead) { | 876 | if (group_dead) { |
879 | hrtimer_cancel(&tsk->signal->real_timer); | 877 | hrtimer_cancel(&tsk->signal->real_timer); |
880 | exit_itimers(tsk->signal); | 878 | exit_itimers(tsk->signal); |
881 | if (tsk->mm) | 879 | if (tsk->mm) |
882 | setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); | 880 | setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); |
883 | } | 881 | } |
884 | acct_collect(code, group_dead); | 882 | acct_collect(code, group_dead); |
885 | if (group_dead) | 883 | if (group_dead) |
886 | tty_audit_exit(); | 884 | tty_audit_exit(); |
887 | audit_free(tsk); | 885 | audit_free(tsk); |
888 | 886 | ||
889 | tsk->exit_code = code; | 887 | tsk->exit_code = code; |
890 | taskstats_exit(tsk, group_dead); | 888 | taskstats_exit(tsk, group_dead); |
891 | 889 | ||
892 | exit_mm(tsk); | 890 | exit_mm(tsk); |
893 | 891 | ||
894 | if (group_dead) | 892 | if (group_dead) |
895 | acct_process(); | 893 | acct_process(); |
896 | trace_sched_process_exit(tsk); | 894 | trace_sched_process_exit(tsk); |
897 | 895 | ||
898 | exit_sem(tsk); | 896 | exit_sem(tsk); |
899 | exit_shm(tsk); | 897 | exit_shm(tsk); |
900 | exit_files(tsk); | 898 | exit_files(tsk); |
901 | exit_fs(tsk); | 899 | exit_fs(tsk); |
902 | exit_task_work(tsk); | 900 | exit_task_work(tsk); |
903 | check_stack_usage(); | 901 | check_stack_usage(); |
904 | exit_thread(); | 902 | exit_thread(); |
905 | 903 | ||
906 | /* | 904 | /* |
907 | * Flush inherited counters to the parent - before the parent | 905 | * Flush inherited counters to the parent - before the parent |
908 | * gets woken up by child-exit notifications. | 906 | * gets woken up by child-exit notifications. |
909 | * | 907 | * |
910 | * Because of cgroup mode, this must be called before cgroup_exit(). | 908 | * Because of cgroup mode, this must be called before cgroup_exit(). |
911 | */ | 909 | */ |
912 | perf_event_exit_task(tsk); | 910 | perf_event_exit_task(tsk); |
913 | 911 | ||
914 | cgroup_exit(tsk, 1); | 912 | cgroup_exit(tsk, 1); |
915 | 913 | ||
916 | if (group_dead) | 914 | if (group_dead) |
917 | disassociate_ctty(1); | 915 | disassociate_ctty(1); |
918 | 916 | ||
919 | module_put(task_thread_info(tsk)->exec_domain->module); | 917 | module_put(task_thread_info(tsk)->exec_domain->module); |
920 | 918 | ||
921 | proc_exit_connector(tsk); | 919 | proc_exit_connector(tsk); |
922 | 920 | ||
923 | /* | 921 | /* |
924 | * FIXME: do that only when needed, using sched_exit tracepoint | 922 | * FIXME: do that only when needed, using sched_exit tracepoint |
925 | */ | 923 | */ |
926 | ptrace_put_breakpoints(tsk); | 924 | ptrace_put_breakpoints(tsk); |
927 | 925 | ||
928 | exit_notify(tsk, group_dead); | 926 | exit_notify(tsk, group_dead); |
929 | #ifdef CONFIG_NUMA | 927 | #ifdef CONFIG_NUMA |
930 | task_lock(tsk); | 928 | task_lock(tsk); |
931 | mpol_put(tsk->mempolicy); | 929 | mpol_put(tsk->mempolicy); |
932 | tsk->mempolicy = NULL; | 930 | tsk->mempolicy = NULL; |
933 | task_unlock(tsk); | 931 | task_unlock(tsk); |
934 | #endif | 932 | #endif |
935 | #ifdef CONFIG_FUTEX | 933 | #ifdef CONFIG_FUTEX |
936 | if (unlikely(current->pi_state_cache)) | 934 | if (unlikely(current->pi_state_cache)) |
937 | kfree(current->pi_state_cache); | 935 | kfree(current->pi_state_cache); |
938 | #endif | 936 | #endif |
939 | /* | 937 | /* |
940 | * Make sure we are holding no locks: | 938 | * Make sure we are holding no locks: |
941 | */ | 939 | */ |
942 | debug_check_no_locks_held(tsk); | 940 | debug_check_no_locks_held(tsk); |
943 | /* | 941 | /* |
944 | * We can do this unlocked here. The futex code uses this flag | 942 | * We can do this unlocked here. The futex code uses this flag |
945 | * just to verify whether the pi state cleanup has been done | 943 | * just to verify whether the pi state cleanup has been done |
946 | * or not. In the worst case it loops once more. | 944 | * or not. In the worst case it loops once more. |
947 | */ | 945 | */ |
948 | tsk->flags |= PF_EXITPIDONE; | 946 | tsk->flags |= PF_EXITPIDONE; |
949 | 947 | ||
950 | if (tsk->io_context) | 948 | if (tsk->io_context) |
951 | exit_io_context(tsk); | 949 | exit_io_context(tsk); |
952 | 950 | ||
953 | if (tsk->splice_pipe) | 951 | if (tsk->splice_pipe) |
954 | __free_pipe_info(tsk->splice_pipe); | 952 | __free_pipe_info(tsk->splice_pipe); |
955 | 953 | ||
956 | validate_creds_for_do_exit(tsk); | 954 | validate_creds_for_do_exit(tsk); |
957 | 955 | ||
958 | preempt_disable(); | 956 | preempt_disable(); |
959 | if (tsk->nr_dirtied) | 957 | if (tsk->nr_dirtied) |
960 | __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); | 958 | __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); |
961 | exit_rcu(); | 959 | exit_rcu(); |
962 | 960 | ||
963 | /* | 961 | /* |
964 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | 962 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed |
965 | * when the following two conditions become true. | 963 | * when the following two conditions become true. |
966 | * - There is a race condition on mmap_sem (it is acquired by | 964 | * - There is a race condition on mmap_sem (it is acquired by |
967 | * exit_mm()), and | 965 | * exit_mm()), and |
968 | * - SMI occurs before setting TASK_RUNNING. | 966 | * - SMI occurs before setting TASK_RUNNING. |
969 | * (or the hypervisor of a virtual machine switches to another guest) | 967 | * (or the hypervisor of a virtual machine switches to another guest) |
970 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD | 968 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD |
971 | * | 969 | * |
972 | * To avoid this, we have to wait for tsk->pi_lock, which is held by | 970 | * To avoid this, we have to wait for tsk->pi_lock, which is held by |
973 | * try_to_wake_up(), to be released. | 971 | * try_to_wake_up(), to be released. |
974 | */ | 972 | */ |
975 | smp_mb(); | 973 | smp_mb(); |
976 | raw_spin_unlock_wait(&tsk->pi_lock); | 974 | raw_spin_unlock_wait(&tsk->pi_lock); |
977 | 975 | ||
978 | /* causes final put_task_struct in finish_task_switch(). */ | 976 | /* causes final put_task_struct in finish_task_switch(). */ |
979 | tsk->state = TASK_DEAD; | 977 | tsk->state = TASK_DEAD; |
980 | tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | 978 | tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ |
981 | schedule(); | 979 | schedule(); |
982 | BUG(); | 980 | BUG(); |
983 | /* Avoid "noreturn function does return". */ | 981 | /* Avoid "noreturn function does return". */ |
984 | for (;;) | 982 | for (;;) |
985 | cpu_relax(); /* For when BUG is null */ | 983 | cpu_relax(); /* For when BUG is null */ |
986 | } | 984 | } |
987 | 985 | ||
988 | EXPORT_SYMBOL_GPL(do_exit); | 986 | EXPORT_SYMBOL_GPL(do_exit); |
989 | 987 | ||
990 | void complete_and_exit(struct completion *comp, long code) | 988 | void complete_and_exit(struct completion *comp, long code) |
991 | { | 989 | { |
992 | if (comp) | 990 | if (comp) |
993 | complete(comp); | 991 | complete(comp); |
994 | 992 | ||
995 | do_exit(code); | 993 | do_exit(code); |
996 | } | 994 | } |
997 | 995 | ||
998 | EXPORT_SYMBOL(complete_and_exit); | 996 | EXPORT_SYMBOL(complete_and_exit); |
999 | 997 | ||
1000 | SYSCALL_DEFINE1(exit, int, error_code) | 998 | SYSCALL_DEFINE1(exit, int, error_code) |
1001 | { | 999 | { |
1002 | do_exit((error_code&0xff)<<8); | 1000 | do_exit((error_code&0xff)<<8); |
1003 | } | 1001 | } |
1004 | 1002 | ||
1005 | /* | 1003 | /* |
1006 | * Take down every thread in the group. This is called by fatal signals | 1004 | * Take down every thread in the group. This is called by fatal signals |
1007 | * as well as by sys_exit_group (below). | 1005 | * as well as by sys_exit_group (below). |
1008 | */ | 1006 | */ |
1009 | void | 1007 | void |
1010 | do_group_exit(int exit_code) | 1008 | do_group_exit(int exit_code) |
1011 | { | 1009 | { |
1012 | struct signal_struct *sig = current->signal; | 1010 | struct signal_struct *sig = current->signal; |
1013 | 1011 | ||
1014 | BUG_ON(exit_code & 0x80); /* core dumps don't get here */ | 1012 | BUG_ON(exit_code & 0x80); /* core dumps don't get here */ |
1015 | 1013 | ||
1016 | if (signal_group_exit(sig)) | 1014 | if (signal_group_exit(sig)) |
1017 | exit_code = sig->group_exit_code; | 1015 | exit_code = sig->group_exit_code; |
1018 | else if (!thread_group_empty(current)) { | 1016 | else if (!thread_group_empty(current)) { |
1019 | struct sighand_struct *const sighand = current->sighand; | 1017 | struct sighand_struct *const sighand = current->sighand; |
1020 | spin_lock_irq(&sighand->siglock); | 1018 | spin_lock_irq(&sighand->siglock); |
1021 | if (signal_group_exit(sig)) | 1019 | if (signal_group_exit(sig)) |
1022 | /* Another thread got here before we took the lock. */ | 1020 | /* Another thread got here before we took the lock. */ |
1023 | exit_code = sig->group_exit_code; | 1021 | exit_code = sig->group_exit_code; |
1024 | else { | 1022 | else { |
1025 | sig->group_exit_code = exit_code; | 1023 | sig->group_exit_code = exit_code; |
1026 | sig->flags = SIGNAL_GROUP_EXIT; | 1024 | sig->flags = SIGNAL_GROUP_EXIT; |
1027 | zap_other_threads(current); | 1025 | zap_other_threads(current); |
1028 | } | 1026 | } |
1029 | spin_unlock_irq(&sighand->siglock); | 1027 | spin_unlock_irq(&sighand->siglock); |
1030 | } | 1028 | } |
1031 | 1029 | ||
1032 | do_exit(exit_code); | 1030 | do_exit(exit_code); |
1033 | /* NOTREACHED */ | 1031 | /* NOTREACHED */ |
1034 | } | 1032 | } |
1035 | 1033 | ||
1036 | /* | 1034 | /* |
1037 | * This kills every thread in the thread group. Note that any externally | 1035 | * This kills every thread in the thread group. Note that any externally |
1038 | * wait4()-ing process will get the correct exit code - even if this | 1036 | * wait4()-ing process will get the correct exit code - even if this |
1039 | * thread is not the thread group leader. | 1037 | * thread is not the thread group leader. |
1040 | */ | 1038 | */ |
1041 | SYSCALL_DEFINE1(exit_group, int, error_code) | 1039 | SYSCALL_DEFINE1(exit_group, int, error_code) |
1042 | { | 1040 | { |
1043 | do_group_exit((error_code & 0xff) << 8); | 1041 | do_group_exit((error_code & 0xff) << 8); |
1044 | /* NOTREACHED */ | 1042 | /* NOTREACHED */ |
1045 | return 0; | 1043 | return 0; |
1046 | } | 1044 | } |
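
Both sys_exit() above and sys_exit_group() here pack the low byte of the user-supplied code into bits 8-15 of the kernel exit code. A small illustrative userspace program (not part of this commit) showing how that encoding round-trips through waitpid() and the standard W* macros:

    /* Illustrative sketch: exit(42) is stored as (42 & 0xff) << 8 in the
     * task's exit_code; waitpid() reports it and WEXITSTATUS() undoes the
     * shift. */
    #include <sys/wait.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
            int status;
            pid_t pid = fork();

            if (pid == 0)
                    _exit(42);                      /* stored as 42 << 8 */

            waitpid(pid, &status, 0);
            if (WIFEXITED(status))
                    printf("exit status: %d\n", WEXITSTATUS(status));   /* 42 */
            return 0;
    }
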
1047 | 1045 | ||
1048 | struct wait_opts { | 1046 | struct wait_opts { |
1049 | enum pid_type wo_type; | 1047 | enum pid_type wo_type; |
1050 | int wo_flags; | 1048 | int wo_flags; |
1051 | struct pid *wo_pid; | 1049 | struct pid *wo_pid; |
1052 | 1050 | ||
1053 | struct siginfo __user *wo_info; | 1051 | struct siginfo __user *wo_info; |
1054 | int __user *wo_stat; | 1052 | int __user *wo_stat; |
1055 | struct rusage __user *wo_rusage; | 1053 | struct rusage __user *wo_rusage; |
1056 | 1054 | ||
1057 | wait_queue_t child_wait; | 1055 | wait_queue_t child_wait; |
1058 | int notask_error; | 1056 | int notask_error; |
1059 | }; | 1057 | }; |
1060 | 1058 | ||
1061 | static inline | 1059 | static inline |
1062 | struct pid *task_pid_type(struct task_struct *task, enum pid_type type) | 1060 | struct pid *task_pid_type(struct task_struct *task, enum pid_type type) |
1063 | { | 1061 | { |
1064 | if (type != PIDTYPE_PID) | 1062 | if (type != PIDTYPE_PID) |
1065 | task = task->group_leader; | 1063 | task = task->group_leader; |
1066 | return task->pids[type].pid; | 1064 | return task->pids[type].pid; |
1067 | } | 1065 | } |
1068 | 1066 | ||
1069 | static int eligible_pid(struct wait_opts *wo, struct task_struct *p) | 1067 | static int eligible_pid(struct wait_opts *wo, struct task_struct *p) |
1070 | { | 1068 | { |
1071 | return wo->wo_type == PIDTYPE_MAX || | 1069 | return wo->wo_type == PIDTYPE_MAX || |
1072 | task_pid_type(p, wo->wo_type) == wo->wo_pid; | 1070 | task_pid_type(p, wo->wo_type) == wo->wo_pid; |
1073 | } | 1071 | } |
1074 | 1072 | ||
1075 | static int eligible_child(struct wait_opts *wo, struct task_struct *p) | 1073 | static int eligible_child(struct wait_opts *wo, struct task_struct *p) |
1076 | { | 1074 | { |
1077 | if (!eligible_pid(wo, p)) | 1075 | if (!eligible_pid(wo, p)) |
1078 | return 0; | 1076 | return 0; |
1079 | /* Wait for all children (clone and not) if __WALL is set; | 1077 | /* Wait for all children (clone and not) if __WALL is set; |
1080 | * otherwise, wait for clone children *only* if __WCLONE is | 1078 | * otherwise, wait for clone children *only* if __WCLONE is |
1081 | * set; otherwise, wait for non-clone children *only*. (Note: | 1079 | * set; otherwise, wait for non-clone children *only*. (Note: |
1082 | * A "clone" child here is one that reports to its parent | 1080 | * A "clone" child here is one that reports to its parent |
1083 | * using a signal other than SIGCHLD.) */ | 1081 | * using a signal other than SIGCHLD.) */ |
1084 | if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) | 1082 | if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) |
1085 | && !(wo->wo_flags & __WALL)) | 1083 | && !(wo->wo_flags & __WALL)) |
1086 | return 0; | 1084 | return 0; |
1087 | 1085 | ||
1088 | return 1; | 1086 | return 1; |
1089 | } | 1087 | } |
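
The __WALL/__WCLONE test in eligible_child() boils down to "match the child's kind unless __WALL was given". A standalone restatement of that predicate, with hypothetical helper names introduced only for illustration:

    /* Illustrative restatement of the eligible_child() filter: a "clone"
     * child is one that reports with a signal other than SIGCHLD.  __WCLONE
     * selects only those, __WALL selects everything, and the default selects
     * ordinary (SIGCHLD-reporting) children only. */
    #include <signal.h>
    #include <stdio.h>

    #ifndef __WCLONE
    #define __WCLONE 0x80000000
    #endif
    #ifndef __WALL
    #define __WALL   0x40000000
    #endif

    static int wait_flags_match(int exit_signal, unsigned int wo_flags)
    {
            int is_clone = (exit_signal != SIGCHLD);

            if (wo_flags & __WALL)
                    return 1;
            return is_clone == !!(wo_flags & __WCLONE);
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   wait_flags_match(SIGCHLD, 0),          /* 1: plain child, plain wait */
                   wait_flags_match(SIGUSR1, 0),          /* 0: clone child needs __WCLONE */
                   wait_flags_match(SIGUSR1, __WCLONE));  /* 1 */
            return 0;
    }
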
1090 | 1088 | ||
1091 | static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | 1089 | static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, |
1092 | pid_t pid, uid_t uid, int why, int status) | 1090 | pid_t pid, uid_t uid, int why, int status) |
1093 | { | 1091 | { |
1094 | struct siginfo __user *infop; | 1092 | struct siginfo __user *infop; |
1095 | int retval = wo->wo_rusage | 1093 | int retval = wo->wo_rusage |
1096 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1094 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1097 | 1095 | ||
1098 | put_task_struct(p); | 1096 | put_task_struct(p); |
1099 | infop = wo->wo_info; | 1097 | infop = wo->wo_info; |
1100 | if (infop) { | 1098 | if (infop) { |
1101 | if (!retval) | 1099 | if (!retval) |
1102 | retval = put_user(SIGCHLD, &infop->si_signo); | 1100 | retval = put_user(SIGCHLD, &infop->si_signo); |
1103 | if (!retval) | 1101 | if (!retval) |
1104 | retval = put_user(0, &infop->si_errno); | 1102 | retval = put_user(0, &infop->si_errno); |
1105 | if (!retval) | 1103 | if (!retval) |
1106 | retval = put_user((short)why, &infop->si_code); | 1104 | retval = put_user((short)why, &infop->si_code); |
1107 | if (!retval) | 1105 | if (!retval) |
1108 | retval = put_user(pid, &infop->si_pid); | 1106 | retval = put_user(pid, &infop->si_pid); |
1109 | if (!retval) | 1107 | if (!retval) |
1110 | retval = put_user(uid, &infop->si_uid); | 1108 | retval = put_user(uid, &infop->si_uid); |
1111 | if (!retval) | 1109 | if (!retval) |
1112 | retval = put_user(status, &infop->si_status); | 1110 | retval = put_user(status, &infop->si_status); |
1113 | } | 1111 | } |
1114 | if (!retval) | 1112 | if (!retval) |
1115 | retval = pid; | 1113 | retval = pid; |
1116 | return retval; | 1114 | return retval; |
1117 | } | 1115 | } |
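
wait_noreap_copyout() serves the WNOWAIT case: the caller gets the siginfo but the child is left in place. A hypothetical userspace fragment (not part of this commit) exercising that path through waitid():

    /* Illustrative sketch: waitid(..., WNOWAIT) peeks at the zombie via the
     * wait_noreap_copyout() path without reaping it; a second waitid()
     * without WNOWAIT then really releases the child. */
    #include <sys/wait.h>
    #include <signal.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
            siginfo_t info;
            pid_t pid = fork();

            if (pid == 0)
                    _exit(7);

            waitid(P_PID, pid, &info, WEXITED | WNOWAIT);   /* child stays a zombie */
            printf("peek: pid=%d si_code=%d status=%d\n",
                   (int)info.si_pid, info.si_code, info.si_status);

            waitid(P_PID, pid, &info, WEXITED);             /* now actually reaped */
            return 0;
    }
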
1118 | 1116 | ||
1119 | /* | 1117 | /* |
1120 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold | 1118 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold |
1121 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1119 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold |
1122 | * the lock and this task is uninteresting. If we return nonzero, we have | 1120 | * the lock and this task is uninteresting. If we return nonzero, we have |
1123 | * released the lock and the system call should return. | 1121 | * released the lock and the system call should return. |
1124 | */ | 1122 | */ |
1125 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | 1123 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) |
1126 | { | 1124 | { |
1127 | unsigned long state; | 1125 | unsigned long state; |
1128 | int retval, status, traced; | 1126 | int retval, status, traced; |
1129 | pid_t pid = task_pid_vnr(p); | 1127 | pid_t pid = task_pid_vnr(p); |
1130 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); | 1128 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1131 | struct siginfo __user *infop; | 1129 | struct siginfo __user *infop; |
1132 | 1130 | ||
1133 | if (!likely(wo->wo_flags & WEXITED)) | 1131 | if (!likely(wo->wo_flags & WEXITED)) |
1134 | return 0; | 1132 | return 0; |
1135 | 1133 | ||
1136 | if (unlikely(wo->wo_flags & WNOWAIT)) { | 1134 | if (unlikely(wo->wo_flags & WNOWAIT)) { |
1137 | int exit_code = p->exit_code; | 1135 | int exit_code = p->exit_code; |
1138 | int why; | 1136 | int why; |
1139 | 1137 | ||
1140 | get_task_struct(p); | 1138 | get_task_struct(p); |
1141 | read_unlock(&tasklist_lock); | 1139 | read_unlock(&tasklist_lock); |
1142 | if ((exit_code & 0x7f) == 0) { | 1140 | if ((exit_code & 0x7f) == 0) { |
1143 | why = CLD_EXITED; | 1141 | why = CLD_EXITED; |
1144 | status = exit_code >> 8; | 1142 | status = exit_code >> 8; |
1145 | } else { | 1143 | } else { |
1146 | why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; | 1144 | why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; |
1147 | status = exit_code & 0x7f; | 1145 | status = exit_code & 0x7f; |
1148 | } | 1146 | } |
1149 | return wait_noreap_copyout(wo, p, pid, uid, why, status); | 1147 | return wait_noreap_copyout(wo, p, pid, uid, why, status); |
1150 | } | 1148 | } |
1151 | 1149 | ||
1152 | /* | 1150 | /* |
1153 | * Try to move the task's state to DEAD | 1151 | * Try to move the task's state to DEAD |
1154 | * only one thread is allowed to do this: | 1152 | * only one thread is allowed to do this: |
1155 | */ | 1153 | */ |
1156 | state = xchg(&p->exit_state, EXIT_DEAD); | 1154 | state = xchg(&p->exit_state, EXIT_DEAD); |
1157 | if (state != EXIT_ZOMBIE) { | 1155 | if (state != EXIT_ZOMBIE) { |
1158 | BUG_ON(state != EXIT_DEAD); | 1156 | BUG_ON(state != EXIT_DEAD); |
1159 | return 0; | 1157 | return 0; |
1160 | } | 1158 | } |
1161 | 1159 | ||
1162 | traced = ptrace_reparented(p); | 1160 | traced = ptrace_reparented(p); |
1163 | /* | 1161 | /* |
1164 | * It can be ptraced but not reparented, check | 1162 | * It can be ptraced but not reparented, check |
1165 | * thread_group_leader() to filter out sub-threads. | 1163 | * thread_group_leader() to filter out sub-threads. |
1166 | */ | 1164 | */ |
1167 | if (likely(!traced) && thread_group_leader(p)) { | 1165 | if (likely(!traced) && thread_group_leader(p)) { |
1168 | struct signal_struct *psig; | 1166 | struct signal_struct *psig; |
1169 | struct signal_struct *sig; | 1167 | struct signal_struct *sig; |
1170 | unsigned long maxrss; | 1168 | unsigned long maxrss; |
1171 | cputime_t tgutime, tgstime; | 1169 | cputime_t tgutime, tgstime; |
1172 | 1170 | ||
1173 | /* | 1171 | /* |
1174 | * The resource counters for the group leader are in its | 1172 | * The resource counters for the group leader are in its |
1175 | * own task_struct. Those for dead threads in the group | 1173 | * own task_struct. Those for dead threads in the group |
1176 | * are in its signal_struct, as are those for the child | 1174 | * are in its signal_struct, as are those for the child |
1177 | * processes it has previously reaped. All these | 1175 | * processes it has previously reaped. All these |
1178 | * accumulate in the parent's signal_struct c* fields. | 1176 | * accumulate in the parent's signal_struct c* fields. |
1179 | * | 1177 | * |
1180 | * We don't bother to take a lock here to protect these | 1178 | * We don't bother to take a lock here to protect these |
1181 | * p->signal fields, because they are only touched by | 1179 | * p->signal fields, because they are only touched by |
1182 | * __exit_signal, which runs with tasklist_lock | 1180 | * __exit_signal, which runs with tasklist_lock |
1183 | * write-locked anyway, and so is excluded here. We do | 1181 | * write-locked anyway, and so is excluded here. We do |
1184 | * need to protect the access to parent->signal fields, | 1182 | * need to protect the access to parent->signal fields, |
1185 | * as other threads in the parent group can be right | 1183 | * as other threads in the parent group can be right |
1186 | * here reaping other children at the same time. | 1184 | * here reaping other children at the same time. |
1187 | * | 1185 | * |
1188 | * We use thread_group_times() to get times for the thread | 1186 | * We use thread_group_times() to get times for the thread |
1189 | * group, which consolidates times for all threads in the | 1187 | * group, which consolidates times for all threads in the |
1190 | * group including the group leader. | 1188 | * group including the group leader. |
1191 | */ | 1189 | */ |
1192 | thread_group_times(p, &tgutime, &tgstime); | 1190 | thread_group_times(p, &tgutime, &tgstime); |
1193 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1191 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1194 | psig = p->real_parent->signal; | 1192 | psig = p->real_parent->signal; |
1195 | sig = p->signal; | 1193 | sig = p->signal; |
1196 | psig->cutime += tgutime + sig->cutime; | 1194 | psig->cutime += tgutime + sig->cutime; |
1197 | psig->cstime += tgstime + sig->cstime; | 1195 | psig->cstime += tgstime + sig->cstime; |
1198 | psig->cgtime += p->gtime + sig->gtime + sig->cgtime; | 1196 | psig->cgtime += p->gtime + sig->gtime + sig->cgtime; |
1199 | psig->cmin_flt += | 1197 | psig->cmin_flt += |
1200 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1198 | p->min_flt + sig->min_flt + sig->cmin_flt; |
1201 | psig->cmaj_flt += | 1199 | psig->cmaj_flt += |
1202 | p->maj_flt + sig->maj_flt + sig->cmaj_flt; | 1200 | p->maj_flt + sig->maj_flt + sig->cmaj_flt; |
1203 | psig->cnvcsw += | 1201 | psig->cnvcsw += |
1204 | p->nvcsw + sig->nvcsw + sig->cnvcsw; | 1202 | p->nvcsw + sig->nvcsw + sig->cnvcsw; |
1205 | psig->cnivcsw += | 1203 | psig->cnivcsw += |
1206 | p->nivcsw + sig->nivcsw + sig->cnivcsw; | 1204 | p->nivcsw + sig->nivcsw + sig->cnivcsw; |
1207 | psig->cinblock += | 1205 | psig->cinblock += |
1208 | task_io_get_inblock(p) + | 1206 | task_io_get_inblock(p) + |
1209 | sig->inblock + sig->cinblock; | 1207 | sig->inblock + sig->cinblock; |
1210 | psig->coublock += | 1208 | psig->coublock += |
1211 | task_io_get_oublock(p) + | 1209 | task_io_get_oublock(p) + |
1212 | sig->oublock + sig->coublock; | 1210 | sig->oublock + sig->coublock; |
1213 | maxrss = max(sig->maxrss, sig->cmaxrss); | 1211 | maxrss = max(sig->maxrss, sig->cmaxrss); |
1214 | if (psig->cmaxrss < maxrss) | 1212 | if (psig->cmaxrss < maxrss) |
1215 | psig->cmaxrss = maxrss; | 1213 | psig->cmaxrss = maxrss; |
1216 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1214 | task_io_accounting_add(&psig->ioac, &p->ioac); |
1217 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1215 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
1218 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1216 | spin_unlock_irq(&p->real_parent->sighand->siglock); |
1219 | } | 1217 | } |
1220 | 1218 | ||
1221 | /* | 1219 | /* |
1222 | * Now we are sure this task is interesting, and no other | 1220 | * Now we are sure this task is interesting, and no other |
1223 | * thread can reap it because we set its state to EXIT_DEAD. | 1221 | * thread can reap it because we set its state to EXIT_DEAD. |
1224 | */ | 1222 | */ |
1225 | read_unlock(&tasklist_lock); | 1223 | read_unlock(&tasklist_lock); |
1226 | 1224 | ||
1227 | retval = wo->wo_rusage | 1225 | retval = wo->wo_rusage |
1228 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1226 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1229 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1227 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
1230 | ? p->signal->group_exit_code : p->exit_code; | 1228 | ? p->signal->group_exit_code : p->exit_code; |
1231 | if (!retval && wo->wo_stat) | 1229 | if (!retval && wo->wo_stat) |
1232 | retval = put_user(status, wo->wo_stat); | 1230 | retval = put_user(status, wo->wo_stat); |
1233 | 1231 | ||
1234 | infop = wo->wo_info; | 1232 | infop = wo->wo_info; |
1235 | if (!retval && infop) | 1233 | if (!retval && infop) |
1236 | retval = put_user(SIGCHLD, &infop->si_signo); | 1234 | retval = put_user(SIGCHLD, &infop->si_signo); |
1237 | if (!retval && infop) | 1235 | if (!retval && infop) |
1238 | retval = put_user(0, &infop->si_errno); | 1236 | retval = put_user(0, &infop->si_errno); |
1239 | if (!retval && infop) { | 1237 | if (!retval && infop) { |
1240 | int why; | 1238 | int why; |
1241 | 1239 | ||
1242 | if ((status & 0x7f) == 0) { | 1240 | if ((status & 0x7f) == 0) { |
1243 | why = CLD_EXITED; | 1241 | why = CLD_EXITED; |
1244 | status >>= 8; | 1242 | status >>= 8; |
1245 | } else { | 1243 | } else { |
1246 | why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; | 1244 | why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; |
1247 | status &= 0x7f; | 1245 | status &= 0x7f; |
1248 | } | 1246 | } |
1249 | retval = put_user((short)why, &infop->si_code); | 1247 | retval = put_user((short)why, &infop->si_code); |
1250 | if (!retval) | 1248 | if (!retval) |
1251 | retval = put_user(status, &infop->si_status); | 1249 | retval = put_user(status, &infop->si_status); |
1252 | } | 1250 | } |
1253 | if (!retval && infop) | 1251 | if (!retval && infop) |
1254 | retval = put_user(pid, &infop->si_pid); | 1252 | retval = put_user(pid, &infop->si_pid); |
1255 | if (!retval && infop) | 1253 | if (!retval && infop) |
1256 | retval = put_user(uid, &infop->si_uid); | 1254 | retval = put_user(uid, &infop->si_uid); |
1257 | if (!retval) | 1255 | if (!retval) |
1258 | retval = pid; | 1256 | retval = pid; |
1259 | 1257 | ||
1260 | if (traced) { | 1258 | if (traced) { |
1261 | write_lock_irq(&tasklist_lock); | 1259 | write_lock_irq(&tasklist_lock); |
1262 | /* We dropped tasklist, ptracer could die and untrace */ | 1260 | /* We dropped tasklist, ptracer could die and untrace */ |
1263 | ptrace_unlink(p); | 1261 | ptrace_unlink(p); |
1264 | /* | 1262 | /* |
1265 | * If this is not a sub-thread, notify the parent. | 1263 | * If this is not a sub-thread, notify the parent. |
1266 | * If parent wants a zombie, don't release it now. | 1264 | * If parent wants a zombie, don't release it now. |
1267 | */ | 1265 | */ |
1268 | if (thread_group_leader(p) && | 1266 | if (thread_group_leader(p) && |
1269 | !do_notify_parent(p, p->exit_signal)) { | 1267 | !do_notify_parent(p, p->exit_signal)) { |
1270 | p->exit_state = EXIT_ZOMBIE; | 1268 | p->exit_state = EXIT_ZOMBIE; |
1271 | p = NULL; | 1269 | p = NULL; |
1272 | } | 1270 | } |
1273 | write_unlock_irq(&tasklist_lock); | 1271 | write_unlock_irq(&tasklist_lock); |
1274 | } | 1272 | } |
1275 | if (p != NULL) | 1273 | if (p != NULL) |
1276 | release_task(p); | 1274 | release_task(p); |
1277 | 1275 | ||
1278 | return retval; | 1276 | return retval; |
1279 | } | 1277 | } |
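
The two decoding blocks in wait_task_zombie() split the raw exit_code into a CLD_* reason and a status in the same way. A small restatement of that split, with a hypothetical helper name used only for illustration:

    /* Illustrative restatement (not kernel code) of the exit_code split used
     * twice in wait_task_zombie(): low 7 bits zero means a normal exit with
     * the status in bits 8-15; otherwise the low 7 bits are the fatal signal
     * and bit 7 marks a core dump. */
    #include <signal.h>
    #include <stdio.h>

    static void decode_exit_code(int exit_code, int *why, int *status)
    {
            if ((exit_code & 0x7f) == 0) {
                    *why = CLD_EXITED;
                    *status = exit_code >> 8;
            } else {
                    *why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
                    *status = exit_code & 0x7f;
            }
    }

    int main(void)
    {
            int why, status;

            decode_exit_code(42 << 8, &why, &status);        /* normal exit(42) */
            printf("why=%d (CLD_EXITED=%d) status=%d\n", why, CLD_EXITED, status);

            decode_exit_code(SIGSEGV | 0x80, &why, &status); /* segfault + core */
            printf("why=%d (CLD_DUMPED=%d) status=%d\n", why, CLD_DUMPED, status);
            return 0;
    }
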
1280 | 1278 | ||
1281 | static int *task_stopped_code(struct task_struct *p, bool ptrace) | 1279 | static int *task_stopped_code(struct task_struct *p, bool ptrace) |
1282 | { | 1280 | { |
1283 | if (ptrace) { | 1281 | if (ptrace) { |
1284 | if (task_is_stopped_or_traced(p) && | 1282 | if (task_is_stopped_or_traced(p) && |
1285 | !(p->jobctl & JOBCTL_LISTENING)) | 1283 | !(p->jobctl & JOBCTL_LISTENING)) |
1286 | return &p->exit_code; | 1284 | return &p->exit_code; |
1287 | } else { | 1285 | } else { |
1288 | if (p->signal->flags & SIGNAL_STOP_STOPPED) | 1286 | if (p->signal->flags & SIGNAL_STOP_STOPPED) |
1289 | return &p->signal->group_exit_code; | 1287 | return &p->signal->group_exit_code; |
1290 | } | 1288 | } |
1291 | return NULL; | 1289 | return NULL; |
1292 | } | 1290 | } |
1293 | 1291 | ||
1294 | /** | 1292 | /** |
1295 | * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED | 1293 | * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED |
1296 | * @wo: wait options | 1294 | * @wo: wait options |
1297 | * @ptrace: is the wait for ptrace | 1295 | * @ptrace: is the wait for ptrace |
1298 | * @p: task to wait for | 1296 | * @p: task to wait for |
1299 | * | 1297 | * |
1300 | * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. | 1298 | * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. |
1301 | * | 1299 | * |
1302 | * CONTEXT: | 1300 | * CONTEXT: |
1303 | * read_lock(&tasklist_lock), which is released if return value is | 1301 | * read_lock(&tasklist_lock), which is released if return value is |
1304 | * non-zero. Also, grabs and releases @p->sighand->siglock. | 1302 | * non-zero. Also, grabs and releases @p->sighand->siglock. |
1305 | * | 1303 | * |
1306 | * RETURNS: | 1304 | * RETURNS: |
1307 | * 0 if wait condition didn't exist and search for other wait conditions | 1305 | * 0 if wait condition didn't exist and search for other wait conditions |
1308 | * should continue. Non-zero return, -errno on failure and @p's pid on | 1306 | * should continue. Non-zero return, -errno on failure and @p's pid on |
1309 | * success, implies that tasklist_lock is released and wait condition | 1307 | * success, implies that tasklist_lock is released and wait condition |
1310 | * search should terminate. | 1308 | * search should terminate. |
1311 | */ | 1309 | */ |
1312 | static int wait_task_stopped(struct wait_opts *wo, | 1310 | static int wait_task_stopped(struct wait_opts *wo, |
1313 | int ptrace, struct task_struct *p) | 1311 | int ptrace, struct task_struct *p) |
1314 | { | 1312 | { |
1315 | struct siginfo __user *infop; | 1313 | struct siginfo __user *infop; |
1316 | int retval, exit_code, *p_code, why; | 1314 | int retval, exit_code, *p_code, why; |
1317 | uid_t uid = 0; /* unneeded, required by compiler */ | 1315 | uid_t uid = 0; /* unneeded, required by compiler */ |
1318 | pid_t pid; | 1316 | pid_t pid; |
1319 | 1317 | ||
1320 | /* | 1318 | /* |
1321 | * Traditionally we see ptrace'd stopped tasks regardless of options. | 1319 | * Traditionally we see ptrace'd stopped tasks regardless of options. |
1322 | */ | 1320 | */ |
1323 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) | 1321 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) |
1324 | return 0; | 1322 | return 0; |
1325 | 1323 | ||
1326 | if (!task_stopped_code(p, ptrace)) | 1324 | if (!task_stopped_code(p, ptrace)) |
1327 | return 0; | 1325 | return 0; |
1328 | 1326 | ||
1329 | exit_code = 0; | 1327 | exit_code = 0; |
1330 | spin_lock_irq(&p->sighand->siglock); | 1328 | spin_lock_irq(&p->sighand->siglock); |
1331 | 1329 | ||
1332 | p_code = task_stopped_code(p, ptrace); | 1330 | p_code = task_stopped_code(p, ptrace); |
1333 | if (unlikely(!p_code)) | 1331 | if (unlikely(!p_code)) |
1334 | goto unlock_sig; | 1332 | goto unlock_sig; |
1335 | 1333 | ||
1336 | exit_code = *p_code; | 1334 | exit_code = *p_code; |
1337 | if (!exit_code) | 1335 | if (!exit_code) |
1338 | goto unlock_sig; | 1336 | goto unlock_sig; |
1339 | 1337 | ||
1340 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1338 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1341 | *p_code = 0; | 1339 | *p_code = 0; |
1342 | 1340 | ||
1343 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); | 1341 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1344 | unlock_sig: | 1342 | unlock_sig: |
1345 | spin_unlock_irq(&p->sighand->siglock); | 1343 | spin_unlock_irq(&p->sighand->siglock); |
1346 | if (!exit_code) | 1344 | if (!exit_code) |
1347 | return 0; | 1345 | return 0; |
1348 | 1346 | ||
1349 | /* | 1347 | /* |
1350 | * Now we are pretty sure this task is interesting. | 1348 | * Now we are pretty sure this task is interesting. |
1351 | * Make sure it doesn't get reaped out from under us while we | 1349 | * Make sure it doesn't get reaped out from under us while we |
1352 | * give up the lock and then examine it below. We don't want to | 1350 | * give up the lock and then examine it below. We don't want to |
1353 | * keep holding onto the tasklist_lock while we call getrusage and | 1351 | * keep holding onto the tasklist_lock while we call getrusage and |
1354 | * possibly take page faults for user memory. | 1352 | * possibly take page faults for user memory. |
1355 | */ | 1353 | */ |
1356 | get_task_struct(p); | 1354 | get_task_struct(p); |
1357 | pid = task_pid_vnr(p); | 1355 | pid = task_pid_vnr(p); |
1358 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; | 1356 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; |
1359 | read_unlock(&tasklist_lock); | 1357 | read_unlock(&tasklist_lock); |
1360 | 1358 | ||
1361 | if (unlikely(wo->wo_flags & WNOWAIT)) | 1359 | if (unlikely(wo->wo_flags & WNOWAIT)) |
1362 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); | 1360 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); |
1363 | 1361 | ||
1364 | retval = wo->wo_rusage | 1362 | retval = wo->wo_rusage |
1365 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1363 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1366 | if (!retval && wo->wo_stat) | 1364 | if (!retval && wo->wo_stat) |
1367 | retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); | 1365 | retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); |
1368 | 1366 | ||
1369 | infop = wo->wo_info; | 1367 | infop = wo->wo_info; |
1370 | if (!retval && infop) | 1368 | if (!retval && infop) |
1371 | retval = put_user(SIGCHLD, &infop->si_signo); | 1369 | retval = put_user(SIGCHLD, &infop->si_signo); |
1372 | if (!retval && infop) | 1370 | if (!retval && infop) |
1373 | retval = put_user(0, &infop->si_errno); | 1371 | retval = put_user(0, &infop->si_errno); |
1374 | if (!retval && infop) | 1372 | if (!retval && infop) |
1375 | retval = put_user((short)why, &infop->si_code); | 1373 | retval = put_user((short)why, &infop->si_code); |
1376 | if (!retval && infop) | 1374 | if (!retval && infop) |
1377 | retval = put_user(exit_code, &infop->si_status); | 1375 | retval = put_user(exit_code, &infop->si_status); |
1378 | if (!retval && infop) | 1376 | if (!retval && infop) |
1379 | retval = put_user(pid, &infop->si_pid); | 1377 | retval = put_user(pid, &infop->si_pid); |
1380 | if (!retval && infop) | 1378 | if (!retval && infop) |
1381 | retval = put_user(uid, &infop->si_uid); | 1379 | retval = put_user(uid, &infop->si_uid); |
1382 | if (!retval) | 1380 | if (!retval) |
1383 | retval = pid; | 1381 | retval = pid; |
1384 | put_task_struct(p); | 1382 | put_task_struct(p); |
1385 | 1383 | ||
1386 | BUG_ON(!retval); | 1384 | BUG_ON(!retval); |
1387 | return retval; | 1385 | return retval; |
1388 | } | 1386 | } |
1389 | 1387 | ||
1390 | /* | 1388 | /* |
1391 | * Handle do_wait work for one task in a live, non-stopped state. | 1389 | * Handle do_wait work for one task in a live, non-stopped state. |
1392 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1390 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold |
1393 | * the lock and this task is uninteresting. If we return nonzero, we have | 1391 | * the lock and this task is uninteresting. If we return nonzero, we have |
1394 | * released the lock and the system call should return. | 1392 | * released the lock and the system call should return. |
1395 | */ | 1393 | */ |
1396 | static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | 1394 | static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) |
1397 | { | 1395 | { |
1398 | int retval; | 1396 | int retval; |
1399 | pid_t pid; | 1397 | pid_t pid; |
1400 | uid_t uid; | 1398 | uid_t uid; |
1401 | 1399 | ||
1402 | if (!unlikely(wo->wo_flags & WCONTINUED)) | 1400 | if (!unlikely(wo->wo_flags & WCONTINUED)) |
1403 | return 0; | 1401 | return 0; |
1404 | 1402 | ||
1405 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) | 1403 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) |
1406 | return 0; | 1404 | return 0; |
1407 | 1405 | ||
1408 | spin_lock_irq(&p->sighand->siglock); | 1406 | spin_lock_irq(&p->sighand->siglock); |
1409 | /* Re-check with the lock held. */ | 1407 | /* Re-check with the lock held. */ |
1410 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { | 1408 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { |
1411 | spin_unlock_irq(&p->sighand->siglock); | 1409 | spin_unlock_irq(&p->sighand->siglock); |
1412 | return 0; | 1410 | return 0; |
1413 | } | 1411 | } |
1414 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1412 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1415 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | 1413 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; |
1416 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); | 1414 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1417 | spin_unlock_irq(&p->sighand->siglock); | 1415 | spin_unlock_irq(&p->sighand->siglock); |
1418 | 1416 | ||
1419 | pid = task_pid_vnr(p); | 1417 | pid = task_pid_vnr(p); |
1420 | get_task_struct(p); | 1418 | get_task_struct(p); |
1421 | read_unlock(&tasklist_lock); | 1419 | read_unlock(&tasklist_lock); |
1422 | 1420 | ||
1423 | if (!wo->wo_info) { | 1421 | if (!wo->wo_info) { |
1424 | retval = wo->wo_rusage | 1422 | retval = wo->wo_rusage |
1425 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1423 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1426 | put_task_struct(p); | 1424 | put_task_struct(p); |
1427 | if (!retval && wo->wo_stat) | 1425 | if (!retval && wo->wo_stat) |
1428 | retval = put_user(0xffff, wo->wo_stat); | 1426 | retval = put_user(0xffff, wo->wo_stat); |
1429 | if (!retval) | 1427 | if (!retval) |
1430 | retval = pid; | 1428 | retval = pid; |
1431 | } else { | 1429 | } else { |
1432 | retval = wait_noreap_copyout(wo, p, pid, uid, | 1430 | retval = wait_noreap_copyout(wo, p, pid, uid, |
1433 | CLD_CONTINUED, SIGCONT); | 1431 | CLD_CONTINUED, SIGCONT); |
1434 | BUG_ON(retval == 0); | 1432 | BUG_ON(retval == 0); |
1435 | } | 1433 | } |
1436 | 1434 | ||
1437 | return retval; | 1435 | return retval; |
1438 | } | 1436 | } |
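
wait_task_stopped() reports a stop as (signal << 8) | 0x7f and wait_task_continued() reports a resume as 0xffff; those are exactly the values the userspace WIFSTOPPED()/WSTOPSIG() and WIFCONTINUED() macros decode. An illustrative userspace fragment (not part of this commit):

    /* Illustrative sketch: observe the stop ((SIGSTOP << 8) | 0x7f) and the
     * continue (0xffff) encodings produced by the two functions above, via
     * waitpid(WUNTRACED) and waitpid(WCONTINUED). */
    #include <sys/wait.h>
    #include <signal.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
            int status;
            pid_t pid = fork();

            if (pid == 0) {
                    raise(SIGSTOP);                 /* child stops itself */
                    sleep(1);                       /* runs on after SIGCONT */
                    _exit(0);
            }

            waitpid(pid, &status, WUNTRACED);
            if (WIFSTOPPED(status))
                    printf("stopped by signal %d\n", WSTOPSIG(status));

            kill(pid, SIGCONT);
            waitpid(pid, &status, WCONTINUED);
            if (WIFCONTINUED(status))
                    printf("child continued\n");

            waitpid(pid, &status, 0);               /* finally reap the exit */
            return 0;
    }
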
1439 | 1437 | ||
1440 | /* | 1438 | /* |
1441 | * Consider @p for a wait by @parent. | 1439 | * Consider @p for a wait by @parent. |
1442 | * | 1440 | * |
1443 | * -ECHILD should be in ->notask_error before the first call. | 1441 | * -ECHILD should be in ->notask_error before the first call. |
1444 | * Returns nonzero for a final return, when we have unlocked tasklist_lock. | 1442 | * Returns nonzero for a final return, when we have unlocked tasklist_lock. |
1445 | * Returns zero if the search for a child should continue; | 1443 | * Returns zero if the search for a child should continue; |
1446 | * then ->notask_error is 0 if @p is an eligible child, | 1444 | * then ->notask_error is 0 if @p is an eligible child, |
1447 | * or another error from security_task_wait(), or still -ECHILD. | 1445 | * or another error from security_task_wait(), or still -ECHILD. |
1448 | */ | 1446 | */ |
1449 | static int wait_consider_task(struct wait_opts *wo, int ptrace, | 1447 | static int wait_consider_task(struct wait_opts *wo, int ptrace, |
1450 | struct task_struct *p) | 1448 | struct task_struct *p) |
1451 | { | 1449 | { |
1452 | int ret = eligible_child(wo, p); | 1450 | int ret = eligible_child(wo, p); |
1453 | if (!ret) | 1451 | if (!ret) |
1454 | return ret; | 1452 | return ret; |
1455 | 1453 | ||
1456 | ret = security_task_wait(p); | 1454 | ret = security_task_wait(p); |
1457 | if (unlikely(ret < 0)) { | 1455 | if (unlikely(ret < 0)) { |
1458 | /* | 1456 | /* |
1459 | * If we have not yet seen any eligible child, | 1457 | * If we have not yet seen any eligible child, |
1460 | * then let this error code replace -ECHILD. | 1458 | * then let this error code replace -ECHILD. |
1461 | * A permission error will give the user a clue | 1459 | * A permission error will give the user a clue |
1462 | * to look for security policy problems, rather | 1460 | * to look for security policy problems, rather |
1463 | * than for mysterious wait bugs. | 1461 | * than for mysterious wait bugs. |
1464 | */ | 1462 | */ |
1465 | if (wo->notask_error) | 1463 | if (wo->notask_error) |
1466 | wo->notask_error = ret; | 1464 | wo->notask_error = ret; |
1467 | return 0; | 1465 | return 0; |
1468 | } | 1466 | } |
1469 | 1467 | ||
1470 | /* dead body doesn't have much to contribute */ | 1468 | /* dead body doesn't have much to contribute */ |
1471 | if (unlikely(p->exit_state == EXIT_DEAD)) { | 1469 | if (unlikely(p->exit_state == EXIT_DEAD)) { |
1472 | /* | 1470 | /* |
1473 | * But do not ignore this task until the tracer does | 1471 | * But do not ignore this task until the tracer does |
1474 | * wait_task_zombie()->do_notify_parent(). | 1472 | * wait_task_zombie()->do_notify_parent(). |
1475 | */ | 1473 | */ |
1476 | if (likely(!ptrace) && unlikely(ptrace_reparented(p))) | 1474 | if (likely(!ptrace) && unlikely(ptrace_reparented(p))) |
1477 | wo->notask_error = 0; | 1475 | wo->notask_error = 0; |
1478 | return 0; | 1476 | return 0; |
1479 | } | 1477 | } |
1480 | 1478 | ||
1481 | /* slay zombie? */ | 1479 | /* slay zombie? */ |
1482 | if (p->exit_state == EXIT_ZOMBIE) { | 1480 | if (p->exit_state == EXIT_ZOMBIE) { |
1483 | /* | 1481 | /* |
1484 | * A zombie ptracee is only visible to its ptracer. | 1482 | * A zombie ptracee is only visible to its ptracer. |
1485 | * Notification and reaping will be cascaded to the real | 1483 | * Notification and reaping will be cascaded to the real |
1486 | * parent when the ptracer detaches. | 1484 | * parent when the ptracer detaches. |
1487 | */ | 1485 | */ |
1488 | if (likely(!ptrace) && unlikely(p->ptrace)) { | 1486 | if (likely(!ptrace) && unlikely(p->ptrace)) { |
1489 | /* it will become visible, clear notask_error */ | 1487 | /* it will become visible, clear notask_error */ |
1490 | wo->notask_error = 0; | 1488 | wo->notask_error = 0; |
1491 | return 0; | 1489 | return 0; |
1492 | } | 1490 | } |
1493 | 1491 | ||
1494 | /* we don't reap group leaders with subthreads */ | 1492 | /* we don't reap group leaders with subthreads */ |
1495 | if (!delay_group_leader(p)) | 1493 | if (!delay_group_leader(p)) |
1496 | return wait_task_zombie(wo, p); | 1494 | return wait_task_zombie(wo, p); |
1497 | 1495 | ||
1498 | /* | 1496 | /* |
1499 | * Allow access to stopped/continued state via zombie by | 1497 | * Allow access to stopped/continued state via zombie by |
1500 | * falling through. Clearing of notask_error is complex. | 1498 | * falling through. Clearing of notask_error is complex. |
1501 | * | 1499 | * |
1502 | * When !@ptrace: | 1500 | * When !@ptrace: |
1503 | * | 1501 | * |
1504 | * If WEXITED is set, notask_error should naturally be | 1502 | * If WEXITED is set, notask_error should naturally be |
1505 | * cleared. If not, subset of WSTOPPED|WCONTINUED is set, | 1503 | * cleared. If not, subset of WSTOPPED|WCONTINUED is set, |
1506 | * so, if there are live subthreads, there are events to | 1504 | * so, if there are live subthreads, there are events to |
1507 | * wait for. If all subthreads are dead, it's still safe | 1505 | * wait for. If all subthreads are dead, it's still safe |
1508 | * to clear - this function will be called again in a finite | 1506 | * to clear - this function will be called again in a finite |
1509 | * amount of time once all the subthreads are released and | 1507 | * amount of time once all the subthreads are released and |
1510 | * will then return without clearing. | 1508 | * will then return without clearing. |
1511 | * | 1509 | * |
1512 | * When @ptrace: | 1510 | * When @ptrace: |
1513 | * | 1511 | * |
1514 | * Stopped state is per-task and thus can't change once the | 1512 | * Stopped state is per-task and thus can't change once the |
1515 | * target task dies. Only continued and exited can happen. | 1513 | * target task dies. Only continued and exited can happen. |
1516 | * Clear notask_error if WCONTINUED | WEXITED. | 1514 | * Clear notask_error if WCONTINUED | WEXITED. |
1517 | */ | 1515 | */ |
1518 | if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) | 1516 | if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) |
1519 | wo->notask_error = 0; | 1517 | wo->notask_error = 0; |
1520 | } else { | 1518 | } else { |
1521 | /* | 1519 | /* |
1522 | * If @p is ptraced by a task in its real parent's group, | 1520 | * If @p is ptraced by a task in its real parent's group, |
1523 | * hide group stop/continued state when looking at @p as | 1521 | * hide group stop/continued state when looking at @p as |
1524 | * the real parent; otherwise, a single stop can be | 1522 | * the real parent; otherwise, a single stop can be |
1525 | * reported twice as group and ptrace stops. | 1523 | * reported twice as group and ptrace stops. |
1526 | * | 1524 | * |
1527 | * If a ptracer wants to distinguish the two events for its | 1525 | * If a ptracer wants to distinguish the two events for its |
1528 | * own children, it should create a separate process which | 1526 | * own children, it should create a separate process which |
1529 | * takes the role of real parent. | 1527 | * takes the role of real parent. |
1530 | */ | 1528 | */ |
1531 | if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) | 1529 | if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) |
1532 | return 0; | 1530 | return 0; |
1533 | 1531 | ||
1534 | /* | 1532 | /* |
1535 | * @p is alive and it's gonna stop, continue or exit, so | 1533 | * @p is alive and it's gonna stop, continue or exit, so |
1536 | * there always is something to wait for. | 1534 | * there always is something to wait for. |
1537 | */ | 1535 | */ |
1538 | wo->notask_error = 0; | 1536 | wo->notask_error = 0; |
1539 | } | 1537 | } |
1540 | 1538 | ||
1541 | /* | 1539 | /* |
1542 | * Wait for stopped. Depending on @ptrace, different stopped state | 1540 | * Wait for stopped. Depending on @ptrace, different stopped state |
1543 | * is used and the two don't interact with each other. | 1541 | * is used and the two don't interact with each other. |
1544 | */ | 1542 | */ |
1545 | ret = wait_task_stopped(wo, ptrace, p); | 1543 | ret = wait_task_stopped(wo, ptrace, p); |
1546 | if (ret) | 1544 | if (ret) |
1547 | return ret; | 1545 | return ret; |
1548 | 1546 | ||
1549 | /* | 1547 | /* |
1550 | * Wait for continued. There's only one continued state and the | 1548 | * Wait for continued. There's only one continued state and the |
1551 | * ptracer can consume it which can confuse the real parent. Don't | 1549 | * ptracer can consume it which can confuse the real parent. Don't |
1552 | * use WCONTINUED from ptracer. You don't need or want it. | 1550 | * use WCONTINUED from ptracer. You don't need or want it. |
1553 | */ | 1551 | */ |
1554 | return wait_task_continued(wo, p); | 1552 | return wait_task_continued(wo, p); |
1555 | } | 1553 | } |
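The zombie/non-zombie split above is what userspace ultimately observes through waitid(): WSTOPPED and WCONTINUED surface the stop/continue events considered at the bottom of the function, WEXITED the reaping of the zombie. A minimal userspace sketch, not part of this commit, using the ordinary glibc fork()/waitid() wrappers and a throwaway child that stops itself and later exits (error handling omitted for brevity):

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	siginfo_t si;
	pid_t pid = fork();

	if (pid == 0) {			/* child: stop, then exit with 42 */
		raise(SIGSTOP);
		sleep(1);
		_exit(42);
	}

	waitid(P_PID, pid, &si, WSTOPPED);	/* stop event, via wait_task_stopped() */
	printf("child stopped by signal %d\n", si.si_status);

	kill(pid, SIGCONT);
	waitid(P_PID, pid, &si, WCONTINUED);	/* continue event, via wait_task_continued() */
	waitid(P_PID, pid, &si, WEXITED);	/* blocks until exit, reaps the zombie */
	printf("child exited with status %d\n", si.si_status);
	return 0;
}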
1556 | 1554 | ||
1557 | /* | 1555 | /* |
1558 | * Do the work of do_wait() for one thread in the group, @tsk. | 1556 | * Do the work of do_wait() for one thread in the group, @tsk. |
1559 | * | 1557 | * |
1560 | * -ECHILD should be in ->notask_error before the first call. | 1558 | * -ECHILD should be in ->notask_error before the first call. |
1561 | * Returns nonzero for a final return, when we have unlocked tasklist_lock. | 1559 | * Returns nonzero for a final return, when we have unlocked tasklist_lock. |
1562 | * Returns zero if the search for a child should continue; then | 1560 | * Returns zero if the search for a child should continue; then |
1563 | * ->notask_error is 0 if there were any eligible children, | 1561 | * ->notask_error is 0 if there were any eligible children, |
1564 | * or another error from security_task_wait(), or still -ECHILD. | 1562 | * or another error from security_task_wait(), or still -ECHILD. |
1565 | */ | 1563 | */ |
1566 | static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) | 1564 | static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) |
1567 | { | 1565 | { |
1568 | struct task_struct *p; | 1566 | struct task_struct *p; |
1569 | 1567 | ||
1570 | list_for_each_entry(p, &tsk->children, sibling) { | 1568 | list_for_each_entry(p, &tsk->children, sibling) { |
1571 | int ret = wait_consider_task(wo, 0, p); | 1569 | int ret = wait_consider_task(wo, 0, p); |
1572 | if (ret) | 1570 | if (ret) |
1573 | return ret; | 1571 | return ret; |
1574 | } | 1572 | } |
1575 | 1573 | ||
1576 | return 0; | 1574 | return 0; |
1577 | } | 1575 | } |
1578 | 1576 | ||
1579 | static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) | 1577 | static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) |
1580 | { | 1578 | { |
1581 | struct task_struct *p; | 1579 | struct task_struct *p; |
1582 | 1580 | ||
1583 | list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { | 1581 | list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { |
1584 | int ret = wait_consider_task(wo, 1, p); | 1582 | int ret = wait_consider_task(wo, 1, p); |
1585 | if (ret) | 1583 | if (ret) |
1586 | return ret; | 1584 | return ret; |
1587 | } | 1585 | } |
1588 | 1586 | ||
1589 | return 0; | 1587 | return 0; |
1590 | } | 1588 | } |
1591 | 1589 | ||
1592 | static int child_wait_callback(wait_queue_t *wait, unsigned mode, | 1590 | static int child_wait_callback(wait_queue_t *wait, unsigned mode, |
1593 | int sync, void *key) | 1591 | int sync, void *key) |
1594 | { | 1592 | { |
1595 | struct wait_opts *wo = container_of(wait, struct wait_opts, | 1593 | struct wait_opts *wo = container_of(wait, struct wait_opts, |
1596 | child_wait); | 1594 | child_wait); |
1597 | struct task_struct *p = key; | 1595 | struct task_struct *p = key; |
1598 | 1596 | ||
1599 | if (!eligible_pid(wo, p)) | 1597 | if (!eligible_pid(wo, p)) |
1600 | return 0; | 1598 | return 0; |
1601 | 1599 | ||
1602 | if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) | 1600 | if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) |
1603 | return 0; | 1601 | return 0; |
1604 | 1602 | ||
1605 | return default_wake_function(wait, mode, sync, key); | 1603 | return default_wake_function(wait, mode, sync, key); |
1606 | } | 1604 | } |
1607 | 1605 | ||
1608 | void __wake_up_parent(struct task_struct *p, struct task_struct *parent) | 1606 | void __wake_up_parent(struct task_struct *p, struct task_struct *parent) |
1609 | { | 1607 | { |
1610 | __wake_up_sync_key(&parent->signal->wait_chldexit, | 1608 | __wake_up_sync_key(&parent->signal->wait_chldexit, |
1611 | TASK_INTERRUPTIBLE, 1, p); | 1609 | TASK_INTERRUPTIBLE, 1, p); |
1612 | } | 1610 | } |
1613 | 1611 | ||
1614 | static long do_wait(struct wait_opts *wo) | 1612 | static long do_wait(struct wait_opts *wo) |
1615 | { | 1613 | { |
1616 | struct task_struct *tsk; | 1614 | struct task_struct *tsk; |
1617 | int retval; | 1615 | int retval; |
1618 | 1616 | ||
1619 | trace_sched_process_wait(wo->wo_pid); | 1617 | trace_sched_process_wait(wo->wo_pid); |
1620 | 1618 | ||
1621 | init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); | 1619 | init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); |
1622 | wo->child_wait.private = current; | 1620 | wo->child_wait.private = current; |
1623 | add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); | 1621 | add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); |
1624 | repeat: | 1622 | repeat: |
1625 | /* | 1623 | /* |
1626 | * If there is nothing that can match our criteria just get out. | 1624 | * If there is nothing that can match our criteria just get out. |
1627 | * We will clear ->notask_error to zero if we see any child that | 1625 | * We will clear ->notask_error to zero if we see any child that |
1628 | * might later match our criteria, even if we are not able to reap | 1626 | * might later match our criteria, even if we are not able to reap |
1629 | * it yet. | 1627 | * it yet. |
1630 | */ | 1628 | */ |
1631 | wo->notask_error = -ECHILD; | 1629 | wo->notask_error = -ECHILD; |
1632 | if ((wo->wo_type < PIDTYPE_MAX) && | 1630 | if ((wo->wo_type < PIDTYPE_MAX) && |
1633 | (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) | 1631 | (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) |
1634 | goto notask; | 1632 | goto notask; |
1635 | 1633 | ||
1636 | set_current_state(TASK_INTERRUPTIBLE); | 1634 | set_current_state(TASK_INTERRUPTIBLE); |
1637 | read_lock(&tasklist_lock); | 1635 | read_lock(&tasklist_lock); |
1638 | tsk = current; | 1636 | tsk = current; |
1639 | do { | 1637 | do { |
1640 | retval = do_wait_thread(wo, tsk); | 1638 | retval = do_wait_thread(wo, tsk); |
1641 | if (retval) | 1639 | if (retval) |
1642 | goto end; | 1640 | goto end; |
1643 | 1641 | ||
1644 | retval = ptrace_do_wait(wo, tsk); | 1642 | retval = ptrace_do_wait(wo, tsk); |
1645 | if (retval) | 1643 | if (retval) |
1646 | goto end; | 1644 | goto end; |
1647 | 1645 | ||
1648 | if (wo->wo_flags & __WNOTHREAD) | 1646 | if (wo->wo_flags & __WNOTHREAD) |
1649 | break; | 1647 | break; |
1650 | } while_each_thread(current, tsk); | 1648 | } while_each_thread(current, tsk); |
1651 | read_unlock(&tasklist_lock); | 1649 | read_unlock(&tasklist_lock); |
1652 | 1650 | ||
1653 | notask: | 1651 | notask: |
1654 | retval = wo->notask_error; | 1652 | retval = wo->notask_error; |
1655 | if (!retval && !(wo->wo_flags & WNOHANG)) { | 1653 | if (!retval && !(wo->wo_flags & WNOHANG)) { |
1656 | retval = -ERESTARTSYS; | 1654 | retval = -ERESTARTSYS; |
1657 | if (!signal_pending(current)) { | 1655 | if (!signal_pending(current)) { |
1658 | schedule(); | 1656 | schedule(); |
1659 | goto repeat; | 1657 | goto repeat; |
1660 | } | 1658 | } |
1661 | } | 1659 | } |
1662 | end: | 1660 | end: |
1663 | __set_current_state(TASK_RUNNING); | 1661 | __set_current_state(TASK_RUNNING); |
1664 | remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); | 1662 | remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); |
1665 | return retval; | 1663 | return retval; |
1666 | } | 1664 | } |
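Seen from userspace, the notask_error/WNOHANG handling in do_wait() is the familiar non-blocking reap pattern: a positive PID means something was reaped, 0 means eligible children exist but nothing is reportable yet, and ECHILD means nothing could ever match. A hedged sketch using the glibc wait4() wrapper (the helper name and loop are illustrative only):

#include <errno.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/wait.h>

static void reap_children(void)
{
	int status;
	struct rusage ru;
	pid_t pid;

	/* WNOHANG keeps do_wait() from sleeping: after the scan it
	 * returns notask_error (0 or -ECHILD) instead of blocking. */
	while ((pid = wait4(-1, &status, WNOHANG, &ru)) > 0)
		printf("reaped %d, raw status %#x\n", pid, status);

	if (pid == 0)
		printf("children remain, none reportable yet\n");
	else if (errno == ECHILD)
		printf("no eligible children at all\n");
}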
1667 | 1665 | ||
1668 | SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, | 1666 | SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, |
1669 | infop, int, options, struct rusage __user *, ru) | 1667 | infop, int, options, struct rusage __user *, ru) |
1670 | { | 1668 | { |
1671 | struct wait_opts wo; | 1669 | struct wait_opts wo; |
1672 | struct pid *pid = NULL; | 1670 | struct pid *pid = NULL; |
1673 | enum pid_type type; | 1671 | enum pid_type type; |
1674 | long ret; | 1672 | long ret; |
1675 | 1673 | ||
1676 | if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) | 1674 | if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) |
1677 | return -EINVAL; | 1675 | return -EINVAL; |
1678 | if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) | 1676 | if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) |
1679 | return -EINVAL; | 1677 | return -EINVAL; |
1680 | 1678 | ||
1681 | switch (which) { | 1679 | switch (which) { |
1682 | case P_ALL: | 1680 | case P_ALL: |
1683 | type = PIDTYPE_MAX; | 1681 | type = PIDTYPE_MAX; |
1684 | break; | 1682 | break; |
1685 | case P_PID: | 1683 | case P_PID: |
1686 | type = PIDTYPE_PID; | 1684 | type = PIDTYPE_PID; |
1687 | if (upid <= 0) | 1685 | if (upid <= 0) |
1688 | return -EINVAL; | 1686 | return -EINVAL; |
1689 | break; | 1687 | break; |
1690 | case P_PGID: | 1688 | case P_PGID: |
1691 | type = PIDTYPE_PGID; | 1689 | type = PIDTYPE_PGID; |
1692 | if (upid <= 0) | 1690 | if (upid <= 0) |
1693 | return -EINVAL; | 1691 | return -EINVAL; |
1694 | break; | 1692 | break; |
1695 | default: | 1693 | default: |
1696 | return -EINVAL; | 1694 | return -EINVAL; |
1697 | } | 1695 | } |
1698 | 1696 | ||
1699 | if (type < PIDTYPE_MAX) | 1697 | if (type < PIDTYPE_MAX) |
1700 | pid = find_get_pid(upid); | 1698 | pid = find_get_pid(upid); |
1701 | 1699 | ||
1702 | wo.wo_type = type; | 1700 | wo.wo_type = type; |
1703 | wo.wo_pid = pid; | 1701 | wo.wo_pid = pid; |
1704 | wo.wo_flags = options; | 1702 | wo.wo_flags = options; |
1705 | wo.wo_info = infop; | 1703 | wo.wo_info = infop; |
1706 | wo.wo_stat = NULL; | 1704 | wo.wo_stat = NULL; |
1707 | wo.wo_rusage = ru; | 1705 | wo.wo_rusage = ru; |
1708 | ret = do_wait(&wo); | 1706 | ret = do_wait(&wo); |
1709 | 1707 | ||
1710 | if (ret > 0) { | 1708 | if (ret > 0) { |
1711 | ret = 0; | 1709 | ret = 0; |
1712 | } else if (infop) { | 1710 | } else if (infop) { |
1713 | /* | 1711 | /* |
1714 | * For a WNOHANG return, clear out all the fields | 1712 | * For a WNOHANG return, clear out all the fields |
1715 | * we would set so the user can easily tell the | 1713 | * we would set so the user can easily tell the |
1716 | * difference. | 1714 | * difference. |
1717 | */ | 1715 | */ |
1718 | if (!ret) | 1716 | if (!ret) |
1719 | ret = put_user(0, &infop->si_signo); | 1717 | ret = put_user(0, &infop->si_signo); |
1720 | if (!ret) | 1718 | if (!ret) |
1721 | ret = put_user(0, &infop->si_errno); | 1719 | ret = put_user(0, &infop->si_errno); |
1722 | if (!ret) | 1720 | if (!ret) |
1723 | ret = put_user(0, &infop->si_code); | 1721 | ret = put_user(0, &infop->si_code); |
1724 | if (!ret) | 1722 | if (!ret) |
1725 | ret = put_user(0, &infop->si_pid); | 1723 | ret = put_user(0, &infop->si_pid); |
1726 | if (!ret) | 1724 | if (!ret) |
1727 | ret = put_user(0, &infop->si_uid); | 1725 | ret = put_user(0, &infop->si_uid); |
1728 | if (!ret) | 1726 | if (!ret) |
1729 | ret = put_user(0, &infop->si_status); | 1727 | ret = put_user(0, &infop->si_status); |
1730 | } | 1728 | } |
1731 | 1729 | ||
1732 | put_pid(pid); | 1730 | put_pid(pid); |
1733 | 1731 | ||
1734 | /* avoid REGPARM breakage on x86: */ | 1732 | /* avoid REGPARM breakage on x86: */ |
1735 | asmlinkage_protect(5, ret, which, upid, infop, options, ru); | 1733 | asmlinkage_protect(5, ret, which, upid, infop, options, ru); |
1736 | return ret; | 1734 | return ret; |
1737 | } | 1735 | } |
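The zero-fill block above is how a waitid(..., WNOHANG) caller tells "no state change yet" apart from a real report: the call still succeeds, but si_pid and the other fields read back as 0. An illustrative userspace check, with 'child' a hypothetical PID the caller already owns:

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>

static void poll_child(pid_t child)
{
	siginfo_t si;

	if (waitid(P_PID, child, &si, WEXITED | WNOHANG) == -1) {
		perror("waitid");
		return;
	}
	if (si.si_pid == 0) {
		/* Successful return with zero-filled fields: the child
		 * exists but has not exited yet. */
		printf("child %d still running\n", child);
	} else {
		printf("child %d exited, si_code=%d si_status=%d\n",
		       si.si_pid, si.si_code, si.si_status);
	}
}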
1738 | 1736 | ||
1739 | SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | 1737 | SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, |
1740 | int, options, struct rusage __user *, ru) | 1738 | int, options, struct rusage __user *, ru) |
1741 | { | 1739 | { |
1742 | struct wait_opts wo; | 1740 | struct wait_opts wo; |
1743 | struct pid *pid = NULL; | 1741 | struct pid *pid = NULL; |
1744 | enum pid_type type; | 1742 | enum pid_type type; |
1745 | long ret; | 1743 | long ret; |
1746 | 1744 | ||
1747 | if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| | 1745 | if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| |
1748 | __WNOTHREAD|__WCLONE|__WALL)) | 1746 | __WNOTHREAD|__WCLONE|__WALL)) |
1749 | return -EINVAL; | 1747 | return -EINVAL; |
1750 | 1748 | ||
1751 | if (upid == -1) | 1749 | if (upid == -1) |
1752 | type = PIDTYPE_MAX; | 1750 | type = PIDTYPE_MAX; |
1753 | else if (upid < 0) { | 1751 | else if (upid < 0) { |
1754 | type = PIDTYPE_PGID; | 1752 | type = PIDTYPE_PGID; |
1755 | pid = find_get_pid(-upid); | 1753 | pid = find_get_pid(-upid); |
1756 | } else if (upid == 0) { | 1754 | } else if (upid == 0) { |
1757 | type = PIDTYPE_PGID; | 1755 | type = PIDTYPE_PGID; |
1758 | pid = get_task_pid(current, PIDTYPE_PGID); | 1756 | pid = get_task_pid(current, PIDTYPE_PGID); |
1759 | } else /* upid > 0 */ { | 1757 | } else /* upid > 0 */ { |
1760 | type = PIDTYPE_PID; | 1758 | type = PIDTYPE_PID; |
1761 | pid = find_get_pid(upid); | 1759 | pid = find_get_pid(upid); |
1762 | } | 1760 | } |
1763 | 1761 | ||
1764 | wo.wo_type = type; | 1762 | wo.wo_type = type; |
1765 | wo.wo_pid = pid; | 1763 | wo.wo_pid = pid; |
1766 | wo.wo_flags = options | WEXITED; | 1764 | wo.wo_flags = options | WEXITED; |
1767 | wo.wo_info = NULL; | 1765 | wo.wo_info = NULL; |
1768 | wo.wo_stat = stat_addr; | 1766 | wo.wo_stat = stat_addr; |
1769 | wo.wo_rusage = ru; | 1767 | wo.wo_rusage = ru; |
1770 | ret = do_wait(&wo); | 1768 | ret = do_wait(&wo); |
1771 | put_pid(pid); | 1769 | put_pid(pid); |
1772 | 1770 | ||
1773 | /* avoid REGPARM breakage on x86: */ | 1771 | /* avoid REGPARM breakage on x86: */ |
1774 | asmlinkage_protect(4, ret, upid, stat_addr, options, ru); | 1772 | asmlinkage_protect(4, ret, upid, stat_addr, options, ru); |
1775 | return ret; | 1773 | return ret; |
1776 | } | 1774 | } |
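The upid decoding above gives wait4()/waitpid() their classic pid-argument semantics; each call below maps to one branch of that if/else ladder (PIDs are hypothetical and the calls are shown independently, not as a sequence to run):

#include <sys/resource.h>
#include <sys/wait.h>

void wait4_pid_forms(pid_t child, pid_t pgrp)
{
	int status;

	wait4(-1, &status, 0, NULL);	/* upid == -1: any child (PIDTYPE_MAX)       */
	wait4(0, &status, 0, NULL);	/* upid == 0: caller's own process group      */
	wait4(-pgrp, &status, 0, NULL);	/* upid < 0: children in process group pgrp   */
	wait4(child, &status, 0, NULL);	/* upid > 0: exactly this child (PIDTYPE_PID) */
}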
1777 | 1775 | ||
1778 | #ifdef __ARCH_WANT_SYS_WAITPID | 1776 | #ifdef __ARCH_WANT_SYS_WAITPID |
1779 | 1777 | ||
1780 | /* | 1778 | /* |
1781 | * sys_waitpid() remains for compatibility. waitpid() should be | 1779 | * sys_waitpid() remains for compatibility. waitpid() should be |
1782 | * implemented by calling sys_wait4() from libc.a. | 1780 | * implemented by calling sys_wait4() from libc.a. |
1783 | */ | 1781 | */ |
1784 | SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) | 1782 | SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) |
1785 | { | 1783 | { |
1786 | return sys_wait4(pid, stat_addr, options, NULL); | 1784 | return sys_wait4(pid, stat_addr, options, NULL); |
1787 | } | 1785 | } |
1788 | 1786 | ||
1789 | #endif | 1787 | #endif |
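The compatibility note above is visible in the body: waitpid(pid, status, options) is simply wait4(pid, status, options, NULL), which is how a libc is expected to provide it. A rough userspace equivalent (hypothetical wrapper name):

#include <sys/resource.h>
#include <sys/wait.h>

/* How a libc can provide waitpid() on top of wait4(), mirroring the
 * sys_waitpid() wrapper above: same arguments, NULL rusage. */
pid_t my_waitpid(pid_t pid, int *status, int options)
{
	return wait4(pid, status, options, NULL);
}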
1790 | 1788 |